From ac03f1f6c27b5bfdf2cfb6d45c3667c3e0281f51 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Tue, 19 Nov 2013 12:15:21 -0500 Subject: [PATCH] added binary file operations to mothurout class. added comment to sequence class. permute parameter can equal 1, 2, 3, or 4 in venn command. fixed bug with sffinfo in windows with oligos file. fixed bug with summary.shared processors > 1 subsample=t, caused *ave.dist matrix values = 0. made make.biom picrust compliant. added binLabels to get.oturep command. added count file to deunique.seqs command. set.current was overwriting processors. fixed bug with pcr.seqs and aligned templates. added "group" column header to nods axes file. changed bayesian to wang in classify.seqs header. added warning about using count file with self reference in chimera.uchime. --- Mothur.xcodeproj/project.pbxproj | 10 +- chimerauchimecommand.cpp | 1 + classifyseqscommand.h | 4 +- cooccurrencecommand.h | 2 +- countseqscommand.cpp | 6 +- deuniqueseqscommand.cpp | 190 +- deuniqueseqscommand.h | 7 +- getmetacommunitycommand.h | 2 +- getoturepcommand.cpp | 24 +- getoturepcommand.h | 5 +- makebiomcommand.cpp | 2 +- makecontigscommand.cpp | 3 + mothurout.cpp | 131 ++ mothurout.h | 3 + nmdscommand.cpp | 2 +- pcrseqscommand.h | 24 +- prcseqscommand.cpp | 79 +- preclustercommand.cpp | 1 + screenseqscommand.cpp | 3 +- sequence.cpp | 43 +- sequence.hpp | 1 + setcurrentcommand.cpp | 7 +- sffinfocommand.cpp | 3797 +++++++++++++++--------------- sffinfocommand.h | 9 +- sracommand.cpp | 175 ++ sracommand.h | 46 + summarysharedcommand.cpp | 7 +- venncommand.cpp | 52 +- venncommand.h | 6 +- 29 files changed, 2547 insertions(+), 2095 deletions(-) mode change 100644 => 100755 sffinfocommand.cpp create mode 100644 sracommand.cpp create mode 100644 sracommand.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 2314b5d..f3b6a09 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -38,6 +38,7 @@ A73DDC3813C4BF64006AAE38 /* mothurmetastats.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73DDC3713C4BF64006AAE38 /* mothurmetastats.cpp */; }; A741744C175CD9B1007DF49B /* makelefsecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741744A175CD9B1007DF49B /* makelefsecommand.cpp */; }; A741FAD215D1688E0067BCC5 /* sequencecountparser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */; }; + A747EC71181EA0F900345732 /* sracommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A747EC70181EA0F900345732 /* sracommand.cpp */; }; A7496D2E167B531B00CC7D7C /* kruskalwalliscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7496D2C167B531B00CC7D7C /* kruskalwalliscommand.cpp */; }; A74A9A9F148E881E00AB5E3E /* spline.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74A9A9E148E881E00AB5E3E /* spline.cpp */; }; A74C06E916A9C0A9008390A3 /* primerdesigncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74C06E816A9C0A8008390A3 /* primerdesigncommand.cpp */; }; @@ -460,6 +461,8 @@ A741744B175CD9B1007DF49B /* makelefsecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makelefsecommand.h; sourceTree = ""; }; A741FAD115D1688E0067BCC5 /* sequencecountparser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sequencecountparser.cpp; sourceTree = ""; }; A741FAD415D168A00067BCC5 /* sequencecountparser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sequencecountparser.h; sourceTree = ""; }; + A747EC6F181EA0E500345732 /* sracommand.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sracommand.h; sourceTree = ""; }; + A747EC70181EA0F900345732 /* sracommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sracommand.cpp; sourceTree = ""; }; A7496D2C167B531B00CC7D7C /* kruskalwalliscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kruskalwalliscommand.cpp; sourceTree = ""; }; A7496D2D167B531B00CC7D7C /* kruskalwalliscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kruskalwalliscommand.h; sourceTree = ""; }; A74A9A9D148E881E00AB5E3E /* spline.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = spline.h; sourceTree = ""; }; @@ -1575,6 +1578,8 @@ A7E9B83F12D37EC400DA6239 /* splitabundcommand.cpp */, A7E9B84212D37EC400DA6239 /* splitgroupscommand.h */, A7E9B84112D37EC400DA6239 /* splitgroupscommand.cpp */, + A747EC6F181EA0E500345732 /* sracommand.h */, + A747EC70181EA0F900345732 /* sracommand.cpp */, A7E9B85012D37EC400DA6239 /* subsamplecommand.h */, A7E9B84F12D37EC400DA6239 /* subsamplecommand.cpp */, A7E9B85812D37EC400DA6239 /* summarycommand.h */, @@ -2394,6 +2399,7 @@ A77916E8176F7F7600EEFE18 /* designmap.cpp in Sources */, A7D9378A17B146B5001E90B0 /* wilcox.cpp in Sources */, A7F24FC317EA36600021DC9A /* classifyrfsharedcommand.cpp in Sources */, + A747EC71181EA0F900345732 /* sracommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -2446,8 +2452,8 @@ GCC_OPTIMIZATION_LEVEL = 3; GCC_PREPROCESSOR_DEFINITIONS = ( "MOTHUR_FILES=\"\\\"../../release\\\"\"", - "VERSION=\"\\\"1.31.0\\\"\"", - "RELEASE_DATE=\"\\\"5/24/2013\\\"\"", + "VERSION=\"\\\"1.32.0\\\"\"", + "RELEASE_DATE=\"\\\"10/31/2013\\\"\"", ); GCC_VERSION = ""; "GCC_VERSION[arch=*]" = ""; diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index 346afe2..3a9f42b 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -569,6 +569,7 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option) { if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } + if (hasCount && (templatefile != "self")) { m->mothurOut("You have provided a countfile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; } //look for uchime exe diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 59d9ee2..55546f2 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -166,7 +166,7 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ //make classify Classify* myclassify; string outputMethodTag = pDataArray->method + "."; - if(pDataArray->method == "bayesian"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } + if(pDataArray->method == "wang"){ myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } else if(pDataArray->method == "knn"){ myclassify = new Knn(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->gapOpen, pDataArray->gapExtend, pDataArray->match, pDataArray->misMatch, pDataArray->numWanted, pDataArray->threadID); } else if(pDataArray->method == "zap"){ outputMethodTag = pDataArray->search + "_" + outputMethodTag; @@ -174,7 +174,7 @@ static DWORD WINAPI MyClassThreadFunction(LPVOID lpParam){ else { myclassify = new AlignTree(pDataArray->templateFileName, pDataArray->taxonomyFileName, pDataArray->cutoff); } } else { - pDataArray->m->mothurOut(pDataArray->search + " is not a valid method option. I will run the command using bayesian."); + pDataArray->m->mothurOut(pDataArray->method + " is not a valid method option. I will run the command using wang."); pDataArray->m->mothurOutEndLine(); myclassify = new Bayesian(pDataArray->taxonomyFileName, pDataArray->templateFileName, pDataArray->search, pDataArray->kmerSize, pDataArray->cutoff, pDataArray->iters, pDataArray->threadID, pDataArray->flip, pDataArray->writeShortcuts); } diff --git a/cooccurrencecommand.h b/cooccurrencecommand.h index 758a9ff..94930ac 100644 --- a/cooccurrencecommand.h +++ b/cooccurrencecommand.h @@ -26,7 +26,7 @@ public: ~CooccurrenceCommand(){} vector setParameters(); - string getCommandName() { return "Cooccurrence"; } + string getCommandName() { return "cooccurrence"; } string getCommandCategory() { return "Hypothesis Testing"; } string getHelpString(); diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 411a814..575ffe8 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -628,7 +628,7 @@ map CountSeqsCommand::processNameFile(string name) { } } in.close(); - out.close(); + if (rest != "") { vector pieces = m->splitWhiteSpace(rest); @@ -651,6 +651,7 @@ map CountSeqsCommand::processNameFile(string name) { } } + out.close(); return indexToNames; } @@ -704,7 +705,7 @@ map CountSeqsCommand::getGroupNames(string filename, set& n } } in.close(); - out.close(); + if (rest != "") { vector pieces = m->splitWhiteSpace(rest); @@ -726,6 +727,7 @@ map CountSeqsCommand::getGroupNames(string filename, set& n } } } + out.close(); for (it = groupIndex.begin(); it != groupIndex.end(); it++) { indexToGroups[it->second] = it->first; } diff --git a/deuniqueseqscommand.cpp b/deuniqueseqscommand.cpp index f72a7c9..a6c151c 100644 --- a/deuniqueseqscommand.cpp +++ b/deuniqueseqscommand.cpp @@ -9,12 +9,14 @@ #include "deuniqueseqscommand.h" #include "sequence.hpp" +#include "counttable.h" //********************************************************************************************************************** vector DeUniqueSeqsCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pname); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "namecount", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "namecount", "none","group",false,false,true); parameters.push_back(pcount); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); @@ -31,8 +33,8 @@ vector DeUniqueSeqsCommand::setParameters(){ string DeUniqueSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The deunique.seqs command reads a fastafile and namefile, and creates a fastafile containing all the sequences.\n"; - helpString += "The deunique.seqs command parameters are fasta and name, both are required, unless you have valid current name and fasta files.\n"; + helpString += "The deunique.seqs command reads a fastafile and namefile or countfile, and creates a fastafile containing all the sequences. It you provide a count file with group information a group file is also created.\n"; + helpString += "The deunique.seqs command parameters are fasta, name and count. Fasta is required and you must provide either a name or count file.\n"; helpString += "The deunique.seqs command should be in the following format: \n"; helpString += "deunique.seqs(fasta=yourFastaFile, name=yourNameFile) \n"; helpString += "Example deunique.seqs(fasta=abrecovery.unique.fasta, name=abrecovery.names).\n"; @@ -49,7 +51,8 @@ string DeUniqueSeqsCommand::getOutputPattern(string type) { try { string pattern = ""; - if (type == "fasta") { pattern = "[filename],redundant.fasta"; } + if (type == "fasta") { pattern = "[filename],redundant.fasta"; } + else if (type == "group") { pattern = "[filename],redundant.groups"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -66,6 +69,7 @@ DeUniqueSeqsCommand::DeUniqueSeqsCommand(){ setParameters(); vector tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["group"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "DeUniqueSeqsCommand", "DeconvoluteCommand"); @@ -98,6 +102,7 @@ DeUniqueSeqsCommand::DeUniqueSeqsCommand(string option) { //initialize outputTypes vector tempOutNames; outputTypes["fasta"] = tempOutNames; + outputTypes["group"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -119,6 +124,14 @@ DeUniqueSeqsCommand::DeUniqueSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -134,16 +147,31 @@ DeUniqueSeqsCommand::DeUniqueSeqsCommand(string option) { //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; - outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it } nameFile = validParameter.validFile(parameters, "name", true); if (nameFile == "not open") { abort = true; } - else if (nameFile == "not found"){ - nameFile = m->getNameFile(); + else if (nameFile == "not found"){ nameFile = ""; } + else { m->setNameFile(nameFile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (nameFile != "")) { m->mothurOut("When executing a deunique.seqs command you must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + + + if ((countfile == "") && (nameFile == "")) { //look for currents + nameFile = m->getNameFile(); if (nameFile != "") { m->mothurOut("Using " + nameFile + " as input file for the name parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; } - }else { m->setNameFile(nameFile); } + else { + countfile = m->getCountTableFile(); + if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("[ERROR]: You have no current name or count files one is required."); m->mothurOutEndLine(); abort = true; } + } + } + } } @@ -160,7 +188,9 @@ int DeUniqueSeqsCommand::execute() { //prepare filenames and open files ofstream out; - string outFastaFile = m->getRootName(m->getSimpleName(fastaFile)); + string thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(fastaFile); } + string outFastaFile = thisOutputDir + m->getRootName(m->getSimpleName(fastaFile)); int pos = outFastaFile.find("unique"); if (pos != string::npos) { outFastaFile = outputDir + outFastaFile.substr(0, pos); } else { outFastaFile = outputDir + outFastaFile; } @@ -169,55 +199,92 @@ int DeUniqueSeqsCommand::execute() { outFastaFile = getOutputFileName("fasta", variables); m->openOutputFile(outFastaFile, out); - readNamesFile(); - if (m->control_pressed) { out.close(); outputTypes.clear(); m->mothurRemove(outFastaFile); return 0; } + map nameMap; + CountTable ct; + ofstream outGroup; + string outGroupFile; + vector groups; + if (nameFile != "") { m->readNames(nameFile, nameMap); } + else { + ct.readTable(countfile, true, false); + + if (ct.hasGroupInfo()) { + thisOutputDir = outputDir; + if (outputDir == "") { thisOutputDir += m->hasPath(countfile); } + outGroupFile = thisOutputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[filename]"] = outGroupFile; + outGroupFile = getOutputFileName("group", variables); + m->openOutputFile(outGroupFile, outGroup); + groups = ct.getNamesOfGroups(); + } + } + + if (m->control_pressed) { out.close(); outputTypes.clear(); m->mothurRemove(outFastaFile); if (countfile != "") { if (ct.hasGroupInfo()) { outGroup.close(); m->mothurRemove(outGroupFile); } } return 0; } ifstream in; m->openInputFile(fastaFile, in); while (!in.eof()) { - if (m->control_pressed) { in.close(); out.close(); outputTypes.clear(); m->mothurRemove(outFastaFile); return 0; } + if (m->control_pressed) { in.close(); out.close(); outputTypes.clear(); m->mothurRemove(outFastaFile); if (countfile != "") { if (ct.hasGroupInfo()) { outGroup.close(); m->mothurRemove(outGroupFile); } } return 0; } Sequence seq(in); m->gobble(in); if (seq.getName() != "") { - //look for sequence name in nameMap - map::iterator it = nameMap.find(seq.getName()); - - if (it == nameMap.end()) { m->mothurOut("[ERROR]: Your namefile does not contain " + seq.getName() + ", aborting."); m->mothurOutEndLine(); m->control_pressed = true; } - else { - vector names; - m->splitAtComma(it->second, names); - - //output sequences - for (int i = 0; i < names.size(); i++) { - out << ">" << names[i] << endl; - out << seq.getAligned() << endl; - } - - //remove seq from name map so we can check for seqs in namefile not in fastafile later - nameMap.erase(it); - } + if (nameFile != "") { + //look for sequence name in nameMap + map::iterator it = nameMap.find(seq.getName()); + + if (it == nameMap.end()) { m->mothurOut("[ERROR]: Your namefile does not contain " + seq.getName() + ", aborting."); m->mothurOutEndLine(); m->control_pressed = true; } + else { + vector names; + m->splitAtComma(it->second, names); + + //output sequences + for (int i = 0; i < names.size(); i++) { + out << ">" << names[i] << endl; + out << seq.getAligned() << endl; + } + + //remove seq from name map so we can check for seqs in namefile not in fastafile later + nameMap.erase(it); + } + }else { + if (ct.hasGroupInfo()) { + vector groupCounts = ct.getGroupCounts(seq.getName()); + int count = 1; + for (int i = 0; i < groups.size(); i++) { + for (int j = 0; j < groupCounts[i]; j++) { + outGroup << seq.getName()+"_"+toString(count) << '\t' << groups[i] << endl; count++; + } + } + + } + + int numReps = ct.getNumSeqs(seq.getName()); //will report error and set m->control_pressed if not found + for (int i = 0; i < numReps; i++) { + out << ">" << seq.getName()+"_"+toString(i+1) << endl; + out << seq.getAligned() << endl; + } + } } } in.close(); - out.close(); + out.close(); + if (countfile != "") { if (ct.hasGroupInfo()) { outGroup.close(); } } - if (nameMap.size() != 0) { //then there are names in the namefile not in the fastafile - for (map::iterator it = nameMap.begin(); it != nameMap.end(); it++) { - m->mothurOut(it->first + " is not in your fasta file, but is in your name file. Please correct."); m->mothurOutEndLine(); - } - } - - if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); return 0; } + + if (m->control_pressed) { outputTypes.clear(); m->mothurRemove(outFastaFile); if (countfile != "") { if (ct.hasGroupInfo()) { m->mothurRemove(outGroupFile); } }return 0; } + outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); + if (countfile != "") { if (ct.hasGroupInfo()) { outputNames.push_back(outGroupFile); outputTypes["group"].push_back(outGroupFile); } } + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - m->mothurOut(outFastaFile); m->mothurOutEndLine(); - outputNames.push_back(outFastaFile); outputTypes["fasta"].push_back(outFastaFile); - m->mothurOutEndLine(); + for(int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + //set fasta file as new current fastafile string current = ""; @@ -225,47 +292,18 @@ int DeUniqueSeqsCommand::execute() { if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "DeUniqueSeqsCommand", "execute"); - exit(1); - } -} -//********************************************************************************************************************** -int DeUniqueSeqsCommand::readNamesFile() { - try { - - ifstream inNames; - m->openInputFile(nameFile, inNames); - - string name, names; - map::iterator it; - - while(inNames){ - - if(m->control_pressed) { break; } - - inNames >> name; m->gobble(inNames); - inNames >> names; - - it = nameMap.find(name); - - if (it == nameMap.end()) { nameMap[name] = names; } - else { m->mothurOut("[ERROR]: Your namefile already contains " + name + ", aborting."); m->mothurOutEndLine(); m->control_pressed = true; } - - m->gobble(inNames); + + itTypes = outputTypes.find("group"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } } - inNames.close(); + return 0; - } catch(exception& e) { - m->errorOut(e, "DeUniqueSeqsCommand", "readNamesFile"); + m->errorOut(e, "DeUniqueSeqsCommand", "execute"); exit(1); } } - /**************************************************************************************/ diff --git a/deuniqueseqscommand.h b/deuniqueseqscommand.h index a6d467a..042ff49 100644 --- a/deuniqueseqscommand.h +++ b/deuniqueseqscommand.h @@ -36,13 +36,10 @@ public: private: - string fastaFile, nameFile, outputDir; + string fastaFile, nameFile, outputDir, countfile; vector outputNames; bool abort; - - map nameMap; - - int readNamesFile(); + }; diff --git a/getmetacommunitycommand.h b/getmetacommunitycommand.h index d7b163e..7697430 100644 --- a/getmetacommunitycommand.h +++ b/getmetacommunitycommand.h @@ -146,7 +146,7 @@ static DWORD WINAPI MyMetaCommunityThreadFunction(LPVOID lpParam){ pDataArray->outputNames.push_back(pDataArray->matrix[i]); findQ->printZMatrix(pDataArray->matrix[i], pDataArray->m->getGroups()); - findQ->printRelAbund(pDataArray->relabunds[i], pDataArray->m->currentBinLabels); + findQ->printRelAbund(pDataArray->relabunds[i], pDataArray->m->currentSharedBinLabels); if(pDataArray->optimizegap != -1 && (numPartitions - pDataArray->minPartition) >= pDataArray->optimizegap && numPartitions >= pDataArray->minpartitions){ break; } diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index 7e71a1a..6109237 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -23,7 +23,7 @@ inline bool compareName(repStruct left, repStruct right){ //******************************************************************************************************************** //sorts lowest to highest inline bool compareBin(repStruct left, repStruct right){ - return (left.bin < right.bin); + return (left.simpleBin < right.simpleBin); } //******************************************************************************************************************** //sorts lowest to highest @@ -979,6 +979,7 @@ int GetOTURepCommand::process(ListVector* processList) { } //for each bin in the list vector + vector binLabels = processList->getLabels(); for (int i = 0; i < processList->size(); i++) { if (m->control_pressed) { out.close(); @@ -999,7 +1000,7 @@ int GetOTURepCommand::process(ListVector* processList) { if (Groups.size() == 0) { nameRep = findRep(namesInBin, ""); - newNamesOutput << i << '\t' << nameRep << '\t'; + newNamesOutput << binLabels[i] << '\t' << nameRep << '\t'; //put rep at first position in names line string outputString = nameRep + ","; @@ -1042,7 +1043,7 @@ int GetOTURepCommand::process(ListVector* processList) { nameRep = findRep(NamesInGroup[Groups[j]], Groups[j]); //output group rep and other members of this group - (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t'; + (*(filehandles[Groups[j]])) << binLabels[i] << '\t' << nameRep << '\t'; //put rep at first position in names line string outputString = nameRep + ","; @@ -1100,7 +1101,6 @@ int GetOTURepCommand::processFastaNames(string filename, string label, FastaMap* ifstream in; m->openInputFile(filename, in); - int i = 0; string tempGroup = ""; in >> tempGroup; m->gobble(in); @@ -1112,8 +1112,8 @@ int GetOTURepCommand::processFastaNames(string filename, string label, FastaMap* int thistotal = 0; while (!in.eof()) { - string rep, binnames; - in >> i >> rep >> binnames; m->gobble(in); + string rep, binnames, binLabel; + in >> binLabel >> rep >> binnames; m->gobble(in); vector names; m->splitAtComma(binnames, names); @@ -1178,7 +1178,7 @@ int GetOTURepCommand::processFastaNames(string filename, string label, FastaMap* if (sequence != "not found") { if (sorted == "") { //print them out - rep = rep + "\t" + toString(i+1); + rep = rep + "\t" + binLabel; rep = rep + "|" + toString(binsize); if (group != "") { rep = rep + "|" + group; @@ -1186,7 +1186,9 @@ int GetOTURepCommand::processFastaNames(string filename, string label, FastaMap* out << ">" << rep << endl; out << sequence << endl; }else { //save them - repStruct newRep(rep, i+1, binsize, group); + int simpleLabel; + m->mothurConvert(m->getSimpleLabel(binLabel), simpleLabel); + repStruct newRep(rep, binLabel, simpleLabel, binsize, group); reps.push_back(newRep); } }else { @@ -1204,7 +1206,7 @@ int GetOTURepCommand::processFastaNames(string filename, string label, FastaMap* //print them for (int i = 0; i < reps.size(); i++) { string sequence = fasta->getSequence(reps[i].name); - string outputName = reps[i].name + "\t" + toString(reps[i].bin); + string outputName = reps[i].name + "\t" + reps[i].bin; outputName = outputName + "|" + toString(reps[i].size); if (reps[i].group != "") { outputName = outputName + "|" + reps[i].group; @@ -1245,7 +1247,6 @@ int GetOTURepCommand::processNames(string filename, string label) { ifstream in; m->openInputFile(filename, in); - int i = 0; string rep, binnames; string tempGroup = ""; @@ -1259,7 +1260,8 @@ int GetOTURepCommand::processNames(string filename, string label) { while (!in.eof()) { if (m->control_pressed) { break; } - in >> i >> rep >> binnames; m->gobble(in); + string binLabel; + in >> binLabel >> rep >> binnames; m->gobble(in); if (countfile == "") { out2 << rep << '\t' << binnames << endl; } else { diff --git a/getoturepcommand.h b/getoturepcommand.h index ca3439d..daf5bc1 100644 --- a/getoturepcommand.h +++ b/getoturepcommand.h @@ -24,12 +24,13 @@ typedef map SeqMap; struct repStruct { string name; - int bin; + string bin; + int simpleBin; int size; string group; repStruct(){} - repStruct(string n, int b, int s, string g) : name(n), bin(b), size(s), group(g) {} + repStruct(string n, string b, int sb, int s, string g) : name(n), bin(b), size(s), group(g), simpleBin(sb) { } ~repStruct() {} }; diff --git a/makebiomcommand.cpp b/makebiomcommand.cpp index 826cf37..4c35092 100644 --- a/makebiomcommand.cpp +++ b/makebiomcommand.cpp @@ -824,7 +824,7 @@ map MakeBiomCommand::readGGOtuMap(){ vector pieces = m->splitWhiteSpace(line); if (pieces.size() != 0) { - string otuID = pieces[0]; + string otuID = pieces[1]; for (int i = 1; i < pieces.size(); i++) { otuMap[pieces[i]] = otuID; } } } diff --git a/makecontigscommand.cpp b/makecontigscommand.cpp index 87027f4..bf27d26 100644 --- a/makecontigscommand.cpp +++ b/makecontigscommand.cpp @@ -1298,9 +1298,11 @@ vector< vector > MakeContigsCommand::readFastqFiles(unsigned long int& c if (uniques.size() != 0) { for (itUniques = uniques.begin(); itUniques != uniques.end(); itUniques++) { + if (m->control_pressed) { break; } m->mothurOut("[WARNING]: did not find paired read for " + itUniques->first + ", ignoring.\n"); } for (map:: iterator it = pairUniques.begin(); it != pairUniques.end(); it++) { + if (m->control_pressed) { break; } m->mothurOut("[WARNING]: did not find paired read for " + (it->first).substr(1) + ", ignoring.\n"); } m->mothurOutEndLine(); @@ -1660,6 +1662,7 @@ fastqRead MakeContigsCommand::readFastq(ifstream& in, bool& ignore){ vector qualScores = convertQual(quality); + m->checkName(name); read.name = name; read.sequence = sequence; read.scores = qualScores; diff --git a/mothurout.cpp b/mothurout.cpp index e7f0c8e..0f24cb2 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -779,6 +779,8 @@ string MothurOut::getPathName(string longName){ bool MothurOut::dirCheck(string& dirName){ try { + if (dirName == "") { return false; } + string tag = ""; #ifdef USE_MPI int pid; @@ -1140,6 +1142,105 @@ int MothurOut::openInputFile(string fileName, ifstream& fileHandle){ exit(1); } } +/***********************************************************************/ +int MothurOut::openInputFileBinary(string fileName, ifstream& fileHandle){ + try { + + //get full path name + string completeFileName = getFullPathName(fileName); +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#ifdef USE_COMPRESSION + // check for gzipped or bzipped file + if (endsWith(completeFileName, ".gz") || endsWith(completeFileName, ".bz2")) { + string tempName = string(tmpnam(0)); + mkfifo(tempName.c_str(), 0666); + int fork_result = fork(); + if (fork_result < 0) { + cerr << "Error forking.\n"; + exit(1); + } else if (fork_result == 0) { + string command = (endsWith(completeFileName, ".gz") ? "zcat " : "bzcat ") + completeFileName + string(" > ") + tempName; + cerr << "Decompressing " << completeFileName << " via temporary named pipe " << tempName << "\n"; + system(command.c_str()); + cerr << "Done decompressing " << completeFileName << "\n"; + mothurRemove(tempName); + exit(EXIT_SUCCESS); + } else { + cerr << "waiting on child process " << fork_result << "\n"; + completeFileName = tempName; + } + } +#endif +#endif + + fileHandle.open(completeFileName.c_str(), ios::binary); + if(!fileHandle) { + mothurOut("[ERROR]: Could not open " + completeFileName); mothurOutEndLine(); + return 1; + } + else { + //check for blank file + gobble(fileHandle); + if (fileHandle.eof()) { mothurOut("[ERROR]: " + completeFileName + " is blank. Please correct."); mothurOutEndLine(); } + + return 0; + } + } + catch(exception& e) { + errorOut(e, "MothurOut", "openInputFileBinary"); + exit(1); + } +} +/***********************************************************************/ +int MothurOut::openInputFileBinary(string fileName, ifstream& fileHandle, string noerror){ + try { + + //get full path name + string completeFileName = getFullPathName(fileName); +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#ifdef USE_COMPRESSION + // check for gzipped or bzipped file + if (endsWith(completeFileName, ".gz") || endsWith(completeFileName, ".bz2")) { + string tempName = string(tmpnam(0)); + mkfifo(tempName.c_str(), 0666); + int fork_result = fork(); + if (fork_result < 0) { + cerr << "Error forking.\n"; + exit(1); + } else if (fork_result == 0) { + string command = (endsWith(completeFileName, ".gz") ? "zcat " : "bzcat ") + completeFileName + string(" > ") + tempName; + cerr << "Decompressing " << completeFileName << " via temporary named pipe " << tempName << "\n"; + system(command.c_str()); + cerr << "Done decompressing " << completeFileName << "\n"; + mothurRemove(tempName); + exit(EXIT_SUCCESS); + } else { + cerr << "waiting on child process " << fork_result << "\n"; + completeFileName = tempName; + } + } +#endif +#endif + + fileHandle.open(completeFileName.c_str(), ios::binary); + if(!fileHandle) { + //mothurOut("[ERROR]: Could not open " + completeFileName); mothurOutEndLine(); + return 1; + } + else { + //check for blank file + gobble(fileHandle); + //if (fileHandle.eof()) { mothurOut("[ERROR]: " + completeFileName + " is blank. Please correct."); mothurOutEndLine(); } + + return 0; + } + } + catch(exception& e) { + errorOut(e, "MothurOut", "openInputFileBinary - no error"); + exit(1); + } +} + /***********************************************************************/ int MothurOut::renameFile(string oldName, string newName){ @@ -1289,6 +1390,36 @@ int MothurOut::appendFiles(string temp, string filename) { exit(1); } } +/**************************************************************************************************/ +int MothurOut::appendBinaryFiles(string temp, string filename) { + try{ + ofstream output; + ifstream input; + + //open output file in append mode + openOutputFileBinaryAppend(filename, output); + int ableToOpen = openInputFileBinary(temp, input, "no error"); + + if (ableToOpen == 0) { //you opened it + + char buffer[4096]; + while (!input.eof()) { + input.read(buffer, 4096); + output.write(buffer, input.gcount()); + } + input.close(); + } + + output.close(); + + return ableToOpen; + } + catch(exception& e) { + errorOut(e, "MothurOut", "appendBinaryFiles"); + exit(1); + } +} + /**************************************************************************************************/ int MothurOut::appendFilesWithoutHeaders(string temp, string filename) { try{ diff --git a/mothurout.h b/mothurout.h index 1caa1fe..1747b14 100644 --- a/mothurout.h +++ b/mothurout.h @@ -82,6 +82,7 @@ class MothurOut { vector setFilePosFasta(string, int&); string sortFile(string, string); int appendFiles(string, string); + int appendBinaryFiles(string, string); int appendFilesWithoutHeaders(string, string); int renameFile(string, string); //oldname, newname string getFullPathName(string); @@ -97,6 +98,8 @@ class MothurOut { int openOutputFileAppend(string, ofstream&); int openOutputFileBinaryAppend(string, ofstream&); int openInputFile(string, ifstream&); + int openInputFileBinary(string, ifstream&); + int openInputFileBinary(string, ifstream&, string); int openInputFile(string, ifstream&, string); //no error given bool checkLocations(string&, string); //filename, inputDir. checks for file in ./, inputdir, default and mothur's exe location. Returns false if cant be found. If found completes name with location diff --git a/nmdscommand.cpp b/nmdscommand.cpp index a90ed29..82d2d7f 100644 --- a/nmdscommand.cpp +++ b/nmdscommand.cpp @@ -288,7 +288,7 @@ int NMDSCommand::execute(){ outBest.setf(ios::fixed, ios::floatfield); outBest.setf(ios::showpoint); - outBest << '\t'; + outBest << "group" << '\t'; for (int k = 0; k < bestConfig.size(); k++) { outBest << "axis" << (k+1) << '\t'; } outBest << endl; diff --git a/pcrseqscommand.h b/pcrseqscommand.h index f4481ab..ba90624 100644 --- a/pcrseqscommand.h +++ b/pcrseqscommand.h @@ -60,7 +60,7 @@ private: int readCount(set); bool readOligos(); bool readEcoli(); - int driverPcr(string, string, string, string, set&, linePair, int&, int&, bool&); + int driverPcr(string, string, string, string, set&, linePair, int&, bool&); int createProcesses(string, string, string, set&); bool isAligned(string, map&); string reverseOligo(string); @@ -98,8 +98,8 @@ struct pcrData { nomatch = nm; keepprimer = kp; keepdots = kd; + end = en; start = st; - end = en; length = l; fstart = fst; fend = fen; @@ -142,8 +142,8 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ set lengths; //pdiffs, bdiffs, primers, barcodes, revPrimers map faked; - vector< set > locations; //locations[0] = beginning locations, locations[1] = ending locations - locations.resize(2); + set locations; //locations = beginning locations + TrimOligos trim(pDataArray->pdiffs, 0, pDataArray->primers, faked, pDataArray->revPrimer); for(int i = 0; i < pDataArray->fend; i++){ //end is the number of sequences to process @@ -192,7 +192,7 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd-1]+1)); if (pDataArray->fileAligned) { - thisPStart = mapAligned[primerEnd-1]+1; //locations[0].insert(mapAligned[primerEnd-1]+1); + thisPStart = mapAligned[primerEnd-1]+1; //locations.insert(mapAligned[primerEnd-1]+1); locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n"; } } @@ -202,7 +202,7 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); if (pDataArray->fileAligned) { - thisPStart = mapAligned[primerStart]; //locations[0].insert(mapAligned[primerStart]); + thisPStart = mapAligned[primerStart]; //locations.insert(mapAligned[primerStart]); locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n"; } } @@ -237,7 +237,7 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); if (pDataArray->fileAligned) { - thisPEnd = mapAligned[primerStart]; //locations[1].insert(mapAligned[primerStart]); + thisPEnd = mapAligned[primerStart]; //locations.insert(mapAligned[primerStart]); locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n"; } @@ -248,7 +248,7 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd-1]+1)); if (pDataArray->fileAligned) { - thisPEnd = mapAligned[primerEnd-1]+1; //locations[1].insert(mapAligned[primerEnd-1]+1); + thisPEnd = mapAligned[primerEnd-1]+1; //locations.insert(mapAligned[primerEnd-1]+1); locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n"; } @@ -303,8 +303,7 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ if(goodSeq == 1) { currSeq.printSequence(goodFile); if (locationsString != "") { locationsFile << locationsString; } - if (thisPStart != -1) { locations[0].insert(thisPStart); } - if (thisPEnd != -1) { locations[1].insert(thisPEnd); } + if (thisPStart != -1) { locations.insert(thisPStart); } } else { pDataArray->badSeqNames.insert(currSeq.getName()); @@ -327,9 +326,8 @@ static DWORD WINAPI MyPcrThreadFunction(LPVOID lpParam){ if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: fileAligned = " + toString(pDataArray->fileAligned) +'\n'); } if (pDataArray->fileAligned && !pDataArray->keepdots) { //print out smallest start value and largest end value - if ((locations[0].size() > 1) || (locations[1].size() > 1)) { pDataArray->adjustNeeded = true; } - if (pDataArray->primers.size() != 0) { set::iterator it = locations[0].begin(); pDataArray->pstart = *it; } - if (pDataArray->revPrimer.size() != 0) { set::reverse_iterator it2 = locations[1].rbegin(); pDataArray->pend = *it2; } + if (locations.size() > 1) { pDataArray->adjustNeeded = true; } + if (pDataArray->primers.size() != 0) { set::iterator it = locations.begin(); pDataArray->pstart = *it; } } return 0; diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp index cb8dd49..a1eae46 100644 --- a/prcseqscommand.cpp +++ b/prcseqscommand.cpp @@ -443,13 +443,13 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string process++; }else if (pid == 0){ string locationsFile = toString(getpid()) + ".temp"; - num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", locationsFile, badSeqNames, lines[process], pstart, pend, adjustNeeded); + num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", locationsFile, badSeqNames, lines[process], pstart, adjustNeeded); //pass numSeqs to parent ofstream out; string tempFile = filename + toString(getpid()) + ".num.temp"; m->openOutputFile(tempFile, out); - out << pstart << '\t' << pend << '\t' << adjustNeeded << endl; + out << pstart << '\t' << adjustNeeded << endl; out << num << '\t' << badSeqNames.size() << endl; for (set::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) { out << (*it) << endl; @@ -465,7 +465,7 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string } string locationsFile = toString(getpid()) + ".temp"; - num = driverPcr(filename, goodFileName, badFileName, locationsFile, badSeqNames, lines[0], pstart, pend, adjustNeeded); + num = driverPcr(filename, goodFileName, badFileName, locationsFile, badSeqNames, lines[0], pstart, adjustNeeded); //force parent to wait until all the processes are done for (int i=0;iopenInputFile(tempFile, in); int numBadNames = 0; string name = ""; - int tpstart = -1; int tpend = -1; bool tempAdjust = false; + int tpstart = -1; bool tempAdjust = false; if (!in.eof()) { - in >> tpstart >> tpend >> tempAdjust; m->gobble(in); + in >> tpstart >> tempAdjust; m->gobble(in); if (tempAdjust) { adjustNeeded = true; } if (tpstart != -1) { if (tpstart != pstart) { adjustNeeded = true; } if (tpstart < pstart) { pstart = tpstart; } //smallest start } - if (tpend != -1) { - if (tpend != pend) { adjustNeeded = true; } - if (tpend > pend) { pend = tpend; } //largest end - } int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); } for (int j = 0; j < numBadNames; j++) { @@ -541,7 +537,7 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string } //do your part - num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"), (locationsFile+toString(processors-1)+".temp"), badSeqNames, lines[processors-1], pstart, pend, adjustNeeded); + num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"), (locationsFile+toString(processors-1)+".temp"), badSeqNames, lines[processors-1], pstart, adjustNeeded); processIDS.push_back(processors-1); //Wait until all threads have terminated. @@ -558,11 +554,7 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string if (pDataArray[i]->pstart != pstart) { adjustNeeded = true; } if (pDataArray[i]->pstart < pstart) { pstart = pDataArray[i]->pstart; } } //smallest start - if (pDataArray[i]->pend != -1) { - if (pDataArray[i]->pend != pend) { adjustNeeded = true; } - if (pDataArray[i]->pend > pend) { pend = pDataArray[i]->pend; } - } //largest end - + for (set::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); } CloseHandle(hThreadArray[i]); delete pDataArray[i]; @@ -580,7 +572,43 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string } #endif - if (fileAligned && adjustNeeded) { adjustDots(goodFileName, locationsFile, pstart, pend); } + + + + if (fileAligned) { + //find pend - pend is the biggest ending value, but we must account for when we adjust the start. That adjustment may make the "new" end larger then the largest end. So lets find out what that "new" end will be. + ifstream inLocations; + m->openInputFile(locationsFile, inLocations); + + while(!inLocations.eof()) { + + if (m->control_pressed) { break; } + + string name = ""; + int thisStart = -1; int thisEnd = -1; + if (primers.size() != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); } + if (revPrimer.size() != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); } + else { pend = -1; break; } + + int myDiff = 0; + if (pstart != -1) { + if (thisStart != -1) { + if (thisStart != pstart) { myDiff += (thisStart - pstart); } + } + } + + int myEnd = thisEnd + myDiff; + //cout << name << '\t' << thisStart << '\t' << thisEnd << " diff = " << myDiff << '\t' << myEnd << endl; + + if (thisEnd != -1) { + if (myEnd > pend) { pend = myEnd; } + } + + } + inLocations.close(); + + adjustDots(goodFileName, locationsFile, pstart, pend); + } return num; @@ -592,7 +620,7 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string } //********************************************************************************************************************** -int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, string locationsName, set& badSeqNames, linePair filePos, int& pstart, int& pend, bool& adjustNeeded){ +int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, string locationsName, set& badSeqNames, linePair filePos, int& pstart, bool& adjustNeeded){ try { ofstream goodFile; m->openOutputFile(goodFasta, goodFile); @@ -611,8 +639,7 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta bool done = false; int count = 0; set lengths; - vector< set > locations; //locations[0] = beginning locations, locations[1] = ending locations - locations.resize(2); + set locations; //locations[0] = beginning locations, //pdiffs, bdiffs, primers, barcodes, revPrimers map faked; @@ -759,9 +786,8 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta if(goodSeq == 1) { currSeq.printSequence(goodFile); if (m->debug) { m->mothurOut("[DEBUG]: " + locationsString + "\n"); } + if (thisPStart != -1) { locations.insert(thisPStart); } if (locationsString != "") { locationsFile << locationsString; } - if (thisPStart != -1) { locations[0].insert(thisPStart); } - if (thisPEnd != -1) { locations[1].insert(thisPEnd); } } else { badSeqNames.insert(currSeq.getName()); @@ -792,9 +818,8 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta if (m->debug) { m->mothurOut("[DEBUG]: fileAligned = " + toString(fileAligned) +'\n'); } if (fileAligned && !keepdots) { //print out smallest start value and largest end value - if ((locations[0].size() > 1) || (locations[1].size() > 1)) { adjustNeeded = true; } - if (primers.size() != 0) { set::iterator it = locations[0].begin(); pstart = *it; } - if (revPrimer.size() != 0) { set::reverse_iterator it2 = locations[1].rbegin(); pend = *it2; } + if (locations.size() > 1) { adjustNeeded = true; } + if (primers.size() != 0) { set::iterator it = locations.begin(); pstart = *it; } } return count; @@ -837,6 +862,7 @@ int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, i set lengths; //cout << pstart << '\t' << pend << endl; + //if (pstart > pend) { //swap them while(!inFasta.eof()) { if(m->control_pressed) { break; } @@ -847,6 +873,8 @@ int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, i int thisStart = -1; int thisEnd = -1; if (primers.size() != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); } if (revPrimer.size() != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); } + + //cout << seq.getName() << '\t' << thisStart << '\t' << thisEnd << '\t' << seq.getAligned().length() << endl; //cout << seq.getName() << '\t' << pstart << '\t' << pend << endl; @@ -887,7 +915,8 @@ int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, i //cout << "final lengths = \n"; //for (set::iterator it = lengths.begin(); it != lengths.end(); it++) { - // cout << *it << endl; + //cout << *it << endl; + // cout << lengths.count(*it) << endl; // } return 0; diff --git a/preclustercommand.cpp b/preclustercommand.cpp index fbbe4bb..fe722a6 100644 --- a/preclustercommand.cpp +++ b/preclustercommand.cpp @@ -300,6 +300,7 @@ int PreClusterCommand::execute(){ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine(); }else { + if (processors != 1) { m->mothurOut("When using running without group information mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; } if (namefile != "") { readNameFile(); } //reads fasta file and return number of seqs diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp index a56b0eb..bf702e1 100644 --- a/screenseqscommand.cpp +++ b/screenseqscommand.cpp @@ -2119,7 +2119,7 @@ int ScreenSeqsCommand::screenGroupFile(map badSeqNames){ while(!inputGroups.eof()){ if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; } - inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group; + inputGroups >> seqName; m->gobble(inputGroups); inputGroups >> group; m->gobble(inputGroups); it = badSeqNames.find(seqName); if(it != badSeqNames.end()){ @@ -2128,7 +2128,6 @@ int ScreenSeqsCommand::screenGroupFile(map badSeqNames){ else{ goodGroupOut << seqName << '\t' << group << endl; } - m->gobble(inputGroups); } if (m->control_pressed) { goodGroupOut.close(); inputGroups.close(); m->mothurRemove(goodGroupFile); return 0; } diff --git a/sequence.cpp b/sequence.cpp index d6073d7..efeb88c 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -76,7 +76,8 @@ Sequence::Sequence(istringstream& fastaString){ } } - while (!fastaString.eof()) { char c = fastaString.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + //while (!fastaString.eof()) { char c = fastaString.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + comment = getCommentString(fastaString); int numAmbig = 0; sequence = getSequenceString(fastaString, numAmbig); @@ -120,7 +121,8 @@ Sequence::Sequence(istringstream& fastaString, string JustUnaligned){ } } - while (!fastaString.eof()) { char c = fastaString.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + //while (!fastaString.eof()) { char c = fastaString.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + comment = getCommentString(fastaString); int numAmbig = 0; sequence = getSequenceString(fastaString, numAmbig); @@ -166,8 +168,8 @@ Sequence::Sequence(ifstream& fastaFile){ } } - //read real sequence - while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + //while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + comment = getCommentString(fastaFile); int numAmbig = 0; sequence = getSequenceString(fastaFile, numAmbig); @@ -216,9 +218,11 @@ Sequence::Sequence(ifstream& fastaFile, string& extraInfo, bool getInfo){ //read info after sequence name while (!fastaFile.eof()) { char c = fastaFile.get(); - if (c == 10 || c == 13 || c == -1){ break; } + if (c == 10 || c == 13 || c == -1){ break; } extraInfo += c; - } + } + + comment = extraInfo; int numAmbig = 0; sequence = getSequenceString(fastaFile, numAmbig); @@ -261,8 +265,8 @@ Sequence::Sequence(ifstream& fastaFile, string JustUnaligned){ } } - //read real sequence - while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + //while (!fastaFile.eof()) { char c = fastaFile.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + comment = getCommentString(fastaFile); int numAmbig = 0; sequence = getSequenceString(fastaFile, numAmbig); @@ -360,17 +364,19 @@ string Sequence::getSequenceString(ifstream& fastaFile, int& numAmbig) { string Sequence::getCommentString(ifstream& fastaFile) { try { char letter; - string sequence = ""; + string temp = ""; while(fastaFile){ letter=fastaFile.get(); - if((letter == '\r') || (letter == '\n')){ + if((letter == '\r') || (letter == '\n') || letter == -1){ m->gobble(fastaFile); //in case its a \r\n situation break; - } + }else { + temp += letter; + } } - return sequence; + return temp; } catch(exception& e) { m->errorOut(e, "Sequence", "getCommentString"); @@ -414,17 +420,19 @@ string Sequence::getSequenceString(istringstream& fastaFile, int& numAmbig) { string Sequence::getCommentString(istringstream& fastaFile) { try { char letter; - string sequence = ""; + string temp = ""; while(fastaFile){ letter=fastaFile.get(); - if((letter == '\r') || (letter == '\n')){ + if((letter == '\r') || (letter == '\n') || letter == -1){ m->gobble(fastaFile); //in case its a \r\n situation break; - } + }else { + temp += letter; + } } - return sequence; + return temp; } catch(exception& e) { m->errorOut(e, "Sequence", "getCommentString"); @@ -439,6 +447,7 @@ void Sequence::initialize(){ unaligned = ""; aligned = ""; pairwise = ""; + comment = ""; numBases = 0; alignmentLength = 0; @@ -581,7 +590,7 @@ int Sequence::getNumNs(){ void Sequence::printSequence(ostream& out){ - out << ">" << name << endl; + out << ">" << name << comment << endl; if(isAligned){ out << aligned << endl; } diff --git a/sequence.hpp b/sequence.hpp index ad3a4b4..c498994 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -76,6 +76,7 @@ private: string unaligned; string aligned; string pairwise; + string comment; int numBases; int alignmentLength; bool isAligned; diff --git a/setcurrentcommand.cpp b/setcurrentcommand.cpp index 9d59696..98ae223 100644 --- a/setcurrentcommand.cpp +++ b/setcurrentcommand.cpp @@ -417,10 +417,9 @@ SetCurrentCommand::SetCurrentCommand(string option) { else if (summaryfile == "not found") { summaryfile = ""; } if (summaryfile != "") { m->setSummaryFile(summaryfile); } - - processors = validParameter.validFile(parameters, "processors", false); - if (processors == "not found") { processors = "1"; } - if (processors != "") { m->setProcessors(processors); } + string temp = validParameter.validFile(parameters, "processors", false); + if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); clearTypes = validParameter.validFile(parameters, "clear", false); if (clearTypes == "not found") { clearTypes = ""; } diff --git a/sffinfocommand.cpp b/sffinfocommand.cpp old mode 100644 new mode 100755 index 399f253..bccb751 --- a/sffinfocommand.cpp +++ b/sffinfocommand.cpp @@ -1,1885 +1,1912 @@ -/* - * sffinfocommand.cpp - * Mothur - * - * Created by westcott on 7/7/10. - * Copyright 2010 Schloss Lab. All rights reserved. - * - */ - -#include "sffinfocommand.h" -#include "endiannessmacros.h" -#include "trimoligos.h" -#include "sequence.hpp" -#include "qualityscores.h" - -//********************************************************************************************************************** -vector SffInfoCommand::setParameters(){ - try { - CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(psff); - CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(poligos); - CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos); - CommandParameter psfftxt("sfftxt", "String", "", "", "", "", "","",false,false); parameters.push_back(psfftxt); - CommandParameter pflow("flow", "Boolean", "", "T", "", "", "","flow",false,false); parameters.push_back(pflow); - CommandParameter ptrim("trim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(ptrim); - CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "","fasta",false,false); parameters.push_back(pfasta); - CommandParameter pqfile("qfile", "Boolean", "", "T", "", "", "","qfile",false,false); parameters.push_back(pqfile); - CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ppdiffs); - CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pbdiffs); - CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs); - CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs); - CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); - - vector myArray; - for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } - return myArray; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "setParameters"); - exit(1); - } -} -//********************************************************************************************************************** -string SffInfoCommand::getHelpString(){ - try { - string helpString = ""; - helpString += "The sffinfo command reads a sff file and extracts the sequence data, or you can use it to parse a sfftxt file.\n"; - helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, oligos, bdiffs, tdiffs, ldiffs, sdiffs, pdiffs and trim. sff is required. \n"; - helpString += "The sff parameter allows you to enter the sff file you would like to extract data from. You may enter multiple files by separating them by -'s.\n"; - helpString += "The fasta parameter allows you to indicate if you would like a fasta formatted file generated. Default=True. \n"; - helpString += "The qfile parameter allows you to indicate if you would like a quality file generated. Default=True. \n"; - helpString += "The oligos parameter allows you to provide an oligos file to split your sff file into separate sff files by barcode. \n"; - helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; - helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; - helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; - helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; - helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; - helpString += "The flow parameter allows you to indicate if you would like a flowgram file generated. Default=True. \n"; - helpString += "The sfftxt parameter allows you to indicate if you would like a sff.txt file generated. Default=False. \n"; - helpString += "If you want to parse an existing sfftxt file into flow, fasta and quality file, enter the file name using the sfftxt parameter. \n"; - helpString += "The trim parameter allows you to indicate if you would like a sequences and quality scores trimmed to the clipQualLeft and clipQualRight values. Default=True. \n"; - helpString += "The accnos parameter allows you to provide a accnos file containing the names of the sequences you would like extracted. You may enter multiple files by separating them by -'s. \n"; - helpString += "Example sffinfo(sff=mySffFile.sff, trim=F).\n"; - helpString += "Note: No spaces between parameter labels (i.e. sff), '=' and parameters (i.e.yourSffFileName).\n"; - return helpString; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "getHelpString"); - exit(1); - } -} - -//********************************************************************************************************************** -string SffInfoCommand::getOutputPattern(string type) { - try { - string pattern = ""; - - if (type == "fasta") { pattern = "[filename],fasta-[filename],[tag],fasta"; } - else if (type == "flow") { pattern = "[filename],flow"; } - else if (type == "sfftxt") { pattern = "[filename],sff.txt"; } - else if (type == "sff") { pattern = "[filename],[group],sff"; } - else if (type == "qfile") { pattern = "[filename],qual-[filename],[tag],qual"; } - else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } - - return pattern; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "getOutputPattern"); - exit(1); - } -} -//********************************************************************************************************************** -SffInfoCommand::SffInfoCommand(){ - try { - abort = true; calledHelp = true; - setParameters(); - vector tempOutNames; - outputTypes["fasta"] = tempOutNames; - outputTypes["flow"] = tempOutNames; - outputTypes["sfftxt"] = tempOutNames; - outputTypes["qfile"] = tempOutNames; - outputTypes["sff"] = tempOutNames; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "SffInfoCommand"); - exit(1); - } -} -//********************************************************************************************************************** - -SffInfoCommand::SffInfoCommand(string option) { - try { - abort = false; calledHelp = false; - hasAccnos = false; hasOligos = false; - split = 1; - - //allow user to run help - if(option == "help") { help(); abort = true; calledHelp = true; } - else if(option == "citation") { citation(); abort = true; calledHelp = true;} - - else { - //valid paramters for this command - vector myArray = setParameters(); - - OptionParser parser(option); - map parameters = parser.getParameters(); - - ValidParameters validParameter; - //check to make sure all parameters are valid for command - for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { - if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } - } - - //initialize outputTypes - vector tempOutNames; - outputTypes["fasta"] = tempOutNames; - outputTypes["flow"] = tempOutNames; - outputTypes["sfftxt"] = tempOutNames; - outputTypes["qfile"] = tempOutNames; - outputTypes["sff"] = tempOutNames; - - //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } - - //if the user changes the input directory command factory will send this info to us in the output parameter - string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } - - sffFilename = validParameter.validFile(parameters, "sff", false); - if (sffFilename == "not found") { sffFilename = ""; } - else { - m->splitAtDash(sffFilename, filenames); - - //go through files and make sure they are good, if not, then disregard them - for (int i = 0; i < filenames.size(); i++) { - bool ignore = false; - if (filenames[i] == "current") { - filenames[i] = m->getSFFFile(); - if (filenames[i] != "") { m->mothurOut("Using " + filenames[i] + " as input file for the sff parameter where you had given current."); m->mothurOutEndLine(); } - else { - m->mothurOut("You have no current sfffile, ignoring current."); m->mothurOutEndLine(); ignore=true; - //erase from file list - filenames.erase(filenames.begin()+i); - i--; - } - } - - if (!ignore) { - if (inputDir != "") { - string path = m->hasPath(filenames[i]); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { filenames[i] = inputDir + filenames[i]; } - } - - ifstream in; - int ableToOpen = m->openInputFile(filenames[i], in, "noerror"); - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getDefaultPath() != "") { //default path is set - string tryPath = m->getDefaultPath() + m->getSimpleName(filenames[i]); - m->mothurOut("Unable to open " + filenames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - filenames[i] = tryPath; - } - } - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getOutputDir() != "") { //default path is set - string tryPath = m->getOutputDir() + m->getSimpleName(filenames[i]); - m->mothurOut("Unable to open " + filenames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - filenames[i] = tryPath; - } - } - - in.close(); - - if (ableToOpen == 1) { - m->mothurOut("Unable to open " + filenames[i] + ". It will be disregarded."); m->mothurOutEndLine(); - //erase from file list - filenames.erase(filenames.begin()+i); - i--; - }else { m->setSFFFile(filenames[i]); } - } - } - - //make sure there is at least one valid file left - if (filenames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } - } - - accnosName = validParameter.validFile(parameters, "accnos", false); - if (accnosName == "not found") { accnosName = ""; } - else { - hasAccnos = true; - m->splitAtDash(accnosName, accnosFileNames); - - //go through files and make sure they are good, if not, then disregard them - for (int i = 0; i < accnosFileNames.size(); i++) { - bool ignore = false; - if (accnosFileNames[i] == "current") { - accnosFileNames[i] = m->getAccnosFile(); - if (accnosFileNames[i] != "") { m->mothurOut("Using " + accnosFileNames[i] + " as input file for the accnos parameter where you had given current."); m->mothurOutEndLine(); } - else { - m->mothurOut("You have no current accnosfile, ignoring current."); m->mothurOutEndLine(); ignore=true; - //erase from file list - accnosFileNames.erase(accnosFileNames.begin()+i); - i--; - } - } - - if (!ignore) { - - if (inputDir != "") { - string path = m->hasPath(accnosFileNames[i]); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { accnosFileNames[i] = inputDir + accnosFileNames[i]; } - } - - ifstream in; - int ableToOpen = m->openInputFile(accnosFileNames[i], in, "noerror"); - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getDefaultPath() != "") { //default path is set - string tryPath = m->getDefaultPath() + m->getSimpleName(accnosFileNames[i]); - m->mothurOut("Unable to open " + accnosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - accnosFileNames[i] = tryPath; - } - } - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getOutputDir() != "") { //default path is set - string tryPath = m->getOutputDir() + m->getSimpleName(accnosFileNames[i]); - m->mothurOut("Unable to open " + accnosFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - accnosFileNames[i] = tryPath; - } - } - in.close(); - - if (ableToOpen == 1) { - m->mothurOut("Unable to open " + accnosFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); - //erase from file list - accnosFileNames.erase(accnosFileNames.begin()+i); - i--; - } - } - } - - //make sure there is at least one valid file left - if (accnosFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } - } - - oligosfile = validParameter.validFile(parameters, "oligos", false); - if (oligosfile == "not found") { oligosfile = ""; } - else { - hasOligos = true; - m->splitAtDash(oligosfile, oligosFileNames); - - //go through files and make sure they are good, if not, then disregard them - for (int i = 0; i < oligosFileNames.size(); i++) { - bool ignore = false; - if (oligosFileNames[i] == "current") { - oligosFileNames[i] = m->getOligosFile(); - if (oligosFileNames[i] != "") { m->mothurOut("Using " + oligosFileNames[i] + " as input file for the accnos parameter where you had given current."); m->mothurOutEndLine(); } - else { - m->mothurOut("You have no current oligosfile, ignoring current."); m->mothurOutEndLine(); ignore=true; - //erase from file list - oligosFileNames.erase(oligosFileNames.begin()+i); - i--; - } - } - - if (!ignore) { - - if (inputDir != "") { - string path = m->hasPath(oligosFileNames[i]); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { oligosFileNames[i] = inputDir + oligosFileNames[i]; } - } - - ifstream in; - int ableToOpen = m->openInputFile(oligosFileNames[i], in, "noerror"); - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getDefaultPath() != "") { //default path is set - string tryPath = m->getDefaultPath() + m->getSimpleName(oligosFileNames[i]); - m->mothurOut("Unable to open " + oligosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - oligosFileNames[i] = tryPath; - } - } - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getOutputDir() != "") { //default path is set - string tryPath = m->getOutputDir() + m->getSimpleName(oligosFileNames[i]); - m->mothurOut("Unable to open " + oligosFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - oligosFileNames[i] = tryPath; - } - } - in.close(); - - if (ableToOpen == 1) { - m->mothurOut("Unable to open " + oligosFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); - //erase from file list - oligosFileNames.erase(oligosFileNames.begin()+i); - i--; - } - } - } - - //make sure there is at least one valid file left - if (oligosFileNames.size() == 0) { m->mothurOut("no valid oligos files."); m->mothurOutEndLine(); abort = true; } - } - - if (hasOligos) { - split = 2; - if (oligosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a oligos file, you must have one for each sff file."); m->mothurOutEndLine(); } - } - - if (hasAccnos) { - if (accnosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a accnos file, you must have one for each sff file."); m->mothurOutEndLine(); } - } - - string temp = validParameter.validFile(parameters, "qfile", false); if (temp == "not found"){ temp = "T"; } - qual = m->isTrue(temp); - - temp = validParameter.validFile(parameters, "fasta", false); if (temp == "not found"){ temp = "T"; } - fasta = m->isTrue(temp); - - temp = validParameter.validFile(parameters, "flow", false); if (temp == "not found"){ temp = "T"; } - flow = m->isTrue(temp); - - temp = validParameter.validFile(parameters, "trim", false); if (temp == "not found"){ temp = "T"; } - trim = m->isTrue(temp); - - temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } - m->mothurConvert(temp, bdiffs); - - temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } - m->mothurConvert(temp, pdiffs); - - temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } - m->mothurConvert(temp, ldiffs); - - temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } - m->mothurConvert(temp, sdiffs); - - temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } - m->mothurConvert(temp, tdiffs); - - if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } - - temp = validParameter.validFile(parameters, "sfftxt", false); - if (temp == "not found") { temp = "F"; sfftxt = false; sfftxtFilename = ""; } - else if (m->isTrue(temp)) { sfftxt = true; sfftxtFilename = ""; } - else { - //you are a filename - if (inputDir != "") { - map::iterator it = parameters.find("sfftxt"); - //user has given a template file - if(it != parameters.end()){ - string path = m->hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["sfftxt"] = inputDir + it->second; } - } - } - - sfftxtFilename = validParameter.validFile(parameters, "sfftxt", true); - if (sfftxtFilename == "not found") { sfftxtFilename = ""; } - else if (sfftxtFilename == "not open") { sfftxtFilename = ""; } - } - - if ((sfftxtFilename == "") && (filenames.size() == 0)) { - //if there is a current sff file, use it - string filename = m->getSFFFile(); - if (filename != "") { filenames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the sff parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("[ERROR]: you must provide a valid sff or sfftxt file."); m->mothurOutEndLine(); abort=true; } - } - - - } - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "SffInfoCommand"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::execute(){ - try { - if (abort == true) { if (calledHelp) { return 0; } return 2; } - - for (int s = 0; s < filenames.size(); s++) { - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - int start = time(NULL); - - filenames[s] = m->getFullPathName(filenames[s]); - m->mothurOut("Extracting info from " + filenames[s] + " ..." ); m->mothurOutEndLine(); - - string accnos = ""; - if (hasAccnos) { accnos = accnosFileNames[s]; } - - string oligos = ""; - if (hasOligos) { oligos = oligosFileNames[s]; } - - int numReads = extractSffInfo(filenames[s], accnos, oligos); - - m->mothurOut("It took " + toString(time(NULL) - start) + " secs to extract " + toString(numReads) + "."); - } - - if (sfftxtFilename != "") { parseSffTxt(); } - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - - //set fasta file as new current fastafile - string current = ""; - itTypes = outputTypes.find("fasta"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } - } - - itTypes = outputTypes.find("qfile"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } - } - - itTypes = outputTypes.find("flow"); - if (itTypes != outputTypes.end()) { - if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFlowFile(current); } - } - - //report output filenames - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "execute"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ - try { - currentFileName = input; - if (outputDir == "") { outputDir += m->hasPath(input); } - - if (accnos != "") { readAccnosFile(accnos); } - else { seqNames.clear(); } - - if (oligos != "") { readOligos(oligos); split = 2; } - - ofstream outSfftxt, outFasta, outQual, outFlow; - string outFastaFileName, outQualFileName; - string rootName = outputDir + m->getRootName(m->getSimpleName(input)); - if(rootName.find_last_of(".") == rootName.npos){ rootName += "."; } - - map variables; - variables["[filename]"] = rootName; - string sfftxtFileName = getOutputFileName("sfftxt",variables); - string outFlowFileName = getOutputFileName("flow",variables); - if (!trim) { variables["[tag]"] = "raw"; } - outFastaFileName = getOutputFileName("fasta",variables); - outQualFileName = getOutputFileName("qfile",variables); - - if (sfftxt) { m->openOutputFile(sfftxtFileName, outSfftxt); outSfftxt.setf(ios::fixed, ios::floatfield); outSfftxt.setf(ios::showpoint); outputNames.push_back(sfftxtFileName); outputTypes["sfftxt"].push_back(sfftxtFileName); } - if (fasta) { m->openOutputFile(outFastaFileName, outFasta); outputNames.push_back(outFastaFileName); outputTypes["fasta"].push_back(outFastaFileName); } - if (qual) { m->openOutputFile(outQualFileName, outQual); outputNames.push_back(outQualFileName); outputTypes["qfile"].push_back(outQualFileName); } - if (flow) { m->openOutputFile(outFlowFileName, outFlow); outputNames.push_back(outFlowFileName); outFlow.setf(ios::fixed, ios::floatfield); outFlow.setf(ios::showpoint); outputTypes["flow"].push_back(outFlowFileName); } - - ifstream in; - in.open(input.c_str(), ios::binary); - - CommonHeader header; - readCommonHeader(in, header); - - int count = 0; - mycount = 0; - - //check magic number and version - if (header.magicNumber != 779314790) { m->mothurOut("Magic Number is not correct, not a valid .sff file"); m->mothurOutEndLine(); return count; } - if (header.version != "0001") { m->mothurOut("Version is not supported, only support version 0001."); m->mothurOutEndLine(); return count; } - - //print common header - if (sfftxt) { printCommonHeader(outSfftxt, header); } - if (flow) { outFlow << header.numFlowsPerRead << endl; } - - //read through the sff file - while (!in.eof()) { - - bool print = true; - - //read data - seqRead read; Header readheader; - readSeqData(in, read, header.numFlowsPerRead, readheader); - bool okay = sanityCheck(readheader, read); - if (!okay) { break; } - - //if you have provided an accosfile and this seq is not in it, then dont print - if (seqNames.size() != 0) { if (seqNames.count(readheader.name) == 0) { print = false; } } - - //print - if (print) { - if (sfftxt) { printHeader(outSfftxt, readheader); printSffTxtSeqData(outSfftxt, read, readheader); } - if (fasta) { printFastaSeqData(outFasta, read, readheader); } - if (qual) { printQualSeqData(outQual, read, readheader); } - if (flow) { printFlowSeqData(outFlow, read, readheader); } - } - - count++; - mycount++; - - //report progress - if((count+1) % 10000 == 0){ m->mothurOut(toString(count+1)); m->mothurOutEndLine(); } - - if (m->control_pressed) { count = 0; break; } - - if (count >= header.numReads) { break; } - //if (count >= 100) { break; } - } - - //report progress - if (!m->control_pressed) { if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } - - in.close(); - - if (sfftxt) { outSfftxt.close(); } - if (fasta) { outFasta.close(); } - if (qual) { outQual.close(); } - if (flow) { outFlow.close(); } - - if (split > 1) { - //create new common headers for each file with the correct number of reads - adjustCommonHeader(header); - - map::iterator it; - set namesToRemove; - for(int i=0;iisBlank(filehandles[i][j])){ - m->mothurRemove(filehandles[i][j]); - m->mothurRemove(filehandlesHeaders[i][j]); - namesToRemove.insert(filehandles[i][j]); - } - } - } - } - } - - //append new header to reads - for (int i = 0; i < filehandles.size(); i++) { - for (int j = 0; j < filehandles[i].size(); j++) { - m->appendFiles(filehandles[i][j], filehandlesHeaders[i][j]); - m->renameFile(filehandlesHeaders[i][j], filehandles[i][j]); - m->mothurRemove(filehandlesHeaders[i][j]); - if (numSplitReads[i][j] == 0) { m->mothurRemove(filehandles[i][j]); } - } - } - - //remove names for outputFileNames, just cleans up the output - for(int i = 0; i < outputNames.size(); i++) { - if (namesToRemove.count(outputNames[i]) != 0) { - outputNames.erase(outputNames.begin()+i); - i--; - } - } - - if(m->isBlank(noMatchFile)){ m->mothurRemove(noMatchFile); } - else { outputNames.push_back(noMatchFile); outputTypes["sff"].push_back(noMatchFile); } - } - - return count; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "extractSffInfo"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ - try { - - if (!in.eof()) { - - //read magic number - char buffer[4]; - in.read(buffer, 4); - header.magicNumber = be_int4(*(unsigned int *)(&buffer)); - - //read version - char buffer9[4]; - in.read(buffer9, 4); - header.version = ""; - for (int i = 0; i < 4; i++) { header.version += toString((int)(buffer9[i])); } - - //read offset - char buffer2 [8]; - in.read(buffer2, 8); - header.indexOffset = be_int8(*(unsigned long long *)(&buffer2)); - - //read index length - char buffer3 [4]; - in.read(buffer3, 4); - header.indexLength = be_int4(*(unsigned int *)(&buffer3)); - - //read num reads - char buffer4 [4]; - in.read(buffer4, 4); - header.numReads = be_int4(*(unsigned int *)(&buffer4)); - - if (m->debug) { m->mothurOut("[DEBUG]: numReads = " + toString(header.numReads) + "\n"); } - - //read header length - char buffer5 [2]; - in.read(buffer5, 2); - header.headerLength = be_int2(*(unsigned short *)(&buffer5)); - - //read key length - char buffer6 [2]; - in.read(buffer6, 2); - header.keyLength = be_int2(*(unsigned short *)(&buffer6)); - - //read number of flow reads - char buffer7 [2]; - in.read(buffer7, 2); - header.numFlowsPerRead = be_int2(*(unsigned short *)(&buffer7)); - - //read format code - char buffer8 [1]; - in.read(buffer8, 1); - header.flogramFormatCode = (int)(buffer8[0]); - - //read flow chars - char* tempBuffer = new char[header.numFlowsPerRead]; - in.read(&(*tempBuffer), header.numFlowsPerRead); - header.flowChars = tempBuffer; - if (header.flowChars.length() > header.numFlowsPerRead) { header.flowChars = header.flowChars.substr(0, header.numFlowsPerRead); } - delete[] tempBuffer; - - //read key - char* tempBuffer2 = new char[header.keyLength]; - in.read(&(*tempBuffer2), header.keyLength); - header.keySequence = tempBuffer2; - if (header.keySequence.length() > header.keyLength) { header.keySequence = header.keySequence.substr(0, header.keyLength); } - delete[] tempBuffer2; - - /* Pad to 8 chars */ - unsigned long long spotInFile = in.tellg(); - unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts - in.seekg(spot); - - }else{ - m->mothurOut("Error reading sff common header."); m->mothurOutEndLine(); - } - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readCommonHeader"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::adjustCommonHeader(CommonHeader header){ - try { - - char* mybuffer = new char[4]; - ifstream in; - in.open(currentFileName.c_str(), ios::binary); - - //magic number - in.read(mybuffer,4); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, in.gcount()); - out.close(); - } - } - delete[] mybuffer; - - //version - mybuffer = new char[4]; - in.read(mybuffer,4); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, in.gcount()); - out.close(); - } - } - delete[] mybuffer; - - //offset - mybuffer = new char[8]; - in.read(mybuffer,8); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - unsigned long long offset = 0; - char* thisbuffer = new char[8]; - thisbuffer[0] = (offset >> 56) & 0xFF; - thisbuffer[1] = (offset >> 48) & 0xFF; - thisbuffer[2] = (offset >> 40) & 0xFF; - thisbuffer[3] = (offset >> 32) & 0xFF; - thisbuffer[4] = (offset >> 24) & 0xFF; - thisbuffer[5] = (offset >> 16) & 0xFF; - thisbuffer[6] = (offset >> 8) & 0xFF; - thisbuffer[7] = offset & 0xFF; - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(thisbuffer, 8); - out.close(); - } - } - delete[] mybuffer; - - - //read index length - mybuffer = new char[4]; - in.read(mybuffer,4); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - unsigned int offset = 0; - char* thisbuffer = new char[4]; - thisbuffer[0] = (offset >> 24) & 0xFF; - thisbuffer[1] = (offset >> 16) & 0xFF; - thisbuffer[2] = (offset >> 8) & 0xFF; - thisbuffer[3] = offset & 0xFF; - out.write(thisbuffer, 4); - out.close(); - } - } - delete[] mybuffer; - - //change num reads - mybuffer = new char[4]; - in.read(mybuffer,4); - delete[] mybuffer; - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - //convert number of reads to 4 byte char* - char* thisbuffer = new char[4]; - if ((m->findEdianness()) == "BIG_ENDIAN") { - thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF; - thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF; - thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF; - thisbuffer[3] = numSplitReads[i][j] & 0xFF; - }else { - thisbuffer[0] = numSplitReads[i][j] & 0xFF; - thisbuffer[1] = (numSplitReads[i][j] >> 8) & 0xFF; - thisbuffer[2] = (numSplitReads[i][j] >> 16) & 0xFF; - thisbuffer[3] = (numSplitReads[i][j] >> 24) & 0xFF; - } - out.write(thisbuffer, 4); - out.close(); - delete[] thisbuffer; - } - } - - //read header length - mybuffer = new char[2]; - in.read(mybuffer,2); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, in.gcount()); - out.close(); - } - } - delete[] mybuffer; - - //read key length - mybuffer = new char[2]; - in.read(mybuffer,2); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, in.gcount()); - out.close(); - } - } - delete[] mybuffer; - - //read number of flow reads - mybuffer = new char[2]; - in.read(mybuffer,2); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, in.gcount()); - out.close(); - } - } - delete[] mybuffer; - - //read format code - mybuffer = new char[1]; - in.read(mybuffer,1); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, in.gcount()); - out.close(); - } - } - delete[] mybuffer; - - //read flow chars - mybuffer = new char[header.numFlowsPerRead]; - in.read(mybuffer,header.numFlowsPerRead); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, in.gcount()); - out.close(); - } - } - delete[] mybuffer; - - //read key - mybuffer = new char[header.keyLength]; - in.read(mybuffer,header.keyLength); - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, in.gcount()); - out.close(); - } - } - delete[] mybuffer; - - - /* Pad to 8 chars */ - unsigned long long spotInFile = in.tellg(); - unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts - in.seekg(spot); - - mybuffer = new char[spot-spotInFile]; - for (int i = 0; i < filehandlesHeaders.size(); i++) { - for (int j = 0; j < filehandlesHeaders[i].size(); j++) { - ofstream out; - m->openOutputFileBinaryAppend(filehandlesHeaders[i][j], out); - out.write(mybuffer, spot-spotInFile); - out.close(); - } - } - delete[] mybuffer; - in.close(); - return 0; - - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "adjustCommonHeader"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, Header& header){ - try { - unsigned long long startSpotInFile = in.tellg(); - if (!in.eof()) { - - /*****************************************/ - //read header - - //read header length - char buffer [2]; - in.read(buffer, 2); - header.headerLength = be_int2(*(unsigned short *)(&buffer)); - - //read name length - char buffer2 [2]; - in.read(buffer2, 2); - header.nameLength = be_int2(*(unsigned short *)(&buffer2)); - - //read num bases - char buffer3 [4]; - in.read(buffer3, 4); - header.numBases = be_int4(*(unsigned int *)(&buffer3)); - - - //read clip qual left - char buffer4 [2]; - in.read(buffer4, 2); - header.clipQualLeft = be_int2(*(unsigned short *)(&buffer4)); - header.clipQualLeft = 5; - - - //read clip qual right - char buffer5 [2]; - in.read(buffer5, 2); - header.clipQualRight = be_int2(*(unsigned short *)(&buffer5)); - - - //read clipAdapterLeft - char buffer6 [2]; - in.read(buffer6, 2); - header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6)); - - - //read clipAdapterRight - char buffer7 [2]; - in.read(buffer7, 2); - header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7)); - - - //read name - char* tempBuffer = new char[header.nameLength]; - in.read(&(*tempBuffer), header.nameLength); - header.name = tempBuffer; - if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength); } - - delete[] tempBuffer; - - //extract info from name - decodeName(header.timestamp, header.region, header.xy, header.name); - - /* Pad to 8 chars */ - unsigned long long spotInFile = in.tellg(); - unsigned long long spot = (spotInFile + 7)& ~7; - in.seekg(spot); - - /*****************************************/ - //sequence read - - //read flowgram - read.flowgram.resize(numFlowReads); - for (int i = 0; i < numFlowReads; i++) { - char buffer [2]; - in.read(buffer, 2); - read.flowgram[i] = be_int2(*(unsigned short *)(&buffer)); - } - - //read flowIndex - read.flowIndex.resize(header.numBases); - for (int i = 0; i < header.numBases; i++) { - char temp[1]; - in.read(temp, 1); - read.flowIndex[i] = be_int1(*(unsigned char *)(&temp)); - } - - //read bases - char* tempBuffer6 = new char[header.numBases]; - in.read(&(*tempBuffer6), header.numBases); - read.bases = tempBuffer6; - if (read.bases.length() > header.numBases) { read.bases = read.bases.substr(0, header.numBases); } - delete[] tempBuffer6; - - //read qual scores - read.qualScores.resize(header.numBases); - for (int i = 0; i < header.numBases; i++) { - char temp[1]; - in.read(temp, 1); - read.qualScores[i] = be_int1(*(unsigned char *)(&temp)); - } - - /* Pad to 8 chars */ - spotInFile = in.tellg(); - spot = (spotInFile + 7)& ~7; - in.seekg(spot); - - if (split > 1) { - char * mybuffer; - mybuffer = new char [spot-startSpotInFile]; - ifstream in2; - in2.open(currentFileName.c_str(), ios::binary); - in2.seekg(startSpotInFile); - in2.read(mybuffer,spot-startSpotInFile); - in2.close(); - - int barcodeIndex, primerIndex; - int trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex); - - if(trashCodeLength == 0){ - //cout << header.name << " length = " << spot << '\t' << startSpotInFile << '\t' << in2.gcount() << endl; - - ofstream out; - m->openOutputFileBinaryAppend(filehandles[barcodeIndex][primerIndex], out); - out.write(mybuffer, in2.gcount()); - out.close(); - numSplitReads[barcodeIndex][primerIndex]++; - } - else{ - ofstream out; - m->openOutputFileBinaryAppend(noMatchFile, out); - out.write(mybuffer, in2.gcount()); - out.close(); - } - delete[] mybuffer; - } - }else{ - m->mothurOut("Error reading."); m->mothurOutEndLine(); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readSeqData"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& primer) { - try { - //find group read belongs to - TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); - - int success = 1; - string trashCode = ""; - int currentSeqsDiffs = 0; - - string seq = read.bases; - - if (trim) { - if(header.clipQualRight < header.clipQualLeft){ - if (header.clipQualRight == 0) { //don't trim right - seq = seq.substr(header.clipQualLeft-1); - }else { - seq = "NNNN"; - } - } - else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ - seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft)); - } - else { - seq = seq.substr(header.clipQualLeft-1); - } - }else{ - //if you wanted the sfftxt then you already converted the bases to the right case - if (!sfftxt) { - int endValue = header.clipQualRight; - //make the bases you want to clip lowercase and the bases you want to keep upper case - if(endValue == 0){ endValue = seq.length(); } - for (int i = 0; i < (header.clipQualLeft-1); i++) { seq[i] = tolower(seq[i]); } - for (int i = (header.clipQualLeft-1); i < (endValue-1); i++) { seq[i] = toupper(seq[i]); } - for (int i = (endValue-1); i < seq.length(); i++) { seq[i] = tolower(seq[i]); } - } - } - - Sequence currSeq(header.name, seq); - QualityScores currQual; - - if(numLinkers != 0){ - success = trimOligos.stripLinker(currSeq, currQual); - if(success > ldiffs) { trashCode += 'k'; } - else{ currentSeqsDiffs += success; } - - } - - if(barcodes.size() != 0){ - success = trimOligos.stripBarcode(currSeq, currQual, barcode); - if(success > bdiffs) { trashCode += 'b'; } - else{ currentSeqsDiffs += success; } - } - - if(numSpacers != 0){ - success = trimOligos.stripSpacer(currSeq, currQual); - if(success > sdiffs) { trashCode += 's'; } - else{ currentSeqsDiffs += success; } - - } - - if(numFPrimers != 0){ - success = trimOligos.stripForward(currSeq, currQual, primer, true); - if(success > pdiffs) { trashCode += 'f'; } - else{ currentSeqsDiffs += success; } - } - - if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } - - if(revPrimer.size() != 0){ - success = trimOligos.stripReverse(currSeq, currQual); - if(!success) { trashCode += 'r'; } - } - - - return trashCode.length(); - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "findGroup"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::decodeName(string& timestamp, string& region, string& xy, string name) { - try { - - if (name.length() >= 6) { - string time = name.substr(0, 6); - unsigned int timeNum = m->fromBase36(time); - - int q1 = timeNum / 60; - int sec = timeNum - 60 * q1; - int q2 = q1 / 60; - int minute = q1 - 60 * q2; - int q3 = q2 / 24; - int hr = q2 - 24 * q3; - int q4 = q3 / 32; - int day = q3 - 32 * q4; - int q5 = q4 / 13; - int mon = q4 - 13 * q5; - int year = 2000 + q5; - - timestamp = toString(year) + "_" + toString(mon) + "_" + toString(day) + "_" + toString(hr) + "_" + toString(minute) + "_" + toString(sec); - } - - if (name.length() >= 9) { - region = name.substr(7, 2); - - string xyNum = name.substr(9); - unsigned int myXy = m->fromBase36(xyNum); - int x = myXy >> 12; - int y = myXy & 4095; - - xy = toString(x) + "_" + toString(y); - } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "decodeName"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::printCommonHeader(ofstream& out, CommonHeader& header) { - try { - - out << "Common Header:\nMagic Number: " << header.magicNumber << endl; - out << "Version: " << header.version << endl; - out << "Index Offset: " << header.indexOffset << endl; - out << "Index Length: " << header.indexLength << endl; - out << "Number of Reads: " << header.numReads << endl; - out << "Header Length: " << header.headerLength << endl; - out << "Key Length: " << header.keyLength << endl; - out << "Number of Flows: " << header.numFlowsPerRead << endl; - out << "Format Code: " << header.flogramFormatCode << endl; - out << "Flow Chars: " << header.flowChars << endl; - out << "Key Sequence: " << header.keySequence << endl << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printCommonHeader"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::printHeader(ofstream& out, Header& header) { - try { - - out << ">" << header.name << endl; - out << "Run Prefix: " << header.timestamp << endl; - out << "Region #: " << header.region << endl; - out << "XY Location: " << header.xy << endl << endl; - - out << "Run Name: " << endl; - out << "Analysis Name: " << endl; - out << "Full Path: " << endl << endl; - - out << "Read Header Len: " << header.headerLength << endl; - out << "Name Length: " << header.nameLength << endl; - out << "# of Bases: " << header.numBases << endl; - out << "Clip Qual Left: " << header.clipQualLeft << endl; - out << "Clip Qual Right: " << header.clipQualRight << endl; - out << "Clip Adap Left: " << header.clipAdapterLeft << endl; - out << "Clip Adap Right: " << header.clipAdapterRight << endl << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printHeader"); - exit(1); - } -} -//********************************************************************************************************************** -bool SffInfoCommand::sanityCheck(Header& header, seqRead& read) { - try { - bool okay = true; - string message = "[WARNING]: Your sff file may be corrupted! Sequence: " + header.name + "\n"; - - if (header.clipQualLeft > read.bases.length()) { - okay = false; message += "Clip Qual Left = " + toString(header.clipQualLeft) + ", but we only read " + toString(read.bases.length()) + " bases.\n"; - } - if (header.clipQualRight > read.bases.length()) { - okay = false; message += "Clip Qual Right = " + toString(header.clipQualRight) + ", but we only read " + toString(read.bases.length()) + " bases.\n"; - } - if (header.clipQualLeft > read.qualScores.size()) { - okay = false; message += "Clip Qual Left = " + toString(header.clipQualLeft) + ", but we only read " + toString(read.qualScores.size()) + " quality scores.\n"; - } - if (header.clipQualRight > read.qualScores.size()) { - okay = false; message += "Clip Qual Right = " + toString(header.clipQualRight) + ", but we only read " + toString(read.qualScores.size()) + " quality scores.\n"; - } - - if (okay == false) { - m->mothurOut(message); m->mothurOutEndLine(); - } - - return okay; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "sanityCheck"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::printSffTxtSeqData(ofstream& out, seqRead& read, Header& header) { - try { - out << "Flowgram: "; - for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t'; } - - out << endl << "Flow Indexes: "; - int sum = 0; - for (int i = 0; i < read.flowIndex.size(); i++) { sum += read.flowIndex[i]; out << sum << '\t'; } - - //make the bases you want to clip lowercase and the bases you want to keep upper case - int endValue = header.clipQualRight; - if(endValue == 0){ endValue = read.bases.length(); } - for (int i = 0; i < (header.clipQualLeft-1); i++) { read.bases[i] = tolower(read.bases[i]); } - for (int i = (header.clipQualLeft-1); i < (endValue-1); i++) { read.bases[i] = toupper(read.bases[i]); } - for (int i = (endValue-1); i < read.bases.length(); i++) { read.bases[i] = tolower(read.bases[i]); } - - out << endl << "Bases: " << read.bases << endl << "Quality Scores: "; - for (int i = 0; i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } - - - out << endl << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printSffTxtSeqData"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::printFastaSeqData(ofstream& out, seqRead& read, Header& header) { - try { - string seq = read.bases; - - if (trim) { - if(header.clipQualRight < header.clipQualLeft){ - if (header.clipQualRight == 0) { //don't trim right - seq = seq.substr(header.clipQualLeft-1); - }else { - seq = "NNNN"; - } - } - else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ - seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft)); - } - else { - seq = seq.substr(header.clipQualLeft-1); - } - }else{ - //if you wanted the sfftxt then you already converted the bases to the right case - if (!sfftxt) { - int endValue = header.clipQualRight; - //make the bases you want to clip lowercase and the bases you want to keep upper case - if(endValue == 0){ endValue = seq.length(); } - for (int i = 0; i < (header.clipQualLeft-1); i++) { seq[i] = tolower(seq[i]); } - for (int i = (header.clipQualLeft-1); i < (endValue-1); i++) { seq[i] = toupper(seq[i]); } - for (int i = (endValue-1); i < seq.length(); i++) { seq[i] = tolower(seq[i]); } - } - } - - out << ">" << header.name << " xy=" << header.xy << endl; - out << seq << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printFastaSeqData"); - exit(1); - } -} - -//********************************************************************************************************************** -int SffInfoCommand::printQualSeqData(ofstream& out, seqRead& read, Header& header) { - try { - - if (trim) { - if(header.clipQualRight < header.clipQualLeft){ - if (header.clipQualRight == 0) { //don't trim right - out << ">" << header.name << " xy=" << header.xy << " length=" << (read.qualScores.size()-header.clipQualLeft) << endl; - for (int i = (header.clipQualLeft-1); i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } - }else { - out << ">" << header.name << " xy=" << header.xy << endl; - out << "0\t0\t0\t0"; - } - } - else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ - out << ">" << header.name << " xy=" << header.xy << " length=" << (header.clipQualRight-header.clipQualLeft) << endl; - for (int i = (header.clipQualLeft-1); i < (header.clipQualRight-1); i++) { out << read.qualScores[i] << '\t'; } - } - else{ - out << ">" << header.name << " xy=" << header.xy << " length=" << (header.clipQualRight-header.clipQualLeft) << endl; - for (int i = (header.clipQualLeft-1); i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } - } - }else{ - out << ">" << header.name << " xy=" << header.xy << " length=" << read.qualScores.size() << endl; - for (int i = 0; i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } - } - - out << endl; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printQualSeqData"); - exit(1); - } -} - -//********************************************************************************************************************** -int SffInfoCommand::printFlowSeqData(ofstream& out, seqRead& read, Header& header) { - try { - - int endValue = header.clipQualRight; - if (header.clipQualRight == 0) { - endValue = read.flowIndex.size(); - if (m->debug) { m->mothurOut("[DEBUG]: " + header.name + " has clipQualRight=0.\n"); } - } - if(endValue > header.clipQualLeft){ - - int rightIndex = 0; - for (int i = 0; i < endValue; i++) { rightIndex += read.flowIndex[i]; } - - out << header.name << ' ' << rightIndex; - for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << ' ' << (read.flowgram[i]/(float)100); } - out << endl; - } - - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "printFlowSeqData"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::readAccnosFile(string filename) { - try { - //remove old names - seqNames.clear(); - - ifstream in; - m->openInputFile(filename, in); - string name; - - while(!in.eof()){ - in >> name; m->gobble(in); - - seqNames.insert(name); - - if (m->control_pressed) { seqNames.clear(); break; } - } - in.close(); - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readAccnosFile"); - exit(1); - } -} -//********************************************************************************************************************** -int SffInfoCommand::parseSffTxt() { - try { - - ifstream inSFF; - m->openInputFile(sfftxtFilename, inSFF); - - if (outputDir == "") { outputDir += m->hasPath(sfftxtFilename); } - - //output file names - ofstream outFasta, outQual, outFlow; - string outFastaFileName, outQualFileName; - string fileRoot = m->getRootName(m->getSimpleName(sfftxtFilename)); - if (fileRoot.length() > 0) { - //rip off last . - fileRoot = fileRoot.substr(0, fileRoot.length()-1); - fileRoot = m->getRootName(fileRoot); - } - - map variables; - variables["[filename]"] = fileRoot; - string sfftxtFileName = getOutputFileName("sfftxt",variables); - string outFlowFileName = getOutputFileName("flow",variables); - if (!trim) { variables["[tag]"] = "raw"; } - outFastaFileName = getOutputFileName("fasta",variables); - outQualFileName = getOutputFileName("qfile",variables); - - if (fasta) { m->openOutputFile(outFastaFileName, outFasta); outputNames.push_back(outFastaFileName); outputTypes["fasta"].push_back(outFastaFileName); } - if (qual) { m->openOutputFile(outQualFileName, outQual); outputNames.push_back(outQualFileName); outputTypes["qfile"].push_back(outQualFileName); } - if (flow) { m->openOutputFile(outFlowFileName, outFlow); outputNames.push_back(outFlowFileName); outFlow.setf(ios::fixed, ios::floatfield); outFlow.setf(ios::showpoint); outputTypes["flow"].push_back(outFlowFileName); } - - //read common header - string commonHeader = m->getline(inSFF); - string magicNumber = m->getline(inSFF); - string version = m->getline(inSFF); - string indexOffset = m->getline(inSFF); - string indexLength = m->getline(inSFF); - int numReads = parseHeaderLineToInt(inSFF); - string headerLength = m->getline(inSFF); - string keyLength = m->getline(inSFF); - int numFlows = parseHeaderLineToInt(inSFF); - string flowgramCode = m->getline(inSFF); - string flowChars = m->getline(inSFF); - string keySequence = m->getline(inSFF); - m->gobble(inSFF); - - string seqName; - - if (flow) { outFlow << numFlows << endl; } - - for(int i=0;imothurOut("[ERROR]: Expected " + toString(numReads) + " but reached end of file at " + toString(i+1) + "."); m->mothurOutEndLine(); break; } - - Header header; - - //parse read header - inSFF >> seqName; - seqName = seqName.substr(1); - m->gobble(inSFF); - header.name = seqName; - - string runPrefix = parseHeaderLineToString(inSFF); header.timestamp = runPrefix; - string regionNumber = parseHeaderLineToString(inSFF); header.region = regionNumber; - string xyLocation = parseHeaderLineToString(inSFF); header.xy = xyLocation; - m->gobble(inSFF); - - string runName = parseHeaderLineToString(inSFF); - string analysisName = parseHeaderLineToString(inSFF); - string fullPath = parseHeaderLineToString(inSFF); - m->gobble(inSFF); - - string readHeaderLen = parseHeaderLineToString(inSFF); convert(readHeaderLen, header.headerLength); - string nameLength = parseHeaderLineToString(inSFF); convert(nameLength, header.nameLength); - int numBases = parseHeaderLineToInt(inSFF); header.numBases = numBases; - string clipQualLeft = parseHeaderLineToString(inSFF); convert(clipQualLeft, header.clipQualLeft); - int clipQualRight = parseHeaderLineToInt(inSFF); header.clipQualRight = clipQualRight; - string clipAdapLeft = parseHeaderLineToString(inSFF); convert(clipAdapLeft, header.clipAdapterLeft); - string clipAdapRight = parseHeaderLineToString(inSFF); convert(clipAdapRight, header.clipAdapterRight); - m->gobble(inSFF); - - seqRead read; - - //parse read - vector flowVector = parseHeaderLineToFloatVector(inSFF, numFlows); read.flowgram = flowVector; - vector flowIndices = parseHeaderLineToIntVector(inSFF, numBases); - - //adjust for print - vector flowIndicesAdjusted; flowIndicesAdjusted.push_back(flowIndices[0]); - for (int j = 1; j < flowIndices.size(); j++) { flowIndicesAdjusted.push_back(flowIndices[j] - flowIndices[j-1]); } - read.flowIndex = flowIndicesAdjusted; - - string bases = parseHeaderLineToString(inSFF); read.bases = bases; - vector qualityScores = parseHeaderLineToIntVector(inSFF, numBases); read.qualScores = qualityScores; - m->gobble(inSFF); - - //if you have provided an accosfile and this seq is not in it, then dont print - bool print = true; - if (seqNames.size() != 0) { if (seqNames.count(header.name) == 0) { print = false; } } - - //print - if (print) { - if (fasta) { printFastaSeqData(outFasta, read, header); } - if (qual) { printQualSeqData(outQual, read, header); } - if (flow) { printFlowSeqData(outFlow, read, header); } - } - - //report progress - if((i+1) % 10000 == 0){ m->mothurOut(toString(i+1)); m->mothurOutEndLine(); } - - if (m->control_pressed) { break; } - } - - //report progress - if (!m->control_pressed) { if((numReads) % 10000 != 0){ m->mothurOut(toString(numReads)); m->mothurOutEndLine(); } } - - inSFF.close(); - - if (fasta) { outFasta.close(); } - if (qual) { outQual.close(); } - if (flow) { outFlow.close(); } - - return 0; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "parseSffTxt"); - exit(1); - } -} -//********************************************************************************************************************** - -int SffInfoCommand::parseHeaderLineToInt(ifstream& file){ - try { - int number; - - while (!file.eof()) { - - char c = file.get(); - if (c == ':'){ - file >> number; - break; - } - - } - m->gobble(file); - return number; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "parseHeaderLineToInt"); - exit(1); - } - -} - -//********************************************************************************************************************** - -string SffInfoCommand::parseHeaderLineToString(ifstream& file){ - try { - string text; - - while (!file.eof()) { - char c = file.get(); - - if (c == ':'){ - //m->gobble(file); - //text = m->getline(file); - file >> text; - break; - } - } - m->gobble(file); - - return text; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "parseHeaderLineToString"); - exit(1); - } -} - -//********************************************************************************************************************** - -vector SffInfoCommand::parseHeaderLineToFloatVector(ifstream& file, int length){ - try { - vector floatVector(length); - - while (!file.eof()) { - char c = file.get(); - if (c == ':'){ - float temp; - for(int i=0;i> temp; - floatVector[i] = temp * 100; - } - break; - } - } - m->gobble(file); - return floatVector; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "parseHeaderLineToFloatVector"); - exit(1); - } -} - -//********************************************************************************************************************** - -vector SffInfoCommand::parseHeaderLineToIntVector(ifstream& file, int length){ - try { - vector intVector(length); - - while (!file.eof()) { - char c = file.get(); - if (c == ':'){ - for(int i=0;i> intVector[i]; - } - break; - } - } - m->gobble(file); - return intVector; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "parseHeaderLineToIntVector"); - exit(1); - } -} -//*************************************************************************************************************** - -bool SffInfoCommand::readOligos(string oligoFile){ - try { - filehandles.clear(); - numSplitReads.clear(); - filehandlesHeaders.clear(); - - ifstream inOligos; - m->openInputFile(oligoFile, inOligos); - - string type, oligo, group; - - int indexPrimer = 0; - int indexBarcode = 0; - - while(!inOligos.eof()){ - - inOligos >> type; - - if(type[0] == '#'){ - while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there - m->gobble(inOligos); - } - else{ - m->gobble(inOligos); - //make type case insensitive - for(int i=0;i> oligo; - - for(int i=0;i::iterator itPrime = primers.find(oligo); - if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } - - primers[oligo]=indexPrimer; indexPrimer++; - primerNameVector.push_back(group); - }else if(type == "REVERSE"){ - //Sequence oligoRC("reverse", oligo); - //oligoRC.reverseComplement(); - string oligoRC = reverseOligo(oligo); - revPrimer.push_back(oligoRC); - } - else if(type == "BARCODE"){ - inOligos >> group; - - //check for repeat barcodes - map::iterator itBar = barcodes.find(oligo); - if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } - - barcodes[oligo]=indexBarcode; indexBarcode++; - barcodeNameVector.push_back(group); - }else if(type == "LINKER"){ - linker.push_back(oligo); - }else if(type == "SPACER"){ - spacer.push_back(oligo); - } - else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } - } - m->gobble(inOligos); - } - inOligos.close(); - - if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ split = 1; } - - //add in potential combos - if(barcodeNameVector.size() == 0){ - barcodes[""] = 0; - barcodeNameVector.push_back(""); - } - - if(primerNameVector.size() == 0){ - primers[""] = 0; - primerNameVector.push_back(""); - } - - filehandles.resize(barcodeNameVector.size()); - for(int i=0;i 1){ - set uniqueNames; //used to cleanup outputFileNames - for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ - for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ - - string primerName = primerNameVector[itPrimer->second]; - string barcodeName = barcodeNameVector[itBar->second]; - - string comboGroupName = ""; - string fastaFileName = ""; - string qualFileName = ""; - string nameFileName = ""; - - if(primerName == ""){ - comboGroupName = barcodeNameVector[itBar->second]; - } - else{ - if(barcodeName == ""){ - comboGroupName = primerNameVector[itPrimer->second]; - } - else{ - comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second]; - } - } - - ofstream temp; - map variables; - variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName)); - variables["[group]"] = comboGroupName; - string thisFilename = getOutputFileName("sff",variables); - if (uniqueNames.count(thisFilename) == 0) { - outputNames.push_back(thisFilename); - outputTypes["sff"].push_back(thisFilename); - uniqueNames.insert(thisFilename); - } - - filehandles[itBar->second][itPrimer->second] = thisFilename; - temp.open(thisFilename.c_str(), ios::binary); temp.close(); - } - } - } - numFPrimers = primers.size(); - numLinkers = linker.size(); - numSpacers = spacer.size(); - map variables; - variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName)); - variables["[group]"] = "scrap"; - noMatchFile = getOutputFileName("sff",variables); - m->mothurRemove(noMatchFile); - - bool allBlank = true; - for (int i = 0; i < barcodeNameVector.size(); i++) { - if (barcodeNameVector[i] != "") { - allBlank = false; - break; - } - } - for (int i = 0; i < primerNameVector.size(); i++) { - if (primerNameVector[i] != "") { - allBlank = false; - break; - } - } - - filehandlesHeaders.resize(filehandles.size()); - numSplitReads.resize(filehandles.size()); - for (int i = 0; i < filehandles.size(); i++) { - numSplitReads[i].resize(filehandles[i].size(), 0); - for (int j = 0; j < filehandles[i].size(); j++) { - filehandlesHeaders[i].push_back(filehandles[i][j]+"headers"); - } - } - - if (allBlank) { - m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a split the sff file."); m->mothurOutEndLine(); - split = 1; - return false; - } - - return true; - - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "readOligos"); - exit(1); - } -} -//********************************************************************/ -string SffInfoCommand::reverseOligo(string oligo){ - try { - string reverse = ""; - - for(int i=oligo.length()-1;i>=0;i--){ - - if(oligo[i] == 'A') { reverse += 'T'; } - else if(oligo[i] == 'T'){ reverse += 'A'; } - else if(oligo[i] == 'U'){ reverse += 'A'; } - - else if(oligo[i] == 'G'){ reverse += 'C'; } - else if(oligo[i] == 'C'){ reverse += 'G'; } - - else if(oligo[i] == 'R'){ reverse += 'Y'; } - else if(oligo[i] == 'Y'){ reverse += 'R'; } - - else if(oligo[i] == 'M'){ reverse += 'K'; } - else if(oligo[i] == 'K'){ reverse += 'M'; } - - else if(oligo[i] == 'W'){ reverse += 'W'; } - else if(oligo[i] == 'S'){ reverse += 'S'; } - - else if(oligo[i] == 'B'){ reverse += 'V'; } - else if(oligo[i] == 'V'){ reverse += 'B'; } - - else if(oligo[i] == 'D'){ reverse += 'H'; } - else if(oligo[i] == 'H'){ reverse += 'D'; } - - else { reverse += 'N'; } - } - - - return reverse; - } - catch(exception& e) { - m->errorOut(e, "SffInfoCommand", "reverseOligo"); - exit(1); - } -} - -//********************************************************************************************************************** - - - - +/* + * sffinfocommand.cpp + * Mothur + * + * Created by westcott on 7/7/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "sffinfocommand.h" +#include "endiannessmacros.h" +#include "trimoligos.h" +#include "sequence.hpp" +#include "qualityscores.h" + +//********************************************************************************************************************** +vector SffInfoCommand::setParameters(){ + try { + CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(psff); + CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(poligos); + CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(paccnos); + CommandParameter psfftxt("sfftxt", "String", "", "", "", "", "","",false,false); parameters.push_back(psfftxt); + CommandParameter pflow("flow", "Boolean", "", "T", "", "", "","flow",false,false); parameters.push_back(pflow); + CommandParameter ptrim("trim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(ptrim); + CommandParameter pfasta("fasta", "Boolean", "", "T", "", "", "","fasta",false,false); parameters.push_back(pfasta); + CommandParameter pqfile("qfile", "Boolean", "", "T", "", "", "","qfile",false,false); parameters.push_back(pqfile); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ppdiffs); + CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pbdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string SffInfoCommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The sffinfo command reads a sff file and extracts the sequence data, or you can use it to parse a sfftxt file.\n"; + helpString += "The sffinfo command parameters are sff, fasta, qfile, accnos, flow, sfftxt, oligos, bdiffs, tdiffs, ldiffs, sdiffs, pdiffs and trim. sff is required. \n"; + helpString += "The sff parameter allows you to enter the sff file you would like to extract data from. You may enter multiple files by separating them by -'s.\n"; + helpString += "The fasta parameter allows you to indicate if you would like a fasta formatted file generated. Default=True. \n"; + helpString += "The qfile parameter allows you to indicate if you would like a quality file generated. Default=True. \n"; + helpString += "The oligos parameter allows you to provide an oligos file to split your sff file into separate sff files by barcode. \n"; + helpString += "The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs + sdiffs + ldiffs.\n"; + helpString += "The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; + helpString += "The ldiffs parameter is used to specify the number of differences allowed in the linker. The default is 0.\n"; + helpString += "The sdiffs parameter is used to specify the number of differences allowed in the spacer. The default is 0.\n"; + helpString += "The flow parameter allows you to indicate if you would like a flowgram file generated. Default=True. \n"; + helpString += "The sfftxt parameter allows you to indicate if you would like a sff.txt file generated. Default=False. \n"; + helpString += "If you want to parse an existing sfftxt file into flow, fasta and quality file, enter the file name using the sfftxt parameter. \n"; + helpString += "The trim parameter allows you to indicate if you would like a sequences and quality scores trimmed to the clipQualLeft and clipQualRight values. Default=True. \n"; + helpString += "The accnos parameter allows you to provide a accnos file containing the names of the sequences you would like extracted. You may enter multiple files by separating them by -'s. \n"; + helpString += "Example sffinfo(sff=mySffFile.sff, trim=F).\n"; + helpString += "Note: No spaces between parameter labels (i.e. sff), '=' and parameters (i.e.yourSffFileName).\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "getHelpString"); + exit(1); + } +} + +//********************************************************************************************************************** +string SffInfoCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "fasta") { pattern = "[filename],fasta-[filename],[tag],fasta"; } + else if (type == "flow") { pattern = "[filename],flow"; } + else if (type == "sfftxt") { pattern = "[filename],sff.txt"; } + else if (type == "sff") { pattern = "[filename],[group],sff"; } + else if (type == "qfile") { pattern = "[filename],qual-[filename],[tag],qual"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** +SffInfoCommand::SffInfoCommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["flow"] = tempOutNames; + outputTypes["sfftxt"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; + outputTypes["sff"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "SffInfoCommand"); + exit(1); + } +} +//********************************************************************************************************************** + +SffInfoCommand::SffInfoCommand(string option) { + try { + abort = false; calledHelp = false; + hasAccnos = false; hasOligos = false; + split = 1; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + //valid paramters for this command + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["flow"] = tempOutNames; + outputTypes["sfftxt"] = tempOutNames; + outputTypes["qfile"] = tempOutNames; + outputTypes["sff"] = tempOutNames; + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } + + sffFilename = validParameter.validFile(parameters, "sff", false); + if (sffFilename == "not found") { sffFilename = ""; } + else { + m->splitAtDash(sffFilename, filenames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < filenames.size(); i++) { + bool ignore = false; + if (filenames[i] == "current") { + filenames[i] = m->getSFFFile(); + if (filenames[i] != "") { m->mothurOut("Using " + filenames[i] + " as input file for the sff parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current sfffile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + filenames.erase(filenames.begin()+i); + i--; + } + } + + if (!ignore) { + if (inputDir != "") { + string path = m->hasPath(filenames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { filenames[i] = inputDir + filenames[i]; } + } + + ifstream in; + int ableToOpen = m->openInputFile(filenames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(filenames[i]); + m->mothurOut("Unable to open " + filenames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + filenames[i] = tryPath; + } + } + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(filenames[i]); + m->mothurOut("Unable to open " + filenames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + filenames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + filenames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + filenames.erase(filenames.begin()+i); + i--; + }else { m->setSFFFile(filenames[i]); } + } + } + + //make sure there is at least one valid file left + if (filenames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } + } + + accnosName = validParameter.validFile(parameters, "accnos", false); + if (accnosName == "not found") { accnosName = ""; } + else { + hasAccnos = true; + m->splitAtDash(accnosName, accnosFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < accnosFileNames.size(); i++) { + bool ignore = false; + if (accnosFileNames[i] == "current") { + accnosFileNames[i] = m->getAccnosFile(); + if (accnosFileNames[i] != "") { m->mothurOut("Using " + accnosFileNames[i] + " as input file for the accnos parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current accnosfile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + accnosFileNames.erase(accnosFileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(accnosFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { accnosFileNames[i] = inputDir + accnosFileNames[i]; } + } + + ifstream in; + int ableToOpen = m->openInputFile(accnosFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(accnosFileNames[i]); + m->mothurOut("Unable to open " + accnosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + accnosFileNames[i] = tryPath; + } + } + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(accnosFileNames[i]); + m->mothurOut("Unable to open " + accnosFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + accnosFileNames[i] = tryPath; + } + } + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + accnosFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + accnosFileNames.erase(accnosFileNames.begin()+i); + i--; + } + } + } + + //make sure there is at least one valid file left + if (accnosFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } + } + + oligosfile = validParameter.validFile(parameters, "oligos", false); + if (oligosfile == "not found") { oligosfile = ""; } + else { + hasOligos = true; + m->splitAtDash(oligosfile, oligosFileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < oligosFileNames.size(); i++) { + bool ignore = false; + if (oligosFileNames[i] == "current") { + oligosFileNames[i] = m->getOligosFile(); + if (oligosFileNames[i] != "") { m->mothurOut("Using " + oligosFileNames[i] + " as input file for the accnos parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current oligosfile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + oligosFileNames.erase(oligosFileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(oligosFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { oligosFileNames[i] = inputDir + oligosFileNames[i]; } + } + + ifstream in; + int ableToOpen = m->openInputFile(oligosFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(oligosFileNames[i]); + m->mothurOut("Unable to open " + oligosFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + oligosFileNames[i] = tryPath; + } + } + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(oligosFileNames[i]); + m->mothurOut("Unable to open " + oligosFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + oligosFileNames[i] = tryPath; + } + } + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + oligosFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + oligosFileNames.erase(oligosFileNames.begin()+i); + i--; + } + } + } + + //make sure there is at least one valid file left + if (oligosFileNames.size() == 0) { m->mothurOut("no valid oligos files."); m->mothurOutEndLine(); abort = true; } + } + + if (hasOligos) { + split = 2; + if (oligosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a oligos file, you must have one for each sff file."); m->mothurOutEndLine(); } + } + + if (hasAccnos) { + if (accnosFileNames.size() != filenames.size()) { abort = true; m->mothurOut("If you provide a accnos file, you must have one for each sff file."); m->mothurOutEndLine(); } + } + + string temp = validParameter.validFile(parameters, "qfile", false); if (temp == "not found"){ temp = "T"; } + qual = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "fasta", false); if (temp == "not found"){ temp = "T"; } + fasta = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "flow", false); if (temp == "not found"){ temp = "T"; } + flow = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "trim", false); if (temp == "not found"){ temp = "T"; } + trim = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "ldiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, ldiffs); + + temp = validParameter.validFile(parameters, "sdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, sdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs + ldiffs + sdiffs; temp = toString(tempTotal); } + m->mothurConvert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs + ldiffs + sdiffs; } + + temp = validParameter.validFile(parameters, "sfftxt", false); + if (temp == "not found") { temp = "F"; sfftxt = false; sfftxtFilename = ""; } + else if (m->isTrue(temp)) { sfftxt = true; sfftxtFilename = ""; } + else { + //you are a filename + if (inputDir != "") { + map::iterator it = parameters.find("sfftxt"); + //user has given a template file + if(it != parameters.end()){ + string path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["sfftxt"] = inputDir + it->second; } + } + } + + sfftxtFilename = validParameter.validFile(parameters, "sfftxt", true); + if (sfftxtFilename == "not found") { sfftxtFilename = ""; } + else if (sfftxtFilename == "not open") { sfftxtFilename = ""; } + } + + if ((sfftxtFilename == "") && (filenames.size() == 0)) { + //if there is a current sff file, use it + string filename = m->getSFFFile(); + if (filename != "") { filenames.push_back(filename); m->mothurOut("Using " + filename + " as input file for the sff parameter."); m->mothurOutEndLine(); } + else { m->mothurOut("[ERROR]: you must provide a valid sff or sfftxt file."); m->mothurOutEndLine(); abort=true; } + } + + + } + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "SffInfoCommand"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::execute(){ + try { + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + for (int s = 0; s < filenames.size(); s++) { + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + int start = time(NULL); + + filenames[s] = m->getFullPathName(filenames[s]); + m->mothurOut("Extracting info from " + filenames[s] + " ..." ); m->mothurOutEndLine(); + + string accnos = ""; + if (hasAccnos) { accnos = accnosFileNames[s]; } + + string oligos = ""; + if (hasOligos) { oligos = oligosFileNames[s]; } + + int numReads = extractSffInfo(filenames[s], accnos, oligos); + + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to extract " + toString(numReads) + "."); + } + + if (sfftxtFilename != "") { parseSffTxt(); } + + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + + //set fasta file as new current fastafile + string current = ""; + itTypes = outputTypes.find("fasta"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } + } + + itTypes = outputTypes.find("qfile"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } + } + + itTypes = outputTypes.find("flow"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFlowFile(current); } + } + + //report output filenames + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::extractSffInfo(string input, string accnos, string oligos){ + try { + currentFileName = input; + if (outputDir == "") { outputDir += m->hasPath(input); } + + if (accnos != "") { readAccnosFile(accnos); } + else { seqNames.clear(); } + + if (oligos != "") { readOligos(oligos); split = 2; } + + ofstream outSfftxt, outFasta, outQual, outFlow; + string outFastaFileName, outQualFileName; + string rootName = outputDir + m->getRootName(m->getSimpleName(input)); + if(rootName.find_last_of(".") == rootName.npos){ rootName += "."; } + + map variables; + variables["[filename]"] = rootName; + string sfftxtFileName = getOutputFileName("sfftxt",variables); + string outFlowFileName = getOutputFileName("flow",variables); + if (!trim) { variables["[tag]"] = "raw"; } + outFastaFileName = getOutputFileName("fasta",variables); + outQualFileName = getOutputFileName("qfile",variables); + + if (sfftxt) { m->openOutputFile(sfftxtFileName, outSfftxt); outSfftxt.setf(ios::fixed, ios::floatfield); outSfftxt.setf(ios::showpoint); outputNames.push_back(sfftxtFileName); outputTypes["sfftxt"].push_back(sfftxtFileName); } + if (fasta) { m->openOutputFile(outFastaFileName, outFasta); outputNames.push_back(outFastaFileName); outputTypes["fasta"].push_back(outFastaFileName); } + if (qual) { m->openOutputFile(outQualFileName, outQual); outputNames.push_back(outQualFileName); outputTypes["qfile"].push_back(outQualFileName); } + if (flow) { m->openOutputFile(outFlowFileName, outFlow); outputNames.push_back(outFlowFileName); outFlow.setf(ios::fixed, ios::floatfield); outFlow.setf(ios::showpoint); outputTypes["flow"].push_back(outFlowFileName); } + + ifstream in; + m->openInputFileBinary(input, in); + + CommonHeader header; + readCommonHeader(in, header); + + int count = 0; + + //check magic number and version + if (header.magicNumber != 779314790) { m->mothurOut("Magic Number is not correct, not a valid .sff file"); m->mothurOutEndLine(); return count; } + if (header.version != "0001") { m->mothurOut("Version is not supported, only support version 0001."); m->mothurOutEndLine(); return count; } + + //print common header + if (sfftxt) { printCommonHeader(outSfftxt, header); } + if (flow) { outFlow << header.numFlowsPerRead << endl; } + + //read through the sff file + while (!in.eof()) { + + bool print = true; + + //read data + seqRead read; Header readheader; + readSeqData(in, read, header.numFlowsPerRead, readheader); + + bool okay = sanityCheck(readheader, read); + if (!okay) { break; } + + //if you have provided an accosfile and this seq is not in it, then dont print + if (seqNames.size() != 0) { if (seqNames.count(readheader.name) == 0) { print = false; } } + + //print + if (print) { + if (sfftxt) { printHeader(outSfftxt, readheader); printSffTxtSeqData(outSfftxt, read, readheader); } + if (fasta) { printFastaSeqData(outFasta, read, readheader); } + if (qual) { printQualSeqData(outQual, read, readheader); } + if (flow) { printFlowSeqData(outFlow, read, readheader); } + } + + count++; + + //report progress + if((count+1) % 10000 == 0){ m->mothurOut(toString(count+1)); m->mothurOutEndLine(); } + + if (m->control_pressed) { count = 0; break; } + + if (count >= header.numReads) { break; } + } + + //report progress + if (!m->control_pressed) { if((count) % 10000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } + + in.close(); + + if (sfftxt) { outSfftxt.close(); } + if (fasta) { outFasta.close(); } + if (qual) { outQual.close(); } + if (flow) { outFlow.close(); } + + if (split > 1) { + //create new common headers for each file with the correct number of reads + adjustCommonHeader(header); + + //close files and delete ofstreams + for(int i=0;isecond)->close(); delete (filehandles[i][j].begin()->second); + (filehandlesHeaders[i][j].begin()->second)->close(); delete (filehandlesHeaders[i][j].begin()->second); + } + } + + //cout << "here" << endl; + map::iterator it; + set namesToRemove; + for(int i=0;ifirst != "") { + if (namesToRemove.count(filehandles[i][j].begin()->first) == 0) { + if(m->isBlank(filehandles[i][j].begin()->first)){ + //cout << i << '\t' << '\t' << j << '\t' << filehandles[i][j] << " is blank removing" << endl; + m->mothurRemove(filehandles[i][j].begin()->first); + m->mothurRemove(filehandlesHeaders[i][j].begin()->first); + namesToRemove.insert(filehandles[i][j].begin()->first); + } + } + } + } + } + //cout << "here2" << endl; + //append new header to reads + for (int i = 0; i < filehandles.size(); i++) { + for (int j = 0; j < filehandles[i].size(); j++) { + m->appendBinaryFiles(filehandles[i][j].begin()->first, filehandlesHeaders[i][j].begin()->first); + m->renameFile(filehandlesHeaders[i][j].begin()->first, filehandles[i][j].begin()->first); + m->mothurRemove(filehandlesHeaders[i][j].begin()->first); + //cout << i << '\t' << '\t' << j << '\t' << filehandles[i][j] << " done appending headers and removing " << filehandlesHeaders[i][j] << endl; + if (numSplitReads[i][j] == 0) { m->mothurRemove(filehandles[i][j].begin()->first); } + } + } + //cout << "here3" << endl; + //remove names for outputFileNames, just cleans up the output + for(int i = 0; i < outputNames.size(); i++) { + if (namesToRemove.count(outputNames[i]) != 0) { + //cout << "erasing " << i << '\t' << outputNames[i] << endl; + outputNames.erase(outputNames.begin()+i); + i--; + } + } + //cout << "here4" << endl; + if(m->isBlank(noMatchFile)){ m->mothurRemove(noMatchFile); } + else { outputNames.push_back(noMatchFile); outputTypes["sff"].push_back(noMatchFile); } + } + + return count; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "extractSffInfo"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::readCommonHeader(ifstream& in, CommonHeader& header){ + try { + + if (!in.eof()) { + + //read magic number + char buffer[4]; + in.read(buffer, 4); + header.magicNumber = be_int4(*(unsigned int *)(&buffer)); + + //read version + char buffer9[4]; + in.read(buffer9, 4); + header.version = ""; + for (int i = 0; i < 4; i++) { header.version += toString((int)(buffer9[i])); } + + //read offset + char buffer2 [8]; + in.read(buffer2, 8); + header.indexOffset = be_int8(*(unsigned long long *)(&buffer2)); + + //read index length + char buffer3 [4]; + in.read(buffer3, 4); + header.indexLength = be_int4(*(unsigned int *)(&buffer3)); + + //read num reads + char buffer4 [4]; + in.read(buffer4, 4); + header.numReads = be_int4(*(unsigned int *)(&buffer4)); + + if (m->debug) { m->mothurOut("[DEBUG]: numReads = " + toString(header.numReads) + "\n"); } + + //read header length + char buffer5 [2]; + in.read(buffer5, 2); + header.headerLength = be_int2(*(unsigned short *)(&buffer5)); + + //read key length + char buffer6 [2]; + in.read(buffer6, 2); + header.keyLength = be_int2(*(unsigned short *)(&buffer6)); + + //read number of flow reads + char buffer7 [2]; + in.read(buffer7, 2); + header.numFlowsPerRead = be_int2(*(unsigned short *)(&buffer7)); + + //read format code + char buffer8 [1]; + in.read(buffer8, 1); + header.flogramFormatCode = (int)(buffer8[0]); + + //read flow chars + char* tempBuffer = new char[header.numFlowsPerRead]; + in.read(&(*tempBuffer), header.numFlowsPerRead); + header.flowChars = tempBuffer; + if (header.flowChars.length() > header.numFlowsPerRead) { header.flowChars = header.flowChars.substr(0, header.numFlowsPerRead); } + delete[] tempBuffer; + + //read key + char* tempBuffer2 = new char[header.keyLength]; + in.read(&(*tempBuffer2), header.keyLength); + header.keySequence = tempBuffer2; + if (header.keySequence.length() > header.keyLength) { header.keySequence = header.keySequence.substr(0, header.keyLength); } + delete[] tempBuffer2; + + /* Pad to 8 chars */ + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts + in.seekg(spot); + + }else{ + m->mothurOut("Error reading sff common header."); m->mothurOutEndLine(); + } + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::adjustCommonHeader(CommonHeader header){ + try { + string endian = m->findEdianness(); + char* mybuffer = new char[4]; + ifstream in; + m->openInputFileBinary(currentFileName, in); + + ofstream outNoMatchHeader; + string tempNoHeader = "tempNoMatchHeader"; + m->openOutputFileBinary(tempNoHeader, outNoMatchHeader); + + //magic number + in.read(mybuffer,4); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + } + } + outNoMatchHeader.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //version + mybuffer = new char[4]; + in.read(mybuffer,4); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + } + } + outNoMatchHeader.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //offset + mybuffer = new char[8]; + in.read(mybuffer,8); + unsigned long long offset = 0; + char* thisbuffer = new char[8]; + thisbuffer[0] = (offset >> 56) & 0xFF; + thisbuffer[1] = (offset >> 48) & 0xFF; + thisbuffer[2] = (offset >> 40) & 0xFF; + thisbuffer[3] = (offset >> 32) & 0xFF; + thisbuffer[4] = (offset >> 24) & 0xFF; + thisbuffer[5] = (offset >> 16) & 0xFF; + thisbuffer[6] = (offset >> 8) & 0xFF; + thisbuffer[7] = offset & 0xFF; + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(thisbuffer, 8); + } + } + outNoMatchHeader.write(thisbuffer, 8); + delete[] thisbuffer; + delete[] mybuffer; + + + //read index length + mybuffer = new char[4]; + in.read(mybuffer,4); + offset = 0; + char* thisbuffer2 = new char[4]; + thisbuffer2[0] = (offset >> 24) & 0xFF; + thisbuffer2[1] = (offset >> 16) & 0xFF; + thisbuffer2[2] = (offset >> 8) & 0xFF; + thisbuffer2[3] = offset & 0xFF; + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(thisbuffer2, 4); + } + } + outNoMatchHeader.write(thisbuffer2, 4); + delete[] thisbuffer2; + delete[] mybuffer; + + //change num reads + mybuffer = new char[4]; + in.read(mybuffer,4); + delete[] mybuffer; + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + char* thisbuffer = new char[4]; + if (endian == "BIG_ENDIAN") { + thisbuffer[0] = (numSplitReads[i][j] >> 24) & 0xFF; + thisbuffer[1] = (numSplitReads[i][j] >> 16) & 0xFF; + thisbuffer[2] = (numSplitReads[i][j] >> 8) & 0xFF; + thisbuffer[3] = numSplitReads[i][j] & 0xFF; + }else { + thisbuffer[0] = numSplitReads[i][j] & 0xFF; + thisbuffer[1] = (numSplitReads[i][j] >> 8) & 0xFF; + thisbuffer[2] = (numSplitReads[i][j] >> 16) & 0xFF; + thisbuffer[3] = (numSplitReads[i][j] >> 24) & 0xFF; + } + (*(filehandlesHeaders[i][j].begin()->second)).write(thisbuffer, 4); + delete[] thisbuffer; + } + } + char* thisbuffer3 = new char[4]; + if (endian == "BIG_ENDIAN") { + thisbuffer3[0] = (numNoMatch >> 24) & 0xFF; + thisbuffer3[1] = (numNoMatch >> 16) & 0xFF; + thisbuffer3[2] = (numNoMatch >> 8) & 0xFF; + thisbuffer3[3] = numNoMatch & 0xFF; + }else { + thisbuffer3[0] = numNoMatch & 0xFF; + thisbuffer3[1] = (numNoMatch >> 8) & 0xFF; + thisbuffer3[2] = (numNoMatch >> 16) & 0xFF; + thisbuffer3[3] = (numNoMatch >> 24) & 0xFF; + } + outNoMatchHeader.write(thisbuffer3, 4); + delete[] thisbuffer3; + + + //read header length + mybuffer = new char[2]; + in.read(mybuffer,2); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + } + } + outNoMatchHeader.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read key length + mybuffer = new char[2]; + in.read(mybuffer,2); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + } + } + outNoMatchHeader.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read number of flow reads + mybuffer = new char[2]; + in.read(mybuffer,2); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + } + } + outNoMatchHeader.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read format code + mybuffer = new char[1]; + in.read(mybuffer,1); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + } + } + outNoMatchHeader.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read flow chars + mybuffer = new char[header.numFlowsPerRead]; + in.read(mybuffer,header.numFlowsPerRead); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + } + } + outNoMatchHeader.write(mybuffer, in.gcount()); + delete[] mybuffer; + + //read key + mybuffer = new char[header.keyLength]; + in.read(mybuffer,header.keyLength); + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, in.gcount()); + } + } + outNoMatchHeader.write(mybuffer, in.gcount()); + delete[] mybuffer; + + + /* Pad to 8 chars */ + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; // ~ inverts + in.seekg(spot); + + mybuffer = new char[spot-spotInFile]; + for (int i = 0; i < filehandlesHeaders.size(); i++) { + for (int j = 0; j < filehandlesHeaders[i].size(); j++) { + (*(filehandlesHeaders[i][j].begin()->second)).write(mybuffer, spot-spotInFile); + } + } + outNoMatchHeader.write(mybuffer, spot-spotInFile); + outNoMatchHeader.close(); + delete[] mybuffer; + in.close(); + + m->appendBinaryFiles(noMatchFile, tempNoHeader); + m->renameFile(tempNoHeader, noMatchFile); + m->mothurRemove(tempNoHeader); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "adjustCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +bool SffInfoCommand::readSeqData(ifstream& in, seqRead& read, int numFlowReads, Header& header){ + try { + unsigned long long startSpotInFile = in.tellg(); + if (!in.eof()) { + + /*****************************************/ + //read header + + //read header length + char buffer [2]; + in.read(buffer, 2); + header.headerLength = be_int2(*(unsigned short *)(&buffer)); + + //read name length + char buffer2 [2]; + in.read(buffer2, 2); + header.nameLength = be_int2(*(unsigned short *)(&buffer2)); + + //read num bases + char buffer3 [4]; + in.read(buffer3, 4); + header.numBases = be_int4(*(unsigned int *)(&buffer3)); + + + //read clip qual left + char buffer4 [2]; + in.read(buffer4, 2); + header.clipQualLeft = be_int2(*(unsigned short *)(&buffer4)); + header.clipQualLeft = 5; + + + //read clip qual right + char buffer5 [2]; + in.read(buffer5, 2); + header.clipQualRight = be_int2(*(unsigned short *)(&buffer5)); + + + //read clipAdapterLeft + char buffer6 [2]; + in.read(buffer6, 2); + header.clipAdapterLeft = be_int2(*(unsigned short *)(&buffer6)); + + + //read clipAdapterRight + char buffer7 [2]; + in.read(buffer7, 2); + header.clipAdapterRight = be_int2(*(unsigned short *)(&buffer7)); + + + //read name + char* tempBuffer = new char[header.nameLength]; + in.read(&(*tempBuffer), header.nameLength); + header.name = tempBuffer; + if (header.name.length() > header.nameLength) { header.name = header.name.substr(0, header.nameLength); } + + delete[] tempBuffer; + + //extract info from name + decodeName(header.timestamp, header.region, header.xy, header.name); + + /* Pad to 8 chars */ + unsigned long long spotInFile = in.tellg(); + unsigned long long spot = (spotInFile + 7)& ~7; + in.seekg(spot); + + /*****************************************/ + //sequence read + + //read flowgram + read.flowgram.resize(numFlowReads); + for (int i = 0; i < numFlowReads; i++) { + char buffer [2]; + in.read(buffer, 2); + read.flowgram[i] = be_int2(*(unsigned short *)(&buffer)); + } + + //read flowIndex + read.flowIndex.resize(header.numBases); + for (int i = 0; i < header.numBases; i++) { + char temp[1]; + in.read(temp, 1); + read.flowIndex[i] = be_int1(*(unsigned char *)(&temp)); + } + + //read bases + char* tempBuffer6 = new char[header.numBases]; + in.read(&(*tempBuffer6), header.numBases); + read.bases = tempBuffer6; + if (read.bases.length() > header.numBases) { read.bases = read.bases.substr(0, header.numBases); } + delete[] tempBuffer6; + + //read qual scores + read.qualScores.resize(header.numBases); + for (int i = 0; i < header.numBases; i++) { + char temp[1]; + in.read(temp, 1); + read.qualScores[i] = be_int1(*(unsigned char *)(&temp)); + } + + /* Pad to 8 chars */ + spotInFile = in.tellg(); + spot = (spotInFile + 7)& ~7; + in.seekg(spot); + + if (split > 1) { + + int barcodeIndex, primerIndex; + int trashCodeLength = findGroup(header, read, barcodeIndex, primerIndex); + + char * mybuffer; + mybuffer = new char [spot-startSpotInFile]; + + ifstream in2; + m->openInputFileBinary(currentFileName, in2); + in2.seekg(startSpotInFile); + in2.read(mybuffer,spot-startSpotInFile); + + + if(trashCodeLength == 0){ + (*(filehandles[barcodeIndex][primerIndex].begin()->second)).write(mybuffer, in2.gcount()); + numSplitReads[barcodeIndex][primerIndex]++; + } + else{ + ofstream out; + m->openOutputFileBinaryAppend(noMatchFile, out); + out.write(mybuffer, in2.gcount()); + out.close(); + numNoMatch++; + } + delete[] mybuffer; + in2.close(); + } + + }else{ + m->mothurOut("Error reading."); m->mothurOutEndLine(); + } + + if (in.eof()) { return true; } + + return false; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readSeqData"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::findGroup(Header header, seqRead read, int& barcode, int& primer) { + try { + //find group read belongs to + TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer); + + int success = 1; + string trashCode = ""; + int currentSeqsDiffs = 0; + + string seq = read.bases; + + if (trim) { + if(header.clipQualRight < header.clipQualLeft){ + if (header.clipQualRight == 0) { //don't trim right + seq = seq.substr(header.clipQualLeft-1); + }else { + seq = "NNNN"; + } + } + else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ + seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft)); + } + else { + seq = seq.substr(header.clipQualLeft-1); + } + }else{ + //if you wanted the sfftxt then you already converted the bases to the right case + if (!sfftxt) { + int endValue = header.clipQualRight; + //make the bases you want to clip lowercase and the bases you want to keep upper case + if(endValue == 0){ endValue = seq.length(); } + for (int i = 0; i < (header.clipQualLeft-1); i++) { seq[i] = tolower(seq[i]); } + for (int i = (header.clipQualLeft-1); i < (endValue-1); i++) { seq[i] = toupper(seq[i]); } + for (int i = (endValue-1); i < seq.length(); i++) { seq[i] = tolower(seq[i]); } + } + } + + Sequence currSeq(header.name, seq); + QualityScores currQual; + + if(numLinkers != 0){ + success = trimOligos.stripLinker(currSeq, currQual); + if(success > ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + + } + + if(barcodes.size() != 0){ + success = trimOligos.stripBarcode(currSeq, currQual, barcode); + if(success > bdiffs) { trashCode += 'b'; } + else{ currentSeqsDiffs += success; } + } + + if(numSpacers != 0){ + success = trimOligos.stripSpacer(currSeq, currQual); + if(success > sdiffs) { trashCode += 's'; } + else{ currentSeqsDiffs += success; } + + } + + if(numFPrimers != 0){ + success = trimOligos.stripForward(currSeq, currQual, primer, true); + if(success > pdiffs) { trashCode += 'f'; } + else{ currentSeqsDiffs += success; } + } + + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + + if(revPrimer.size() != 0){ + success = trimOligos.stripReverse(currSeq, currQual); + if(!success) { trashCode += 'r'; } + } + + + return trashCode.length(); + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "findGroup"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::decodeName(string& timestamp, string& region, string& xy, string name) { + try { + + if (name.length() >= 6) { + string time = name.substr(0, 6); + unsigned int timeNum = m->fromBase36(time); + + int q1 = timeNum / 60; + int sec = timeNum - 60 * q1; + int q2 = q1 / 60; + int minute = q1 - 60 * q2; + int q3 = q2 / 24; + int hr = q2 - 24 * q3; + int q4 = q3 / 32; + int day = q3 - 32 * q4; + int q5 = q4 / 13; + int mon = q4 - 13 * q5; + int year = 2000 + q5; + + timestamp = toString(year) + "_" + toString(mon) + "_" + toString(day) + "_" + toString(hr) + "_" + toString(minute) + "_" + toString(sec); + } + + if (name.length() >= 9) { + region = name.substr(7, 2); + + string xyNum = name.substr(9); + unsigned int myXy = m->fromBase36(xyNum); + int x = myXy >> 12; + int y = myXy & 4095; + + xy = toString(x) + "_" + toString(y); + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "decodeName"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printCommonHeader(ofstream& out, CommonHeader& header) { + try { + + out << "Common Header:\nMagic Number: " << header.magicNumber << endl; + out << "Version: " << header.version << endl; + out << "Index Offset: " << header.indexOffset << endl; + out << "Index Length: " << header.indexLength << endl; + out << "Number of Reads: " << header.numReads << endl; + out << "Header Length: " << header.headerLength << endl; + out << "Key Length: " << header.keyLength << endl; + out << "Number of Flows: " << header.numFlowsPerRead << endl; + out << "Format Code: " << header.flogramFormatCode << endl; + out << "Flow Chars: " << header.flowChars << endl; + out << "Key Sequence: " << header.keySequence << endl << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printCommonHeader"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printHeader(ofstream& out, Header& header) { + try { + + out << ">" << header.name << endl; + out << "Run Prefix: " << header.timestamp << endl; + out << "Region #: " << header.region << endl; + out << "XY Location: " << header.xy << endl << endl; + + out << "Run Name: " << endl; + out << "Analysis Name: " << endl; + out << "Full Path: " << endl << endl; + + out << "Read Header Len: " << header.headerLength << endl; + out << "Name Length: " << header.nameLength << endl; + out << "# of Bases: " << header.numBases << endl; + out << "Clip Qual Left: " << header.clipQualLeft << endl; + out << "Clip Qual Right: " << header.clipQualRight << endl; + out << "Clip Adap Left: " << header.clipAdapterLeft << endl; + out << "Clip Adap Right: " << header.clipAdapterRight << endl << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printHeader"); + exit(1); + } +} +//********************************************************************************************************************** +bool SffInfoCommand::sanityCheck(Header& header, seqRead& read) { + try { + bool okay = true; + string message = "[WARNING]: Your sff file may be corrupted! Sequence: " + header.name + "\n"; + + if (header.clipQualLeft > read.bases.length()) { + okay = false; message += "Clip Qual Left = " + toString(header.clipQualLeft) + ", but we only read " + toString(read.bases.length()) + " bases.\n"; + } + if (header.clipQualRight > read.bases.length()) { + okay = false; message += "Clip Qual Right = " + toString(header.clipQualRight) + ", but we only read " + toString(read.bases.length()) + " bases.\n"; + } + if (header.clipQualLeft > read.qualScores.size()) { + okay = false; message += "Clip Qual Left = " + toString(header.clipQualLeft) + ", but we only read " + toString(read.qualScores.size()) + " quality scores.\n"; + } + if (header.clipQualRight > read.qualScores.size()) { + okay = false; message += "Clip Qual Right = " + toString(header.clipQualRight) + ", but we only read " + toString(read.qualScores.size()) + " quality scores.\n"; + } + + if (okay == false) { + m->mothurOut(message); m->mothurOutEndLine(); + } + + return okay; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "sanityCheck"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printSffTxtSeqData(ofstream& out, seqRead& read, Header& header) { + try { + out << "Flowgram: "; + for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << (read.flowgram[i]/(float)100) << '\t'; } + + out << endl << "Flow Indexes: "; + int sum = 0; + for (int i = 0; i < read.flowIndex.size(); i++) { sum += read.flowIndex[i]; out << sum << '\t'; } + + //make the bases you want to clip lowercase and the bases you want to keep upper case + int endValue = header.clipQualRight; + if(endValue == 0){ endValue = read.bases.length(); } + for (int i = 0; i < (header.clipQualLeft-1); i++) { read.bases[i] = tolower(read.bases[i]); } + for (int i = (header.clipQualLeft-1); i < (endValue-1); i++) { read.bases[i] = toupper(read.bases[i]); } + for (int i = (endValue-1); i < read.bases.length(); i++) { read.bases[i] = tolower(read.bases[i]); } + + out << endl << "Bases: " << read.bases << endl << "Quality Scores: "; + for (int i = 0; i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } + + + out << endl << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printSffTxtSeqData"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::printFastaSeqData(ofstream& out, seqRead& read, Header& header) { + try { + string seq = read.bases; + + if (trim) { + if(header.clipQualRight < header.clipQualLeft){ + if (header.clipQualRight == 0) { //don't trim right + seq = seq.substr(header.clipQualLeft-1); + }else { + seq = "NNNN"; + } + } + else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ + seq = seq.substr((header.clipQualLeft-1), (header.clipQualRight-header.clipQualLeft)); + } + else { + seq = seq.substr(header.clipQualLeft-1); + } + }else{ + //if you wanted the sfftxt then you already converted the bases to the right case + if (!sfftxt) { + int endValue = header.clipQualRight; + //make the bases you want to clip lowercase and the bases you want to keep upper case + if(endValue == 0){ endValue = seq.length(); } + for (int i = 0; i < (header.clipQualLeft-1); i++) { seq[i] = tolower(seq[i]); } + for (int i = (header.clipQualLeft-1); i < (endValue-1); i++) { seq[i] = toupper(seq[i]); } + for (int i = (endValue-1); i < seq.length(); i++) { seq[i] = tolower(seq[i]); } + } + } + + out << ">" << header.name << " xy=" << header.xy << endl; + out << seq << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printFastaSeqData"); + exit(1); + } +} + +//********************************************************************************************************************** +int SffInfoCommand::printQualSeqData(ofstream& out, seqRead& read, Header& header) { + try { + + if (trim) { + if(header.clipQualRight < header.clipQualLeft){ + if (header.clipQualRight == 0) { //don't trim right + out << ">" << header.name << " xy=" << header.xy << " length=" << (read.qualScores.size()-header.clipQualLeft) << endl; + for (int i = (header.clipQualLeft-1); i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } + }else { + out << ">" << header.name << " xy=" << header.xy << endl; + out << "0\t0\t0\t0"; + } + } + else if((header.clipQualRight != 0) && ((header.clipQualRight-header.clipQualLeft) >= 0)){ + out << ">" << header.name << " xy=" << header.xy << " length=" << (header.clipQualRight-header.clipQualLeft) << endl; + for (int i = (header.clipQualLeft-1); i < (header.clipQualRight-1); i++) { out << read.qualScores[i] << '\t'; } + } + else{ + out << ">" << header.name << " xy=" << header.xy << " length=" << (header.clipQualRight-header.clipQualLeft) << endl; + for (int i = (header.clipQualLeft-1); i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } + } + }else{ + out << ">" << header.name << " xy=" << header.xy << " length=" << read.qualScores.size() << endl; + for (int i = 0; i < read.qualScores.size(); i++) { out << read.qualScores[i] << '\t'; } + } + + out << endl; + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printQualSeqData"); + exit(1); + } +} + +//********************************************************************************************************************** +int SffInfoCommand::printFlowSeqData(ofstream& out, seqRead& read, Header& header) { + try { + + int endValue = header.clipQualRight; + if (header.clipQualRight == 0) { + endValue = read.flowIndex.size(); + if (m->debug) { m->mothurOut("[DEBUG]: " + header.name + " has clipQualRight=0.\n"); } + } + if(endValue > header.clipQualLeft){ + + int rightIndex = 0; + for (int i = 0; i < endValue; i++) { rightIndex += read.flowIndex[i]; } + + out << header.name << ' ' << rightIndex; + for (int i = 0; i < read.flowgram.size(); i++) { out << setprecision(2) << ' ' << (read.flowgram[i]/(float)100); } + out << endl; + } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "printFlowSeqData"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::readAccnosFile(string filename) { + try { + //remove old names + seqNames.clear(); + + ifstream in; + m->openInputFile(filename, in); + string name; + + while(!in.eof()){ + in >> name; m->gobble(in); + + seqNames.insert(name); + + if (m->control_pressed) { seqNames.clear(); break; } + } + in.close(); + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readAccnosFile"); + exit(1); + } +} +//********************************************************************************************************************** +int SffInfoCommand::parseSffTxt() { + try { + + ifstream inSFF; + m->openInputFile(sfftxtFilename, inSFF); + + if (outputDir == "") { outputDir += m->hasPath(sfftxtFilename); } + + //output file names + ofstream outFasta, outQual, outFlow; + string outFastaFileName, outQualFileName; + string fileRoot = m->getRootName(m->getSimpleName(sfftxtFilename)); + if (fileRoot.length() > 0) { + //rip off last . + fileRoot = fileRoot.substr(0, fileRoot.length()-1); + fileRoot = m->getRootName(fileRoot); + } + + map variables; + variables["[filename]"] = fileRoot; + string sfftxtFileName = getOutputFileName("sfftxt",variables); + string outFlowFileName = getOutputFileName("flow",variables); + if (!trim) { variables["[tag]"] = "raw"; } + outFastaFileName = getOutputFileName("fasta",variables); + outQualFileName = getOutputFileName("qfile",variables); + + if (fasta) { m->openOutputFile(outFastaFileName, outFasta); outputNames.push_back(outFastaFileName); outputTypes["fasta"].push_back(outFastaFileName); } + if (qual) { m->openOutputFile(outQualFileName, outQual); outputNames.push_back(outQualFileName); outputTypes["qfile"].push_back(outQualFileName); } + if (flow) { m->openOutputFile(outFlowFileName, outFlow); outputNames.push_back(outFlowFileName); outFlow.setf(ios::fixed, ios::floatfield); outFlow.setf(ios::showpoint); outputTypes["flow"].push_back(outFlowFileName); } + + //read common header + string commonHeader = m->getline(inSFF); + string magicNumber = m->getline(inSFF); + string version = m->getline(inSFF); + string indexOffset = m->getline(inSFF); + string indexLength = m->getline(inSFF); + int numReads = parseHeaderLineToInt(inSFF); + string headerLength = m->getline(inSFF); + string keyLength = m->getline(inSFF); + int numFlows = parseHeaderLineToInt(inSFF); + string flowgramCode = m->getline(inSFF); + string flowChars = m->getline(inSFF); + string keySequence = m->getline(inSFF); + m->gobble(inSFF); + + string seqName; + + if (flow) { outFlow << numFlows << endl; } + + for(int i=0;imothurOut("[ERROR]: Expected " + toString(numReads) + " but reached end of file at " + toString(i+1) + "."); m->mothurOutEndLine(); break; } + + Header header; + + //parse read header + inSFF >> seqName; + seqName = seqName.substr(1); + m->gobble(inSFF); + header.name = seqName; + + string runPrefix = parseHeaderLineToString(inSFF); header.timestamp = runPrefix; + string regionNumber = parseHeaderLineToString(inSFF); header.region = regionNumber; + string xyLocation = parseHeaderLineToString(inSFF); header.xy = xyLocation; + m->gobble(inSFF); + + string runName = parseHeaderLineToString(inSFF); + string analysisName = parseHeaderLineToString(inSFF); + string fullPath = parseHeaderLineToString(inSFF); + m->gobble(inSFF); + + string readHeaderLen = parseHeaderLineToString(inSFF); convert(readHeaderLen, header.headerLength); + string nameLength = parseHeaderLineToString(inSFF); convert(nameLength, header.nameLength); + int numBases = parseHeaderLineToInt(inSFF); header.numBases = numBases; + string clipQualLeft = parseHeaderLineToString(inSFF); convert(clipQualLeft, header.clipQualLeft); + int clipQualRight = parseHeaderLineToInt(inSFF); header.clipQualRight = clipQualRight; + string clipAdapLeft = parseHeaderLineToString(inSFF); convert(clipAdapLeft, header.clipAdapterLeft); + string clipAdapRight = parseHeaderLineToString(inSFF); convert(clipAdapRight, header.clipAdapterRight); + m->gobble(inSFF); + + seqRead read; + + //parse read + vector flowVector = parseHeaderLineToFloatVector(inSFF, numFlows); read.flowgram = flowVector; + vector flowIndices = parseHeaderLineToIntVector(inSFF, numBases); + + //adjust for print + vector flowIndicesAdjusted; flowIndicesAdjusted.push_back(flowIndices[0]); + for (int j = 1; j < flowIndices.size(); j++) { flowIndicesAdjusted.push_back(flowIndices[j] - flowIndices[j-1]); } + read.flowIndex = flowIndicesAdjusted; + + string bases = parseHeaderLineToString(inSFF); read.bases = bases; + vector qualityScores = parseHeaderLineToIntVector(inSFF, numBases); read.qualScores = qualityScores; + m->gobble(inSFF); + + //if you have provided an accosfile and this seq is not in it, then dont print + bool print = true; + if (seqNames.size() != 0) { if (seqNames.count(header.name) == 0) { print = false; } } + + //print + if (print) { + if (fasta) { printFastaSeqData(outFasta, read, header); } + if (qual) { printQualSeqData(outQual, read, header); } + if (flow) { printFlowSeqData(outFlow, read, header); } + } + + //report progress + if((i+1) % 10000 == 0){ m->mothurOut(toString(i+1)); m->mothurOutEndLine(); } + + if (m->control_pressed) { break; } + } + + //report progress + if (!m->control_pressed) { if((numReads) % 10000 != 0){ m->mothurOut(toString(numReads)); m->mothurOutEndLine(); } } + + inSFF.close(); + + if (fasta) { outFasta.close(); } + if (qual) { outQual.close(); } + if (flow) { outFlow.close(); } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "parseSffTxt"); + exit(1); + } +} +//********************************************************************************************************************** + +int SffInfoCommand::parseHeaderLineToInt(ifstream& file){ + try { + int number; + + while (!file.eof()) { + + char c = file.get(); + if (c == ':'){ + file >> number; + break; + } + + } + m->gobble(file); + return number; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "parseHeaderLineToInt"); + exit(1); + } + +} + +//********************************************************************************************************************** + +string SffInfoCommand::parseHeaderLineToString(ifstream& file){ + try { + string text; + + while (!file.eof()) { + char c = file.get(); + + if (c == ':'){ + //m->gobble(file); + //text = m->getline(file); + file >> text; + break; + } + } + m->gobble(file); + + return text; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "parseHeaderLineToString"); + exit(1); + } +} + +//********************************************************************************************************************** + +vector SffInfoCommand::parseHeaderLineToFloatVector(ifstream& file, int length){ + try { + vector floatVector(length); + + while (!file.eof()) { + char c = file.get(); + if (c == ':'){ + float temp; + for(int i=0;i> temp; + floatVector[i] = temp * 100; + } + break; + } + } + m->gobble(file); + return floatVector; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "parseHeaderLineToFloatVector"); + exit(1); + } +} + +//********************************************************************************************************************** + +vector SffInfoCommand::parseHeaderLineToIntVector(ifstream& file, int length){ + try { + vector intVector(length); + + while (!file.eof()) { + char c = file.get(); + if (c == ':'){ + for(int i=0;i> intVector[i]; + } + break; + } + } + m->gobble(file); + return intVector; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "parseHeaderLineToIntVector"); + exit(1); + } +} +//*************************************************************************************************************** + +bool SffInfoCommand::readOligos(string oligoFile){ + try { + filehandles.clear(); + numSplitReads.clear(); + filehandlesHeaders.clear(); + + ifstream inOligos; + m->openInputFile(oligoFile, inOligos); + + string type, oligo, group; + + int indexPrimer = 0; + int indexBarcode = 0; + + while(!inOligos.eof()){ + + inOligos >> type; + + if(type[0] == '#'){ + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there + m->gobble(inOligos); + } + else{ + m->gobble(inOligos); + //make type case insensitive + for(int i=0;i> oligo; + + for(int i=0;i::iterator itPrime = primers.find(oligo); + if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + primers[oligo]=indexPrimer; indexPrimer++; + primerNameVector.push_back(group); + }else if(type == "REVERSE"){ + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); + } + else if(type == "BARCODE"){ + inOligos >> group; + + //check for repeat barcodes + map::iterator itBar = barcodes.find(oligo); + if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + barcodes[oligo]=indexBarcode; indexBarcode++; + barcodeNameVector.push_back(group); + }else if(type == "LINKER"){ + linker.push_back(oligo); + }else if(type == "SPACER"){ + spacer.push_back(oligo); + } + else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } + } + m->gobble(inOligos); + } + inOligos.close(); + + if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ split = 1; } + + //add in potential combos + if(barcodeNameVector.size() == 0){ + barcodes[""] = 0; + barcodeNameVector.push_back(""); + } + + if(primerNameVector.size() == 0){ + primers[""] = 0; + primerNameVector.push_back(""); + } + + filehandles.resize(barcodeNameVector.size()); + for (int i = 0; i < filehandles.size(); i++) { + for (int j = 0; j < primerNameVector.size(); j++) { + ofstream* temp; + map myMap; myMap[""] = temp; + filehandles[i].push_back(myMap); + } + } + + if(split > 1){ + set uniqueNames; //used to cleanup outputFileNames + for(map::iterator itBar = barcodes.begin();itBar != barcodes.end();itBar++){ + for(map::iterator itPrimer = primers.begin();itPrimer != primers.end(); itPrimer++){ + + string primerName = primerNameVector[itPrimer->second]; + string barcodeName = barcodeNameVector[itBar->second]; + + string comboGroupName = ""; + string fastaFileName = ""; + string qualFileName = ""; + string nameFileName = ""; + + if(primerName == ""){ + comboGroupName = barcodeNameVector[itBar->second]; + } + else{ + if(barcodeName == ""){ + comboGroupName = primerNameVector[itPrimer->second]; + } + else{ + comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second]; + } + } + + ofstream* temp = new ofstream; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName)); + variables["[group]"] = comboGroupName; + string thisFilename = getOutputFileName("sff",variables); + if (uniqueNames.count(thisFilename) == 0) { + outputNames.push_back(thisFilename); + outputTypes["sff"].push_back(thisFilename); + uniqueNames.insert(thisFilename); + } + + map myMap; myMap[thisFilename] = temp; + m->openOutputFileBinary(thisFilename, *(temp)); + filehandles[itBar->second][itPrimer->second] = myMap; + map::iterator itOfstream = filehandles[itBar->second][itPrimer->second].find(""); + if (itOfstream != filehandles[itBar->second][itPrimer->second].end()) { filehandles[itBar->second][itPrimer->second].erase(itOfstream); } //remove blank entry so we dont mess with .begin() above. code above assumes only 1 file name in the map + } + } + } + numFPrimers = primers.size(); + numLinkers = linker.size(); + numSpacers = spacer.size(); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(currentFileName)); + variables["[group]"] = "scrap"; + noMatchFile = getOutputFileName("sff",variables); + m->mothurRemove(noMatchFile); + numNoMatch = 0; + + bool allBlank = true; + for (int i = 0; i < barcodeNameVector.size(); i++) { + if (barcodeNameVector[i] != "") { + allBlank = false; + break; + } + } + for (int i = 0; i < primerNameVector.size(); i++) { + if (primerNameVector[i] != "") { + allBlank = false; + break; + } + } + + filehandlesHeaders.resize(filehandles.size()); + numSplitReads.resize(filehandles.size()); + for (int i = 0; i < filehandles.size(); i++) { + numSplitReads[i].resize(filehandles[i].size(), 0); + for (int j = 0; j < filehandles[i].size(); j++) { + ofstream* temp = new ofstream; + map myMap; + string thisHeader = (filehandles[i][j].begin())->first+"headers"; + myMap[thisHeader] = temp; + m->openOutputFileBinary(thisHeader, *(temp)); + filehandlesHeaders[i].push_back(myMap); + } + } + + if (allBlank) { + m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a split the sff file."); m->mothurOutEndLine(); + split = 1; + return false; + } + + return true; + + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "readOligos"); + exit(1); + } +} +//********************************************************************/ +string SffInfoCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "SffInfoCommand", "reverseOligo"); + exit(1); + } +} + +//********************************************************************************************************************** + + + + diff --git a/sffinfocommand.h b/sffinfocommand.h index 5ec6e72..541c8ba 100644 --- a/sffinfocommand.h +++ b/sffinfocommand.h @@ -82,19 +82,20 @@ private: string sffFilename, sfftxtFilename, outputDir, accnosName, currentFileName, oligosfile, noMatchFile; vector filenames, outputNames, accnosFileNames, oligosFileNames; bool abort, fasta, qual, trim, flow, sfftxt, hasAccnos, hasOligos; - int mycount, split, numFPrimers, numLinkers, numSpacers, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs; + int mycount, split, numFPrimers, numLinkers, numSpacers, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, numNoMatch; set seqNames; map barcodes; map primers; vector linker, spacer, primerNameVector, barcodeNameVector, revPrimer; vector > numSplitReads; - vector > filehandles, filehandlesHeaders; + vector > > filehandles; + vector > > filehandlesHeaders; //extract sff file functions int extractSffInfo(string, string, string); int readCommonHeader(ifstream&, CommonHeader&); - //int readHeader(ifstream&, Header&); - int readSeqData(ifstream&, seqRead&, int, Header&); + int readHeader(ifstream&, Header&); + bool readSeqData(ifstream&, seqRead&, int, Header&); int decodeName(string&, string&, string&, string); bool readOligos(string oligosFile); diff --git a/sracommand.cpp b/sracommand.cpp new file mode 100644 index 0000000..c291ddd --- /dev/null +++ b/sracommand.cpp @@ -0,0 +1,175 @@ +// +// sracommand.cpp +// Mothur +// +// Created by SarahsWork on 10/28/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#include "sracommand.h" + +//********************************************************************************************************************** +vector SRACommand::setParameters(){ + try { + CommandParameter psff("sff", "InputTypes", "", "", "none", "none", "none","sra",false,false); parameters.push_back(psff); + CommandParameter pfastqfile("fastqfile", "InputTypes", "", "", "none", "none", "none","sra",false,false); parameters.push_back(pfastqfile); + //choose only one multiple options + CommandParameter pplatform("platform", "Multiple", "454-???-???", "454", "", "", "","",false,false); parameters.push_back(pplatform); + //every command must have inputdir and outputdir. This allows mothur users to redirect input and output files. + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); + + vector myArray; + for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } + return myArray; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "setParameters"); + exit(1); + } +} +//********************************************************************************************************************** +string SRACommand::getHelpString(){ + try { + string helpString = ""; + helpString += "The sra command creates a sequence read archive from sff or fastq files.\n"; + helpString += "The sra command parameters are: sff, fastqfiles, oligos, platform....\n"; + helpString += "The sffiles parameter is used to provide a file containing a \n"; + helpString += "The new command should be in the following format: \n"; + helpString += "new(...)\n"; + return helpString; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "getHelpString"); + exit(1); + } +} +//********************************************************************************************************************** +string SRACommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "sra") { pattern = "[filename],sra"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** +SRACommand::SRACommand(){ + try { + abort = true; calledHelp = true; + setParameters(); + vector tempOutNames; + outputTypes["sra"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "SRACommand"); + exit(1); + } +} +//********************************************************************************************************************** +SRACommand::SRACommand(string option) { + try { + abort = false; calledHelp = false; + + //allow user to run help + if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} + + else { + //valid paramters for this command + vector myArray = setParameters(); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + map::iterator it; + //check to make sure all parameters are valid for command + for (it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + + string path; + it = parameters.find("sfffiles"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["sfffiles"] = inputDir + it->second; } + } + + it = parameters.find("fastqfiles"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["fastqfiles"] = inputDir + it->second; } + } + } + + //check for parameters + fastqfiles = validParameter.validFile(parameters, "fastqfiles", true); + if (fastqfiles == "not open") { fastqfiles = ""; abort = true; } + else if (fastqfiles == "not found") { fastqfiles = ""; } + + sfffiles = validParameter.validFile(parameters, "sfffiles", true); + if (sfffiles == "not open") { sfffiles = ""; abort = true; } + else if (sfffiles == "not found") { sfffiles = ""; } + + if ((fastqfiles == "") && (sfffiles == "")) { + m->mothurOut("No valid current files. You must provide a sfffiles or fastqfiles file before you can use the sra command."); m->mothurOutEndLine(); abort = true; + } + + //use only one Mutliple type + platform = validParameter.validFile(parameters, "platform", false); + if (platform == "not found") { platform = "454"; } + + if ((platform == "454") || (platform == "????") || (platform == "????") || (platform == "????")) { } + else { m->mothurOut("Not a valid platform option. Valid platform options are 454, ...."); m->mothurOutEndLine(); abort = true; } + + + } + + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "SRACommand"); + exit(1); + } +} +//********************************************************************************************************************** + +int SRACommand::execute(){ + try { + + if (abort == true) { if (calledHelp) { return 0; } return 2; } + + + + //output files created by command + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + return 0; + + } + catch(exception& e) { + m->errorOut(e, "SRACommand", "SRACommand"); + exit(1); + } +} +//********************************************************************************************************************** + + diff --git a/sracommand.h b/sracommand.h new file mode 100644 index 0000000..0f0a945 --- /dev/null +++ b/sracommand.h @@ -0,0 +1,46 @@ +// +// sracommand.h +// Mothur +// +// Created by SarahsWork on 10/28/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#ifndef Mothur_sracommand_h +#define Mothur_sracommand_h + +#include "command.hpp" + + +/**************************************************************************************************/ + +class SRACommand : public Command { +public: + SRACommand(string); + SRACommand(); + ~SRACommand(){} + + vector setParameters(); + string getCommandName() { return "sra"; } + string getCommandCategory() { return "Sequence Processing"; } + + string getOutputPattern(string); + + string getHelpString(); + string getCitation() { return "http://www.mothur.org/wiki/sra"; } + string getDescription() { return "create a Sequence Read Archive / SRA"; } + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + bool abort; + string sfffiles, fastqfiles, platform, outputDir; + vector outputNames; +}; + +/**************************************************************************************************/ + + + +#endif diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index d93fb6e..2c9399b 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -184,9 +184,6 @@ SummarySharedCommand::SummarySharedCommand(string option) { string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "false"; } all = m->isTrue(temp); - temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; } - createPhylip = m->isTrue(temp); - temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } m->mothurConvert(temp, iters); @@ -204,6 +201,10 @@ SummarySharedCommand::SummarySharedCommand(string option) { if (subsample == false) { iters = 0; } + temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; } + createPhylip = m->isTrue(temp); + if (subsample) { createPhylip = true; } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); diff --git a/venncommand.cpp b/venncommand.cpp index 98796b3..3767687 100644 --- a/venncommand.cpp +++ b/venncommand.cpp @@ -29,8 +29,7 @@ vector VennCommand::setParameters(){ CommandParameter pnseqs("nseqs", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pnseqs); CommandParameter psharedotus("sharedotus", "Boolean", "", "t", "", "", "","",false,false); parameters.push_back(psharedotus); CommandParameter pfontsize("fontsize", "Number", "", "24", "", "", "","",false,false); parameters.push_back(pfontsize); - CommandParameter ppermute("permute", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ppermute); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter ppermute("permute", "Multiple", "1-2-3-4", "4", "", "", "","",false,false); parameters.push_back(ppermute); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; @@ -56,7 +55,7 @@ string VennCommand::getHelpString(){ helpString += "The default value for calc is sobs if you have only read a list file or if you have selected only one group, and sharedsobs if you have multiple groups.\n"; helpString += "The default available estimators for calc are sobs, chao and ace if you have only read a list file, and sharedsobs, sharedchao and sharedace if you have read a shared file.\n"; helpString += "The nseqs parameter will output the number of sequences represented by the otus in the picture, default=F.\n"; - helpString += "If you have more than 4 groups, the permute parameter will find all possible combos of 4 of your groups and create pictures for them, default=F.\n"; + helpString += "If you have more than 4 groups, you can use the permute parameter to set the number of groups you would like mothur to divide the samples into to draw the venn diagrams for all possible combos. Default=4.\n"; helpString += "The only estimators available four 4 groups are sharedsobs and sharedchao.\n"; helpString += "The sharedotus parameter can be used with the sharedsobs calculator to get the names of the OTUs in each section of the venn diagram. Default=t.\n"; helpString += "The venn command outputs a .svg file for each calculator you specify at each distance you choose.\n"; @@ -215,8 +214,10 @@ VennCommand::VennCommand(string option) { temp = validParameter.validFile(parameters, "nseqs", false); if (temp == "not found"){ temp = "f"; } nseqs = m->isTrue(temp); - temp = validParameter.validFile(parameters, "permute", false); if (temp == "not found"){ temp = "f"; } - perm = m->isTrue(temp); + temp = validParameter.validFile(parameters, "permute", false); if (temp == "not found"){ temp = "4"; } + m->mothurConvert(temp, perm); + if ((perm == 1) || (perm == 2) || (perm == 3) || (perm == 4)) { } + else { m->mothurOut("[ERROR]: Not a valid permute value. Valid values are 1, 2, 3, and 4."); m->mothurOutEndLine(); abort = true; } temp = validParameter.validFile(parameters, "sharedotus", false); if (temp == "not found"){ temp = "t"; } sharedOtus = m->isTrue(temp); @@ -280,7 +281,7 @@ int VennCommand::execute(){ lookup = input->getSharedRAbundVectors(); lastLabel = lookup[0]->getLabel(); - if ((lookup.size() > 4) && (perm)) { combosOfFour = findCombinations(lookup.size()); } + if ((lookup.size() > 4)) { combos = findCombinations(lookup.size()); } }else if (format == "list") { sabund = input->getSAbundVector(); lastLabel = sabund->getLabel(); @@ -308,17 +309,12 @@ int VennCommand::execute(){ processedLabels.insert(lookup[0]->getLabel()); userLabels.erase(lookup[0]->getLabel()); - if ((lookup.size() > 4) && (!perm)){ - m->mothurOut("Error: Too many groups chosen. You may use up to 4 groups with the venn command. I will use the first four groups in your groupfile. If you set perm=t, I will find all possible combos of 4 groups."); m->mothurOutEndLine(); - for (int i = lookup.size(); i > 4; i--) { lookup.pop_back(); } //no memmory leak because pop_back calls destructor - vector outfilenames = venn->getPic(lookup, vennCalculators); - for(int i = 0; i < outfilenames.size(); i++) { if (outfilenames[i] != "control" ) { outputNames.push_back(outfilenames[i]); outputTypes["svg"].push_back(outfilenames[i]); } } - }else if ((lookup.size() > 4) && (perm)) { + if (lookup.size() > 4) { set< set >::iterator it3; set::iterator it2; - for (it3 = combosOfFour.begin(); it3 != combosOfFour.end(); it3++) { + for (it3 = combos.begin(); it3 != combos.end(); it3++) { set poss = *it3; vector subset; @@ -343,17 +339,10 @@ int VennCommand::execute(){ processedLabels.insert(lookup[0]->getLabel()); userLabels.erase(lookup[0]->getLabel()); - if ((lookup.size() > 4) && (!perm)){ - m->mothurOut("Error: Too many groups chosen. You may use up to 4 groups with the venn command. I will use the first four groups in your groupfile. If you set perm=t, I will find all possible combos of 4 groups."); m->mothurOutEndLine(); - for (int i = lookup.size(); i > 4; i--) { lookup.pop_back(); } //no memmory leak because pop_back calls destructor - - vector outfilenames = venn->getPic(lookup, vennCalculators); - for(int i = 0; i < outfilenames.size(); i++) { if (outfilenames[i] != "control" ) { outputNames.push_back(outfilenames[i]); outputTypes["svg"].push_back(outfilenames[i]); } } - - }else if ((lookup.size() > 4) && (perm)) { + if (lookup.size() > 4) { set< set >::iterator it3; set::iterator it2; - for (it3 = combosOfFour.begin(); it3 != combosOfFour.end(); it3++) { + for (it3 = combos.begin(); it3 != combos.end(); it3++) { set poss = *it3; vector subset; @@ -409,17 +398,10 @@ int VennCommand::execute(){ processedLabels.insert(lookup[0]->getLabel()); userLabels.erase(lookup[0]->getLabel()); - if ((lookup.size() > 4) && (!perm)){ - m->mothurOut("Error: Too many groups chosen. You may use up to 4 groups with the venn command. I will use the first four groups in your groupfile. If you set perm=t, I will find all possible combos of 4 groups."); m->mothurOutEndLine(); - for (int i = lookup.size(); i > 4; i--) { lookup.pop_back(); } //no memmory leak because pop_back calls destructor - - vector outfilenames = venn->getPic(lookup, vennCalculators); - for(int i = 0; i < outfilenames.size(); i++) { if (outfilenames[i] != "control" ) { outputNames.push_back(outfilenames[i]); outputTypes["svg"].push_back(outfilenames[i]); } } - - }else if ((lookup.size() > 4) && (perm)) { + if (lookup.size() > 4) { set< set >::iterator it3; set::iterator it2; - for (it3 = combosOfFour.begin(); it3 != combosOfFour.end(); it3++) { + for (it3 = combos.begin(); it3 != combos.end(); it3++) { set poss = *it3; vector subset; @@ -552,7 +534,7 @@ int VennCommand::execute(){ } } //********************************************************************************************************************** -//returns a vector of sets containing the 4 group combinations +//returns a vector of sets containing the group combinations set< set > VennCommand::findCombinations(int lookupSize){ try { set< set > combos; @@ -561,7 +543,7 @@ set< set > VennCommand::findCombinations(int lookupSize){ for (int i = 0; i < lookupSize; i++) { possibles.insert(i); } getCombos(possibles, combos); - + return combos; } @@ -571,11 +553,11 @@ set< set > VennCommand::findCombinations(int lookupSize){ } } //********************************************************************************************************************** -//recusively finds combos of 4 +//recusively finds combos of length perm int VennCommand::getCombos(set possibles, set< set >& combos){ try { - if (possibles.size() == 4) { //done + if (possibles.size() == perm) { //done if (combos.count(possibles) == 0) { //no dups combos.insert(possibles); } diff --git a/venncommand.h b/venncommand.h index 98a59d4..4762902 100644 --- a/venncommand.h +++ b/venncommand.h @@ -40,11 +40,11 @@ private: Venn* venn; vector vennCalculators; vector lookup; - set< set > combosOfFour; + set< set > combos; SAbundVector* sabund; - int abund, fontsize; + int abund, fontsize, perm; - bool abort, allLines, nseqs, perm, sharedOtus; + bool abort, allLines, nseqs, sharedOtus; set labels; //holds labels to be used string format, groups, calc, label, outputDir, sharedfile, listfile, inputfile; vector Estimators, Groups, outputNames; -- 2.39.2