X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=prcseqscommand.cpp;h=cf93cb796104aa90428e5466546c9839789216f7;hb=2cfb747aa8f63bde9c1114001e6d2e81ccd26178;hp=de2cb2058a064b043f424921f599edd6f7f8bf24;hpb=035f86272c776e1cccaa47021e26782e49cd41e7;p=mothur.git diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp index de2cb20..cf93cb7 100644 --- a/prcseqscommand.cpp +++ b/prcseqscommand.cpp @@ -11,21 +11,23 @@ //********************************************************************************************************************** vector PcrSeqsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos); - CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); - CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); - CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli); - CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart); - CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend); - CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer); - CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta); + CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup); + CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax); + CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli); + CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart); + CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend); + CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); + + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer); + CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -41,7 +43,7 @@ string PcrSeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The pcr.seqs command reads a fasta file.\n"; - helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n"; + helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n"; helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n"; helpString += "The start parameter allows you to provide a starting position to trim to.\n"; helpString += "The end parameter allows you to provide a ending position to trim from.\n"; @@ -49,6 +51,7 @@ string PcrSeqsCommand::getHelpString(){ helpString += "The processors parameter allows you to use multiple processors.\n"; helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n"; helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n"; return helpString; @@ -58,31 +61,25 @@ string PcrSeqsCommand::getHelpString(){ exit(1); } } - //********************************************************************************************************************** -string PcrSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ - try { - string outputFileName = ""; - map >::iterator it; +string PcrSeqsCommand::getOutputPattern(string type) { + try { + string pattern = ""; - //is this a type this command creates - it = outputTypes.find(type); - if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } - else { - if (type == "fasta") { outputFileName = "pcr.fasta"; } - else if (type == "taxonomy") { outputFileName = "pcr" + m->getExtension(inputName); } - else if (type == "group") { outputFileName = "pcr" + m->getExtension(inputName); } - else if (type == "name") { outputFileName = "pcr" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pcr" + m->getExtension(inputName); } - else if (type == "accnos") { outputFileName = "bad.accnos"; } - else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } - } - return outputFileName; - } - catch(exception& e) { - m->errorOut(e, "PcrSeqsCommand", "getOutputFileNameTag"); - exit(1); - } + if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; } + else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; } + else if (type == "name") { pattern = "[filename],pcr,[extension]"; } + else if (type == "group") { pattern = "[filename],pcr,[extension]"; } + else if (type == "count") { pattern = "[filename],pcr,[extension]"; } + else if (type == "accnos") { pattern = "[filename],bad.accnos"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "getOutputPattern"); + exit(1); + } } //********************************************************************************************************************** @@ -268,6 +265,9 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; } @@ -315,14 +315,17 @@ int PcrSeqsCommand::execute(){ string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta"); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)); + variables["[extension]"] = m->getExtension(fastafile); + string trimSeqFile = getOutputFileName("fasta",variables); outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); - - string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "scrap." + getOutputFileNameTag("fasta"); + variables["[tag]"] = "scrap"; + string badSeqFile = getOutputFileName("fasta",variables); length = 0; - if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; } + if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(primers.size()) + ", revprimers = " + toString(revPrimer.size()) + ".\n"); } } if (m->control_pressed) { return 0; } if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; } vector positions; @@ -502,7 +505,7 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); } // Allocate memory for thread data. - pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end); + pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, pdiffs, lines[i].start, lines[i].end); pDataArray.push_back(tempPcr); //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier @@ -519,6 +522,9 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string //Close all thread handles and free memory allocations. for(int i=0; i < pDataArray.size(); i++){ num += pDataArray[i]->count; + if (pDataArray[i]->count != pDataArray[i]->fend) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + } for (set::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); } CloseHandle(hThreadArray[i]); delete pDataArray[i]; @@ -561,6 +567,10 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta int count = 0; set lengths; + //pdiffs, bdiffs, primers, barcodes, revPrimers + map faked; + TrimOligos trim(pdiffs, 0, primers, faked, revPrimer); + while (!done) { if (m->control_pressed) { break; } @@ -570,6 +580,8 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta string trashCode = ""; if (currSeq.getName() != "") { + if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); } + bool goodSeq = true; if (oligosfile != "") { map mapAligned; @@ -578,7 +590,7 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //process primers if (primers.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = findForward(currSeq, primerStart, primerEnd); + bool good = trim.findForward(currSeq, primerStart, primerEnd); if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; } else{ @@ -590,8 +602,9 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } else { if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); } - else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); } + else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); } } + isAligned(currSeq.getAligned(), mapAligned); }else { if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } @@ -602,10 +615,10 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //process reverse primers if (revPrimer.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = findReverse(currSeq, primerStart, primerEnd); + bool good = trim.findReverse(currSeq, primerStart, primerEnd); if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; } else{ - //are you aligned + //are you aligned if (aligned) { if (!keepprimer) { if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); } @@ -700,75 +713,9 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } } //********************************************************************/ -bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - - for(int j=0;jcontrol_pressed) { primerStart = 0; primerEnd = 0; return false; } - string rawChunk = rawSequence.substr(j, olength); - if(compareDNASeq(oligo, rawChunk)) { - primerStart = j; - primerEnd = primerStart + olength; - return true; - } - - } - } - - primerStart = 0; primerEnd = 0; - return false; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripForward"); - exit(1); - } -} -//******************************************************************/ -bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - - for(int i=0;i= 0; j--){ - if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } - string rawChunk = rawSequence.substr(j, olength); - - if(compareDNASeq(oligo, rawChunk)) { - primerStart = j; - primerEnd = primerStart + olength; - return true; - } - - } - } - - primerStart = 0; primerEnd = 0; - return false; - } - catch(exception& e) { - m->errorOut(e, "PcrSeqsCommand", "findReverse"); - exit(1); - } -} -//********************************************************************/ bool PcrSeqsCommand::isAligned(string seq, map& aligned){ try { + aligned.clear(); bool isAligned = false; int countBases = 0; @@ -832,6 +779,7 @@ bool PcrSeqsCommand::readOligos(){ m->openInputFile(oligosfile, inOligos); string type, oligo, group; + int primerCount = 0; while(!inOligos.eof()){ @@ -856,10 +804,10 @@ bool PcrSeqsCommand::readOligos(){ // get rest of line in case there is a primer name while (!inOligos.eof()) { char c = inOligos.get(); - if (c == 10 || c == 13){ break; } + if (c == 10 || c == 13 || c == -1){ break; } else if (c == 32 || c == 9){;} //space or tab } - primers.push_back(oligo); + primers[oligo] = primerCount; primerCount++; }else if(type == "REVERSE"){ string oligoRC = reverseOligo(oligo); revPrimer.push_back(oligoRC); @@ -914,7 +862,9 @@ int PcrSeqsCommand::writeAccnos(set badNames){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos"); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)); + string outputFileName = getOutputFileName("accnos",variables); outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName); ofstream out; @@ -933,50 +883,16 @@ int PcrSeqsCommand::writeAccnos(set badNames){ exit(1); } -} -//******************************************************************/ -bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "PcrSeqsCommand", "compareDNASeq"); - exit(1); - } - } //*************************************************************************************************************** int PcrSeqsCommand::readName(set& names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(namefile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile)); + variables["[extension]"] = m->getExtension(namefile); + string outputFileName = getOutputFileName("name", variables); ofstream out; m->openOutputFile(outputFileName, out); @@ -1034,7 +950,10 @@ int PcrSeqsCommand::readGroup(set names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)); + variables["[extension]"] = m->getExtension(groupfile); + string outputFileName = getOutputFileName("group", variables); ofstream out; m->openOutputFile(outputFileName, out); @@ -1081,7 +1000,11 @@ int PcrSeqsCommand::readTax(set names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + getOutputFileNameTag("taxonomy", taxfile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)); + variables["[extension]"] = m->getExtension(taxfile); + string outputFileName = getOutputFileName("taxonomy", variables); + ofstream out; m->openOutputFile(outputFileName, out); @@ -1128,7 +1051,11 @@ int PcrSeqsCommand::readCount(set badSeqNames){ m->openInputFile(countfile, in); set::iterator it; - string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[extension]"] = m->getExtension(countfile); + string goodCountFile = getOutputFileName("count", variables); + outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile); ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);