X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=prcseqscommand.cpp;h=8a8db37612937ee86d7d11adf1b9973d8cba634d;hp=de2cb2058a064b043f424921f599edd6f7f8bf24;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=035f86272c776e1cccaa47021e26782e49cd41e7 diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp index de2cb20..8a8db37 100644 --- a/prcseqscommand.cpp +++ b/prcseqscommand.cpp @@ -11,21 +11,23 @@ //********************************************************************************************************************** vector PcrSeqsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos); - CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount); - CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); - CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli); - CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart); - CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend); - CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer); - CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta); + CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup); + CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax); + CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli); + CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart); + CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend); + CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); + + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer); + CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -41,7 +43,7 @@ string PcrSeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The pcr.seqs command reads a fasta file.\n"; - helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n"; + helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n"; helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n"; helpString += "The start parameter allows you to provide a starting position to trim to.\n"; helpString += "The end parameter allows you to provide a ending position to trim from.\n"; @@ -49,6 +51,7 @@ string PcrSeqsCommand::getHelpString(){ helpString += "The processors parameter allows you to use multiple processors.\n"; helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n"; helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n"; return helpString; @@ -58,31 +61,25 @@ string PcrSeqsCommand::getHelpString(){ exit(1); } } - //********************************************************************************************************************** -string PcrSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ - try { - string outputFileName = ""; - map >::iterator it; +string PcrSeqsCommand::getOutputPattern(string type) { + try { + string pattern = ""; - //is this a type this command creates - it = outputTypes.find(type); - if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } - else { - if (type == "fasta") { outputFileName = "pcr.fasta"; } - else if (type == "taxonomy") { outputFileName = "pcr" + m->getExtension(inputName); } - else if (type == "group") { outputFileName = "pcr" + m->getExtension(inputName); } - else if (type == "name") { outputFileName = "pcr" + m->getExtension(inputName); } - else if (type == "count") { outputFileName = "pcr" + m->getExtension(inputName); } - else if (type == "accnos") { outputFileName = "bad.accnos"; } - else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } - } - return outputFileName; - } - catch(exception& e) { - m->errorOut(e, "PcrSeqsCommand", "getOutputFileNameTag"); - exit(1); - } + if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; } + else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; } + else if (type == "name") { pattern = "[filename],pcr,[extension]"; } + else if (type == "group") { pattern = "[filename],pcr,[extension]"; } + else if (type == "count") { pattern = "[filename],pcr,[extension]"; } + else if (type == "accnos") { pattern = "[filename],bad.accnos"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "getOutputPattern"); + exit(1); + } } //********************************************************************************************************************** @@ -268,6 +265,9 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; } @@ -312,17 +312,21 @@ int PcrSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); + fileAligned = true; string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta"); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)); + variables["[extension]"] = m->getExtension(fastafile); + string trimSeqFile = getOutputFileName("fasta",variables); outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); - - string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "scrap." + getOutputFileNameTag("fasta"); + variables["[tag]"] = "scrap"; + string badSeqFile = getOutputFileName("fasta",variables); length = 0; - if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; } + if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(primers.size()) + ", revprimers = " + toString(revPrimer.size()) + ".\n"); } } if (m->control_pressed) { return 0; } if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; } vector positions; @@ -349,8 +353,7 @@ int PcrSeqsCommand::execute(){ if (m->control_pressed) { return 0; } set badNames; - if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); } - else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); } + numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); if (m->control_pressed) { return 0; } @@ -426,6 +429,8 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string vector processIDS; int process = 1; int num = 0; + int pstart = -1; int pend = -1; + bool adjustNeeded = false; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) @@ -437,12 +442,14 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]); + string locationsFile = toString(getpid()) + ".temp"; + num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", locationsFile, badSeqNames, lines[process], pstart, pend, adjustNeeded); //pass numSeqs to parent ofstream out; string tempFile = filename + toString(getpid()) + ".num.temp"; m->openOutputFile(tempFile, out); + out << pstart << '\t' << pend << '\t' << adjustNeeded << endl; out << num << '\t' << badSeqNames.size() << endl; for (set::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) { out << (*it) << endl; @@ -457,7 +464,8 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string } } - num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]); + string locationsFile = toString(getpid()) + ".temp"; + num = driverPcr(filename, goodFileName, badFileName, locationsFile, badSeqNames, lines[0], pstart, pend, adjustNeeded); //force parent to wait until all the processes are done for (int i=0;iopenInputFile(tempFile, in); int numBadNames = 0; string name = ""; - if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); } + int tpstart = -1; int tpend = -1; bool tempAdjust = false; + + if (!in.eof()) { + in >> tpstart >> tpend >> tempAdjust; m->gobble(in); + + if (tempAdjust) { adjustNeeded = true; } + if (tpstart != -1) { + if (tpstart != pstart) { adjustNeeded = true; } + if (tpstart < pstart) { pstart = tpstart; } //smallest start + } + if (tpend != -1) { + if (tpend != pend) { adjustNeeded = true; } + if (tpend > pend) { pend = tpend; } //largest end + } + int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); + } for (int j = 0; j < numBadNames; j++) { in >> name; m->gobble(in); badSeqNames.insert(name); @@ -482,6 +505,9 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName); m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((toString(processIDS[i]) + ".temp"), locationsFile); + m->mothurRemove((toString(processIDS[i]) + ".temp")); } #else @@ -495,6 +521,11 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string DWORD dwThreadIdArray[processors-1]; HANDLE hThreadArray[processors-1]; + string locationsFile = "locationsFile.txt"; + m->mothurRemove(locationsFile); + m->mothurRemove(goodFileName); + m->mothurRemove(badFileName); + //Create processor worker threads. for( int i=0; icount; + if (pDataArray[i]->count != pDataArray[i]->fend) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + } + if (pDataArray[i]->adjustNeeded) { adjustNeeded = true; } + if (pDataArray[i]->pstart != -1) { + if (pDataArray[i]->pstart != pstart) { adjustNeeded = true; } + if (pDataArray[i]->pstart < pstart) { pstart = pDataArray[i]->pstart; } + } //smallest start + if (pDataArray[i]->pend != -1) { + if (pDataArray[i]->pend != pend) { adjustNeeded = true; } + if (pDataArray[i]->pend > pend) { pend = pDataArray[i]->pend; } + } //largest end + for (set::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); } CloseHandle(hThreadArray[i]); delete pDataArray[i]; @@ -530,9 +574,13 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName); m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((locationsFile+toString(processIDS[i]) + ".temp"), locationsFile); + m->mothurRemove((locationsFile+toString(processIDS[i]) + ".temp")); } #endif + if (fileAligned && adjustNeeded) { adjustDots(goodFileName, locationsFile, pstart, pend); } return num; @@ -544,13 +592,16 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string } //********************************************************************************************************************** -int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set& badSeqNames, linePair filePos){ +int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, string locationsName, set& badSeqNames, linePair filePos, int& pstart, int& pend, bool& adjustNeeded){ try { ofstream goodFile; m->openOutputFile(goodFasta, goodFile); ofstream badFile; m->openOutputFile(badFasta, badFile); + + ofstream locationsFile; + m->openOutputFile(locationsName, locationsFile); ifstream inFASTA; m->openInputFile(filename, inFASTA); @@ -560,6 +611,12 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta bool done = false; int count = 0; set lengths; + vector< set > locations; //locations[0] = beginning locations, locations[1] = ending locations + locations.resize(2); + + //pdiffs, bdiffs, primers, barcodes, revPrimers + map faked; + TrimOligos trim(pdiffs, 0, primers, faked, revPrimer); while (!done) { @@ -567,9 +624,19 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta Sequence currSeq(inFASTA); m->gobble(inFASTA); + if (fileAligned) { //assume aligned until proven otherwise + lengths.insert(currSeq.getAligned().length()); + if (lengths.size() > 1) { fileAligned = false; } + } + string trashCode = ""; + string locationsString = ""; + int thisPStart = -1; + int thisPEnd = -1; if (currSeq.getName() != "") { + if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); } + bool goodSeq = true; if (oligosfile != "") { map mapAligned; @@ -578,20 +645,33 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //process primers if (primers.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = findForward(currSeq, primerStart, primerEnd); + bool good = trim.findForward(currSeq, primerStart, primerEnd); if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; } else{ //are you aligned if (aligned) { - if (!keepprimer) { - if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); } - else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); } + if (!keepprimer) { + if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd-1]+1); } //mapAligned[primerEnd-1] is the location of the last base in the primer. we want to trim to the space just after that. The -1 & +1 ensures if the primer is followed by gaps they are not trimmed causing an aligned sequence dataset to become unaligned. + else { + currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd-1]+1)); + if (fileAligned) { + thisPStart = mapAligned[primerEnd-1]+1; //locations[0].insert(mapAligned[primerEnd-1]+1); + locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n"; + } + } } else { if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); } - else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); } + else { + currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); + if (fileAligned) { + thisPStart = mapAligned[primerStart]; //locations[0].insert(mapAligned[primerStart]); + locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n"; + } + } } + isAligned(currSeq.getAligned(), mapAligned); }else { if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } @@ -602,18 +682,30 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //process reverse primers if (revPrimer.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = findReverse(currSeq, primerStart, primerEnd); + bool good = trim.findReverse(currSeq, primerStart, primerEnd); if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; } - else{ + else{ //are you aligned if (aligned) { if (!keepprimer) { if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); } - else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); } + else { + currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); + if (fileAligned) { + thisPEnd = mapAligned[primerStart]; //locations[1].insert(mapAligned[primerStart]); + locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n"; + } + } } else { - if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); } - else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); } + if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd-1]+1); } + else { + currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd-1]+1)); + if (fileAligned) { + thisPEnd = mapAligned[primerEnd-1]+1; //locations[1].insert(mapAligned[primerEnd-1]+1); + locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n"; + } + } } } else { @@ -624,8 +716,7 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } }else if (ecolifile != "") { //make sure the seqs are aligned - lengths.insert(currSeq.getAligned().length()); - if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } + if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } else if (currSeq.getAligned().length() != length) { m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break; }else { @@ -640,8 +731,7 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } }else{ //using start and end to trim //make sure the seqs are aligned - lengths.insert(currSeq.getAligned().length()); - if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } + if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } else { if (end != -1) { if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; } @@ -666,7 +756,13 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //trimming removed all bases if (currSeq.getUnaligned() == "") { goodSeq = false; } - if(goodSeq == 1) { currSeq.printSequence(goodFile); } + if(goodSeq == 1) { + currSeq.printSequence(goodFile); + if (m->debug) { m->mothurOut("[DEBUG]: " + locationsString + "\n"); } + if (locationsString != "") { locationsFile << locationsString; } + if (thisPStart != -1) { locations[0].insert(thisPStart); } + if (thisPEnd != -1) { locations[1].insert(thisPEnd); } + } else { badSeqNames.insert(currSeq.getName()); currSeq.setName(currSeq.getName() + '|' + trashCode); @@ -683,15 +779,24 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta #endif //report progress - if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); } } //report progress - if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); } badFile.close(); goodFile.close(); inFASTA.close(); - + locationsFile.close(); + + if (m->debug) { m->mothurOut("[DEBUG]: fileAligned = " + toString(fileAligned) +'\n'); } + + if (fileAligned && !keepdots) { //print out smallest start value and largest end value + if ((locations[0].size() > 1) || (locations[1].size() > 1)) { adjustNeeded = true; } + if (primers.size() != 0) { set::iterator it = locations[0].begin(); pstart = *it; } + if (revPrimer.size() != 0) { set::reverse_iterator it2 = locations[1].rbegin(); pend = *it2; } + } + return count; } catch(exception& e) { @@ -700,75 +805,9 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } } //********************************************************************/ -bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - - for(int j=0;jcontrol_pressed) { primerStart = 0; primerEnd = 0; return false; } - string rawChunk = rawSequence.substr(j, olength); - if(compareDNASeq(oligo, rawChunk)) { - primerStart = j; - primerEnd = primerStart + olength; - return true; - } - - } - } - - primerStart = 0; primerEnd = 0; - return false; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripForward"); - exit(1); - } -} -//******************************************************************/ -bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - - for(int i=0;i= 0; j--){ - if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } - string rawChunk = rawSequence.substr(j, olength); - - if(compareDNASeq(oligo, rawChunk)) { - primerStart = j; - primerEnd = primerStart + olength; - return true; - } - - } - } - - primerStart = 0; primerEnd = 0; - return false; - } - catch(exception& e) { - m->errorOut(e, "PcrSeqsCommand", "findReverse"); - exit(1); - } -} -//********************************************************************/ bool PcrSeqsCommand::isAligned(string seq, map& aligned){ try { + aligned.clear(); bool isAligned = false; int countBases = 0; @@ -784,6 +823,80 @@ bool PcrSeqsCommand::isAligned(string seq, map& aligned){ exit(1); } } +//********************************************************************************************************************** +int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, int pend){ + try { + ifstream inFasta; + m->openInputFile(goodFasta, inFasta); + + ifstream inLocations; + m->openInputFile(locations, inLocations); + + ofstream out; + m->openOutputFile(goodFasta+".temp", out); + + set lengths; + //cout << pstart << '\t' << pend << endl; + + while(!inFasta.eof()) { + if(m->control_pressed) { break; } + + Sequence seq(inFasta); m->gobble(inFasta); + + string name = ""; + int thisStart = -1; int thisEnd = -1; + if (primers.size() != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); } + if (revPrimer.size() != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); } + //cout << seq.getName() << '\t' << thisStart << '\t' << thisEnd << '\t' << seq.getAligned().length() << endl; + //cout << seq.getName() << '\t' << pstart << '\t' << pend << endl; + + if (name != seq.getName()) { m->mothurOut("[ERROR]: name mismatch in pcr.seqs.\n"); } + else { + if (pstart != -1) { + if (thisStart != -1) { + if (thisStart != pstart) { + string dots = ""; + for (int i = pstart; i < thisStart; i++) { dots += "."; } + thisEnd += dots.length(); + dots += seq.getAligned(); + seq.setAligned(dots); + } + } + } + + if (pend != -1) { + if (thisEnd != -1) { + if (thisEnd != pend) { + string dots = seq.getAligned(); + for (int i = thisEnd; i < pend; i++) { dots += "."; } + seq.setAligned(dots); + } + } + } + lengths.insert(seq.getAligned().length()); + } + + seq.printSequence(out); + } + inFasta.close(); + inLocations.close(); + out.close(); + m->mothurRemove(locations); + m->mothurRemove(goodFasta); + m->renameFile(goodFasta+".temp", goodFasta); + + //cout << "final lengths = \n"; + //for (set::iterator it = lengths.begin(); it != lengths.end(); it++) { + // cout << *it << endl; + // } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "adjustDots"); + exit(1); + } +} //********************************************************************/ string PcrSeqsCommand::reverseOligo(string oligo){ try { @@ -832,6 +945,7 @@ bool PcrSeqsCommand::readOligos(){ m->openInputFile(oligosfile, inOligos); string type, oligo, group; + int primerCount = 0; while(!inOligos.eof()){ @@ -856,18 +970,39 @@ bool PcrSeqsCommand::readOligos(){ // get rest of line in case there is a primer name while (!inOligos.eof()) { char c = inOligos.get(); - if (c == 10 || c == 13){ break; } + if (c == 10 || c == 13 || c == -1){ break; } else if (c == 32 || c == 9){;} //space or tab } - primers.push_back(oligo); + primers[oligo] = primerCount; primerCount++; + //cout << "for oligo = " << oligo << endl; }else if(type == "REVERSE"){ string oligoRC = reverseOligo(oligo); revPrimer.push_back(oligoRC); - //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl; + //cout << "rev oligo = " << oligo << " reverse = " << oligoRC << endl; }else if(type == "BARCODE"){ - inOligos >> group; + inOligos >> group; + }else if(type == "PRIMER"){ + m->gobble(inOligos); + primers[oligo] = primerCount; primerCount++; + + string roligo=""; + inOligos >> roligo; + + for(int i=0;imothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; } + else{ m->mothurOut(type + " is not recognized as a valid type. Choices are primer, forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; } } m->gobble(inOligos); } @@ -914,7 +1049,9 @@ int PcrSeqsCommand::writeAccnos(set badNames){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos"); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)); + string outputFileName = getOutputFileName("accnos",variables); outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName); ofstream out; @@ -933,50 +1070,16 @@ int PcrSeqsCommand::writeAccnos(set badNames){ exit(1); } -} -//******************************************************************/ -bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "PcrSeqsCommand", "compareDNASeq"); - exit(1); - } - } //*************************************************************************************************************** int PcrSeqsCommand::readName(set& names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(namefile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile)); + variables["[extension]"] = m->getExtension(namefile); + string outputFileName = getOutputFileName("name", variables); ofstream out; m->openOutputFile(outputFileName, out); @@ -1034,7 +1137,10 @@ int PcrSeqsCommand::readGroup(set names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)); + variables["[extension]"] = m->getExtension(groupfile); + string outputFileName = getOutputFileName("group", variables); ofstream out; m->openOutputFile(outputFileName, out); @@ -1081,7 +1187,11 @@ int PcrSeqsCommand::readTax(set names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + getOutputFileNameTag("taxonomy", taxfile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)); + variables["[extension]"] = m->getExtension(taxfile); + string outputFileName = getOutputFileName("taxonomy", variables); + ofstream out; m->openOutputFile(outputFileName, out); @@ -1128,7 +1238,11 @@ int PcrSeqsCommand::readCount(set badSeqNames){ m->openInputFile(countfile, in); set::iterator it; - string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile); + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[extension]"] = m->getExtension(countfile); + string goodCountFile = getOutputFileName("count", variables); + outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile); ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut); @@ -1161,7 +1275,7 @@ int PcrSeqsCommand::readCount(set badSeqNames){ //check for groups that have been eliminated CountTable ct; if (ct.testGroups(goodCountFile)) { - ct.readTable(goodCountFile); + ct.readTable(goodCountFile, true); ct.printTable(goodCountFile); }