X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=prcseqscommand.cpp;h=a1eae4687c4d49b2ce8e0f90895a36a994aec689;hp=afedc746360b495850d588b6c7ce3839ec3f9256;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=9aa36ad8297141ef9fcab04fea10e96d2fed26fe diff --git a/prcseqscommand.cpp b/prcseqscommand.cpp index afedc74..a1eae46 100644 --- a/prcseqscommand.cpp +++ b/prcseqscommand.cpp @@ -11,21 +11,23 @@ //********************************************************************************************************************** vector PcrSeqsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax); - CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli); - CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart); - CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend); - CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch); - CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer); - CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta); + CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup); + CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax); + CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli); + CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart); + CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend); + CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); + + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer); + CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -40,8 +42,16 @@ vector PcrSeqsCommand::setParameters(){ string PcrSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The pcr.seqs command reads a fasta file ...\n"; - + helpString += "The pcr.seqs command reads a fasta file.\n"; + helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n"; + helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n"; + helpString += "The start parameter allows you to provide a starting position to trim to.\n"; + helpString += "The end parameter allows you to provide a ending position to trim from.\n"; + helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n"; + helpString += "The processors parameter allows you to use multiple processors.\n"; + helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n"; + helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n"; + helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"; helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"; helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n"; return helpString; @@ -51,8 +61,26 @@ string PcrSeqsCommand::getHelpString(){ exit(1); } } - - +//********************************************************************************************************************** +string PcrSeqsCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; } + else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; } + else if (type == "name") { pattern = "[filename],pcr,[extension]"; } + else if (type == "group") { pattern = "[filename],pcr,[extension]"; } + else if (type == "count") { pattern = "[filename],pcr,[extension]"; } + else if (type == "accnos") { pattern = "[filename],bad.accnos"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "getOutputPattern"); + exit(1); + } +} //********************************************************************************************************************** PcrSeqsCommand::PcrSeqsCommand(){ @@ -64,6 +92,7 @@ PcrSeqsCommand::PcrSeqsCommand(){ outputTypes["taxonomy"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; outputTypes["accnos"] = tempOutNames; } catch(exception& e) { @@ -103,6 +132,7 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; outputTypes["accnos"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -156,11 +186,17 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } - //if the user changes the output directory command factory will send this info to us in the output parameter - outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } //check for required parameters fastafile = validParameter.validFile(parameters, "fasta", true); @@ -171,7 +207,9 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { }else if (fastafile == "not open") { fastafile = ""; abort = true; } else { m->setFastaFile(fastafile); } - + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; @@ -200,14 +238,24 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { else if(groupfile == "not open"){ groupfile = ""; abort = true; } else { m->setGroupFile(groupfile); } + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { countfile = ""; abort = true; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((namefile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true; + } + + if ((groupfile != "") && (countfile != "")) { + m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true; + } + taxfile = validParameter.validFile(parameters, "taxonomy", true); if (taxfile == "not found"){ taxfile = ""; } else if(taxfile == "not open"){ taxfile = ""; abort = true; } else { m->setTaxonomyFile(taxfile); } - - temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } - m->mothurConvert(temp, pdiffs); - + temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; } m->mothurConvert(temp, start); @@ -217,6 +265,9 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); m->mothurConvert(temp, processors); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + m->mothurConvert(temp, pdiffs); nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; } @@ -239,10 +290,12 @@ PcrSeqsCommand::PcrSeqsCommand(string option) { } //check to make sure you didn't forget the name file by mistake - if (namefile == "") { - vector files; files.push_back(fastafile); - parser.getNameFile(files); - } + if (countfile == "") { + if (namefile == "") { + vector files; files.push_back(fastafile); + parser.getNameFile(files); + } + } } } @@ -259,17 +312,21 @@ int PcrSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } int start = time(NULL); + fileAligned = true; string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta"; + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)); + variables["[extension]"] = m->getExtension(fastafile); + string trimSeqFile = getOutputFileName("fasta",variables); outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); - - string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta"; - outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); + variables["[tag]"] = "scrap"; + string badSeqFile = getOutputFileName("fasta",variables); + length = 0; - if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; } + if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(primers.size()) + ", revprimers = " + toString(revPrimer.size()) + ".\n"); } } if (m->control_pressed) { return 0; } if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; } vector positions; @@ -296,14 +353,14 @@ int PcrSeqsCommand::execute(){ if (m->control_pressed) { return 0; } set badNames; - if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); } - else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); } + numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); if (m->control_pressed) { return 0; } //don't write or keep if blank if (badNames.size() != 0) { writeAccnos(badNames); } if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); } + else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (namefile != "") { readName(badNames); } @@ -312,7 +369,9 @@ int PcrSeqsCommand::execute(){ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } if (taxfile != "") { readTax(badNames); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } - + if (countfile != "") { readCount(badNames); } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } @@ -346,6 +405,11 @@ int PcrSeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); @@ -365,6 +429,8 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string vector processIDS; int process = 1; int num = 0; + int pstart = -1; int pend = -1; + bool adjustNeeded = false; #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) @@ -376,12 +442,14 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]); + string locationsFile = toString(getpid()) + ".temp"; + num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", locationsFile, badSeqNames, lines[process], pstart, adjustNeeded); //pass numSeqs to parent ofstream out; string tempFile = filename + toString(getpid()) + ".num.temp"; m->openOutputFile(tempFile, out); + out << pstart << '\t' << adjustNeeded << endl; out << num << '\t' << badSeqNames.size() << endl; for (set::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) { out << (*it) << endl; @@ -396,7 +464,8 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string } } - num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]); + string locationsFile = toString(getpid()) + ".temp"; + num = driverPcr(filename, goodFileName, badFileName, locationsFile, badSeqNames, lines[0], pstart, adjustNeeded); //force parent to wait until all the processes are done for (int i=0;iopenInputFile(tempFile, in); int numBadNames = 0; string name = ""; - if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); } + int tpstart = -1; bool tempAdjust = false; + + if (!in.eof()) { + in >> tpstart >> tempAdjust; m->gobble(in); + + if (tempAdjust) { adjustNeeded = true; } + if (tpstart != -1) { + if (tpstart != pstart) { adjustNeeded = true; } + if (tpstart < pstart) { pstart = tpstart; } //smallest start + } + int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); + } for (int j = 0; j < numBadNames; j++) { in >> name; m->gobble(in); badSeqNames.insert(name); @@ -421,6 +501,9 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName); m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((toString(processIDS[i]) + ".temp"), locationsFile); + m->mothurRemove((toString(processIDS[i]) + ".temp")); } #else @@ -434,6 +517,11 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string DWORD dwThreadIdArray[processors-1]; HANDLE hThreadArray[processors-1]; + string locationsFile = "locationsFile.txt"; + m->mothurRemove(locationsFile); + m->mothurRemove(goodFileName); + m->mothurRemove(badFileName); + //Create processor worker threads. for( int i=0; icount; + if (pDataArray[i]->count != pDataArray[i]->fend) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + } + if (pDataArray[i]->adjustNeeded) { adjustNeeded = true; } + if (pDataArray[i]->pstart != -1) { + if (pDataArray[i]->pstart != pstart) { adjustNeeded = true; } + if (pDataArray[i]->pstart < pstart) { pstart = pDataArray[i]->pstart; } + } //smallest start + for (set::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); } CloseHandle(hThreadArray[i]); delete pDataArray[i]; @@ -469,10 +566,50 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName); m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp")); + + m->appendFiles((locationsFile+toString(processIDS[i]) + ".temp"), locationsFile); + m->mothurRemove((locationsFile+toString(processIDS[i]) + ".temp")); } #endif + + + if (fileAligned) { + //find pend - pend is the biggest ending value, but we must account for when we adjust the start. That adjustment may make the "new" end larger then the largest end. So lets find out what that "new" end will be. + ifstream inLocations; + m->openInputFile(locationsFile, inLocations); + + while(!inLocations.eof()) { + + if (m->control_pressed) { break; } + + string name = ""; + int thisStart = -1; int thisEnd = -1; + if (primers.size() != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); } + if (revPrimer.size() != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); } + else { pend = -1; break; } + + int myDiff = 0; + if (pstart != -1) { + if (thisStart != -1) { + if (thisStart != pstart) { myDiff += (thisStart - pstart); } + } + } + + int myEnd = thisEnd + myDiff; + //cout << name << '\t' << thisStart << '\t' << thisEnd << " diff = " << myDiff << '\t' << myEnd << endl; + + if (thisEnd != -1) { + if (myEnd > pend) { pend = myEnd; } + } + + } + inLocations.close(); + + adjustDots(goodFileName, locationsFile, pstart, pend); + } + return num; } @@ -483,13 +620,16 @@ int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string } //********************************************************************************************************************** -int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set& badSeqNames, linePair filePos){ +int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, string locationsName, set& badSeqNames, linePair filePos, int& pstart, bool& adjustNeeded){ try { ofstream goodFile; m->openOutputFile(goodFasta, goodFile); ofstream badFile; m->openOutputFile(badFasta, badFile); + + ofstream locationsFile; + m->openOutputFile(locationsName, locationsFile); ifstream inFASTA; m->openInputFile(filename, inFASTA); @@ -499,6 +639,11 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta bool done = false; int count = 0; set lengths; + set locations; //locations[0] = beginning locations, + + //pdiffs, bdiffs, primers, barcodes, revPrimers + map faked; + TrimOligos trim(pdiffs, 0, primers, faked, revPrimer); while (!done) { @@ -506,9 +651,19 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta Sequence currSeq(inFASTA); m->gobble(inFASTA); + if (fileAligned) { //assume aligned until proven otherwise + lengths.insert(currSeq.getAligned().length()); + if (lengths.size() > 1) { fileAligned = false; } + } + string trashCode = ""; + string locationsString = ""; + int thisPStart = -1; + int thisPEnd = -1; if (currSeq.getName() != "") { + if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); } + bool goodSeq = true; if (oligosfile != "") { map mapAligned; @@ -517,20 +672,33 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //process primers if (primers.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = findForward(currSeq, primerStart, primerEnd); + bool good = trim.findForward(currSeq, primerStart, primerEnd); if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; } else{ //are you aligned if (aligned) { - if (!keepprimer) { - if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); } - else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); } + if (!keepprimer) { + if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd-1]+1); } //mapAligned[primerEnd-1] is the location of the last base in the primer. we want to trim to the space just after that. The -1 & +1 ensures if the primer is followed by gaps they are not trimmed causing an aligned sequence dataset to become unaligned. + else { + currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd-1]+1)); + if (fileAligned) { + thisPStart = mapAligned[primerEnd-1]+1; //locations[0].insert(mapAligned[primerEnd-1]+1); + locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n"; + } + } } else { if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); } - else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); } + else { + currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); + if (fileAligned) { + thisPStart = mapAligned[primerStart]; //locations[0].insert(mapAligned[primerStart]); + locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n"; + } + } } + isAligned(currSeq.getAligned(), mapAligned); }else { if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); } else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); } @@ -541,18 +709,30 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta //process reverse primers if (revPrimer.size() != 0) { int primerStart = 0; int primerEnd = 0; - bool good = findReverse(currSeq, primerStart, primerEnd); + bool good = trim.findReverse(currSeq, primerStart, primerEnd); if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; } - else{ + else{ //are you aligned if (aligned) { if (!keepprimer) { if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); } - else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); } + else { + currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); + if (fileAligned) { + thisPEnd = mapAligned[primerStart]; //locations[1].insert(mapAligned[primerStart]); + locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n"; + } + } } else { - if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); } - else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); } + if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd-1]+1); } + else { + currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd-1]+1)); + if (fileAligned) { + thisPEnd = mapAligned[primerEnd-1]+1; //locations[1].insert(mapAligned[primerEnd-1]+1); + locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n"; + } + } } } else { @@ -563,8 +743,7 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } }else if (ecolifile != "") { //make sure the seqs are aligned - lengths.insert(currSeq.getAligned().length()); - if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } + if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } else if (currSeq.getAligned().length() != length) { m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break; }else { @@ -579,8 +758,7 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } }else{ //using start and end to trim //make sure the seqs are aligned - lengths.insert(currSeq.getAligned().length()); - if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } + if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; } else { if (end != -1) { if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; } @@ -601,8 +779,16 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } } } - - if(goodSeq == 1) { currSeq.printSequence(goodFile); } + + //trimming removed all bases + if (currSeq.getUnaligned() == "") { goodSeq = false; } + + if(goodSeq == 1) { + currSeq.printSequence(goodFile); + if (m->debug) { m->mothurOut("[DEBUG]: " + locationsString + "\n"); } + if (thisPStart != -1) { locations.insert(thisPStart); } + if (locationsString != "") { locationsFile << locationsString; } + } else { badSeqNames.insert(currSeq.getName()); currSeq.setName(currSeq.getName() + '|' + trashCode); @@ -619,15 +805,23 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta #endif //report progress - if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); } } //report progress - if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); } badFile.close(); goodFile.close(); inFASTA.close(); - + locationsFile.close(); + + if (m->debug) { m->mothurOut("[DEBUG]: fileAligned = " + toString(fileAligned) +'\n'); } + + if (fileAligned && !keepdots) { //print out smallest start value and largest end value + if (locations.size() > 1) { adjustNeeded = true; } + if (primers.size() != 0) { set::iterator it = locations.begin(); pstart = *it; } + } + return count; } catch(exception& e) { @@ -636,75 +830,9 @@ int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta } } //********************************************************************/ -bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - - for(int j=0;jcontrol_pressed) { primerStart = 0; primerEnd = 0; return false; } - string rawChunk = rawSequence.substr(j, olength); - if(compareDNASeq(oligo, rawChunk)) { - primerStart = j; - primerEnd = primerStart + olength; - return true; - } - - } - } - - primerStart = 0; primerEnd = 0; - return false; - - } - catch(exception& e) { - m->errorOut(e, "TrimOligos", "stripForward"); - exit(1); - } -} -//******************************************************************/ -bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ - try { - - string rawSequence = seq.getUnaligned(); - - for(int i=0;i= 0; j--){ - if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; } - string rawChunk = rawSequence.substr(j, olength); - - if(compareDNASeq(oligo, rawChunk)) { - primerStart = j; - primerEnd = primerStart + olength; - return true; - } - - } - } - - primerStart = 0; primerEnd = 0; - return false; - } - catch(exception& e) { - m->errorOut(e, "PcrSeqsCommand", "findReverse"); - exit(1); - } -} -//********************************************************************/ bool PcrSeqsCommand::isAligned(string seq, map& aligned){ try { + aligned.clear(); bool isAligned = false; int countBases = 0; @@ -720,6 +848,84 @@ bool PcrSeqsCommand::isAligned(string seq, map& aligned){ exit(1); } } +//********************************************************************************************************************** +int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, int pend){ + try { + ifstream inFasta; + m->openInputFile(goodFasta, inFasta); + + ifstream inLocations; + m->openInputFile(locations, inLocations); + + ofstream out; + m->openOutputFile(goodFasta+".temp", out); + + set lengths; + //cout << pstart << '\t' << pend << endl; + //if (pstart > pend) { //swap them + + while(!inFasta.eof()) { + if(m->control_pressed) { break; } + + Sequence seq(inFasta); m->gobble(inFasta); + + string name = ""; + int thisStart = -1; int thisEnd = -1; + if (primers.size() != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); } + if (revPrimer.size() != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); } + + + //cout << seq.getName() << '\t' << thisStart << '\t' << thisEnd << '\t' << seq.getAligned().length() << endl; + //cout << seq.getName() << '\t' << pstart << '\t' << pend << endl; + + if (name != seq.getName()) { m->mothurOut("[ERROR]: name mismatch in pcr.seqs.\n"); } + else { + if (pstart != -1) { + if (thisStart != -1) { + if (thisStart != pstart) { + string dots = ""; + for (int i = pstart; i < thisStart; i++) { dots += "."; } + thisEnd += dots.length(); + dots += seq.getAligned(); + seq.setAligned(dots); + } + } + } + + if (pend != -1) { + if (thisEnd != -1) { + if (thisEnd != pend) { + string dots = seq.getAligned(); + for (int i = thisEnd; i < pend; i++) { dots += "."; } + seq.setAligned(dots); + } + } + } + lengths.insert(seq.getAligned().length()); + } + + seq.printSequence(out); + } + inFasta.close(); + inLocations.close(); + out.close(); + m->mothurRemove(locations); + m->mothurRemove(goodFasta); + m->renameFile(goodFasta+".temp", goodFasta); + + //cout << "final lengths = \n"; + //for (set::iterator it = lengths.begin(); it != lengths.end(); it++) { + //cout << *it << endl; + // cout << lengths.count(*it) << endl; + // } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "adjustDots"); + exit(1); + } +} //********************************************************************/ string PcrSeqsCommand::reverseOligo(string oligo){ try { @@ -768,6 +974,7 @@ bool PcrSeqsCommand::readOligos(){ m->openInputFile(oligosfile, inOligos); string type, oligo, group; + int primerCount = 0; while(!inOligos.eof()){ @@ -792,18 +999,39 @@ bool PcrSeqsCommand::readOligos(){ // get rest of line in case there is a primer name while (!inOligos.eof()) { char c = inOligos.get(); - if (c == 10 || c == 13){ break; } + if (c == 10 || c == 13 || c == -1){ break; } else if (c == 32 || c == 9){;} //space or tab } - primers.push_back(oligo); + primers[oligo] = primerCount; primerCount++; + //cout << "for oligo = " << oligo << endl; }else if(type == "REVERSE"){ string oligoRC = reverseOligo(oligo); revPrimer.push_back(oligoRC); - //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl; + //cout << "rev oligo = " << oligo << " reverse = " << oligoRC << endl; }else if(type == "BARCODE"){ - inOligos >> group; + inOligos >> group; + }else if(type == "PRIMER"){ + m->gobble(inOligos); + primers[oligo] = primerCount; primerCount++; + + string roligo=""; + inOligos >> roligo; + + for(int i=0;imothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; } + else{ m->mothurOut(type + " is not recognized as a valid type. Choices are primer, forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; } } m->gobble(inOligos); } @@ -850,7 +1078,9 @@ int PcrSeqsCommand::writeAccnos(set badNames){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos"; + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)); + string outputFileName = getOutputFileName("accnos",variables); outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName); ofstream out; @@ -869,50 +1099,16 @@ int PcrSeqsCommand::writeAccnos(set badNames){ exit(1); } -} -//******************************************************************/ -bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "PcrSeqsCommand", "compareDNASeq"); - exit(1); - } - } //*************************************************************************************************************** int PcrSeqsCommand::readName(set& names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(namefile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile)); + variables["[extension]"] = m->getExtension(namefile); + string outputFileName = getOutputFileName("name", variables); ofstream out; m->openOutputFile(outputFileName, out); @@ -970,7 +1166,10 @@ int PcrSeqsCommand::readGroup(set names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)); + variables["[extension]"] = m->getExtension(groupfile); + string outputFileName = getOutputFileName("group", variables); ofstream out; m->openOutputFile(outputFileName, out); @@ -1017,7 +1216,11 @@ int PcrSeqsCommand::readTax(set names){ try { string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)); + variables["[extension]"] = m->getExtension(taxfile); + string outputFileName = getOutputFileName("taxonomy", variables); + ofstream out; m->openOutputFile(outputFileName, out); @@ -1057,6 +1260,67 @@ int PcrSeqsCommand::readTax(set names){ exit(1); } } +//*************************************************************************************************************** +int PcrSeqsCommand::readCount(set badSeqNames){ + try { + ifstream in; + m->openInputFile(countfile, in); + set::iterator it; + + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[extension]"] = m->getExtension(countfile); + string goodCountFile = getOutputFileName("count", variables); + + outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile); + ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut); + + string headers = m->getline(in); m->gobble(in); + goodCountOut << headers << endl; + + string name, rest; int thisTotal, removedCount; removedCount = 0; + bool wroteSomething = false; + while (!in.eof()) { + + if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; } + + in >> name; m->gobble(in); + in >> thisTotal; m->gobble(in); + rest = m->getline(in); m->gobble(in); + + if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; } + else{ + wroteSomething = true; + goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl; + } + } + in.close(); + goodCountOut.close(); + + if (m->control_pressed) { m->mothurRemove(goodCountFile); } + + if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); } + + //check for groups that have been eliminated + CountTable ct; + if (ct.testGroups(goodCountFile)) { + ct.readTable(goodCountFile, true, false); + ct.printTable(goodCountFile); + } + + if (m->control_pressed) { m->mothurRemove(goodCountFile); } + + m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine(); + + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "PcrSeqsCommand", "readCOunt"); + exit(1); + } +} /**************************************************************************************/