X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=64d9d9ec161b5615ef59f3ae57429e38bdd49cdd;hb=4542a79d20176ff0ee830b0f9d4f4c92637439d9;hp=ae9f707aa2fe5ace59e59d1697a8bf6e50319421;hpb=d6c0a11d1cecfac18b323285e7ffadb7f58e848f;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index ae9f707..64d9d9e 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -11,37 +11,39 @@ #include "needlemanoverlap.hpp" #include "trimoligos.h" + //********************************************************************************************************************** vector TrimSeqsCommand::setParameters(){ try { - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos); - CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); - CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig); - CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop); - CommandParameter pminlength("minlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pminlength); - CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxlength); - CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs); - CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pbdiffs); - CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pldiffs); - CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(psdiffs); - CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ptdiffs); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pallfiles); - CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepforward); - CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pqtrim); - CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqthreshold); - CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqaverage); - CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(prollaverage); - CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "",false,false); parameters.push_back(pqwindowaverage); - CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "",false,false); parameters.push_back(pqstepsize); - CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "",false,false); parameters.push_back(pqwindowsize); - CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "",false,false); parameters.push_back(pkeepfirst); - CommandParameter premovelast("removelast", "Number", "", "0", "", "", "",false,false); parameters.push_back(premovelast); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta); + CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","group",false,false,true); parameters.push_back(poligos); + CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none","qfile",false,false,true); parameters.push_back(pqfile); + CommandParameter pname("name", "InputTypes", "", "", "namecount", "none", "none","name",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "namecount", "none", "none","count",false,false,true); parameters.push_back(pcount); + CommandParameter pflip("flip", "Boolean", "", "F", "", "", "","",false,false,true); parameters.push_back(pflip); + CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pmaxambig); + CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxhomop); + CommandParameter pminlength("minlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pminlength); + CommandParameter pmaxlength("maxlength", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pmaxlength); + CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs); + CommandParameter pbdiffs("bdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pbdiffs); + CommandParameter pldiffs("ldiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pldiffs); + CommandParameter psdiffs("sdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(psdiffs); + CommandParameter ptdiffs("tdiffs", "Number", "", "0", "", "", "","",false,false); parameters.push_back(ptdiffs); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles); + CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepforward); + CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pqtrim); + CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqthreshold); + CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqaverage); + CommandParameter prollaverage("rollaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(prollaverage); + CommandParameter pqwindowaverage("qwindowaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqwindowaverage); + CommandParameter pqstepsize("qstepsize", "Number", "", "1", "", "", "","",false,false); parameters.push_back(pqstepsize); + CommandParameter pqwindowsize("qwindowsize", "Number", "", "50", "", "", "","",false,false); parameters.push_back(pqwindowsize); + CommandParameter pkeepfirst("keepfirst", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pkeepfirst); + CommandParameter premovelast("removelast", "Number", "", "0", "", "", "","",false,false); parameters.push_back(premovelast); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -58,11 +60,12 @@ string TrimSeqsCommand::getHelpString(){ string helpString = ""; helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n"; helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n"; - helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; + helpString += "The trim.seqs command parameters are fasta, name, count, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"; helpString += "The oligos parameter allows you to provide an oligos file.\n"; helpString += "The name parameter allows you to provide a names file with your fasta file.\n"; + helpString += "The count parameter allows you to provide a count file with your fasta file.\n"; helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; @@ -97,8 +100,25 @@ string TrimSeqsCommand::getHelpString(){ exit(1); } } - - +//********************************************************************************************************************** +string TrimSeqsCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "qfile") { pattern = "[filename],[tag],qual"; } + else if (type == "fasta") { pattern = "[filename],[tag],fasta"; } + else if (type == "group") { pattern = "[filename],groups"; } + else if (type == "name") { pattern = "[filename],[tag],names"; } + else if (type == "count") { pattern = "[filename],count_table"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getOutputPattern"); + exit(1); + } +} //********************************************************************************************************************** TrimSeqsCommand::TrimSeqsCommand(){ @@ -110,6 +130,7 @@ TrimSeqsCommand::TrimSeqsCommand(){ outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); @@ -148,6 +169,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; outputTypes["name"] = tempOutNames; + outputTypes["count"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -185,6 +207,14 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["name"] = inputDir + it->second; } } + + it = parameters.find("count"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["count"] = inputDir + it->second; } + } } @@ -256,6 +286,13 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { if (temp == "not found") { nameFile = ""; } else if(temp == "not open") { nameFile = ""; abort = true; } else { nameFile = temp; m->setNameFile(nameFile); } + + countfile = validParameter.validFile(parameters, "count", true); + if (countfile == "not open") { abort = true; countfile = ""; } + else if (countfile == "not found") { countfile = ""; } + else { m->setCountTableFile(countfile); } + + if ((countfile != "") && (nameFile != "")) { m->mothurOut("You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } m->mothurConvert(temp, qThreshold); @@ -308,10 +345,12 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { abort = true; } - if (nameFile == "") { - vector files; files.push_back(fastaFile); - parser.getNameFile(files); - } + if (countfile == "") { + if (nameFile == "") { + vector files; files.push_back(fastaFile); + parser.getNameFile(files); + } + } } } @@ -329,20 +368,25 @@ int TrimSeqsCommand::execute(){ numFPrimers = 0; //this needs to be initialized numRPrimers = 0; + numSpacers = 0; + numLinkers = 0; createGroup = false; vector > fastaFileNames; vector > qualFileNames; vector > nameFileNames; - string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta"; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile)); + variables["[tag]"] = "trim"; + string trimSeqFile = getOutputFileName("fasta",variables); + string trimQualFile = getOutputFileName("qfile",variables); outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); - - string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta"; + + variables["[tag]"] = "scrap"; + string scrapSeqFile = getOutputFileName("fasta",variables); + string scrapQualFile = getOutputFileName("qfile",variables); outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile); - string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual"; - string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual"; - if (qFileName != "") { outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile); @@ -350,8 +394,11 @@ int TrimSeqsCommand::execute(){ outputTypes["qfile"].push_back(scrapQualFile); } - string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names"; - string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names"; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile)); + variables["[tag]"] = "trim"; + string trimNameFile = getOutputFileName("name",variables); + variables["[tag]"] = "scrap"; + string scrapNameFile = getOutputFileName("name",variables); if (nameFile != "") { m->readNames(nameFile, nameMap); @@ -360,38 +407,46 @@ int TrimSeqsCommand::execute(){ outputTypes["name"].push_back(trimNameFile); outputTypes["name"].push_back(scrapNameFile); } + + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile)); + variables["[tag]"] = "trim"; + string trimCountFile = getOutputFileName("count",variables); + variables["[tag]"] = "scrap"; + string scrapCountFile = getOutputFileName("count",variables); + + if (countfile != "") { + CountTable ct; + ct.readTable(countfile); + nameCount = ct.getNameMap(); + outputNames.push_back(trimCountFile); + outputNames.push_back(scrapCountFile); + outputTypes["count"].push_back(trimCountFile); + outputTypes["count"].push_back(scrapCountFile); + } + if (m->control_pressed) { return 0; } string outputGroupFileName; if(oligoFile != ""){ createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames); - if (createGroup) { - outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; + if ((createGroup) && (countfile == "")){ + map myvariables; + myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile)); + outputGroupFileName = getOutputFileName("group",myvariables); outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); } } + + //fills lines and qlines + setLines(fastaFile, qFileName); - vector fastaFilePos; - vector qFilePos; + if(processors == 1){ + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + }else{ + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, trimCountFile, scrapCountFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); + } - setLines(fastaFile, qFileName, fastaFilePos, qFilePos); - - for (int i = 0; i < (fastaFilePos.size()-1); i++) { - lines.push_back(new linePair(fastaFilePos[i], fastaFilePos[(i+1)])); - if (qFileName != "") { qLines.push_back(new linePair(qFilePos[i], qFilePos[(i+1)])); } - } - if(qFileName == "") { qLines = lines; } //files with duds - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1){ - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); - }else{ - createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); - } - #else - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); - #endif if (m->control_pressed) { return 0; } @@ -432,24 +487,64 @@ int TrimSeqsCommand::execute(){ for(int i = 0; i < outputNames.size(); i++) { if (namesToRemove.count(outputNames[i]) == 0) { outputNames2.push_back(outputNames[i]); } } outputNames = outputNames2; - for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) { - ifstream in; - m->openInputFile(it->first, in); - - ofstream out; - string thisGroupName = outputDir + m->getRootName(m->getSimpleName(it->first)) + "groups"; - outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); - m->openOutputFile(thisGroupName, out); - - while (!in.eof()){ - if (m->control_pressed) { break; } - - Sequence currSeq(in); m->gobble(in); - out << currSeq.getName() << '\t' << it->second << endl; - } - in.close(); - out.close(); - } + for (it = uniqueFastaNames.begin(); it != uniqueFastaNames.end(); it++) { + ifstream in; + m->openInputFile(it->first, in); + + ofstream out; + map myvariables; + myvariables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(it->first)); + string thisGroupName = ""; + if (countfile == "") { thisGroupName = getOutputFileName("group",variables); outputNames.push_back(thisGroupName); outputTypes["group"].push_back(thisGroupName); } + else { thisGroupName = getOutputFileName("count",variables); outputNames.push_back(thisGroupName); outputTypes["count"].push_back(thisGroupName); } + m->openOutputFile(thisGroupName, out); + + if (countfile != "") { out << "Representative_Sequence\ttotal\t" << it->second << endl; } + + while (!in.eof()){ + if (m->control_pressed) { break; } + + Sequence currSeq(in); m->gobble(in); + if (countfile == "") { + out << currSeq.getName() << '\t' << it->second << endl; + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + out << thisSeqsNames[k] << '\t' << it->second << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + }else { + map::iterator itTotalReps = nameCount.find(currSeq.getName()); + if (itTotalReps != nameCount.end()) { out << currSeq.getName() << '\t' << itTotalReps->second << '\t' << itTotalReps->second << endl; } + else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } + } + in.close(); + out.close(); + } + + if (countfile != "") { //create countfile with group info included + CountTable* ct = new CountTable(); + ct->readTable(trimCountFile); + map justTrimmedNames = ct->getNameMap(); + delete ct; + + CountTable newCt; + for (map::iterator itCount = groupCounts.begin(); itCount != groupCounts.end(); itCount++) { newCt.addGroup(itCount->first); } + vector tempCounts; tempCounts.resize(groupCounts.size(), 0); + for (map::iterator itNames = justTrimmedNames.begin(); itNames != justTrimmedNames.end(); itNames++) { + newCt.push_back(itNames->first, tempCounts); //add it to the table with no abundance so we can set the groups abundance + map::iterator it2 = groupMap.find(itNames->first); + if (it2 != groupMap.end()) { newCt.setAbund(itNames->first, it2->second, itNames->second); } + else { m->mothurOut("[ERROR]: missing group info for " + itNames->first + "."); m->mothurOutEndLine(); m->control_pressed = true; } + } + newCt.printTable(trimCountFile); + } } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -486,6 +581,11 @@ int TrimSeqsCommand::execute(){ if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); } } + + itTypes = outputTypes.find("count"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); } + } m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); @@ -502,8 +602,7 @@ int TrimSeqsCommand::execute(){ } /**************************************************************************************/ - -int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair* line, linePair* qline) { +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string trimCFileName, string scrapCFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair line, linePair qline) { try { @@ -527,9 +626,16 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string m->openOutputFile(scrapNFileName, scrapNameFile); } + ofstream trimCountFile; + ofstream scrapCountFile; + if(countfile != ""){ + m->openOutputFile(trimCFileName, trimCountFile); + m->openOutputFile(scrapCFileName, scrapCountFile); + if (line.start == 0) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; } + } ofstream outGroupsFile; - if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); } + if ((createGroup) && (countfile == "")){ m->openOutputFile(groupFileName, outGroupsFile); } if(allFiles){ for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file @@ -550,12 +656,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string ifstream inFASTA; m->openInputFile(filename, inFASTA); - inFASTA.seekg(line->start); + inFASTA.seekg(line.start); ifstream qFile; if(qFileName != "") { m->openInputFile(qFileName, qFile); - qFile.seekg(qline->start); + qFile.seekg(qline.start); } int count = 0; @@ -566,14 +672,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (m->control_pressed) { inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (createGroup) { outGroupsFile.close(); } - - if(qFileName != ""){ - qFile.close(); - } - for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } - - return 0; + if ((createGroup) && (countfile == "")) { outGroupsFile.close(); } + if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); } + for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } int success = 1; @@ -582,9 +685,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string Sequence currSeq(inFASTA); m->gobble(inFASTA); //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl; + QualityScores currQual; if(qFileName != ""){ currQual = QualityScores(qFile); m->gobble(qFile); + //cout << currQual.getName() << endl; } string origSeq = currSeq.getUnaligned(); @@ -595,7 +700,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(numLinkers != 0){ success = trimOligos.stripLinker(currSeq, currQual); - if(!success) { trashCode += 'k'; } + if(success > ldiffs) { trashCode += 'k'; } + else{ currentSeqsDiffs += success; } + } if(barcodes.size() != 0){ @@ -606,7 +713,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(numSpacers != 0){ success = trimOligos.stripSpacer(currSeq, currQual); - if(!success) { trashCode += 's'; } + if(success > sdiffs) { trashCode += 's'; } + else{ currentSeqsDiffs += success; } + } if(numFPrimers != 0){ @@ -667,6 +776,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } } + if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } } + if(trashCode.length() == 0){ currSeq.setAligned(currSeq.getUnaligned()); currSeq.printSequence(trimFASTAFile); @@ -675,11 +786,21 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string currQual.printQScores(trimQualFile); } + if(nameFile != ""){ map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } } + + int numRedundants = 0; + if (countfile != "") { + map::iterator itCount = nameCount.find(currSeq.getName()); + if (itCount != nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + numRedundants = itCount->second-1; + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } if (createGroup) { if(barcodes.size() != 0){ @@ -694,13 +815,17 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } } - outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); } + + if (countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; } + else { groupMap[currSeq.getName()] = thisGroup; } if (nameFile != "") { map::iterator itName = nameMap.find(currSeq.getName()); if (itName != nameMap.end()) { vector thisSeqsNames; m->splitAtChar(itName->second, thisSeqsNames, ','); + numRedundants = thisSeqsNames.size()-1; //we already include ourselves below for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; } @@ -708,8 +833,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } map::iterator it = groupCounts.find(thisGroup); - if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; } - else { groupCounts[it->first]++; } + if (it == groupCounts.end()) { groupCounts[thisGroup] = 1 + numRedundants; } + else { groupCounts[it->first] += (1 + numRedundants); } } } @@ -742,6 +867,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } } + if (countfile != "") { + map::iterator itCount = nameCount.find(currSeq.getName()); + if (itCount != nameCount.end()) { + trimCountFile << itCount->first << '\t' << itCount->second << endl; + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); } + } + currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); @@ -753,9 +885,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string count++; } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) unsigned long long pos = inFASTA.tellg(); - if ((pos == -1) || (pos >= line->end)) { break; } + if ((pos == -1) || (pos >= line.end)) { break; } #else if (inFASTA.eof()) { break; } @@ -775,6 +907,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (createGroup) { outGroupsFile.close(); } if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } + if(countfile != "") { scrapCountFile.close(); trimCountFile.close(); } return count; } @@ -786,14 +919,15 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string /**************************************************************************************************/ -int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string trimCountFileName, string scrapCountFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 1; + + int process = 1; int exitCommand = 1; processIDS.clear(); - //loop through and create all the processes you want +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + //loop through and create all the processes you want while (process != processors) { int pid = fork(); @@ -836,12 +970,16 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName (scrapQualFileName + toString(getpid()) + ".temp"), (trimNameFileName + toString(getpid()) + ".temp"), (scrapNameFileName + toString(getpid()) + ".temp"), + (trimCountFileName + toString(getpid()) + ".temp"), + (scrapCountFileName + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[process], qLines[process]); + + if (m->debug) { m->mothurOut("[DEBUG]: " + toString(lines[process].start) + '\t' + toString(qLines[process].start) + '\t' + toString(getpid()) + '\n'); } //pass groupCounts to parent if(createGroup){ @@ -854,6 +992,11 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { out << it->first << '\t' << it->second << endl; } + + out << groupMap.size() << endl; + for (map::iterator it = groupMap.begin(); it != groupMap.end(); it++) { + out << it->first << '\t' << it->second << endl; + } out.close(); } exit(0); @@ -876,16 +1019,148 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName m->openOutputFile(trimNameFileName, temp); temp.close(); m->openOutputFile(scrapNameFileName, temp); temp.close(); } + if (countfile != "") { + m->openOutputFile(trimCountFileName, temp); temp.close(); + m->openOutputFile(scrapCountFileName, temp); temp.close(); + } - driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); + driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, trimCountFileName, scrapCountFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); //force parent to wait until all the processes are done for (int i=0;i pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int h=0; h > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; + + if(allFiles){ + ofstream temp; + + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + if(qFileName != ""){ + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + if(nameFile != ""){ + tempNameFileNames[i][j] += extension; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } + } + } + } + } + + + trimData* tempTrim = new trimData(filename, + qFileName, nameFile, countfile, + (trimFASTAFileName+extension), + (scrapFASTAFileName+extension), + (trimQualFileName+extension), + (scrapQualFileName+extension), + (trimNameFileName+extension), + (scrapNameFileName+extension), + (trimCountFileName+extension), + (scrapCountFileName+extension), + (groupFile+extension), + tempFASTAFileNames, + tempPrimerQualFileNames, + tempNameFileNames, + lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m, + pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, + primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, + qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, + minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap, nameCount); + pDataArray.push_back(tempTrim); + + hThreadArray[h] = CreateThread(NULL, 0, MyTrimThreadFunction, pDataArray[h], 0, &dwThreadIdArray[h]); + } + + //parent do my part + ofstream temp; + m->openOutputFile(trimFASTAFileName, temp); temp.close(); + m->openOutputFile(scrapFASTAFileName, temp); temp.close(); + if(qFileName != ""){ + m->openOutputFile(trimQualFileName, temp); temp.close(); + m->openOutputFile(scrapQualFileName, temp); temp.close(); + } + if (nameFile != "") { + m->openOutputFile(trimNameFileName, temp); temp.close(); + m->openOutputFile(scrapNameFileName, temp); temp.close(); + } + vector > tempFASTAFileNames = fastaFileNames; + vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; + if(allFiles){ + ofstream temp; + string extension = toString(processors-1) + ".temp"; + for(int i=0;iopenOutputFile(tempFASTAFileNames[i][j], temp); temp.close(); + + if(qFileName != ""){ + tempPrimerQualFileNames[i][j] += extension; + m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); + } + if(nameFile != ""){ + tempNameFileNames[i][j] += extension; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } + } + } + } + } + + driverCreateTrim(filename, qFileName, (trimFASTAFileName + toString(processors-1) + ".temp"), (scrapFASTAFileName + toString(processors-1) + ".temp"), (trimQualFileName + toString(processors-1) + ".temp"), (scrapQualFileName + toString(processors-1) + ".temp"), (trimNameFileName + toString(processors-1) + ".temp"), (scrapNameFileName + toString(processors-1) + ".temp"), (trimCountFileName + toString(processors-1) + ".temp"), (scrapCountFileName + toString(processors-1) + ".temp"), (groupFile + toString(processors-1) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, tempNameFileNames, lines[processors-1], qLines[processors-1]); + processIDS.push_back(processors-1); + + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + for (map::iterator it = pDataArray[i]->groupCounts.begin(); it != pDataArray[i]->groupCounts.end(); it++) { + map::iterator it2 = groupCounts.find(it->first); + if (it2 == groupCounts.end()) { groupCounts[it->first] = it->second; } + else { groupCounts[it->first] += it->second; } + } + for (map::iterator it = pDataArray[i]->groupMap.begin(); it != pDataArray[i]->groupMap.end(); it++) { + map::iterator it2 = groupMap.find(it->first); + if (it2 == groupMap.end()) { groupMap[it->first] = it->second; } + else { m->mothurOut("[ERROR]: " + it->first + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); } + } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + +#endif + + + //append files for(int i=0;imothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); @@ -908,8 +1183,15 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName); m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp")); } + + if(countfile != ""){ + m->appendFiles((trimCountFileName + toString(processIDS[i]) + ".temp"), trimCountFileName); + m->mothurRemove((trimCountFileName + toString(processIDS[i]) + ".temp")); + m->appendFiles((scrapCountFileName + toString(processIDS[i]) + ".temp"), scrapCountFileName); + m->mothurRemove((scrapCountFileName + toString(processIDS[i]) + ".temp")); + } - if(createGroup){ + if((createGroup)&&(countfile == "")){ m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp")); } @@ -936,6 +1218,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName } } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(createGroup){ ifstream in; string tempFile = filename + toString(processIDS[i]) + ".num.temp"; @@ -946,21 +1229,33 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName in >> tempNum; m->gobble(in); if (tempNum != 0) { - while (!in.eof()) { - in >> group >> tempNum; m->gobble(in); - + for (int i = 0; i < tempNum; i++) { + int groupNum; + in >> group >> groupNum; m->gobble(in); + map::iterator it = groupCounts.find(group); - if (it == groupCounts.end()) { groupCounts[group] = tempNum; } - else { groupCounts[it->first] += tempNum; } + if (it == groupCounts.end()) { groupCounts[group] = groupNum; } + else { groupCounts[it->first] += groupNum; } } } + in >> tempNum; m->gobble(in); + if (tempNum != 0) { + for (int i = 0; i < tempNum; i++) { + string group, seqName; + in >> seqName >> group; m->gobble(in); + + map::iterator it = groupMap.find(seqName); + if (it == groupMap.end()) { groupMap[seqName] = group; } + else { m->mothurOut("[ERROR]: " + seqName + " is in your fasta file more than once. Sequence names must be unique. please correct.\n"); } + } + } + in.close(); m->mothurRemove(tempFile); } - + #endif } - - return exitCommand; -#endif + + return exitCommand; } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "createProcessesCreateTrim"); @@ -970,14 +1265,16 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName /**************************************************************************************************/ -int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { +int TrimSeqsCommand::setLines(string filename, string qfilename) { try { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + + vector fastaFilePos; + vector qfileFilePos; + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //set file positions for fasta file fastaFilePos = m->divideFile(filename, processors); - if (qfilename == "") { return processors; } - //get name of first sequence in each chunk map firstSeqNames; for (int i = 0; i < (fastaFilePos.size()-1); i++) { @@ -990,66 +1287,102 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vectoropenInputFile(qfilename, inQual); - string input; - while(!inQual.eof()){ - input = m->getline(inQual); - - if (input.length() != 0) { - if(input[0] == '>'){ //this is a sequence name line - istringstream nameStream(input); - - string sname = ""; nameStream >> sname; - sname = sname.substr(1); - - map::iterator it = firstSeqNames.find(sname); - - if(it != firstSeqNames.end()) { //this is the start of a new chunk - unsigned long long pos = inQual.tellg(); - qfileFilePos.push_back(pos - input.length() - 1); - firstSeqNames.erase(it); - } - } - } - - if (firstSeqNames.size() == 0) { break; } - } - inQual.close(); - - - if (firstSeqNames.size() != 0) { - for (map::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { - m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine(); - } - qFileName = ""; - return processors; - } - - //get last file position of qfile - FILE * pFile; - unsigned long long size; - - //get num bytes in file - pFile = fopen (qfilename.c_str(),"rb"); - if (pFile==NULL) perror ("Error opening file"); - else{ - fseek (pFile, 0, SEEK_END); - size=ftell (pFile); - fclose (pFile); - } - - qfileFilePos.push_back(size); + if(qfilename != "") { + //seach for filePos of each first name in the qfile and save in qfileFilePos + ifstream inQual; + m->openInputFile(qfilename, inQual); + + string input; + while(!inQual.eof()){ + input = m->getline(inQual); + + if (input.length() != 0) { + if(input[0] == '>'){ //this is a sequence name line + istringstream nameStream(input); + + string sname = ""; nameStream >> sname; + sname = sname.substr(1); + + map::iterator it = firstSeqNames.find(sname); + + if(it != firstSeqNames.end()) { //this is the start of a new chunk + unsigned long long pos = inQual.tellg(); + qfileFilePos.push_back(pos - input.length() - 1); + firstSeqNames.erase(it); + } + } + } + + if (firstSeqNames.size() == 0) { break; } + } + inQual.close(); + + + if (firstSeqNames.size() != 0) { + for (map::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { + m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine(); + } + qFileName = ""; + return processors; + } + + //get last file position of qfile + FILE * pFile; + unsigned long long size; + + //get num bytes in file + pFile = fopen (qfilename.c_str(),"rb"); + if (pFile==NULL) perror ("Error opening file"); + else{ + fseek (pFile, 0, SEEK_END); + size=ftell (pFile); + fclose (pFile); + } + + qfileFilePos.push_back(size); + } + + for (int i = 0; i < (fastaFilePos.size()-1); i++) { + if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) +'\t' + toString(fastaFilePos[i]) + '\t' + toString(fastaFilePos[i+1]) + '\n'); } + lines.push_back(linePair(fastaFilePos[i], fastaFilePos[(i+1)])); + if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[i], qfileFilePos[(i+1)])); } + } + if(qfilename == "") { qLines = lines; } //files with duds return processors; #else - - fastaFilePos.push_back(0); qfileFilePos.push_back(0); - fastaFilePos.push_back(1000); qfileFilePos.push_back(1000); + + if (processors == 1) { //save time + //fastaFilePos.push_back(0); qfileFilePos.push_back(0); + //fastaFilePos.push_back(1000); qfileFilePos.push_back(1000); + lines.push_back(linePair(0, 1000)); + if (qfilename != "") { qLines.push_back(linePair(0, 1000)); } + }else{ + int numFastaSeqs = 0; + fastaFilePos = m->setFilePosFasta(filename, numFastaSeqs); + if (fastaFilePos.size() < processors) { processors = fastaFilePos.size(); } + + if (qfilename != "") { + int numQualSeqs = 0; + qfileFilePos = m->setFilePosFasta(qfilename, numQualSeqs); + + if (numFastaSeqs != numQualSeqs) { + m->mothurOut("[ERROR]: You have " + toString(numFastaSeqs) + " sequences in your fasta file, but " + toString(numQualSeqs) + " sequences in your quality file."); m->mothurOutEndLine(); m->control_pressed = true; + } + } + + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor)); + if (qfilename != "") { qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); } + } + } + if(qfilename == "") { qLines = lines; } //files with duds return 1; #endif @@ -1077,7 +1410,9 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< while(!inOligos.eof()){ inOligos >> type; - + + if (m->debug) { m->mothurOut("[DEBUG]: reading type - " + type + ".\n"); } + if(type[0] == '#'){ while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there m->gobble(inOligos); @@ -1088,6 +1423,8 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< for(int i=0;i> oligo; + + if (m->debug) { m->mothurOut("[DEBUG]: reading - " + oligo + ".\n"); } for(int i=0;i >& fastaFileNames, vector< map::iterator itPrime = primers.find(oligo); if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + if (m->debug) { if (group != "") { m->mothurOut("[DEBUG]: reading group " + group + ".\n"); }else{ m->mothurOut("[DEBUG]: no group for primer " + oligo + ".\n"); } } + primers[oligo]=indexPrimer; indexPrimer++; primerNameVector.push_back(group); } else if(type == "REVERSE"){ - Sequence oligoRC("reverse", oligo); - oligoRC.reverseComplement(); - revPrimer.push_back(oligoRC.getUnaligned()); + //Sequence oligoRC("reverse", oligo); + //oligoRC.reverseComplement(); + string oligoRC = reverseOligo(oligo); + revPrimer.push_back(oligoRC); } else if(type == "BARCODE"){ inOligos >> group; @@ -1123,7 +1463,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< //check for repeat barcodes map::iterator itBar = barcodes.find(oligo); if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } - + barcodes[oligo]=indexBarcode; indexBarcode++; barcodeNameVector.push_back(group); }else if(type == "LINKER"){ @@ -1131,7 +1471,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< }else if(type == "SPACER"){ spacer.push_back(oligo); } - else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } + else{ m->mothurOut("[WARNING]: " + type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } m->gobble(inOligos); } @@ -1169,6 +1509,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< string fastaFileName = ""; string qualFileName = ""; string nameFileName = ""; + string countFileName = ""; if(primerName == ""){ comboGroupName = barcodeNameVector[itBar->second]; @@ -1184,7 +1525,10 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< ofstream temp; - fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta"; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFile)); + variables["[tag]"] = comboGroupName; + fastaFileName = getOutputFileName("fasta", variables); if (uniqueNames.count(fastaFileName) == 0) { outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName); @@ -1195,7 +1539,8 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< m->openOutputFile(fastaFileName, temp); temp.close(); if(qFileName != ""){ - qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual"; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(qFileName)); + qualFileName = getOutputFileName("qfile", variables); if (uniqueNames.count(qualFileName) == 0) { outputNames.push_back(qualFileName); outputTypes["qfile"].push_back(qualFileName); @@ -1206,7 +1551,8 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< } if(nameFile != ""){ - nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names"; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(nameFile)); + nameFileName = getOutputFileName("name", variables); if (uniqueNames.count(nameFileName) == 0) { outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName); @@ -1215,7 +1561,6 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< nameFileNames[itBar->second][itPrimer->second] = nameFileName; m->openOutputFile(nameFileName, temp); temp.close(); } - } } } @@ -1237,7 +1582,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< break; } } - + if (allBlank) { m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine(); allFiles = false; @@ -1338,6 +1683,46 @@ bool TrimSeqsCommand::cullHomoP(Sequence& seq){ } } +//********************************************************************/ +string TrimSeqsCommand::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "reverseOligo"); + exit(1); + } +} //***************************************************************************************************************