X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=213e241ccfc6ee8feeb6c34ed9b589343c7c71d1;hb=0ca63a8165baa0afa459e644ebe140ba496d5ba0;hp=d8a0d1d00aaf58768d887134ffb29aa3d4e5275c;hpb=4160a27002f4a3ca436fcd62eca0366b09f8e901;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index d8a0d1d..213e241 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -9,6 +9,7 @@ #include "trimseqscommand.h" #include "needlemanoverlap.hpp" +#include "trimoligos.h" //********************************************************************************************************************** vector TrimSeqsCommand::setParameters(){ @@ -16,6 +17,7 @@ vector TrimSeqsCommand::setParameters(){ CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(poligos); CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pqfile); + CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); CommandParameter pflip("flip", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pflip); CommandParameter pmaxambig("maxambig", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pmaxambig); CommandParameter pmaxhomop("maxhomop", "Number", "", "0", "", "", "",false,false); parameters.push_back(pmaxhomop); @@ -53,10 +55,11 @@ string TrimSeqsCommand::getHelpString(){ string helpString = ""; helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n"; helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n"; - helpString += "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; + helpString += "The trim.seqs command parameters are fasta, name, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"; helpString += "The oligos parameter allows you to provide an oligos file.\n"; + helpString += "The name parameter allows you to provide a names file with your fasta file.\n"; helpString += "The maxambig parameter allows you to set the maximum number of ambigious bases allowed. The default is -1.\n"; helpString += "The maxhomop parameter allows you to set a maximum homopolymer length. \n"; helpString += "The minlength parameter allows you to set and minimum sequence length. \n"; @@ -100,6 +103,7 @@ TrimSeqsCommand::TrimSeqsCommand(){ outputTypes["fasta"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; + outputTypes["name"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); @@ -116,6 +120,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} else { vector myArray = setParameters(); @@ -136,6 +141,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { outputTypes["fasta"] = tempOutNames; outputTypes["qfile"] = tempOutNames; outputTypes["group"] = tempOutNames; + outputTypes["name"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); @@ -166,6 +172,14 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { if (path == "") { parameters["qfile"] = inputDir + it->second; } } + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + } @@ -176,6 +190,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { if (fastaFile != "") { m->mothurOut("Using " + fastaFile + " as input file for the fasta parameter."); m->mothurOutEndLine(); } else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; } }else if (fastaFile == "not open") { abort = true; } + else { m->setFastaFile(fastaFile); } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ @@ -188,45 +203,50 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { // ...at some point should added some additional type checking... string temp; temp = validParameter.validFile(parameters, "flip", false); - if (temp == "not found"){ flip = 0; } - else if(m->isTrue(temp)) { flip = 1; } + if (temp == "not found") { flip = 0; } + else { flip = m->isTrue(temp); } temp = validParameter.validFile(parameters, "oligos", true); if (temp == "not found"){ oligoFile = ""; } else if(temp == "not open"){ abort = true; } - else { oligoFile = temp; } + else { oligoFile = temp; m->setOligosFile(oligoFile); } temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; } - convert(temp, maxAmbig); + m->mothurConvert(temp, maxAmbig); temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "0"; } - convert(temp, maxHomoP); + m->mothurConvert(temp, maxHomoP); temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "0"; } - convert(temp, minLength); + m->mothurConvert(temp, minLength); temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } - convert(temp, maxLength); + m->mothurConvert(temp, maxLength); temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } - convert(temp, bdiffs); + m->mothurConvert(temp, bdiffs); temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } - convert(temp, pdiffs); + m->mothurConvert(temp, pdiffs); temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } - convert(temp, tdiffs); + m->mothurConvert(temp, tdiffs); if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } temp = validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } else if(temp == "not open") { abort = true; } - else { qFileName = temp; } + else { qFileName = temp; m->setQualFile(qFileName); } + + temp = validParameter.validFile(parameters, "name", true); + if (temp == "not found") { nameFile = ""; } + else if(temp == "not open") { nameFile = ""; abort = true; } + else { nameFile = temp; m->setNameFile(nameFile); } temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } - convert(temp, qThreshold); + m->mothurConvert(temp, qThreshold); temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "t"; } qtrim = m->isTrue(temp); @@ -257,7 +277,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - convert(temp, processors); + m->mothurConvert(temp, processors); if(allFiles && (oligoFile == "")){ @@ -289,8 +309,10 @@ int TrimSeqsCommand::execute(){ numFPrimers = 0; //this needs to be initialized numRPrimers = 0; + createGroup = false; vector > fastaFileNames; vector > qualFileNames; + vector > nameFileNames; string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta"; outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); @@ -300,6 +322,7 @@ int TrimSeqsCommand::execute(){ string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual"; string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual"; + if (qFileName != "") { outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile); @@ -307,15 +330,30 @@ int TrimSeqsCommand::execute(){ outputTypes["qfile"].push_back(scrapQualFile); } + string trimNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "trim.names"; + string scrapNameFile = outputDir + m->getRootName(m->getSimpleName(nameFile)) + "scrap.names"; + + if (nameFile != "") { + m->readNames(nameFile, nameMap); + outputNames.push_back(trimNameFile); + outputNames.push_back(scrapNameFile); + outputTypes["name"].push_back(trimNameFile); + outputTypes["name"].push_back(scrapNameFile); + } + + if (m->control_pressed) { return 0; } + string outputGroupFileName; if(oligoFile != ""){ - outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; - outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); - getOligos(fastaFileNames, qualFileNames); + createGroup = getOligos(fastaFileNames, qualFileNames, nameFileNames); + if (createGroup) { + outputGroupFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; + outputNames.push_back(outputGroupFileName); outputTypes["group"].push_back(outputGroupFileName); + } } - - vector fastaFilePos; - vector qFilePos; + + vector fastaFilePos; + vector qFilePos; setLines(fastaFile, qFileName, fastaFilePos, qFilePos); @@ -327,16 +365,16 @@ int TrimSeqsCommand::execute(){ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]); + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); }else{ - createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames); + createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames); } #else - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, outputGroupFileName, fastaFileNames, qualFileNames, lines[0], qLines[0]); + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, trimNameFile, scrapNameFile, outputGroupFileName, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); #endif if (m->control_pressed) { return 0; } - + if(allFiles){ map uniqueFastaNames;// so we don't add the same groupfile multiple times map::iterator it; @@ -344,19 +382,26 @@ int TrimSeqsCommand::execute(){ for(int i=0;iisBlank(fastaFileNames[i][j])){ - remove(fastaFileNames[i][j].c_str()); - namesToRemove.insert(fastaFileNames[i][j]); + if (namesToRemove.count(fastaFileNames[i][j]) == 0) { + if(m->isBlank(fastaFileNames[i][j])){ + m->mothurRemove(fastaFileNames[i][j]); + namesToRemove.insert(fastaFileNames[i][j]); - if(qFileName != ""){ - remove(qualFileNames[i][j].c_str()); - namesToRemove.insert(qualFileNames[i][j]); + if(qFileName != ""){ + m->mothurRemove(qualFileNames[i][j]); + namesToRemove.insert(qualFileNames[i][j]); + } + + if(nameFile != ""){ + m->mothurRemove(nameFileNames[i][j]); + namesToRemove.insert(nameFileNames[i][j]); + } + }else{ + it = uniqueFastaNames.find(fastaFileNames[i][j]); + if (it == uniqueFastaNames.end()) { + uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i]; + } } - }else{ - it = uniqueFastaNames.find(fastaFileNames[i][j]); - if (it == uniqueFastaNames.end()) { - uniqueFastaNames[fastaFileNames[i][j]] = barcodeNameVector[i]; - } } } } @@ -387,17 +432,18 @@ int TrimSeqsCommand::execute(){ } } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //output group counts m->mothurOutEndLine(); int total = 0; + if (groupCounts.size() != 0) { m->mothurOut("Group count: \n"); } for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { - total += it->second; m->mothurOut("Group " + it->first + " contains " + toString(it->second) + " sequences."); m->mothurOutEndLine(); + total += it->second; m->mothurOut(it->first + "\t" + toString(it->second)); m->mothurOutEndLine(); } if (total != 0) { m->mothurOut("Total of all groups is " + toString(total)); m->mothurOutEndLine(); } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } //set fasta file as new current fastafile string current = ""; @@ -406,6 +452,11 @@ int TrimSeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); } } + itTypes = outputTypes.find("name"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); } + } + itTypes = outputTypes.find("qfile"); if (itTypes != outputTypes.end()) { if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); } @@ -432,7 +483,7 @@ int TrimSeqsCommand::execute(){ /**************************************************************************************/ -int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, linePair* line, linePair* qline) { +int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string trimFileName, string scrapFileName, string trimQFileName, string scrapQFileName, string trimNFileName, string scrapNFileName, string groupFileName, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames, linePair* line, linePair* qline) { try { @@ -449,8 +500,16 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string m->openOutputFile(scrapQFileName, scrapQualFile); } + ofstream trimNameFile; + ofstream scrapNameFile; + if(nameFile != ""){ + m->openOutputFile(trimNFileName, trimNameFile); + m->openOutputFile(scrapNFileName, scrapNameFile); + } + + ofstream outGroupsFile; - if (oligoFile != ""){ m->openOutputFile(groupFileName, outGroupsFile); } + if (createGroup){ m->openOutputFile(groupFileName, outGroupsFile); } if(allFiles){ for (int i = 0; i < fastaFileNames.size(); i++) { //clears old file for (int j = 0; j < fastaFileNames[i].size(); j++) { //clears old file @@ -460,6 +519,10 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(qFileName != ""){ m->openOutputFile(qualFileNames[i][j], temp); temp.close(); } + + if(nameFile != ""){ + m->openOutputFile(nameFileNames[i][j], temp); temp.close(); + } } } } @@ -477,17 +540,18 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int count = 0; bool moreSeqs = 1; + TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer); while (moreSeqs) { if (m->control_pressed) { inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (oligoFile != "") { outGroupsFile.close(); } + if (createGroup) { outGroupsFile.close(); } if(qFileName != ""){ qFile.close(); } - for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } @@ -497,12 +561,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int currentSeqsDiffs = 0; Sequence currSeq(inFASTA); m->gobble(inFASTA); - + //cout << currSeq.getName() << '\t' << currSeq.getUnaligned().length() << endl; QualityScores currQual; if(qFileName != ""){ currQual = QualityScores(qFile); m->gobble(qFile); } - + string origSeq = currSeq.getUnaligned(); if (origSeq != "") { @@ -510,13 +574,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int primerIndex = 0; if(barcodes.size() != 0){ - success = stripBarcode(currSeq, currQual, barcodeIndex); + success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex); if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } if(numFPrimers != 0){ - success = stripForward(currSeq, currQual, primerIndex); + success = trimOligos.stripForward(currSeq, currQual, primerIndex); if(success > pdiffs) { trashCode += 'f'; } else{ currentSeqsDiffs += success; } } @@ -524,7 +588,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } if(numRPrimers != 0){ - success = stripReverse(currSeq, currQual); + success = trimOligos.stripReverse(currSeq, currQual); if(!success) { trashCode += 'r'; } } @@ -546,7 +610,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); } else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); } else { success = 1; } - + //you don't want to trim, if it fails above then scrap it if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; } @@ -581,18 +645,44 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string currQual.printQScores(trimQualFile); } - if(barcodes.size() != 0){ - string thisGroup = barcodeNameVector[barcodeIndex]; - if (primers.size() != 0) { if (primerNameVector[primerIndex] != "") { thisGroup += "." + primerNameVector[primerIndex]; } } - - outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; - - map::iterator it = groupCounts.find(thisGroup); - if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; } - else { groupCounts[it->first]++; } - + if(nameFile != ""){ + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { trimNameFile << itName->first << '\t' << itName->second << endl; } + else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } } + if (createGroup) { + if(barcodes.size() != 0){ + string thisGroup = barcodeNameVector[barcodeIndex]; + if (primers.size() != 0) { + if (primerNameVector[primerIndex] != "") { + if(thisGroup != "") { + thisGroup += "." + primerNameVector[primerIndex]; + }else { + thisGroup = primerNameVector[primerIndex]; + } + } + } + + outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; + + if (nameFile != "") { + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + vector thisSeqsNames; + m->splitAtChar(itName->second, thisSeqsNames, ','); + for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self + outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl; + } + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } + + map::iterator it = groupCounts.find(thisGroup); + if (it == groupCounts.end()) { groupCounts[thisGroup] = 1; } + else { groupCounts[it->first]++; } + + } + } if(allFiles){ ofstream output; @@ -605,9 +695,23 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string currQual.printQScores(output); output.close(); } + + if(nameFile != ""){ + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { + m->openOutputFileAppend(nameFileNames[barcodeIndex][primerIndex], output); + output << itName->first << '\t' << itName->second << endl; + output.close(); + }else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } } } else{ + if(nameFile != ""){ //needs to be before the currSeq name is changed + map::iterator itName = nameMap.find(currSeq.getName()); + if (itName != nameMap.end()) { scrapNameFile << itName->first << '\t' << itName->second << endl; } + else { m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); m->mothurOutEndLine(); } + } currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); currSeq.setAligned(origSeq); @@ -620,25 +724,27 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= line->end)) { break; } + #else if (inFASTA.eof()) { break; } #endif - + //report progress if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } } //report progress if((count) % 1000 != 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } - + inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close(); - if (oligoFile != "") { outGroupsFile.close(); } + if (createGroup) { outGroupsFile.close(); } if(qFileName != "") { qFile.close(); scrapQualFile.close(); trimQualFile.close(); } + if(nameFile != "") { scrapNameFile.close(); trimNameFile.close(); } return count; } @@ -650,7 +756,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string /**************************************************************************************************/ -int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames) { +int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFASTAFileName, string scrapFASTAFileName, string trimQualFileName, string scrapQualFileName, string trimNameFileName, string scrapNameFileName, string groupFile, vector > fastaFileNames, vector > qualFileNames, vector > nameFileNames) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 1; @@ -668,6 +774,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName vector > tempFASTAFileNames = fastaFileNames; vector > tempPrimerQualFileNames = qualFileNames; + vector > tempNameFileNames = nameFileNames; if(allFiles){ ofstream temp; @@ -682,6 +789,10 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName tempPrimerQualFileNames[i][j] += toString(getpid()) + ".temp"; m->openOutputFile(tempPrimerQualFileNames[i][j], temp); temp.close(); } + if(nameFile != ""){ + tempNameFileNames[i][j] += toString(getpid()) + ".temp"; + m->openOutputFile(tempNameFileNames[i][j], temp); temp.close(); + } } } } @@ -693,21 +804,28 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName (scrapFASTAFileName + toString(getpid()) + ".temp"), (trimQualFileName + toString(getpid()) + ".temp"), (scrapQualFileName + toString(getpid()) + ".temp"), + (trimNameFileName + toString(getpid()) + ".temp"), + (scrapNameFileName + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), tempFASTAFileNames, tempPrimerQualFileNames, + tempNameFileNames, lines[process], qLines[process]); //pass groupCounts to parent - ofstream out; - string tempFile = filename + toString(getpid()) + ".num.temp"; - m->openOutputFile(tempFile, out); - for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { - out << it->first << '\t' << it->second << endl; + if(createGroup){ + ofstream out; + string tempFile = filename + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + + out << groupCounts.size() << endl; + + for (map::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) { + out << it->first << '\t' << it->second << endl; + } + out.close(); } - out.close(); - exit(0); }else { m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); @@ -720,10 +838,16 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName ofstream temp; m->openOutputFile(trimFASTAFileName, temp); temp.close(); m->openOutputFile(scrapFASTAFileName, temp); temp.close(); - m->openOutputFile(trimQualFileName, temp); temp.close(); - m->openOutputFile(scrapQualFileName, temp); temp.close(); + if(qFileName != ""){ + m->openOutputFile(trimQualFileName, temp); temp.close(); + m->openOutputFile(scrapQualFileName, temp); temp.close(); + } + if (nameFile != "") { + m->openOutputFile(trimNameFileName, temp); temp.close(); + m->openOutputFile(scrapNameFileName, temp); temp.close(); + } - driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]); + driverCreateTrim(filename, qFileName, trimFASTAFileName, scrapFASTAFileName, trimQualFileName, scrapQualFileName, trimNameFileName, scrapNameFileName, groupFile, fastaFileNames, qualFileNames, nameFileNames, lines[0], qLines[0]); //force parent to wait until all the processes are done for (int i=0;imothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); m->appendFiles((trimFASTAFileName + toString(processIDS[i]) + ".temp"), trimFASTAFileName); - remove((trimFASTAFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((trimFASTAFileName + toString(processIDS[i]) + ".temp")); m->appendFiles((scrapFASTAFileName + toString(processIDS[i]) + ".temp"), scrapFASTAFileName); - remove((scrapFASTAFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((scrapFASTAFileName + toString(processIDS[i]) + ".temp")); if(qFileName != ""){ m->appendFiles((trimQualFileName + toString(processIDS[i]) + ".temp"), trimQualFileName); - remove((trimQualFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((trimQualFileName + toString(processIDS[i]) + ".temp")); m->appendFiles((scrapQualFileName + toString(processIDS[i]) + ".temp"), scrapQualFileName); - remove((scrapQualFileName + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((scrapQualFileName + toString(processIDS[i]) + ".temp")); + } + + if(nameFile != ""){ + m->appendFiles((trimNameFileName + toString(processIDS[i]) + ".temp"), trimNameFileName); + m->mothurRemove((trimNameFileName + toString(processIDS[i]) + ".temp")); + m->appendFiles((scrapNameFileName + toString(processIDS[i]) + ".temp"), scrapNameFileName); + m->mothurRemove((scrapNameFileName + toString(processIDS[i]) + ".temp")); } - m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); - remove((groupFile + toString(processIDS[i]) + ".temp").c_str()); + if(createGroup){ + m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); + m->mothurRemove((groupFile + toString(processIDS[i]) + ".temp")); + } if(allFiles){ @@ -757,30 +890,42 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName for(int k=0;kappendFiles((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp"), fastaFileNames[j][k]); - remove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((fastaFileNames[j][k] + toString(processIDS[i]) + ".temp")); if(qFileName != ""){ m->appendFiles((qualFileNames[j][k] + toString(processIDS[i]) + ".temp"), qualFileNames[j][k]); - remove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp").c_str()); + m->mothurRemove((qualFileNames[j][k] + toString(processIDS[i]) + ".temp")); + } + + if(nameFile != ""){ + m->appendFiles((nameFileNames[j][k] + toString(processIDS[i]) + ".temp"), nameFileNames[j][k]); + m->mothurRemove((nameFileNames[j][k] + toString(processIDS[i]) + ".temp")); } } } } } - ifstream in; - string tempFile = filename + toString(processIDS[i]) + ".num.temp"; - m->openInputFile(tempFile, in); - int tempNum; - string group; - while (!in.eof()) { - in >> group >> tempNum; m->gobble(in); + if(createGroup){ + ifstream in; + string tempFile = filename + toString(processIDS[i]) + ".num.temp"; + m->openInputFile(tempFile, in); + int tempNum; + string group; - map::iterator it = groupCounts.find(group); - if (it == groupCounts.end()) { groupCounts[group] = tempNum; } - else { groupCounts[it->first] += tempNum; } + in >> tempNum; m->gobble(in); + + if (tempNum != 0) { + while (!in.eof()) { + in >> group >> tempNum; m->gobble(in); + + map::iterator it = groupCounts.find(group); + if (it == groupCounts.end()) { groupCounts[group] = tempNum; } + else { groupCounts[it->first] += tempNum; } + } + } + in.close(); m->mothurRemove(tempFile); } - in.close(); remove(tempFile.c_str()); } @@ -795,9 +940,9 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName /**************************************************************************************************/ -int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { +int TrimSeqsCommand::setLines(string filename, string qfilename, vector& fastaFilePos, vector& qfileFilePos) { try { - + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) //set file positions for fasta file fastaFilePos = m->divideFile(filename, processors); @@ -834,7 +979,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector::iterator it = firstSeqNames.find(sname); if(it != firstSeqNames.end()) { //this is the start of a new chunk - unsigned long int pos = inQual.tellg(); + unsigned long long pos = inQual.tellg(); qfileFilePos.push_back(pos - input.length() - 1); firstSeqNames.erase(it); } @@ -856,7 +1001,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vectorerrorOut(e, "TrimSeqsCommand", "setLines"); @@ -879,7 +1032,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector >& fastaFileNames, vector >& qualFileNames){ +bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector >& qualFileNames, vector >& nameFileNames){ try { ifstream inOligos; m->openInputFile(oligoFile, inOligos); @@ -967,7 +1120,8 @@ void TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< for(int i=0;i uniqueNames; //used to cleanup outputFileNames @@ -980,6 +1134,7 @@ void TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< string comboGroupName = ""; string fastaFileName = ""; string qualFileName = ""; + string nameFileName = ""; if(primerName == ""){ comboGroupName = barcodeNameVector[itBar->second]; @@ -992,7 +1147,8 @@ void TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< comboGroupName = barcodeNameVector[itBar->second] + "." + primerNameVector[itPrimer->second]; } } - + + ofstream temp; fastaFileName = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + comboGroupName + ".fasta"; if (uniqueNames.count(fastaFileName) == 0) { @@ -1003,10 +1159,10 @@ void TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< fastaFileNames[itBar->second][itPrimer->second] = fastaFileName; m->openOutputFile(fastaFileName, temp); temp.close(); - + if(qFileName != ""){ qualFileName = outputDir + m->getRootName(m->getSimpleName(qFileName)) + comboGroupName + ".qual"; - if (uniqueNames.count(fastaFileName) == 0) { + if (uniqueNames.count(qualFileName) == 0) { outputNames.push_back(qualFileName); outputTypes["qfile"].push_back(qualFileName); } @@ -1014,291 +1170,52 @@ void TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< qualFileNames[itBar->second][itPrimer->second] = qualFileName; m->openOutputFile(qualFileName, temp); temp.close(); } + + if(nameFile != ""){ + nameFileName = outputDir + m->getRootName(m->getSimpleName(nameFile)) + comboGroupName + ".names"; + if (uniqueNames.count(nameFileName) == 0) { + outputNames.push_back(nameFileName); + outputTypes["name"].push_back(nameFileName); + } + + nameFileNames[itBar->second][itPrimer->second] = nameFileName; + m->openOutputFile(nameFileName, temp); temp.close(); + } + } } } numFPrimers = primers.size(); numRPrimers = revPrimer.size(); - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "getOligos"); - exit(1); - } -} - -//*************************************************************************************************************** - -int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ - try { - - string rawSequence = seq.getUnaligned(); - int success = bdiffs + 1; //guilty until proven innocent - //can you find the barcode - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - - if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); - } - - success = 0; + bool allBlank = true; + for (int i = 0; i < barcodeNameVector.size(); i++) { + if (barcodeNameVector[i] != "") { + allBlank = false; break; } } - - //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } - - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (barcodes.size() > 0) { - map::iterator it=barcodes.begin(); - - for(it;it!=barcodes.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; -// int length = oligo.length(); - - if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 10; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - - } - - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "stripBarcode"); - exit(1); - } - -} - -//*************************************************************************************************************** - -int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){ - try { - string rawSequence = seq.getUnaligned(); - int success = pdiffs + 1; //guilty until proven innocent - - //can you find the primer - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length - success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; - } - - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); - } - success = 0; + for (int i = 0; i < primerNameVector.size(); i++) { + if (primerNameVector[i] != "") { + allBlank = false; break; } } - - //if you found the barcode or if you don't want to allow for diffs - if ((pdiffs == 0) || (success == 0)) { return success; } - else { //try aligning and see if you can find it - - int maxLength = 0; - - Alignment* alignment; - if (primers.size() > 0) { - map::iterator it=primers.begin(); - - for(it;it!=primers.end();it++){ - if(it->first.length() > maxLength){ - maxLength = it->first.length(); - } - } - alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); - - }else{ alignment = NULL; } - - //can you find the barcode - int minDiff = 1e6; - int minCount = 1; - int minGroup = -1; - int minPos = 0; - - for(map::iterator it=primers.begin();it!=primers.end();it++){ - string oligo = it->first; -// int length = oligo.length(); - - if(rawSequence.length() < maxLength){ - success = pdiffs + 100; - break; - } - - //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); - oligo = alignment->getSeqAAln(); - string temp = alignment->getSeqBAln(); - - int alnLength = oligo.length(); - - for(int i=oligo.length()-1;i>=0;i--){ - if(oligo[i] != '-'){ alnLength = i+1; break; } - } - oligo = oligo.substr(0,alnLength); - temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); - - if(numDiff < minDiff){ - minDiff = numDiff; - minCount = 1; - minGroup = it->second; - minPos = 0; - for(int i=0;i pdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers - else{ //use the best match - group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); - if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); - } - success = minDiff; - } - - if (alignment != NULL) { delete alignment; } - + if (allBlank) { + m->mothurOut("[WARNING]: your oligos file does not contain any group names. mothur will not create a groupfile."); m->mothurOutEndLine(); + allFiles = false; + return false; } - return success; - - } - catch(exception& e) { - m->errorOut(e, "TrimSeqsCommand", "stripForward"); - exit(1); - } -} - -//*************************************************************************************************************** - -bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){ - try { - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "stripReverse"); + m->errorOut(e, "TrimSeqsCommand", "getOligos"); exit(1); } } - //*************************************************************************************************************** bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){ @@ -1404,80 +1321,4 @@ bool TrimSeqsCommand::cullAmbigs(Sequence& seq){ } } - -//*************************************************************************************************************** - -bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ - try { - bool success = 1; - int length = oligo.length(); - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "compareDNASeq"); - exit(1); - } - -} - -//*************************************************************************************************************** - -int TrimSeqsCommand::countDiffs(string oligo, string seq){ - try { - - int length = oligo.length(); - int countDiffs = 0; - - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "countDiffs"); - exit(1); - } - -} - //***************************************************************************************************************