X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=b40b7e5b0123d848a26df5afd803f3625df9a4a8;hb=d635b39347cd81943ea50de7b813a0a5d743b0c0;hp=4b2ee4eee99d3d38c2e5cd4f77ad2c0b90af4b81;hpb=6e81846c8e5b2614f6b06643a9f558fb0e6669fa;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 4b2ee4e..b40b7e5 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -10,6 +10,67 @@ #include "trimseqscommand.h" #include "needlemanoverlap.hpp" +//********************************************************************************************************************** + +vector TrimSeqsCommand::getValidParameters(){ + try { + string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", + "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", + "keepfirst", "removelast", + "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getValidParameters"); + exit(1); + } +} + +//********************************************************************************************************************** + +TrimSeqsCommand::TrimSeqsCommand(){ + try { + abort = true; + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["qual"] = tempOutNames; + outputTypes["group"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); + exit(1); + } +} + +//********************************************************************************************************************** + +vector TrimSeqsCommand::getRequiredParameters(){ + try { + string Array[] = {"fasta"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getRequiredParameters"); + exit(1); + } +} + +//********************************************************************************************************************** + +vector TrimSeqsCommand::getRequiredFiles(){ + try { + vector myArray; + return myArray; + } + catch(exception& e) { + m->errorOut(e, "TrimSeqsCommand", "getRequiredFiles"); + exit(1); + } +} + //*************************************************************************************************************** TrimSeqsCommand::TrimSeqsCommand(string option) { @@ -23,8 +84,10 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { else { //valid paramters for this command - string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", - "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; + string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", + "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", + "keepfirst", "removelast", + "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -39,6 +102,12 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } + //initialize outputTypes + vector tempOutNames; + outputTypes["fasta"] = tempOutNames; + outputTypes["qual"] = tempOutNames; + outputTypes["group"] = tempOutNames; + //if the user changes the input directory command factory will send this info to us in the output parameter string inputDir = validParameter.validFile(parameters, "inputdir", false); if (inputDir == "not found"){ inputDir = ""; } @@ -47,7 +116,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { it = parameters.find("fasta"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["fasta"] = inputDir + it->second; } } @@ -55,7 +124,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { it = parameters.find("oligos"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["oligos"] = inputDir + it->second; } } @@ -63,22 +132,30 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { it = parameters.find("qfile"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["qfile"] = inputDir + it->second; } } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } } //check for required parameters fastaFile = validParameter.validFile(parameters, "fasta", true); - if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; } + if (fastaFile == "not found") { m->mothurOut("fasta is a required parameter for the trim.seqs command."); m->mothurOutEndLine(); abort = true; } else if (fastaFile == "not open") { abort = true; } //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; - outputDir += hasPath(fastaFile); //if user entered a file with a path then preserve it + outputDir += m->hasPath(fastaFile); //if user entered a file with a path then preserve it } @@ -87,13 +164,18 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { string temp; temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ flip = 0; } - else if(isTrue(temp)) { flip = 1; } + else if(m->isTrue(temp)) { flip = 1; } temp = validParameter.validFile(parameters, "oligos", true); if (temp == "not found"){ oligoFile = ""; } else if(temp == "not open"){ abort = true; } else { oligoFile = temp; } + temp = validParameter.validFile(parameters, "group", true); + if (temp == "not found"){ groupfile = ""; } + else if(temp == "not open"){ abort = true; } + else { groupfile = temp; } + temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; } convert(temp, maxAmbig); @@ -126,7 +208,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { convert(temp, qThreshold); temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "F"; } - qtrim = isTrue(temp); + qtrim = m->isTrue(temp); temp = validParameter.validFile(parameters, "rollaverage", false); if (temp == "not found") { temp = "0"; } convert(temp, qRollAverage); @@ -134,23 +216,34 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; } convert(temp, qWindowAverage); - temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "100"; } + temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; } convert(temp, qWindowSize); - temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "10"; } + temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; } convert(temp, qWindowStep); temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; } convert(temp, qAverage); + + temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; } + convert(temp, keepFirst); + + temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; } + convert(temp, removeLast); temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } - allFiles = isTrue(temp); + allFiles = m->isTrue(temp); temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found") { temp = "1"; } convert(temp, processors); - if(allFiles && oligoFile == ""){ - m->mothurOut("You selected allfiles, but didn't enter an oligos file. Ignoring the allfiles request."); m->mothurOutEndLine(); + if ((oligoFile != "") && (groupfile != "")) { + m->mothurOut("You given both a oligos file and a groupfile, only one is allowed."); m->mothurOutEndLine(); abort = true; + } + + + if(allFiles && (oligoFile == "") && (groupfile == "")){ + m->mothurOut("You selected allfiles, but didn't enter an oligos or group file. Ignoring the allfiles request."); m->mothurOutEndLine(); } if((qAverage != 0 && qThreshold != 0) && qFileName == ""){ m->mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); m->mothurOutEndLine(); @@ -169,13 +262,15 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { exit(1); } } + //********************************************************************************************************************** void TrimSeqsCommand::help(){ try { m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); - m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n"); + m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, group, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n"); m->mothurOut("The fasta parameter is required.\n"); + m->mothurOut("The group parameter allows you to enter a group file for your fasta file.\n"); m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"); m->mothurOut("The oligos parameter .... The default is "".\n"); m->mothurOut("The maxambig parameter .... The default is -1.\n"); @@ -218,21 +313,47 @@ int TrimSeqsCommand::execute(){ numFPrimers = 0; //this needs to be initialized numRPrimers = 0; - - string trimSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.fasta"; - outputNames.push_back(trimSeqFile); - string scrapSeqFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.fasta"; - outputNames.push_back(scrapSeqFile); - string trimQualFile = outputDir + getRootName(getSimpleName(fastaFile)) + "trim.qual"; - outputNames.push_back(trimQualFile); - string scrapQualFile = outputDir + getRootName(getSimpleName(fastaFile)) + "scrap.qual"; - outputNames.push_back(scrapQualFile); - string groupFile = outputDir + getRootName(getSimpleName(fastaFile)) + "groups"; - vector fastaFileNames; vector qualFileNames; + + string trimSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.fasta"; + outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile); + string scrapSeqFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.fasta"; + outputNames.push_back(scrapSeqFile); outputTypes["fasta"].push_back(scrapSeqFile); + string trimQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "trim.qual"; + string scrapQualFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "scrap.qual"; + if (qFileName != "") { outputNames.push_back(trimQualFile); outputNames.push_back(scrapQualFile); outputTypes["qual"].push_back(trimQualFile); outputTypes["qual"].push_back(scrapQualFile); } + string groupFile = ""; + if (groupfile == "") { groupFile = outputDir + m->getRootName(m->getSimpleName(fastaFile)) + "groups"; } + else{ + groupFile = outputDir + m->getRootName(m->getSimpleName(groupfile)) + "trim.groups"; + outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile); + groupMap = new GroupMap(groupfile); + groupMap->readMap(); + + if(allFiles){ + for (int i = 0; i < groupMap->namesOfGroups.size(); i++) { + groupToIndex[groupMap->namesOfGroups[i]] = i; + groupVector.push_back(groupMap->namesOfGroups[i]); + fastaFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + groupMap->namesOfGroups[i] + ".fasta")); + + //we append later, so we want to clear file + ofstream outRemove; + m->openOutputFile(fastaFileNames[i], outRemove); + outRemove.close(); + if(qFileName != ""){ + qualFileNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupMap->namesOfGroups[i] + ".qual")); + ofstream outRemove2; + m->openOutputFile(qualFileNames[i], outRemove2); + outRemove2.close(); + } + } + } + comboStarts = fastaFileNames.size()-1; + } + if(oligoFile != ""){ - outputNames.push_back(groupFile); + outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile); getOligos(fastaFileNames, qualFileNames); } @@ -249,98 +370,32 @@ int TrimSeqsCommand::execute(){ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]); - - for (int j = 0; j < fastaFileNames.size(); j++) { - rename((fastaFileNames[j] + toString(getpid()) + ".temp").c_str(), fastaFileNames[j].c_str()); - } - if(qFileName != ""){ - for (int j = 0; j < qualFileNames.size(); j++) { - rename((qualFileNames[j] + toString(getpid()) + ".temp").c_str(), qualFileNames[j].c_str()); - } - } - }else{ createProcessesCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames); - - rename((trimSeqFile + toString(processIDS[0]) + ".temp").c_str(), trimSeqFile.c_str()); - rename((scrapSeqFile + toString(processIDS[0]) + ".temp").c_str(), scrapSeqFile.c_str()); - rename((groupFile + toString(processIDS[0]) + ".temp").c_str(), groupFile.c_str()); - - if(qFileName != ""){ - rename((trimQualFile + toString(processIDS[0]) + ".temp").c_str(), trimQualFile.c_str()); - rename((scrapQualFile + toString(processIDS[0]) + ".temp").c_str(), scrapQualFile.c_str()); - } - - - for (int j = 0; j < fastaFileNames.size(); j++) { - rename((fastaFileNames[j] + toString(processIDS[0]) + ".temp").c_str(), fastaFileNames[j].c_str()); - } - if(qFileName != ""){ - for (int j = 0; j < qualFileNames.size(); j++) { - rename((qualFileNames[j] + toString(getpid()) + ".temp").c_str(), qualFileNames[j].c_str()); - } - } - - //append files - for(int i=1;icontrol_pressed) { return 0; } + } #else - driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qlines[0]); - - for (int j = 0; j < fastaFileNames.size(); j++) { - rename((fastaFileNames[j] + toString(j) + ".temp").c_str(), fastaFileNames[j].c_str()); - } - if(qFileName != ""){ - for (int j = 0; j < qualFileNames.size(); j++) { - rename((qualFileNames[j] + toString(j) + ".temp").c_str(), qualFileNames[j].c_str()); - } - } - - if (m->control_pressed) { return 0; } + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, trimQualFile, scrapQualFile, groupFile, fastaFileNames, qualFileNames, lines[0], qLines[0]); #endif - - + + if (m->control_pressed) { return 0; } + + set blanks; for(int i=0;i 0) { remove(fastaFileNames[i].c_str()); } + if (m->isBlank(fastaFileNames[i])) { blanks.insert(fastaFileNames[i]); } + else if (filesToRemove.count(fastaFileNames[i]) > 0) { remove(fastaFileNames[i].c_str()); } else { ifstream inFASTA; string seqName; - openInputFile(fastaFileNames[i], inFASTA); + m->openInputFile(fastaFileNames[i], inFASTA); ofstream outGroups; - string outGroupFilename = outputDir + getRootName(getSimpleName(fastaFileNames[i])) + "groups"; - openOutputFile(outGroupFilename, outGroups); - outputNames.push_back(outGroupFilename); + string outGroupFilename = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[i])) + "groups"; + + //if the fastafile is on the blanks list then the groups file should be as well + if (blanks.count(fastaFileNames[i]) != 0) { blanks.insert(outGroupFilename); } + + m->openOutputFile(outGroupFilename, outGroups); + outputNames.push_back(outGroupFilename); outputTypes["group"].push_back(outGroupFilename); string thisGroup = ""; if (i > comboStarts) { @@ -349,7 +404,7 @@ int TrimSeqsCommand::execute(){ if(itCombo->second == i){ thisGroup = itCombo->first; combos.erase(itCombo); break; } } }else{ thisGroup = groupVector[i]; } - + while(!inFASTA.eof()){ if(inFASTA.get() == '>'){ inFASTA >> seqName; @@ -362,30 +417,17 @@ int TrimSeqsCommand::execute(){ } } + for (set::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) { remove((*(itBlanks)).c_str()); } + + blanks.clear(); if(qFileName != ""){ for(int i=0;i 0) { remove(qualFileNames[i].c_str()); } - else { - ifstream inQual; - string seqName; - openInputFile(qualFileNames[i], inQual); -// ofstream outGroups; -// -// string thisGroup = ""; -// if (i > comboStarts) { -// map::iterator itCombo; -// for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){ -// if(itCombo->second == i){ thisGroup = itCombo->first; combos.erase(itCombo); break; } -// } -// } -// else{ thisGroup = groupVector[i]; } - - inQual.close(); - } + if (m->isBlank(qualFileNames[i])) { blanks.insert(qualFileNames[i]); } + else if (filesToRemove.count(qualFileNames[i]) > 0) { remove(qualFileNames[i].c_str()); } } } + for (set::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) { remove((*(itBlanks)).c_str()); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } @@ -413,47 +455,44 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string try { ofstream outFASTA; - int able = openOutputFile(trimFile, outFASTA); + int able = m->openOutputFile(trimFile, outFASTA); ofstream scrapFASTA; - openOutputFile(scrapFile, scrapFASTA); + m->openOutputFile(scrapFile, scrapFASTA); ofstream outQual; ofstream scrapQual; if(qFileName != ""){ - openOutputFile(trimQFile, outQual); - openOutputFile(scrapQFile, scrapQual); + m->openOutputFile(trimQFile, outQual); + m->openOutputFile(scrapQFile, scrapQual); } ofstream outGroups; - vector fastaFileNames; - vector qualFileNames; - if (oligoFile != "") { - openOutputFile(groupFile, outGroups); - for (int i = 0; i < fastaNames.size(); i++) { - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); - if(qFileName != ""){ - qualFileNames.push_back(new ofstream((qualNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); - } - #else - fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate)); - if(qFileName != ""){ - qualFileNames.push_back(new ofstream((qualNames[i] + toString(i) + ".temp").c_str(), ios::ate)); - } - #endif - } + m->openOutputFile(groupFile, outGroups); } ifstream inFASTA; - openInputFile(filename, inFASTA); + m->openInputFile(filename, inFASTA); inFASTA.seekg(line->start); ifstream qFile; - if(qFileName != "") { openInputFile(qFileName, qFile); qFile.seekg(qline->start); } + if(qFileName != "") { m->openInputFile(qFileName, qFile); qFile.seekg(qline->start); } + + for (int i = 0; i < fastaNames.size(); i++) { //clears old file + ofstream temp; + m->openOutputFile(fastaNames[i], temp); + temp.close(); + } + for (int i = 0; i < qualNames.size(); i++) { //clears old file + ofstream temp; + m->openOutputFile(qualNames[i], temp); + temp.close(); + } + + bool done = false; int count = 0; @@ -462,12 +501,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (m->control_pressed) { inFASTA.close(); outFASTA.close(); scrapFASTA.close(); if (oligoFile != "") { outGroups.close(); } - - for(int i=0;iclose(); delete fastaFileNames[i]; } if(qFileName != ""){ qFile.close(); - for(int i=0;iclose(); delete qualFileNames[i]; } } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } @@ -476,10 +512,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int success = 1; - Sequence currSeq(inFASTA); gobble(inFASTA); + + Sequence currSeq(inFASTA); m->gobble(inFASTA); + QualityScores currQual; if(qFileName != ""){ - currQual = QualityScores(qFile, currSeq.getNumBases()); gobble(qFile); + currQual = QualityScores(qFile, currSeq.getNumBases()); m->gobble(qFile); } string origSeq = currSeq.getUnaligned(); @@ -488,25 +526,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string string trashCode = ""; int currentSeqsDiffs = 0; - if(qFileName != ""){ - if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); } - else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); } - else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); } - else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); } - - if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { - success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap - } - - if(!success) { trashCode += 'q'; } - } - if(barcodes.size() != 0){ success = stripBarcode(currSeq, currQual, groupBar); if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } - + if(numFPrimers != 0){ success = stripForward(currSeq, currQual, groupPrime); if(success > pdiffs) { trashCode += 'f'; } @@ -514,11 +539,36 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } - + if(numRPrimers != 0){ success = stripReverse(currSeq, currQual); if(!success) { trashCode += 'r'; } } + + if(keepFirst != 0){ + success = keepFirstTrim(currSeq, currQual); + } + + if(removeLast != 0){ + success = removeLastTrim(currSeq, currQual); + if(!success) { trashCode += 'l'; } + } + + + if(qFileName != ""){ + + if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); } + else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); } + else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); } + else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); } + else { success = 1; } + +// if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { +// success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap +// } + + if(!success) { trashCode += 'q'; } + } if(minLength > 0 || maxLength > 0){ success = cullLength(currSeq); @@ -533,7 +583,10 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(!success) { trashCode += 'n'; } } - if(flip){ currSeq.reverseComplement(); currQual.flipQScores(); } // should go last + if(flip){ // should go last + currSeq.reverseComplement(); + currQual.flipQScores(); + } if(trashCode.length() == 0){ currSeq.setAligned(currSeq.getUnaligned()); @@ -552,10 +605,44 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } outGroups << currSeq.getName() << '\t' << thisGroup << endl; if(allFiles){ - currSeq.printSequence(*fastaFileNames[indexToFastaFile]); + ofstream outTemp; + m->openOutputFileAppend(fastaNames[indexToFastaFile], outTemp); + //currSeq.printSequence(*fastaFileNames[indexToFastaFile]); + currSeq.printSequence(outTemp); + outTemp.close(); if(qFileName != ""){ - currQual.printQScores(*qualFileNames[indexToFastaFile]); + //currQual.printQScores(*qualFileNames[indexToFastaFile]); + ofstream outTemp2; + m->openOutputFileAppend(qualNames[indexToFastaFile], outTemp2); + currQual.printQScores(outTemp2); + outTemp2.close(); + } + } + } + + if (groupfile != "") { + string thisGroup = groupMap->getGroup(currSeq.getName()); + + if (thisGroup != "not found") { + outGroups << currSeq.getName() << '\t' << thisGroup << endl; + if (allFiles) { + ofstream outTemp; + m->openOutputFileAppend(fastaNames[groupToIndex[thisGroup]], outTemp); + currSeq.printSequence(outTemp); + outTemp.close(); + if(qFileName != ""){ + ofstream outTemp2; + m->openOutputFileAppend(qualNames[groupToIndex[thisGroup]], outTemp2); + currQual.printQScores(outTemp2); + outTemp2.close(); + } + } + }else{ + m->mothurOut(currSeq.getName() + " is not in your groupfile, adding to group XXX."); m->mothurOutEndLine(); + outGroups << currSeq.getName() << '\t' << "XXX" << endl; + if (allFiles) { + m->mothurOut("[ERROR]: " + currSeq.getName() + " will not be added to any .group.fasta or .group.qual file."); m->mothurOutEndLine(); } } } @@ -570,9 +657,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string count++; } - unsigned long int pos = inFASTA.tellg(); - if ((pos == -1) || (pos >= line->end)) { break; } - + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= line->end)) { break; } + #else + if (inFASTA.eof()) { break; } + #endif + //report progress if((count) % 1000 == 0){ m->mothurOut(toString(count)); m->mothurOutEndLine(); } @@ -587,18 +678,6 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (oligoFile != "") { outGroups.close(); } if(qFileName != "") { qFile.close(); scrapQual.close(); outQual.close(); } - for(int i=0;iclose(); - delete fastaFileNames[i]; - } - - if(qFileName != ""){ - for(int i=0;iclose(); - delete qualFileNames[i]; - } - } - return count; } catch(exception& e) { @@ -612,7 +691,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName, string trimFile, string scrapFile, string trimQFile, string scrapQFile, string groupFile, vector fastaNames, vector qualNames) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; + int process = 1; int exitCommand = 1; processIDS.clear(); @@ -624,17 +703,94 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ + for (int i = 0; i < fastaNames.size(); i++) { + fastaNames[i] = (fastaNames[i] + toString(getpid()) + ".temp"); + //clear old file if it exists + ofstream temp; + m->openOutputFile(fastaNames[i], temp); + temp.close(); + if(qFileName != ""){ + qualNames[i] = (qualNames[i] + toString(getpid()) + ".temp"); + //clear old file if it exists + ofstream temp2; + m->openOutputFile(qualNames[i], temp2); + temp2.close(); + } + } + driverCreateTrim(filename, qFileName, (trimFile + toString(getpid()) + ".temp"), (scrapFile + toString(getpid()) + ".temp"), (trimQFile + toString(getpid()) + ".temp"), (scrapQFile + toString(getpid()) + ".temp"), (groupFile + toString(getpid()) + ".temp"), fastaNames, qualNames, lines[process], qLines[process]); exit(0); - }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } } + //parent do my part + for (int i = 0; i < fastaNames.size(); i++) { + //clear old file if it exists + ofstream temp; + m->openOutputFile(fastaNames[i], temp); + temp.close(); + if(qFileName != ""){ + //clear old file if it exists + ofstream temp2; + m->openOutputFile(qualNames[i], temp2); + temp2.close(); + } + } + + driverCreateTrim(filename, qFileName, trimFile, scrapFile, trimQFile, scrapQFile, groupFile, fastaNames, qualNames, lines[0], qLines[0]); + + //force parent to wait until all the processes are done - for (int i=0;imothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); + + m->appendFiles((trimFile + toString(processIDS[i]) + ".temp"), trimFile); + remove((trimFile + toString(processIDS[i]) + ".temp").c_str()); + m->appendFiles((scrapFile + toString(processIDS[i]) + ".temp"), scrapFile); + remove((scrapFile + toString(processIDS[i]) + ".temp").c_str()); + + m->mothurOut("Done with fasta files"); m->mothurOutEndLine(); + + if(qFileName != ""){ + m->appendFiles((trimQFile + toString(processIDS[i]) + ".temp"), trimQFile); + remove((trimQFile + toString(processIDS[i]) + ".temp").c_str()); + m->appendFiles((scrapQFile + toString(processIDS[i]) + ".temp"), scrapQFile); + remove((scrapQFile + toString(processIDS[i]) + ".temp").c_str()); + + m->mothurOut("Done with quality files"); m->mothurOutEndLine(); + } + + m->appendFiles((groupFile + toString(processIDS[i]) + ".temp"), groupFile); + remove((groupFile + toString(processIDS[i]) + ".temp").c_str()); + + m->mothurOut("Done with group file"); m->mothurOutEndLine(); + + for (int j = 0; j < fastaNames.size(); j++) { + m->appendFiles((fastaNames[j] + toString(processIDS[i]) + ".temp"), fastaNames[j]); + remove((fastaNames[j] + toString(processIDS[i]) + ".temp").c_str()); + } + + if(qFileName != ""){ + for (int j = 0; j < qualNames.size(); j++) { + m->appendFiles((qualNames[j] + toString(processIDS[i]) + ".temp"), qualNames[j]); + remove((qualNames[j] + toString(processIDS[i]) + ".temp").c_str()); + } + } + + if (allFiles) { m->mothurOut("Done with allfiles"); m->mothurOutEndLine(); } + } + return exitCommand; #endif } @@ -650,7 +806,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vectordivideFile(filename, processors); if (qfilename == "") { return processors; } @@ -658,7 +814,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector firstSeqNames; for (int i = 0; i < (fastaFilePos.size()-1); i++) { ifstream in; - openInputFile(filename, in); + m->openInputFile(filename, in); in.seekg(fastaFilePos[i]); Sequence temp(in); @@ -669,11 +825,11 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vectoropenInputFile(qfilename, inQual); + string input; while(!inQual.eof()){ - input = getline(inQual); + input = m->getline(inQual); if (input.length() != 0) { if(input[0] == '>'){ //this is a sequence name line @@ -696,6 +852,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector::iterator it = firstSeqNames.begin(); it != firstSeqNames.end(); it++) { m->mothurOut(it->first + " is in your fasta file and not in your quality file, not using quality file."); m->mothurOutEndLine(); @@ -731,7 +888,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector& outFASTAVec, vector& outQualVec){ try { ifstream inOligos; - openInputFile(oligoFile, inOligos); + m->openInputFile(oligoFile, inOligos); ofstream test; @@ -740,7 +897,7 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec, vector& out //int indexPrimer = 0; while(!inOligos.eof()){ - inOligos >> type; + inOligos >> type; m->gobble(inOligos); if(type[0] == '#'){ while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there @@ -771,27 +928,29 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec, vector& out map::iterator itPrime = primers.find(oligo); if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } - primers[oligo]=index; index++; - groupVector.push_back(group); + primers[oligo]=index; index++; + groupVector.push_back(group); - if(allFiles){ - outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); - if(qFileName != ""){ - outQualVec.push_back((outputDir + getRootName(getSimpleName(qFileName)) + group + ".qual")); - } - if (group == "") { //if there is not a group for this primer, then this file will not get written to, but we add it to keep the indexes correct - filesToRemove.insert((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); + if(allFiles){ + outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); if(qFileName != ""){ - filesToRemove.insert((outputDir + getRootName(getSimpleName(qFileName)) + group + ".qual")); + outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + } + if (group == "") { //if there is not a group for this primer, then this file will not get written to, but we add it to keep the indexes correct + filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + if(qFileName != ""){ + filesToRemove.insert((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + } + }else { + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + if(qFileName != ""){ + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + } } - }else { - outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); - if(qFileName != ""){ - outputNames.push_back((outputDir + getRootName(getSimpleName(qFileName)) + group + ".qual")); - } } - } - + } else if(type == "REVERSE"){ Sequence oligoRC("reverse", oligo); @@ -805,20 +964,23 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec, vector& out map::iterator itBar = barcodes.find(oligo); if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } - barcodes[oligo]=index; index++; - groupVector.push_back(group); + barcodes[oligo]=index; index++; + groupVector.push_back(group); + + if(allFiles){ + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + group + ".fasta")); + if(qFileName != ""){ + outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + group + ".qual")); + } + } - if(allFiles){ - outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); - outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); - if(qFileName != ""){ - outQualVec.push_back((outputDir + getRootName(getSimpleName(qFileName)) + group + ".qual")); - outputNames.push_back((outputDir + getRootName(getSimpleName(qFileName)) + group + ".qual")); - } - } }else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } - gobble(inOligos); + m->gobble(inOligos); } inOligos.close(); @@ -829,13 +991,15 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec, vector& out for (map::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) { for (map::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) { if (groupVector[itPrime->second] != "") { //there is a group for this primer - outputNames.push_back((outputDir + getRootName(getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); - outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); + outputTypes["fasta"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); + outFASTAVec.push_back((outputDir + m->getRootName(m->getSimpleName(fastaFile)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".fasta")); combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1; if(qFileName != ""){ - outQualVec.push_back((outputDir + getRootName(getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); - outputNames.push_back((outputDir + getRootName(getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); + outQualVec.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); + outputNames.push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); + outputTypes["qual"].push_back((outputDir + m->getRootName(m->getSimpleName(qFileName)) + groupVector[itBar->second] + "." + groupVector[itPrime->second] + ".qual")); } } } @@ -999,7 +1163,6 @@ int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group seq.setUnaligned(rawSequence.substr(oligo.length())); if(qual.getName() != ""){ qual.trimQScores(oligo.length(), -1); - } success = 0; break; @@ -1137,6 +1300,52 @@ bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){ //*************************************************************************************************************** +bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){ + try { + bool success = 1; + if(qscores.getName() != ""){ + qscores.trimQScores(-1, keepFirst); + } + sequence.trim(keepFirst); + return success; + } + catch(exception& e) { + m->errorOut(e, "keepFirstTrim", "countDiffs"); + exit(1); + } + +} + +//*************************************************************************************************************** + +bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){ + try { + bool success = 0; + + int length = sequence.getNumBases() - removeLast; + + if(length > 0){ + if(qscores.getName() != ""){ + qscores.trimQScores(-1, length); + } + sequence.trim(length); + success = 1; + } + else{ + success = 0; + } + + return success; + } + catch(exception& e) { + m->errorOut(e, "removeLastTrim", "countDiffs"); + exit(1); + } + +} + +//*************************************************************************************************************** + bool TrimSeqsCommand::cullLength(Sequence& seq){ try { @@ -1233,6 +1442,7 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ } } + //*************************************************************************************************************** int TrimSeqsCommand::countDiffs(string oligo, string seq){ @@ -1268,78 +1478,5 @@ int TrimSeqsCommand::countDiffs(string oligo, string seq){ } } -//*************************************************************************************************************** - -//bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ -// try { -// -// string rawSequence = seq.getUnaligned(); -// int seqLength = seq.getNumBases(); -// bool success = 0; //guilty until proven innocent -// string name; -// -// qFile >> name; -// if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } -// -// while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } -// -// int score; -// int end = seqLength; -// -// for(int i=0;i> score; -// -// if(score < qThreshold){ -// end = i; -// break; -// } -// } -// for(int i=end+1;i> score; -// } -// -// seq.setUnaligned(rawSequence.substr(0,end)); -// -// return 1; -// } -// catch(exception& e) { -// m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); -// exit(1); -// } -//} - -//*************************************************************************************************************** - -//bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ -// try { -// string rawSequence = seq.getUnaligned(); -// int seqLength = seq.getNumBases(); -// bool success = 0; //guilty until proven innocent -// string name; -// -// qFile >> name; -// if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } -// -// while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } -// -// float score; -// float average = 0; -// -// for(int i=0;i> score; -// average += score; -// } -// average /= seqLength; -// -// if(average >= qAverage) { success = 1; } -// else { success = 0; } -// -// return success; -// } -// catch(exception& e) { -// m->errorOut(e, "TrimSeqsCommand", "cullQualAverage"); -// exit(1); -// } -//} //***************************************************************************************************************