X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=2dc07794be5abb2f239e4b228c3aacbf51c51b6e;hb=d04f948b1a2a1a2984fc4a45d04403b8c121c5bc;hp=5e0541f9cb954575d39d277cdf4a50d6931c21c2;hpb=def6801aad4aadbbaa7cc615b11554e47dad5ce0;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 5e0541f..2dc0779 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -11,10 +11,13 @@ #include "needlemanoverlap.hpp" //********************************************************************************************************************** + vector TrimSeqsCommand::getValidParameters(){ try { string Array[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", - "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; + "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", + "keepfirst", "removelast", + "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); return myArray; } @@ -23,7 +26,9 @@ vector TrimSeqsCommand::getValidParameters(){ exit(1); } } + //********************************************************************************************************************** + TrimSeqsCommand::TrimSeqsCommand(){ try { abort = true; @@ -38,7 +43,9 @@ TrimSeqsCommand::TrimSeqsCommand(){ exit(1); } } + //********************************************************************************************************************** + vector TrimSeqsCommand::getRequiredParameters(){ try { string Array[] = {"fasta"}; @@ -50,7 +57,9 @@ vector TrimSeqsCommand::getRequiredParameters(){ exit(1); } } + //********************************************************************************************************************** + vector TrimSeqsCommand::getRequiredFiles(){ try { vector myArray; @@ -61,6 +70,7 @@ vector TrimSeqsCommand::getRequiredFiles(){ exit(1); } } + //*************************************************************************************************************** TrimSeqsCommand::TrimSeqsCommand(string option) { @@ -74,8 +84,10 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { else { //valid paramters for this command - string AlignArray[] = {"fasta", "flip", "group","oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", - "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; + string AlignArray[] = { "fasta", "flip", "oligos", "maxambig", "maxhomop", "group","minlength", "maxlength", "qfile", + "qthreshold", "qwindowaverage", "qstepsize", "qwindowsize", "qaverage", "rollaverage", + "keepfirst", "removelast", + "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -204,14 +216,20 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "qwindowaverage", false);if (temp == "not found") { temp = "0"; } convert(temp, qWindowAverage); - temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "100"; } + temp = validParameter.validFile(parameters, "qwindowsize", false); if (temp == "not found") { temp = "50"; } convert(temp, qWindowSize); - temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "10"; } + temp = validParameter.validFile(parameters, "qstepsize", false); if (temp == "not found") { temp = "1"; } convert(temp, qWindowStep); temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; } convert(temp, qAverage); + + temp = validParameter.validFile(parameters, "keepfirst", false); if (temp == "not found") { temp = "0"; } + convert(temp, keepFirst); + + temp = validParameter.validFile(parameters, "removelast", false); if (temp == "not found") { temp = "0"; } + convert(temp, removeLast); temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = m->isTrue(temp); @@ -244,6 +262,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { exit(1); } } + //********************************************************************************************************************** void TrimSeqsCommand::help(){ @@ -337,7 +356,7 @@ int TrimSeqsCommand::execute(){ outputNames.push_back(groupFile); outputTypes["group"].push_back(groupFile); getOligos(fastaFileNames, qualFileNames); } - + vector fastaFilePos; vector qFilePos; @@ -360,10 +379,10 @@ int TrimSeqsCommand::execute(){ #endif if (m->control_pressed) { return 0; } - - for(int i=0;iisBlank(fastaFileNames[i])) { remove(fastaFileNames[i].c_str()); } + set blanks; + for(int i=0;iisBlank(fastaFileNames[i])) { blanks.insert(fastaFileNames[i]); } else if (filesToRemove.count(fastaFileNames[i]) > 0) { remove(fastaFileNames[i].c_str()); } else { ifstream inFASTA; @@ -371,6 +390,10 @@ int TrimSeqsCommand::execute(){ m->openInputFile(fastaFileNames[i], inFASTA); ofstream outGroups; string outGroupFilename = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[i])) + "groups"; + + //if the fastafile is on the blanks list then the groups file should be as well + if (blanks.count(fastaFileNames[i]) != 0) { blanks.insert(outGroupFilename); } + m->openOutputFile(outGroupFilename, outGroups); outputNames.push_back(outGroupFilename); outputTypes["group"].push_back(outGroupFilename); @@ -394,30 +417,17 @@ int TrimSeqsCommand::execute(){ } } + for (set::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) { remove((*(itBlanks)).c_str()); } + + blanks.clear(); if(qFileName != ""){ for(int i=0;iisBlank(qualFileNames[i])) { remove(qualFileNames[i].c_str()); } + if (m->isBlank(qualFileNames[i])) { blanks.insert(qualFileNames[i]); } else if (filesToRemove.count(qualFileNames[i]) > 0) { remove(qualFileNames[i].c_str()); } - else { - ifstream inQual; - string seqName; - m->openInputFile(qualFileNames[i], inQual); -// ofstream outGroups; -// -// string thisGroup = ""; -// if (i > comboStarts) { -// map::iterator itCombo; -// for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){ -// if(itCombo->second == i){ thisGroup = itCombo->first; combos.erase(itCombo); break; } -// } -// } -// else{ thisGroup = groupVector[i]; } - - inQual.close(); - } } } + for (set::iterator itBlanks = blanks.begin(); itBlanks != blanks.end(); itBlanks++) { remove((*(itBlanks)).c_str()); } if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } @@ -445,7 +455,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string try { ofstream outFASTA; - int able = m->openOutputFile(trimFile, outFASTA); + m->openOutputFile(trimFile, outFASTA); ofstream scrapFASTA; m->openOutputFile(scrapFile, scrapFASTA); @@ -516,25 +526,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string string trashCode = ""; int currentSeqsDiffs = 0; - if(qFileName != ""){ - if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); } - else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); } - else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); } - else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); } - - if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { - success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap - } - - if(!success) { trashCode += 'q'; } - } - if(barcodes.size() != 0){ success = stripBarcode(currSeq, currQual, groupBar); if(success > bdiffs) { trashCode += 'b'; } else{ currentSeqsDiffs += success; } } - + if(numFPrimers != 0){ success = stripForward(currSeq, currQual, groupPrime); if(success > pdiffs) { trashCode += 'f'; } @@ -542,11 +539,36 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } - + if(numRPrimers != 0){ success = stripReverse(currSeq, currQual); if(!success) { trashCode += 'r'; } } + + if(keepFirst != 0){ + success = keepFirstTrim(currSeq, currQual); + } + + if(removeLast != 0){ + success = removeLastTrim(currSeq, currQual); + if(!success) { trashCode += 'l'; } + } + + + if(qFileName != ""){ + int origLength = currSeq.getNumBases(); + + if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); } + else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); } + else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); } + else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); } + else { success = 1; } + + //you don't want to trim, if it fails above then scrap it + if ((!qtrim) && (origLength != currSeq.getNumBases())) { success = 0; } + + if(!success) { trashCode += 'q'; } + } if(minLength > 0 || maxLength > 0){ success = cullLength(currSeq); @@ -561,7 +583,10 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(!success) { trashCode += 'n'; } } - if(flip){ currSeq.reverseComplement(); currQual.flipQScores(); } // should go last + if(flip){ // should go last + currSeq.reverseComplement(); + currQual.flipQScores(); + } if(trashCode.length() == 0){ currSeq.setAligned(currSeq.getUnaligned()); @@ -727,7 +752,8 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName //append files for(int i=0;imothurOut("Appending files from process " + processIDS[i]); m->mothurOutEndLine(); + + m->mothurOut("Appending files from process " + toString(processIDS[i])); m->mothurOutEndLine(); m->appendFiles((trimFile + toString(processIDS[i]) + ".temp"), trimFile); remove((trimFile + toString(processIDS[i]) + ".temp").c_str()); @@ -857,6 +883,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector& outFASTAVec, vector& outQualVec){ @@ -1068,7 +1095,6 @@ int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); - int newStart=0; int numDiff = countDiffs(oligo, temp); // cout << oligo << '\t' << temp << '\t' << numDiff << endl; @@ -1137,7 +1163,6 @@ int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group seq.setUnaligned(rawSequence.substr(oligo.length())); if(qual.getName() != ""){ qual.trimQScores(oligo.length(), -1); - } success = 0; break; @@ -1194,7 +1219,6 @@ int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); - int newStart=0; int numDiff = countDiffs(oligo, temp); // cout << oligo << '\t' << temp << '\t' << numDiff << endl; @@ -1275,6 +1299,52 @@ bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){ //*************************************************************************************************************** +bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){ + try { + bool success = 1; + if(qscores.getName() != ""){ + qscores.trimQScores(-1, keepFirst); + } + sequence.trim(keepFirst); + return success; + } + catch(exception& e) { + m->errorOut(e, "keepFirstTrim", "countDiffs"); + exit(1); + } + +} + +//*************************************************************************************************************** + +bool TrimSeqsCommand::removeLastTrim(Sequence& sequence, QualityScores& qscores){ + try { + bool success = 0; + + int length = sequence.getNumBases() - removeLast; + + if(length > 0){ + if(qscores.getName() != ""){ + qscores.trimQScores(-1, length); + } + sequence.trim(length); + success = 1; + } + else{ + success = 0; + } + + return success; + } + catch(exception& e) { + m->errorOut(e, "removeLastTrim", "countDiffs"); + exit(1); + } + +} + +//*************************************************************************************************************** + bool TrimSeqsCommand::cullLength(Sequence& seq){ try { @@ -1371,6 +1441,7 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ } } + //*************************************************************************************************************** int TrimSeqsCommand::countDiffs(string oligo, string seq){ @@ -1406,78 +1477,5 @@ int TrimSeqsCommand::countDiffs(string oligo, string seq){ } } -//*************************************************************************************************************** - -//bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ -// try { -// -// string rawSequence = seq.getUnaligned(); -// int seqLength = seq.getNumBases(); -// bool success = 0; //guilty until proven innocent -// string name; -// -// qFile >> name; -// if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } -// -// while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } -// -// int score; -// int end = seqLength; -// -// for(int i=0;i> score; -// -// if(score < qThreshold){ -// end = i; -// break; -// } -// } -// for(int i=end+1;i> score; -// } -// -// seq.setUnaligned(rawSequence.substr(0,end)); -// -// return 1; -// } -// catch(exception& e) { -// m->errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); -// exit(1); -// } -//} - -//*************************************************************************************************************** - -//bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ -// try { -// string rawSequence = seq.getUnaligned(); -// int seqLength = seq.getNumBases(); -// bool success = 0; //guilty until proven innocent -// string name; -// -// qFile >> name; -// if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } -// -// while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } -// -// float score; -// float average = 0; -// -// for(int i=0;i> score; -// average += score; -// } -// average /= seqLength; -// -// if(average >= qAverage) { success = 1; } -// else { success = 0; } -// -// return success; -// } -// catch(exception& e) { -// m->errorOut(e, "TrimSeqsCommand", "cullQualAverage"); -// exit(1); -// } -//} //***************************************************************************************************************