X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=f7773244bc37d888d4c44818cdd8dd1d020444ba;hb=c82900be3ceed3d9bc491bdc98b1819ef85c1af7;hp=132cc6b767956e2c5b522adb4facb9904fc30a78;hpb=cd37904452dc95b183ff313ff05720c562902487;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 132cc6b..f777324 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -21,7 +21,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ else { //valid paramters for this command - string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", "qthreshold", "qaverage", "allfiles"}; + string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", "qthreshold", "qaverage", "allfiles", "qtrim"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -37,7 +37,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ //check for required parameters fastaFile = validParameter.validFile(parameters, "fasta", true); - if (fastaFile == "not found") { cout << "fasta is a required parameter for the screen.seqs command." << endl; abort = true; } + if (fastaFile == "not found") { mothurOut("fasta is a required parameter for the screen.seqs command."); mothurOutEndLine(); abort = true; } else if (fastaFile == "not open") { abort = true; } @@ -72,62 +72,66 @@ TrimSeqsCommand::TrimSeqsCommand(string option){ temp = validParameter.validFile(parameters, "qthreshold", false); if (temp == "not found") { temp = "0"; } convert(temp, qThreshold); + + temp = validParameter.validFile(parameters, "qtrim", false); if (temp == "not found") { temp = "F"; } + qtrim = isTrue(temp); temp = validParameter.validFile(parameters, "qaverage", false); if (temp == "not found") { temp = "0"; } convert(temp, qAverage); temp = validParameter.validFile(parameters, "allfiles", false); if (temp == "not found") { temp = "F"; } allFiles = isTrue(temp); - + if(allFiles && oligoFile == ""){ - cout << "You selected allfiles, but didn't enter an oligos file. Ignoring the allfiles request." << endl; + mothurOut("You selected allfiles, but didn't enter an oligos file. Ignoring the allfiles request."); mothurOutEndLine(); + } + if((qAverage != 0 && qThreshold != 0) && qFileName == ""){ + mothurOut("You didn't provide a quality file name, quality criteria will be ignored."); mothurOutEndLine(); + qAverage=0; + qThreshold=0; } - if(!flip && oligoFile=="" && !maxLength && !minLength && (maxAmbig==-1) && !maxHomoP && qFileName == ""){ - cout << "You didn't set any options... quiting command." << endl; + mothurOut("You didn't set any options... quiting command."); mothurOutEndLine(); abort = true; } - } } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function TrimSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + errorOut(e, "TrimSeqsCommand", "TrimSeqsCommand"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function TrimSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } } //********************************************************************************************************************** void TrimSeqsCommand::help(){ try { - cout << "The trim.seqs command reads a fastaFile and creates ....." << "\n"; - cout << "The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength and maxlength." << "\n"; - cout << "The fasta parameter is required." << "\n"; - cout << "The flip parameter .... The default is 0." << "\n"; - cout << "The oligos parameter .... The default is ""." << "\n"; - cout << "The maxambig parameter .... The default is -1." << "\n"; - cout << "The maxhomop parameter .... The default is 0." << "\n"; - cout << "The minlength parameter .... The default is 0." << "\n"; - cout << "The maxlength parameter .... The default is 0." << "\n"; - cout << "The trim.seqs command should be in the following format: " << "\n"; - cout << "trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, " << "\n"; - cout << "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) " << "\n"; - cout << "Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...)." << "\n"; - cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta)." << "\n" << "\n"; + mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); + mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, qtrim and allfiles.\n"); + mothurOut("The fasta parameter is required.\n"); + mothurOut("The flip parameter .... The default is 0.\n"); + mothurOut("The oligos parameter .... The default is "".\n"); + mothurOut("The maxambig parameter .... The default is -1.\n"); + mothurOut("The maxhomop parameter .... The default is 0.\n"); + mothurOut("The minlength parameter .... The default is 0.\n"); + mothurOut("The maxlength parameter .... The default is 0.\n"); + mothurOut("The qfile parameter .....\n"); + mothurOut("The qthreshold parameter .... The default is 0.\n"); + mothurOut("The qaverage parameter .... The default is 0.\n"); + mothurOut("The allfiles parameter .... The default is F.\n"); + mothurOut("The qtrim parameter .... The default is F.\n"); + mothurOut("The trim.seqs command should be in the following format: \n"); + mothurOut("trim.seqs(fasta=yourFastaFile, flip=yourFlip, oligos=yourOligos, maxambig=yourMaxambig, \n"); + mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n"); + mothurOut("Example trim.seqs(fasta=abrecovery.fasta, flip=..., oligos=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n"); + mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n"); + mothurOut("For more details please check out the wiki http://www.mothur.org/wiki/Trim.seqs .\n\n"); } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + errorOut(e, "TrimSeqsCommand", "help"); exit(1); } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } } @@ -141,21 +145,21 @@ int TrimSeqsCommand::execute(){ try{ if (abort == true) { return 0; } - - vector groupFileNames; - vector fastaFileNames; - if(oligoFile != "") { getOligos(fastaFileNames, groupFileNames); } ifstream inFASTA; openInputFile(fastaFile, inFASTA); - + ofstream outFASTA; string trimSeqFile = getRootName(fastaFile) + "trim.fasta"; openOutputFile(trimSeqFile, outFASTA); ofstream outGroups; - string groupFile = getRootName(fastaFile) + "groups"; - openOutputFile(groupFile, outGroups); + vector fastaFileNames; + if(oligoFile != ""){ + string groupFile = getRootName(fastaFile) + "groups"; + openOutputFile(groupFile, outGroups); + getOligos(fastaFileNames); + } ofstream scrapFASTA; string scrapSeqFile = getRootName(fastaFile) + "scrap.fasta"; @@ -165,60 +169,66 @@ int TrimSeqsCommand::execute(){ if(qFileName != "") { openInputFile(qFileName, qFile); } bool success; - + while(!inFASTA.eof()){ Sequence currSeq(inFASTA); string origSeq = currSeq.getUnaligned(); - int group; - string trashCode = ""; - - if(qFileName != ""){ - if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } - else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } - if(!success) { trashCode += 'q'; } - } - if(barcodes.size() != 0){ - success = stripBarcode(currSeq, group); - if(!success){ trashCode += 'b'; } - } - if(numFPrimers != 0){ - success = stripForward(currSeq); - if(!success){ trashCode += 'f'; } - } - if(numRPrimers != 0){ - success = stripReverse(currSeq); - if(!success){ trashCode += 'r'; } - } - if(minLength > 0 || maxLength > 0){ - success = cullLength(currSeq); - if(!success){ trashCode += 'l'; } - } - if(maxHomoP > 0){ - success = cullHomoP(currSeq); - if(!success){ trashCode += 'h'; } - } - if(maxAmbig != -1){ - success = cullAmbigs(currSeq); - if(!success){ trashCode += 'n'; } - } - - if(flip){ currSeq.reverseComplement(); } // should go last - - if(trashCode.length() == 0){ - currSeq.printSequence(outFASTA); + if (origSeq != "") { + int group; + string trashCode = ""; + + if(qFileName != ""){ + if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } + else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } + if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { + success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap + } + if(!success) { trashCode += 'q'; } + } if(barcodes.size() != 0){ - outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; - if(allFiles){ - *groupFileNames[group] << currSeq.getName() << '\t' << groupVector[group] << endl; - currSeq.printSequence(*fastaFileNames[group]); + success = stripBarcode(currSeq, group); + if(!success){ trashCode += 'b'; } + } + if(numFPrimers != 0){ + success = stripForward(currSeq); + if(!success){ trashCode += 'f'; } + } + if(numRPrimers != 0){ + success = stripReverse(currSeq); + if(!success){ trashCode += 'r'; } + } + if(minLength > 0 || maxLength > 0){ + success = cullLength(currSeq); + if(!success){ trashCode += 'l'; } + } + if(maxHomoP > 0){ + success = cullHomoP(currSeq); + if(!success){ trashCode += 'h'; } + } + if(maxAmbig != -1){ + success = cullAmbigs(currSeq); + if(!success){ trashCode += 'n'; } + } + + if(flip){ currSeq.reverseComplement(); } // should go last + + if(trashCode.length() == 0){ + currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed. + currSeq.printSequence(outFASTA); + if(barcodes.size() != 0){ + outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; + + if(allFiles){ + currSeq.printSequence(*fastaFileNames[group]); + } } } - } - else{ - currSeq.setName(currSeq.getName() + '|' + trashCode); - currSeq.setUnaligned(origSeq); - currSeq.printSequence(scrapFASTA); + else{ + currSeq.setName(currSeq.getName() + '|' + trashCode); + currSeq.setUnaligned(origSeq); + currSeq.printSequence(scrapFASTA); + } } gobble(inFASTA); } @@ -226,288 +236,356 @@ int TrimSeqsCommand::execute(){ outFASTA.close(); scrapFASTA.close(); outGroups.close(); + if(qFileName != "") { qFile.close(); } - for(int i=0;iclose(); - delete groupFileNames[i]; - + for(int i=0;iclose(); delete fastaFileNames[i]; + } + + for(int i=0;i'){ + inFASTA >> seqName; + outGroups << seqName << '\t' << groupVector[i] << endl; + } + while (!inFASTA.eof()) { char c = inFASTA.get(); if (c == 10 || c == 13){ break; } } + } + outGroups.close(); + inFASTA.close(); } + return 0; } catch(exception& e) { - cout << "Standard Error: " << e.what() << " has occurred in the TrimSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; - exit(1); - } - catch(...) { - cout << "An unknown error has occurred in the TrimSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + errorOut(e, "TrimSeqsCommand", "execute"); exit(1); } } //*************************************************************************************************************** -void TrimSeqsCommand::getOligos(vector& outFASTAVec, vector& outGroupsVec){ - - ifstream inOligos; - openInputFile(oligoFile, inOligos); - - ofstream test; - - string type, oligo, group; - int index=0; - - while(!inOligos.eof()){ - inOligos >> type; +void TrimSeqsCommand::getOligos(vector& outFASTAVec){ + try { + ifstream inOligos; + openInputFile(oligoFile, inOligos); - if(type[0] == '#'){ - while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there - } - else{ - inOligos >> oligo; - - for(int i=0;i> type; - if(type == "forward"){ - forPrimer.push_back(oligo); - } - else if(type == "reverse"){ - revPrimer.push_back(oligo); + if(type[0] == '#'){ + while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there } - else if(type == "barcode"){ - inOligos >> group; - barcodes[oligo]=index++; - groupVector.push_back(group); + else{ + inOligos >> oligo; + + for(int i=0;i> group; + barcodes[oligo]=index++; + groupVector.push_back(group); - if(allFiles){ - outFASTAVec.push_back(new ofstream((getRootName(fastaFile) + group + ".fasta").c_str(), ios::ate)); - outGroupsVec.push_back(new ofstream((getRootName(fastaFile) + group + ".groups").c_str(), ios::ate)); + if(allFiles){ + outFASTAVec.push_back(new ofstream((getRootName(fastaFile) + group + ".fasta").c_str(), ios::ate)); + } } } } + + inOligos.close(); + + numFPrimers = forPrimer.size(); + numRPrimers = revPrimer.size(); + } - - numFPrimers = forPrimer.size(); - numRPrimers = revPrimer.size(); + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "getOligos"); + exit(1); + } + } //*************************************************************************************************************** bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ - string oligo = it->first; - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = 0; - break; - } + try { + string rawSequence = seq.getUnaligned(); + bool success = 0; //guilty until proven innocent - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ - group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 1; - break; + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = 0; + break; + } + + if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(oligo.length())); + success = 1; + break; + } } + return success; + } - return success; - + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "stripBarcode"); + exit(1); + } + } //*************************************************************************************************************** bool TrimSeqsCommand::stripForward(Sequence& seq){ - - string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent - - for(int i=0;i= minLength && maxLength == 0) { success = 1; } - else if(length >= minLength && length <= maxLength) { success = 1; } - else { success = 0; } + int length = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + + if(length >= minLength && maxLength == 0) { success = 1; } + else if(length >= minLength && length <= maxLength) { success = 1; } + else { success = 0; } + + return success; - return success; + } + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "cullLength"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullHomoP(Sequence& seq){ - - int longHomoP = seq.getLongHomoPolymer(); - bool success = 0; //guilty until proven innocent - - if(longHomoP <= maxHomoP){ success = 1; } - else { success = 0; } - - return success; + try { + int longHomoP = seq.getLongHomoPolymer(); + bool success = 0; //guilty until proven innocent + + if(longHomoP <= maxHomoP){ success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "cullHomoP"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::cullAmbigs(Sequence& seq){ - - int numNs = seq.getAmbigBases(); - bool success = 0; //guilty until proven innocent - - if(numNs <= maxAmbig) { success = 1; } - else { success = 0; } - - return success; + try { + int numNs = seq.getAmbigBases(); + bool success = 0; //guilty until proven innocent + + if(numNs <= maxAmbig) { success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "cullAmbigs"); + exit(1); + } } //*************************************************************************************************************** bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ - - bool success = 1; - int length = oligo.length(); - - for(int i=0;i> name; - if(name.substr(1) != seq.getName()) { cout << "sequence name mismatch btwn fasta and qual file" << endl; } - while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } - - int score; - int end = seqLength; - - for(int i=0;i> score; - - if(score <= qThreshold){ - end = i; - break; + try { + string rawSequence = seq.getUnaligned(); + int seqLength = rawSequence.length(); + string name; + + qFile >> name; + if (name.length() != 0) { if(name.substr(1) != seq.getName()) { mothurOut("sequence name mismatch btwn fasta and qual file"); mothurOutEndLine(); } } + while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } + + int score; + int end = seqLength; + + for(int i=0;i> score; + + if(score <= qThreshold){ + end = i; + break; + } + } + for(int i=end+1;i> score; } + + seq.setUnaligned(rawSequence.substr(0,end)); + + return 1; } - for(int i=end+1;i> score; + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "stripQualThreshold"); + exit(1); } - - seq.setUnaligned(rawSequence.substr(0,end)); - - return 1; } //*************************************************************************************************************** bool TrimSeqsCommand::cullQualAverage(Sequence& seq, ifstream& qFile){ - - string rawSequence = seq.getUnaligned(); - int seqLength = seq.getNumBases(); - bool success = 0; //guilty until proven innocent - string name; - - qFile >> name; - if(name.substr(1) != seq.getName()) { cout << "sequence name mismatch btwn fasta and qual file" << endl; } - while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } - - float score; - float average = 0; - - for(int i=0;i> score; - average += score; + try { + string rawSequence = seq.getUnaligned(); + int seqLength = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + string name; + + qFile >> name; + if (name[0] == '>') { if(name.substr(1) != seq.getName()) { mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); mothurOutEndLine(); } } + + while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } + + float score; + float average = 0; + + for(int i=0;i> score; + average += score; + } + average /= seqLength; + + if(average >= qAverage) { success = 1; } + else { success = 0; } + + return success; + } + catch(exception& e) { + errorOut(e, "TrimSeqsCommand", "cullQualAverage"); + exit(1); } - average /= seqLength; - - if(average >= qAverage) { success = 1; } - else { success = 0; } - - return success; } //*************************************************************************************************************** - -