X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=cbf5e13603ced805d185196fad9129eacb573d22;hb=bc4c4d63e983a3e91e74b5038bfec6705e1a3a2e;hp=a544733375b004e651409ef47c9db62bb0f0756d;hpb=e21aedb9ffeabff18207d53e9cc5420d2e30a5b2;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index a544733..cbf5e13 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -16,6 +16,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { try { abort = false; + comboStarts = 0; //allow user to run help if(option == "help") { help(); abort = true; } @@ -163,7 +164,7 @@ void TrimSeqsCommand::help(){ m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n"); m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n"); m->mothurOut("The fasta parameter is required.\n"); - m->mothurOut("The flip parameter .... The default is 0.\n"); + m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"); m->mothurOut("The oligos parameter .... The default is "".\n"); m->mothurOut("The maxambig parameter .... The default is -1.\n"); m->mothurOut("The maxhomop parameter .... The default is 0.\n"); @@ -224,8 +225,9 @@ int TrimSeqsCommand::execute(){ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ ifstream inFASTA; + int numSeqs; openInputFile(fastaFile, inFASTA); - int numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + getNumSeqs(inFASTA, numSeqs); inFASTA.close(); lines.push_back(new linePair(0, numSeqs)); @@ -266,35 +268,51 @@ int TrimSeqsCommand::execute(){ if (m->control_pressed) { return 0; } #else ifstream inFASTA; - openInputFile(fastafileNames[s], inFASTA); - numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + int numSeqs; + openInputFile(fastaFile, inFASTA); + getNumSeqs(inFASTA, numSeqs); inFASTA.close(); lines.push_back(new linePair(0, numSeqs)); - driverCreateSummary(fastafile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); if (m->control_pressed) { return 0; } #endif for(int i=0;i'){ - inFASTA >> seqName; - outGroups << seqName << '\t' << groupVector[i] << endl; + if (isBlank(fastaFileNames[i])) { remove(fastaFileNames[i].c_str()); } + else if (filesToRemove.count(fastaFileNames[i]) > 0) { remove(fastaFileNames[i].c_str()); } + else { + ifstream inFASTA; + string seqName; + //openInputFile(getRootName(fastaFile) + groupVector[i] + ".fasta", inFASTA); + openInputFile(fastaFileNames[i], inFASTA); + ofstream outGroups; + string outGroupFilename = outputDir + getRootName(getSimpleName(fastaFileNames[i])) + "groups"; + openOutputFile(outGroupFilename, outGroups); + //openOutputFile(outputDir + getRootName(getSimpleName(fastaFile)) + groupVector[i] + ".groups", outGroups); + outputNames.push_back(outGroupFilename); + + string thisGroup = ""; + if (i > comboStarts) { + map::iterator itCombo; + for(itCombo=combos.begin();itCombo!=combos.end(); itCombo++){ + if(itCombo->second == i){ thisGroup = itCombo->first; combos.erase(itCombo); break; } + } + }else{ thisGroup = groupVector[i]; } + + while(!inFASTA.eof()){ + if(inFASTA.get() == '>'){ + inFASTA >> seqName; + outGroups << seqName << '\t' << thisGroup << endl; + } + while (!inFASTA.eof()) { char c = inFASTA.get(); if (c == 10 || c == 13){ break; } } } - while (!inFASTA.eof()) { char c = inFASTA.get(); if (c == 10 || c == 13){ break; } } + outGroups.close(); + inFASTA.close(); } - outGroups.close(); - inFASTA.close(); } if (m->control_pressed) { @@ -331,7 +349,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (oligoFile != "") { openOutputFile(groupFile, outGroups); for (int i = 0; i < fastaNames.size(); i++) { + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); + #else + fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate)); + #endif } } @@ -347,47 +369,41 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string for(int i=0;inum;i++){ if (m->control_pressed) { - inFASTA.close(); - outFASTA.close(); - scrapFASTA.close(); - if (oligoFile != "") { outGroups.close(); } - if(qFileName != "") { qFile.close(); } - for(int i=0;iclose(); - delete fastaFileNames[i]; - } + inFASTA.close(); outFASTA.close(); scrapFASTA.close(); if (oligoFile != "") { outGroups.close(); } if(qFileName != "") { qFile.close(); } + for(int i=0;iclose(); delete fastaFileNames[i]; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } - bool success = 1; + int success = 1; Sequence currSeq(inFASTA); string origSeq = currSeq.getUnaligned(); if (origSeq != "") { - int group; + int groupBar, groupPrime; string trashCode = ""; int currentSeqsDiffs = 0; if(qFileName != ""){ if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } - if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { + + if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap } + if(!success) { trashCode += 'q'; } } if(barcodes.size() != 0){ - success = stripBarcode(currSeq, group); -// cout << "here: " << success << endl; + success = stripBarcode(currSeq, groupBar); if(success > bdiffs){ trashCode += 'b'; } else{ currentSeqsDiffs += success; } } if(numFPrimers != 0){ - success = stripForward(currSeq); + success = stripForward(currSeq, groupPrime); if(success > pdiffs){ trashCode += 'f'; } else{ currentSeqsDiffs += success; } } @@ -415,19 +431,28 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(flip){ currSeq.reverseComplement(); } // should go last if(trashCode.length() == 0){ - currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed. + currSeq.setAligned(currSeq.getUnaligned()); currSeq.printSequence(outFASTA); if(barcodes.size() != 0){ - outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; - + string thisGroup = groupVector[groupBar]; + int indexToFastaFile = groupBar; + if (primers.size() != 0){ + //does this primer have a group + if (groupVector[groupPrime] != "") { + thisGroup += "." + groupVector[groupPrime]; + indexToFastaFile = combos[thisGroup]; + } + } + outGroups << currSeq.getName() << '\t' << thisGroup << endl; if(allFiles){ - currSeq.printSequence(*fastaFileNames[group]); + currSeq.printSequence(*fastaFileNames[indexToFastaFile]); } } } else{ currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); + currSeq.setAligned(origSeq); currSeq.printSequence(scrapFASTA); } } @@ -554,14 +579,18 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vector> type; - + if(type[0] == '#'){ while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there } else{ + //make type case insensitive + for(int i=0;i> oligo; for(int i=0;i& outFASTAVec){ //vector::iterator itPrime = primers.find(oligo); + if (itPrime != primers.end()) { m->mothurOut("primer " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + primers[oligo]=index; index++; + groupVector.push_back(group); + + if(allFiles){ + outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); + if (group == "") { //if there is not a group for this primer, then this file will not get written to, but we add it to keep the indexes correct + filesToRemove.insert((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); + }else { + outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); + } + } + } - else if(type == "reverse"){ + else if(type == "REVERSE"){ Sequence oligoRC("reverse", oligo); oligoRC.reverseComplement(); revPrimer.push_back(oligoRC.getUnaligned()); } - else if(type == "barcode"){ + else if(type == "BARCODE"){ inOligos >> group; - barcodes[oligo]=index++; + + //check for repeat barcodes + map::iterator itBar = barcodes.find(oligo); + if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine(); } + + barcodes[oligo]=index; index++; groupVector.push_back(group); if(allFiles){ - //outFASTAVec.push_back(new ofstream((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta").c_str(), ios::ate)); - outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); - outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + group + ".fasta")); + outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); + outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(index) + "." + group + ".fasta")); } - } + }else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); } } + gobble(inOligos); } inOligos.close(); - numFPrimers = forPrimer.size(); + //add in potential combos + if(allFiles){ + comboStarts = outFASTAVec.size()-1; + for (map::iterator itBar = barcodes.begin(); itBar != barcodes.end(); itBar++) { + for (map::iterator itPrime = primers.begin(); itPrime != primers.end(); itPrime++) { + if (groupVector[itPrime->second] != "") { //there is a group for this primer + outputNames.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(itBar->second) + "." + groupVector[itBar->second] + "." + toString(itPrime->second) + "." + groupVector[itPrime->second] + ".fasta")); + outFASTAVec.push_back((outputDir + getRootName(getSimpleName(fastaFile)) + toString(itBar->second) + "." + groupVector[itBar->second] + "." + toString(itPrime->second) + "." + groupVector[itPrime->second] + ".fasta")); + combos[(groupVector[itBar->second] + "." + groupVector[itPrime->second])] = outFASTAVec.size()-1; + } + } + } + } + + numFPrimers = primers.size(); numRPrimers = revPrimer.size(); } @@ -606,15 +679,16 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vector::iterator it=barcodes.begin();it!=barcodes.end();it++){ string oligo = it->first; if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = bdiffs + 1; - break; + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; } if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ @@ -643,7 +717,7 @@ int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ maxLength = it->first.length(); } } - alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+bdiffs+1)); + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); }else{ alignment = NULL; } @@ -658,7 +732,7 @@ int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ // int length = oligo.length(); if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length - success = bdiffs + 1; + success = bdiffs + 10; break; } @@ -674,15 +748,12 @@ int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); -// cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,alnLength) << " raw aligned = " << temp << endl; - cout << seq.getName() << endl; - cout << temp << endl; - cout << oligo << endl; - cout << alnLength << endl; - cout << endl; int newStart=0; int numDiff = countDiffs(oligo, temp); + +// cout << oligo << '\t' << temp << '\t' << numDiff << endl; + if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; @@ -699,17 +770,20 @@ int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ } } - if(minDiff > bdiffs){ success = bdiffs + 1; } - else if(minCount > 1) { success = bdiffs + 1; } - else{ + + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match group = minGroup; - seq.setUnaligned("*" + rawSequence.substr(minPos)); + seq.setUnaligned(rawSequence.substr(minPos)); success = minDiff; } if (alignment != NULL) { delete alignment; } } +// cout << success << endl; + return success; } @@ -722,27 +796,27 @@ int TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ //*************************************************************************************************************** -int TrimSeqsCommand::stripForward(Sequence& seq){ +int TrimSeqsCommand::stripForward(Sequence& seq, int& group){ try { string rawSequence = seq.getUnaligned(); - bool success = pdiffs + 1; //guilty until proven innocent + int success = pdiffs + 1; //guilty until proven innocent //can you find the primer - for(int i=0;i::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; } if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ + group = it->second; seq.setUnaligned(rawSequence.substr(oligo.length())); success = 0; break; } } - + //if you found the barcode or if you don't want to allow for diffs // cout << success; if ((pdiffs == 0) || (success == 0)) { return success; } @@ -753,27 +827,30 @@ int TrimSeqsCommand::stripForward(Sequence& seq){ int maxLength = 0; Alignment* alignment; - if (numFPrimers > 0) { + if (primers.size() > 0) { + map::iterator it=primers.begin(); - for(int i=0;i maxLength){ - maxLength = forPrimer[i].length(); + for(it;it!=primers.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); } } - alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+pdiffs+1)); + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1)); }else{ alignment = NULL; } //can you find the barcode int minDiff = 1e6; int minCount = 1; + int minGroup = -1; int minPos = 0; - for(int i=0;i::iterator it=primers.begin();it!=primers.end();it++){ + string oligo = it->first; +// int length = oligo.length(); if(rawSequence.length() < maxLength){ - success = pdiffs + 1; + success = pdiffs + 100; break; } @@ -789,18 +866,16 @@ int TrimSeqsCommand::stripForward(Sequence& seq){ } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); -// cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,alnLength) << " raw aligned = " << temp << endl; - cout << seq.getName() << endl; - cout << temp << endl; - cout << oligo << endl; - cout << alnLength << endl; - cout << endl; int newStart=0; int numDiff = countDiffs(oligo, temp); + +// cout << oligo << '\t' << temp << '\t' << numDiff << endl; + if(numDiff < minDiff){ minDiff = numDiff; minCount = 1; + minGroup = it->second; minPos = 0; for(int i=0;i pdiffs){ success = pdiffs + 1; } - else if(minCount > 1) { success = pdiffs + 1; } - else{ - seq.setUnaligned("*" + rawSequence.substr(minPos)); + + if(minDiff > pdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); success = minDiff; } if (alignment != NULL) { delete alignment; } } + return success; } @@ -999,16 +1077,41 @@ int TrimSeqsCommand::countDiffs(string oligo, string seq){ bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ try { +// string rawSequence = seq.getUnaligned(); +// int seqLength; // = rawSequence.length(); +// string name, temp, temp2; +// +// qFile >> name; +// +// //get rest of line +// temp = ""; +// while (!qFile.eof()) { +// char c = qFile.get(); +// if (c == 10 || c == 13){ break; } +// else { temp += c; } +// } +// +// int pos = temp.find("length"); +// if (pos == temp.npos) { m->mothurOut("Cannot find length in qfile for " + seq.getName()); m->mothurOutEndLine(); seqLength = 0; } +// else { +// string tempLength = temp.substr(pos); +// istringstream iss (tempLength,istringstream::in); +// iss >> temp; +// } +// +// splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242 +// convert(temp, seqLength); //converts string to int +// +// if (name.length() != 0) { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); } } + string rawSequence = seq.getUnaligned(); - int seqLength; // = rawSequence.length(); - string name, temp, temp2; + int seqLength = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + string name; + + qFile >> name; + if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } - qFile >> name >> temp; - - splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242 - convert(temp, seqLength); //converts string to int - - if (name.length() != 0) { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); } } while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } int score; @@ -1017,7 +1120,7 @@ bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ for(int i=0;i> score; - if(score <= qThreshold){ + if(score < qThreshold){ end = i; break; }