X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimseqscommand.cpp;h=705ac25f5a5e727424a2567a0d358b4c1dc951e5;hb=8ef6687c1f586285d01c000cc5e359bf9c07c717;hp=537992584660b29696b602f09285aff6279aea73;hpb=c7f1abdd8b8d24f186320880515a848039ce051a;p=mothur.git diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 5379925..705ac25 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -23,7 +23,7 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { else { //valid paramters for this command string AlignArray[] = {"fasta", "flip", "oligos", "maxambig", "maxhomop", "minlength", "maxlength", "qfile", - "qthreshold", "qaverage", "allfiles", "qtrim","diffs", "processors", "outputdir","inputdir"}; + "qthreshold", "qaverage", "allfiles", "qtrim","tdiffs", "pdiffs", "bdiffs", "processors", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); @@ -104,8 +104,17 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "0"; } convert(temp, maxLength); - temp = validParameter.validFile(parameters, "diffs", false); if (temp == "not found") { temp = "0"; } - convert(temp, diffs); + + temp = validParameter.validFile(parameters, "bdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, bdiffs); + + temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; } + convert(temp, pdiffs); + + temp = validParameter.validFile(parameters, "tdiffs", false); if (temp == "not found") { int tempTotal = pdiffs + bdiffs; temp = toString(tempTotal); } + convert(temp, tdiffs); + + if(tdiffs == 0){ tdiffs = bdiffs + pdiffs; } temp = validParameter.validFile(parameters, "qfile", true); if (temp == "not found") { qFileName = ""; } @@ -160,7 +169,9 @@ void TrimSeqsCommand::help(){ m->mothurOut("The maxhomop parameter .... The default is 0.\n"); m->mothurOut("The minlength parameter .... The default is 0.\n"); m->mothurOut("The maxlength parameter .... The default is 0.\n"); - m->mothurOut("The diffs parameter .... The default is 0.\n"); + m->mothurOut("The tdiffs parameter is used to specify the total number of differences allowed in the sequence. The default is pdiffs + bdiffs.\n"); + m->mothurOut("The bdiffs parameter is used to specify the number of differences allowed in the barcode. The default is 0.\n"); + m->mothurOut("The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n"); m->mothurOut("The qfile parameter .....\n"); m->mothurOut("The qthreshold parameter .... The default is 0.\n"); m->mothurOut("The qaverage parameter .... The default is 0.\n"); @@ -255,13 +266,13 @@ int TrimSeqsCommand::execute(){ if (m->control_pressed) { return 0; } #else ifstream inFASTA; - openInputFile(fastafileNames[s], inFASTA); - numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + openInputFile(fastaFile, inFASTA); + int numSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); inFASTA.close(); lines.push_back(new linePair(0, numSeqs)); - driverCreateSummary(fastafile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); + driverCreateTrim(fastaFile, qFileName, trimSeqFile, scrapSeqFile, groupFile, fastaFileNames, lines[0], lines[0]); if (m->control_pressed) { return 0; } #endif @@ -320,7 +331,11 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (oligoFile != "") { openOutputFile(groupFile, outGroups); for (int i = 0; i < fastaNames.size(); i++) { + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(getpid()) + ".temp").c_str(), ios::ate)); + #else + fastaFileNames.push_back(new ofstream((fastaNames[i] + toString(i) + ".temp").c_str(), ios::ate)); + #endif } } @@ -349,7 +364,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string return 0; } - bool success = 1; + int success = 1; Sequence currSeq(inFASTA); @@ -357,26 +372,33 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if (origSeq != "") { int group; string trashCode = ""; + int currentSeqsDiffs = 0; if(qFileName != ""){ if(qThreshold != 0) { success = stripQualThreshold(currSeq, qFile); } else if(qAverage != 0) { success = cullQualAverage(currSeq, qFile); } - if ((!qtrim) && (origSeq.length() != currSeq.getUnaligned().length())) { + + if (qtrim == 1 && (origSeq.length() != currSeq.getUnaligned().length())) { success = 0; //if you don't want to trim and the sequence does not meet quality requirements, move to scrap } + if(!success) { trashCode += 'q'; } } if(barcodes.size() != 0){ success = stripBarcode(currSeq, group); - if(!success){ trashCode += 'b'; } + if(success > bdiffs){ trashCode += 'b'; } + else{ currentSeqsDiffs += success; } } - + if(numFPrimers != 0){ success = stripForward(currSeq); - if(!success){ trashCode += 'f'; } + if(success > pdiffs){ trashCode += 'f'; } + else{ currentSeqsDiffs += success; } } - + + if (currentSeqsDiffs > tdiffs) { trashCode += 't'; } + if(numRPrimers != 0){ success = stripReverse(currSeq); if(!success){ trashCode += 'r'; } @@ -398,7 +420,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string if(flip){ currSeq.reverseComplement(); } // should go last if(trashCode.length() == 0){ - currSeq.setAligned(currSeq.getUnaligned()); //this is because of a modification we made to the sequence class to fix a bug. all seqs have an aligned version, which is the version that gets printed. + currSeq.setAligned(currSeq.getUnaligned()); currSeq.printSequence(outFASTA); if(barcodes.size() != 0){ outGroups << currSeq.getName() << '\t' << groupVector[group] << endl; @@ -411,6 +433,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string else{ currSeq.setName(currSeq.getName() + '|' + trashCode); currSeq.setUnaligned(origSeq); + currSeq.setAligned(origSeq); currSeq.printSequence(scrapFASTA); } } @@ -587,69 +610,110 @@ void TrimSeqsCommand::getOligos(vector& outFASTAVec){ //vector::iterator it=barcodes.begin();it!=barcodes.end();it++){ string oligo = it->first; if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = 0; - break; + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; } if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ group = it->second; seq.setUnaligned(rawSequence.substr(oligo.length())); - success = 1; + success = 0; break; } } //if you found the barcode or if you don't want to allow for diffs - if ((diffs == 0) || (success == 1)) { return success; } +// cout << success; + if ((bdiffs == 0) || (success == 0)) { return success; } else { //try aligning and see if you can find it - +// cout << endl; + + int maxLength = 0; + Alignment* alignment; - if (barcodes.size() > 0) { //assumes barcodes are all the same length + if (barcodes.size() > 0) { map::iterator it=barcodes.begin(); - string temp = it->first; - - alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (temp.length()+diffs+1)); + + for(it;it!=barcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+bdiffs+1)); + }else{ alignment = NULL; } - //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ string oligo = it->first; - int length = oligo.length(); +// int length = oligo.length(); - if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length - success = 0; + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; break; } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,length+diffs)); + alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); - cout << "barcode = " << oligo << " raw = " << rawSequence.substr(0,oligo.length()) << " raw aligned = " << temp << endl; + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); int newStart=0; - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()), length, newStart)){ - cout << "found match" << endl; - group = it->second; - seq.setUnaligned(rawSequence.substr(newStart)); - success = 1; - break; + int numDiff = countDiffs(oligo, temp); + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minGroup = it->second; + minPos = 0; + for(int i=0;i bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; } if (alignment != NULL) { delete alignment; } + } +// cout << success << endl; + return success; } @@ -662,63 +726,103 @@ bool TrimSeqsCommand::stripBarcode(Sequence& seq, int& group){ //*************************************************************************************************************** -bool TrimSeqsCommand::stripForward(Sequence& seq){ +int TrimSeqsCommand::stripForward(Sequence& seq){ try { string rawSequence = seq.getUnaligned(); - bool success = 0; //guilty until proven innocent + int success = pdiffs + 1; //guilty until proven innocent + //can you find the primer for(int i=0;i 0) { alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (forPrimer[0].length()+diffs+1)); } //assumes primers are all the same length - else{ alignment = NULL; } + if (numFPrimers > 0) { + + for(int i=0;i maxLength){ + maxLength = forPrimer[i].length(); + } + } + alignment = new NeedlemanOverlap(-2.0, 1.0, -1.0, (maxLength+pdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; - //can you find the primer for(int i=0;ialign(oligo, rawSequence.substr(0,length+diffs)); + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); - int newStart = 0; - if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()), length, newStart)){ - seq.setUnaligned(rawSequence.substr(newStart)); - success = 1; - break; + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int newStart=0; + int numDiff = countDiffs(oligo, temp); + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i pdiffs) { success = minDiff; } + else if(minCount > 1) { success = pdiffs + 10; } + else{ + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; } if (alignment != NULL) { delete alignment; } + } - return success; - + } catch(exception& e) { m->errorOut(e, "TrimSeqsCommand", "stripForward"); @@ -856,24 +960,14 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){ } //*************************************************************************************************************** -bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int& end){ +int TrimSeqsCommand::countDiffs(string oligo, string seq){ try { - bool success = 1; + int length = oligo.length(); - end = length; - int countBases = 0; int countDiffs = 0; - if (length != 0) { - if ((oligo[0] == '-') || (oligo[0] == '.')) { success = 0; return success; } //no gaps allowed at beginning - } - for(int i=0;ierrorOut(e, "TrimSeqsCommand", "countDiffs"); exit(1); } @@ -909,16 +997,41 @@ bool TrimSeqsCommand::compareDNASeq(string oligo, string seq, int numBases, int& bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ try { +// string rawSequence = seq.getUnaligned(); +// int seqLength; // = rawSequence.length(); +// string name, temp, temp2; +// +// qFile >> name; +// +// //get rest of line +// temp = ""; +// while (!qFile.eof()) { +// char c = qFile.get(); +// if (c == 10 || c == 13){ break; } +// else { temp += c; } +// } +// +// int pos = temp.find("length"); +// if (pos == temp.npos) { m->mothurOut("Cannot find length in qfile for " + seq.getName()); m->mothurOutEndLine(); seqLength = 0; } +// else { +// string tempLength = temp.substr(pos); +// istringstream iss (tempLength,istringstream::in); +// iss >> temp; +// } +// +// splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242 +// convert(temp, seqLength); //converts string to int +// +// if (name.length() != 0) { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); } } + string rawSequence = seq.getUnaligned(); - int seqLength; // = rawSequence.length(); - string name, temp, temp2; + int seqLength = seq.getNumBases(); + bool success = 0; //guilty until proven innocent + string name; + + qFile >> name; + if (name[0] == '>') { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta: " + seq.getName() + " and qual file: " + name); m->mothurOutEndLine(); } } - qFile >> name >> temp; - - splitAtEquals(temp2, temp); //separates length=242, temp=length, temp2=242 - convert(temp, seqLength); //converts string to int - - if (name.length() != 0) { if(name.substr(1) != seq.getName()) { m->mothurOut("sequence name mismatch btwn fasta and qual file"); m->mothurOutEndLine(); } } while (!qFile.eof()) { char c = qFile.get(); if (c == 10 || c == 13){ break; } } int score; @@ -927,7 +1040,7 @@ bool TrimSeqsCommand::stripQualThreshold(Sequence& seq, ifstream& qFile){ for(int i=0;i> score; - if(score <= qThreshold){ + if(score < qThreshold){ end = i; break; }