X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=trimoligos.cpp;h=91bdb83ad2d3a7f9e97713dba66709018d89908c;hb=e9845ee4c8db2e044e87d721cc2d94f8d609e03d;hp=a53f4e85388f0f7bbb6fda40c2074b4f555e29d3;hpb=7f1aca4ed9e6db70de82e80ca4771f7680b21d26;p=mothur.git diff --git a/trimoligos.cpp b/trimoligos.cpp index a53f4e8..91bdb83 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -17,6 +17,8 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, vector r, vector lk, vector sp){ try { m = MothurOut::getInstance(); + paired = false; + trashCode = ""; pdiffs = p; bdiffs = b; @@ -71,6 +73,8 @@ TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map< bdiffs = b; ldiffs = l; sdiffs = s; + paired = true; + trashCode = ""; maxFBarcodeLength = 0; for(map::iterator it=br.begin();it!=br.end();it++){ @@ -144,6 +148,8 @@ TrimOligos::TrimOligos(int p, int b, map pr, map br, v barcodes = br; primers = pr; revPrimer = r; + paired = false; + trashCode = ""; maxFBarcodeLength = 0; for(map::iterator it=barcodes.begin();it!=barcodes.end();it++){ @@ -175,6 +181,7 @@ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ try { string rawSequence = seq.getUnaligned(); + trashCode = ""; for(map::iterator it=primers.begin();it!=primers.end();it++){ string oligo = it->first; @@ -197,7 +204,7 @@ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ primerStart = 0; primerEnd = 0; //if you don't want to allow for diffs - if (pdiffs == 0) { return false; } + if (pdiffs == 0) { trashCode = "f"; return false; } else { //try aligning and see if you can find it //can you find the barcode @@ -224,7 +231,7 @@ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ string rawChunk = rawSequence.substr(j, olength+pdiffs); //use needleman to align first primer.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawChunk); + alignment->alignPrimer(oligo, rawChunk); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -250,11 +257,12 @@ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ if (alignment != NULL) { delete alignment; } - if(minDiff > pdiffs) { primerStart = 0; primerEnd = 0; return false; } //no good matches - else if(minCount > 1) { primerStart = 0; primerEnd = 0; return false; } //can't tell the difference between multiple primers - else{ return true; } + if(minDiff > pdiffs) { primerStart = 0; primerEnd = 0; trashCode = "f"; return false; } //no good matches + else if(minCount > 1) { primerStart = 0; primerEnd = 0; trashCode = "f"; return false; } //can't tell the difference between multiple primers + else{ trashCode = ""; return true; } } - + + trashCode = "f"; primerStart = 0; primerEnd = 0; return false; @@ -267,7 +275,7 @@ bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){ //******************************************************************/ bool TrimOligos::findReverse(Sequence& seq, int& primerStart, int& primerEnd){ try { - + trashCode = ""; string rawSequence = seq.getUnaligned(); for(int i=0;ialign(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -380,8 +392,8 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ } - if(minDiff > bdiffs) { success = minDiff; } //no good matches - else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + if(minDiff > bdiffs) { trashCode = "b"; success = minDiff; } //no good matches + else if(minCount > 1) { trashCode = "b"; success = bdiffs + 100; } //can't tell the difference between multiple barcodes else{ //use the best match group = minGroup; seq.setUnaligned(rawSequence.substr(minPos)); @@ -390,6 +402,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ qual.trimQScores(minPos, -1); } success = minDiff; + trashCode = ""; } if (alignment != NULL) { delete alignment; } @@ -407,6 +420,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ //*******************************************************************/ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& group){ try { + trashCode = ""; //look for forward barcode string rawFSequence = forwardSeq.getUnaligned(); string rawRSequence = reverseSeq.getUnaligned(); @@ -419,25 +433,33 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out - break; + trashCode += "b"; + break; } if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + trashCode += "d"; break; } - if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { - group = it->first; - forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); - reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); - success = 0; - break; - } + if(compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) { + if (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { + group = it->first; + forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); + reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); + success = 0; trashCode = ""; + return success; + } + }else { + trashCode = "b"; + if (!compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { trashCode += "d"; } + } } //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } + if (bdiffs == 0) { return success; } else { //try aligning and see if you can find it + trashCode = ""; Alignment* alignment; if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } else{ alignment = NULL; } @@ -473,7 +495,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr } //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl; //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); + alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -513,8 +535,8 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr } //cout << minDiff << '\t' << minCount << '\t' << endl; - if(minDiff > bdiffs) { success = minDiff; } //no good matches - else{ + if(minDiff > bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches + //else{ //check for reverse match if (alignment != NULL) { delete alignment; } @@ -536,7 +558,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawRSequence.substr(0,oligo.length()+bdiffs)); + alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -587,7 +609,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr cout << endl; } cout << endl;*/ - if(minDiff > bdiffs) { success = minDiff; } //no good matches + if(minDiff > bdiffs) { trashCode += "d"; success = minDiff; } //no good matches else { bool foundMatch = false; vector matches; @@ -613,13 +635,16 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr forwardSeq.setUnaligned(rawFSequence.substr(fStart)); reverseSeq.setUnaligned(rawRSequence.substr(rStart)); success = minDiff; - }else { success = bdiffs + 100; } + }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; } } - } + //} if (alignment != NULL) { delete alignment; } } + //catchall for case I didn't think of + if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; } + return success; } @@ -632,6 +657,8 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& gr //*******************************************************************/ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){ try { + trashCode = ""; + //look for forward barcode string rawFSequence = forwardSeq.getUnaligned(); string rawRSequence = reverseSeq.getUnaligned(); @@ -644,27 +671,36 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + trashCode = "b"; break; } if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + trashCode += "d"; break; } - if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) { - group = it->first; - forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); - reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); - forwardQual.trimQScores(foligo.length(), -1); - reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length()); - success = 0; - break; + if(compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) { + if (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { + group = it->first; + forwardSeq.setUnaligned(rawFSequence.substr(foligo.length())); + reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length()))); + forwardQual.trimQScores(foligo.length(), -1); + reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length()); + success = 0; trashCode = ""; + return success; + } + }else { + trashCode = "b"; + if (!compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { trashCode += "d"; + } } } //if you found the barcode or if you don't want to allow for diffs - if ((bdiffs == 0) || (success == 0)) { return success; } + if (bdiffs == 0) { return success; } else { //try aligning and see if you can find it + trashCode = ""; Alignment* alignment; if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } else{ alignment = NULL; } @@ -700,7 +736,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality } //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl; //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); + alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -740,8 +776,8 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality } //cout << minDiff << '\t' << minCount << '\t' << endl; - if(minDiff > bdiffs) { success = minDiff; } //no good matches - else{ + if(minDiff > bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches + //else{ //check for reverse match if (alignment != NULL) { delete alignment; } @@ -763,7 +799,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawRSequence.substr(0,oligo.length()+bdiffs)); + alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -814,7 +850,7 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality cout << endl; } cout << endl;*/ - if(minDiff > bdiffs) { success = minDiff; } //no good matches + if(minDiff > bdiffs) { trashCode = "d"; success = minDiff; } //no good matches else { bool foundMatch = false; vector matches; @@ -842,13 +878,16 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality forwardQual.trimQScores(fStart, -1); reverseQual.trimQScores(rStart, -1); success = minDiff; - }else { success = bdiffs + 100; } + }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; } } - } + //} if (alignment != NULL) { delete alignment; } } + //catchall for case I didn't think of + if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; } + return success; } @@ -858,6 +897,508 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality } } +//*******************************************************************/ +int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& group){ + try { + trashCode = ""; + + //look for forward barcode + string rawSeq = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + //cout << seq.getName() << endl; + //can you find the forward barcode + for(map::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){ + string foligo = it->second.forward; + string roligo = it->second.reverse; + //cout << it->first << '\t' << foligo << '\t' << roligo << endl; + //cout << it->first << '\t' << rawSeq.substr(0,foligo.length()) << '\t' << rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()) << endl; + if(rawSeq.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + trashCode = "b"; + break; + } + if(rawSeq.length() < roligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + trashCode += "d"; + break; + } + + if(compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) { //find forward + if (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { //find reverse + group = it->first; + string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode + seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode + if(qual.getName() != ""){ + qual.trimQScores(-1, rawSeq.length()-roligo.length()); + qual.trimQScores(foligo.length(), -1); + } + success = 0; + trashCode = ""; + return success; + } + }else { + trashCode = "b"; + if (!compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { trashCode += "d"; } + } + } + //cout << "success=" << success << endl; + //if you found the barcode or if you don't want to allow for diffs + if (bdiffs == 0) { return success; } + else { //try aligning and see if you can find it + Alignment* alignment; + if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + vector< vector > minFGroup; + vector minFPos; + + //the pair can have duplicates, but we only want to search for the unique forward and reverses, example + /* + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore + + only want to look for forward = Sarah, John, Anna, Pat, Gail + reverse = Westcott, Schloss, Brown, Moore + + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + */ + //cout << endl << forwardSeq.getName() << endl; + for(map >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){ + string oligo = it->first; + + if(rawSeq.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl; + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + int numDiff = countDiffs(oligo, temp); + + if (alnLength == 0) { numDiff = bdiffs + 100; } + //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup.clear(); + minFGroup.push_back(it->second); + int tempminFPos = 0; + minFPos.clear(); + for(int i=0;isecond); + int tempminFPos = 0; + for(int i=0;i bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches + //else{ + //check for reverse match + if (alignment != NULL) { delete alignment; } + + if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + minDiff = 1e6; + minCount = 1; + vector< vector > minRGroup; + vector minRPos; + + string rawRSequence = reverseOligo(seq.getUnaligned()); + //cout << irbarcodes.size() << '\t' << maxRBarcodeLength << endl; + for(map >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){ + string oligo = reverseOligo(it->first); + //cout << "r before = " << reverseOligo(oligo) << '\t' << reverseOligo(rawRSequence.substr(0,oligo.length()+bdiffs)) << endl; + if(rawRSequence.length() < maxRBarcodeLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + int numDiff = countDiffs(oligo, temp); + if (alnLength == 0) { numDiff = bdiffs + 100; } + + //cout << "r after = " << reverseOligo(oligo) << '\t' << reverseOligo(temp) << '\t' << numDiff << endl; + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minRGroup.clear(); + minRGroup.push_back(it->second); + int tempminRPos = 0; + minRPos.clear(); + for(int i=0;isecond); + } + + } + + /*cout << minDiff << '\t' << minCount << '\t' << endl; + for (int i = 0; i < minFGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + cout << endl; + } + cout << endl; + for (int i = 0; i < minRGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + cout << endl; + } + cout << endl;*/ + if(minDiff > bdiffs) { trashCode += "d"; success = minDiff; } //no good matches + else { + bool foundMatch = false; + vector matches; + for (int i = 0; i < minFGroup.size(); i++) { + for (int j = 0; j < minFGroup[i].size(); j++) { + for (int k = 0; k < minRGroup.size(); k++) { + if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); } + } + } + } + + int fStart = 0; + int rStart = 0; + if (matches.size() == 1) { + foundMatch = true; + group = matches[0]; + fStart = minFPos[0]; + rStart = rawSeq.length() - minRPos[0]; + } + + //we have an acceptable match for the forward and reverse, but do they match? + if (foundMatch) { + string trimmedSeq = rawSeq.substr(0, rStart); //trim reverse barcode + seq.setUnaligned(trimmedSeq.substr(fStart)); //trim forward barcode + if(qual.getName() != ""){ + qual.trimQScores(-1, rStart); + qual.trimQScores(fStart, -1); + } + success = minDiff; + //cout << "barcode = " << ipbarcodes[group].forward << '\t' << ipbarcodes[group].reverse << endl; + }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; } + } + //} + + if (alignment != NULL) { delete alignment; } + } + + //catchall for case I didn't think of + if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripPairedBarcode"); + exit(1); + } + +} +//*******************************************************************/ +int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& group, bool keepForward){ + try { + trashCode = ""; + //look for forward + string rawSeq = seq.getUnaligned(); + int success = pdiffs + 1; //guilty until proven innocent + //cout << seq.getName() << endl; + //can you find the forward + for(map::iterator it=ipprimers.begin();it!=ipprimers.end();it++){ + string foligo = it->second.forward; + string roligo = it->second.reverse; + + //cout << it->first << '\t' << foligo << '\t' << roligo << endl; + //cout << it->first << '\t' << rawSeq.substr(0,foligo.length()) << '\t' << rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()) << endl; + if(rawSeq.length() < foligo.length()){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + trashCode = "p"; + break; + } + if(rawSeq.length() < roligo.length()){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out + trashCode += "q"; + break; + } + + if(compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) { //find forward + if (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { //find reverse + group = it->first; + string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode + seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode + if(qual.getName() != ""){ + qual.trimQScores(-1, rawSeq.length()-roligo.length()); + qual.trimQScores(foligo.length(), -1); + } + success = 0; + trashCode = ""; + return success; + } + }else { + trashCode = "b"; + if (!compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { trashCode += "d"; } + } + + } + //cout << "success=" << success << endl; + //if you found the barcode or if you don't want to allow for diffs + if (pdiffs == 0) { return success; } + else { //try aligning and see if you can find it + Alignment* alignment; + if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + vector< vector > minFGroup; + vector minFPos; + + //the pair can have duplicates, but we only want to search for the unique forward and reverses, example + /* + 1 Sarah Westcott + 2 John Westcott + 3 Anna Westcott + 4 Sarah Schloss + 5 Pat Schloss + 6 Gail Brown + 7 Pat Moore + + only want to look for forward = Sarah, John, Anna, Pat, Gail + reverse = Westcott, Schloss, Brown, Moore + + but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group. + */ + //cout << endl << forwardSeq.getName() << endl; + for(map >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){ + string oligo = it->first; + + if(rawSeq.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; + break; + } + //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl; + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + int numDiff = countDiffs(oligo, temp); + + if (alnLength == 0) { numDiff = pdiffs + 100; } + //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl; + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minFGroup.clear(); + minFGroup.push_back(it->second); + int tempminFPos = 0; + minFPos.clear(); + for(int i=0;isecond); + int tempminFPos = 0; + for(int i=0;i pdiffs) { trashCode = "p"; minFGroup.clear(); success = minDiff; } //no good matches + //else{ + //check for reverse match + if (alignment != NULL) { delete alignment; } + + if (irprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); } + else{ alignment = NULL; } + + //can you find the barcode + minDiff = 1e6; + minCount = 1; + vector< vector > minRGroup; + vector minRPos; + + string rawRSequence = reverseOligo(seq.getUnaligned()); + + for(map >::iterator it=irprimers.begin();it!=irprimers.end();it++){ + string oligo = reverseOligo(it->first); + //cout << "r before = " << reverseOligo(oligo) << '\t' << reverseOligo(rawRSequence.substr(0,oligo.length()+pdiffs)) << endl; + if(rawRSequence.length() < maxRPrimerLength){ //let's just assume that the barcodes are the same length + success = pdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+pdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + int numDiff = countDiffs(oligo, temp); + if (alnLength == 0) { numDiff = pdiffs + 100; } + + //cout << "r after = " << reverseOligo(oligo) << '\t' << reverseOligo(temp) << '\t' << numDiff << endl; + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minRGroup.clear(); + minRGroup.push_back(it->second); + int tempminRPos = 0; + minRPos.clear(); + for(int i=0;isecond); + } + + } + + /*cout << minDiff << '\t' << minCount << '\t' << endl; + for (int i = 0; i < minFGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; } + cout << endl; + } + cout << endl; + for (int i = 0; i < minRGroup.size(); i++) { + cout << i << '\t'; + for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; } + cout << endl; + } + cout << endl;*/ + if(minDiff > pdiffs) { trashCode += "q"; success = minDiff; } //no good matches + else { + bool foundMatch = false; + vector matches; + for (int i = 0; i < minFGroup.size(); i++) { + for (int j = 0; j < minFGroup[i].size(); j++) { + for (int k = 0; k < minRGroup.size(); k++) { + if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); } + } + } + } + + int fStart = 0; + int rStart = 0; + if (matches.size() == 1) { + foundMatch = true; + group = matches[0]; + fStart = minFPos[0]; + rStart = rawSeq.length() - minRPos[0]; + } + + //we have an acceptable match for the forward and reverse, but do they match? + if (foundMatch) { + if (!keepForward) { + string trimmedSeq = rawSeq.substr(0, rStart); //trim reverse barcode + seq.setUnaligned(trimmedSeq.substr(fStart)); //trim forward barcode + if(qual.getName() != ""){ + qual.trimQScores(-1, rStart); + qual.trimQScores(fStart, -1); + } + } + success = minDiff; + //cout << "primer = " << ipprimers[group].forward << '\t' << ipprimers[group].reverse << endl; + }else { if (trashCode == "") { trashCode = "pq"; } success = pdiffs + 100; } + } + //} + + if (alignment != NULL) { delete alignment; } + } + + //catchall for case I didn't think of + if ((trashCode == "") && (success > pdiffs)) { trashCode = "pq"; } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripPairedPrimers"); + exit(1); + } + +} + //*******************************************************************/ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){ try { @@ -929,7 +1470,7 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality } //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl; //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); + alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -992,7 +1533,7 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, Quality } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawRSequence.substr(0,oligo.length()+pdiffs)); + alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+pdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -1156,7 +1697,7 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr } //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl; //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); + alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+pdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -1219,7 +1760,7 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawRSequence.substr(0,oligo.length()+pdiffs)); + alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+pdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -1360,7 +1901,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -1460,7 +2001,7 @@ int TrimOligos::stripForward(Sequence& seq, int& group){ } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -1514,6 +2055,9 @@ int TrimOligos::stripForward(Sequence& seq, int& group){ //*******************************************************************/ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){ try { + + if (paired) { int success = stripPairedPrimers(seq, qual, group, keepForward); return success; } + string rawSequence = seq.getUnaligned(); int success = pdiffs + 1; //guilty until proven innocent @@ -1560,7 +2104,7 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, boo } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+pdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -1722,7 +2266,7 @@ bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){ } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -1821,7 +2365,7 @@ bool TrimOligos::stripLinker(Sequence& seq){ } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -1918,7 +2462,7 @@ bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){ } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -2017,7 +2561,7 @@ bool TrimOligos::stripSpacer(Sequence& seq){ } //use needleman to align first barcode.length()+numdiffs of sequence to each barcode - alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); + alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); @@ -2137,6 +2681,48 @@ int TrimOligos::countDiffs(string oligo, string seq){ exit(1); } } +//********************************************************************/ +string TrimOligos::reverseOligo(string oligo){ + try { + string reverse = ""; + + for(int i=oligo.length()-1;i>=0;i--){ + + if(oligo[i] == 'A') { reverse += 'T'; } + else if(oligo[i] == 'T'){ reverse += 'A'; } + else if(oligo[i] == 'U'){ reverse += 'A'; } + + else if(oligo[i] == 'G'){ reverse += 'C'; } + else if(oligo[i] == 'C'){ reverse += 'G'; } + + else if(oligo[i] == 'R'){ reverse += 'Y'; } + else if(oligo[i] == 'Y'){ reverse += 'R'; } + + else if(oligo[i] == 'M'){ reverse += 'K'; } + else if(oligo[i] == 'K'){ reverse += 'M'; } + + else if(oligo[i] == 'W'){ reverse += 'W'; } + else if(oligo[i] == 'S'){ reverse += 'S'; } + + else if(oligo[i] == 'B'){ reverse += 'V'; } + else if(oligo[i] == 'V'){ reverse += 'B'; } + + else if(oligo[i] == 'D'){ reverse += 'H'; } + else if(oligo[i] == 'H'){ reverse += 'D'; } + + else if(oligo[i] == '-'){ reverse += '-'; } + else { reverse += 'N'; } + } + + + return reverse; + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "reverseOligo"); + exit(1); + } +} + /********************************************************************/