X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=trimoligos.cpp;h=2f92cc847ca75e1a0983aeef2d09e9eaab68e57f;hb=01f8d2c7d982a6209211f5abbcf2a086fdf60d0a;hp=f0b5a80880240e78674d6d97198ba7d47c8793ab;hpb=ae57e166b2ed7b475ec3f466106bd76fabadd063;p=mothur.git diff --git a/trimoligos.cpp b/trimoligos.cpp index f0b5a80..2f92cc8 100644 --- a/trimoligos.cpp +++ b/trimoligos.cpp @@ -12,6 +12,51 @@ #include "needlemanoverlap.hpp" +/********************************************************************/ +//strip, pdiffs, bdiffs, primers, barcodes, revPrimers +TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, map rbr, vector r, vector lk, vector sp){ + try { + m = MothurOut::getInstance(); + + pdiffs = p; + bdiffs = b; + ldiffs = l; + sdiffs = s; + + barcodes = br; + rbarcodes = rbr; + primers = pr; + revPrimer = r; + linker = lk; + spacer = sp; + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "TrimOligos"); + exit(1); + } +} +/********************************************************************/ +//strip, pdiffs, bdiffs, primers, barcodes, revPrimers +TrimOligos::TrimOligos(int p, int b, int l, int s, map pr, map br, vector r, vector lk, vector sp){ + try { + m = MothurOut::getInstance(); + + pdiffs = p; + bdiffs = b; + ldiffs = l; + sdiffs = s; + + barcodes = br; + primers = pr; + revPrimer = r; + linker = lk; + spacer = sp; + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "TrimOligos"); + exit(1); + } +} /********************************************************************/ //strip, pdiffs, bdiffs, primers, barcodes, revPrimers TrimOligos::TrimOligos(int p, int b, map pr, map br, vector r){ @@ -69,9 +114,9 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ Alignment* alignment; if (barcodes.size() > 0) { - map::iterator it=barcodes.begin(); + map::iterator it; - for(it;it!=barcodes.end();it++){ + for(it=barcodes.begin();it!=barcodes.end();it++){ if(it->first.length() > maxLength){ maxLength = it->first.length(); } @@ -107,8 +152,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); - - int numDiff = countDiffs(oligo, temp); + int numDiff = countDiffs(oligo, temp); if(numDiff < minDiff){ minDiff = numDiff; @@ -132,7 +176,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){ else{ //use the best match group = minGroup; seq.setUnaligned(rawSequence.substr(minPos)); - + if(qual.getName() != ""){ qual.trimQScores(minPos, -1); } @@ -215,7 +259,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs)); oligo = alignment->getSeqAAln(); string temp = alignment->getSeqBAln(); - + int alnLength = oligo.length(); for(int i=oligo.length()-1;i>=0;i--){ @@ -223,7 +267,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ } oligo = oligo.substr(0,alnLength); temp = temp.substr(0,alnLength); - + int numDiff = countDiffs(oligo, temp); if(numDiff < minDiff){ @@ -263,6 +307,239 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){ exit(1); } +} +//*******************************************************************/ +int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length()))); + + if(qual.getName() != ""){ + qual.trimQScores(-1, rawSequence.length()-oligo.length()); + } + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (rbarcodes.size() > 0) { + map::iterator it; + + for(it=rbarcodes.begin();it!=rbarcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=0;isecond; + minPos = 0; + for(int i=alnLength-1;i>=0;i--){ + if(temp[i] != '-'){ + minPos++; + } + } + } + else if(numDiff == minDiff){ + minCount++; + } + + } + + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos))); + + if(qual.getName() != ""){ + qual.trimQScores(-1, (rawSequence.length()-minPos)); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripRBarcode"); + exit(1); + } + +} +//*******************************************************************/ +int TrimOligos::stripRBarcode(Sequence& seq, int& group){ + try { + + string rawSequence = seq.getUnaligned(); + int success = bdiffs + 1; //guilty until proven innocent + + //can you find the barcode + for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + string oligo = it->first; + if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out + break; + } + + if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){ + group = it->second; + seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length()))); + + success = 0; + break; + } + } + + //if you found the barcode or if you don't want to allow for diffs + if ((bdiffs == 0) || (success == 0)) { return success; } + + else { //try aligning and see if you can find it + + int maxLength = 0; + + Alignment* alignment; + if (rbarcodes.size() > 0) { + map::iterator it; + + for(it=rbarcodes.begin();it!=rbarcodes.end();it++){ + if(it->first.length() > maxLength){ + maxLength = it->first.length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minGroup = -1; + int minPos = 0; + + for(map::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){ + string oligo = it->first; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = bdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=0;isecond; + minPos = 0; + for(int i=alnLength-1;i>=0;i--){ + if(temp[i] != '-'){ + minPos++; + } + } + } + else if(numDiff == minDiff){ + minCount++; + } + + } + + if(minDiff > bdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + group = minGroup; + seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos))); + + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripRBarcode"); + exit(1); + } + } //********************************************************************/ int TrimOligos::stripForward(Sequence& seq, int& group){ @@ -296,9 +573,9 @@ int TrimOligos::stripForward(Sequence& seq, int& group){ Alignment* alignment; if (primers.size() > 0) { - map::iterator it=primers.begin(); + map::iterator it; - for(it;it!=primers.end();it++){ + for(it=primers.begin();it!=primers.end();it++){ if(it->first.length() > maxLength){ maxLength = it->first.length(); } @@ -375,7 +652,7 @@ int TrimOligos::stripForward(Sequence& seq, int& group){ } } //*******************************************************************/ -int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ +int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){ try { string rawSequence = seq.getUnaligned(); int success = pdiffs + 1; //guilty until proven innocent @@ -390,9 +667,9 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){ group = it->second; - seq.setUnaligned(rawSequence.substr(oligo.length())); + if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); } if(qual.getName() != ""){ - qual.trimQScores(oligo.length(), -1); + if (!keepForward) { qual.trimQScores(oligo.length(), -1); } } success = 0; break; @@ -408,9 +685,9 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ Alignment* alignment; if (primers.size() > 0) { - map::iterator it=primers.begin(); + map::iterator it; - for(it;it!=primers.end();it++){ + for(it=primers.begin();it!=primers.end();it++){ if(it->first.length() > maxLength){ maxLength = it->first.length(); } @@ -470,9 +747,9 @@ int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){ else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers else{ //use the best match group = minGroup; - seq.setUnaligned(rawSequence.substr(minPos)); + if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); } if(qual.getName() != ""){ - qual.trimQScores(minPos, -1); + if (!keepForward) { qual.trimQScores(minPos, -1); } } success = minDiff; } @@ -550,6 +827,437 @@ bool TrimOligos::stripReverse(Sequence& seq){ exit(1); } } +//******************************************************************/ +bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){ + try { + string rawSequence = seq.getUnaligned(); + bool success = ldiffs + 1; //guilty until proven innocent + + for(int i=0;i 0) { + for(int i = 0; i < linker.size(); i++){ + if(linker[i].length() > maxLength){ + maxLength = linker[i].length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < linker.size(); i++){ + string oligo = linker[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = ldiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i ldiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripLinker"); + exit(1); + } +} +//******************************************************************/ +bool TrimOligos::stripLinker(Sequence& seq){ + try { + + string rawSequence = seq.getUnaligned(); + bool success = ldiffs +1; //guilty until proven innocent + + for(int i=0;i 0) { + for(int i = 0; i < linker.size(); i++){ + if(linker[i].length() > maxLength){ + maxLength = linker[i].length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < linker.size(); i++){ + string oligo = linker[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = ldiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i ldiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripLinker"); + exit(1); + } +} + +//******************************************************************/ +bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){ + try { + string rawSequence = seq.getUnaligned(); + bool success = sdiffs+1; //guilty until proven innocent + + for(int i=0;i 0) { + for(int i = 0; i < spacer.size(); i++){ + if(spacer[i].length() > maxLength){ + maxLength = spacer[i].length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < spacer.size(); i++){ + string oligo = spacer[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = sdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i sdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + + if(qual.getName() != ""){ + qual.trimQScores(minPos, -1); + } + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripSpacer"); + exit(1); + } +} +//******************************************************************/ +bool TrimOligos::stripSpacer(Sequence& seq){ + try { + + string rawSequence = seq.getUnaligned(); + bool success = sdiffs+1; //guilty until proven innocent + + for(int i=0;i 0) { + for(int i = 0; i < spacer.size(); i++){ + if(spacer[i].length() > maxLength){ + maxLength = spacer[i].length(); + } + } + alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1)); + + }else{ alignment = NULL; } + + //can you find the barcode + int minDiff = 1e6; + int minCount = 1; + int minPos = 0; + + for(int i = 0; i < spacer.size(); i++){ + string oligo = spacer[i]; + // int length = oligo.length(); + + if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length + success = sdiffs + 10; + break; + } + + //use needleman to align first barcode.length()+numdiffs of sequence to each barcode + alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs)); + oligo = alignment->getSeqAAln(); + string temp = alignment->getSeqBAln(); + + int alnLength = oligo.length(); + + for(int i=oligo.length()-1;i>=0;i--){ + if(oligo[i] != '-'){ alnLength = i+1; break; } + } + oligo = oligo.substr(0,alnLength); + temp = temp.substr(0,alnLength); + + int numDiff = countDiffs(oligo, temp); + + if(numDiff < minDiff){ + minDiff = numDiff; + minCount = 1; + minPos = 0; + for(int i=0;i sdiffs) { success = minDiff; } //no good matches + else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes + else{ //use the best match + seq.setUnaligned(rawSequence.substr(minPos)); + success = minDiff; + } + + if (alignment != NULL) { delete alignment; } + + } + + return success; + + } + catch(exception& e) { + m->errorOut(e, "TrimOligos", "stripSpacer"); + exit(1); + } +} //******************************************************************/ bool TrimOligos::compareDNASeq(string oligo, string seq){