5 * Created by westcott on 9/1/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "trimoligos.h"
11 #include "alignment.hpp"
12 #include "needlemanoverlap.hpp"
15 /********************************************************************/
16 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
17 TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
19 m = MothurOut::getInstance();
29 m->errorOut(e, "TrimOligos", "TrimOligos");
33 /********************************************************************/
34 TrimOligos::~TrimOligos() {}
35 //*******************************************************************/
36 int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
39 string rawSequence = seq.getUnaligned();
40 int success = bdiffs + 1; //guilty until proven innocent
42 //can you find the barcode
43 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
44 string oligo = it->first;
45 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
46 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
50 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
52 seq.setUnaligned(rawSequence.substr(oligo.length()));
54 if(qual.getName() != ""){
55 qual.trimQScores(oligo.length(), -1);
63 //if you found the barcode or if you don't want to allow for diffs
64 if ((bdiffs == 0) || (success == 0)) { return success; }
66 else { //try aligning and see if you can find it
71 if (barcodes.size() > 0) {
72 map<string,int>::iterator it=barcodes.begin();
74 for(it;it!=barcodes.end();it++){
75 if(it->first.length() > maxLength){
76 maxLength = it->first.length();
79 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
81 }else{ alignment = NULL; }
83 //can you find the barcode
89 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
90 string oligo = it->first;
91 // int length = oligo.length();
93 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
94 success = bdiffs + 10;
98 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
99 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
100 oligo = alignment->getSeqAAln();
101 string temp = alignment->getSeqBAln();
103 int alnLength = oligo.length();
105 for(int i=oligo.length()-1;i>=0;i--){
106 if(oligo[i] != '-'){ alnLength = i+1; break; }
108 oligo = oligo.substr(0,alnLength);
109 temp = temp.substr(0,alnLength);
111 int numDiff = countDiffs(oligo, temp);
113 if(numDiff < minDiff){
116 minGroup = it->second;
118 for(int i=0;i<alnLength;i++){
124 else if(numDiff == minDiff){
130 if(minDiff > bdiffs) { success = minDiff; } //no good matches
131 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
132 else{ //use the best match
134 seq.setUnaligned(rawSequence.substr(minPos));
136 if(qual.getName() != ""){
137 qual.trimQScores(minPos, -1);
142 if (alignment != NULL) { delete alignment; }
149 catch(exception& e) {
150 m->errorOut(e, "TrimOligos", "stripBarcode");
155 //*******************************************************************/
156 int TrimOligos::stripBarcode(Sequence& seq, int& group){
159 string rawSequence = seq.getUnaligned();
160 int success = bdiffs + 1; //guilty until proven innocent
162 //can you find the barcode
163 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
164 string oligo = it->first;
165 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
166 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
170 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
172 seq.setUnaligned(rawSequence.substr(oligo.length()));
179 //if you found the barcode or if you don't want to allow for diffs
180 if ((bdiffs == 0) || (success == 0)) { return success; }
182 else { //try aligning and see if you can find it
186 Alignment* alignment;
187 if (barcodes.size() > 0) {
188 map<string,int>::iterator it=barcodes.begin();
190 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
191 if(it->first.length() > maxLength){
192 maxLength = it->first.length();
195 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
197 }else{ alignment = NULL; }
199 //can you find the barcode
205 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
206 string oligo = it->first;
207 // int length = oligo.length();
209 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
210 success = bdiffs + 10;
214 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
215 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
216 oligo = alignment->getSeqAAln();
217 string temp = alignment->getSeqBAln();
219 int alnLength = oligo.length();
221 for(int i=oligo.length()-1;i>=0;i--){
222 if(oligo[i] != '-'){ alnLength = i+1; break; }
224 oligo = oligo.substr(0,alnLength);
225 temp = temp.substr(0,alnLength);
227 int numDiff = countDiffs(oligo, temp);
229 if(numDiff < minDiff){
232 minGroup = it->second;
234 for(int i=0;i<alnLength;i++){
240 else if(numDiff == minDiff){
246 if(minDiff > bdiffs) { success = minDiff; } //no good matches
247 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
248 else{ //use the best match
250 seq.setUnaligned(rawSequence.substr(minPos));
254 if (alignment != NULL) { delete alignment; }
261 catch(exception& e) {
262 m->errorOut(e, "TrimOligos", "stripBarcode");
267 //********************************************************************/
268 int TrimOligos::stripForward(Sequence& seq, int& group){
271 string rawSequence = seq.getUnaligned();
272 int success = pdiffs + 1; //guilty until proven innocent
274 //can you find the primer
275 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
276 string oligo = it->first;
277 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
278 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
282 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
284 seq.setUnaligned(rawSequence.substr(oligo.length()));
290 //if you found the barcode or if you don't want to allow for diffs
291 if ((pdiffs == 0) || (success == 0)) { return success; }
293 else { //try aligning and see if you can find it
297 Alignment* alignment;
298 if (primers.size() > 0) {
299 map<string,int>::iterator it=primers.begin();
301 for(it;it!=primers.end();it++){
302 if(it->first.length() > maxLength){
303 maxLength = it->first.length();
306 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
308 }else{ alignment = NULL; }
310 //can you find the barcode
316 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
317 string oligo = it->first;
318 // int length = oligo.length();
320 if(rawSequence.length() < maxLength){
321 success = pdiffs + 100;
325 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
326 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
327 oligo = alignment->getSeqAAln();
328 string temp = alignment->getSeqBAln();
330 int alnLength = oligo.length();
332 for(int i=oligo.length()-1;i>=0;i--){
333 if(oligo[i] != '-'){ alnLength = i+1; break; }
335 oligo = oligo.substr(0,alnLength);
336 temp = temp.substr(0,alnLength);
338 int numDiff = countDiffs(oligo, temp);
340 if(numDiff < minDiff){
343 minGroup = it->second;
345 for(int i=0;i<alnLength;i++){
351 else if(numDiff == minDiff){
357 if(minDiff > pdiffs) { success = minDiff; } //no good matches
358 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
359 else{ //use the best match
361 seq.setUnaligned(rawSequence.substr(minPos));
365 if (alignment != NULL) { delete alignment; }
372 catch(exception& e) {
373 m->errorOut(e, "TrimOligos", "stripForward");
377 //*******************************************************************/
378 int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){
380 string rawSequence = seq.getUnaligned();
381 int success = pdiffs + 1; //guilty until proven innocent
383 //can you find the primer
384 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
385 string oligo = it->first;
386 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
387 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
391 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
393 seq.setUnaligned(rawSequence.substr(oligo.length()));
394 if(qual.getName() != ""){
395 qual.trimQScores(oligo.length(), -1);
402 //if you found the barcode or if you don't want to allow for diffs
403 if ((pdiffs == 0) || (success == 0)) { return success; }
405 else { //try aligning and see if you can find it
409 Alignment* alignment;
410 if (primers.size() > 0) {
411 map<string,int>::iterator it=primers.begin();
413 for(it;it!=primers.end();it++){
414 if(it->first.length() > maxLength){
415 maxLength = it->first.length();
418 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
420 }else{ alignment = NULL; }
422 //can you find the barcode
428 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
429 string oligo = it->first;
430 // int length = oligo.length();
432 if(rawSequence.length() < maxLength){
433 success = pdiffs + 100;
437 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
438 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
439 oligo = alignment->getSeqAAln();
440 string temp = alignment->getSeqBAln();
442 int alnLength = oligo.length();
444 for(int i=oligo.length()-1;i>=0;i--){
445 if(oligo[i] != '-'){ alnLength = i+1; break; }
447 oligo = oligo.substr(0,alnLength);
448 temp = temp.substr(0,alnLength);
450 int numDiff = countDiffs(oligo, temp);
452 if(numDiff < minDiff){
455 minGroup = it->second;
457 for(int i=0;i<alnLength;i++){
463 else if(numDiff == minDiff){
469 if(minDiff > pdiffs) { success = minDiff; } //no good matches
470 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
471 else{ //use the best match
473 seq.setUnaligned(rawSequence.substr(minPos));
474 if(qual.getName() != ""){
475 qual.trimQScores(minPos, -1);
480 if (alignment != NULL) { delete alignment; }
487 catch(exception& e) {
488 m->errorOut(e, "TrimOligos", "stripForward");
492 //******************************************************************/
493 bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){
495 string rawSequence = seq.getUnaligned();
496 bool success = 0; //guilty until proven innocent
498 for(int i=0;i<revPrimer.size();i++){
499 string oligo = revPrimer[i];
501 if(rawSequence.length() < oligo.length()){
506 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
507 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
508 if(qual.getName() != ""){
509 qual.trimQScores(-1, rawSequence.length()-oligo.length());
518 catch(exception& e) {
519 m->errorOut(e, "TrimOligos", "stripReverse");
523 //******************************************************************/
524 bool TrimOligos::stripReverse(Sequence& seq){
527 string rawSequence = seq.getUnaligned();
528 bool success = 0; //guilty until proven innocent
530 for(int i=0;i<revPrimer.size();i++){
531 string oligo = revPrimer[i];
533 if(rawSequence.length() < oligo.length()){
538 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
539 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
548 catch(exception& e) {
549 m->errorOut(e, "TrimOligos", "stripReverse");
554 //******************************************************************/
555 bool TrimOligos::compareDNASeq(string oligo, string seq){
558 int length = oligo.length();
560 for(int i=0;i<length;i++){
562 if(oligo[i] != seq[i]){
563 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
564 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
565 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
566 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
567 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
568 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
569 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
570 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
571 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
572 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
573 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
574 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
576 if(success == 0) { break; }
585 catch(exception& e) {
586 m->errorOut(e, "TrimOligos", "compareDNASeq");
591 //********************************************************************/
592 int TrimOligos::countDiffs(string oligo, string seq){
595 int length = oligo.length();
598 for(int i=0;i<length;i++){
600 if(oligo[i] != seq[i]){
601 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
602 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
603 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
604 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
605 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
606 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
607 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
608 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
609 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
610 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
611 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
612 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
619 catch(exception& e) {
620 m->errorOut(e, "TrimOligos", "countDiffs");
624 /********************************************************************/