5 * Created by westcott on 9/1/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "trimoligos.h"
11 #include "alignment.hpp"
12 #include "needlemanoverlap.hpp"
15 /********************************************************************/
16 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
17 TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
19 m = MothurOut::getInstance();
33 m->errorOut(e, "TrimOligos", "TrimOligos");
37 /********************************************************************/
38 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
39 TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
41 m = MothurOut::getInstance();
51 m->errorOut(e, "TrimOligos", "TrimOligos");
55 /********************************************************************/
56 TrimOligos::~TrimOligos() {}
57 //*******************************************************************/
58 int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
61 string rawSequence = seq.getUnaligned();
62 int success = bdiffs + 1; //guilty until proven innocent
64 //can you find the barcode
65 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
66 string oligo = it->first;
67 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
68 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
72 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
74 seq.setUnaligned(rawSequence.substr(oligo.length()));
76 if(qual.getName() != ""){
77 qual.trimQScores(oligo.length(), -1);
85 //if you found the barcode or if you don't want to allow for diffs
86 if ((bdiffs == 0) || (success == 0)) { return success; }
88 else { //try aligning and see if you can find it
93 if (barcodes.size() > 0) {
94 map<string,int>::iterator it;
96 for(it=barcodes.begin();it!=barcodes.end();it++){
97 if(it->first.length() > maxLength){
98 maxLength = it->first.length();
101 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
103 }else{ alignment = NULL; }
105 //can you find the barcode
111 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
112 string oligo = it->first;
113 // int length = oligo.length();
115 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
116 success = bdiffs + 10;
120 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
121 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
122 oligo = alignment->getSeqAAln();
123 string temp = alignment->getSeqBAln();
125 int alnLength = oligo.length();
127 for(int i=oligo.length()-1;i>=0;i--){
128 if(oligo[i] != '-'){ alnLength = i+1; break; }
130 oligo = oligo.substr(0,alnLength);
131 temp = temp.substr(0,alnLength);
133 int numDiff = countDiffs(oligo, temp);
135 if(numDiff < minDiff){
138 minGroup = it->second;
140 for(int i=0;i<alnLength;i++){
146 else if(numDiff == minDiff){
152 if(minDiff > bdiffs) { success = minDiff; } //no good matches
153 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
154 else{ //use the best match
156 seq.setUnaligned(rawSequence.substr(minPos));
158 if(qual.getName() != ""){
159 qual.trimQScores(minPos, -1);
164 if (alignment != NULL) { delete alignment; }
171 catch(exception& e) {
172 m->errorOut(e, "TrimOligos", "stripBarcode");
177 //*******************************************************************/
178 int TrimOligos::stripBarcode(Sequence& seq, int& group){
181 string rawSequence = seq.getUnaligned();
182 int success = bdiffs + 1; //guilty until proven innocent
184 //can you find the barcode
185 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
186 string oligo = it->first;
187 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
188 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
192 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
194 seq.setUnaligned(rawSequence.substr(oligo.length()));
201 //if you found the barcode or if you don't want to allow for diffs
202 if ((bdiffs == 0) || (success == 0)) { return success; }
204 else { //try aligning and see if you can find it
208 Alignment* alignment;
209 if (barcodes.size() > 0) {
210 map<string,int>::iterator it=barcodes.begin();
212 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
213 if(it->first.length() > maxLength){
214 maxLength = it->first.length();
217 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));
219 }else{ alignment = NULL; }
221 //can you find the barcode
227 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
228 string oligo = it->first;
229 // int length = oligo.length();
231 if(rawSequence.length() < maxLength){ //let's just assume that the barcodes are the same length
232 success = bdiffs + 10;
236 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
237 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
238 oligo = alignment->getSeqAAln();
239 string temp = alignment->getSeqBAln();
241 int alnLength = oligo.length();
243 for(int i=oligo.length()-1;i>=0;i--){
244 if(oligo[i] != '-'){ alnLength = i+1; break; }
246 oligo = oligo.substr(0,alnLength);
247 temp = temp.substr(0,alnLength);
249 int numDiff = countDiffs(oligo, temp);
251 if(numDiff < minDiff){
254 minGroup = it->second;
256 for(int i=0;i<alnLength;i++){
262 else if(numDiff == minDiff){
268 if(minDiff > bdiffs) { success = minDiff; } //no good matches
269 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
270 else{ //use the best match
272 seq.setUnaligned(rawSequence.substr(minPos));
276 if (alignment != NULL) { delete alignment; }
283 catch(exception& e) {
284 m->errorOut(e, "TrimOligos", "stripBarcode");
289 //********************************************************************/
290 int TrimOligos::stripForward(Sequence& seq, int& group){
293 string rawSequence = seq.getUnaligned();
294 int success = pdiffs + 1; //guilty until proven innocent
296 //can you find the primer
297 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
298 string oligo = it->first;
299 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
300 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
304 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
306 seq.setUnaligned(rawSequence.substr(oligo.length()));
312 //if you found the barcode or if you don't want to allow for diffs
313 if ((pdiffs == 0) || (success == 0)) { return success; }
315 else { //try aligning and see if you can find it
319 Alignment* alignment;
320 if (primers.size() > 0) {
321 map<string,int>::iterator it;
323 for(it=primers.begin();it!=primers.end();it++){
324 if(it->first.length() > maxLength){
325 maxLength = it->first.length();
328 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
330 }else{ alignment = NULL; }
332 //can you find the barcode
338 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
339 string oligo = it->first;
340 // int length = oligo.length();
342 if(rawSequence.length() < maxLength){
343 success = pdiffs + 100;
347 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
348 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
349 oligo = alignment->getSeqAAln();
350 string temp = alignment->getSeqBAln();
352 int alnLength = oligo.length();
354 for(int i=oligo.length()-1;i>=0;i--){
355 if(oligo[i] != '-'){ alnLength = i+1; break; }
357 oligo = oligo.substr(0,alnLength);
358 temp = temp.substr(0,alnLength);
360 int numDiff = countDiffs(oligo, temp);
362 if(numDiff < minDiff){
365 minGroup = it->second;
367 for(int i=0;i<alnLength;i++){
373 else if(numDiff == minDiff){
379 if(minDiff > pdiffs) { success = minDiff; } //no good matches
380 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
381 else{ //use the best match
383 seq.setUnaligned(rawSequence.substr(minPos));
387 if (alignment != NULL) { delete alignment; }
394 catch(exception& e) {
395 m->errorOut(e, "TrimOligos", "stripForward");
399 //*******************************************************************/
400 int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){
402 string rawSequence = seq.getUnaligned();
403 int success = pdiffs + 1; //guilty until proven innocent
405 //can you find the primer
406 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
407 string oligo = it->first;
408 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
409 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
413 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
415 if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); }
416 if(qual.getName() != ""){
417 if (!keepForward) { qual.trimQScores(oligo.length(), -1); }
424 //if you found the barcode or if you don't want to allow for diffs
425 if ((pdiffs == 0) || (success == 0)) { return success; }
427 else { //try aligning and see if you can find it
431 Alignment* alignment;
432 if (primers.size() > 0) {
433 map<string,int>::iterator it;
435 for(it=primers.begin();it!=primers.end();it++){
436 if(it->first.length() > maxLength){
437 maxLength = it->first.length();
440 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));
442 }else{ alignment = NULL; }
444 //can you find the barcode
450 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
451 string oligo = it->first;
452 // int length = oligo.length();
454 if(rawSequence.length() < maxLength){
455 success = pdiffs + 100;
459 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
460 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
461 oligo = alignment->getSeqAAln();
462 string temp = alignment->getSeqBAln();
464 int alnLength = oligo.length();
466 for(int i=oligo.length()-1;i>=0;i--){
467 if(oligo[i] != '-'){ alnLength = i+1; break; }
469 oligo = oligo.substr(0,alnLength);
470 temp = temp.substr(0,alnLength);
472 int numDiff = countDiffs(oligo, temp);
474 if(numDiff < minDiff){
477 minGroup = it->second;
479 for(int i=0;i<alnLength;i++){
485 else if(numDiff == minDiff){
491 if(minDiff > pdiffs) { success = minDiff; } //no good matches
492 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
493 else{ //use the best match
495 if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); }
496 if(qual.getName() != ""){
497 if (!keepForward) { qual.trimQScores(minPos, -1); }
502 if (alignment != NULL) { delete alignment; }
509 catch(exception& e) {
510 m->errorOut(e, "TrimOligos", "stripForward");
514 //******************************************************************/
515 bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){
517 string rawSequence = seq.getUnaligned();
518 bool success = 0; //guilty until proven innocent
520 for(int i=0;i<revPrimer.size();i++){
521 string oligo = revPrimer[i];
523 if(rawSequence.length() < oligo.length()){
528 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
529 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
530 if(qual.getName() != ""){
531 qual.trimQScores(-1, rawSequence.length()-oligo.length());
540 catch(exception& e) {
541 m->errorOut(e, "TrimOligos", "stripReverse");
545 //******************************************************************/
546 bool TrimOligos::stripReverse(Sequence& seq){
549 string rawSequence = seq.getUnaligned();
550 bool success = 0; //guilty until proven innocent
552 for(int i=0;i<revPrimer.size();i++){
553 string oligo = revPrimer[i];
555 if(rawSequence.length() < oligo.length()){
560 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
561 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
570 catch(exception& e) {
571 m->errorOut(e, "TrimOligos", "stripReverse");
575 //******************************************************************/
576 bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){
578 string rawSequence = seq.getUnaligned();
579 bool success = 0; //guilty until proven innocent
581 for(int i=0;i<linker.size();i++){
582 string oligo = linker[i];
584 if(rawSequence.length() < oligo.length()){
589 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
590 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
591 if(qual.getName() != ""){
592 qual.trimQScores(-1, rawSequence.length()-oligo.length());
601 catch(exception& e) {
602 m->errorOut(e, "TrimOligos", "stripLinker");
606 //******************************************************************/
607 bool TrimOligos::stripLinker(Sequence& seq){
610 string rawSequence = seq.getUnaligned();
611 bool success = 0; //guilty until proven innocent
613 for(int i=0;i<linker.size();i++){
614 string oligo = linker[i];
616 if(rawSequence.length() < oligo.length()){
621 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
622 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
631 catch(exception& e) {
632 m->errorOut(e, "TrimOligos", "stripLinker");
637 //******************************************************************/
638 bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){
640 string rawSequence = seq.getUnaligned();
641 bool success = 0; //guilty until proven innocent
643 for(int i=0;i<spacer.size();i++){
644 string oligo = spacer[i];
646 if(rawSequence.length() < oligo.length()){
651 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
652 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
653 if(qual.getName() != ""){
654 qual.trimQScores(-1, rawSequence.length()-oligo.length());
663 catch(exception& e) {
664 m->errorOut(e, "TrimOligos", "stripSpacer");
668 //******************************************************************/
669 bool TrimOligos::stripSpacer(Sequence& seq){
672 string rawSequence = seq.getUnaligned();
673 bool success = 0; //guilty until proven innocent
675 for(int i=0;i<spacer.size();i++){
676 string oligo = spacer[i];
678 if(rawSequence.length() < oligo.length()){
683 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
684 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
693 catch(exception& e) {
694 m->errorOut(e, "TrimOligos", "stripSpacer");
699 //******************************************************************/
700 bool TrimOligos::compareDNASeq(string oligo, string seq){
703 int length = oligo.length();
705 for(int i=0;i<length;i++){
707 if(oligo[i] != seq[i]){
708 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
709 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
710 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
711 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
712 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
713 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
714 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
715 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
716 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
717 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
718 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
719 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
721 if(success == 0) { break; }
730 catch(exception& e) {
731 m->errorOut(e, "TrimOligos", "compareDNASeq");
736 //********************************************************************/
737 int TrimOligos::countDiffs(string oligo, string seq){
740 int length = oligo.length();
743 for(int i=0;i<length;i++){
745 if(oligo[i] != seq[i]){
746 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
747 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
748 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
749 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
750 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
751 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
752 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
753 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
754 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
755 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
756 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
757 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
764 catch(exception& e) {
765 m->errorOut(e, "TrimOligos", "countDiffs");
769 /********************************************************************/