5 * Created by westcott on 9/1/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "trimoligos.h"
11 #include "alignment.hpp"
12 #include "needlemanoverlap.hpp"
15 /********************************************************************/
16 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
17 TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
19 m = MothurOut::getInstance();
31 maxFBarcodeLength = 0;
32 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
33 if(it->first.length() > maxFBarcodeLength){
34 maxFBarcodeLength = it->first.length();
38 map<string,int>::iterator it;
39 for(it=primers.begin();it!=primers.end();it++){
40 if(it->first.length() > maxFPrimerLength){
41 maxFPrimerLength = it->first.length();
46 for(int i = 0; i < linker.size(); i++){
47 if(linker[i].length() > maxLinkerLength){
48 maxLinkerLength = linker[i].length();
53 for(int i = 0; i < spacer.size(); i++){
54 if(spacer[i].length() > maxSpacerLength){
55 maxSpacerLength = spacer[i].length();
60 m->errorOut(e, "TrimOligos", "TrimOligos");
64 /********************************************************************/
65 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
66 TrimOligos::TrimOligos(int p, int b, int l, int s, map<int, oligosPair> pr, map<int, oligosPair> br){
68 m = MothurOut::getInstance();
75 maxFBarcodeLength = 0;
76 for(map<int,oligosPair>::iterator it=br.begin();it!=br.end();it++){
77 string forward = it->second.forward;
78 map<string, vector<int> >::iterator itForward = ifbarcodes.find(forward);
80 if(forward.length() > maxFBarcodeLength){ maxFBarcodeLength = forward.length(); }
82 if (itForward == ifbarcodes.end()) {
83 vector<int> temp; temp.push_back(it->first);
84 ifbarcodes[forward] = temp;
85 }else { itForward->second.push_back(it->first); }
89 for(map<int,oligosPair>::iterator it=pr.begin();it!=pr.end();it++){
90 string forward = it->second.forward;
91 map<string, vector<int> >::iterator itForward = ifprimers.find(forward);
93 if(forward.length() > maxFPrimerLength){ maxFPrimerLength = forward.length(); }
95 if (itForward == ifprimers.end()) {
96 vector<int> temp; temp.push_back(it->first);
97 ifprimers[forward] = temp;
98 }else { itForward->second.push_back(it->first); }
101 maxRBarcodeLength = 0;
102 for(map<int,oligosPair>::iterator it=br.begin();it!=br.end();it++){
103 string forward = it->second.reverse;
104 map<string, vector<int> >::iterator itForward = irbarcodes.find(forward);
106 if(forward.length() > maxRBarcodeLength){ maxRBarcodeLength = forward.length(); }
108 if (itForward == irbarcodes.end()) {
109 vector<int> temp; temp.push_back(it->first);
110 irbarcodes[forward] = temp;
111 }else { itForward->second.push_back(it->first); }
114 maxRPrimerLength = 0;
115 for(map<int,oligosPair>::iterator it=pr.begin();it!=pr.end();it++){
116 string forward = it->second.reverse;
117 map<string, vector<int> >::iterator itForward = irprimers.find(forward);
119 if(forward.length() > maxRPrimerLength){ maxRPrimerLength = forward.length(); }
121 if (itForward == irprimers.end()) {
122 vector<int> temp; temp.push_back(it->first);
123 irprimers[forward] = temp;
124 }else { itForward->second.push_back(it->first); }
130 catch(exception& e) {
131 m->errorOut(e, "TrimOligos", "TrimOligos");
135 /********************************************************************/
136 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
137 TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
139 m = MothurOut::getInstance();
148 catch(exception& e) {
149 m->errorOut(e, "TrimOligos", "TrimOligos");
153 /********************************************************************/
154 TrimOligos::~TrimOligos() {}
155 //*******************************************************************/
156 int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
159 string rawSequence = seq.getUnaligned();
160 int success = bdiffs + 1; //guilty until proven innocent
162 //can you find the barcode
163 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
164 string oligo = it->first;
165 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
166 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
170 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
172 seq.setUnaligned(rawSequence.substr(oligo.length()));
174 if(qual.getName() != ""){
175 qual.trimQScores(oligo.length(), -1);
183 //if you found the barcode or if you don't want to allow for diffs
184 if ((bdiffs == 0) || (success == 0)) { return success; }
186 else { //try aligning and see if you can find it
187 Alignment* alignment;
188 if (barcodes.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
189 else{ alignment = NULL; }
191 //can you find the barcode
197 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
198 string oligo = it->first;
199 // int length = oligo.length();
201 if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
202 success = bdiffs + 10;
206 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
207 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
208 oligo = alignment->getSeqAAln();
209 string temp = alignment->getSeqBAln();
211 int alnLength = oligo.length();
213 for(int i=oligo.length()-1;i>=0;i--){
214 if(oligo[i] != '-'){ alnLength = i+1; break; }
216 oligo = oligo.substr(0,alnLength);
217 temp = temp.substr(0,alnLength);
218 int numDiff = countDiffs(oligo, temp);
220 if(numDiff < minDiff){
223 minGroup = it->second;
225 for(int i=0;i<alnLength;i++){
231 else if(numDiff == minDiff){
237 if(minDiff > bdiffs) { success = minDiff; } //no good matches
238 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
239 else{ //use the best match
241 seq.setUnaligned(rawSequence.substr(minPos));
243 if(qual.getName() != ""){
244 qual.trimQScores(minPos, -1);
249 if (alignment != NULL) { delete alignment; }
256 catch(exception& e) {
257 m->errorOut(e, "TrimOligos", "stripBarcode");
261 //*******************************************************************/
262 int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){
264 //look for forward barcode
265 string rawFSequence = forwardSeq.getUnaligned();
266 string rawRSequence = reverseSeq.getUnaligned();
267 int success = bdiffs + 1; //guilty until proven innocent
269 //can you find the forward barcode
270 for(map<int,oligosPair>::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){
271 string foligo = it->second.forward;
272 string roligo = it->second.reverse;
274 if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length
275 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
278 if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length
279 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
283 if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
285 forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
286 reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
287 forwardQual.trimQScores(foligo.length(), -1);
288 reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
294 //if you found the barcode or if you don't want to allow for diffs
295 if ((bdiffs == 0) || (success == 0)) { return success; }
296 else { //try aligning and see if you can find it
297 Alignment* alignment;
298 if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
299 else{ alignment = NULL; }
301 //can you find the barcode
304 vector< vector<int> > minFGroup;
307 //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
317 only want to look for forward = Sarah, John, Anna, Pat, Gail
318 reverse = Westcott, Schloss, Brown, Moore
320 but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
322 //cout << endl << forwardSeq.getName() << endl;
323 for(map<string, vector<int> >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){
324 string oligo = it->first;
326 if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
327 success = bdiffs + 10;
330 //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl;
331 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
332 alignment->align(oligo, rawFSequence.substr(0,oligo.length()+bdiffs));
333 oligo = alignment->getSeqAAln();
334 string temp = alignment->getSeqBAln();
336 int alnLength = oligo.length();
338 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
339 oligo = oligo.substr(0,alnLength);
340 temp = temp.substr(0,alnLength);
341 int numDiff = countDiffs(oligo, temp);
343 if (alnLength == 0) { numDiff = bdiffs + 100; }
344 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
346 if(numDiff < minDiff){
350 minFGroup.push_back(it->second);
353 for(int i=0;i<alnLength;i++){
358 minFPos.push_back(tempminFPos);
359 }else if(numDiff == minDiff){
360 minFGroup.push_back(it->second);
362 for(int i=0;i<alnLength;i++){
367 minFPos.push_back(tempminFPos);
371 //cout << minDiff << '\t' << minCount << '\t' << endl;
372 if(minDiff > bdiffs) { success = minDiff; } //no good matches
374 //check for reverse match
375 if (alignment != NULL) { delete alignment; }
377 if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); }
378 else{ alignment = NULL; }
380 //can you find the barcode
383 vector< vector<int> > minRGroup;
386 for(map<string, vector<int> >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){
387 string oligo = it->first;
388 //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+bdiffs) << endl;
389 if(rawRSequence.length() < maxRBarcodeLength){ //let's just assume that the barcodes are the same length
390 success = bdiffs + 10;
394 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
395 alignment->align(oligo, rawRSequence.substr(0,oligo.length()+bdiffs));
396 oligo = alignment->getSeqAAln();
397 string temp = alignment->getSeqBAln();
399 int alnLength = oligo.length();
400 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
401 oligo = oligo.substr(0,alnLength);
402 temp = temp.substr(0,alnLength);
403 int numDiff = countDiffs(oligo, temp);
404 if (alnLength == 0) { numDiff = bdiffs + 100; }
406 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
407 if(numDiff < minDiff){
411 minRGroup.push_back(it->second);
414 for(int i=0;i<alnLength;i++){
419 minRPos.push_back(tempminRPos);
420 }else if(numDiff == minDiff){
422 for(int i=0;i<alnLength;i++){
427 minRPos.push_back(tempminRPos);
428 minRGroup.push_back(it->second);
433 /*cout << minDiff << '\t' << minCount << '\t' << endl;
434 for (int i = 0; i < minFGroup.size(); i++) {
436 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
440 for (int i = 0; i < minRGroup.size(); i++) {
442 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
446 if(minDiff > bdiffs) { success = minDiff; } //no good matches
448 bool foundMatch = false;
450 for (int i = 0; i < minFGroup.size(); i++) {
451 for (int j = 0; j < minFGroup[i].size(); j++) {
452 for (int k = 0; k < minRGroup.size(); k++) {
453 if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
460 if (matches.size() == 1) {
467 //we have an acceptable match for the forward and reverse, but do they match?
469 forwardSeq.setUnaligned(rawFSequence.substr(fStart));
470 reverseSeq.setUnaligned(rawRSequence.substr(rStart));
471 forwardQual.trimQScores(fStart, -1);
472 reverseQual.trimQScores(rStart, -1);
474 }else { success = bdiffs + 100; }
478 if (alignment != NULL) { delete alignment; }
484 catch(exception& e) {
485 m->errorOut(e, "TrimOligos", "stripIBarcode");
490 //*******************************************************************/
491 int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){
493 //look for forward barcode
494 string rawFSequence = forwardSeq.getUnaligned();
495 string rawRSequence = reverseSeq.getUnaligned();
496 int success = pdiffs + 1; //guilty until proven innocent
498 //can you find the forward barcode
499 for(map<int,oligosPair>::iterator it=ipprimers.begin();it!=ipprimers.end();it++){
500 string foligo = it->second.forward;
501 string roligo = it->second.reverse;
503 if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length
504 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
507 if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length
508 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
512 if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
514 forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
515 reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
516 forwardQual.trimQScores(foligo.length(), -1);
517 reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
523 //if you found the barcode or if you don't want to allow for diffs
524 if ((pdiffs == 0) || (success == 0)) { return success; }
525 else { //try aligning and see if you can find it
526 Alignment* alignment;
527 if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
528 else{ alignment = NULL; }
530 //can you find the barcode
533 vector< vector<int> > minFGroup;
536 //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
546 only want to look for forward = Sarah, John, Anna, Pat, Gail
547 reverse = Westcott, Schloss, Brown, Moore
549 but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
551 //cout << endl << forwardSeq.getName() << endl;
552 for(map<string, vector<int> >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){
553 string oligo = it->first;
555 if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length
556 success = pdiffs + 10;
559 //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl;
560 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
561 alignment->align(oligo, rawFSequence.substr(0,oligo.length()+pdiffs));
562 oligo = alignment->getSeqAAln();
563 string temp = alignment->getSeqBAln();
565 int alnLength = oligo.length();
567 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
568 oligo = oligo.substr(0,alnLength);
569 temp = temp.substr(0,alnLength);
570 int numDiff = countDiffs(oligo, temp);
572 if (alnLength == 0) { numDiff = pdiffs + 100; }
573 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
575 if(numDiff < minDiff){
579 minFGroup.push_back(it->second);
582 for(int i=0;i<alnLength;i++){
587 minFPos.push_back(tempminFPos);
588 }else if(numDiff == minDiff){
589 minFGroup.push_back(it->second);
591 for(int i=0;i<alnLength;i++){
596 minFPos.push_back(tempminFPos);
600 //cout << minDiff << '\t' << minCount << '\t' << endl;
601 if(minDiff > pdiffs) { success = minDiff; } //no good matches
603 //check for reverse match
604 if (alignment != NULL) { delete alignment; }
606 if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); }
607 else{ alignment = NULL; }
609 //can you find the barcode
612 vector< vector<int> > minRGroup;
615 for(map<string, vector<int> >::iterator it=irprimers.begin();it!=irprimers.end();it++){
616 string oligo = it->first;
617 //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+pdiffs) << endl;
618 if(rawRSequence.length() < maxRPrimerLength){ //let's just assume that the barcodes are the same length
619 success = pdiffs + 10;
623 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
624 alignment->align(oligo, rawRSequence.substr(0,oligo.length()+pdiffs));
625 oligo = alignment->getSeqAAln();
626 string temp = alignment->getSeqBAln();
628 int alnLength = oligo.length();
629 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
630 oligo = oligo.substr(0,alnLength);
631 temp = temp.substr(0,alnLength);
632 int numDiff = countDiffs(oligo, temp);
633 if (alnLength == 0) { numDiff = pdiffs + 100; }
635 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
636 if(numDiff < minDiff){
640 minRGroup.push_back(it->second);
643 for(int i=0;i<alnLength;i++){
648 minRPos.push_back(tempminRPos);
649 }else if(numDiff == minDiff){
651 for(int i=0;i<alnLength;i++){
656 minRPos.push_back(tempminRPos);
657 minRGroup.push_back(it->second);
662 /*cout << minDiff << '\t' << minCount << '\t' << endl;
663 for (int i = 0; i < minFGroup.size(); i++) {
665 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
669 for (int i = 0; i < minRGroup.size(); i++) {
671 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
675 if(minDiff > pdiffs) { success = minDiff; } //no good matches
677 bool foundMatch = false;
679 for (int i = 0; i < minFGroup.size(); i++) {
680 for (int j = 0; j < minFGroup[i].size(); j++) {
681 for (int k = 0; k < minRGroup.size(); k++) {
682 if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
689 if (matches.size() == 1) {
696 //we have an acceptable match for the forward and reverse, but do they match?
698 forwardSeq.setUnaligned(rawFSequence.substr(fStart));
699 reverseSeq.setUnaligned(rawRSequence.substr(rStart));
700 forwardQual.trimQScores(fStart, -1);
701 reverseQual.trimQScores(rStart, -1);
703 }else { success = pdiffs + 100; }
707 if (alignment != NULL) { delete alignment; }
713 catch(exception& e) {
714 m->errorOut(e, "TrimOligos", "stripIForward");
719 //*******************************************************************/
720 int TrimOligos::stripBarcode(Sequence& seq, int& group){
723 string rawSequence = seq.getUnaligned();
724 int success = bdiffs + 1; //guilty until proven innocent
726 //can you find the barcode
727 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
728 string oligo = it->first;
729 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
730 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
734 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
736 seq.setUnaligned(rawSequence.substr(oligo.length()));
743 //if you found the barcode or if you don't want to allow for diffs
744 if ((bdiffs == 0) || (success == 0)) { return success; }
746 else { //try aligning and see if you can find it
747 Alignment* alignment;
748 if (barcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
749 else{ alignment = NULL; }
751 //can you find the barcode
757 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
758 string oligo = it->first;
759 // int length = oligo.length();
761 if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
762 success = bdiffs + 10;
766 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
767 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
768 oligo = alignment->getSeqAAln();
769 string temp = alignment->getSeqBAln();
771 int alnLength = oligo.length();
773 for(int i=oligo.length()-1;i>=0;i--){
774 if(oligo[i] != '-'){ alnLength = i+1; break; }
776 oligo = oligo.substr(0,alnLength);
777 temp = temp.substr(0,alnLength);
779 int numDiff = countDiffs(oligo, temp);
781 if(numDiff < minDiff){
784 minGroup = it->second;
786 for(int i=0;i<alnLength;i++){
792 else if(numDiff == minDiff){
798 if(minDiff > bdiffs) { success = minDiff; } //no good matches
799 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
800 else{ //use the best match
802 seq.setUnaligned(rawSequence.substr(minPos));
806 if (alignment != NULL) { delete alignment; }
813 catch(exception& e) {
814 m->errorOut(e, "TrimOligos", "stripBarcode");
820 /********************************************************************/
821 int TrimOligos::stripForward(Sequence& seq, int& group){
824 string rawSequence = seq.getUnaligned();
825 int success = pdiffs + 1; //guilty until proven innocent
827 //can you find the primer
828 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
829 string oligo = it->first;
830 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
831 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
835 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
837 seq.setUnaligned(rawSequence.substr(oligo.length()));
843 //if you found the barcode or if you don't want to allow for diffs
844 if ((pdiffs == 0) || (success == 0)) { return success; }
846 else { //try aligning and see if you can find it
847 Alignment* alignment;
848 if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
849 else{ alignment = NULL; }
851 //can you find the barcode
857 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
858 string oligo = it->first;
859 // int length = oligo.length();
861 if(rawSequence.length() < maxFPrimerLength){
862 success = pdiffs + 100;
866 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
867 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
868 oligo = alignment->getSeqAAln();
869 string temp = alignment->getSeqBAln();
871 int alnLength = oligo.length();
873 for(int i=oligo.length()-1;i>=0;i--){
874 if(oligo[i] != '-'){ alnLength = i+1; break; }
876 oligo = oligo.substr(0,alnLength);
877 temp = temp.substr(0,alnLength);
879 int numDiff = countDiffs(oligo, temp);
881 if(numDiff < minDiff){
884 minGroup = it->second;
886 for(int i=0;i<alnLength;i++){
892 else if(numDiff == minDiff){
898 if(minDiff > pdiffs) { success = minDiff; } //no good matches
899 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
900 else{ //use the best match
902 seq.setUnaligned(rawSequence.substr(minPos));
906 if (alignment != NULL) { delete alignment; }
913 catch(exception& e) {
914 m->errorOut(e, "TrimOligos", "stripForward");
918 //*******************************************************************/
919 int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){
921 string rawSequence = seq.getUnaligned();
922 int success = pdiffs + 1; //guilty until proven innocent
924 //can you find the primer
925 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
926 string oligo = it->first;
927 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
928 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
932 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
934 if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); }
935 if(qual.getName() != ""){
936 if (!keepForward) { qual.trimQScores(oligo.length(), -1); }
943 //if you found the barcode or if you don't want to allow for diffs
944 if ((pdiffs == 0) || (success == 0)) { return success; }
946 else { //try aligning and see if you can find it
947 Alignment* alignment;
948 if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
949 else{ alignment = NULL; }
951 //can you find the barcode
957 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
958 string oligo = it->first;
959 // int length = oligo.length();
961 if(rawSequence.length() < maxFPrimerLength){
962 success = pdiffs + 100;
966 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
967 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
968 oligo = alignment->getSeqAAln();
969 string temp = alignment->getSeqBAln();
971 int alnLength = oligo.length();
973 for(int i=oligo.length()-1;i>=0;i--){
974 if(oligo[i] != '-'){ alnLength = i+1; break; }
976 oligo = oligo.substr(0,alnLength);
977 temp = temp.substr(0,alnLength);
979 int numDiff = countDiffs(oligo, temp);
981 if(numDiff < minDiff){
984 minGroup = it->second;
986 for(int i=0;i<alnLength;i++){
992 else if(numDiff == minDiff){
998 if(minDiff > pdiffs) { success = minDiff; } //no good matches
999 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
1000 else{ //use the best match
1002 if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); }
1003 if(qual.getName() != ""){
1004 if (!keepForward) { qual.trimQScores(minPos, -1); }
1009 if (alignment != NULL) { delete alignment; }
1016 catch(exception& e) {
1017 m->errorOut(e, "TrimOligos", "stripForward");
1021 //******************************************************************/
1022 bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){
1024 string rawSequence = seq.getUnaligned();
1025 bool success = 0; //guilty until proven innocent
1027 for(int i=0;i<revPrimer.size();i++){
1028 string oligo = revPrimer[i];
1030 if(rawSequence.length() < oligo.length()){
1035 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1036 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1037 if(qual.getName() != ""){
1038 qual.trimQScores(-1, rawSequence.length()-oligo.length());
1047 catch(exception& e) {
1048 m->errorOut(e, "TrimOligos", "stripReverse");
1052 //******************************************************************/
1053 bool TrimOligos::stripReverse(Sequence& seq){
1056 string rawSequence = seq.getUnaligned();
1057 bool success = 0; //guilty until proven innocent
1059 for(int i=0;i<revPrimer.size();i++){
1060 string oligo = revPrimer[i];
1062 if(rawSequence.length() < oligo.length()){
1067 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
1068 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
1077 catch(exception& e) {
1078 m->errorOut(e, "TrimOligos", "stripReverse");
1082 //******************************************************************/
1083 bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){
1085 string rawSequence = seq.getUnaligned();
1086 bool success = ldiffs + 1; //guilty until proven innocent
1088 for(int i=0;i<linker.size();i++){
1089 string oligo = linker[i];
1091 if(rawSequence.length() < oligo.length()){
1092 success = ldiffs + 10;
1096 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1097 seq.setUnaligned(rawSequence.substr(oligo.length()));
1098 if(qual.getName() != ""){
1099 qual.trimQScores(oligo.length(), -1);
1106 //if you found the linker or if you don't want to allow for diffs
1107 if ((ldiffs == 0) || (success == 0)) { return success; }
1109 else { //try aligning and see if you can find it
1110 Alignment* alignment;
1111 if (linker.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); }
1112 else{ alignment = NULL; }
1114 //can you find the barcode
1119 for(int i = 0; i < linker.size(); i++){
1120 string oligo = linker[i];
1121 // int length = oligo.length();
1123 if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length
1124 success = ldiffs + 10;
1128 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1129 alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs));
1130 oligo = alignment->getSeqAAln();
1131 string temp = alignment->getSeqBAln();
1133 int alnLength = oligo.length();
1135 for(int i=oligo.length()-1;i>=0;i--){
1136 if(oligo[i] != '-'){ alnLength = i+1; break; }
1138 oligo = oligo.substr(0,alnLength);
1139 temp = temp.substr(0,alnLength);
1141 int numDiff = countDiffs(oligo, temp);
1143 if(numDiff < minDiff){
1147 for(int i=0;i<alnLength;i++){
1153 else if(numDiff == minDiff){
1159 if(minDiff > ldiffs) { success = minDiff; } //no good matches
1160 else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes
1161 else{ //use the best match
1162 seq.setUnaligned(rawSequence.substr(minPos));
1164 if(qual.getName() != ""){
1165 qual.trimQScores(minPos, -1);
1170 if (alignment != NULL) { delete alignment; }
1178 catch(exception& e) {
1179 m->errorOut(e, "TrimOligos", "stripLinker");
1183 //******************************************************************/
1184 bool TrimOligos::stripLinker(Sequence& seq){
1187 string rawSequence = seq.getUnaligned();
1188 bool success = ldiffs +1; //guilty until proven innocent
1190 for(int i=0;i<linker.size();i++){
1191 string oligo = linker[i];
1193 if(rawSequence.length() < oligo.length()){
1194 success = ldiffs +10;
1198 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1199 seq.setUnaligned(rawSequence.substr(oligo.length()));
1205 //if you found the linker or if you don't want to allow for diffs
1206 if ((ldiffs == 0) || (success == 0)) { return success; }
1208 else { //try aligning and see if you can find it
1209 Alignment* alignment;
1210 if (linker.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); }
1211 else{ alignment = NULL; }
1213 //can you find the barcode
1218 for(int i = 0; i < linker.size(); i++){
1219 string oligo = linker[i];
1220 // int length = oligo.length();
1222 if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length
1223 success = ldiffs + 10;
1227 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1228 alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs));
1229 oligo = alignment->getSeqAAln();
1230 string temp = alignment->getSeqBAln();
1232 int alnLength = oligo.length();
1234 for(int i=oligo.length()-1;i>=0;i--){
1235 if(oligo[i] != '-'){ alnLength = i+1; break; }
1237 oligo = oligo.substr(0,alnLength);
1238 temp = temp.substr(0,alnLength);
1240 int numDiff = countDiffs(oligo, temp);
1242 if(numDiff < minDiff){
1246 for(int i=0;i<alnLength;i++){
1252 else if(numDiff == minDiff){
1258 if(minDiff > ldiffs) { success = minDiff; } //no good matches
1259 else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes
1260 else{ //use the best match
1261 seq.setUnaligned(rawSequence.substr(minPos));
1265 if (alignment != NULL) { delete alignment; }
1272 catch(exception& e) {
1273 m->errorOut(e, "TrimOligos", "stripLinker");
1278 //******************************************************************/
1279 bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){
1281 string rawSequence = seq.getUnaligned();
1282 bool success = sdiffs+1; //guilty until proven innocent
1284 for(int i=0;i<spacer.size();i++){
1285 string oligo = spacer[i];
1287 if(rawSequence.length() < oligo.length()){
1288 success = sdiffs+10;
1292 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1293 seq.setUnaligned(rawSequence.substr(oligo.length()));
1294 if(qual.getName() != ""){
1295 qual.trimQScores(oligo.length(), -1);
1302 //if you found the spacer or if you don't want to allow for diffs
1303 if ((sdiffs == 0) || (success == 0)) { return success; }
1305 else { //try aligning and see if you can find it
1306 Alignment* alignment;
1307 if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); }
1308 else{ alignment = NULL; }
1310 //can you find the barcode
1315 for(int i = 0; i < spacer.size(); i++){
1316 string oligo = spacer[i];
1317 // int length = oligo.length();
1319 if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length
1320 success = sdiffs + 10;
1324 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1325 alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs));
1326 oligo = alignment->getSeqAAln();
1327 string temp = alignment->getSeqBAln();
1329 int alnLength = oligo.length();
1331 for(int i=oligo.length()-1;i>=0;i--){
1332 if(oligo[i] != '-'){ alnLength = i+1; break; }
1334 oligo = oligo.substr(0,alnLength);
1335 temp = temp.substr(0,alnLength);
1337 int numDiff = countDiffs(oligo, temp);
1339 if(numDiff < minDiff){
1343 for(int i=0;i<alnLength;i++){
1349 else if(numDiff == minDiff){
1355 if(minDiff > sdiffs) { success = minDiff; } //no good matches
1356 else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes
1357 else{ //use the best match
1358 seq.setUnaligned(rawSequence.substr(minPos));
1360 if(qual.getName() != ""){
1361 qual.trimQScores(minPos, -1);
1366 if (alignment != NULL) { delete alignment; }
1374 catch(exception& e) {
1375 m->errorOut(e, "TrimOligos", "stripSpacer");
1379 //******************************************************************/
1380 bool TrimOligos::stripSpacer(Sequence& seq){
1383 string rawSequence = seq.getUnaligned();
1384 bool success = sdiffs+1; //guilty until proven innocent
1386 for(int i=0;i<spacer.size();i++){
1387 string oligo = spacer[i];
1389 if(rawSequence.length() < oligo.length()){
1390 success = sdiffs+10;
1394 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1395 seq.setUnaligned(rawSequence.substr(oligo.length()));
1401 //if you found the spacer or if you don't want to allow for diffs
1402 if ((sdiffs == 0) || (success == 0)) { return success; }
1404 else { //try aligning and see if you can find it
1405 Alignment* alignment;
1406 if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); }
1407 else{ alignment = NULL; }
1409 //can you find the barcode
1414 for(int i = 0; i < spacer.size(); i++){
1415 string oligo = spacer[i];
1416 // int length = oligo.length();
1418 if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length
1419 success = sdiffs + 10;
1423 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1424 alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs));
1425 oligo = alignment->getSeqAAln();
1426 string temp = alignment->getSeqBAln();
1428 int alnLength = oligo.length();
1430 for(int i=oligo.length()-1;i>=0;i--){
1431 if(oligo[i] != '-'){ alnLength = i+1; break; }
1433 oligo = oligo.substr(0,alnLength);
1434 temp = temp.substr(0,alnLength);
1436 int numDiff = countDiffs(oligo, temp);
1438 if(numDiff < minDiff){
1442 for(int i=0;i<alnLength;i++){
1448 else if(numDiff == minDiff){
1454 if(minDiff > sdiffs) { success = minDiff; } //no good matches
1455 else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes
1456 else{ //use the best match
1457 seq.setUnaligned(rawSequence.substr(minPos));
1461 if (alignment != NULL) { delete alignment; }
1468 catch(exception& e) {
1469 m->errorOut(e, "TrimOligos", "stripSpacer");
1474 //******************************************************************/
1475 bool TrimOligos::compareDNASeq(string oligo, string seq){
1478 int length = oligo.length();
1480 for(int i=0;i<length;i++){
1482 if(oligo[i] != seq[i]){
1483 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
1484 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
1485 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
1486 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
1487 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
1488 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1489 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
1490 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1491 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1492 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
1493 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
1494 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
1496 if(success == 0) { break; }
1505 catch(exception& e) {
1506 m->errorOut(e, "TrimOligos", "compareDNASeq");
1511 //********************************************************************/
1512 int TrimOligos::countDiffs(string oligo, string seq){
1515 int length = oligo.length();
1518 for(int i=0;i<length;i++){
1520 if(oligo[i] != seq[i]){
1521 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
1522 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
1523 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
1524 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
1525 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
1526 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1527 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
1528 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1529 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1530 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
1531 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
1532 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
1539 catch(exception& e) {
1540 m->errorOut(e, "TrimOligos", "countDiffs");
1544 /********************************************************************/