5 * Created by westcott on 9/1/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "trimoligos.h"
11 #include "alignment.hpp"
12 #include "needlemanoverlap.hpp"
15 /********************************************************************/
16 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
17 TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
19 m = MothurOut::getInstance();
33 maxFBarcodeLength = 0;
34 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
35 if(it->first.length() > maxFBarcodeLength){
36 maxFBarcodeLength = it->first.length();
40 map<string,int>::iterator it;
41 for(it=primers.begin();it!=primers.end();it++){
42 if(it->first.length() > maxFPrimerLength){
43 maxFPrimerLength = it->first.length();
48 for(int i = 0; i < linker.size(); i++){
49 if(linker[i].length() > maxLinkerLength){
50 maxLinkerLength = linker[i].length();
55 for(int i = 0; i < spacer.size(); i++){
56 if(spacer[i].length() > maxSpacerLength){
57 maxSpacerLength = spacer[i].length();
62 m->errorOut(e, "TrimOligos", "TrimOligos");
66 /********************************************************************/
67 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
68 TrimOligos::TrimOligos(int p, int b, int l, int s, map<int, oligosPair> pr, map<int, oligosPair> br){
70 m = MothurOut::getInstance();
79 maxFBarcodeLength = 0;
80 for(map<int,oligosPair>::iterator it=br.begin();it!=br.end();it++){
81 string forward = it->second.forward;
82 map<string, vector<int> >::iterator itForward = ifbarcodes.find(forward);
84 if(forward.length() > maxFBarcodeLength){ maxFBarcodeLength = forward.length(); }
86 if (itForward == ifbarcodes.end()) {
87 vector<int> temp; temp.push_back(it->first);
88 ifbarcodes[forward] = temp;
89 }else { itForward->second.push_back(it->first); }
93 for(map<int,oligosPair>::iterator it=pr.begin();it!=pr.end();it++){
94 string forward = it->second.forward;
95 map<string, vector<int> >::iterator itForward = ifprimers.find(forward);
97 if(forward.length() > maxFPrimerLength){ maxFPrimerLength = forward.length(); }
99 if (itForward == ifprimers.end()) {
100 vector<int> temp; temp.push_back(it->first);
101 ifprimers[forward] = temp;
102 }else { itForward->second.push_back(it->first); }
105 maxRBarcodeLength = 0;
106 for(map<int,oligosPair>::iterator it=br.begin();it!=br.end();it++){
107 string forward = it->second.reverse;
108 map<string, vector<int> >::iterator itForward = irbarcodes.find(forward);
110 if(forward.length() > maxRBarcodeLength){ maxRBarcodeLength = forward.length(); }
112 if (itForward == irbarcodes.end()) {
113 vector<int> temp; temp.push_back(it->first);
114 irbarcodes[forward] = temp;
115 }else { itForward->second.push_back(it->first); }
118 maxRPrimerLength = 0;
119 for(map<int,oligosPair>::iterator it=pr.begin();it!=pr.end();it++){
120 string forward = it->second.reverse;
121 map<string, vector<int> >::iterator itForward = irprimers.find(forward);
123 if(forward.length() > maxRPrimerLength){ maxRPrimerLength = forward.length(); }
125 if (itForward == irprimers.end()) {
126 vector<int> temp; temp.push_back(it->first);
127 irprimers[forward] = temp;
128 }else { itForward->second.push_back(it->first); }
134 catch(exception& e) {
135 m->errorOut(e, "TrimOligos", "TrimOligos");
139 /********************************************************************/
140 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
141 TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
143 m = MothurOut::getInstance();
154 maxFBarcodeLength = 0;
155 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
156 string oligo = it->first;
157 if(oligo.length() > maxFBarcodeLength){
158 maxFBarcodeLength = oligo.length();
161 maxRBarcodeLength = maxFBarcodeLength;
163 maxFPrimerLength = 0;
164 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
165 string oligo = it->first;
166 if(oligo.length() > maxFPrimerLength){
167 maxFPrimerLength = oligo.length();
170 maxRPrimerLength = maxFPrimerLength;
172 catch(exception& e) {
173 m->errorOut(e, "TrimOligos", "TrimOligos");
177 /********************************************************************/
178 TrimOligos::~TrimOligos() {}
179 //********************************************************************/
180 bool TrimOligos::findForward(Sequence& seq, int& primerStart, int& primerEnd){
183 string rawSequence = seq.getUnaligned();
186 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
187 string oligo = it->first;
189 if(rawSequence.length() < oligo.length()) { break; }
192 int olength = oligo.length();
193 for (int j = 0; j < rawSequence.length()-olength; j++){
194 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
195 string rawChunk = rawSequence.substr(j, olength);
196 if(compareDNASeq(oligo, rawChunk)) {
198 primerEnd = primerStart + olength;
205 primerStart = 0; primerEnd = 0;
206 //if you don't want to allow for diffs
207 if (pdiffs == 0) { trashCode = "f"; return false; }
208 else { //try aligning and see if you can find it
210 //can you find the barcode
214 Alignment* alignment;
215 if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
216 else{ alignment = NULL; }
218 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
220 string prim = it->first;
222 int olength = prim.length();
223 if (rawSequence.length() < olength) {} //ignore primers too long for this seq
225 for (int j = 0; j < rawSequence.length()-olength; j++){
227 string oligo = it->first;
229 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
231 string rawChunk = rawSequence.substr(j, olength+pdiffs);
233 //use needleman to align first primer.length()+numdiffs of sequence to each barcode
234 alignment->alignPrimer(oligo, rawChunk);
235 oligo = alignment->getSeqAAln();
236 string temp = alignment->getSeqBAln();
238 int alnLength = oligo.length();
240 for(int i=oligo.length()-1;i>=0;i--){
241 if(oligo[i] != '-'){ alnLength = i+1; break; }
243 oligo = oligo.substr(0,alnLength);
244 temp = temp.substr(0,alnLength);
246 int numDiff = countDiffs(oligo, temp);
248 if(numDiff < minDiff){
252 primerEnd = primerStart + alnLength;
253 }else if(numDiff == minDiff){ minCount++; }
258 if (alignment != NULL) { delete alignment; }
260 if(minDiff > pdiffs) { primerStart = 0; primerEnd = 0; trashCode = "f"; return false; } //no good matches
261 else if(minCount > 1) { primerStart = 0; primerEnd = 0; trashCode = "f"; return false; } //can't tell the difference between multiple primers
262 else{ trashCode = ""; return true; }
266 primerStart = 0; primerEnd = 0;
270 catch(exception& e) {
271 m->errorOut(e, "TrimOligos", "stripForward");
275 //******************************************************************/
276 bool TrimOligos::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
279 string rawSequence = seq.getUnaligned();
281 for(int i=0;i<revPrimer.size();i++){
282 string oligo = revPrimer[i];
283 if(rawSequence.length() < oligo.length()) { break; }
286 int olength = oligo.length();
287 for (int j = rawSequence.length()-olength; j >= 0; j--){
288 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
289 string rawChunk = rawSequence.substr(j, olength);
291 if(compareDNASeq(oligo, rawChunk)) {
293 primerEnd = primerStart + olength;
301 primerStart = 0; primerEnd = 0;
304 catch(exception& e) {
305 m->errorOut(e, "PcrSeqsCommand", "findReverse");
309 //*******************************************************************/
311 int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
314 if (paired) { int success = stripPairedBarcode(seq, qual, group); return success; }
316 string rawSequence = seq.getUnaligned();
317 int success = bdiffs + 1; //guilty until proven innocent
319 //can you find the barcode
320 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
321 string oligo = it->first;
322 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
323 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
327 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
329 seq.setUnaligned(rawSequence.substr(oligo.length()));
331 if(qual.getName() != ""){
332 qual.trimQScores(oligo.length(), -1);
341 //if you found the barcode or if you don't want to allow for diffs
342 if (bdiffs == 0) { trashCode = "b"; return success; }
344 else { //try aligning and see if you can find it
345 Alignment* alignment;
346 if (barcodes.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
347 else{ alignment = NULL; }
349 //can you find the barcode
355 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
356 string oligo = it->first;
357 // int length = oligo.length();
359 if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
360 success = bdiffs + 10; trashCode = "b";
364 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
365 alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
366 oligo = alignment->getSeqAAln();
367 string temp = alignment->getSeqBAln();
369 int alnLength = oligo.length();
371 for(int i=oligo.length()-1;i>=0;i--){
372 if(oligo[i] != '-'){ alnLength = i+1; break; }
374 oligo = oligo.substr(0,alnLength);
375 temp = temp.substr(0,alnLength);
376 int numDiff = countDiffs(oligo, temp);
378 if(numDiff < minDiff){
381 minGroup = it->second;
383 for(int i=0;i<alnLength;i++){
389 else if(numDiff == minDiff){
395 if(minDiff > bdiffs) { trashCode = "b"; success = minDiff; } //no good matches
396 else if(minCount > 1) { trashCode = "b"; success = bdiffs + 100; } //can't tell the difference between multiple barcodes
397 else{ //use the best match
399 seq.setUnaligned(rawSequence.substr(minPos));
401 if(qual.getName() != ""){
402 qual.trimQScores(minPos, -1);
408 if (alignment != NULL) { delete alignment; }
415 catch(exception& e) {
416 m->errorOut(e, "TrimOligos", "stripBarcode");
420 //*******************************************************************/
421 int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, int& group){
424 //look for forward barcode
425 string rawFSequence = forwardSeq.getUnaligned();
426 string rawRSequence = reverseSeq.getUnaligned();
427 int success = bdiffs + 1; //guilty until proven innocent
429 //can you find the forward barcode
430 for(map<int,oligosPair>::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){
431 string foligo = it->second.forward;
432 string roligo = it->second.reverse;
434 if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length
435 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
439 if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length
440 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
445 if(compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) {
446 if (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) {
448 forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
449 reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
450 success = 0; trashCode = "";
455 if (!compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { trashCode += "d"; }
459 //if you found the barcode or if you don't want to allow for diffs
460 if (bdiffs == 0) { return success; }
461 else { //try aligning and see if you can find it
463 Alignment* alignment;
464 if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
465 else{ alignment = NULL; }
467 //can you find the barcode
470 vector< vector<int> > minFGroup;
473 //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
483 only want to look for forward = Sarah, John, Anna, Pat, Gail
484 reverse = Westcott, Schloss, Brown, Moore
486 but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
488 //cout << endl << forwardSeq.getName() << endl;
489 for(map<string, vector<int> >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){
490 string oligo = it->first;
492 if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
493 success = bdiffs + 10;
496 //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl;
497 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
498 alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+bdiffs));
499 oligo = alignment->getSeqAAln();
500 string temp = alignment->getSeqBAln();
502 int alnLength = oligo.length();
504 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
505 oligo = oligo.substr(0,alnLength);
506 temp = temp.substr(0,alnLength);
507 int numDiff = countDiffs(oligo, temp);
509 if (alnLength == 0) { numDiff = bdiffs + 100; }
510 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
512 if(numDiff < minDiff){
516 minFGroup.push_back(it->second);
519 for(int i=0;i<alnLength;i++){
524 minFPos.push_back(tempminFPos);
525 }else if(numDiff == minDiff){
526 minFGroup.push_back(it->second);
528 for(int i=0;i<alnLength;i++){
533 minFPos.push_back(tempminFPos);
537 //cout << minDiff << '\t' << minCount << '\t' << endl;
538 if(minDiff > bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches
540 //check for reverse match
541 if (alignment != NULL) { delete alignment; }
543 if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); }
544 else{ alignment = NULL; }
546 //can you find the barcode
549 vector< vector<int> > minRGroup;
552 for(map<string, vector<int> >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){
553 string oligo = it->first;
554 //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+bdiffs) << endl;
555 if(rawRSequence.length() < maxRBarcodeLength){ //let's just assume that the barcodes are the same length
556 success = bdiffs + 10;
560 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
561 alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+bdiffs));
562 oligo = alignment->getSeqAAln();
563 string temp = alignment->getSeqBAln();
565 int alnLength = oligo.length();
566 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
567 oligo = oligo.substr(0,alnLength);
568 temp = temp.substr(0,alnLength);
569 int numDiff = countDiffs(oligo, temp);
570 if (alnLength == 0) { numDiff = bdiffs + 100; }
572 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
573 if(numDiff < minDiff){
577 minRGroup.push_back(it->second);
580 for(int i=0;i<alnLength;i++){
585 minRPos.push_back(tempminRPos);
586 }else if(numDiff == minDiff){
588 for(int i=0;i<alnLength;i++){
593 minRPos.push_back(tempminRPos);
594 minRGroup.push_back(it->second);
599 /*cout << minDiff << '\t' << minCount << '\t' << endl;
600 for (int i = 0; i < minFGroup.size(); i++) {
602 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
606 for (int i = 0; i < minRGroup.size(); i++) {
608 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
612 if(minDiff > bdiffs) { trashCode += "d"; success = minDiff; } //no good matches
614 bool foundMatch = false;
616 for (int i = 0; i < minFGroup.size(); i++) {
617 for (int j = 0; j < minFGroup[i].size(); j++) {
618 for (int k = 0; k < minRGroup.size(); k++) {
619 if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
626 if (matches.size() == 1) {
633 //we have an acceptable match for the forward and reverse, but do they match?
635 forwardSeq.setUnaligned(rawFSequence.substr(fStart));
636 reverseSeq.setUnaligned(rawRSequence.substr(rStart));
638 }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; }
642 if (alignment != NULL) { delete alignment; }
645 //catchall for case I didn't think of
646 if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; }
651 catch(exception& e) {
652 m->errorOut(e, "TrimOligos", "stripIBarcode");
657 //*******************************************************************/
658 int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){
662 //look for forward barcode
663 string rawFSequence = forwardSeq.getUnaligned();
664 string rawRSequence = reverseSeq.getUnaligned();
665 int success = bdiffs + 1; //guilty until proven innocent
667 //can you find the forward barcode
668 for(map<int,oligosPair>::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){
669 string foligo = it->second.forward;
670 string roligo = it->second.reverse;
672 if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length
673 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
677 if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length
678 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
683 if(compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) {
684 if (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) {
686 forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
687 reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
688 forwardQual.trimQScores(foligo.length(), -1);
689 reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
690 success = 0; trashCode = "";
695 if (!compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length()))) { trashCode += "d";
700 //if you found the barcode or if you don't want to allow for diffs
701 if (bdiffs == 0) { return success; }
702 else { //try aligning and see if you can find it
704 Alignment* alignment;
705 if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
706 else{ alignment = NULL; }
708 //can you find the barcode
711 vector< vector<int> > minFGroup;
714 //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
724 only want to look for forward = Sarah, John, Anna, Pat, Gail
725 reverse = Westcott, Schloss, Brown, Moore
727 but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
729 //cout << endl << forwardSeq.getName() << endl;
730 for(map<string, vector<int> >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){
731 string oligo = it->first;
733 if(rawFSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
734 success = bdiffs + 10;
737 //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+bdiffs) << endl;
738 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
739 alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+bdiffs));
740 oligo = alignment->getSeqAAln();
741 string temp = alignment->getSeqBAln();
743 int alnLength = oligo.length();
745 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
746 oligo = oligo.substr(0,alnLength);
747 temp = temp.substr(0,alnLength);
748 int numDiff = countDiffs(oligo, temp);
750 if (alnLength == 0) { numDiff = bdiffs + 100; }
751 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
753 if(numDiff < minDiff){
757 minFGroup.push_back(it->second);
760 for(int i=0;i<alnLength;i++){
765 minFPos.push_back(tempminFPos);
766 }else if(numDiff == minDiff){
767 minFGroup.push_back(it->second);
769 for(int i=0;i<alnLength;i++){
774 minFPos.push_back(tempminFPos);
778 //cout << minDiff << '\t' << minCount << '\t' << endl;
779 if(minDiff > bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches
781 //check for reverse match
782 if (alignment != NULL) { delete alignment; }
784 if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); }
785 else{ alignment = NULL; }
787 //can you find the barcode
790 vector< vector<int> > minRGroup;
793 for(map<string, vector<int> >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){
794 string oligo = it->first;
795 //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+bdiffs) << endl;
796 if(rawRSequence.length() < maxRBarcodeLength){ //let's just assume that the barcodes are the same length
797 success = bdiffs + 10;
801 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
802 alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+bdiffs));
803 oligo = alignment->getSeqAAln();
804 string temp = alignment->getSeqBAln();
806 int alnLength = oligo.length();
807 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
808 oligo = oligo.substr(0,alnLength);
809 temp = temp.substr(0,alnLength);
810 int numDiff = countDiffs(oligo, temp);
811 if (alnLength == 0) { numDiff = bdiffs + 100; }
813 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
814 if(numDiff < minDiff){
818 minRGroup.push_back(it->second);
821 for(int i=0;i<alnLength;i++){
826 minRPos.push_back(tempminRPos);
827 }else if(numDiff == minDiff){
829 for(int i=0;i<alnLength;i++){
834 minRPos.push_back(tempminRPos);
835 minRGroup.push_back(it->second);
840 /*cout << minDiff << '\t' << minCount << '\t' << endl;
841 for (int i = 0; i < minFGroup.size(); i++) {
843 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
847 for (int i = 0; i < minRGroup.size(); i++) {
849 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
853 if(minDiff > bdiffs) { trashCode = "d"; success = minDiff; } //no good matches
855 bool foundMatch = false;
857 for (int i = 0; i < minFGroup.size(); i++) {
858 for (int j = 0; j < minFGroup[i].size(); j++) {
859 for (int k = 0; k < minRGroup.size(); k++) {
860 if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
867 if (matches.size() == 1) {
874 //we have an acceptable match for the forward and reverse, but do they match?
876 forwardSeq.setUnaligned(rawFSequence.substr(fStart));
877 reverseSeq.setUnaligned(rawRSequence.substr(rStart));
878 forwardQual.trimQScores(fStart, -1);
879 reverseQual.trimQScores(rStart, -1);
881 }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; }
885 if (alignment != NULL) { delete alignment; }
888 //catchall for case I didn't think of
889 if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; }
894 catch(exception& e) {
895 m->errorOut(e, "TrimOligos", "stripIBarcode");
900 //*******************************************************************/
901 int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& group){
905 //look for forward barcode
906 string rawSeq = seq.getUnaligned();
907 int success = bdiffs + 1; //guilty until proven innocent
908 //cout << seq.getName() << endl;
909 //can you find the forward barcode
910 for(map<int,oligosPair>::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){
911 string foligo = it->second.forward;
912 string roligo = it->second.reverse;
913 //cout << it->first << '\t' << foligo << '\t' << roligo << endl;
914 //cout << it->first << '\t' << rawSeq.substr(0,foligo.length()) << '\t' << rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()) << endl;
915 if(rawSeq.length() < foligo.length()){ //let's just assume that the barcodes are the same length
916 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
920 if(rawSeq.length() < roligo.length()){ //let's just assume that the barcodes are the same length
921 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
926 if(compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) { //find forward
927 if (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { //find reverse
929 string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode
930 seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode
931 if(qual.getName() != ""){
932 qual.trimQScores(-1, rawSeq.length()-roligo.length());
933 qual.trimQScores(foligo.length(), -1);
941 if (!compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { trashCode += "d"; }
944 //cout << "success=" << success << endl;
945 //if you found the barcode or if you don't want to allow for diffs
946 if (bdiffs == 0) { return success; }
947 else { //try aligning and see if you can find it
948 Alignment* alignment;
949 if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
950 else{ alignment = NULL; }
952 //can you find the barcode
955 vector< vector<int> > minFGroup;
958 //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
968 only want to look for forward = Sarah, John, Anna, Pat, Gail
969 reverse = Westcott, Schloss, Brown, Moore
971 but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
973 //cout << endl << forwardSeq.getName() << endl;
974 for(map<string, vector<int> >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){
975 string oligo = it->first;
977 if(rawSeq.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
978 success = bdiffs + 10;
981 //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl;
982 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
983 alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+bdiffs));
984 oligo = alignment->getSeqAAln();
985 string temp = alignment->getSeqBAln();
987 int alnLength = oligo.length();
989 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
990 oligo = oligo.substr(0,alnLength);
991 temp = temp.substr(0,alnLength);
992 int numDiff = countDiffs(oligo, temp);
994 if (alnLength == 0) { numDiff = bdiffs + 100; }
995 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
997 if(numDiff < minDiff){
1001 minFGroup.push_back(it->second);
1002 int tempminFPos = 0;
1004 for(int i=0;i<alnLength;i++){
1009 minFPos.push_back(tempminFPos);
1010 }else if(numDiff == minDiff){
1011 minFGroup.push_back(it->second);
1012 int tempminFPos = 0;
1013 for(int i=0;i<alnLength;i++){
1018 minFPos.push_back(tempminFPos);
1022 //cout << minDiff << '\t' << minCount << '\t' << endl;
1023 if(minDiff > bdiffs) { trashCode = "b"; minFGroup.clear(); success = minDiff; } //no good matches
1025 //check for reverse match
1026 if (alignment != NULL) { delete alignment; }
1028 if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); }
1029 else{ alignment = NULL; }
1031 //can you find the barcode
1034 vector< vector<int> > minRGroup;
1035 vector<int> minRPos;
1037 string rawRSequence = reverseOligo(seq.getUnaligned());
1038 //cout << irbarcodes.size() << '\t' << maxRBarcodeLength << endl;
1039 for(map<string, vector<int> >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){
1040 string oligo = reverseOligo(it->first);
1041 //cout << "r before = " << reverseOligo(oligo) << '\t' << reverseOligo(rawRSequence.substr(0,oligo.length()+bdiffs)) << endl;
1042 if(rawRSequence.length() < maxRBarcodeLength){ //let's just assume that the barcodes are the same length
1043 success = bdiffs + 10;
1047 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1048 alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+bdiffs));
1049 oligo = alignment->getSeqAAln();
1050 string temp = alignment->getSeqBAln();
1052 int alnLength = oligo.length();
1053 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
1054 oligo = oligo.substr(0,alnLength);
1055 temp = temp.substr(0,alnLength);
1056 int numDiff = countDiffs(oligo, temp);
1057 if (alnLength == 0) { numDiff = bdiffs + 100; }
1059 //cout << "r after = " << reverseOligo(oligo) << '\t' << reverseOligo(temp) << '\t' << numDiff << endl;
1060 if(numDiff < minDiff){
1064 minRGroup.push_back(it->second);
1065 int tempminRPos = 0;
1067 for(int i=0;i<alnLength;i++){
1072 minRPos.push_back(tempminRPos);
1073 }else if(numDiff == minDiff){
1074 int tempminRPos = 0;
1075 for(int i=0;i<alnLength;i++){
1080 minRPos.push_back(tempminRPos);
1081 minRGroup.push_back(it->second);
1086 /*cout << minDiff << '\t' << minCount << '\t' << endl;
1087 for (int i = 0; i < minFGroup.size(); i++) {
1089 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
1093 for (int i = 0; i < minRGroup.size(); i++) {
1095 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
1099 if(minDiff > bdiffs) { trashCode += "d"; success = minDiff; } //no good matches
1101 bool foundMatch = false;
1102 vector<int> matches;
1103 for (int i = 0; i < minFGroup.size(); i++) {
1104 for (int j = 0; j < minFGroup[i].size(); j++) {
1105 for (int k = 0; k < minRGroup.size(); k++) {
1106 if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
1113 if (matches.size() == 1) {
1116 fStart = minFPos[0];
1117 rStart = rawSeq.length() - minRPos[0];
1120 //we have an acceptable match for the forward and reverse, but do they match?
1122 string trimmedSeq = rawSeq.substr(0, rStart); //trim reverse barcode
1123 seq.setUnaligned(trimmedSeq.substr(fStart)); //trim forward barcode
1124 if(qual.getName() != ""){
1125 qual.trimQScores(-1, rStart);
1126 qual.trimQScores(fStart, -1);
1129 //cout << "barcode = " << ipbarcodes[group].forward << '\t' << ipbarcodes[group].reverse << endl;
1130 }else { if (trashCode == "") { trashCode = "bd"; } success = bdiffs + 100; }
1134 if (alignment != NULL) { delete alignment; }
1137 //catchall for case I didn't think of
1138 if ((trashCode == "") && (success > bdiffs)) { trashCode = "bd"; }
1143 catch(exception& e) {
1144 m->errorOut(e, "TrimOligos", "stripPairedBarcode");
1149 //*******************************************************************/
1150 int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& group, bool keepForward){
1154 string rawSeq = seq.getUnaligned();
1155 int success = pdiffs + 1; //guilty until proven innocent
1156 //cout << seq.getName() << endl;
1157 //can you find the forward
1158 for(map<int,oligosPair>::iterator it=ipprimers.begin();it!=ipprimers.end();it++){
1159 string foligo = it->second.forward;
1160 string roligo = it->second.reverse;
1162 //cout << it->first << '\t' << foligo << '\t' << roligo << endl;
1163 //cout << it->first << '\t' << rawSeq.substr(0,foligo.length()) << '\t' << rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()) << endl;
1164 if(rawSeq.length() < foligo.length()){ //let's just assume that the barcodes are the same length
1165 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1169 if(rawSeq.length() < roligo.length()){ //let's just assume that the barcodes are the same length
1170 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1175 if(compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) { //find forward
1176 if (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { //find reverse
1178 string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode
1179 seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode
1180 if(qual.getName() != ""){
1181 qual.trimQScores(-1, rawSeq.length()-roligo.length());
1182 qual.trimQScores(foligo.length(), -1);
1190 if (!compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()))) { trashCode += "d"; }
1194 //cout << "success=" << success << endl;
1195 //if you found the barcode or if you don't want to allow for diffs
1196 if (pdiffs == 0) { return success; }
1197 else { //try aligning and see if you can find it
1198 Alignment* alignment;
1199 if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
1200 else{ alignment = NULL; }
1202 //can you find the barcode
1205 vector< vector<int> > minFGroup;
1206 vector<int> minFPos;
1208 //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
1218 only want to look for forward = Sarah, John, Anna, Pat, Gail
1219 reverse = Westcott, Schloss, Brown, Moore
1221 but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
1223 //cout << endl << forwardSeq.getName() << endl;
1224 for(map<string, vector<int> >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){
1225 string oligo = it->first;
1227 if(rawSeq.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length
1228 success = pdiffs + 10;
1231 //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl;
1232 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1233 alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+pdiffs));
1234 oligo = alignment->getSeqAAln();
1235 string temp = alignment->getSeqBAln();
1237 int alnLength = oligo.length();
1239 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
1240 oligo = oligo.substr(0,alnLength);
1241 temp = temp.substr(0,alnLength);
1242 int numDiff = countDiffs(oligo, temp);
1244 if (alnLength == 0) { numDiff = pdiffs + 100; }
1245 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
1247 if(numDiff < minDiff){
1251 minFGroup.push_back(it->second);
1252 int tempminFPos = 0;
1254 for(int i=0;i<alnLength;i++){
1259 minFPos.push_back(tempminFPos);
1260 }else if(numDiff == minDiff){
1261 minFGroup.push_back(it->second);
1262 int tempminFPos = 0;
1263 for(int i=0;i<alnLength;i++){
1268 minFPos.push_back(tempminFPos);
1272 //cout << minDiff << '\t' << minCount << '\t' << endl;
1273 if(minDiff > pdiffs) { trashCode = "p"; minFGroup.clear(); success = minDiff; } //no good matches
1275 //check for reverse match
1276 if (alignment != NULL) { delete alignment; }
1278 if (irprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); }
1279 else{ alignment = NULL; }
1281 //can you find the barcode
1284 vector< vector<int> > minRGroup;
1285 vector<int> minRPos;
1287 string rawRSequence = reverseOligo(seq.getUnaligned());
1289 for(map<string, vector<int> >::iterator it=irprimers.begin();it!=irprimers.end();it++){
1290 string oligo = reverseOligo(it->first);
1291 //cout << "r before = " << reverseOligo(oligo) << '\t' << reverseOligo(rawRSequence.substr(0,oligo.length()+pdiffs)) << endl;
1292 if(rawRSequence.length() < maxRPrimerLength){ //let's just assume that the barcodes are the same length
1293 success = pdiffs + 10;
1297 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1298 alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+pdiffs));
1299 oligo = alignment->getSeqAAln();
1300 string temp = alignment->getSeqBAln();
1302 int alnLength = oligo.length();
1303 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
1304 oligo = oligo.substr(0,alnLength);
1305 temp = temp.substr(0,alnLength);
1306 int numDiff = countDiffs(oligo, temp);
1307 if (alnLength == 0) { numDiff = pdiffs + 100; }
1309 //cout << "r after = " << reverseOligo(oligo) << '\t' << reverseOligo(temp) << '\t' << numDiff << endl;
1310 if(numDiff < minDiff){
1314 minRGroup.push_back(it->second);
1315 int tempminRPos = 0;
1317 for(int i=0;i<alnLength;i++){
1322 minRPos.push_back(tempminRPos);
1323 }else if(numDiff == minDiff){
1324 int tempminRPos = 0;
1325 for(int i=0;i<alnLength;i++){
1330 minRPos.push_back(tempminRPos);
1331 minRGroup.push_back(it->second);
1336 /*cout << minDiff << '\t' << minCount << '\t' << endl;
1337 for (int i = 0; i < minFGroup.size(); i++) {
1339 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
1343 for (int i = 0; i < minRGroup.size(); i++) {
1345 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
1349 if(minDiff > pdiffs) { trashCode += "q"; success = minDiff; } //no good matches
1351 bool foundMatch = false;
1352 vector<int> matches;
1353 for (int i = 0; i < minFGroup.size(); i++) {
1354 for (int j = 0; j < minFGroup[i].size(); j++) {
1355 for (int k = 0; k < minRGroup.size(); k++) {
1356 if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
1363 if (matches.size() == 1) {
1366 fStart = minFPos[0];
1367 rStart = rawSeq.length() - minRPos[0];
1370 //we have an acceptable match for the forward and reverse, but do they match?
1373 string trimmedSeq = rawSeq.substr(0, rStart); //trim reverse barcode
1374 seq.setUnaligned(trimmedSeq.substr(fStart)); //trim forward barcode
1375 if(qual.getName() != ""){
1376 qual.trimQScores(-1, rStart);
1377 qual.trimQScores(fStart, -1);
1381 //cout << "primer = " << ipprimers[group].forward << '\t' << ipprimers[group].reverse << endl;
1382 }else { if (trashCode == "") { trashCode = "pq"; } success = pdiffs + 100; }
1386 if (alignment != NULL) { delete alignment; }
1389 //catchall for case I didn't think of
1390 if ((trashCode == "") && (success > pdiffs)) { trashCode = "pq"; }
1395 catch(exception& e) {
1396 m->errorOut(e, "TrimOligos", "stripPairedPrimers");
1402 //*******************************************************************/
1403 int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){
1405 //look for forward barcode
1406 string rawFSequence = forwardSeq.getUnaligned();
1407 string rawRSequence = reverseSeq.getUnaligned();
1408 int success = pdiffs + 1; //guilty until proven innocent
1410 //can you find the forward barcode
1411 for(map<int,oligosPair>::iterator it=ipprimers.begin();it!=ipprimers.end();it++){
1412 string foligo = it->second.forward;
1413 string roligo = it->second.reverse;
1415 if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length
1416 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1419 if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length
1420 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1424 if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
1426 forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
1427 reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
1428 forwardQual.trimQScores(foligo.length(), -1);
1429 reverseQual.trimQScores(-1, rawRSequence.length()-roligo.length());
1435 //if you found the barcode or if you don't want to allow for diffs
1436 if ((pdiffs == 0) || (success == 0)) { return success; }
1437 else { //try aligning and see if you can find it
1438 Alignment* alignment;
1439 if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
1440 else{ alignment = NULL; }
1442 //can you find the barcode
1445 vector< vector<int> > minFGroup;
1446 vector<int> minFPos;
1448 //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
1458 only want to look for forward = Sarah, John, Anna, Pat, Gail
1459 reverse = Westcott, Schloss, Brown, Moore
1461 but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
1463 //cout << endl << forwardSeq.getName() << endl;
1464 for(map<string, vector<int> >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){
1465 string oligo = it->first;
1467 if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length
1468 success = pdiffs + 10;
1471 //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl;
1472 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1473 alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+pdiffs));
1474 oligo = alignment->getSeqAAln();
1475 string temp = alignment->getSeqBAln();
1477 int alnLength = oligo.length();
1479 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
1480 oligo = oligo.substr(0,alnLength);
1481 temp = temp.substr(0,alnLength);
1482 int numDiff = countDiffs(oligo, temp);
1484 if (alnLength == 0) { numDiff = pdiffs + 100; }
1485 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
1487 if(numDiff < minDiff){
1491 minFGroup.push_back(it->second);
1492 int tempminFPos = 0;
1494 for(int i=0;i<alnLength;i++){
1499 minFPos.push_back(tempminFPos);
1500 }else if(numDiff == minDiff){
1501 minFGroup.push_back(it->second);
1502 int tempminFPos = 0;
1503 for(int i=0;i<alnLength;i++){
1508 minFPos.push_back(tempminFPos);
1512 //cout << minDiff << '\t' << minCount << '\t' << endl;
1513 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1515 //check for reverse match
1516 if (alignment != NULL) { delete alignment; }
1518 if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); }
1519 else{ alignment = NULL; }
1521 //can you find the barcode
1524 vector< vector<int> > minRGroup;
1525 vector<int> minRPos;
1527 for(map<string, vector<int> >::iterator it=irprimers.begin();it!=irprimers.end();it++){
1528 string oligo = it->first;
1529 //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+pdiffs) << endl;
1530 if(rawRSequence.length() < maxRPrimerLength){ //let's just assume that the barcodes are the same length
1531 success = pdiffs + 10;
1535 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1536 alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+pdiffs));
1537 oligo = alignment->getSeqAAln();
1538 string temp = alignment->getSeqBAln();
1540 int alnLength = oligo.length();
1541 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
1542 oligo = oligo.substr(0,alnLength);
1543 temp = temp.substr(0,alnLength);
1544 int numDiff = countDiffs(oligo, temp);
1545 if (alnLength == 0) { numDiff = pdiffs + 100; }
1547 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
1548 if(numDiff < minDiff){
1552 minRGroup.push_back(it->second);
1553 int tempminRPos = 0;
1555 for(int i=0;i<alnLength;i++){
1560 minRPos.push_back(tempminRPos);
1561 }else if(numDiff == minDiff){
1562 int tempminRPos = 0;
1563 for(int i=0;i<alnLength;i++){
1568 minRPos.push_back(tempminRPos);
1569 minRGroup.push_back(it->second);
1574 /*cout << minDiff << '\t' << minCount << '\t' << endl;
1575 for (int i = 0; i < minFGroup.size(); i++) {
1577 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
1581 for (int i = 0; i < minRGroup.size(); i++) {
1583 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
1587 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1589 bool foundMatch = false;
1590 vector<int> matches;
1591 for (int i = 0; i < minFGroup.size(); i++) {
1592 for (int j = 0; j < minFGroup[i].size(); j++) {
1593 for (int k = 0; k < minRGroup.size(); k++) {
1594 if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
1601 if (matches.size() == 1) {
1604 fStart = minFPos[0];
1605 rStart = minRPos[0];
1608 //we have an acceptable match for the forward and reverse, but do they match?
1610 forwardSeq.setUnaligned(rawFSequence.substr(fStart));
1611 reverseSeq.setUnaligned(rawRSequence.substr(rStart));
1612 forwardQual.trimQScores(fStart, -1);
1613 reverseQual.trimQScores(rStart, -1);
1615 }else { success = pdiffs + 100; }
1619 if (alignment != NULL) { delete alignment; }
1625 catch(exception& e) {
1626 m->errorOut(e, "TrimOligos", "stripIForward");
1631 //*******************************************************************/
1632 int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& group){
1634 //look for forward barcode
1635 string rawFSequence = forwardSeq.getUnaligned();
1636 string rawRSequence = reverseSeq.getUnaligned();
1637 int success = pdiffs + 1; //guilty until proven innocent
1639 //can you find the forward barcode
1640 for(map<int,oligosPair>::iterator it=ipprimers.begin();it!=ipprimers.end();it++){
1641 string foligo = it->second.forward;
1642 string roligo = it->second.reverse;
1644 if(rawFSequence.length() < foligo.length()){ //let's just assume that the barcodes are the same length
1645 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1648 if(rawRSequence.length() < roligo.length()){ //let's just assume that the barcodes are the same length
1649 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1653 if((compareDNASeq(foligo, rawFSequence.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawRSequence.substr((rawRSequence.length()-roligo.length()),roligo.length())))) {
1655 forwardSeq.setUnaligned(rawFSequence.substr(foligo.length()));
1656 reverseSeq.setUnaligned(rawRSequence.substr(0,(rawRSequence.length()-roligo.length())));
1662 //if you found the barcode or if you don't want to allow for diffs
1663 if ((pdiffs == 0) || (success == 0)) { return success; }
1664 else { //try aligning and see if you can find it
1665 Alignment* alignment;
1666 if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
1667 else{ alignment = NULL; }
1669 //can you find the barcode
1672 vector< vector<int> > minFGroup;
1673 vector<int> minFPos;
1675 //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
1685 only want to look for forward = Sarah, John, Anna, Pat, Gail
1686 reverse = Westcott, Schloss, Brown, Moore
1688 but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
1690 //cout << endl << forwardSeq.getName() << endl;
1691 for(map<string, vector<int> >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){
1692 string oligo = it->first;
1694 if(rawFSequence.length() < maxFPrimerLength){ //let's just assume that the barcodes are the same length
1695 success = pdiffs + 10;
1698 //cout << "before = " << oligo << '\t' << rawFSequence.substr(0,oligo.length()+pdiffs) << endl;
1699 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1700 alignment->alignPrimer(oligo, rawFSequence.substr(0,oligo.length()+pdiffs));
1701 oligo = alignment->getSeqAAln();
1702 string temp = alignment->getSeqBAln();
1704 int alnLength = oligo.length();
1706 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
1707 oligo = oligo.substr(0,alnLength);
1708 temp = temp.substr(0,alnLength);
1709 int numDiff = countDiffs(oligo, temp);
1711 if (alnLength == 0) { numDiff = pdiffs + 100; }
1712 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
1714 if(numDiff < minDiff){
1718 minFGroup.push_back(it->second);
1719 int tempminFPos = 0;
1721 for(int i=0;i<alnLength;i++){
1726 minFPos.push_back(tempminFPos);
1727 }else if(numDiff == minDiff){
1728 minFGroup.push_back(it->second);
1729 int tempminFPos = 0;
1730 for(int i=0;i<alnLength;i++){
1735 minFPos.push_back(tempminFPos);
1739 //cout << minDiff << '\t' << minCount << '\t' << endl;
1740 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1742 //check for reverse match
1743 if (alignment != NULL) { delete alignment; }
1745 if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); }
1746 else{ alignment = NULL; }
1748 //can you find the barcode
1751 vector< vector<int> > minRGroup;
1752 vector<int> minRPos;
1754 for(map<string, vector<int> >::iterator it=irprimers.begin();it!=irprimers.end();it++){
1755 string oligo = it->first;
1756 //cout << "before = " << oligo << '\t' << rawRSequence.substr(0,oligo.length()+pdiffs) << endl;
1757 if(rawRSequence.length() < maxRPrimerLength){ //let's just assume that the barcodes are the same length
1758 success = pdiffs + 10;
1762 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1763 alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+pdiffs));
1764 oligo = alignment->getSeqAAln();
1765 string temp = alignment->getSeqBAln();
1767 int alnLength = oligo.length();
1768 for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1; break; } }
1769 oligo = oligo.substr(0,alnLength);
1770 temp = temp.substr(0,alnLength);
1771 int numDiff = countDiffs(oligo, temp);
1772 if (alnLength == 0) { numDiff = pdiffs + 100; }
1774 //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
1775 if(numDiff < minDiff){
1779 minRGroup.push_back(it->second);
1780 int tempminRPos = 0;
1782 for(int i=0;i<alnLength;i++){
1787 minRPos.push_back(tempminRPos);
1788 }else if(numDiff == minDiff){
1789 int tempminRPos = 0;
1790 for(int i=0;i<alnLength;i++){
1795 minRPos.push_back(tempminRPos);
1796 minRGroup.push_back(it->second);
1801 /*cout << minDiff << '\t' << minCount << '\t' << endl;
1802 for (int i = 0; i < minFGroup.size(); i++) {
1804 for (int j = 0; j < minFGroup[i].size(); j++) { cout << minFGroup[i][j] << " "; }
1808 for (int i = 0; i < minRGroup.size(); i++) {
1810 for (int j = 0; j < minRGroup[i].size(); j++) { cout << minRGroup[i][j] << " "; }
1814 if(minDiff > pdiffs) { success = minDiff; } //no good matches
1816 bool foundMatch = false;
1817 vector<int> matches;
1818 for (int i = 0; i < minFGroup.size(); i++) {
1819 for (int j = 0; j < minFGroup[i].size(); j++) {
1820 for (int k = 0; k < minRGroup.size(); k++) {
1821 if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
1828 if (matches.size() == 1) {
1831 fStart = minFPos[0];
1832 rStart = minRPos[0];
1835 //we have an acceptable match for the forward and reverse, but do they match?
1837 forwardSeq.setUnaligned(rawFSequence.substr(fStart));
1838 reverseSeq.setUnaligned(rawRSequence.substr(rStart));
1840 }else { success = pdiffs + 100; }
1844 if (alignment != NULL) { delete alignment; }
1850 catch(exception& e) {
1851 m->errorOut(e, "TrimOligos", "stripIForward");
1856 //*******************************************************************/
1857 int TrimOligos::stripBarcode(Sequence& seq, int& group){
1860 string rawSequence = seq.getUnaligned();
1861 int success = bdiffs + 1; //guilty until proven innocent
1863 //can you find the barcode
1864 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1865 string oligo = it->first;
1866 if(rawSequence.length() < oligo.length()){ //let's just assume that the barcodes are the same length
1867 success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
1871 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1873 seq.setUnaligned(rawSequence.substr(oligo.length()));
1880 //if you found the barcode or if you don't want to allow for diffs
1881 if ((bdiffs == 0) || (success == 0)) { return success; }
1883 else { //try aligning and see if you can find it
1884 Alignment* alignment;
1885 if (barcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
1886 else{ alignment = NULL; }
1888 //can you find the barcode
1894 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
1895 string oligo = it->first;
1896 // int length = oligo.length();
1898 if(rawSequence.length() < maxFBarcodeLength){ //let's just assume that the barcodes are the same length
1899 success = bdiffs + 10;
1903 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
1904 alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
1905 oligo = alignment->getSeqAAln();
1906 string temp = alignment->getSeqBAln();
1908 int alnLength = oligo.length();
1910 for(int i=oligo.length()-1;i>=0;i--){
1911 if(oligo[i] != '-'){ alnLength = i+1; break; }
1913 oligo = oligo.substr(0,alnLength);
1914 temp = temp.substr(0,alnLength);
1916 int numDiff = countDiffs(oligo, temp);
1918 if(numDiff < minDiff){
1921 minGroup = it->second;
1923 for(int i=0;i<alnLength;i++){
1929 else if(numDiff == minDiff){
1935 if(minDiff > bdiffs) { success = minDiff; } //no good matches
1936 else if(minCount > 1) { success = bdiffs + 100; } //can't tell the difference between multiple barcodes
1937 else{ //use the best match
1939 seq.setUnaligned(rawSequence.substr(minPos));
1943 if (alignment != NULL) { delete alignment; }
1950 catch(exception& e) {
1951 m->errorOut(e, "TrimOligos", "stripBarcode");
1957 /********************************************************************/
1958 int TrimOligos::stripForward(Sequence& seq, int& group){
1961 string rawSequence = seq.getUnaligned();
1962 int success = pdiffs + 1; //guilty until proven innocent
1964 //can you find the primer
1965 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1966 string oligo = it->first;
1967 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
1968 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
1972 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
1974 seq.setUnaligned(rawSequence.substr(oligo.length()));
1980 //if you found the barcode or if you don't want to allow for diffs
1981 if ((pdiffs == 0) || (success == 0)) { return success; }
1983 else { //try aligning and see if you can find it
1984 Alignment* alignment;
1985 if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
1986 else{ alignment = NULL; }
1988 //can you find the barcode
1994 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
1995 string oligo = it->first;
1996 // int length = oligo.length();
1998 if(rawSequence.length() < maxFPrimerLength){
1999 success = pdiffs + 100;
2003 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
2004 alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
2005 oligo = alignment->getSeqAAln();
2006 string temp = alignment->getSeqBAln();
2008 int alnLength = oligo.length();
2010 for(int i=oligo.length()-1;i>=0;i--){
2011 if(oligo[i] != '-'){ alnLength = i+1; break; }
2013 oligo = oligo.substr(0,alnLength);
2014 temp = temp.substr(0,alnLength);
2016 int numDiff = countDiffs(oligo, temp);
2018 if(numDiff < minDiff){
2021 minGroup = it->second;
2023 for(int i=0;i<alnLength;i++){
2029 else if(numDiff == minDiff){
2035 if(minDiff > pdiffs) { success = minDiff; } //no good matches
2036 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
2037 else{ //use the best match
2039 seq.setUnaligned(rawSequence.substr(minPos));
2043 if (alignment != NULL) { delete alignment; }
2050 catch(exception& e) {
2051 m->errorOut(e, "TrimOligos", "stripForward");
2055 //*******************************************************************/
2056 int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group, bool keepForward){
2059 if (paired) { int success = stripPairedPrimers(seq, qual, group, keepForward); return success; }
2061 string rawSequence = seq.getUnaligned();
2062 int success = pdiffs + 1; //guilty until proven innocent
2064 //can you find the primer
2065 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
2066 string oligo = it->first;
2067 if(rawSequence.length() < oligo.length()){ //let's just assume that the primers are the same length
2068 success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
2072 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
2074 if (!keepForward) { seq.setUnaligned(rawSequence.substr(oligo.length())); }
2075 if(qual.getName() != ""){
2076 if (!keepForward) { qual.trimQScores(oligo.length(), -1); }
2083 //if you found the barcode or if you don't want to allow for diffs
2084 if ((pdiffs == 0) || (success == 0)) { return success; }
2086 else { //try aligning and see if you can find it
2087 Alignment* alignment;
2088 if (primers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
2089 else{ alignment = NULL; }
2091 //can you find the barcode
2097 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
2098 string oligo = it->first;
2099 // int length = oligo.length();
2101 if(rawSequence.length() < maxFPrimerLength){
2102 success = pdiffs + 100;
2106 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
2107 alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
2108 oligo = alignment->getSeqAAln();
2109 string temp = alignment->getSeqBAln();
2111 int alnLength = oligo.length();
2113 for(int i=oligo.length()-1;i>=0;i--){
2114 if(oligo[i] != '-'){ alnLength = i+1; break; }
2116 oligo = oligo.substr(0,alnLength);
2117 temp = temp.substr(0,alnLength);
2119 int numDiff = countDiffs(oligo, temp);
2121 if(numDiff < minDiff){
2124 minGroup = it->second;
2126 for(int i=0;i<alnLength;i++){
2132 else if(numDiff == minDiff){
2138 if(minDiff > pdiffs) { success = minDiff; } //no good matches
2139 else if(minCount > 1) { success = pdiffs + 10; } //can't tell the difference between multiple primers
2140 else{ //use the best match
2142 if (!keepForward) { seq.setUnaligned(rawSequence.substr(minPos)); }
2143 if(qual.getName() != ""){
2144 if (!keepForward) { qual.trimQScores(minPos, -1); }
2149 if (alignment != NULL) { delete alignment; }
2156 catch(exception& e) {
2157 m->errorOut(e, "TrimOligos", "stripForward");
2161 //******************************************************************/
2162 bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){
2164 string rawSequence = seq.getUnaligned();
2165 bool success = 0; //guilty until proven innocent
2167 for(int i=0;i<revPrimer.size();i++){
2168 string oligo = revPrimer[i];
2170 if(rawSequence.length() < oligo.length()){
2175 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
2176 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
2177 if(qual.getName() != ""){
2178 qual.trimQScores(-1, rawSequence.length()-oligo.length());
2187 catch(exception& e) {
2188 m->errorOut(e, "TrimOligos", "stripReverse");
2192 //******************************************************************/
2193 bool TrimOligos::stripReverse(Sequence& seq){
2196 string rawSequence = seq.getUnaligned();
2197 bool success = 0; //guilty until proven innocent
2199 for(int i=0;i<revPrimer.size();i++){
2200 string oligo = revPrimer[i];
2202 if(rawSequence.length() < oligo.length()){
2207 if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
2208 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
2217 catch(exception& e) {
2218 m->errorOut(e, "TrimOligos", "stripReverse");
2222 //******************************************************************/
2223 bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){
2225 string rawSequence = seq.getUnaligned();
2226 bool success = ldiffs + 1; //guilty until proven innocent
2228 for(int i=0;i<linker.size();i++){
2229 string oligo = linker[i];
2231 if(rawSequence.length() < oligo.length()){
2232 success = ldiffs + 10;
2236 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
2237 seq.setUnaligned(rawSequence.substr(oligo.length()));
2238 if(qual.getName() != ""){
2239 qual.trimQScores(oligo.length(), -1);
2246 //if you found the linker or if you don't want to allow for diffs
2247 if ((ldiffs == 0) || (success == 0)) { return success; }
2249 else { //try aligning and see if you can find it
2250 Alignment* alignment;
2251 if (linker.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); }
2252 else{ alignment = NULL; }
2254 //can you find the barcode
2259 for(int i = 0; i < linker.size(); i++){
2260 string oligo = linker[i];
2261 // int length = oligo.length();
2263 if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length
2264 success = ldiffs + 10;
2268 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
2269 alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+ldiffs));
2270 oligo = alignment->getSeqAAln();
2271 string temp = alignment->getSeqBAln();
2273 int alnLength = oligo.length();
2275 for(int i=oligo.length()-1;i>=0;i--){
2276 if(oligo[i] != '-'){ alnLength = i+1; break; }
2278 oligo = oligo.substr(0,alnLength);
2279 temp = temp.substr(0,alnLength);
2281 int numDiff = countDiffs(oligo, temp);
2283 if(numDiff < minDiff){
2287 for(int i=0;i<alnLength;i++){
2293 else if(numDiff == minDiff){
2299 if(minDiff > ldiffs) { success = minDiff; } //no good matches
2300 else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes
2301 else{ //use the best match
2302 seq.setUnaligned(rawSequence.substr(minPos));
2304 if(qual.getName() != ""){
2305 qual.trimQScores(minPos, -1);
2310 if (alignment != NULL) { delete alignment; }
2318 catch(exception& e) {
2319 m->errorOut(e, "TrimOligos", "stripLinker");
2323 //******************************************************************/
2324 bool TrimOligos::stripLinker(Sequence& seq){
2327 string rawSequence = seq.getUnaligned();
2328 bool success = ldiffs +1; //guilty until proven innocent
2330 for(int i=0;i<linker.size();i++){
2331 string oligo = linker[i];
2333 if(rawSequence.length() < oligo.length()){
2334 success = ldiffs +10;
2338 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
2339 seq.setUnaligned(rawSequence.substr(oligo.length()));
2345 //if you found the linker or if you don't want to allow for diffs
2346 if ((ldiffs == 0) || (success == 0)) { return success; }
2348 else { //try aligning and see if you can find it
2349 Alignment* alignment;
2350 if (linker.size() > 0) {alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLinkerLength+ldiffs+1)); }
2351 else{ alignment = NULL; }
2353 //can you find the barcode
2358 for(int i = 0; i < linker.size(); i++){
2359 string oligo = linker[i];
2360 // int length = oligo.length();
2362 if(rawSequence.length() < maxLinkerLength){ //let's just assume that the barcodes are the same length
2363 success = ldiffs + 10;
2367 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
2368 alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+ldiffs));
2369 oligo = alignment->getSeqAAln();
2370 string temp = alignment->getSeqBAln();
2372 int alnLength = oligo.length();
2374 for(int i=oligo.length()-1;i>=0;i--){
2375 if(oligo[i] != '-'){ alnLength = i+1; break; }
2377 oligo = oligo.substr(0,alnLength);
2378 temp = temp.substr(0,alnLength);
2380 int numDiff = countDiffs(oligo, temp);
2382 if(numDiff < minDiff){
2386 for(int i=0;i<alnLength;i++){
2392 else if(numDiff == minDiff){
2398 if(minDiff > ldiffs) { success = minDiff; } //no good matches
2399 else if(minCount > 1) { success = ldiffs + 100; } //can't tell the difference between multiple barcodes
2400 else{ //use the best match
2401 seq.setUnaligned(rawSequence.substr(minPos));
2405 if (alignment != NULL) { delete alignment; }
2412 catch(exception& e) {
2413 m->errorOut(e, "TrimOligos", "stripLinker");
2418 //******************************************************************/
2419 bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){
2421 string rawSequence = seq.getUnaligned();
2422 bool success = sdiffs+1; //guilty until proven innocent
2424 for(int i=0;i<spacer.size();i++){
2425 string oligo = spacer[i];
2427 if(rawSequence.length() < oligo.length()){
2428 success = sdiffs+10;
2432 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
2433 seq.setUnaligned(rawSequence.substr(oligo.length()));
2434 if(qual.getName() != ""){
2435 qual.trimQScores(oligo.length(), -1);
2442 //if you found the spacer or if you don't want to allow for diffs
2443 if ((sdiffs == 0) || (success == 0)) { return success; }
2445 else { //try aligning and see if you can find it
2446 Alignment* alignment;
2447 if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); }
2448 else{ alignment = NULL; }
2450 //can you find the barcode
2455 for(int i = 0; i < spacer.size(); i++){
2456 string oligo = spacer[i];
2457 // int length = oligo.length();
2459 if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length
2460 success = sdiffs + 10;
2464 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
2465 alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+sdiffs));
2466 oligo = alignment->getSeqAAln();
2467 string temp = alignment->getSeqBAln();
2469 int alnLength = oligo.length();
2471 for(int i=oligo.length()-1;i>=0;i--){
2472 if(oligo[i] != '-'){ alnLength = i+1; break; }
2474 oligo = oligo.substr(0,alnLength);
2475 temp = temp.substr(0,alnLength);
2477 int numDiff = countDiffs(oligo, temp);
2479 if(numDiff < minDiff){
2483 for(int i=0;i<alnLength;i++){
2489 else if(numDiff == minDiff){
2495 if(minDiff > sdiffs) { success = minDiff; } //no good matches
2496 else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes
2497 else{ //use the best match
2498 seq.setUnaligned(rawSequence.substr(minPos));
2500 if(qual.getName() != ""){
2501 qual.trimQScores(minPos, -1);
2506 if (alignment != NULL) { delete alignment; }
2514 catch(exception& e) {
2515 m->errorOut(e, "TrimOligos", "stripSpacer");
2519 //******************************************************************/
2520 bool TrimOligos::stripSpacer(Sequence& seq){
2523 string rawSequence = seq.getUnaligned();
2524 bool success = sdiffs+1; //guilty until proven innocent
2526 for(int i=0;i<spacer.size();i++){
2527 string oligo = spacer[i];
2529 if(rawSequence.length() < oligo.length()){
2530 success = sdiffs+10;
2534 if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
2535 seq.setUnaligned(rawSequence.substr(oligo.length()));
2541 //if you found the spacer or if you don't want to allow for diffs
2542 if ((sdiffs == 0) || (success == 0)) { return success; }
2544 else { //try aligning and see if you can find it
2545 Alignment* alignment;
2546 if (spacer.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxSpacerLength+sdiffs+1)); }
2547 else{ alignment = NULL; }
2549 //can you find the barcode
2554 for(int i = 0; i < spacer.size(); i++){
2555 string oligo = spacer[i];
2556 // int length = oligo.length();
2558 if(rawSequence.length() < maxSpacerLength){ //let's just assume that the barcodes are the same length
2559 success = sdiffs + 10;
2563 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
2564 alignment->alignPrimer(oligo, rawSequence.substr(0,oligo.length()+sdiffs));
2565 oligo = alignment->getSeqAAln();
2566 string temp = alignment->getSeqBAln();
2568 int alnLength = oligo.length();
2570 for(int i=oligo.length()-1;i>=0;i--){
2571 if(oligo[i] != '-'){ alnLength = i+1; break; }
2573 oligo = oligo.substr(0,alnLength);
2574 temp = temp.substr(0,alnLength);
2576 int numDiff = countDiffs(oligo, temp);
2578 if(numDiff < minDiff){
2582 for(int i=0;i<alnLength;i++){
2588 else if(numDiff == minDiff){
2594 if(minDiff > sdiffs) { success = minDiff; } //no good matches
2595 else if(minCount > 1) { success = sdiffs + 100; } //can't tell the difference between multiple barcodes
2596 else{ //use the best match
2597 seq.setUnaligned(rawSequence.substr(minPos));
2601 if (alignment != NULL) { delete alignment; }
2608 catch(exception& e) {
2609 m->errorOut(e, "TrimOligos", "stripSpacer");
2614 //******************************************************************/
2615 bool TrimOligos::compareDNASeq(string oligo, string seq){
2618 int length = oligo.length();
2620 for(int i=0;i<length;i++){
2622 if(oligo[i] != seq[i]){
2623 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
2624 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
2625 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
2626 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
2627 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
2628 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
2629 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
2630 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
2631 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
2632 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
2633 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
2634 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
2636 if(success == 0) { break; }
2645 catch(exception& e) {
2646 m->errorOut(e, "TrimOligos", "compareDNASeq");
2651 //********************************************************************/
2652 int TrimOligos::countDiffs(string oligo, string seq){
2655 int length = oligo.length();
2658 for(int i=0;i<length;i++){
2660 if(oligo[i] != seq[i]){
2661 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.') { countDiffs++; }
2662 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { countDiffs++; }
2663 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { countDiffs++; }
2664 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { countDiffs++; }
2665 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { countDiffs++; }
2666 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
2667 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { countDiffs++; }
2668 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
2669 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
2670 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { countDiffs++; }
2671 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { countDiffs++; }
2672 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { countDiffs++; }
2679 catch(exception& e) {
2680 m->errorOut(e, "TrimOligos", "countDiffs");
2684 //********************************************************************/
2685 string TrimOligos::reverseOligo(string oligo){
2687 string reverse = "";
2689 for(int i=oligo.length()-1;i>=0;i--){
2691 if(oligo[i] == 'A') { reverse += 'T'; }
2692 else if(oligo[i] == 'T'){ reverse += 'A'; }
2693 else if(oligo[i] == 'U'){ reverse += 'A'; }
2695 else if(oligo[i] == 'G'){ reverse += 'C'; }
2696 else if(oligo[i] == 'C'){ reverse += 'G'; }
2698 else if(oligo[i] == 'R'){ reverse += 'Y'; }
2699 else if(oligo[i] == 'Y'){ reverse += 'R'; }
2701 else if(oligo[i] == 'M'){ reverse += 'K'; }
2702 else if(oligo[i] == 'K'){ reverse += 'M'; }
2704 else if(oligo[i] == 'W'){ reverse += 'W'; }
2705 else if(oligo[i] == 'S'){ reverse += 'S'; }
2707 else if(oligo[i] == 'B'){ reverse += 'V'; }
2708 else if(oligo[i] == 'V'){ reverse += 'B'; }
2710 else if(oligo[i] == 'D'){ reverse += 'H'; }
2711 else if(oligo[i] == 'H'){ reverse += 'D'; }
2713 else if(oligo[i] == '-'){ reverse += '-'; }
2714 else { reverse += 'N'; }
2720 catch(exception& e) {
2721 m->errorOut(e, "TrimOligos", "reverseOligo");
2726 /********************************************************************/