]> git.donarmstrong.com Git - mothur.git/blob - trimoligos.cpp
working on windows paralellization, added trimOligos class to be used by trim.flows...
[mothur.git] / trimoligos.cpp
1 /*
2  *  trimoligos.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/1/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "trimoligos.h"
11 #include "alignment.hpp"
12 #include "needlemanoverlap.hpp"
13
14
15 /********************************************************************/
16 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
17 TrimOligos::TrimOligos(int p, int b, map<string, int> pr, map<string, int> br, vector<string> r){
18         try {
19                 m = MothurOut::getInstance();
20                 
21                 pdiffs = p;
22                 bdiffs = b;
23                 
24                 barcodes = br;
25                 primers = pr;
26                 revPrimer = r;
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "TrimOligos", "TrimOligos");
30                 exit(1);
31         }
32 }
33 /********************************************************************/
34 TrimOligos::~TrimOligos() {}
35 //*******************************************************************/
36 int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
37         try {
38                 
39                 string rawSequence = seq.getUnaligned();
40                 int success = bdiffs + 1;       //guilty until proven innocent
41                 
42                 //can you find the barcode
43                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
44                         string oligo = it->first;
45                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
46                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
47                                 break;  
48                         }
49                         
50                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
51                                 group = it->second;
52                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
53                                 
54                                 if(qual.getName() != ""){
55                                         qual.trimQScores(oligo.length(), -1);
56                                 }
57                                 
58                                 success = 0;
59                                 break;
60                         }
61                 }
62                 
63                 //if you found the barcode or if you don't want to allow for diffs
64                 if ((bdiffs == 0) || (success == 0)) { return success;  }
65                 
66                 else { //try aligning and see if you can find it
67                         
68                         int maxLength = 0;
69                         
70                         Alignment* alignment;
71                         if (barcodes.size() > 0) {
72                                 map<string,int>::iterator it=barcodes.begin();
73                                 
74                                 for(it;it!=barcodes.end();it++){
75                                         if(it->first.length() > maxLength){
76                                                 maxLength = it->first.length();
77                                         }
78                                 }
79                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
80                                 
81                         }else{ alignment = NULL; } 
82                         
83                         //can you find the barcode
84                         int minDiff = 1e6;
85                         int minCount = 1;
86                         int minGroup = -1;
87                         int minPos = 0;
88                         
89                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
90                                 string oligo = it->first;
91                                 //                              int length = oligo.length();
92                                 
93                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
94                                         success = bdiffs + 10;
95                                         break;
96                                 }
97                                 
98                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
99                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
100                                 oligo = alignment->getSeqAAln();
101                                 string temp = alignment->getSeqBAln();
102                                 
103                                 int alnLength = oligo.length();
104                                 
105                                 for(int i=oligo.length()-1;i>=0;i--){
106                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
107                                 }
108                                 oligo = oligo.substr(0,alnLength);
109                                 temp = temp.substr(0,alnLength);
110                                 
111                                 int numDiff = countDiffs(oligo, temp);
112                                 
113                                 if(numDiff < minDiff){
114                                         minDiff = numDiff;
115                                         minCount = 1;
116                                         minGroup = it->second;
117                                         minPos = 0;
118                                         for(int i=0;i<alnLength;i++){
119                                                 if(temp[i] != '-'){
120                                                         minPos++;
121                                                 }
122                                         }
123                                 }
124                                 else if(numDiff == minDiff){
125                                         minCount++;
126                                 }
127                                 
128                         }
129                         
130                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
131                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
132                         else{                                                                                                   //use the best match
133                                 group = minGroup;
134                                 seq.setUnaligned(rawSequence.substr(minPos));
135                                 
136                                 if(qual.getName() != ""){
137                                         qual.trimQScores(minPos, -1);
138                                 }
139                                 success = minDiff;
140                         }
141                         
142                         if (alignment != NULL) {  delete alignment;  }
143                         
144                 }
145                 
146                 return success;
147                 
148         }
149         catch(exception& e) {
150                 m->errorOut(e, "TrimOligos", "stripBarcode");
151                 exit(1);
152         }
153         
154 }
155 //*******************************************************************/
156 int TrimOligos::stripBarcode(Sequence& seq, int& group){
157         try {
158                 
159                 string rawSequence = seq.getUnaligned();
160                 int success = bdiffs + 1;       //guilty until proven innocent
161                 
162                 //can you find the barcode
163                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
164                         string oligo = it->first;
165                         if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
166                                 success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
167                                 break;  
168                         }
169                         
170                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
171                                 group = it->second;
172                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
173                                 
174                                 success = 0;
175                                 break;
176                         }
177                 }
178                 
179                 //if you found the barcode or if you don't want to allow for diffs
180                 if ((bdiffs == 0) || (success == 0)) { return success;  }
181                 
182                 else { //try aligning and see if you can find it
183                         
184                         int maxLength = 0;
185                         
186                         Alignment* alignment;
187                         if (barcodes.size() > 0) {
188                                 map<string,int>::iterator it=barcodes.begin();
189                                 
190                                 for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
191                                         if(it->first.length() > maxLength){
192                                                 maxLength = it->first.length();
193                                         }
194                                 }
195                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
196                                 
197                         }else{ alignment = NULL; } 
198                         
199                         //can you find the barcode
200                         int minDiff = 1e6;
201                         int minCount = 1;
202                         int minGroup = -1;
203                         int minPos = 0;
204                         
205                         for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
206                                 string oligo = it->first;
207                                 //                              int length = oligo.length();
208                                 
209                                 if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
210                                         success = bdiffs + 10;
211                                         break;
212                                 }
213                                 
214                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
215                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
216                                 oligo = alignment->getSeqAAln();
217                                 string temp = alignment->getSeqBAln();
218                                 
219                                 int alnLength = oligo.length();
220                                 
221                                 for(int i=oligo.length()-1;i>=0;i--){
222                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
223                                 }
224                                 oligo = oligo.substr(0,alnLength);
225                                 temp = temp.substr(0,alnLength);
226                                 
227                                 int numDiff = countDiffs(oligo, temp);
228                                 
229                                 if(numDiff < minDiff){
230                                         minDiff = numDiff;
231                                         minCount = 1;
232                                         minGroup = it->second;
233                                         minPos = 0;
234                                         for(int i=0;i<alnLength;i++){
235                                                 if(temp[i] != '-'){
236                                                         minPos++;
237                                                 }
238                                         }
239                                 }
240                                 else if(numDiff == minDiff){
241                                         minCount++;
242                                 }
243                                 
244                         }
245                         
246                         if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
247                         else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
248                         else{                                                                                                   //use the best match
249                                 group = minGroup;
250                                 seq.setUnaligned(rawSequence.substr(minPos));
251                                 success = minDiff;
252                         }
253                         
254                         if (alignment != NULL) {  delete alignment;  }
255                         
256                 }
257                 
258                 return success;
259                 
260         }
261         catch(exception& e) {
262                 m->errorOut(e, "TrimOligos", "stripBarcode");
263                 exit(1);
264         }
265         
266 }
267 //********************************************************************/
268 int TrimOligos::stripForward(Sequence& seq, int& group){
269         try {
270                 
271                 string rawSequence = seq.getUnaligned();
272                 int success = pdiffs + 1;       //guilty until proven innocent
273                 
274                 //can you find the primer
275                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
276                         string oligo = it->first;
277                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
278                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
279                                 break;  
280                         }
281                         
282                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
283                                 group = it->second;
284                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
285                                 success = 0;
286                                 break;
287                         }
288                 }
289                 
290                 //if you found the barcode or if you don't want to allow for diffs
291                 if ((pdiffs == 0) || (success == 0)) {  return success;  }
292                 
293                 else { //try aligning and see if you can find it
294                         
295                         int maxLength = 0;
296                         
297                         Alignment* alignment;
298                         if (primers.size() > 0) {
299                                 map<string,int>::iterator it=primers.begin();
300                                 
301                                 for(it;it!=primers.end();it++){
302                                         if(it->first.length() > maxLength){
303                                                 maxLength = it->first.length();
304                                         }
305                                 }
306                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
307                                 
308                         }else{ alignment = NULL; } 
309                         
310                         //can you find the barcode
311                         int minDiff = 1e6;
312                         int minCount = 1;
313                         int minGroup = -1;
314                         int minPos = 0;
315                         
316                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
317                                 string oligo = it->first;
318                                 //                              int length = oligo.length();
319                                 
320                                 if(rawSequence.length() < maxLength){   
321                                         success = pdiffs + 100;
322                                         break;
323                                 }
324                                 
325                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
326                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
327                                 oligo = alignment->getSeqAAln();
328                                 string temp = alignment->getSeqBAln();
329                                 
330                                 int alnLength = oligo.length();
331                                 
332                                 for(int i=oligo.length()-1;i>=0;i--){
333                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
334                                 }
335                                 oligo = oligo.substr(0,alnLength);
336                                 temp = temp.substr(0,alnLength);
337                                 
338                                 int numDiff = countDiffs(oligo, temp);
339                                 
340                                 if(numDiff < minDiff){
341                                         minDiff = numDiff;
342                                         minCount = 1;
343                                         minGroup = it->second;
344                                         minPos = 0;
345                                         for(int i=0;i<alnLength;i++){
346                                                 if(temp[i] != '-'){
347                                                         minPos++;
348                                                 }
349                                         }
350                                 }
351                                 else if(numDiff == minDiff){
352                                         minCount++;
353                                 }
354                                 
355                         }
356                         
357                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
358                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
359                         else{                                                                                                   //use the best match
360                                 group = minGroup;
361                                 seq.setUnaligned(rawSequence.substr(minPos));
362                                 success = minDiff;
363                         }
364                         
365                         if (alignment != NULL) {  delete alignment;  }
366                         
367                 }
368                 
369                 return success;
370                 
371         }
372         catch(exception& e) {
373                 m->errorOut(e, "TrimOligos", "stripForward");
374                 exit(1);
375         }
376 }
377 //*******************************************************************/
378 int TrimOligos::stripForward(Sequence& seq, QualityScores& qual, int& group){
379         try {
380                 string rawSequence = seq.getUnaligned();
381                 int success = pdiffs + 1;       //guilty until proven innocent
382                 
383                 //can you find the primer
384                 for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
385                         string oligo = it->first;
386                         if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
387                                 success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
388                                 break;  
389                         }
390                         
391                         if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
392                                 group = it->second;
393                                 seq.setUnaligned(rawSequence.substr(oligo.length()));
394                                 if(qual.getName() != ""){
395                                         qual.trimQScores(oligo.length(), -1);
396                                 }
397                                 success = 0;
398                                 break;
399                         }
400                 }
401                 
402                 //if you found the barcode or if you don't want to allow for diffs
403                 if ((pdiffs == 0) || (success == 0)) { return success;  }
404                 
405                 else { //try aligning and see if you can find it
406                         
407                         int maxLength = 0;
408                         
409                         Alignment* alignment;
410                         if (primers.size() > 0) {
411                                 map<string,int>::iterator it=primers.begin();
412                                 
413                                 for(it;it!=primers.end();it++){
414                                         if(it->first.length() > maxLength){
415                                                 maxLength = it->first.length();
416                                         }
417                                 }
418                                 alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
419                                 
420                         }else{ alignment = NULL; } 
421                         
422                         //can you find the barcode
423                         int minDiff = 1e6;
424                         int minCount = 1;
425                         int minGroup = -1;
426                         int minPos = 0;
427                         
428                         for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
429                                 string oligo = it->first;
430                                 //                              int length = oligo.length();
431                                 
432                                 if(rawSequence.length() < maxLength){   
433                                         success = pdiffs + 100;
434                                         break;
435                                 }
436                                 
437                                 //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
438                                 alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
439                                 oligo = alignment->getSeqAAln();
440                                 string temp = alignment->getSeqBAln();
441                                 
442                                 int alnLength = oligo.length();
443                                 
444                                 for(int i=oligo.length()-1;i>=0;i--){
445                                         if(oligo[i] != '-'){    alnLength = i+1;        break;  }
446                                 }
447                                 oligo = oligo.substr(0,alnLength);
448                                 temp = temp.substr(0,alnLength);
449                                 
450                                 int numDiff = countDiffs(oligo, temp);
451                                 
452                                 if(numDiff < minDiff){
453                                         minDiff = numDiff;
454                                         minCount = 1;
455                                         minGroup = it->second;
456                                         minPos = 0;
457                                         for(int i=0;i<alnLength;i++){
458                                                 if(temp[i] != '-'){
459                                                         minPos++;
460                                                 }
461                                         }
462                                 }
463                                 else if(numDiff == minDiff){
464                                         minCount++;
465                                 }
466                                 
467                         }
468                         
469                         if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
470                         else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
471                         else{                                                                                                   //use the best match
472                                 group = minGroup;
473                                 seq.setUnaligned(rawSequence.substr(minPos));
474                                 if(qual.getName() != ""){
475                                         qual.trimQScores(minPos, -1);
476                                 }
477                                 success = minDiff;
478                         }
479                         
480                         if (alignment != NULL) {  delete alignment;  }
481                         
482                 }
483                 
484                 return success;
485                 
486         }
487         catch(exception& e) {
488                 m->errorOut(e, "TrimOligos", "stripForward");
489                 exit(1);
490         }
491 }
492 //******************************************************************/
493 bool TrimOligos::stripReverse(Sequence& seq, QualityScores& qual){
494         try {
495                 string rawSequence = seq.getUnaligned();
496                 bool success = 0;       //guilty until proven innocent
497                 
498                 for(int i=0;i<revPrimer.size();i++){
499                         string oligo = revPrimer[i];
500                         
501                         if(rawSequence.length() < oligo.length()){
502                                 success = 0;
503                                 break;
504                         }
505                         
506                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
507                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
508                                 if(qual.getName() != ""){
509                                         qual.trimQScores(-1, rawSequence.length()-oligo.length());
510                                 }
511                                 success = 1;
512                                 break;
513                         }
514                 }       
515                 return success;
516                 
517         }
518         catch(exception& e) {
519                 m->errorOut(e, "TrimOligos", "stripReverse");
520                 exit(1);
521         }
522 }
523 //******************************************************************/
524 bool TrimOligos::stripReverse(Sequence& seq){
525         try {
526                 
527                 string rawSequence = seq.getUnaligned();
528                 bool success = 0;       //guilty until proven innocent
529                 
530                 for(int i=0;i<revPrimer.size();i++){
531                         string oligo = revPrimer[i];
532                         
533                         if(rawSequence.length() < oligo.length()){
534                                 success = 0;
535                                 break;
536                         }
537                         
538                         if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
539                                 seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
540                                 success = 1;
541                                 break;
542                         }
543                 }       
544                 
545                 return success;
546                 
547         }
548         catch(exception& e) {
549                 m->errorOut(e, "TrimOligos", "stripReverse");
550                 exit(1);
551         }
552 }
553
554 //******************************************************************/
555 bool TrimOligos::compareDNASeq(string oligo, string seq){
556         try {
557                 bool success = 1;
558                 int length = oligo.length();
559                 
560                 for(int i=0;i<length;i++){
561                         
562                         if(oligo[i] != seq[i]){
563                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
564                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
565                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
566                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
567                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
568                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
569                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
570                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
571                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
572                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
573                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
574                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
575                                 
576                                 if(success == 0)        {       break;   }
577                         }
578                         else{
579                                 success = 1;
580                         }
581                 }
582                 
583                 return success;
584         }
585         catch(exception& e) {
586                 m->errorOut(e, "TrimOligos", "compareDNASeq");
587                 exit(1);
588         }
589         
590 }
591 //********************************************************************/
592 int TrimOligos::countDiffs(string oligo, string seq){
593         try {
594                 
595                 int length = oligo.length();
596                 int countDiffs = 0;
597                 
598                 for(int i=0;i<length;i++){
599                         
600                         if(oligo[i] != seq[i]){
601                                 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
602                                 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
603                                 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
604                                 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
605                                 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
606                                 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
607                                 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
608                                 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
609                                 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
610                                 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
611                                 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
612                                 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
613                         }
614                         
615                 }
616                 
617                 return countDiffs;
618         }
619         catch(exception& e) {
620                 m->errorOut(e, "TrimOligos", "countDiffs");
621                 exit(1);
622         }
623 }
624 /********************************************************************/
625
626
627