]> git.donarmstrong.com Git - mothur.git/commitdiff
trim.seqs linker and spacer mods
authorSarah Westcott <mothur.westcott@gmail.com>
Wed, 22 Feb 2012 14:53:19 +0000 (09:53 -0500)
committerSarah Westcott <mothur.westcott@gmail.com>
Wed, 22 Feb 2012 14:53:19 +0000 (09:53 -0500)
trimoligos.cpp

index 245cfdcac8819d076782b85795300c0dfb034462..cf818707c672c82acfdbd3bf896c9c0c2edbae36 100644 (file)
@@ -154,7 +154,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
                        else{                                                                                                   //use the best match
                                group = minGroup;
                                seq.setUnaligned(rawSequence.substr(minPos));
-                               
+    
                                if(qual.getName() != ""){
                                        qual.trimQScores(minPos, -1);
                                }
@@ -586,15 +586,95 @@ bool TrimOligos::stripLinker(Sequence& seq, QualityScores& qual){
                                break;
                        }
                        
-                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
-                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
+                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                               seq.setUnaligned(rawSequence.substr(oligo.length()));
                                if(qual.getName() != ""){
-                                       qual.trimQScores(-1, rawSequence.length()-oligo.length());
+                                       qual.trimQScores(oligo.length(), -1);
                                }
                                success = 1;
                                break;
                        }
-               }       
+               }
+        
+        //if you found the linker or if you don't want to allow for diffs
+               if ((ldiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (linker.size() > 0) {
+                               for(int i = 0; i < linker.size(); i++){
+                                       if(linker[i].length() > maxLength){
+                                               maxLength = linker[i].length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minPos = 0;
+                       
+                       for(int i = 0; i < linker.size(); i++){
+                               string oligo = linker[i];
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = ldiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){
+                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
+                               }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > ldiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = ldiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               seq.setUnaligned(rawSequence.substr(minPos));
+                               
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(minPos, -1);
+                               }
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+
+       
                return success;
                
        }
@@ -618,13 +698,87 @@ bool TrimOligos::stripLinker(Sequence& seq){
                                break;
                        }
                        
-                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
-                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
+                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                               seq.setUnaligned(rawSequence.substr(oligo.length()));
                                success = 1;
                                break;
                        }
                }       
                
+        //if you found the linker or if you don't want to allow for diffs
+               if ((ldiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (linker.size() > 0) {
+                               for(int i = 0; i < linker.size(); i++){
+                                       if(linker[i].length() > maxLength){
+                                               maxLength = linker[i].length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+ldiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minPos = 0;
+                       
+                       for(int i = 0; i < linker.size(); i++){
+                               string oligo = linker[i];
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = ldiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+ldiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){
+                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
+                               }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > ldiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = ldiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               seq.setUnaligned(rawSequence.substr(minPos));
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+
                return success;
                
        }
@@ -648,15 +802,95 @@ bool TrimOligos::stripSpacer(Sequence& seq, QualityScores& qual){
                                break;
                        }
                        
-                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
-                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
+                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                               seq.setUnaligned(rawSequence.substr(oligo.length()));
                                if(qual.getName() != ""){
-                                       qual.trimQScores(-1, rawSequence.length()-oligo.length());
+                                       qual.trimQScores(oligo.length(), -1);
                                }
                                success = 1;
                                break;
                        }
-               }       
+               }
+        
+        //if you found the spacer or if you don't want to allow for diffs
+               if ((sdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (spacer.size() > 0) {
+                               for(int i = 0; i < spacer.size(); i++){
+                                       if(spacer[i].length() > maxLength){
+                                               maxLength = spacer[i].length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minPos = 0;
+                       
+                       for(int i = 0; i < spacer.size(); i++){
+                               string oligo = spacer[i];
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = sdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){
+                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
+                               }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > sdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = sdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               seq.setUnaligned(rawSequence.substr(minPos));
+                               
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(minPos, -1);
+                               }
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+        
+
                return success;
                
        }
@@ -680,13 +914,87 @@ bool TrimOligos::stripSpacer(Sequence& seq){
                                break;
                        }
                        
-                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
-                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
+                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
+                               seq.setUnaligned(rawSequence.substr(oligo.length()));
                                success = 1;
                                break;
                        }
                }       
                
+        //if you found the spacer or if you don't want to allow for diffs
+               if ((sdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (spacer.size() > 0) {
+                               for(int i = 0; i < spacer.size(); i++){
+                                       if(spacer[i].length() > maxLength){
+                                               maxLength = spacer[i].length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+sdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minPos = 0;
+                       
+                       for(int i = 0; i < spacer.size(); i++){
+                               string oligo = spacer[i];
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = sdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+sdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                               
+                               int alnLength = oligo.length();
+                               
+                               for(int i=oligo.length()-1;i>=0;i--){
+                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
+                               }
+                               oligo = oligo.substr(0,alnLength);
+                               temp = temp.substr(0,alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minPos = 0;
+                                       for(int i=0;i<alnLength;i++){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > sdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = sdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               seq.setUnaligned(rawSequence.substr(minPos));
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+
                return success;
                
        }