]> git.donarmstrong.com Git - mothur.git/blobdiff - trimoligos.cpp
added oligos class. added check orient parameter to trim.flows, sffinfo, fastq.info...
[mothur.git] / trimoligos.cpp
index 0f2a9cbb0e584ddfa393a53015b02b37ccfa3903..c7016d8b08a5804d1636a9ab8fde27b891f2a0a6 100644 (file)
@@ -800,6 +800,14 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                 
                 if(minDiff > bdiffs)   {       success = minDiff;      }       //no good matches
                 else {
+                    if (m->debug) {
+                        string fMatches = "";
+                        for (int i = 0; i < minFGroup.size(); i++) { for (int j = 0; j < minFGroup[i].size(); j++) { fMatches += toString(minFGroup[i][j]) + " "; } }
+                        m->mothurOut("[DEBUG]: forward matches =  " + fMatches + "\n");
+                        string rMatches = "";
+                        for (int i = 0; i < minRGroup.size(); i++) { for (int j = 0; j < minRGroup[i].size(); j++) { rMatches += toString(minRGroup[i][j]) + " "; } }
+                        m->mothurOut("[DEBUG]: reverse matches =  " + rMatches + "\n");
+                    }
                     bool foundMatch = false;
                     vector<int> matches;
                     for (int i = 0; i < minFGroup.size(); i++) {
@@ -817,6 +825,9 @@ int TrimOligos::stripBarcode(Sequence& forwardSeq, Sequence& reverseSeq, Quality
                         group = matches[0];
                         fStart = minFPos[0];
                         rStart = minRPos[0];
+                        if (m->debug) { m->mothurOut("[DEBUG]: found match = " + toString(matches[0]) + ".\n");  }
+                    }else {
+                        if (m->debug) { m->mothurOut("[DEBUG]: failed too many matches.\n");  }
                     }
                     
                     //we have an acceptable match for the forward and reverse, but do they match?
@@ -1065,6 +1076,222 @@ int TrimOligos::stripPairedBarcode(Sequence& seq, QualityScores& qual, int& grou
         exit(1);
     }
     
+}
+//*******************************************************************/
+int TrimOligos::stripPairedBarcode(Sequence& seq, int& group){
+    try {
+        //look for forward barcode
+        string rawSeq = seq.getUnaligned();
+        int success = bdiffs + 1;      //guilty until proven innocent
+        //cout << seq.getName() << endl;
+        //can you find the forward barcode
+        for(map<int,oligosPair>::iterator it=ipbarcodes.begin();it!=ipbarcodes.end();it++){
+            string foligo = it->second.forward;
+            string roligo = it->second.reverse;
+            //cout << it->first << '\t' << foligo << '\t' << roligo << endl;
+            //cout << it->first << '\t' << rawSeq.substr(0,foligo.length()) << '\t' << rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()) << endl;
+            if(rawSeq.length() < foligo.length()){     //let's just assume that the barcodes are the same length
+                success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+                break;
+            }
+            if(rawSeq.length() < roligo.length()){     //let's just assume that the barcodes are the same length
+                success = bdiffs + 10; //if the sequence is shorter than the barcode then bail out
+                break;
+            }
+            
+            if (rawSeq.length() < (foligo.length() + roligo.length())) {  success = pdiffs + 10; break; }
+            
+            if((compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length())))) {
+                group = it->first;
+                string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode
+                seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode
+                success = 0;
+                break;
+            }
+        }
+        //cout << "success=" << success << endl;
+        //if you found the barcode or if you don't want to allow for diffs
+        if ((bdiffs == 0) || (success == 0)) { return success; }
+        else { //try aligning and see if you can find it
+            Alignment* alignment;
+            if (ifbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFBarcodeLength+bdiffs+1)); }
+            else{ alignment = NULL; }
+            
+            //can you find the barcode
+            int minDiff = 1e6;
+            int minCount = 1;
+            vector< vector<int> > minFGroup;
+            vector<int> minFPos;
+            
+            //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
+            /*
+             1 Sarah Westcott
+             2 John Westcott
+             3 Anna Westcott
+             4 Sarah Schloss
+             5 Pat Schloss
+             6 Gail Brown
+             7 Pat Moore
+             only want to look for forward = Sarah, John, Anna, Pat, Gail
+             reverse = Westcott, Schloss, Brown, Moore
+             but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
+             */
+            //cout << endl << forwardSeq.getName() << endl;
+            for(map<string, vector<int> >::iterator it=ifbarcodes.begin();it!=ifbarcodes.end();it++){
+                string oligo = it->first;
+                
+                if(rawSeq.length() < maxFBarcodeLength){       //let's just assume that the barcodes are the same length
+                    success = bdiffs + 10;
+                    break;
+                }
+                //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+bdiffs) << endl;
+                //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+bdiffs));
+                oligo = alignment->getSeqAAln();
+                string temp = alignment->getSeqBAln();
+                
+                int alnLength = oligo.length();
+                
+                for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){     alnLength = i+1;        break;  } }
+                oligo = oligo.substr(0,alnLength);
+                temp = temp.substr(0,alnLength);
+                int numDiff = countDiffs(oligo, temp);
+                
+                if (alnLength == 0) { numDiff = bdiffs + 100; }
+                //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
+                
+                if(numDiff < minDiff){
+                    minDiff = numDiff;
+                    minCount = 1;
+                    minFGroup.clear();
+                    minFGroup.push_back(it->second);
+                    int tempminFPos = 0;
+                    minFPos.clear();
+                    for(int i=0;i<alnLength;i++){
+                        if(temp[i] != '-'){
+                            tempminFPos++;
+                        }
+                    }
+                    minFPos.push_back(tempminFPos);
+                }else if(numDiff == minDiff){
+                    minFGroup.push_back(it->second);
+                    int tempminFPos = 0;
+                    for(int i=0;i<alnLength;i++){
+                        if(temp[i] != '-'){
+                            tempminFPos++;
+                        }
+                    }
+                    minFPos.push_back(tempminFPos);
+                }
+            }
+            
+            //cout << minDiff << '\t' << minCount << '\t' << endl;
+            if(minDiff > bdiffs)       {       success = minDiff;      }       //no good matches
+            else{
+                //check for reverse match
+                if (alignment != NULL) { delete alignment; }
+                
+                if (irbarcodes.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRBarcodeLength+bdiffs+1)); }
+                else{ alignment = NULL; }
+                
+                //can you find the barcode
+                minDiff = 1e6;
+                minCount = 1;
+                vector< vector<int> > minRGroup;
+                vector<int> minRPos;
+                
+                string rawRSequence = reverseOligo(seq.getUnaligned());
+                //cout << irbarcodes.size() << '\t' << maxRBarcodeLength << endl;
+                for(map<string, vector<int> >::iterator it=irbarcodes.begin();it!=irbarcodes.end();it++){
+                    string oligo = reverseOligo(it->first);
+                    //cout << "r before = " << reverseOligo(oligo) << '\t' << reverseOligo(rawRSequence.substr(0,oligo.length()+bdiffs)) << endl;
+                    if(rawRSequence.length() < maxRBarcodeLength){     //let's just assume that the barcodes are the same length
+                        success = bdiffs + 10;
+                        break;
+                    }
+                    
+                    //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                    alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+bdiffs));
+                    oligo = alignment->getSeqAAln();
+                    string temp = alignment->getSeqBAln();
+                    
+                    int alnLength = oligo.length();
+                    for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1;        break;  } }
+                    oligo = oligo.substr(0,alnLength);
+                    temp = temp.substr(0,alnLength);
+                    int numDiff = countDiffs(oligo, temp);
+                    if (alnLength == 0) { numDiff = bdiffs + 100; }
+                    
+                    //cout << "r after = " << reverseOligo(oligo) << '\t' << reverseOligo(temp) << '\t' << numDiff << endl;
+                    if(numDiff < minDiff){
+                        minDiff = numDiff;
+                        minCount = 1;
+                        minRGroup.clear();
+                        minRGroup.push_back(it->second);
+                        int tempminRPos = 0;
+                        minRPos.clear();
+                        for(int i=0;i<alnLength;i++){
+                            if(temp[i] != '-'){
+                                tempminRPos++;
+                            }
+                        }
+                        minRPos.push_back(tempminRPos);
+                    }else if(numDiff == minDiff){
+                        int tempminRPos = 0;
+                        for(int i=0;i<alnLength;i++){
+                            if(temp[i] != '-'){
+                                tempminRPos++;
+                            }
+                        }
+                        minRPos.push_back(tempminRPos);
+                        minRGroup.push_back(it->second);
+                    }
+                    
+                }
+                
+                if(minDiff > bdiffs)   {       success = minDiff;      }       //no good matches
+                else {
+                    bool foundMatch = false;
+                    vector<int> matches;
+                    for (int i = 0; i < minFGroup.size(); i++) {
+                        for (int j = 0; j < minFGroup[i].size(); j++) {
+                            for (int k = 0; k < minRGroup.size(); k++) {
+                                if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
+                            }
+                        }
+                    }
+                    
+                    int fStart = 0;
+                    int rStart = 0;
+                    if (matches.size() == 1) {
+                        foundMatch = true;
+                        group = matches[0];
+                        fStart = minFPos[0];
+                        rStart = rawSeq.length() - minRPos[0];
+                        if (fStart > rStart) { foundMatch = false; } //only barcodes not a good sequence
+                    }
+                    
+                    //we have an acceptable match for the forward and reverse, but do they match?
+                    if (foundMatch) {
+                        string trimmedSeq = rawSeq.substr(0, rStart); //trim reverse barcode
+                        seq.setUnaligned(trimmedSeq.substr(fStart)); //trim forward barcode
+                        success = minDiff;
+                        //cout << "barcode = " << ipbarcodes[group].forward << '\t' << ipbarcodes[group].reverse << endl;
+                    }else { success = bdiffs + 100;    }
+                }
+            }
+            
+            if (alignment != NULL) { delete alignment; }
+        }
+        
+        return success;
+        
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrimOligos", "stripPairedBarcode");
+        exit(1);
+    }
+    
 }
 //*******************************************************************/
 int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& group, bool keepForward){
@@ -1295,7 +1522,223 @@ int TrimOligos::stripPairedPrimers(Sequence& seq, QualityScores& qual, int& grou
     }
     
 }
-
+//*******************************************************************/
+int TrimOligos::stripPairedPrimers(Sequence& seq, int& group){
+    try {
+        //look for forward
+        string rawSeq = seq.getUnaligned();
+        int success = pdiffs + 1;      //guilty until proven innocent
+        //cout << seq.getName() << endl;
+        //can you find the forward
+        for(map<int,oligosPair>::iterator it=ipprimers.begin();it!=ipprimers.end();it++){
+            string foligo = it->second.forward;
+            string roligo = it->second.reverse;
+            
+            //cout << it->first << '\t' << foligo << '\t' << roligo << endl;
+            //cout << it->first << '\t' << rawSeq.substr(0,foligo.length()) << '\t' << rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length()) << endl;
+            if(rawSeq.length() < foligo.length()){     //let's just assume that the barcodes are the same length
+                success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
+                break;
+            }
+            if(rawSeq.length() < roligo.length()){     //let's just assume that the barcodes are the same length
+                success = pdiffs + 10; //if the sequence is shorter than the barcode then bail out
+                break;
+            }
+            
+            if (rawSeq.length() < (foligo.length() + roligo.length())) {  success = pdiffs + 10; break; }
+            
+            if((compareDNASeq(foligo, rawSeq.substr(0,foligo.length()))) && (compareDNASeq(roligo, rawSeq.substr(rawSeq.length()-roligo.length(),roligo.length())))) {
+                group = it->first;
+                string trimmedSeq = rawSeq.substr(foligo.length()); //trim forward barcode
+                seq.setUnaligned(trimmedSeq.substr(0,(trimmedSeq.length()-roligo.length()))); //trim reverse barcode
+                success = 0;
+                break;
+            }
+        }
+        //cout << "success=" << success << endl;
+        //if you found the barcode or if you don't want to allow for diffs
+        if ((pdiffs == 0) || (success == 0)) { return success; }
+        else { //try aligning and see if you can find it
+            Alignment* alignment;
+            if (ifprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxFPrimerLength+pdiffs+1)); }
+            else{ alignment = NULL; }
+            
+            //can you find the barcode
+            int minDiff = 1e6;
+            int minCount = 1;
+            vector< vector<int> > minFGroup;
+            vector<int> minFPos;
+            
+            //the pair can have duplicates, but we only want to search for the unique forward and reverses, example
+            /*
+             1 Sarah Westcott
+             2 John Westcott
+             3 Anna Westcott
+             4 Sarah Schloss
+             5 Pat Schloss
+             6 Gail Brown
+             7 Pat Moore
+             only want to look for forward = Sarah, John, Anna, Pat, Gail
+             reverse = Westcott, Schloss, Brown, Moore
+             but if best match forward = 4, and reverse = 1, we want to count as a valid match because forward 1 and forward 4 are the same. so both barcodes map to same group.
+             */
+            //cout << endl << forwardSeq.getName() << endl;
+            for(map<string, vector<int> >::iterator it=ifprimers.begin();it!=ifprimers.end();it++){
+                string oligo = it->first;
+                
+                if(rawSeq.length() < maxFPrimerLength){        //let's just assume that the barcodes are the same length
+                    success = pdiffs + 10;
+                    break;
+                }
+                //cout << "before = " << oligo << '\t' << rawSeq.substr(0,oligo.length()+pdiffs) << endl;
+                //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                alignment->alignPrimer(oligo, rawSeq.substr(0,oligo.length()+pdiffs));
+                oligo = alignment->getSeqAAln();
+                string temp = alignment->getSeqBAln();
+                
+                int alnLength = oligo.length();
+                
+                for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){     alnLength = i+1;        break;  } }
+                oligo = oligo.substr(0,alnLength);
+                temp = temp.substr(0,alnLength);
+                int numDiff = countDiffs(oligo, temp);
+                
+                if (alnLength == 0) { numDiff = pdiffs + 100; }
+                //cout << "after = " << oligo << '\t' << temp << '\t' << numDiff << endl;
+                
+                if(numDiff < minDiff){
+                    minDiff = numDiff;
+                    minCount = 1;
+                    minFGroup.clear();
+                    minFGroup.push_back(it->second);
+                    int tempminFPos = 0;
+                    minFPos.clear();
+                    for(int i=0;i<alnLength;i++){
+                        if(temp[i] != '-'){
+                            tempminFPos++;
+                        }
+                    }
+                    minFPos.push_back(tempminFPos);
+                }else if(numDiff == minDiff){
+                    minFGroup.push_back(it->second);
+                    int tempminFPos = 0;
+                    for(int i=0;i<alnLength;i++){
+                        if(temp[i] != '-'){
+                            tempminFPos++;
+                        }
+                    }
+                    minFPos.push_back(tempminFPos);
+                }
+            }
+            
+            //cout << minDiff << '\t' << minCount << '\t' << endl;
+            if(minDiff > pdiffs)       {       success = minDiff;      }       //no good matches
+            else{
+                //check for reverse match
+                if (alignment != NULL) { delete alignment; }
+                
+                if (irprimers.size() > 0) { alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxRPrimerLength+pdiffs+1)); }
+                else{ alignment = NULL; }
+                
+                //can you find the barcode
+                minDiff = 1e6;
+                minCount = 1;
+                vector< vector<int> > minRGroup;
+                vector<int> minRPos;
+                
+                string rawRSequence = reverseOligo(seq.getUnaligned());
+                
+                for(map<string, vector<int> >::iterator it=irprimers.begin();it!=irprimers.end();it++){
+                    string oligo = reverseOligo(it->first);
+                    //cout << "r before = " << reverseOligo(oligo) << '\t' << reverseOligo(rawRSequence.substr(0,oligo.length()+pdiffs)) << endl;
+                    if(rawRSequence.length() < maxRPrimerLength){      //let's just assume that the barcodes are the same length
+                        success = pdiffs + 10;
+                        break;
+                    }
+                    
+                    //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                    alignment->alignPrimer(oligo, rawRSequence.substr(0,oligo.length()+pdiffs));
+                    oligo = alignment->getSeqAAln();
+                    string temp = alignment->getSeqBAln();
+                    
+                    int alnLength = oligo.length();
+                    for(int i=oligo.length()-1;i>=0;i--){ if(oligo[i] != '-'){ alnLength = i+1;        break;  } }
+                    oligo = oligo.substr(0,alnLength);
+                    temp = temp.substr(0,alnLength);
+                    int numDiff = countDiffs(oligo, temp);
+                    if (alnLength == 0) { numDiff = pdiffs + 100; }
+                    
+                    //cout << "r after = " << reverseOligo(oligo) << '\t' << reverseOligo(temp) << '\t' << numDiff << endl;
+                    if(numDiff < minDiff){
+                        minDiff = numDiff;
+                        minCount = 1;
+                        minRGroup.clear();
+                        minRGroup.push_back(it->second);
+                        int tempminRPos = 0;
+                        minRPos.clear();
+                        for(int i=0;i<alnLength;i++){
+                            if(temp[i] != '-'){
+                                tempminRPos++;
+                            }
+                        }
+                        minRPos.push_back(tempminRPos);
+                    }else if(numDiff == minDiff){
+                        int tempminRPos = 0;
+                        for(int i=0;i<alnLength;i++){
+                            if(temp[i] != '-'){
+                                tempminRPos++;
+                            }
+                        }
+                        minRPos.push_back(tempminRPos);
+                        minRGroup.push_back(it->second);
+                    }
+                    
+                }
+                
+                if(minDiff > pdiffs)   {       success = minDiff;      }       //no good matches
+                else {
+                    bool foundMatch = false;
+                    vector<int> matches;
+                    for (int i = 0; i < minFGroup.size(); i++) {
+                        for (int j = 0; j < minFGroup[i].size(); j++) {
+                            for (int k = 0; k < minRGroup.size(); k++) {
+                                if (m->inUsersGroups(minFGroup[i][j], minRGroup[k])) { matches.push_back(minFGroup[i][j]); k+= minRGroup.size(); }
+                            }
+                        }
+                    }
+                    
+                    int fStart = 0;
+                    int rStart = 0;
+                    if (matches.size() == 1) {
+                        foundMatch = true;
+                        group = matches[0];
+                        fStart = minFPos[0];
+                        rStart = rawSeq.length() - minRPos[0];
+                        if (fStart > rStart) { foundMatch = false; } //only primers not a good sequence
+                    }
+                    
+                    //we have an acceptable match for the forward and reverse, but do they match?
+                    if (foundMatch) {
+                        string trimmedSeq = rawSeq.substr(0, rStart); //trim reverse barcode
+                        seq.setUnaligned(trimmedSeq.substr(fStart)); //trim forward barcode
+                        success = minDiff;
+                        //cout << "primer = " << ipprimers[group].forward << '\t' << ipprimers[group].reverse << endl;
+                    }else { success = pdiffs + 100;    }
+                }
+            }
+            
+            if (alignment != NULL) { delete alignment; }
+        }
+        
+        return success;
+        
+    }
+    catch(exception& e) {
+        m->errorOut(e, "TrimOligos", "stripPairedPrimers");
+        exit(1);
+    }
+    
+}
 //*******************************************************************/
 int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, QualityScores& forwardQual, QualityScores& reverseQual, int& group){
     try {
@@ -1733,6 +2176,8 @@ int TrimOligos::stripForward(Sequence& forwardSeq, Sequence& reverseSeq, int& gr
 int TrimOligos::stripBarcode(Sequence& seq, int& group){
     try {
         
+        if (paired) { int success = stripPairedBarcode(seq, group); return success; }
+        
         string rawSequence = seq.getUnaligned();
         int success = bdiffs + 1;      //guilty until proven innocent
         
@@ -1836,6 +2281,8 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
 int TrimOligos::stripForward(Sequence& seq, int& group){
     try {
         
+        if (paired) { int success = stripPairedPrimers(seq, group); return success; }
+        
         string rawSequence = seq.getUnaligned();
         int success = pdiffs + 1;      //guilty until proven innocent