]> git.donarmstrong.com Git - mothur.git/commitdiff
added forward and reverse barcodes to trim.seqs to process illumina seqs
authorSarah Westcott <mothur.westcott@gmail.com>
Tue, 5 Jun 2012 17:08:53 +0000 (13:08 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Tue, 5 Jun 2012 17:08:53 +0000 (13:08 -0400)
trimoligos.cpp
trimoligos.h
trimseqscommand.cpp
trimseqscommand.h

index 8c523ce68d8a042bb702900d468def2aadad157b..2f92cc847ca75e1a0983aeef2d09e9eaab68e57f 100644 (file)
 #include "needlemanoverlap.hpp"
 
 
+/********************************************************************/
+//strip, pdiffs, bdiffs, primers, barcodes, revPrimers
+TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, map<string, int> rbr, vector<string> r, vector<string> lk, vector<string> sp){
+       try {
+               m = MothurOut::getInstance();
+               
+               pdiffs = p;
+               bdiffs = b;
+        ldiffs = l;
+        sdiffs = s;
+               
+               barcodes = br;
+        rbarcodes = rbr;
+               primers = pr;
+               revPrimer = r;
+        linker = lk;
+        spacer = sp;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "TrimOligos");
+               exit(1);
+       }
+}
 /********************************************************************/
 //strip, pdiffs, bdiffs, primers, barcodes, revPrimers
 TrimOligos::TrimOligos(int p, int b, int l, int s, map<string, int> pr, map<string, int> br, vector<string> r, vector<string> lk, vector<string> sp){
@@ -129,8 +152,7 @@ int TrimOligos::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
                                }
                                oligo = oligo.substr(0,alnLength);
                                temp = temp.substr(0,alnLength);
-                               
-                               int numDiff = countDiffs(oligo, temp);
+                int numDiff = countDiffs(oligo, temp);
                                
                                if(numDiff < minDiff){
                                        minDiff = numDiff;
@@ -237,7 +259,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
                                alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
                                oligo = alignment->getSeqAAln();
                                string temp = alignment->getSeqBAln();
-                               
+                                
                                int alnLength = oligo.length();
                                
                                for(int i=oligo.length()-1;i>=0;i--){
@@ -245,7 +267,7 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
                                }
                                oligo = oligo.substr(0,alnLength);
                                temp = temp.substr(0,alnLength);
-                               
+                                
                                int numDiff = countDiffs(oligo, temp);
                                
                                if(numDiff < minDiff){
@@ -285,6 +307,239 @@ int TrimOligos::stripBarcode(Sequence& seq, int& group){
                exit(1);
        }
        
+}
+//*******************************************************************/
+int TrimOligos::stripRBarcode(Sequence& seq, QualityScores& qual, int& group){
+       try {
+               
+               string rawSequence = seq.getUnaligned();
+               int success = bdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the barcode
+               for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                       string oligo = it->first;
+                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
+                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+            
+                       if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+                               group = it->second;
+                               seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+                               
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(-1, rawSequence.length()-oligo.length());
+                               }
+                               
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((bdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (rbarcodes.size() > 0) {
+                               map<string,int>::iterator it; 
+                               
+                               for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                                       if(it->first.length() > maxLength){
+                                               maxLength = it->first.length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minGroup = -1;
+                       int minPos = 0;
+                       
+                       for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                               string oligo = it->first;
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = bdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+     
+                               int alnLength = oligo.length();
+                               
+                               for(int i=0;i<alnLength;i++){
+                                       if(oligo[i] != '-'){    alnLength = i;  break;  }
+                               }
+                               oligo = oligo.substr(alnLength);
+                               temp = temp.substr(alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minGroup = it->second;
+                                       minPos = 0;
+                                       for(int i=alnLength-1;i>=0;i--){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               group = minGroup;
+                               seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+                
+                               if(qual.getName() != ""){
+                                       qual.trimQScores(-1, (rawSequence.length()-minPos));
+                               }
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripRBarcode");
+               exit(1);
+       }
+       
+}
+//*******************************************************************/
+int TrimOligos::stripRBarcode(Sequence& seq, int& group){
+       try {
+               
+               string rawSequence = seq.getUnaligned();
+               int success = bdiffs + 1;       //guilty until proven innocent
+               
+               //can you find the barcode
+               for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                       string oligo = it->first;
+                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
+                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
+                               break;  
+                       }
+            
+                       if(compareDNASeq(oligo, rawSequence.substr((rawSequence.length()-oligo.length()),oligo.length()))){
+                               group = it->second;
+                               seq.setUnaligned(rawSequence.substr(0,(rawSequence.length()-oligo.length())));
+                               
+                               success = 0;
+                               break;
+                       }
+               }
+               
+               //if you found the barcode or if you don't want to allow for diffs
+               if ((bdiffs == 0) || (success == 0)) { return success;  }
+               
+               else { //try aligning and see if you can find it
+                       
+                       int maxLength = 0;
+                       
+                       Alignment* alignment;
+                       if (rbarcodes.size() > 0) {
+                               map<string,int>::iterator it; 
+                               
+                               for(it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                                       if(it->first.length() > maxLength){
+                                               maxLength = it->first.length();
+                                       }
+                               }
+                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
+                               
+                       }else{ alignment = NULL; } 
+                       
+                       //can you find the barcode
+                       int minDiff = 1e6;
+                       int minCount = 1;
+                       int minGroup = -1;
+                       int minPos = 0;
+                       
+                       for(map<string,int>::iterator it=rbarcodes.begin();it!=rbarcodes.end();it++){
+                               string oligo = it->first;
+                               //                              int length = oligo.length();
+                               
+                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
+                                       success = bdiffs + 10;
+                                       break;
+                               }
+                               
+                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
+                               alignment->align(oligo, rawSequence.substr((rawSequence.length()-(oligo.length()+bdiffs)),oligo.length()+bdiffs));
+                               oligo = alignment->getSeqAAln();
+                               string temp = alignment->getSeqBAln();
+                
+                               int alnLength = oligo.length();
+                               
+                               for(int i=0;i<alnLength;i++){
+                                       if(oligo[i] != '-'){    alnLength = i;  break;  }
+                               }
+                               oligo = oligo.substr(alnLength);
+                               temp = temp.substr(alnLength);
+                               
+                               int numDiff = countDiffs(oligo, temp);
+                               
+                               if(numDiff < minDiff){
+                                       minDiff = numDiff;
+                                       minCount = 1;
+                                       minGroup = it->second;
+                                       minPos = 0;
+                                       for(int i=alnLength-1;i>=0;i--){
+                                               if(temp[i] != '-'){
+                                                       minPos++;
+                                               }
+                                       }
+                               }
+                               else if(numDiff == minDiff){
+                                       minCount++;
+                               }
+                               
+                       }
+                       
+                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
+                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
+                       else{                                                                                                   //use the best match
+                               group = minGroup;
+                               seq.setUnaligned(rawSequence.substr(0, (rawSequence.length()-minPos)));
+                
+                               success = minDiff;
+                       }
+                       
+                       if (alignment != NULL) {  delete alignment;  }
+                       
+               }
+               
+               return success;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "TrimOligos", "stripRBarcode");
+               exit(1);
+       }
+       
 }
 //********************************************************************/
 int TrimOligos::stripForward(Sequence& seq, int& group){
index e3ea7d55537e323f3869a6157ff3aed4bbd99eea..a32b3d8e4f2d388b15b3aa68ed66fa61f33c1681 100644 (file)
@@ -20,11 +20,15 @@ class TrimOligos {
        
        public:
         TrimOligos(int,int, map<string, int>, map<string, int>, vector<string>); //pdiffs, bdiffs, primers, barcodes, revPrimers
-               TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimers, linker, spacer
+        TrimOligos(int,int, int, int, map<string, int>, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
+        TrimOligos(int,int, int, int, map<string, int>, map<string, int>, vector<string>, vector<string>, vector<string>); //pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimers, linker, spacer
                ~TrimOligos();
        
                int stripBarcode(Sequence&, int&);      
                int stripBarcode(Sequence&, QualityScores&, int&);
+    
+        int stripRBarcode(Sequence&, int&);    
+        int stripRBarcode(Sequence&, QualityScores&, int&);
        
                int stripForward(Sequence&, int&);
                int stripForward(Sequence&, QualityScores&, int&, bool);
@@ -43,6 +47,7 @@ class TrimOligos {
                int pdiffs, bdiffs, ldiffs, sdiffs;
        
                map<string, int> barcodes;
+        map<string, int> rbarcodes;
                map<string, int> primers;
                vector<string> revPrimer;
         vector<string> linker;
index 9f8fafb24e14a807baa8dd5a981c93556c46e3b7..c019a70e4a2a7d35374192b3f8cca11787e7ecef 100644 (file)
@@ -562,7 +562,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                
                int count = 0;
                bool moreSeqs = 1;
-               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, revPrimer, linker, spacer);
+               TrimOligos trimOligos(pdiffs, bdiffs, ldiffs, sdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer);
        
                while (moreSeqs) {
                                
@@ -607,6 +607,12 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                        if(success > bdiffs)            {       trashCode += 'b';       }
                                        else{ currentSeqsDiffs += success;  }
                                }
+                
+                if(rbarcodes.size() != 0){
+                                       success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+                                       if(success > bdiffs)            {       trashCode += 'b';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
                                
                 if(numSpacers != 0){
                                        success = trimOligos.stripSpacer(currSeq, currQual);
@@ -946,7 +952,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
                                               tempPrimerQualFileNames,
                                               tempNameFileNames,
                                               lines[i].start, lines[i].end, qLines[i].start, qLines[i].end, m,
-                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, 
+                                              pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, rbarcodes, revPrimer, linker, spacer, 
                                              primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast,
                                               qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage,
                                              minLength, maxAmbig, maxHomoP, maxLength, flip, nameMap);
@@ -1187,7 +1193,6 @@ int TrimSeqsCommand::setLines(string filename, string qfilename) {
                 int startIndex =  i * numSeqsPerProcessor;
                 if(i == (processors - 1)){     numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;   }
                 lines.push_back(linePair(fastaFilePos[startIndex], numSeqsPerProcessor));
-                cout << fastaFilePos[startIndex] << '\t' << numSeqsPerProcessor << endl;
                 if (qfilename != "") {  qLines.push_back(linePair(qfileFilePos[startIndex], numSeqsPerProcessor)); }
             }
         }
@@ -1262,7 +1267,29 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                                }
                                else if(type == "BARCODE"){
                                        inOligos >> group;
+                    
+                    //barcode lines can look like   BARCODE   atgcatgc   groupName  - for 454 seqs
+                    //or                            BARCODE   atgcatgc   atgcatgc    groupName  - for illumina data that has forward and reverse info
+                                       string temp = "";
+                    while (!inOligos.eof())    {       
+                                               char c = inOligos.get(); 
+                                               if (c == 10 || c == 13){        break;  }
+                                               else if (c == 32 || c == 9){;} //space or tab
+                                               else {  temp += c;  }
+                                       } 
                                        
+                    //then this is illumina data with 4 columns
+                    if (temp != "") {  
+                        string reverseBarcode = reverseOligo(group); //reverse barcode
+                        group = temp;
+                        
+                        //check for repeat barcodes
+                        map<string, int>::iterator itBar = rbarcodes.find(reverseBarcode);
+                        if (itBar != rbarcodes.end()) { m->mothurOut("barcode " + reverseBarcode + " is in your oligos file already."); m->mothurOutEndLine();  }
+                                               
+                        rbarcodes[reverseBarcode]=indexBarcode; 
+                    }
+                        
                                        //check for repeat barcodes
                                        map<string, int>::iterator itBar = barcodes.find(oligo);
                                        if (itBar != barcodes.end()) { m->mothurOut("barcode " + oligo + " is in your oligos file already."); m->mothurOutEndLine();  }
index 9ba64c4b62b7c7e87f213472b730a91bd0b37305..ba4e61411b8498820139e8ae2b1ed71cb37b0432 100644 (file)
@@ -63,6 +63,7 @@ private:
        vector<string> revPrimer, outputNames;
        set<string> filesToRemove;
        map<string, int> barcodes;
+    map<string, int> rbarcodes;
        vector<string> groupVector;
        map<string, int> primers;
     vector<string>  linker;
@@ -101,6 +102,7 @@ struct trimData {
        double qRollAverage, qThreshold, qWindowAverage, qAverage;
     vector<string> revPrimer;
        map<string, int> barcodes;
+    map<string, int> rbarcodes;
        map<string, int> primers;
     vector<string>  linker;
     vector<string>  spacer;
@@ -112,7 +114,7 @@ struct trimData {
     
        trimData(){}
        trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
-                      int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, 
+                      int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, map<string, int> rbar, vector<string> revP, vector<string> li, vector<string> spa, 
                       vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
                       int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
                       int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
@@ -141,6 +143,7 @@ struct trimData {
         sdiffs = sd;
         tdiffs = td;
         barcodes = bar;
+        rbarcodes = rbar;
         primers = pri;      numFPrimers = primers.size();
         revPrimer = revP;   numRPrimers = revPrimer.size();
         linker = li;        numLinkers = linker.size();
@@ -237,7 +240,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                }
                
                
-               TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
+               TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->rbarcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
         
                pDataArray->count = pDataArray->lineEnd;
                for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
@@ -277,7 +280,13 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                        if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
                                        else{ currentSeqsDiffs += success;  }
                                }
-                               
+                
+                               if(pDataArray->rbarcodes.size() != 0){
+                                       success = trimOligos.stripRBarcode(currSeq, currQual, barcodeIndex);
+                                       if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
+                
                 if(pDataArray->numSpacers != 0){
                                        success = trimOligos.stripSpacer(currSeq, currQual);
                                        if(success > pDataArray->sdiffs)                {       trashCode += 's';       }