]> git.donarmstrong.com Git - mothur.git/blobdiff - trimseqscommand.cpp
working on windows paralellization, added trimOligos class to be used by trim.flows...
[mothur.git] / trimseqscommand.cpp
index a8c3fc697ae10c1c6a117ce68c550a928281ec44..1cbb91a2da487a7f0aa00a0689d393297ba592c0 100644 (file)
@@ -9,6 +9,7 @@
 
 #include "trimseqscommand.h"
 #include "needlemanoverlap.hpp"
+#include "trimoligos.h"
 
 //**********************************************************************************************************************
 vector<string> TrimSeqsCommand::setParameters(){       
@@ -351,8 +352,8 @@ int TrimSeqsCommand::execute(){
                        }
                }
                
-               vector<unsigned long int> fastaFilePos;
-               vector<unsigned long int> qFilePos;
+               vector<unsigned long long> fastaFilePos;
+               vector<unsigned long long> qFilePos;
                
                setLines(fastaFile, qFileName, fastaFilePos, qFilePos);
                
@@ -539,6 +540,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                
                int count = 0;
                bool moreSeqs = 1;
+               TrimOligos trimOligos(pdiffs, bdiffs, primers, barcodes, revPrimer);
        
                while (moreSeqs) {
                                
@@ -572,13 +574,13 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                int primerIndex = 0;
                                
                                if(barcodes.size() != 0){
-                                       success = stripBarcode(currSeq, currQual, barcodeIndex);
+                                       success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
                                        if(success > bdiffs)            {       trashCode += 'b';       }
                                        else{ currentSeqsDiffs += success;  }
                                }
                                
                                if(numFPrimers != 0){
-                                       success = stripForward(currSeq, currQual, primerIndex);
+                                       success = trimOligos.stripForward(currSeq, currQual, primerIndex);
                                        if(success > pdiffs)            {       trashCode += 'f';       }
                                        else{ currentSeqsDiffs += success;  }
                                }
@@ -586,7 +588,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                if (currentSeqsDiffs > tdiffs)  {       trashCode += 't';   }
                                
                                if(numRPrimers != 0){
-                                       success = stripReverse(currSeq, currQual);
+                                       success = trimOligos.stripReverse(currSeq, currQual);
                                        if(!success)                            {       trashCode += 'r';       }
                                }
 
@@ -722,7 +724,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                        }
                        
                        #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                               unsigned long int pos = inFASTA.tellg();
+                               unsigned long long pos = inFASTA.tellg();
                                if ((pos == -1) || (pos >= line->end)) { break; }
                        
                        #else
@@ -938,7 +940,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName
 
 /**************************************************************************************************/
 
-int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long int>& fastaFilePos, vector<unsigned long int>& qfileFilePos) {
+int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned long long>& fastaFilePos, vector<unsigned long long>& qfileFilePos) {
        try {
                #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                //set file positions for fasta file
@@ -977,7 +979,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned
                                        map<string, int>::iterator it = firstSeqNames.find(sname);
                                        
                                        if(it != firstSeqNames.end()) { //this is the start of a new chunk
-                                               unsigned long int pos = inQual.tellg(); 
+                                               unsigned long long pos = inQual.tellg(); 
                                                qfileFilePos.push_back(pos - input.length() - 1);       
                                                firstSeqNames.erase(it);
                                        }
@@ -999,7 +1001,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned
 
                //get last file position of qfile
                FILE * pFile;
-               unsigned long int size;
+               unsigned long long size;
                
                //get num bytes in file
                pFile = fopen (qfilename.c_str(),"rb");
@@ -1019,7 +1021,7 @@ int TrimSeqsCommand::setLines(string filename, string qfilename, vector<unsigned
                        fastaFilePos.push_back(0); qfileFilePos.push_back(0);
                        //get last file position of fastafile
                        FILE * pFile;
-                       unsigned long int size;
+                       unsigned long long size;
                        
                        //get num bytes in file
                        pFile = fopen (filename.c_str(),"rb");
@@ -1240,279 +1242,6 @@ bool TrimSeqsCommand::getOligos(vector<vector<string> >& fastaFileNames, vector<
                exit(1);
        }
 }
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::stripBarcode(Sequence& seq, QualityScores& qual, int& group){
-       try {
-               
-               string rawSequence = seq.getUnaligned();
-               int success = bdiffs + 1;       //guilty until proven innocent
-               
-               //can you find the barcode
-               for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
-                       string oligo = it->first;
-                       if(rawSequence.length() < oligo.length()){      //let's just assume that the barcodes are the same length
-                               success = bdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
-                               break;  
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
-                               group = it->second;
-                               seq.setUnaligned(rawSequence.substr(oligo.length()));
-                               
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(oligo.length(), -1);
-                               }
-                               
-                               success = 0;
-                               break;
-                       }
-               }
-               
-               //if you found the barcode or if you don't want to allow for diffs
-               if ((bdiffs == 0) || (success == 0)) { return success;  }
-               
-               else { //try aligning and see if you can find it
-
-                       int maxLength = 0;
-
-                       Alignment* alignment;
-                       if (barcodes.size() > 0) {
-                               map<string,int>::iterator it=barcodes.begin();
-
-                               for(it;it!=barcodes.end();it++){
-                                       if(it->first.length() > maxLength){
-                                               maxLength = it->first.length();
-                                       }
-                               }
-                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+bdiffs+1));  
-
-                       }else{ alignment = NULL; } 
-                       
-                       //can you find the barcode
-                       int minDiff = 1e6;
-                       int minCount = 1;
-                       int minGroup = -1;
-                       int minPos = 0;
-                       
-                       for(map<string,int>::iterator it=barcodes.begin();it!=barcodes.end();it++){
-                               string oligo = it->first;
-//                             int length = oligo.length();
-                               
-                               if(rawSequence.length() < maxLength){   //let's just assume that the barcodes are the same length
-                                       success = bdiffs + 10;
-                                       break;
-                               }
-                               
-                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
-                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+bdiffs));
-                               oligo = alignment->getSeqAAln();
-                               string temp = alignment->getSeqBAln();
-               
-                               int alnLength = oligo.length();
-                               
-                               for(int i=oligo.length()-1;i>=0;i--){
-                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
-                               }
-                               oligo = oligo.substr(0,alnLength);
-                               temp = temp.substr(0,alnLength);
-                               
-                               int numDiff = countDiffs(oligo, temp);
-                               
-                               if(numDiff < minDiff){
-                                       minDiff = numDiff;
-                                       minCount = 1;
-                                       minGroup = it->second;
-                                       minPos = 0;
-                                       for(int i=0;i<alnLength;i++){
-                                               if(temp[i] != '-'){
-                                                       minPos++;
-                                               }
-                                       }
-                               }
-                               else if(numDiff == minDiff){
-                                       minCount++;
-                               }
-
-                       }
-
-                       if(minDiff > bdiffs)    {       success = minDiff;              }       //no good matches
-                       else if(minCount > 1)   {       success = bdiffs + 100; }       //can't tell the difference between multiple barcodes
-                       else{                                                                                                   //use the best match
-                               group = minGroup;
-                               seq.setUnaligned(rawSequence.substr(minPos));
-                               
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(minPos, -1);
-                               }
-                               success = minDiff;
-                       }
-                       
-                       if (alignment != NULL) {  delete alignment;  }
-                       
-               }
-       
-               return success;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "stripBarcode");
-               exit(1);
-       }
-
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::stripForward(Sequence& seq, QualityScores& qual, int& group){
-       try {
-               string rawSequence = seq.getUnaligned();
-               int success = pdiffs + 1;       //guilty until proven innocent
-               
-               //can you find the primer
-               for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
-                       string oligo = it->first;
-                       if(rawSequence.length() < oligo.length()){      //let's just assume that the primers are the same length
-                               success = pdiffs + 10;                                  //if the sequence is shorter than the barcode then bail out
-                               break;  
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(0,oligo.length()))){
-                               group = it->second;
-                               seq.setUnaligned(rawSequence.substr(oligo.length()));
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(oligo.length(), -1);
-                               }
-                               success = 0;
-                               break;
-                       }
-               }
-
-               //if you found the barcode or if you don't want to allow for diffs
-               if ((pdiffs == 0) || (success == 0)) { return success;  }
-               
-               else { //try aligning and see if you can find it
-
-                       int maxLength = 0;
-
-                       Alignment* alignment;
-                       if (primers.size() > 0) {
-                               map<string,int>::iterator it=primers.begin();
-
-                               for(it;it!=primers.end();it++){
-                                       if(it->first.length() > maxLength){
-                                               maxLength = it->first.length();
-                                       }
-                               }
-                               alignment = new NeedlemanOverlap(-1.0, 1.0, -1.0, (maxLength+pdiffs+1));  
-
-                       }else{ alignment = NULL; } 
-                       
-                       //can you find the barcode
-                       int minDiff = 1e6;
-                       int minCount = 1;
-                       int minGroup = -1;
-                       int minPos = 0;
-                       
-                       for(map<string,int>::iterator it=primers.begin();it!=primers.end();it++){
-                               string oligo = it->first;
-//                             int length = oligo.length();
-                               
-                               if(rawSequence.length() < maxLength){   
-                                       success = pdiffs + 100;
-                                       break;
-                               }
-                               
-                               //use needleman to align first barcode.length()+numdiffs of sequence to each barcode
-                               alignment->align(oligo, rawSequence.substr(0,oligo.length()+pdiffs));
-                               oligo = alignment->getSeqAAln();
-                               string temp = alignment->getSeqBAln();
-               
-                               int alnLength = oligo.length();
-                               
-                               for(int i=oligo.length()-1;i>=0;i--){
-                                       if(oligo[i] != '-'){    alnLength = i+1;        break;  }
-                               }
-                               oligo = oligo.substr(0,alnLength);
-                               temp = temp.substr(0,alnLength);
-                               
-                               int numDiff = countDiffs(oligo, temp);
-                               
-                               if(numDiff < minDiff){
-                                       minDiff = numDiff;
-                                       minCount = 1;
-                                       minGroup = it->second;
-                                       minPos = 0;
-                                       for(int i=0;i<alnLength;i++){
-                                               if(temp[i] != '-'){
-                                                       minPos++;
-                                               }
-                                       }
-                               }
-                               else if(numDiff == minDiff){
-                                       minCount++;
-                               }
-
-                       }
-
-                       if(minDiff > pdiffs)    {       success = minDiff;              }       //no good matches
-                       else if(minCount > 1)   {       success = pdiffs + 10;  }       //can't tell the difference between multiple primers
-                       else{                                                                                                   //use the best match
-                               group = minGroup;
-                               seq.setUnaligned(rawSequence.substr(minPos));
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(minPos, -1);
-                               }
-                               success = minDiff;
-                       }
-                       
-                       if (alignment != NULL) {  delete alignment;  }
-                       
-               }
-               
-               return success;
-
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "stripForward");
-               exit(1);
-       }
-}
-
-//***************************************************************************************************************
-
-bool TrimSeqsCommand::stripReverse(Sequence& seq, QualityScores& qual){
-       try {
-               string rawSequence = seq.getUnaligned();
-               bool success = 0;       //guilty until proven innocent
-               
-               for(int i=0;i<numRPrimers;i++){
-                       string oligo = revPrimer[i];
-                       
-                       if(rawSequence.length() < oligo.length()){
-                               success = 0;
-                               break;
-                       }
-                       
-                       if(compareDNASeq(oligo, rawSequence.substr(rawSequence.length()-oligo.length(),oligo.length()))){
-                               seq.setUnaligned(rawSequence.substr(0,rawSequence.length()-oligo.length()));
-                               if(qual.getName() != ""){
-                                       qual.trimQScores(-1, rawSequence.length()-oligo.length());
-                               }
-                               success = 1;
-                               break;
-                       }
-               }       
-               return success;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "stripReverse");
-               exit(1);
-       }
-}
-
 //***************************************************************************************************************
 
 bool TrimSeqsCommand::keepFirstTrim(Sequence& sequence, QualityScores& qscores){
@@ -1618,80 +1347,4 @@ bool TrimSeqsCommand::cullAmbigs(Sequence& seq){
        }
        
 }
-
-//***************************************************************************************************************
-
-bool TrimSeqsCommand::compareDNASeq(string oligo, string seq){
-       try {
-               bool success = 1;
-               int length = oligo.length();
-               
-               for(int i=0;i<length;i++){
-                       
-                       if(oligo[i] != seq[i]){
-                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C')    {       success = 0;    }
-                               else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       success = 0;    }
-                               else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       success = 0;    }
-                               else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
-                               else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       success = 0;    }
-                               else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       success = 0;    }
-                               else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       success = 0;    }                       
-                               
-                               if(success == 0)        {       break;   }
-                       }
-                       else{
-                               success = 1;
-                       }
-               }
-               
-               return success;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "compareDNASeq");
-               exit(1);
-       }
-
-}
-
-//***************************************************************************************************************
-
-int TrimSeqsCommand::countDiffs(string oligo, string seq){
-       try {
-
-               int length = oligo.length();
-               int countDiffs = 0;
-               
-               for(int i=0;i<length;i++){
-                                                               
-                       if(oligo[i] != seq[i]){
-                               if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C' || oligo[i] == '-' || oligo[i] == '.')      {       countDiffs++;   }
-                               else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N'))                                {       countDiffs++;   }
-                               else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G'))                                    {       countDiffs++;   }
-                               else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C'))   {       countDiffs++;   }
-                               else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G'))   {       countDiffs++;   }       
-                       }
-                       
-               }
-               
-               return countDiffs;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "TrimSeqsCommand", "countDiffs");
-               exit(1);
-       }
-
-}
-
 //***************************************************************************************************************