]> git.donarmstrong.com Git - mothur.git/blobdiff - trimseqscommand.h
paralellized trim.seqs for windows. fixed bug in trim.flows that stopped fasta data...
[mothur.git] / trimseqscommand.h
index b6050f2ee58a3db6f56714e7f313fa9ed93b77b5..5006ce923d00183b0bc22ed08adbfaa6240df247 100644 (file)
@@ -15,6 +15,8 @@
 #include "sequence.hpp"
 #include "qualityscores.h"
 #include "groupmap.h"
+#include "trimoligos.h"
+
 
 class TrimSeqsCommand : public Command {
 public:
@@ -35,13 +37,14 @@ public:
 private:
        
        GroupMap* groupMap;
-       
-       struct linePair {
-               unsigned long long start;
-               unsigned long long end;
-               linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
-       };
-       
+    
+    struct linePair {
+        unsigned long long start;
+        unsigned long long end;
+        linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
+        linePair() {}
+    };
+
        bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
        bool keepFirstTrim(Sequence&, QualityScores&);
        bool removeLastTrim(Sequence&, QualityScores&);
@@ -71,12 +74,418 @@ private:
        map<string, string> nameMap;
 
        vector<int> processIDS;   //processid
-       vector<linePair*> lines;
-       vector<linePair*> qLines;
+       vector<linePair> lines;
+       vector<linePair> qLines;
        
-       int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair*, linePair*);  
+       int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
        int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
-       int setLines(string, string, vector<unsigned long long>&, vector<unsigned long long>&);
+       int setLines(string, string);
+};
+
+/**************************************************************************************************/
+//custom data structure for threads to use.
+// This is passed by void pointer so it can be any data type
+// that can be passed using a single void pointer (LPVOID).
+struct trimData {
+    unsigned long long start, end;
+    MothurOut* m;
+    string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
+       vector<vector<string> > fastaFileNames;
+    vector<vector<string> > qualFileNames;
+    vector<vector<string> > nameFileNames;
+    unsigned long long lineStart, lineEnd, qlineStart, qlineEnd;
+    bool flip, allFiles, qtrim, keepforward, createGroup;
+       int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs;
+       int qWindowSize, qWindowStep, keepFirst, removeLast, count;
+       double qRollAverage, qThreshold, qWindowAverage, qAverage;
+    vector<string> revPrimer;
+       map<string, int> barcodes;
+       map<string, int> primers;
+    vector<string>  linker;
+    vector<string>  spacer;
+       map<string, int> combos;
+       vector<string> primerNameVector;        
+       vector<string> barcodeNameVector;       
+       map<string, int> groupCounts;  
+       map<string, string> nameMap;
+    
+       trimData(){}
+       trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
+                      int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, 
+                      vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
+                      int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
+                      int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
+        filename = fn;
+        qFileName = qn;
+        nameFile = nf;
+        trimFileName = tn;
+        scrapFileName = sn;
+        trimQFileName = tqn;
+        scrapQFileName = sqn;
+        trimNFileName = tnn;
+        scrapNFileName = snn;
+        groupFileName = gn;
+        fastaFileNames = ffn;
+        qualFileNames = qfn;
+        nameFileNames = nfn;
+        lineStart = lstart;
+        lineEnd = lend;
+        qlineStart = qstart;
+        qlineEnd = qend;
+               m = mout;
+        
+        pdiffs = pd;
+        bdiffs = bd;
+        ldiffs = ld;
+        sdiffs = sd;
+        tdiffs = td;
+        barcodes = bar;
+        primers = pri;      numFPrimers = primers.size();
+        revPrimer = revP;   numRPrimers = revPrimer.size();
+        linker = li;        numLinkers = linker.size();
+        spacer = spa;       numSpacers = spacer.size();
+        primerNameVector = priNameVector;
+        barcodeNameVector = barNameVector;
+        
+        createGroup = cGroup;
+        allFiles = aFiles;
+        keepforward = keepF;
+        keepFirst = keepfi;
+        removeLast = removeL;
+        qWindowStep = WindowStep;
+        qWindowSize = WindowSize;
+        qWindowAverage = WindowAverage;
+        qtrim = trim;
+        qThreshold = Threshold;
+        qAverage = Average;
+        qRollAverage = RollAverage;
+        minLength = minL;
+        maxAmbig = maxA;
+        maxHomoP = maxH;
+        maxLength = maxL;
+        flip = fli;
+        nameMap = nm;
+        count = 0;
+       }
 };
+/**************************************************************************************************/
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+#else
+static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ 
+       trimData* pDataArray;
+       pDataArray = (trimData*)lpParam;
+       
+       try {
+        ofstream trimFASTAFile;
+               pDataArray->m->openOutputFile(pDataArray->trimFileName, trimFASTAFile);
+               
+               ofstream scrapFASTAFile;
+               pDataArray->m->openOutputFile(pDataArray->scrapFileName, scrapFASTAFile);
+               
+               ofstream trimQualFile;
+               ofstream scrapQualFile;
+               if(pDataArray->qFileName != ""){
+                       pDataArray->m->openOutputFile(pDataArray->trimQFileName, trimQualFile);
+                       pDataArray->m->openOutputFile(pDataArray->scrapQFileName, scrapQualFile);
+               }
+               
+               ofstream trimNameFile;
+               ofstream scrapNameFile;
+               if(pDataArray->nameFile != ""){
+                       pDataArray->m->openOutputFile(pDataArray->trimNFileName, trimNameFile);
+                       pDataArray->m->openOutputFile(pDataArray->scrapNFileName, scrapNameFile);
+               }
+               
+               
+               ofstream outGroupsFile;
+               if (pDataArray->createGroup){   pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
+               if(pDataArray->allFiles){
+                       for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
+                               for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
+                                       if (pDataArray->fastaFileNames[i][j] != "") {
+                                               ofstream temp;
+                                               pDataArray->m->openOutputFile(pDataArray->fastaFileNames[i][j], temp);                  temp.close();
+                                               if(pDataArray->qFileName != ""){
+                                                       pDataArray->m->openOutputFile(pDataArray->qualFileNames[i][j], temp);                   temp.close();
+                                               }
+                                               
+                                               if(pDataArray->nameFile != ""){
+                                                       pDataArray->m->openOutputFile(pDataArray->nameFileNames[i][j], temp);                   temp.close();
+                                               }
+                                       }
+                               }
+                       }
+               }
+               
+               ifstream inFASTA;
+               pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
+               if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
+                       inFASTA.seekg(0);
+               }else { //this accounts for the difference in line endings. 
+                       inFASTA.seekg(pDataArray->lineStart-1); pDataArray->m->gobble(inFASTA); 
+               }
+               
+               ifstream qFile;
+               if(pDataArray->qFileName != "") {
+                       pDataArray->m->openInputFile(pDataArray->qFileName, qFile);
+                       if ((pDataArray->qlineStart == 0) || (pDataArray->qlineStart == 1)) {
+                qFile.seekg(0);
+            }else { //this accounts for the difference in line endings. 
+                qFile.seekg(pDataArray->qlineStart-1); pDataArray->m->gobble(qFile); 
+            } 
+               }
+               
+               
+               TrimOligos trimOligos(pDataArray->pdiffs, pDataArray->bdiffs, pDataArray->ldiffs, pDataArray->sdiffs, pDataArray->primers, pDataArray->barcodes, pDataArray->revPrimer, pDataArray->linker, pDataArray->spacer);
+        
+               pDataArray->count = pDataArray->lineEnd;
+               for(int i = 0; i < pDataArray->lineEnd; i++){ //end is the number of sequences to process
+                                  
+                       if (pDataArray->m->control_pressed) { 
+                               inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
+                               if (pDataArray->createGroup) {   outGroupsFile.close();   }
+                               if(pDataArray->qFileName != ""){ qFile.close(); }
+                               return 0;
+                       }
+                       
+                       int success = 1;
+                       string trashCode = "";
+                       int currentSeqsDiffs = 0;
+            
+                       Sequence currSeq(inFASTA); pDataArray->m->gobble(inFASTA);
+                       
+                       QualityScores currQual;
+                       if(pDataArray->qFileName != ""){
+                               currQual = QualityScores(qFile);  pDataArray->m->gobble(qFile);
+                       }
+                       
+                       string origSeq = currSeq.getUnaligned();
+                       if (origSeq != "") {
+                               
+                               int barcodeIndex = 0;
+                               int primerIndex = 0;
+                               
+                if(pDataArray->numLinkers != 0){
+                                       success = trimOligos.stripLinker(currSeq, currQual);
+                                       if(success > pDataArray->ldiffs)                {       trashCode += 'k';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
+                
+                               if(pDataArray->barcodes.size() != 0){
+                                       success = trimOligos.stripBarcode(currSeq, currQual, barcodeIndex);
+                                       if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
+                               
+                if(pDataArray->numSpacers != 0){
+                                       success = trimOligos.stripSpacer(currSeq, currQual);
+                                       if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
+                                       else{ currentSeqsDiffs += success;  }
+
+                               }
+                
+                               if(pDataArray->numFPrimers != 0){
+                                       success = trimOligos.stripForward(currSeq, currQual, primerIndex, pDataArray->keepforward);
+                                       if(success > pDataArray->pdiffs)                {       trashCode += 'f';       }
+                                       else{ currentSeqsDiffs += success;  }
+                               }
+                               
+                               if (currentSeqsDiffs > pDataArray->tdiffs)      {       trashCode += 't';   }
+                               
+                               if(pDataArray->numRPrimers != 0){
+                                       success = trimOligos.stripReverse(currSeq, currQual);
+                                       if(!success)                            {       trashCode += 'r';       }
+                               }
+                
+                               if(pDataArray->keepFirst != 0){
+                                       //success = keepFirstTrim(currSeq, currQual);
+                    success = 1;
+                    if(currQual.getName() != ""){
+                        currQual.trimQScores(-1, pDataArray->keepFirst);
+                    }
+                    currSeq.trim(pDataArray->keepFirst);
+                               }
+                               
+                               if(pDataArray->removeLast != 0){
+                                       //success = removeLastTrim(currSeq, currQual);
+                    success = 0;
+                    int length = currSeq.getNumBases() - pDataArray->removeLast;
+                    
+                    if(length > 0){
+                        if(currQual.getName() != ""){
+                            currQual.trimQScores(-1, length);
+                        }
+                        currSeq.trim(length);
+                        success = 1;
+                    }
+                    else{ success = 0; }
+                    
+                                       if(!success)                            {       trashCode += 'l';       }
+                               }
+                
+                               
+                               if(pDataArray->qFileName != ""){
+                                       int origLength = currSeq.getNumBases();
+                                       
+                                       if(pDataArray->qThreshold != 0)                 {       success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold);                 }
+                                       else if(pDataArray->qAverage != 0)              {       success = currQual.cullQualAverage(currSeq, pDataArray->qAverage);                              }
+                                       else if(pDataArray->qRollAverage != 0)  {       success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage);  }
+                                       else if(pDataArray->qWindowAverage != 0){       success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage);       }
+                                       else                                            {       success = 1;                            }
+                                       
+                                       //you don't want to trim, if it fails above then scrap it
+                                       if ((!pDataArray->qtrim) && (origLength != currSeq.getNumBases())) {  success = 0; }
+                                       
+                                       if(!success)                            {       trashCode += 'q';       }
+                               }                               
+                
+                               if(pDataArray->minLength > 0 || pDataArray->maxLength > 0){
+                                       //success = cullLength(currSeq);
+                    int length = currSeq.getNumBases();
+                    success = 0;       //guilty until proven innocent
+                    if(length >= pDataArray->minLength && pDataArray->maxLength == 0)                  {       success = 1;    }
+                    else if(length >= pDataArray->minLength && length <= pDataArray->maxLength)        {       success = 1;    }
+                    else                                                                                               {       success = 0;    }
+                    
+                                       if(!success)                            {       trashCode += 'l';       }
+                               }
+                               if(pDataArray->maxHomoP > 0){
+                                       //success = cullHomoP(currSeq);
+                    int longHomoP = currSeq.getLongHomoPolymer();
+                    success = 0;       //guilty until proven innocent
+                    if(longHomoP <= pDataArray->maxHomoP){     success = 1;    }
+                    else                                       {       success = 0;    }
+                    
+                                       if(!success)                            {       trashCode += 'h';       }
+                               }
+                               if(pDataArray->maxAmbig != -1){
+                                       //success = cullAmbigs(currSeq);
+                    int numNs = currSeq.getAmbigBases();
+                    success = 0;       //guilty until proven innocent
+                    if(numNs <= pDataArray->maxAmbig)  {       success = 1;    }
+                    else                                       {       success = 0;    }
+                                       if(!success)                            {       trashCode += 'n';       }
+                               }
+                               
+                               if(pDataArray->flip){           // should go last                       
+                                       currSeq.reverseComplement();
+                                       if(pDataArray->qFileName != ""){
+                                               currQual.flipQScores(); 
+                                       }
+                               }
+                               
+                               if(trashCode.length() == 0){
+                                       currSeq.setAligned(currSeq.getUnaligned());
+                                       currSeq.printSequence(trimFASTAFile);
+                                       
+                                       if(pDataArray->qFileName != ""){
+                                               currQual.printQScores(trimQualFile);
+                                       }
+                                       
+                                       if(pDataArray->nameFile != ""){
+                                               map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+                                               if (itName != pDataArray->nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
+                                               else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+                                       }
+                                       
+                                       if (pDataArray->createGroup) {
+                                               if(pDataArray->barcodes.size() != 0){
+                                                       string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
+                                                       if (pDataArray->primers.size() != 0) { 
+                                                               if (pDataArray->primerNameVector[primerIndex] != "") { 
+                                                                       if(thisGroup != "") {
+                                                                               thisGroup += "." + pDataArray->primerNameVector[primerIndex]; 
+                                                                       }else {
+                                                                               thisGroup = pDataArray->primerNameVector[primerIndex]; 
+                                                                       }
+                                                               } 
+                                                       }
+                                                       
+                                                       outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
+                                                       
+                                                       if (pDataArray->nameFile != "") {
+                                                               map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+                                                               if (itName != pDataArray->nameMap.end()) { 
+                                                                       vector<string> thisSeqsNames; 
+                                                                       pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
+                                                                       for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+                                                                               outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
+                                                                       }
+                                                               }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }                                                   
+                                                       }
+                                                       
+                                                       map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
+                                                       if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[thisGroup] = 1; }
+                                                       else { pDataArray->groupCounts[it->first]++; }
+                            
+                                               }
+                                       }
+                                       
+                                       if(pDataArray->allFiles){
+                                               ofstream output;
+                                               pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
+                                               currSeq.printSequence(output);
+                                               output.close();
+                                               
+                                               if(pDataArray->qFileName != ""){
+                                                       pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
+                                                       currQual.printQScores(output);
+                                                       output.close();                                                 
+                                               }
+                                               
+                                               if(pDataArray->nameFile != ""){
+                                                       map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+                                                       if (itName != pDataArray->nameMap.end()) { 
+                                                               pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
+                                                               output << itName->first << '\t' << itName->second << endl; 
+                                                               output.close();
+                                                       }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+                                               }
+                                       }
+                               }
+                               else{
+                                       if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
+                                               map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+                                               if (itName != pDataArray->nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
+                                               else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+                                       }
+                                       currSeq.setName(currSeq.getName() + '|' + trashCode);
+                                       currSeq.setUnaligned(origSeq);
+                                       currSeq.setAligned(origSeq);
+                                       currSeq.printSequence(scrapFASTAFile);
+                                       if(pDataArray->qFileName != ""){
+                                               currQual.printQScores(scrapQualFile);
+                                       }
+                               }
+                               
+                       }
+                       
+                       //report progress
+                       if((i) % 1000 == 0){    pDataArray->m->mothurOut(toString(i)); pDataArray->m->mothurOutEndLine();               }
+                       
+               }
+               //report progress
+               if((pDataArray->count) % 1000 != 0){    pDataArray->m->mothurOut(toString(pDataArray->count)); pDataArray->m->mothurOutEndLine();               }
+               
+               
+               inFASTA.close();
+               trimFASTAFile.close();
+               scrapFASTAFile.close();
+               if (pDataArray->createGroup) {   outGroupsFile.close();   }
+               if(pDataArray->qFileName != "") {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
+               if(pDataArray->nameFile != "")  {       scrapNameFile.close(); trimNameFile.close();    }
+               
+        return 0;
+            
+        }
+        catch(exception& e) {
+            pDataArray->m->errorOut(e, "TrimSeqsCommand", "MyTrimThreadFunction");
+            exit(1);
+        }
+    } 
+#endif
+    
+
+/**************************************************************************************************/
 
 #endif