]> git.donarmstrong.com Git - mothur.git/blobdiff - trimseqscommand.h
fixed bug in pre.cluster with output file name change and other bugs while testing...
[mothur.git] / trimseqscommand.h
index 9ba64c4b62b7c7e87f213472b730a91bd0b37305..891b14dc463342e9403aab317fc0877fdd78e82c 100644 (file)
@@ -14,8 +14,8 @@
 #include "command.hpp"
 #include "sequence.hpp"
 #include "qualityscores.h"
 #include "command.hpp"
 #include "sequence.hpp"
 #include "qualityscores.h"
-#include "groupmap.h"
 #include "trimoligos.h"
 #include "trimoligos.h"
+#include "counttable.h"
 
 
 class TrimSeqsCommand : public Command {
 
 
 class TrimSeqsCommand : public Command {
@@ -27,7 +27,9 @@ public:
        vector<string> setParameters();
        string getCommandName()                 { return "trim.seqs";   }
        string getCommandCategory()             { return "Sequence Processing";         }
        vector<string> setParameters();
        string getCommandName()                 { return "trim.seqs";   }
        string getCommandCategory()             { return "Sequence Processing";         }
+       
        string getHelpString(); 
        string getHelpString(); 
+    string getOutputPattern(string);   
        string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
        string getDescription()         { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
 
        string getCitation() { return "http://www.mothur.org/wiki/Trim.seqs"; }
        string getDescription()         { return "provides the preprocessing features needed to screen and sort pyrosequences"; }
 
@@ -35,16 +37,13 @@ public:
        void help() { m->mothurOut(getHelpString()); }  
        
 private:
        void help() { m->mothurOut(getHelpString()); }  
        
 private:
-       
-       GroupMap* groupMap;
-    
     struct linePair {
         unsigned long long start;
         unsigned long long end;
         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         linePair() {}
     };
     struct linePair {
         unsigned long long start;
         unsigned long long end;
         linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {}
         linePair() {}
     };
-
+    
        bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
        bool keepFirstTrim(Sequence&, QualityScores&);
        bool removeLastTrim(Sequence&, QualityScores&);
        bool getOligos(vector<vector<string> >&, vector<vector<string> >&, vector<vector<string> >&);
        bool keepFirstTrim(Sequence&, QualityScores&);
        bool removeLastTrim(Sequence&, QualityScores&);
@@ -54,7 +53,7 @@ private:
     string reverseOligo(string);
 
        bool abort, createGroup;
     string reverseOligo(string);
 
        bool abort, createGroup;
-       string fastaFile, oligoFile, qFileName, groupfile, nameFile, outputDir;
+       string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir;
        
        bool flip, allFiles, qtrim, keepforward;
        int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
        
        bool flip, allFiles, qtrim, keepforward;
        int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts;
@@ -73,13 +72,15 @@ private:
        vector<string> barcodeNameVector;       //needed here?
        map<string, int> groupCounts;  
        map<string, string> nameMap;
        vector<string> barcodeNameVector;       //needed here?
        map<string, int> groupCounts;  
        map<string, string> nameMap;
+    map<string, int> nameCount; //for countfile name -> repCount
+    map<string, string> groupMap; //for countfile name -> group
 
        vector<int> processIDS;   //processid
        vector<linePair> lines;
        vector<linePair> qLines;
        
 
        vector<int> processIDS;   //processid
        vector<linePair> lines;
        vector<linePair> qLines;
        
-       int driverCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
-       int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
+       int driverCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >, linePair, linePair);    
+       int createProcessesCreateTrim(string, string, string, string, string, string, string, string, string, string, string, vector<vector<string> >, vector<vector<string> >, vector<vector<string> >);
        int setLines(string, string);
 };
 
        int setLines(string, string);
 };
 
@@ -90,7 +91,7 @@ private:
 struct trimData {
     unsigned long long start, end;
     MothurOut* m;
 struct trimData {
     unsigned long long start, end;
     MothurOut* m;
-    string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, groupFileName, nameFile;
+    string filename, qFileName, trimFileName, scrapFileName, trimQFileName, scrapQFileName, trimNFileName, scrapNFileName, trimCFileName, scrapCFileName, groupFileName, nameFile, countfile;
        vector<vector<string> > fastaFileNames;
     vector<vector<string> > qualFileNames;
     vector<vector<string> > nameFileNames;
        vector<vector<string> > fastaFileNames;
     vector<vector<string> > qualFileNames;
     vector<vector<string> > nameFileNames;
@@ -102,6 +103,7 @@ struct trimData {
     vector<string> revPrimer;
        map<string, int> barcodes;
        map<string, int> primers;
     vector<string> revPrimer;
        map<string, int> barcodes;
        map<string, int> primers;
+    map<string, int> nameCount;
     vector<string>  linker;
     vector<string>  spacer;
        map<string, int> combos;
     vector<string>  linker;
     vector<string>  spacer;
        map<string, int> combos;
@@ -109,22 +111,26 @@ struct trimData {
        vector<string> barcodeNameVector;       
        map<string, int> groupCounts;  
        map<string, string> nameMap;
        vector<string> barcodeNameVector;       
        map<string, int> groupCounts;  
        map<string, string> nameMap;
+    map<string, string> groupMap;
     
        trimData(){}
     
        trimData(){}
-       trimData(string fn, string qn, string nf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
+       trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector<vector<string> > ffn, vector<vector<string> > qfn, vector<vector<string> > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend,  MothurOut* mout,
                       int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, 
                       vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
                       int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
                       int pd, int bd, int ld, int sd, int td, map<string, int> pri, map<string, int> bar, vector<string> revP, vector<string> li, vector<string> spa, 
                       vector<string> priNameVector, vector<string> barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL,
                       int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage,
-                      int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm) {
+                      int minL, int maxA, int maxH, int maxL, bool fli, map<string, string> nm, map<string, int> ncount) {
         filename = fn;
         qFileName = qn;
         nameFile = nf;
         filename = fn;
         qFileName = qn;
         nameFile = nf;
+        countfile = cf;
         trimFileName = tn;
         scrapFileName = sn;
         trimQFileName = tqn;
         scrapQFileName = sqn;
         trimNFileName = tnn;
         scrapNFileName = snn;
         trimFileName = tn;
         scrapFileName = sn;
         trimQFileName = tqn;
         scrapQFileName = sqn;
         trimNFileName = tnn;
         scrapNFileName = snn;
+        trimCFileName = tcn;
+        scrapCFileName = scn;
         groupFileName = gn;
         fastaFileNames = ffn;
         qualFileNames = qfn;
         groupFileName = gn;
         fastaFileNames = ffn;
         qualFileNames = qfn;
@@ -134,6 +140,7 @@ struct trimData {
         qlineStart = qstart;
         qlineEnd = qend;
                m = mout;
         qlineStart = qstart;
         qlineEnd = qend;
                m = mout;
+        nameCount = ncount;
         
         pdiffs = pd;
         bdiffs = bd;
         
         pdiffs = pd;
         bdiffs = bd;
@@ -199,7 +206,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                
                
                ofstream outGroupsFile;
                
                
                ofstream outGroupsFile;
-               if (pDataArray->createGroup){   pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
+               if ((pDataArray->createGroup) && (pDataArray->countfile == "")){        pDataArray->m->openOutputFile(pDataArray->groupFileName, outGroupsFile);   }
                if(pDataArray->allFiles){
                        for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
                                for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
                if(pDataArray->allFiles){
                        for (int i = 0; i < pDataArray->fastaFileNames.size(); i++) { //clears old file
                                for (int j = 0; j < pDataArray->fastaFileNames[i].size(); j++) { //clears old file
@@ -218,6 +225,14 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                        }
                }
                
                        }
                }
                
+        ofstream trimCountFile;
+               ofstream scrapCountFile;
+               if(pDataArray->countfile != ""){
+                       pDataArray->m->openOutputFile(pDataArray->trimCFileName, trimCountFile);
+                       pDataArray->m->openOutputFile(pDataArray->scrapCFileName, scrapCountFile);
+            if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) { trimCountFile << "Representative_Sequence\ttotal" << endl; scrapCountFile << "Representative_Sequence\ttotal" << endl; }
+               }
+        
                ifstream inFASTA;
                pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
                if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
                ifstream inFASTA;
                pDataArray->m->openInputFile(pDataArray->filename, inFASTA);
                if ((pDataArray->lineStart == 0) || (pDataArray->lineStart == 1)) {
@@ -244,7 +259,11 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                   
                        if (pDataArray->m->control_pressed) { 
                                inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
                                   
                        if (pDataArray->m->control_pressed) { 
                                inFASTA.close(); trimFASTAFile.close(); scrapFASTAFile.close();
-                               if (pDataArray->createGroup) {   outGroupsFile.close();   }
+                               if ((pDataArray->createGroup) && (pDataArray->countfile == "")) {        outGroupsFile.close();   }
+                if(pDataArray->qFileName != "")        {       qFile.close();  scrapQualFile.close(); trimQualFile.close();    }
+                if(pDataArray->nameFile != "") {       scrapNameFile.close(); trimNameFile.close();    }
+                if(pDataArray->countfile != "")        {       scrapCountFile.close(); trimCountFile.close();  }
+
                                if(pDataArray->qFileName != ""){ qFile.close(); }
                                return 0;
                        }
                                if(pDataArray->qFileName != ""){ qFile.close(); }
                                return 0;
                        }
@@ -277,7 +296,7 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                        if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
                                        else{ currentSeqsDiffs += success;  }
                                }
                                        if(success > pDataArray->bdiffs)                {       trashCode += 'b';       }
                                        else{ currentSeqsDiffs += success;  }
                                }
-                               
+                
                 if(pDataArray->numSpacers != 0){
                                        success = trimOligos.stripSpacer(currSeq, currQual);
                                        if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
                 if(pDataArray->numSpacers != 0){
                                        success = trimOligos.stripSpacer(currSeq, currQual);
                                        if(success > pDataArray->sdiffs)                {       trashCode += 's';       }
@@ -376,20 +395,8 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                }
                                
                                if(trashCode.length() == 0){
                                }
                                
                                if(trashCode.length() == 0){
-                                       currSeq.setAligned(currSeq.getUnaligned());
-                                       currSeq.printSequence(trimFASTAFile);
-                                       
-                                       if(pDataArray->qFileName != ""){
-                                               currQual.printQScores(trimQualFile);
-                                       }
-                                       
-                                       if(pDataArray->nameFile != ""){
-                                               map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
-                                               if (itName != pDataArray->nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
-                                               else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
-                                       }
-                                       
-                                       if (pDataArray->createGroup) {
+                    string thisGroup = "";
+                    if (pDataArray->createGroup) {
                                                if(pDataArray->barcodes.size() != 0){
                                                        string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
                                                        if (pDataArray->primers.size() != 0) { 
                                                if(pDataArray->barcodes.size() != 0){
                                                        string thisGroup = pDataArray->barcodeNameVector[barcodeIndex];
                                                        if (pDataArray->primers.size() != 0) { 
@@ -401,48 +408,81 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                                                        }
                                                                } 
                                                        }
                                                                        }
                                                                } 
                                                        }
-                                                       
-                                                       outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
-                                                       
-                                                       if (pDataArray->nameFile != "") {
-                                                               map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
-                                                               if (itName != pDataArray->nameMap.end()) { 
-                                                                       vector<string> thisSeqsNames; 
-                                                                       pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
-                                                                       for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
-                                                                               outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
-                                                                       }
-                                                               }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }                                                   
-                                                       }
-                                                       
-                                                       map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
-                                                       if (it == pDataArray->groupCounts.end()) {      pDataArray->groupCounts[thisGroup] = 1; }
-                                                       else { pDataArray->groupCounts[it->first]++; }
+                        }
+                    }
+                    
+                    int pos = thisGroup.find("ignore");
+                    if (pos == string::npos) {
+                        
+                        currSeq.setAligned(currSeq.getUnaligned());
+                        currSeq.printSequence(trimFASTAFile);
+                        
+                        if(pDataArray->qFileName != ""){
+                            currQual.printQScores(trimQualFile);
+                        }
+                        
+                        if(pDataArray->nameFile != ""){
+                            map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+                            if (itName != pDataArray->nameMap.end()) {  trimNameFile << itName->first << '\t' << itName->second << endl; }
+                            else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+                        }
+                        
+                        int numRedundants = 0;
+                        if (pDataArray->countfile != "") {
+                            map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
+                            if (itCount != pDataArray->nameCount.end()) { 
+                                trimCountFile << itCount->first << '\t' << itCount->second << endl;
+                                numRedundants = itCount->second-1;
+                            }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+                        }
+                        
+                        if (pDataArray->createGroup) {
+                            if(pDataArray->barcodes.size() != 0){
+                                
+                                if (pDataArray->countfile == "") { outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; }
+                                else {   pDataArray->groupMap[currSeq.getName()] = thisGroup; }
+                                
+                                if (pDataArray->nameFile != "") {
+                                    map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+                                    if (itName != pDataArray->nameMap.end()) { 
+                                        vector<string> thisSeqsNames; 
+                                        pDataArray->m->splitAtChar(itName->second, thisSeqsNames, ',');
+                                        numRedundants = thisSeqsNames.size()-1; //we already include ourselves below
+                                        for (int k = 1; k < thisSeqsNames.size(); k++) { //start at 1 to skip self
+                                            outGroupsFile << thisSeqsNames[k] << '\t' << thisGroup << endl;
+                                        }
+                                    }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }                                                      
+                                }
+                                
+                                map<string, int>::iterator it = pDataArray->groupCounts.find(thisGroup);
+                                if (it == pDataArray->groupCounts.end()) {     pDataArray->groupCounts[thisGroup] = 1 + numRedundants; }
+                                else { pDataArray->groupCounts[it->first] += (1 + numRedundants); }
+                                
+                            }
+                        }
+                        
+                        if(pDataArray->allFiles){
+                            ofstream output;
+                            pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
+                            currSeq.printSequence(output);
+                            output.close();
                             
                             
-                                               }
-                                       }
-                                       
-                                       if(pDataArray->allFiles){
-                                               ofstream output;
-                                               pDataArray->m->openOutputFileAppend(pDataArray->fastaFileNames[barcodeIndex][primerIndex], output);
-                                               currSeq.printSequence(output);
-                                               output.close();
-                                               
-                                               if(pDataArray->qFileName != ""){
-                                                       pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
-                                                       currQual.printQScores(output);
-                                                       output.close();                                                 
-                                               }
-                                               
-                                               if(pDataArray->nameFile != ""){
-                                                       map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
-                                                       if (itName != pDataArray->nameMap.end()) { 
-                                                               pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
-                                                               output << itName->first << '\t' << itName->second << endl; 
-                                                               output.close();
-                                                       }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
-                                               }
-                                       }
+                            if(pDataArray->qFileName != ""){
+                                pDataArray->m->openOutputFileAppend(pDataArray->qualFileNames[barcodeIndex][primerIndex], output);
+                                currQual.printQScores(output);
+                                output.close();                                                        
+                            }
+                            
+                            if(pDataArray->nameFile != ""){
+                                map<string, string>::iterator itName = pDataArray->nameMap.find(currSeq.getName());
+                                if (itName != pDataArray->nameMap.end()) { 
+                                    pDataArray->m->openOutputFileAppend(pDataArray->nameFileNames[barcodeIndex][primerIndex], output);
+                                    output << itName->first << '\t' << itName->second << endl; 
+                                    output.close();
+                                }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
+                            }
+                        }
+                    }
                                }
                                else{
                                        if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
                                }
                                else{
                                        if(pDataArray->nameFile != ""){ //needs to be before the currSeq name is changed
@@ -450,6 +490,12 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){
                                                if (itName != pDataArray->nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
                                                else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
                                        }
                                                if (itName != pDataArray->nameMap.end()) {  scrapNameFile << itName->first << '\t' << itName->second << endl; }
                                                else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your namefile, please correct."); pDataArray->m->mothurOutEndLine(); }
                                        }
+                    if (pDataArray->countfile != "") {
+                        map<string, int>::iterator itCount = pDataArray->nameCount.find(currSeq.getName());
+                        if (itCount != pDataArray->nameCount.end()) { 
+                            trimCountFile << itCount->first << '\t' << itCount->second << endl;
+                        }else { pDataArray->m->mothurOut("[ERROR]: " + currSeq.getName() + " is not in your count file, please correct."); pDataArray->m->mothurOutEndLine(); }
+                    }
                                        currSeq.setName(currSeq.getName() + '|' + trashCode);
                                        currSeq.setUnaligned(origSeq);
                                        currSeq.setAligned(origSeq);
                                        currSeq.setName(currSeq.getName() + '|' + trashCode);
                                        currSeq.setUnaligned(origSeq);
                                        currSeq.setAligned(origSeq);