]> git.donarmstrong.com Git - mothur.git/blobdiff - chimerauchimecommand.h
added modify names parameter to set.dir
[mothur.git] / chimerauchimecommand.h
index b7da889701f3ac2bc7883cf5c39356550eb2e25d..fadc4a4d401dcf5da93d0d32c56ed9ec8a7ef99a 100644 (file)
@@ -14,6 +14,8 @@
 #include "mothur.h"
 #include "command.hpp"
 #include "sequenceparser.h"
+#include "counttable.h"
+#include "sequencecountparser.h"
 
 /***********************************************************/
 
@@ -26,8 +28,10 @@ public:
        vector<string> setParameters();
        string getCommandName()                 { return "chimera.uchime";              }
        string getCommandCategory()             { return "Sequence Processing"; }
+       
        string getHelpString(); 
-       string getCitation() { return "uchime by Robert C. Edgar\nhttp://drive5.com/uchime\nThis code is donated to the public domain.\nhttp://www.mothur.org/wiki/Chimera.uchime\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection, Bioinformatics, in press.\n"; }
+    string getOutputPattern(string);   
+       string getCitation() { return "uchime by Robert C. Edgar\nhttp://drive5.com/uchime\nThis code was donated to the public domain.\nEdgar,R.C., Haas,B.J., Clemente,J.C., Quince,C. and Knight,R. (2011), UCHIME improves sensitivity and speed of chimera detection.  Bioinformatics 27:2194.\nhttp://www.mothur.org/wiki/Chimera.uchime\n"; }
        string getDescription()         { return "detect chimeric sequences"; }
        
        int execute(); 
@@ -44,11 +48,12 @@ private:
        int driver(string, string, string, string, int&);
        int createProcesses(string, string, string, string, int&);
                
-       bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract;
-       string fastafile, groupfile, templatefile, outputDir, namefile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract;
+       bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount, hasName, dups;
+       string fastafile, groupfile, templatefile, outputDir, namefile, countfile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation, strand;
        int processors;
        
-       
+       SequenceParser* sparser;
+    SequenceCountParser* cparser;
        vector<string> outputNames;
        vector<string> fastaFileNames;
        vector<string> nameFileNames;
@@ -57,9 +62,10 @@ private:
        string getNamesFile(string&);
        int readFasta(string, map<string, string>&);
        int printFile(vector<seqPriorityNode>&, string);
-       int deconvoluteResults(SequenceParser&, string, string, string);
-       int driverGroups(SequenceParser&, string, string, string, string, int, int, vector<string>);
-       int createProcessesGroups(SequenceParser&, string, string, string, string, vector<string>, string, string, string);
+       int deconvoluteResults(map<string, string>&, string, string, string);
+       int driverGroups(string, string, string, string, string, int, int, vector<string>);
+       int createProcessesGroups(string, string, string, string, string, vector<string>, string, string, string);
+    int prepFile(string filename, string);
 
 
 };
@@ -74,17 +80,17 @@ struct uchimeData {
        string namefile; 
        string groupfile;
        string outputFName;
-       string accnos, alns, filename, templatefile;
+       string accnos, alns, filename, templatefile, uchimeLocation, countlist;
        MothurOut* m;
        int start;
        int end;
        int threadID, count, numChimeras;
        vector<string> groups;
-       bool useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract;
-       string abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract;
+       bool dups, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount;
+       string abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, strand;
        
        uchimeData(){}
-       uchimeData(string o, string t, string file, string f, string n, string g, string ac,  string al, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
+       uchimeData(string o, string uloc, string t, string file, string f, string n, string g, string ac,  string al, string nc, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
                fastafile = f;
                namefile = n;
                groupfile = g;
@@ -100,8 +106,10 @@ struct uchimeData {
                groups = gr;
                count = 0;
                numChimeras = 0;
+        uchimeLocation = uloc;
+        countlist = nc;
        }
-       void setBooleans(bool Abskew, bool calns, bool MinH, bool Mindiv, bool Xn, bool Dn, bool Xa, bool Chunks, bool Minchunk, bool Idsmoothwindow, bool Minsmoothid, bool Maxp, bool skipgap, bool skipgap2, bool Minlen, bool Maxlen, bool uc, bool Queryfract) {
+       void setBooleans(bool dps, bool Abskew, bool calns, bool MinH, bool Mindiv, bool Xn, bool Dn, bool Xa, bool Chunks, bool Minchunk, bool Idsmoothwindow, bool Minsmoothid, bool Maxp, bool skipgap, bool skipgap2, bool Minlen, bool Maxlen, bool uc, bool Queryfract, bool hc) {
                useAbskew = Abskew;
                chimealns = calns;
                useMinH = MinH;
@@ -120,12 +128,15 @@ struct uchimeData {
                useMaxlen = Maxlen;
                ucl = uc;
                useQueryfract = Queryfract;
+        hasCount = hc;
+        dups = dps;
        }
        
-       void setVariables(string abske, string min, string mindi, string x, string d, string xa2, string chunk, string minchun, string idsmoothwindo, string minsmoothi, string max, string minle, string maxle, string queryfrac) {
+       void setVariables(string abske, string min, string mindi, string x, string d, string xa2, string chunk, string minchun, string idsmoothwindo, string minsmoothi, string max, string minle, string maxle, string queryfrac, string stra) {
                abskew = abske;
                minh = min;
                mindiv = mindi;
+        strand = stra;
                xn = x;
                dn = d;
                xa = xa2;
@@ -161,16 +172,34 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                
                //parse fasta and name file by group
                SequenceParser* parser;
-               if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile);      }
-               else                                                    { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
+        SequenceCountParser* cparser;
+               if (pDataArray->hasCount) {
+            CountTable* ct = new CountTable();
+            ct->readTable(pDataArray->namefile, true);
+            cparser = new SequenceCountParser(pDataArray->fastafile, *ct);
+            delete ct;
+        }else {
+            if (pDataArray->namefile != "") { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile, pDataArray->namefile); }
+            else                                                       { parser = new SequenceParser(pDataArray->groupfile, pDataArray->fastafile);                                            }
+        }
                
                int totalSeqs = 0;
                int numChimeras = 0;
+        
+        ofstream outCountList;
+        if (pDataArray->hasCount && pDataArray->dups) { pDataArray->m->openOutputFile(pDataArray->countlist, outCountList); }
+
                
                for (int i = pDataArray->start; i < pDataArray->end; i++) {
-                       int start = time(NULL);  if (pDataArray->m->control_pressed) {  delete parser; return 0; }
+                       int start = time(NULL);  if (pDataArray->m->control_pressed) {  if (pDataArray->hasCount) { delete cparser; } { delete parser; } return 0; }
                        
-                       int error = parser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) {  delete parser; return 0; }
+            
+                       int error;
+            if (pDataArray->hasCount) { 
+                error = cparser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) {  delete cparser; return 0; }
+            }else {
+               error = parser->getSeqs(pDataArray->groups[i], pDataArray->filename, true); if ((error == 1) || pDataArray->m->control_pressed) {  delete parser; return 0; } 
+            }
                        
                        //int numSeqs = driver((outputFName + groups[i]), filename, (accnos+ groups[i]), (alns+ groups[i]), numChimeras);
                        ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -183,18 +212,8 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                        
                        vector<char*> cPara;
                        
-                       string path = pDataArray->m->argv;
-                       string tempPath = path;
-                       for (int j = 0; j < path.length(); j++) { tempPath[j] = tolower(path[j]); }
-                       path = path.substr(0, (tempPath.find_last_of('m')));
-                       
-                       string uchimeCommand = path;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-                       uchimeCommand += "uchime ";
-#else
-                       uchimeCommand += "uchime";
-                       uchimeCommand = "\"" + uchimeCommand + "\"";
-#endif                 
+            string uchimeCommand = pDataArray->uchimeLocation;
+            uchimeCommand = "\"" + uchimeCommand + "\"";
                        
                        char* tempUchime;
                        tempUchime= new char[uchimeCommand.length()+1]; 
@@ -231,6 +250,15 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                                cPara.push_back(tempa);
                        }
                        
+            if (pDataArray->strand != "") {
+                char* tempA = new char[9]; 
+                *tempA = '\0'; strncat(tempA, "--strand", 8);
+                cPara.push_back(tempA);
+                char* tempa = new char[pDataArray->strand.length()+1];
+                *tempa = '\0'; strncat(tempa, pDataArray->strand.c_str(), pDataArray->strand.length());
+                cPara.push_back(tempa);
+            }
+            
                        if (pDataArray->useAbskew) {
                                char* tempskew = new char[9];
                                *tempskew = '\0'; strncat(tempskew, "--abskew", 8);
@@ -403,10 +431,10 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                        
                        //uchime_main(numArgs, uchimeParameters); 
                        //cout << "commandString = " << commandString << endl;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-#else
                        commandString = "\"" + commandString + "\"";
-#endif
+            
+            if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: uchime command = " + commandString + ".\n"); }
+            
                        system(commandString.c_str());
                        
                        //free memory
@@ -418,7 +446,7 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                        filename = filename.substr(1, filename.length()-2);
                        alns = alns.substr(1, alns.length()-2);
                        
-                       if (pDataArray->m->control_pressed) { delete parser; return 0; }
+                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } return 0; }
                        
                        //create accnos file from uchime results
                        ifstream in; 
@@ -426,9 +454,14 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                        
                        ofstream out;
                        pDataArray->m->openOutputFile(accnos, out);
+            
                        
                        int num = 0;
                        numChimeras = 0;
+            map<string, string> thisnamemap;
+            map<string, string>::iterator itN;
+            if (pDataArray->dups && !pDataArray->hasCount) { thisnamemap = parser->getNameMap(pDataArray->groups[i]); }
+                
                        while(!in.eof()) {
                                
                                if (pDataArray->m->control_pressed) { break; }
@@ -444,7 +477,22 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                                for (int j = 0; j < 15; j++) {  in >> chimeraFlag; }
                                pDataArray->m->gobble(in);
                                
-                               if (chimeraFlag == "Y") {  out << name << endl; numChimeras++; }
+                               if (chimeraFlag == "Y") {
+                    if (pDataArray->dups) {
+                        if (!pDataArray->hasCount) { //output redundant names for each group
+                            itN = thisnamemap.find(name);
+                            if (itN != thisnamemap.end()) {
+                                vector<string> tempNames; pDataArray->m->splitAtComma(itN->second, tempNames);
+                                for (int j = 0; j < tempNames.size(); j++) { out << tempNames[j] << endl; }
+                            }else { pDataArray->m->mothurOut("[ERROR]: parsing cannot find " + name + ".\n"); pDataArray->m->control_pressed = true; }
+
+                        }else {
+                            out << name << endl;
+                            outCountList << name << '\t' << pDataArray->groups[i] << endl;
+                        }
+                    }else{  out << name << endl;  }
+                    numChimeras++;
+                }
                                num++;
                        }
                        in.close();
@@ -455,7 +503,7 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                        totalSeqs += num;
                        pDataArray->numChimeras += numChimeras;
                        
-                       if (pDataArray->m->control_pressed) { delete parser; return 0; }
+                       if (pDataArray->m->control_pressed) { if (pDataArray->hasCount) { delete cparser; } { delete parser; } return 0; }
                        
                        //remove file made for uchime
                        pDataArray->m->mothurRemove(filename);
@@ -469,8 +517,9 @@ static DWORD WINAPI MyUchimeThreadFunction(LPVOID lpParam){
                        
                }       
                
+        if (pDataArray->hasCount && pDataArray->dups) { outCountList.close(); }
                pDataArray->count = totalSeqs;
-               delete parser;
+               if (pDataArray->hasCount) { delete cparser; } { delete parser; }
                return totalSeqs;
                
        }
@@ -505,22 +554,41 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){
                
                vector<char*> cPara;
                
-               char* tempUchime;
-               tempUchime= new char[8]; 
-               *tempUchime = '\0';
-               strncat(tempUchime, "uchime ", 7); 
-               cPara.push_back(tempUchime);
-               
-               char* tempIn = new char[8]; 
-               *tempIn = '\0'; strncat(tempIn, "--input", 7);
-               //strcpy(tempIn, "--input"); 
-               cPara.push_back(tempIn);
-               char* temp = new char[filename.length()+1];
-               *temp = '\0'; strncat(temp, filename.c_str(), filename.length());
-               //strcpy(temp, filename.c_str());
-               cPara.push_back(temp);
-       
-               //add reference file
+               string uchimeCommand = pDataArray->uchimeLocation;
+        uchimeCommand = "\"" + uchimeCommand + "\"";
+        
+        char* tempUchime;
+        tempUchime= new char[uchimeCommand.length()+1]; 
+        *tempUchime = '\0';
+        strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length());
+        cPara.push_back(tempUchime);
+               
+        string outputFileName = filename.substr(1, filename.length()-2) + ".uchime_formatted";
+        //prepFile(filename.substr(1, filename.length()-2), outputFileName);
+        //prepFile(filename, outputFileName);
+        /******************************************/
+        ifstream in23;
+        pDataArray->m->openInputFile((filename.substr(1, filename.length()-2)), in23);
+        
+        ofstream out23;
+        pDataArray->m->openOutputFile(outputFileName, out23);
+        
+        int fcount = 0;
+        while (!in23.eof()) {
+            if (pDataArray->m->control_pressed) { break;  }
+            
+            Sequence seq(in23); pDataArray->m->gobble(in23);
+            
+            if (seq.getName() != "") { seq.printSequence(out23); fcount++; }
+        }
+        in23.close();
+        out23.close();
+        /******************************************/
+        
+        filename = outputFileName;
+        filename = "\"" + filename + "\"";
+        
+        //add reference file
                char* tempRef = new char[5]; 
                //strcpy(tempRef, "--db"); 
                *tempRef = '\0'; strncat(tempRef, "--db", 4);
@@ -529,6 +597,15 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){
                //strcpy(tempR, templatefile.c_str());
                *tempR = '\0'; strncat(tempR, templatefile.c_str(), templatefile.length());
                cPara.push_back(tempR);
+        
+               char* tempIn = new char[8]; 
+               *tempIn = '\0'; strncat(tempIn, "--input", 7);
+               //strcpy(tempIn, "--input"); 
+               cPara.push_back(tempIn);
+               char* temp = new char[filename.length()+1];
+               *temp = '\0'; strncat(temp, filename.c_str(), filename.length());
+               //strcpy(temp, filename.c_str());
+               cPara.push_back(temp);
                
                char* tempO = new char[12]; 
                *tempO = '\0'; strncat(tempO, "--uchimeout", 11);
@@ -550,6 +627,15 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){
                        cPara.push_back(tempa);
                }
                
+        if (pDataArray->strand != "") {
+            char* tempA = new char[9]; 
+            *tempA = '\0'; strncat(tempA, "--strand", 8);
+            cPara.push_back(tempA);
+            char* tempa = new char[pDataArray->strand.length()+1];
+            *tempa = '\0'; strncat(tempa, pDataArray->strand.c_str(), pDataArray->strand.length());
+            cPara.push_back(tempa);
+        }
+        
                if (pDataArray->useAbskew) {
                        char* tempskew = new char[9];
                        *tempskew = '\0'; strncat(tempskew, "--abskew", 8);
@@ -720,8 +806,11 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){
                for (int j = 0; j < cPara.size(); j++) {  uchimeParameters[j] = cPara[j];  commandString += toString(cPara[j]) + " "; } 
                //int numArgs = cPara.size();
                
+        commandString = "\"" + commandString + "\"";
+        
                //uchime_main(numArgs, uchimeParameters); 
                //cout << "commandString = " << commandString << endl;
+        if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: uchime command = " + commandString + ".\n"); }
                system(commandString.c_str());
                
                //free memory
@@ -760,12 +849,15 @@ static DWORD WINAPI MyUchimeSeqsThreadFunction(LPVOID lpParam){
                in.close();
                out.close();
                
+        if (fcount != totalSeqs) { pDataArray->m->mothurOut("[ERROR]: process " + toString(pDataArray->threadID) + " only processed " + toString(pDataArray->count) + " of " + toString(pDataArray->end) + " sequences assigned to it, quitting. \n"); pDataArray->m->control_pressed = true; }
+        
                if (pDataArray->m->control_pressed) { return 0; }
                
                pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences.");       pDataArray->m->mothurOutEndLine();                                      
        
                pDataArray->count = totalSeqs;
                pDataArray->numChimeras = numChimeras;
+        
                return totalSeqs;
                
        }