changed random forest's inheritance

[mothur.git] / chimerauchimecommand.cpp
diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp

index bad4c96b8c74d645208be1158ad11cd2ef924faf..6f7ba106ead04e9ccd3b7b290dce4c3a55dabd20 100644 (file)
--- a/chimerauchimecommand.cpp
+++ b/chimerauchimecommand.cpp
@@ -35,6 +35,8 @@ vector<string> ChimeraUchimeCommand::setParameters(){
                 CommandParameter pchunks("chunks", "Number", "", "4", "", "", "",false,false); parameters.push_back(pchunks);
                 CommandParameter pminchunk("minchunk", "Number", "", "64", "", "", "",false,false); parameters.push_back(pminchunk);
                 CommandParameter pidsmoothwindow("idsmoothwindow", "Number", "", "32", "", "", "",false,false); parameters.push_back(pidsmoothwindow);
+        CommandParameter pdups("dereplicate", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pdups);
+
                 //CommandParameter pminsmoothid("minsmoothid", "Number", "", "0.95", "", "", "",false,false); parameters.push_back(pminsmoothid);
                 CommandParameter pmaxp("maxp", "Number", "", "2", "", "", "",false,false); parameters.push_back(pmaxp);
                 CommandParameter pskipgaps("skipgaps", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pskipgaps);
@@ -59,12 +61,13 @@ string ChimeraUchimeCommand::getHelpString(){
                 string helpString = "";
                 helpString += "The chimera.uchime command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
                 helpString += "This command is a wrapper for uchime written by Robert C. Edgar.\n";
-               helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n";
+               helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dereplicate, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n";
                 helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
                 helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
          helpString += "The count parameter allows you to provide a count file, if you are using template=self. \n";
                 helpString += "You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amazon.fasta \n";
                 helpString += "The group parameter allows you to provide a group file. The group file can be used with a namesfile and reference=self. When checking sequences, only sequences from the same group as the query sequence will be used as the reference. \n";
+        helpString += "If the dereplicate parameter is false, then if one group finds the seqeunce to be chimeric, then all groups find it to be chimeric, default=f.\n";
                 helpString += "The reference parameter allows you to enter a reference file containing known non-chimeric sequences, and is required. You may also set template=self, in this case the abundant sequences will be used as potential parents. \n";
                 helpString += "The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n";
                 helpString += "The abskew parameter can only be used with template=self. Minimum abundance skew. Default 1.9. Abundance skew is: min [ abund(parent1), abund(parent2) ] / abund(query).\n";
@@ -557,6 +560,15 @@ ChimeraUchimeCommand::ChimeraUchimeCommand(string option)  {
  
                         temp = validParameter.validFile(parameters, "skipgaps2", false);                                if (temp == "not found") { temp = "t"; }
                         skipgaps2 = m->isTrue(temp); 
+            
+            string usedDups = "false";
+                       temp = validParameter.validFile(parameters, "dereplicate", false);      
+                       if (temp == "not found") { 
+                               if (groupfile != "")    {  temp = "false";                                      }
+                               else                    {  temp = "true"; usedDups = "";        }
+                       }
+                       dups = m->isTrue(temp);
+
                         
                         if (hasName && (templatefile != "self")) { m->mothurOut("You have provided a namefile and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
                         if (hasGroup && (templatefile != "self")) { m->mothurOut("You have provided a group file and the reference parameter is not set to self. I am not sure what reference you are trying to use, aborting."); m->mothurOutEndLine(); abort=true; }
@@ -713,12 +725,14 @@ int ChimeraUchimeCommand::execute(){
                                 if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
                  if (hasCount) { delete cparser; }
                  else { delete sparser; }
-
-                               int totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, alnsFileName);
-                               
-                               m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found.");  m->mothurOutEndLine();
-                               m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
+                
+                if (!dups) { 
+                    int totalChimeras = deconvoluteResults(uniqueNames, outputFileName, accnosFileName, alnsFileName);
                                 
+                    m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to check " + toString(totalSeqs) + " sequences. " + toString(totalChimeras) + " chimeras were found.");     m->mothurOutEndLine();
+                    m->mothurOut("The number of sequences checked may be larger than the number of unique sequences because some sequences are found in several samples."); m->mothurOutEndLine(); 
+                               }
+                
                                 if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       m->mothurRemove(outputNames[j]);        }  return 0;    }                               
                                         
                         }else{
@@ -800,7 +814,7 @@ int ChimeraUchimeCommand::deconvoluteResults(map<string, string>& uniqueNames, s
                         //find unique name
                         itUnique = uniqueNames.find(name);
                         
-                       if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find "+ name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
+                       if (itUnique == uniqueNames.end()) { m->mothurOut("[ERROR]: trouble parsing accnos results. Cannot find " + name + "."); m->mothurOutEndLine(); m->control_pressed = true; }
                         else {
                                 itChimeras = chimerasInFile.find((itUnique->second));
                                 
@@ -1168,29 +1182,20 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                 vector<char*> cPara;
                 
                 string uchimeCommand = uchimeLocation;
-#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
-               uchimeCommand += " ";
-#else
-               uchimeCommand = "\"" + uchimeCommand + "\"";
-#endif
-               
-               char* tempUchime;
+        uchimeCommand = "\"" + uchimeCommand + "\" ";
+        
+        char* tempUchime;
                 tempUchime= new char[uchimeCommand.length()+1]; 
                 *tempUchime = '\0';
                 strncat(tempUchime, uchimeCommand.c_str(), uchimeCommand.length());
                 cPara.push_back(tempUchime);
                 
-               char* tempIn = new char[8]; 
-               *tempIn = '\0'; strncat(tempIn, "--input", 7);
-               //strcpy(tempIn, "--input"); 
-               cPara.push_back(tempIn);
-               char* temp = new char[filename.length()+1];
-               *temp = '\0'; strncat(temp, filename.c_str(), filename.length());
-               //strcpy(temp, filename.c_str());
-               cPara.push_back(temp);
-               
-               //are you using a reference file
+        //are you using a reference file
                 if (templatefile != "self") {
+            string outputFileName = filename.substr(1, filename.length()-2) + ".uchime_formatted";
+            prepFile(filename.substr(1, filename.length()-2), outputFileName);
+            filename = outputFileName;
+            filename = "\"" + filename + "\"";
                         //add reference file
                         char* tempRef = new char[5]; 
                         //strcpy(tempRef, "--db"); 
@@ -1202,6 +1207,15 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                         cPara.push_back(tempR);
                 }
                 
+               char* tempIn = new char[8]; 
+               *tempIn = '\0'; strncat(tempIn, "--input", 7);
+               //strcpy(tempIn, "--input"); 
+               cPara.push_back(tempIn);
+               char* temp = new char[filename.length()+1];
+               *temp = '\0'; strncat(temp, filename.c_str(), filename.length());
+               //strcpy(temp, filename.c_str());
+               cPara.push_back(temp);
+               
                 char* tempO = new char[12]; 
                 *tempO = '\0'; strncat(tempO, "--uchimeout", 11);
                 //strcpy(tempO, "--uchimeout"); 
@@ -1438,15 +1452,21 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                         
                         string name = "";
                         string chimeraFlag = "";
-                       in >> chimeraFlag >> name;
-                       
-                       //fix name if needed
-                       if (templatefile == "self") { 
-                               name = name.substr(0, name.length()-1); //rip off last /
-                               name = name.substr(0, name.find_last_of('/'));
+                       //in >> chimeraFlag >> name;
+                       
+            string line = m->getline(in);
+            vector<string> pieces = m->splitWhiteSpace(line);
+            if (pieces.size() > 2) { 
+                name = pieces[1];
+                //fix name if needed
+                if (templatefile == "self") { 
+                    name = name.substr(0, name.length()-1); //rip off last /
+                    name = name.substr(0, name.find_last_of('/'));
+                }
+                
+                chimeraFlag = pieces[pieces.size()-1];
                         }
-                       
-                       for (int i = 0; i < 15; i++) {  in >> chimeraFlag; }
+                       //for (int i = 0; i < 15; i++) {  in >> chimeraFlag; }
                         m->gobble(in);
                         
                         if (chimeraFlag == "Y") {  out << name << endl; numChimeras++; }
@@ -1455,6 +1475,8 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
                 in.close();
                 out.close();
                 
+        //if (templatefile != "self") {  m->mothurRemove(filename); }
+        
                 return num;
         }
         catch(exception& e) {
@@ -1463,6 +1485,34 @@ int ChimeraUchimeCommand::driver(string outputFName, string filename, string acc
         }
  }
  /**************************************************************************************************/
+//uchime can't handle some of the things allowed in mothurs fasta files. This functions "cleans up" the file.
+int ChimeraUchimeCommand::prepFile(string filename, string output) {
+       try {
+        
+        ifstream in;
+        m->openInputFile(filename, in);
+        
+        ofstream out;
+        m->openOutputFile(output, out);
+        
+        while (!in.eof()) {
+            if (m->control_pressed) { break;  }
+            
+            Sequence seq(in); m->gobble(in);
+            
+            if (seq.getName() != "") { seq.printSequence(out); }
+        }
+        in.close();
+        out.close();
+        
+        return 0;
+    }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraUchimeCommand", "prepFile");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
  
  int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename, string accnos, string alns, int& numChimeras) {
         try {
@@ -1583,7 +1633,7 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename
                         string extension = toString(i) + ".temp";
                         
                         uchimeData* tempUchime = new uchimeData(outputFileName+extension, uchimeLocation, templatefile, files[i], "", "", "", accnos+extension, alns+extension, dummy, m, 0, 0,  i);
-                       tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract);
+                       tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount);
                         tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
                         
                         pDataArray.push_back(tempUchime);
@@ -1715,7 +1765,7 @@ int ChimeraUchimeCommand::createProcessesGroups(string outputFName, string filen
                         string extension = toString(i) + ".temp";
                         
                         uchimeData* tempUchime = new uchimeData(outputFName+extension, uchimeLocation, templatefile, filename+extension, fastaFile, nameFile, groupFile, accnos+extension, alns+extension, groups, m, lines[i].start, lines[i].end,  i);
-                       tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract);
+                       tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount);
                         tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
                         
                         pDataArray.push_back(tempUchime);
@@ -1728,7 +1778,7 @@ int ChimeraUchimeCommand::createProcessesGroups(string outputFName, string filen
                 
                 
                 //using the main process as a worker saves time and memory
-               num = driverGroups(parser, outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups);
+               num = driverGroups(outputFName, filename, accnos, alns, lines[0].start, lines[0].end, groups);
                 
                 //Wait until all threads have terminated.
                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);