Added sequence name to error string in fastq.info. Changed np_shannon to npshannon.

[mothur.git] / clustersplitcommand.cpp
diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp

index 6d908c61c94eb0917730479cf76a7f34cc102d15..34caf654124886f9cd23638a149b2b3487ca53e2 100644 (file)
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -24,16 +24,16 @@ vector<string> ClusterSplitCommand::setParameters(){
                 CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
                 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
-               CommandParameter ptaxlevel("taxlevel", "Number", "", "1", "", "", "",false,false); parameters.push_back(ptaxlevel);
+               CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
                 CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
                 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
                 CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
                 CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
-               CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
+               CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
-               CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "furthest", "", "", "",false,false); parameters.push_back(pmethod);
-               CommandParameter phard("hard", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phard);
+               CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
+               CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
                         
@@ -60,12 +60,12 @@ string ClusterSplitCommand::getHelpString(){
                 helpString += "The phylip and column parameter allow you to enter your distance file. \n";
                 helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
                 helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
-               helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n";
+               helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
                 helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
-               helpString += "The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n";
+               helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
                 helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
                 helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
-               helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n";
+               helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
                 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
  #ifdef USE_MPI
                 helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
@@ -105,6 +105,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                 
                 //allow user to run help
                 if(option == "help") { help(); abort = true; calledHelp = true; }
+               else if(option == "citation") { citation(); abort = true; calledHelp = true;}
                 
                 else {
                         vector<string> myArray = setParameters();
@@ -182,25 +183,27 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                         phylipfile = validParameter.validFile(parameters, "phylip", true);
                         if (phylipfile == "not open") { abort = true; }
                         else if (phylipfile == "not found") { phylipfile = ""; }        
-                       else {  distfile = phylipfile;  format = "phylip";      }
+                       else {  distfile = phylipfile;  format = "phylip";      m->setPhylipFile(phylipfile); }
                         
                         columnfile = validParameter.validFile(parameters, "column", true);
                         if (columnfile == "not open") { abort = true; } 
                         else if (columnfile == "not found") { columnfile = ""; }
-                       else {  distfile = columnfile; format = "column";       }
+                       else {  distfile = columnfile; format = "column";       m->setColumnFile(columnfile); }
                         
                         namefile = validParameter.validFile(parameters, "name", true);
                         if (namefile == "not open") { abort = true; }   
-                       else if (namefile == "not found") { namefile = ""; }
+                       else if (namefile == "not found") { namefile = "";  }
+                       else { m->setNameFile(namefile); }
                         
                         fastafile = validParameter.validFile(parameters, "fasta", true);
                         if (fastafile == "not open") { abort = true; }  
                         else if (fastafile == "not found") { fastafile = ""; }
-                       else { distfile = fastafile;  splitmethod = "fasta";  }
+                       else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
                         
                         taxFile = validParameter.validFile(parameters, "taxonomy", true);
-                       if (taxFile == "not open") { abort = true; }    
+                       if (taxFile == "not open") { taxFile = ""; abort = true; }      
                         else if (taxFile == "not found") { taxFile = ""; }
+                       else {  m->setTaxonomyFile(taxFile); }
                         
                         if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
                                 //is there a current file available for either of these?
@@ -261,9 +264,9 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                         if (temp == "not found") { temp = "100"; }
                         //saves precision length for formatting below
                         length = temp.length();
-                       convert(temp, precision); 
+                       m->mothurConvert(temp, precision); 
                         
-                       temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
+                       temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
                         hard = m->isTrue(temp);
                         
                         temp = validParameter.validFile(parameters, "large", false);                    if (temp == "not found") { temp = "F"; }
@@ -271,7 +274,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                         
                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
                         m->setProcessors(temp);
-                       convert(temp, processors);
+                       m->mothurConvert(temp, processors);
                         
                         temp = validParameter.validFile(parameters, "splitmethod", false);      
                         if (splitmethod != "fasta") {
@@ -279,14 +282,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                                 else {  splitmethod = temp; }
                         }
                         
-                       temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
-                       convert(temp, cutoff); 
+                       temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "0.25"; }
+                       m->mothurConvert(temp, cutoff); 
                         cutoff += (5 / (precision * 10.0));  
                         
-                       temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
-                       convert(temp, taxLevelCutoff); 
+                       temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "3"; }
+                       m->mothurConvert(temp, taxLevelCutoff); 
                         
-                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "average"; }
                         
                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
@@ -552,17 +555,23 @@ int ClusterSplitCommand::execute(){
                 MPI_Barrier(MPI_COMM_WORLD);
                 
         #else
-
+               
+               //sanity check
+               if (processors > distName.size()) { processors = distName.size(); }
+               
                 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                                 if(processors == 1){
                                         listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
                                 }else{
+                                       
+                                       //cout << processors << '\t' << distName.size() << endl;
                                         vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
                                         dividedNames.resize(processors);
                                         
                                         //for each file group figure out which process will complete it
                                         //want to divide the load intelligently so the big files are spread between processes
                                         for (int i = 0; i < distName.size(); i++) { 
+                                               //cout << i << endl;
                                                 int processToAssign = (i+1) % processors; 
                                                 if (processToAssign == 0) { processToAssign = processors; }
                                                 
@@ -571,6 +580,7 @@ int ClusterSplitCommand::execute(){
                                         
                                         //now let's reverse the order of every other process, so we balance big files running with little ones
                                         for (int i = 0; i < processors; i++) {
+                                               //cout << i << endl;
                                                 int remainder = ((i+1) % processors);
                                                 if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
                                         }
@@ -593,7 +603,7 @@ int ClusterSplitCommand::execute(){
                                                         listFileNames.push_back(tempName);
                                                 }
                                                 in.close();
-                                               remove((toString(processIDS[i]) + ".temp").c_str());
+                                               m->mothurRemove((toString(processIDS[i]) + ".temp"));
                                                 
                                                 //get labels
                                                 filename = toString(processIDS[i]) + ".temp.labels";
@@ -610,14 +620,14 @@ int ClusterSplitCommand::execute(){
                                                         if (labels.count(tempName) == 0) { labels.insert(tempName); }
                                                 }
                                                 in2.close();
-                                               remove((toString(processIDS[i]) + ".temp.labels").c_str());
+                                               m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
                                         }
                                 }
                 #else
                                 listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
                 #endif
         #endif  
-               if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
+               if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
                 
                 if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
                 
@@ -634,11 +644,11 @@ int ClusterSplitCommand::execute(){
                 ListVector* listSingle;
                 map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
                 
-               if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+               if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
                 
                 mergeLists(listFileNames, labelBins, listSingle);
  
-               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+               if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
                 
                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
                 
@@ -700,7 +710,7 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
                                 listSingle->push_back(secondCol);
                         }
                         in.close();
-                       remove(singleton.c_str());
+                       m->mothurRemove(singleton);
                         
                         numSingleBins = listSingle->getNumBins();
                 }else{  listSingle = NULL; numSingleBins = 0;  }
@@ -726,8 +736,8 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
                 for (int k = 0; k < listNames.size(); k++) {
         
                         if (m->control_pressed) {  
-                               if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
-                               for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
+                               if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
+                               for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
                                 return labelBin;
                         }
                         
@@ -780,7 +790,7 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
                         delete input;
                         
                         outFilled.close();
-                       remove(listNames[k].c_str());
+                       m->mothurRemove(listNames[k]);
                         rename(filledInList.c_str(), listNames[k].c_str());
                 }
                 
@@ -830,7 +840,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                         //get the list info from each file
                         for (int k = 0; k < listNames.size(); k++) {
         
-                               if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
+                               if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
                                 
                                 InputData* input = new InputData(listNames[k], "list");
                                 ListVector* list = input->getListVector(thisLabel);
@@ -862,7 +872,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                 
                 if (listSingle != NULL) { delete listSingle;  }
                 
-               for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
+               for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
                 
                 return 0;
         }
@@ -1044,7 +1054,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                                 if (m->control_pressed) { //clean up
                                         delete matrix; delete list;     delete cluster; delete rabund;
                                         listFile.close();
-                                       for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
+                                       for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
                                         listFileNames.clear(); return listFileNames;
                                 }
                 
@@ -1090,12 +1100,12 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                         listFile.close();
                         
                         if (m->control_pressed) { //clean up
-                               for (int i = 0; i < listFileNames.size(); i++) {        remove(listFileNames[i].c_str());       }
+                               for (int i = 0; i < listFileNames.size(); i++) {        m->mothurRemove(listFileNames[i]);      }
                                 listFileNames.clear(); return listFileNames;
                         }
                         
-                       remove(thisDistFile.c_str());
-                       remove(thisNamefile.c_str());
+                       m->mothurRemove(thisDistFile);
+                       m->mothurRemove(thisNamefile);
                         
                         if (saveCutoff != cutoff) { 
                                 if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
@@ -1134,7 +1144,7 @@ int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> >
                 string thisOutputDir = outputDir;
                 if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
                 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
-               remove(outputFileName.c_str());
+               m->mothurRemove(outputFileName);
                 
                 
                 for (int i = 0; i < distNames.size(); i++) {