fixed cluster.split command

[mothur.git] / clustersplitcommand.cpp
diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp

index a1e6db088d0d8292034d73a9f2121654fb726918..d10a51fde293fa203e03a856ba67c63d5895a55b 100644 (file)
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -27,7 +27,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                 
                 else {
                         //valid paramters for this command
-                       string Array[] =  {"phylip","column","name","cutoff","precision","method","showabund","timing","hard","processors","splitcutoff","outputdir","inputdir"};
+                       string Array[] =  {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","showabund","timing","hard","processors","outputdir","inputdir"};
                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                         
                         OptionParser parser(option);
@@ -76,6 +76,14 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                                         //if the user has not given a path then, add inputdir. else leave path alone.
                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
                                 }
+                               
+                               it = parameters.find("taxonomy");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
+                               }
                         }
                         
                         //check for required parameters
@@ -93,11 +101,15 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                         if (namefile == "not open") { abort = true; }   
                         else if (namefile == "not found") { namefile = ""; }
                         
-                       if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a hcluster command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
-                       else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
+                       taxFile = validParameter.validFile(parameters, "taxonomy", true);
+                       if (taxFile == "not open") { abort = true; }    
+                       else if (taxFile == "not found") { taxFile = ""; }
+                       
+                       if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
+                       else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
                 
                         if (columnfile != "") {
-                               if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
+                               if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
                         }
                                         
                         //check for optional parameter and set defaults
@@ -116,21 +128,24 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
                         convert(temp, processors); 
                         
-                       temp = validParameter.validFile(parameters, "cutoff", false);
-                       if (temp == "not found") { temp = "10"; }
+                       splitmethod = validParameter.validFile(parameters, "splitmethod", false);               if (splitmethod == "not found") { splitmethod = "distance"; }
+                       
+                       temp = validParameter.validFile(parameters, "cutoff", false);           if (temp == "not found")  { temp = "10"; }
                         convert(temp, cutoff); 
-                       if (!hard) {    cutoff += (5 / (precision * 10.0));  }
+                       cutoff += (5 / (precision * 10.0));  
                         
-                       temp = validParameter.validFile(parameters, "splitcutoff", false);
-                       if (temp == "not found") { temp = "0.10"; }
-                       convert(temp, splitcutoff); 
-                       if (!hard) {    splitcutoff += (5 / (precision * 10.0));  }
+                       temp = validParameter.validFile(parameters, "taxlevel", false);         if (temp == "not found")  { temp = "1"; }
+                       convert(temp, taxLevelCutoff); 
                         
-                       method = validParameter.validFile(parameters, "method", false);
-                       if (method == "not found") { method = "furthest"; }
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found") { method = "furthest"; }
                         
                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
+                       
+                       if ((splitmethod == "distance") || (splitmethod == "classify")) { }
+                       else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance or classify."); m->mothurOutEndLine(); abort = true; }
+                       
+                       if ((splitmethod == "classify") && (taxFile == "")) {  m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true;  }
  
                         showabund = validParameter.validFile(parameters, "showabund", false);
                         if (showabund == "not found") { showabund = "T"; }
@@ -150,11 +165,19 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
  
  void ClusterSplitCommand::help(){
         try {
-               m->mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
-               m->mothurOut("The cluster command parameter options are method, cuttoff, hard, precision, showabund and timing. No parameters are required.\n");
-               m->mothurOut("The cluster command should be in the following format: \n");
-               m->mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
-               m->mothurOut("The acceptable cluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");      
+               m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, processors. Phylip or column and name are required.\n");
+               m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
+               m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
+               m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
+               m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
+               m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
+               m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance or classify. \n");
+               m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
+               m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
+               m->mothurOut("The cluster.split command should be in the following format: \n");
+               m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
+               m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");       
+
         }
         catch(exception& e) {
                 m->errorOut(e, "ClusterSplitCommand", "help");
@@ -177,15 +200,17 @@ int ClusterSplitCommand::execute(){
                 
                 //if user gave a phylip file convert to column file
                 if (format == "phylip") {
+       
                         ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
                         
                         NameAssignment* nameMap = NULL;
+                       convert->setFormat("phylip");
                         convert->read(nameMap);
                         
                         if (m->control_pressed) {  delete convert;  return 0;  }
                         
-                       string distfile = convert->getOutputFile();
-                       
+                       distfile = convert->getOutputFile();
+               
                         //if no names file given with phylip file, create it
                         ListVector* listToMakeNameFile =  convert->getListVector();
                         if (namefile == "") {  //you need to make a namefile for split matrix
@@ -201,13 +226,15 @@ int ClusterSplitCommand::execute(){
                         delete listToMakeNameFile;
                         delete convert;
                 }
-               
                 if (m->control_pressed) { return 0; }
                 
                 time_t estart = time(NULL);
                 
                 //split matrix into non-overlapping groups
-               SplitMatrix* split = new SplitMatrix(distfile, namefile, splitcutoff);
+               SplitMatrix* split;
+               if (splitmethod == "distance")  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod);                      }
+               else                                                    {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod);  }
+               
                 split->split();
                 
                 if (m->control_pressed) { delete split; return 0; }
@@ -216,11 +243,11 @@ int ClusterSplitCommand::execute(){
                 vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
                 delete split;
                 
+               if (m->control_pressed) { return 0; }
+               
                 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
                 estart = time(NULL);
                 
-               if (m->control_pressed) { return 0; }
-               
                 //****************** break up files between processes and cluster each file set ******************************//
                 vector<string> listFileNames;
                 set<string> labels;
@@ -241,6 +268,12 @@ int ClusterSplitCommand::execute(){
                                                 dividedNames[(processToAssign-1)].push_back(distName[i]);
                                         }
                                         
+                                       //not lets reverse the order of ever other process, so we balance big files running with little ones
+                                       for (int i = 0; i < processors; i++) {
+                                               int remainder = ((i+1) % processors);
+                                               if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
+                                       }
+                                       
                                         createProcesses(dividedNames);
                                                         
                                         if (m->control_pressed) { return 0; }
@@ -280,8 +313,12 @@ int ClusterSplitCommand::execute(){
                 if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
                 
                 //****************** merge list file and create rabund and sabund files ******************************//
-                               
-               mergeLists(listFileNames, singletonName, labels);
+               ListVector* listSingle;
+               map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
+               
+               if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+               
+               mergeLists(listFileNames, labelBins, listSingle);
  
                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
                 
@@ -300,21 +337,14 @@ int ClusterSplitCommand::execute(){
         }
  }
  //**********************************************************************************************************************
-int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton, set<string> userLabels){
+map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string> userLabels, ListVector*& listSingle){
         try {
-               if (outputDir == "") { outputDir += hasPath(distfile); }
-               fileroot = outputDir + getRootName(getSimpleName(distfile));
-               
-               openOutputFile(fileroot+ tag + ".sabund",       outSabund);
-               openOutputFile(fileroot+ tag + ".rabund",       outRabund);
-               openOutputFile(fileroot+ tag + ".list",         outList);
                                 
-               outputNames.push_back(fileroot+ tag + ".sabund");
-               outputNames.push_back(fileroot+ tag + ".rabund");
-               outputNames.push_back(fileroot+ tag + ".list");
-                       
+               map<float, int> labelBin;
+               vector<float> orderFloat;
+               int numSingleBins;
+               
                 //read in singletons
-               ListVector* listSingle = NULL;
                 if (singleton != "none") {
                         ifstream in;
                         openInputFile(singleton, in);
@@ -326,99 +356,176 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton,
                                 listSingle->push_back(secondCol);
                         }
                         in.close();
-               }
+                       remove(singleton.c_str());
+                       
+                       numSingleBins = listSingle->getNumBins();
+               }else{  listSingle = NULL; numSingleBins = 0;  }
                 
-               vector<float> orderFloat;
-       
                 //go through users set and make them floats so we can sort them 
                 for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
-                       float temp;
+                       float temp = -10.0;
  
-                       if ((*it != "unique") && (convertTestFloat(*it, temp) == true)){
-                               convert(*it, temp);
-                               orderFloat.push_back(temp);
-                       }else if (*it == "unique") { orderFloat.push_back(-1.0); }
-                       else {
-                               userLabels.erase(*it); 
-                               it--;
-                       }
+                       if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) {       convert(*it, temp);     }
+                       else if (*it == "unique")                                                                               {       temp = -1.0;            }
+                       
+                       orderFloat.push_back(temp);
+                       labelBin[temp] = numSingleBins; //initialize numbins 
                 }
-               
+       
                 //sort order
                 sort(orderFloat.begin(), orderFloat.end());
+               userLabels.clear();
+                       
+               //get the list info from each file
+               for (int k = 0; k < listNames.size(); k++) {
+       
+                       if (m->control_pressed) {  
+                               if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
+                               for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
+                               return labelBin;
+                       }
+                       
+                       InputData* input = new InputData(listNames[k], "list");
+                       ListVector* list = input->getListVector();
+                       string lastLabel = list->getLabel();
+                       
+                       string filledInList = listNames[k] + "filledInTemp";
+                       ofstream outFilled;
+                       openOutputFile(filledInList, outFilled);
+       
+                       //for each label needed
+                       for(int l = 0; l < orderFloat.size(); l++){
+                       
+                               string thisLabel;
+                               if (orderFloat[l] == -1) { thisLabel = "unique"; }
+                               else { thisLabel = toString(orderFloat[l],  length-1);  } 
  
-               vector<InputData*> inputs;
-               vector<string> lastLabels;
-               for (int i = 0; i < listNames.size(); i++) {
-                       InputData* input = new InputData(listNames[i], "list");
-                       inputs.push_back(input);
+                               //this file has reached the end
+                               if (list == NULL) { 
+                                       list = input->getListVector(lastLabel, true); 
+                               }else{  //do you have the distance, or do you need to fill in
+                                               
+                                       float labelFloat;
+                                       if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
+                                       else { convert(list->getLabel(), labelFloat); }
+
+                                       //check for missing labels
+                                       if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
+                                               //if its bigger get last label, otherwise keep it
+                                               delete list;
+                                               list = input->getListVector(lastLabel, true);  //get last list vector to use, you actually want to move back in the file
+                                       }
+                                       lastLabel = list->getLabel();
+                               }
+                               
+                               //print to new file
+                               list->setLabel(thisLabel);
+                               list->print(outFilled);
+               
+                               //update labelBin
+                               labelBin[orderFloat[l]] += list->getNumBins();
+                                                                       
+                               delete list;
+                                                                       
+                               list = input->getListVector();
+                       }
                         
-                       ifstream in;
-                       openInputFile(listNames[i], in);
-                       ListVector tempList(in);
-                       lastLabels.push_back(tempList.getLabel());
-                       in.close();
+                       if (list != NULL) { delete list; }
+                       delete input;
+                       
+                       outFilled.close();
+                       remove(listNames[k].c_str());
+                       rename(filledInList.c_str(), listNames[k].c_str());
                 }
                 
-               ListVector* merged = NULL;
+               return labelBin;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ClusterSplitCommand", "completeListFile");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
+       try {
+               if (outputDir == "") { outputDir += hasPath(distfile); }
+               fileroot = outputDir + getRootName(getSimpleName(distfile));
+               
+               openOutputFile(fileroot+ tag + ".sabund",       outSabund);
+               openOutputFile(fileroot+ tag + ".rabund",       outRabund);
+               openOutputFile(fileroot+ tag + ".list",         outList);
                                 
+               outputNames.push_back(fileroot+ tag + ".sabund");
+               outputNames.push_back(fileroot+ tag + ".rabund");
+               outputNames.push_back(fileroot+ tag + ".list");
+               
+               map<float, int>::iterator itLabel;
+
                 //for each label needed
-               for(int l = 0; l < orderFloat.size(); l++){
+               for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
                         
                         string thisLabel;
-                       if (orderFloat[l] == -1) { thisLabel = "unique"; }
-                       else { thisLabel = toString(orderFloat[l],  length-1);  } 
-       
+                       if (itLabel->first == -1) { thisLabel = "unique"; }
+                       else { thisLabel = toString(itLabel->first,  length-1);  } 
+                       
+                       outList << thisLabel << '\t' << itLabel->second << '\t';
+
+                       RAbundVector* rabund = new RAbundVector();
+                       rabund->setLabel(thisLabel);
+
+                       //add in singletons
+                       if (listSingle != NULL) {
+                               for (int j = 0; j < listSingle->getNumBins(); j++) {
+                                       outList << listSingle->get(j) << '\t';
+                                       rabund->push_back(getNumNames(listSingle->get(j)));
+                               }
+                       }
+                       
                         //get the list info from each file
                         for (int k = 0; k < listNames.size(); k++) {
         
-                               ListVector* list = inputs[k]->getListVector();
+                               if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
+                               
+                               InputData* input = new InputData(listNames[k], "list");
+                               ListVector* list = input->getListVector(thisLabel);
                                 
                                 //this file has reached the end
-                               if (list == NULL) { list = inputs[k]->getListVector(lastLabels[k], true); }     
-                                               
-                               float labelFloat;
-                               if (list->getLabel() == "unique") {  labelFloat = -1.0;  }
-                               else { convert(list->getLabel(), labelFloat); }
-
-                               //check for missing labels
-                               if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
-                                       //if its bigger get last label, otherwise keep it
+                               if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }        
+                               else {          
+                                       for (int j = 0; j < list->getNumBins(); j++) {
+                                               outList << list->get(j) << '\t';
+                                               rabund->push_back(getNumNames(list->get(j)));
+                                       }
                                         delete list;
-                                       list = inputs[k]->getListVector(lastLabels[k], true); //get last list vector to use, you actually want to move back in the file
                                 }
-                               lastLabels[k] = list->getLabel();
-
-                               //is this the first file
-                               if (merged == NULL) {  merged = new ListVector();  merged->setLabel(thisLabel); }
-                               
-                               for (int j = 0; j < list->getNumBins(); j++) {
-                                       merged->push_back(list->get(j));
-                               }
-                               
-                               delete list;
+                               delete input;
                         }
                         
-                       //add in singletons
-                       for (int j = 0; j < listSingle->getNumBins(); j++) {
-                               merged->push_back(listSingle->get(j));
-                       }
-
-                       //print to files
-                       printData(merged);
+                       SAbundVector sabund = rabund->getSAbundVector();
+                       
+                       sabund.print(outSabund);
+                       rabund->print(outRabund);
+                       outList << endl;
                         
-                       delete merged; merged = NULL;
+                       delete rabund;
                 }
                 
-               if (listSingle != NULL) { delete listSingle; remove(singleton.c_str());  }
+               outList.close();
+               outRabund.close();
+               outSabund.close();
                 
-               for (int i = 0; i < listNames.size(); i++) {  delete inputs[i];  remove(listNames[i].c_str());  }
+               if (listSingle != NULL) { delete listSingle;  }
+               
+               for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
+               
+               return 0;
         }
         catch(exception& e) {
                 m->errorOut(e, "ClusterSplitCommand", "mergeLists");
                 exit(1);
         }
  }
+
  //**********************************************************************************************************************
  
  void ClusterSplitCommand::printData(ListVector* oldList){
@@ -575,7 +682,12 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                                 cluster->update(cutoff);
         
                                 float dist = matrix->getSmallDist();
-                               float rndDist = roundDist(dist, precision);
+                               float rndDist;
+                               if (hard) {
+                                       rndDist = ceilDist(dist, precision); 
+                               }else{
+                                       rndDist = roundDist(dist, precision); 
+                               }
  
                                 if(previousDist <= 0.0000 && dist != previousDist){
                                         oldList.setLabel("unique");
@@ -604,7 +716,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                                 oldList.print(listFile);
                                 if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
                         }
-               
+       
                         delete matrix; delete list;     delete cluster; delete rabund; 
                         listFile.close();
                         
@@ -617,7 +729,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                         remove(thisNamefile.c_str());
                 }
                 
-                               
+                                       
                 return listFileNames;
         
         }