else {
//valid paramters for this command
- string Array[] = {"phylip","column","name","cutoff","precision","method","showabund","timing","hard","processors","splitcutoff","outputdir","inputdir"};
+ string Array[] = {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","showabund","timing","hard","processors","outputdir","inputdir"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["name"] = inputDir + it->second; }
}
+
+ it = parameters.find("taxonomy");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
+ }
}
//check for required parameters
if (namefile == "not open") { abort = true; }
else if (namefile == "not found") { namefile = ""; }
- if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a hcluster command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
- else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
+ taxFile = validParameter.validFile(parameters, "taxonomy", true);
+ if (taxFile == "not open") { abort = true; }
+ else if (taxFile == "not found") { taxFile = ""; }
+
+ if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a cluster.split command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
+ else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a cluster.split command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
if (columnfile != "") {
- if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
+ if (namefile == "") { m->mothurOut("You need to provide a namefile if you are going to use the column format."); m->mothurOutEndLine(); abort = true; }
}
//check for optional parameter and set defaults
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
convert(temp, processors);
- temp = validParameter.validFile(parameters, "cutoff", false);
- if (temp == "not found") { temp = "10"; }
+ splitmethod = validParameter.validFile(parameters, "splitmethod", false); if (splitmethod == "not found") { splitmethod = "distance"; }
+
+ temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
convert(temp, cutoff);
- if (!hard) { cutoff += (5 / (precision * 10.0)); }
+ cutoff += (5 / (precision * 10.0));
- temp = validParameter.validFile(parameters, "splitcutoff", false);
- if (temp == "not found") { temp = "0.10"; }
- convert(temp, splitcutoff);
- if (!hard) { splitcutoff += (5 / (precision * 10.0)); }
+ temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
+ convert(temp, taxLevelCutoff);
- method = validParameter.validFile(parameters, "method", false);
- if (method == "not found") { method = "furthest"; }
+ method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "furthest"; }
if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
+
+ if ((splitmethod == "distance") || (splitmethod == "classify")) { }
+ else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance or classify."); m->mothurOutEndLine(); abort = true; }
+
+ if ((splitmethod == "classify") && (taxFile == "")) { m->mothurOut("You need to provide a taxonomy file if you are going to use the classify splitmethod."); m->mothurOutEndLine(); abort = true; }
showabund = validParameter.validFile(parameters, "showabund", false);
if (showabund == "not found") { showabund = "T"; }
void ClusterSplitCommand::help(){
try {
- m->mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
- m->mothurOut("The cluster command parameter options are method, cuttoff, hard, precision, showabund and timing. No parameters are required.\n");
- m->mothurOut("The cluster command should be in the following format: \n");
- m->mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
- m->mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
+ m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, processors. Phylip or column and name are required.\n");
+ m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
+ m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
+ m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
+ m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
+ m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
+ m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance or classify. \n");
+ m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
+ m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
+ m->mothurOut("The cluster.split command should be in the following format: \n");
+ m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
+ m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");
+
}
catch(exception& e) {
m->errorOut(e, "ClusterSplitCommand", "help");
//if user gave a phylip file convert to column file
if (format == "phylip") {
+
ReadCluster* convert = new ReadCluster(distfile, cutoff, outputDir, false);
NameAssignment* nameMap = NULL;
+ convert->setFormat("phylip");
convert->read(nameMap);
if (m->control_pressed) { delete convert; return 0; }
- string distfile = convert->getOutputFile();
-
+ distfile = convert->getOutputFile();
+
//if no names file given with phylip file, create it
ListVector* listToMakeNameFile = convert->getListVector();
if (namefile == "") { //you need to make a namefile for split matrix
delete listToMakeNameFile;
delete convert;
}
-
if (m->control_pressed) { return 0; }
time_t estart = time(NULL);
//split matrix into non-overlapping groups
- SplitMatrix* split = new SplitMatrix(distfile, namefile, splitcutoff);
+ SplitMatrix* split;
+ if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod); }
+ else { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod); }
+
split->split();
if (m->control_pressed) { delete split; return 0; }
vector< map<string, string> > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size
delete split;
+ if (m->control_pressed) { return 0; }
+
m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
estart = time(NULL);
- if (m->control_pressed) { return 0; }
-
//****************** break up files between processes and cluster each file set ******************************//
vector<string> listFileNames;
set<string> labels;
dividedNames[(processToAssign-1)].push_back(distName[i]);
}
+ //not lets reverse the order of ever other process, so we balance big files running with little ones
+ for (int i = 0; i < processors; i++) {
+ int remainder = ((i+1) % processors);
+ if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
+ }
+
createProcesses(dividedNames);
if (m->control_pressed) { return 0; }
if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
//****************** merge list file and create rabund and sabund files ******************************//
-
- mergeLists(listFileNames, singletonName, labels);
+ ListVector* listSingle;
+ map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
+
+ if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+
+ mergeLists(listFileNames, labelBins, listSingle);
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
}
}
//**********************************************************************************************************************
-int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton, set<string> userLabels){
+map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string> userLabels, ListVector*& listSingle){
try {
- if (outputDir == "") { outputDir += hasPath(distfile); }
- fileroot = outputDir + getRootName(getSimpleName(distfile));
-
- openOutputFile(fileroot+ tag + ".sabund", outSabund);
- openOutputFile(fileroot+ tag + ".rabund", outRabund);
- openOutputFile(fileroot+ tag + ".list", outList);
- outputNames.push_back(fileroot+ tag + ".sabund");
- outputNames.push_back(fileroot+ tag + ".rabund");
- outputNames.push_back(fileroot+ tag + ".list");
-
+ map<float, int> labelBin;
+ vector<float> orderFloat;
+ int numSingleBins;
+
//read in singletons
- ListVector* listSingle = NULL;
if (singleton != "none") {
ifstream in;
openInputFile(singleton, in);
listSingle->push_back(secondCol);
}
in.close();
- }
+ remove(singleton.c_str());
+
+ numSingleBins = listSingle->getNumBins();
+ }else{ listSingle = NULL; numSingleBins = 0; }
- vector<float> orderFloat;
-
//go through users set and make them floats so we can sort them
for(set<string>::iterator it = userLabels.begin(); it != userLabels.end(); ++it) {
- float temp;
+ float temp = -10.0;
- if ((*it != "unique") && (convertTestFloat(*it, temp) == true)){
- convert(*it, temp);
- orderFloat.push_back(temp);
- }else if (*it == "unique") { orderFloat.push_back(-1.0); }
- else {
- userLabels.erase(*it);
- it--;
- }
+ if ((*it != "unique") && (convertTestFloat(*it, temp) == true)) { convert(*it, temp); }
+ else if (*it == "unique") { temp = -1.0; }
+
+ orderFloat.push_back(temp);
+ labelBin[temp] = numSingleBins; //initialize numbins
}
-
+
//sort order
sort(orderFloat.begin(), orderFloat.end());
+ userLabels.clear();
+
+ //get the list info from each file
+ for (int k = 0; k < listNames.size(); k++) {
+
+ if (m->control_pressed) {
+ if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
+ for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
+ return labelBin;
+ }
+
+ InputData* input = new InputData(listNames[k], "list");
+ ListVector* list = input->getListVector();
+ string lastLabel = list->getLabel();
+
+ string filledInList = listNames[k] + "filledInTemp";
+ ofstream outFilled;
+ openOutputFile(filledInList, outFilled);
+
+ //for each label needed
+ for(int l = 0; l < orderFloat.size(); l++){
+
+ string thisLabel;
+ if (orderFloat[l] == -1) { thisLabel = "unique"; }
+ else { thisLabel = toString(orderFloat[l], length-1); }
- vector<InputData*> inputs;
- vector<string> lastLabels;
- for (int i = 0; i < listNames.size(); i++) {
- InputData* input = new InputData(listNames[i], "list");
- inputs.push_back(input);
+ //this file has reached the end
+ if (list == NULL) {
+ list = input->getListVector(lastLabel, true);
+ }else{ //do you have the distance, or do you need to fill in
+
+ float labelFloat;
+ if (list->getLabel() == "unique") { labelFloat = -1.0; }
+ else { convert(list->getLabel(), labelFloat); }
+
+ //check for missing labels
+ if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
+ //if its bigger get last label, otherwise keep it
+ delete list;
+ list = input->getListVector(lastLabel, true); //get last list vector to use, you actually want to move back in the file
+ }
+ lastLabel = list->getLabel();
+ }
+
+ //print to new file
+ list->setLabel(thisLabel);
+ list->print(outFilled);
+
+ //update labelBin
+ labelBin[orderFloat[l]] += list->getNumBins();
+
+ delete list;
+
+ list = input->getListVector();
+ }
- ifstream in;
- openInputFile(listNames[i], in);
- ListVector tempList(in);
- lastLabels.push_back(tempList.getLabel());
- in.close();
+ if (list != NULL) { delete list; }
+ delete input;
+
+ outFilled.close();
+ remove(listNames[k].c_str());
+ rename(filledInList.c_str(), listNames[k].c_str());
}
- ListVector* merged = NULL;
+ return labelBin;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ClusterSplitCommand", "completeListFile");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
+int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> userLabels, ListVector* listSingle){
+ try {
+ if (outputDir == "") { outputDir += hasPath(distfile); }
+ fileroot = outputDir + getRootName(getSimpleName(distfile));
+
+ openOutputFile(fileroot+ tag + ".sabund", outSabund);
+ openOutputFile(fileroot+ tag + ".rabund", outRabund);
+ openOutputFile(fileroot+ tag + ".list", outList);
+ outputNames.push_back(fileroot+ tag + ".sabund");
+ outputNames.push_back(fileroot+ tag + ".rabund");
+ outputNames.push_back(fileroot+ tag + ".list");
+
+ map<float, int>::iterator itLabel;
+
//for each label needed
- for(int l = 0; l < orderFloat.size(); l++){
+ for(itLabel = userLabels.begin(); itLabel != userLabels.end(); itLabel++) {
string thisLabel;
- if (orderFloat[l] == -1) { thisLabel = "unique"; }
- else { thisLabel = toString(orderFloat[l], length-1); }
-
+ if (itLabel->first == -1) { thisLabel = "unique"; }
+ else { thisLabel = toString(itLabel->first, length-1); }
+
+ outList << thisLabel << '\t' << itLabel->second << '\t';
+
+ RAbundVector* rabund = new RAbundVector();
+ rabund->setLabel(thisLabel);
+
+ //add in singletons
+ if (listSingle != NULL) {
+ for (int j = 0; j < listSingle->getNumBins(); j++) {
+ outList << listSingle->get(j) << '\t';
+ rabund->push_back(getNumNames(listSingle->get(j)));
+ }
+ }
+
//get the list info from each file
for (int k = 0; k < listNames.size(); k++) {
- ListVector* list = inputs[k]->getListVector();
+ if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
+
+ InputData* input = new InputData(listNames[k], "list");
+ ListVector* list = input->getListVector(thisLabel);
//this file has reached the end
- if (list == NULL) { list = inputs[k]->getListVector(lastLabels[k], true); }
-
- float labelFloat;
- if (list->getLabel() == "unique") { labelFloat = -1.0; }
- else { convert(list->getLabel(), labelFloat); }
-
- //check for missing labels
- if (labelFloat > orderFloat[l]) { //you are missing the label, get the next smallest one
- //if its bigger get last label, otherwise keep it
+ if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); }
+ else {
+ for (int j = 0; j < list->getNumBins(); j++) {
+ outList << list->get(j) << '\t';
+ rabund->push_back(getNumNames(list->get(j)));
+ }
delete list;
- list = inputs[k]->getListVector(lastLabels[k], true); //get last list vector to use, you actually want to move back in the file
}
- lastLabels[k] = list->getLabel();
-
- //is this the first file
- if (merged == NULL) { merged = new ListVector(); merged->setLabel(thisLabel); }
-
- for (int j = 0; j < list->getNumBins(); j++) {
- merged->push_back(list->get(j));
- }
-
- delete list;
+ delete input;
}
- //add in singletons
- for (int j = 0; j < listSingle->getNumBins(); j++) {
- merged->push_back(listSingle->get(j));
- }
-
- //print to files
- printData(merged);
+ SAbundVector sabund = rabund->getSAbundVector();
+
+ sabund.print(outSabund);
+ rabund->print(outRabund);
+ outList << endl;
- delete merged; merged = NULL;
+ delete rabund;
}
- if (listSingle != NULL) { delete listSingle; remove(singleton.c_str()); }
+ outList.close();
+ outRabund.close();
+ outSabund.close();
- for (int i = 0; i < listNames.size(); i++) { delete inputs[i]; remove(listNames[i].c_str()); }
+ if (listSingle != NULL) { delete listSingle; }
+
+ for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
+
+ return 0;
}
catch(exception& e) {
m->errorOut(e, "ClusterSplitCommand", "mergeLists");
exit(1);
}
}
+
//**********************************************************************************************************************
void ClusterSplitCommand::printData(ListVector* oldList){
cluster->update(cutoff);
float dist = matrix->getSmallDist();
- float rndDist = roundDist(dist, precision);
+ float rndDist;
+ if (hard) {
+ rndDist = ceilDist(dist, precision);
+ }else{
+ rndDist = roundDist(dist, precision);
+ }
if(previousDist <= 0.0000 && dist != previousDist){
oldList.setLabel("unique");
oldList.print(listFile);
if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
}
-
+
delete matrix; delete list; delete cluster; delete rabund;
listFile.close();
remove(thisNamefile.c_str());
}
-
+
return listFileNames;
}