*/
#include "clustersplitcommand.h"
-#include "readcluster.h"
-#include "splitmatrix.h"
-#include "readphylip.h"
-#include "readcolumn.h"
-#include "readmatrix.hpp"
-#include "inputdata.h"
//**********************************************************************************************************************
CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
- CommandParameter ptaxlevel("taxlevel", "Number", "", "1", "", "", "",false,false); parameters.push_back(ptaxlevel);
+ CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
- CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
+ CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
+ CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
helpString += "The phylip and column parameter allow you to enter your distance file. \n";
helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
- helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n";
+ helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
- helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n";
+ helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
+ helpString += "The classic parameter allows you to indicate that you want to run your files with cluster.classic. It is only valid with splitmethod=fasta. Default=f.\n";
#ifdef USE_MPI
helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
#endif
}
}
//**********************************************************************************************************************
+string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){
+ try {
+ string outputFileName = "";
+ map<string, vector<string> >::iterator it;
+
+ //is this a type this command creates
+ it = outputTypes.find(type);
+ if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
+ else {
+ if (type == "list") { outputFileName = "list"; }
+ else if (type == "rabund") { outputFileName = "rabund"; }
+ else if (type == "sabund") { outputFileName = "sabund"; }
+ else if (type == "column") { outputFileName = "dist"; }
+ else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
+ }
+ return outputFileName;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
ClusterSplitCommand::ClusterSplitCommand(){
try {
abort = true; calledHelp = true;
else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); }
taxFile = validParameter.validFile(parameters, "taxonomy", true);
- if (taxFile == "not open") { abort = true; }
+ if (taxFile == "not open") { taxFile = ""; abort = true; }
else if (taxFile == "not found") { taxFile = ""; }
- else { m->setTaxonomyFile(taxFile); }
+ else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) {
//is there are current file available for either of these?
if (temp == "not found") { temp = "100"; }
//saves precision legnth for formatting below
length = temp.length();
- convert(temp, precision);
+ m->mothurConvert(temp, precision);
temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
hard = m->isTrue(temp);
temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
large = m->isTrue(temp);
-
+
temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
m->setProcessors(temp);
- convert(temp, processors);
+ m->mothurConvert(temp, processors);
temp = validParameter.validFile(parameters, "splitmethod", false);
- if (splitmethod != "fasta") {
+ if ((splitmethod != "fasta") && (splitmethod != "classify")) {
if (temp == "not found") { splitmethod = "distance"; }
else { splitmethod = temp; }
}
- temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; }
- convert(temp, cutoff);
+ temp = validParameter.validFile(parameters, "classic", false); if (temp == "not found") { temp = "F"; }
+ classic = m->isTrue(temp);
+
+ if ((splitmethod != "fasta") && classic) { m->mothurOut("splitmethod must be fasta to use cluster.classic.\n"); abort=true; }
+
+ temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; }
+ m->mothurConvert(temp, cutoff);
cutoff += (5 / (precision * 10.0));
- temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; }
- convert(temp, taxLevelCutoff);
+ temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; }
+ m->mothurConvert(temp, taxLevelCutoff);
method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; }
- if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
+ if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
SplitMatrix* split;
if (splitmethod == "distance") { split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large); }
else if (splitmethod == "classify") { split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large); }
- else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, outputDir); }
+ else if (splitmethod == "fasta") { split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, cutoff, splitmethod, processors, classic, outputDir); }
else { m->mothurOut("Not a valid splitting method. Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
split->split();
MPI_Barrier(MPI_COMM_WORLD);
#else
-
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
+ //sanity check
+ if (processors > distName.size()) { processors = distName.size(); }
+
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
if(processors == 1){
listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
}else{
- vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
- dividedNames.resize(processors);
-
- //for each file group figure out which process will complete it
- //want to divide the load intelligently so the big files are spread between processes
- for (int i = 0; i < distName.size(); i++) {
- int processToAssign = (i+1) % processors;
- if (processToAssign == 0) { processToAssign = processors; }
-
- dividedNames[(processToAssign-1)].push_back(distName[i]);
- }
-
- //not lets reverse the order of ever other process, so we balance big files running with little ones
- for (int i = 0; i < processors; i++) {
- int remainder = ((i+1) % processors);
- if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
- }
-
- createProcesses(dividedNames);
-
- if (m->control_pressed) { return 0; }
-
- //get list of list file names from each process
- for(int i=0;i<processors;i++){
- string filename = toString(processIDS[i]) + ".temp";
- ifstream in;
- m->openInputFile(filename, in);
-
- in >> tag; m->gobble(in);
-
- while(!in.eof()) {
- string tempName;
- in >> tempName; m->gobble(in);
- listFileNames.push_back(tempName);
- }
- in.close();
- remove((toString(processIDS[i]) + ".temp").c_str());
-
- //get labels
- filename = toString(processIDS[i]) + ".temp.labels";
- ifstream in2;
- m->openInputFile(filename, in2);
-
- float tempCutoff;
- in2 >> tempCutoff; m->gobble(in2);
- if (tempCutoff < cutoff) { cutoff = tempCutoff; }
-
- while(!in2.eof()) {
- string tempName;
- in2 >> tempName; m->gobble(in2);
- if (labels.count(tempName) == 0) { labels.insert(tempName); }
- }
- in2.close();
- remove((toString(processIDS[i]) + ".temp.labels").c_str());
- }
- }
+ listFileNames = createProcesses(distName, labels);
+ }
#else
listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
#endif
#endif
- if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
+ if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); }
ListVector* listSingle;
map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
- if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+ if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
mergeLists(listFileNames, labelBins, listSingle);
- if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+ if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
listSingle->push_back(secondCol);
}
in.close();
- remove(singleton.c_str());
+ m->mothurRemove(singleton);
numSingleBins = listSingle->getNumBins();
}else{ listSingle = NULL; numSingleBins = 0; }
for (int k = 0; k < listNames.size(); k++) {
if (m->control_pressed) {
- if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); }
- for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
+ if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); }
+ for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
return labelBin;
}
delete input;
outFilled.close();
- remove(listNames[k].c_str());
+ m->mothurRemove(listNames[k]);
rename(filledInList.c_str(), listNames[k].c_str());
}
if (outputDir == "") { outputDir += m->hasPath(distfile); }
fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
- m->openOutputFile(fileroot+ tag + ".sabund", outSabund);
- m->openOutputFile(fileroot+ tag + ".rabund", outRabund);
- m->openOutputFile(fileroot+ tag + ".list", outList);
-
- outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["list"].push_back(fileroot+ tag + ".list");
- outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
- outputNames.push_back(fileroot+ tag + ".list"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
+ string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
+ string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
+ string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list");
+
+ m->openOutputFile(sabundFileName, outSabund);
+ m->openOutputFile(rabundFileName, outRabund);
+ m->openOutputFile(listFileName, outList);
+ outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
+ outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
+ outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
map<float, int>::iterator itLabel;
//for each label needed
//get the list info from each file
for (int k = 0; k < listNames.size(); k++) {
- if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; }
+ if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } delete rabund; return 0; }
InputData* input = new InputData(listNames[k], "list");
ListVector* list = input->getListVector(thisLabel);
if (listSingle != NULL) { delete listSingle; }
- for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); }
+ for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); }
return 0;
}
}
}
//**********************************************************************************************************************
-int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
+vector<string> ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
try {
+
+ vector<string> listFiles;
+ vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
+ dividedNames.resize(processors);
+
+ //for each file group figure out which process will complete it
+ //want to divide the load intelligently so the big files are spread between processes
+ for (int i = 0; i < distName.size(); i++) {
+ //cout << i << endl;
+ int processToAssign = (i+1) % processors;
+ if (processToAssign == 0) { processToAssign = processors; }
+
+ dividedNames[(processToAssign-1)].push_back(distName[i]);
+ if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
+ }
+
+ //not lets reverse the order of ever other process, so we balance big files running with little ones
+ for (int i = 0; i < processors; i++) {
+ //cout << i << endl;
+ int remainder = ((i+1) % processors);
+ if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); }
+ }
+
+ if (m->control_pressed) { return listFiles; }
- #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
- int process = 0;
- int exitCommand = 1;
+ #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+ int process = 1;
processIDS.clear();
//loop through and create all the processes you want
}
}
+ //do your part
+ listFiles = cluster(dividedNames[0], labels);
+
//force parent to wait until all the processes are done
- for (int i=0;i<processors;i++) {
+ for (int i=0;i< processIDS.size();i++) {
int temp = processIDS[i];
wait(&temp);
}
+
+ //get list of list file names from each process
+ for(int i=0;i<processIDS.size();i++){
+ string filename = toString(processIDS[i]) + ".temp";
+ ifstream in;
+ m->openInputFile(filename, in);
+
+ in >> tag; m->gobble(in);
+
+ while(!in.eof()) {
+ string tempName;
+ in >> tempName; m->gobble(in);
+ listFiles.push_back(tempName);
+ }
+ in.close();
+ m->mothurRemove((toString(processIDS[i]) + ".temp"));
+
+ //get labels
+ filename = toString(processIDS[i]) + ".temp.labels";
+ ifstream in2;
+ m->openInputFile(filename, in2);
+
+ float tempCutoff;
+ in2 >> tempCutoff; m->gobble(in2);
+ if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+
+ while(!in2.eof()) {
+ string tempName;
+ in2 >> tempName; m->gobble(in2);
+ if (labels.count(tempName) == 0) { labels.insert(tempName); }
+ }
+ in2.close();
+ m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
+ }
+
+
+ #else
+
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+ //Windows version shared memory, so be careful when passing variables through the clusterData struct.
+ //Above fork() will clone, so memory is separate, but that's not the case with windows,
+ //Taking advantage of shared memory to allow both threads to add labels.
+ //////////////////////////////////////////////////////////////////////////////////////////////////////
+
+ vector<clusterData*> pDataArray;
+ DWORD dwThreadIdArray[processors-1];
+ HANDLE hThreadArray[processors-1];
- return exitCommand;
+ //Create processor worker threads.
+ for( int i=1; i<processors; i++ ){
+ // Allocate memory for thread data.
+ clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
+ pDataArray.push_back(tempCluster);
+ processIDS.push_back(i);
+
+ //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+ //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+ hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
+
+ }
+
+ //do your part
+ listFiles = cluster(dividedNames[0], labels);
+
+ //Wait until all threads have terminated.
+ WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+
+ //Close all thread handles and free memory allocations.
+ for(int i=0; i < pDataArray.size(); i++){
+ //get tag
+ tag = pDataArray[i]->tag;
+ //get listfiles created
+ for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
+ //get labels
+ set<string>::iterator it;
+ for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
+ //check cutoff
+ if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
+ CloseHandle(hThreadArray[i]);
+ delete pDataArray[i];
+ }
+
#endif
+
+ return listFiles;
}
catch(exception& e) {
vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
try {
- Cluster* cluster;
- SparseMatrix* matrix;
- ListVector* list;
- ListVector oldList;
- RAbundVector* rabund;
vector<string> listFileNames;
-
double smallestCutoff = cutoff;
//cluster each distance file
for (int i = 0; i < distNames.size(); i++) {
- if (m->control_pressed) { return listFileNames; }
-
+
string thisNamefile = distNames[i].begin()->second;
string thisDistFile = distNames[i].begin()->first;
-
- #ifdef USE_MPI
- int pid;
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-
- //output your files too
- if (pid != 0) {
- cout << endl << "Reading " << thisDistFile << endl;
- }
- #endif
-
- m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
-
- ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
- read->setCutoff(cutoff);
-
- NameAssignment* nameMap = new NameAssignment(thisNamefile);
- nameMap->readMap();
- read->read(nameMap);
-
- if (m->control_pressed) { delete read; delete nameMap; return listFileNames; }
-
- list = read->getListVector();
- oldList = *list;
- matrix = read->getMatrix();
-
- delete read;
- delete nameMap;
-
- #ifdef USE_MPI
- //output your files too
- if (pid != 0) {
- cout << endl << "Clustering " << thisDistFile << endl;
- }
- #endif
-
- m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
-
- rabund = new RAbundVector(list->getRAbundVector());
-
- //create cluster
- if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
- else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
- else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
- tag = cluster->getTag();
-
- if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
- fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
-
- ofstream listFile;
- m->openOutputFile(fileroot+ tag + ".list", listFile);
-
- listFileNames.push_back(fileroot+ tag + ".list");
-
- float previousDist = 0.00000;
- float rndPreviousDist = 0.00000;
-
- oldList = *list;
-
- print_start = true;
- start = time(NULL);
- double saveCutoff = cutoff;
-
- while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
-
- if (m->control_pressed) { //clean up
- delete matrix; delete list; delete cluster; delete rabund;
- listFile.close();
- for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
- listFileNames.clear(); return listFileNames;
- }
-
- cluster->update(saveCutoff);
-
- float dist = matrix->getSmallDist();
- float rndDist;
- if (hard) {
- rndDist = m->ceilDist(dist, precision);
- }else{
- rndDist = m->roundDist(dist, precision);
- }
-
- if(previousDist <= 0.0000 && dist != previousDist){
- oldList.setLabel("unique");
- oldList.print(listFile);
- if (labels.count("unique") == 0) { labels.insert("unique"); }
- }
- else if(rndDist != rndPreviousDist){
- oldList.setLabel(toString(rndPreviousDist, length-1));
- oldList.print(listFile);
- if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
- }
-
- previousDist = dist;
- rndPreviousDist = rndDist;
- oldList = *list;
- }
+ string listFileName = "";
+ if (classic) { listFileName = clusterClassicFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
+ else { listFileName = clusterFile(thisDistFile, thisNamefile, labels, smallestCutoff); }
-
- if(previousDist <= 0.0000){
- oldList.setLabel("unique");
- oldList.print(listFile);
- if (labels.count("unique") == 0) { labels.insert("unique"); }
- }
- else if(rndPreviousDist<cutoff){
- oldList.setLabel(toString(rndPreviousDist, length-1));
- oldList.print(listFile);
- if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
- }
-
- delete matrix; delete list; delete cluster; delete rabund;
- listFile.close();
-
if (m->control_pressed) { //clean up
- for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); }
+ for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); }
listFileNames.clear(); return listFileNames;
}
-
- remove(thisDistFile.c_str());
- remove(thisNamefile.c_str());
-
- if (saveCutoff != cutoff) {
- if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
- else { saveCutoff = m->roundDist(saveCutoff, precision); }
-
- m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
- }
-
- if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
- }
+
+ listFileNames.push_back(listFileName);
+ }
cutoff = smallestCutoff;
}
//**********************************************************************************************************************
+string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
+ try {
+ string listFileName = "";
+
+ ListVector* list = NULL;
+ ListVector oldList;
+ RAbundVector* rabund = NULL;
+
+#ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ //output your files too
+ if (pid != 0) {
+ cout << endl << "Reading " << thisDistFile << endl;
+ }
+#endif
+
+ m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
+
+ NameAssignment* nameMap = new NameAssignment(thisNamefile);
+ nameMap->readMap();
+
+ //reads phylip file storing data in 2D vector, also fills list and rabund
+ bool sim = false;
+ ClusterClassic* cluster = new ClusterClassic(cutoff, method, sim);
+ cluster->readPhylipFile(thisDistFile, nameMap);
+ tag = cluster->getTag();
+
+ if (m->control_pressed) { delete cluster; return 0; }
+
+ list = cluster->getListVector();
+ rabund = cluster->getRAbundVector();
+
+ if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
+ fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
+ listFileName = fileroot+ tag + ".list";
+
+ ofstream listFile;
+ m->openOutputFile(fileroot+ tag + ".list", listFile);
+
+ float previousDist = 0.00000;
+ float rndPreviousDist = 0.00000;
+ oldList = *list;
+
+#ifdef USE_MPI
+ //output your files too
+ if (pid != 0) {
+ cout << endl << "Clustering " << thisDistFile << endl;
+ }
+#endif
+
+ m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
+
+ while ((cluster->getSmallDist() < cutoff) && (cluster->getNSeqs() > 1)){
+ if (m->control_pressed) { delete cluster; delete list; delete rabund; listFile.close(); return listFileName; }
+
+ cluster->update(cutoff);
+
+ float dist = cluster->getSmallDist();
+ float rndDist;
+ if (hard) {
+ rndDist = m->ceilDist(dist, precision);
+ }else{
+ rndDist = m->roundDist(dist, precision);
+ }
+
+ if(previousDist <= 0.0000 && dist != previousDist){
+ oldList.setLabel("unique");
+ oldList.print(listFile);
+ if (labels.count("unique") == 0) { labels.insert("unique"); }
+ }
+ else if(rndDist != rndPreviousDist){
+ oldList.setLabel(toString(rndPreviousDist, length-1));
+ oldList.print(listFile);
+ if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
+ }
+
+
+ previousDist = dist;
+ rndPreviousDist = rndDist;
+ oldList = *list;
+ }
+
+ if(previousDist <= 0.0000){
+ oldList.setLabel("unique");
+ oldList.print(listFile);
+ if (labels.count("unique") == 0) { labels.insert("unique"); }
+ }
+ else if(rndPreviousDist<cutoff){
+ oldList.setLabel(toString(rndPreviousDist, length-1));
+ oldList.print(listFile);
+ if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
+ }
+
+
+ listFile.close();
+
+ delete cluster; delete nameMap; delete list; delete rabund;
+
+
+ return listFileName;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ClusterSplitCommand", "clusterClassicFile");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile, set<string>& labels, double& smallestCutoff){
+ try {
+ string listFileName = "";
+
+ Cluster* cluster = NULL;
+ SparseMatrix* matrix = NULL;
+ ListVector* list = NULL;
+ ListVector oldList;
+ RAbundVector* rabund = NULL;
+
+ if (m->control_pressed) { return listFileName; }
+
+#ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ //output your files too
+ if (pid != 0) {
+ cout << endl << "Reading " << thisDistFile << endl;
+ }
+#endif
+
+ m->mothurOutEndLine(); m->mothurOut("Reading " + thisDistFile); m->mothurOutEndLine();
+
+ ReadMatrix* read = new ReadColumnMatrix(thisDistFile);
+ read->setCutoff(cutoff);
+
+ NameAssignment* nameMap = new NameAssignment(thisNamefile);
+ nameMap->readMap();
+ read->read(nameMap);
+
+ if (m->control_pressed) { delete read; delete nameMap; return listFileName; }
+
+ list = read->getListVector();
+ oldList = *list;
+ matrix = read->getMatrix();
+
+ delete read; read = NULL;
+ delete nameMap; nameMap = NULL;
+
+
+#ifdef USE_MPI
+ //output your files too
+ if (pid != 0) {
+ cout << endl << "Clustering " << thisDistFile << endl;
+ }
+#endif
+
+ m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
+
+ rabund = new RAbundVector(list->getRAbundVector());
+
+ //create cluster
+ if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
+ else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
+ else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); }
+ tag = cluster->getTag();
+
+ if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
+ fileroot = outputDir + m->getRootName(m->getSimpleName(thisDistFile));
+
+ ofstream listFile;
+ m->openOutputFile(fileroot+ tag + ".list", listFile);
+
+ listFileName = fileroot+ tag + ".list";
+
+ float previousDist = 0.00000;
+ float rndPreviousDist = 0.00000;
+
+ oldList = *list;
+
+ print_start = true;
+ start = time(NULL);
+ double saveCutoff = cutoff;
+
+ while (matrix->getSmallDist() < cutoff && matrix->getNNodes() > 0){
+
+ if (m->control_pressed) { //clean up
+ delete matrix; delete list; delete cluster; delete rabund;
+ listFile.close();
+ m->mothurRemove(listFileName);
+ return listFileName;
+ }
+
+ cluster->update(saveCutoff);
+
+ float dist = matrix->getSmallDist();
+ float rndDist;
+ if (hard) {
+ rndDist = m->ceilDist(dist, precision);
+ }else{
+ rndDist = m->roundDist(dist, precision);
+ }
+
+ if(previousDist <= 0.0000 && dist != previousDist){
+ oldList.setLabel("unique");
+ oldList.print(listFile);
+ if (labels.count("unique") == 0) { labels.insert("unique"); }
+ }
+ else if(rndDist != rndPreviousDist){
+ oldList.setLabel(toString(rndPreviousDist, length-1));
+ oldList.print(listFile);
+ if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
+ }
+
+ previousDist = dist;
+ rndPreviousDist = rndDist;
+ oldList = *list;
+ }
+
+
+ if(previousDist <= 0.0000){
+ oldList.setLabel("unique");
+ oldList.print(listFile);
+ if (labels.count("unique") == 0) { labels.insert("unique"); }
+ }
+ else if(rndPreviousDist<cutoff){
+ oldList.setLabel(toString(rndPreviousDist, length-1));
+ oldList.print(listFile);
+ if (labels.count(toString(rndPreviousDist, length-1)) == 0) { labels.insert(toString(rndPreviousDist, length-1)); }
+ }
+
+ delete matrix; delete list; delete cluster; delete rabund;
+ matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
+ listFile.close();
+
+ if (m->control_pressed) { //clean up
+ m->mothurRemove(listFileName);
+ return listFileName;
+ }
+
+ m->mothurRemove(thisDistFile);
+ m->mothurRemove(thisNamefile);
+
+ if (saveCutoff != cutoff) {
+ if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
+ else { saveCutoff = m->roundDist(saveCutoff, precision); }
+
+ m->mothurOut("Cutoff was " + toString(cutoff) + " changed cutoff to " + toString(saveCutoff)); m->mothurOutEndLine();
+ }
+
+ if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
+
+ return listFileName;
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "ClusterSplitCommand", "clusterFile");
+ exit(1);
+ }
+}
+//**********************************************************************************************************************
int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> > distNames) {
try{
string thisOutputDir = outputDir;
if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
- string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
- remove(outputFileName.c_str());
+ string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
+ m->mothurRemove(outputFileName);
for (int i = 0; i < distNames.size(); i++) {