X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=clustersplitcommand.cpp;h=9cdd7b065b857e5ddb7eaacbc27e4c3af84a43cb;hb=f816b683e586575bfe3479760a8afd5ab08e8573;hp=9999eb0fb81e461e9f168c2c81b7531a3774607a;hpb=220dc345e493cddc569521111ce32ac4d965ab7f;p=mothur.git diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 9999eb0..9cdd7b0 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -8,12 +8,7 @@ */ #include "clustersplitcommand.h" -#include "readcluster.h" -#include "splitmatrix.h" -#include "readphylip.h" -#include "readcolumn.h" -#include "readmatrix.hpp" -#include "inputdata.h" + //********************************************************************************************************************** @@ -24,13 +19,13 @@ vector ClusterSplitCommand::setParameters(){ CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta); CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname); CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn); - CommandParameter ptaxlevel("taxlevel", "Number", "", "1", "", "", "",false,false); parameters.push_back(ptaxlevel); + CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel); CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod); CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge); CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund); CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff); + CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff); CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision); CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod); CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard); @@ -60,12 +55,12 @@ string ClusterSplitCommand::getHelpString(){ helpString += "The phylip and column parameter allow you to enter your distance file. \n"; helpString += "The fasta parameter allows you to enter your aligned fasta file. \n"; helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n"; - helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n"; + helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n"; helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n"; helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n"; helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n"; helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n"; - helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n"; + helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n"; helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n"; #ifdef USE_MPI helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n"; @@ -201,9 +196,9 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { else { distfile = fastafile; splitmethod = "fasta"; m->setFastaFile(fastafile); } taxFile = validParameter.validFile(parameters, "taxonomy", true); - if (taxFile == "not open") { abort = true; } + if (taxFile == "not open") { taxFile = ""; abort = true; } else if (taxFile == "not found") { taxFile = ""; } - else { m->setTaxonomyFile(taxFile); } + else { m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } } if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { //is there are current file available for either of these? @@ -264,7 +259,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { if (temp == "not found") { temp = "100"; } //saves precision legnth for formatting below length = temp.length(); - convert(temp, precision); + m->mothurConvert(temp, precision); temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; } hard = m->isTrue(temp); @@ -274,24 +269,24 @@ ClusterSplitCommand::ClusterSplitCommand(string option) { temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); - convert(temp, processors); + m->mothurConvert(temp, processors); temp = validParameter.validFile(parameters, "splitmethod", false); - if (splitmethod != "fasta") { + if ((splitmethod != "fasta") && (splitmethod != "classify")) { if (temp == "not found") { splitmethod = "distance"; } else { splitmethod = temp; } } - temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } - convert(temp, cutoff); + temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.25"; } + m->mothurConvert(temp, cutoff); cutoff += (5 / (precision * 10.0)); - temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "1"; } - convert(temp, taxLevelCutoff); + temp = validParameter.validFile(parameters, "taxlevel", false); if (temp == "not found") { temp = "3"; } + m->mothurConvert(temp, taxLevelCutoff); method = validParameter.validFile(parameters, "method", false); if (method == "not found") { method = "average"; } - if ((method == "furthest") || (method == "nearest") || (method == "average")) { } + if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); } else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; } if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { } @@ -555,72 +550,21 @@ int ClusterSplitCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); #else - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + ///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED /////////////////////// + //sanity check + if (processors > distName.size()) { processors = distName.size(); } + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) if(processors == 1){ listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files }else{ - vector < vector < map > > dividedNames; //distNames[1] = vector of filenames for process 1... - dividedNames.resize(processors); - - //for each file group figure out which process will complete it - //want to divide the load intelligently so the big files are spread between processes - for (int i = 0; i < distName.size(); i++) { - int processToAssign = (i+1) % processors; - if (processToAssign == 0) { processToAssign = processors; } - - dividedNames[(processToAssign-1)].push_back(distName[i]); - } - - //not lets reverse the order of ever other process, so we balance big files running with little ones - for (int i = 0; i < processors; i++) { - int remainder = ((i+1) % processors); - if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); } - } - - createProcesses(dividedNames); - - if (m->control_pressed) { return 0; } - - //get list of list file names from each process - for(int i=0;iopenInputFile(filename, in); - - in >> tag; m->gobble(in); - - while(!in.eof()) { - string tempName; - in >> tempName; m->gobble(in); - listFileNames.push_back(tempName); - } - in.close(); - remove((toString(processIDS[i]) + ".temp").c_str()); - - //get labels - filename = toString(processIDS[i]) + ".temp.labels"; - ifstream in2; - m->openInputFile(filename, in2); - - float tempCutoff; - in2 >> tempCutoff; m->gobble(in2); - if (tempCutoff < cutoff) { cutoff = tempCutoff; } - - while(!in2.eof()) { - string tempName; - in2 >> tempName; m->gobble(in2); - if (labels.count(tempName) == 0) { labels.insert(tempName); } - } - in2.close(); - remove((toString(processIDS[i]) + ".temp.labels").c_str()); - } - } + listFileNames = createProcesses(distName, labels); + } #else listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files #endif #endif - if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; } + if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; } if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); } @@ -637,11 +581,11 @@ int ClusterSplitCommand::execute(){ ListVector* listSingle; map labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins - if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } mergeLists(listFileNames, labelBins, listSingle); - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; } m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine(); @@ -703,7 +647,7 @@ map ClusterSplitCommand::completeListFile(vector listNames, listSingle->push_back(secondCol); } in.close(); - remove(singleton.c_str()); + m->mothurRemove(singleton); numSingleBins = listSingle->getNumBins(); }else{ listSingle = NULL; numSingleBins = 0; } @@ -729,8 +673,8 @@ map ClusterSplitCommand::completeListFile(vector listNames, for (int k = 0; k < listNames.size(); k++) { if (m->control_pressed) { - if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str()); } - for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } + if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton); } + for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } return labelBin; } @@ -783,7 +727,7 @@ map ClusterSplitCommand::completeListFile(vector listNames, delete input; outFilled.close(); - remove(listNames[k].c_str()); + m->mothurRemove(listNames[k]); rename(filledInList.c_str(), listNames[k].c_str()); } @@ -833,7 +777,7 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us //get the list info from each file for (int k = 0; k < listNames.size(); k++) { - if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } delete rabund; return 0; } + if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } delete rabund; return 0; } InputData* input = new InputData(listNames[k], "list"); ListVector* list = input->getListVector(thisLabel); @@ -865,7 +809,7 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us if (listSingle != NULL) { delete listSingle; } - for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str()); } + for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]); } return 0; } @@ -897,12 +841,35 @@ void ClusterSplitCommand::printData(ListVector* oldList){ } } //********************************************************************************************************************** -int ClusterSplitCommand::createProcesses(vector < vector < map > > dividedNames){ +vector ClusterSplitCommand::createProcesses(vector< map > distName, set& labels){ try { + + vector listFiles; + vector < vector < map > > dividedNames; //distNames[1] = vector of filenames for process 1... + dividedNames.resize(processors); + + //for each file group figure out which process will complete it + //want to divide the load intelligently so the big files are spread between processes + for (int i = 0; i < distName.size(); i++) { + //cout << i << endl; + int processToAssign = (i+1) % processors; + if (processToAssign == 0) { processToAssign = processors; } + + dividedNames[(processToAssign-1)].push_back(distName[i]); + if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); } + } + + //not lets reverse the order of ever other process, so we balance big files running with little ones + for (int i = 0; i < processors; i++) { + //cout << i << endl; + int remainder = ((i+1) % processors); + if (remainder) { reverse(dividedNames[i].begin(), dividedNames[i].end()); } + } + + if (m->control_pressed) { return listFiles; } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; - int exitCommand = 1; + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + int process = 1; processIDS.clear(); //loop through and create all the processes you want @@ -943,14 +910,99 @@ int ClusterSplitCommand::createProcesses(vector < vector < map > } } + //do your part + listFiles = cluster(dividedNames[0], labels); + //force parent to wait until all the processes are done - for (int i=0;iopenInputFile(filename, in); + + in >> tag; m->gobble(in); + + while(!in.eof()) { + string tempName; + in >> tempName; m->gobble(in); + listFiles.push_back(tempName); + } + in.close(); + m->mothurRemove((toString(processIDS[i]) + ".temp")); + + //get labels + filename = toString(processIDS[i]) + ".temp.labels"; + ifstream in2; + m->openInputFile(filename, in2); + + float tempCutoff; + in2 >> tempCutoff; m->gobble(in2); + if (tempCutoff < cutoff) { cutoff = tempCutoff; } + + while(!in2.eof()) { + string tempName; + in2 >> tempName; m->gobble(in2); + if (labels.count(tempName) == 0) { labels.insert(tempName); } + } + in2.close(); + m->mothurRemove((toString(processIDS[i]) + ".temp.labels")); + } + + + #else + + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the clusterData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + //Taking advantage of shared memory to allow both threads to add labels. + ////////////////////////////////////////////////////////////////////////////////////////////////////// - return exitCommand; + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=1; itag; + //get listfiles created + for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); } + //get labels + set::iterator it; + for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); } + //check cutoff + if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + #endif + + return listFiles; } catch(exception& e) { @@ -962,18 +1014,19 @@ int ClusterSplitCommand::createProcesses(vector < vector < map > vector ClusterSplitCommand::cluster(vector< map > distNames, set& labels){ try { - Cluster* cluster; - SparseMatrix* matrix; - ListVector* list; - ListVector oldList; - RAbundVector* rabund; vector listFileNames; - double smallestCutoff = cutoff; //cluster each distance file for (int i = 0; i < distNames.size(); i++) { + + Cluster* cluster = NULL; + SparseMatrix* matrix = NULL; + ListVector* list = NULL; + ListVector oldList; + RAbundVector* rabund = NULL; + if (m->control_pressed) { return listFileNames; } string thisNamefile = distNames[i].begin()->second; @@ -1004,8 +1057,8 @@ vector ClusterSplitCommand::cluster(vector< map > distNa oldList = *list; matrix = read->getMatrix(); - delete read; - delete nameMap; + delete read; read = NULL; + delete nameMap; nameMap = NULL; #ifdef USE_MPI @@ -1047,7 +1100,7 @@ vector ClusterSplitCommand::cluster(vector< map > distNa if (m->control_pressed) { //clean up delete matrix; delete list; delete cluster; delete rabund; listFile.close(); - for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } + for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } listFileNames.clear(); return listFileNames; } @@ -1090,15 +1143,16 @@ vector ClusterSplitCommand::cluster(vector< map > distNa } delete matrix; delete list; delete cluster; delete rabund; + matrix = NULL; list = NULL; cluster = NULL; rabund = NULL; listFile.close(); if (m->control_pressed) { //clean up - for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } + for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } listFileNames.clear(); return listFileNames; } - remove(thisDistFile.c_str()); - remove(thisNamefile.c_str()); + m->mothurRemove(thisDistFile); + m->mothurRemove(thisNamefile); if (saveCutoff != cutoff) { if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); } @@ -1137,7 +1191,7 @@ int ClusterSplitCommand::createMergedDistanceFile(vector< map > string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); } string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist"; - remove(outputFileName.c_str()); + m->mothurRemove(outputFileName); for (int i = 0; i < distNames.size(); i++) {