X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=clustersplitcommand.cpp;h=95693cc420a7e55d5c9a96a102e3d9e55ad0221c;hp=277e4a248374e596f61fa2f976c1b25696495a9d;hb=050a3ff02473a3d4c0980964e1a9ebe52e55d6b8;hpb=79a7d3273749b08d4f9f8dfe350c964ff0c4351e diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 277e4a2..95693cc 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -13,26 +13,26 @@ //********************************************************************************************************************** vector ClusterSplitCommand::setParameters(){ try { - CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy); - CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip); - CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname); - CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount); - CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn); - CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel); - CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod); - CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge); - CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund); - CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcluster); - CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff); - CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision); - CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod); - CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard); - CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy); + CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip); + CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount); + CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn); + CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel); + CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod); + CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge); + CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund); + CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster); + CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff); + CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision); + CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod); + CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard); + CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -61,7 +61,7 @@ string ClusterSplitCommand::getHelpString(){ helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t"; helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n"; helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n"; - helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n"; + helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n"; helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n"; helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n"; helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n"; @@ -81,27 +81,22 @@ string ClusterSplitCommand::getHelpString(){ } } //********************************************************************************************************************** -string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){ - try { - string outputFileName = ""; - map >::iterator it; +string ClusterSplitCommand::getOutputPattern(string type) { + try { + string pattern = ""; - //is this a type this command creates - it = outputTypes.find(type); - if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } - else { - if (type == "list") { outputFileName = "list"; } - else if (type == "rabund") { outputFileName = "rabund"; } - else if (type == "sabund") { outputFileName = "sabund"; } - else if (type == "column") { outputFileName = "dist"; } - else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } - } - return outputFileName; - } - catch(exception& e) { - m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag"); - exit(1); - } + if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } + else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; } + else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; } + else if (type == "column") { pattern = "[filename],dist"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "ClusterSplitCommand", "getOutputPattern"); + exit(1); + } } //********************************************************************************************************************** ClusterSplitCommand::ClusterSplitCommand(){ @@ -444,19 +439,18 @@ int ClusterSplitCommand::execute(){ vector< map > distName = split->getDistanceFiles(); //returns map of distance files -> namefile sorted by distance file size delete split; + if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); } + //output a merged distance file - if (splitmethod == "fasta") { createMergedDistanceFile(distName); } - + //if (splitmethod == "fasta") { createMergedDistanceFile(distName); } if (m->control_pressed) { return 0; } m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine(); estart = time(NULL); - + if (!runCluster) { -#ifdef USE_MPI - } -#endif + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); } @@ -464,7 +458,7 @@ int ClusterSplitCommand::execute(){ return 0; } - + //****************** break up files between processes and cluster each file set ******************************// #ifdef USE_MPI ////you are process 0 from above//// @@ -821,11 +815,13 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us if (outputDir == "") { outputDir += m->hasPath(distfile); } fileroot = outputDir + m->getRootName(m->getSimpleName(distfile)); - string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund"); - string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund"); - string listFileName = fileroot+ tag + "."; - if (countfile != "") { listFileName += "unique_"; } - listFileName += getOutputFileNameTag("list"); + map variables; + variables["[filename]"] = fileroot; + variables["[clustertag]"] = tag; + string sabundFileName = getOutputFileName("sabund", variables); + string rabundFileName = getOutputFileName("rabund", variables); + if (countfile != "") { variables["[tag2]"] = "unique_list"; } + string listFileName = getOutputFileName("list", variables); if (countfile == "") { m->openOutputFile(sabundFileName, outSabund); @@ -847,9 +843,12 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us if (itLabel->first == -1) { thisLabel = "unique"; } else { thisLabel = toString(itLabel->first, length-1); } - outList << thisLabel << '\t' << itLabel->second << '\t'; + //outList << thisLabel << '\t' << itLabel->second << '\t'; RAbundVector* rabund = NULL; + ListVector completeList; + completeList.setLabel(thisLabel); + if (countfile == "") { rabund = new RAbundVector(); rabund->setLabel(thisLabel); @@ -858,7 +857,8 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us //add in singletons if (listSingle != NULL) { for (int j = 0; j < listSingle->getNumBins(); j++) { - outList << listSingle->get(j) << '\t'; + //outList << listSingle->get(j) << '\t'; + completeList.push_back(listSingle->get(j)); if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); } } } @@ -875,7 +875,8 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine(); } else { for (int j = 0; j < list->getNumBins(); j++) { - outList << list->get(j) << '\t'; + //outList << list->get(j) << '\t'; + completeList.push_back(list->get(j)); if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); } } delete list; @@ -888,7 +889,8 @@ int ClusterSplitCommand::mergeLists(vector listNames, map us sabund.print(outSabund); rabund->print(outRabund); } - outList << endl; + //outList << endl; + completeList.print(outList); if (rabund != NULL) { delete rabund; } } @@ -950,7 +952,7 @@ vector ClusterSplitCommand::createProcesses(vector< map if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); } } - //not lets reverse the order of ever other process, so we balance big files running with little ones + //now lets reverse the order of ever other process, so we balance big files running with little ones for (int i = 0; i < processors; i++) { //cout << i << endl; int remainder = ((i+1) % processors); @@ -1172,7 +1174,7 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN cluster->readPhylipFile(thisDistFile, nameMap); }else if (countfile != "") { ct = new CountTable(); - ct->readTable(thisNamefile); + ct->readTable(thisNamefile, false); cluster->readPhylipFile(thisDistFile, ct); } tag = cluster->getTag(); @@ -1300,7 +1302,7 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile read->read(nameMap); }else if (countfile != "") { ct = new CountTable(); - ct->readTable(thisNamefile); + ct->readTable(thisNamefile, false); read->read(ct); }else { read->read(nameMap); } @@ -1328,9 +1330,10 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine(); //create cluster - if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); } - else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); } - else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method); } + float adjust = -1.0; + if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); } + else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); } + else if(method == "average"){ cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust); } tag = cluster->getTag(); if (outputDir == "") { outputDir += m->hasPath(thisDistFile); } @@ -1440,7 +1443,9 @@ int ClusterSplitCommand::createMergedDistanceFile(vector< map > string thisOutputDir = outputDir; if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); } - string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column"); + map variables; + variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)); + string outputFileName = getOutputFileName("column", variables); m->mothurRemove(outputFileName);