X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=clustersplitcommand.cpp;h=95693cc420a7e55d5c9a96a102e3d9e55ad0221c;hp=277e4a248374e596f61fa2f976c1b25696495a9d;hb=050a3ff02473a3d4c0980964e1a9ebe52e55d6b8;hpb=79a7d3273749b08d4f9f8dfe350c964ff0c4351e

diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp
index 277e4a2..95693cc 100644
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -13,26 +13,26 @@
 //**********************************************************************************************************************
 vector<string> ClusterSplitCommand::setParameters(){	
 	try {
-		CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName",false,false); parameters.push_back(ptaxonomy);
-		CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none",false,false); parameters.push_back(pphylip);
-		CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
-		CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
-        CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "",false,false); parameters.push_back(pcount);
-		CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
-		CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
-		CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
-		CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
-		CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
-        CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcluster);
-		CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
-		CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
-		CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
-		CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
-		CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
-		CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
-        CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pclassic);
-		CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
-		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
+		CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "FastaTaxName","",false,false,true); parameters.push_back(ptaxonomy);
+		CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "none","list",false,false,true); parameters.push_back(pphylip);
+		CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName","list",false,false,true); parameters.push_back(pfasta);
+		CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName-FastaTaxName","rabund-sabund",false,false,true); parameters.push_back(pname);
+        CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "","",false,false,true); parameters.push_back(pcount);
+		CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName","list",false,false,true); parameters.push_back(pcolumn);
+		CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(ptaxlevel);
+		CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "","",false,false,true); parameters.push_back(psplitmethod);
+		CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
+		CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshowabund);
+        CommandParameter pcluster("cluster", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pcluster);
+		CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(ptiming);
+		CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+		CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "","",false,false,true); parameters.push_back(pcutoff);
+		CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
+		CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "","",false,false); parameters.push_back(pmethod);
+		CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
+        CommandParameter pclassic("classic", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pclassic);
+		CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
+		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
 			
 		vector<string> myArray;
 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
@@ -61,7 +61,7 @@ string ClusterSplitCommand::getHelpString(){
         helpString += "The cluster parameter allows you to indicate whether you want to run the clustering or just split the distance matrix, default=t";
 		helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
 		helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
-		helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
+		helpString += "The method allows you to specify what clustering algorithm you want to use, default=average, option furthest, nearest, or average. \n";
 		helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
 		helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
 		helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
@@ -81,27 +81,22 @@ string ClusterSplitCommand::getHelpString(){
 	}
 }
 //**********************************************************************************************************************
-string ClusterSplitCommand::getOutputFileNameTag(string type, string inputName=""){	
-	try {
-        string outputFileName = "";
-		map<string, vector<string> >::iterator it;
+string ClusterSplitCommand::getOutputPattern(string type) {
+    try {
+        string pattern = "";
         
-        //is this a type this command creates
-        it = outputTypes.find(type);
-        if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
-        else {
-            if (type == "list") {  outputFileName =  "list"; }
-            else if (type == "rabund") {  outputFileName =  "rabund"; }
-            else if (type == "sabund") {  outputFileName =  "sabund"; }
-            else if (type == "column") {  outputFileName =  "dist"; }
-            else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
-        }
-        return outputFileName;
-	}
-	catch(exception& e) {
-		m->errorOut(e, "ClusterSplitCommand", "getOutputFileNameTag");
-		exit(1);
-	}
+        if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } 
+        else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; } 
+        else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
+        else if (type == "column") {  pattern = "[filename],dist"; }
+        else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
+        
+        return pattern;
+    }
+    catch(exception& e) {
+        m->errorOut(e, "ClusterSplitCommand", "getOutputPattern");
+        exit(1);
+    }
 }
 //**********************************************************************************************************************
 ClusterSplitCommand::ClusterSplitCommand(){	
@@ -444,19 +439,18 @@ int ClusterSplitCommand::execute(){
 		vector< map<string, string> > distName = split->getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
 		delete split;
 		
+        if (m->debug) { m->mothurOut("[DEBUG]: distName.size() = " + toString(distName.size()) + ".\n"); }
+                
 		//output a merged distance file
-		if (splitmethod == "fasta")		{ createMergedDistanceFile(distName); }
-			
+		//if (splitmethod == "fasta")		{ createMergedDistanceFile(distName); }
 				
 		if (m->control_pressed) { return 0; }
 		
 		m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
 		estart = time(NULL);
-                
+              
         if (!runCluster) { 
-#ifdef USE_MPI
-    }
-#endif	
+
                 m->mothurOutEndLine();
                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
                 for (int i = 0; i < distName.size(); i++) {	m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine();	}
@@ -464,7 +458,7 @@ int ClusterSplitCommand::execute(){
                 return 0;
                 
         }
-                
+   
 		//****************** break up files between processes and cluster each file set ******************************//
 	#ifdef USE_MPI
 			////you are process 0 from above////
@@ -821,11 +815,13 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
 		if (outputDir == "") { outputDir += m->hasPath(distfile); }
 		fileroot = outputDir + m->getRootName(m->getSimpleName(distfile));
 		
-        string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
-        string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
-        string listFileName = fileroot+ tag + ".";
-        if (countfile != "") { listFileName += "unique_"; }
-        listFileName += getOutputFileNameTag("list");
+        map<string, string> variables; 
+        variables["[filename]"] = fileroot;
+        variables["[clustertag]"] = tag;
+        string sabundFileName = getOutputFileName("sabund", variables);
+        string rabundFileName = getOutputFileName("rabund", variables);
+        if (countfile != "") { variables["[tag2]"] = "unique_list"; }
+        string listFileName = getOutputFileName("list", variables);
         
         if (countfile == "") {
             m->openOutputFile(sabundFileName,	outSabund);
@@ -847,9 +843,12 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
 			if (itLabel->first == -1) { thisLabel = "unique"; }
 			else { thisLabel = toString(itLabel->first,  length-1);  } 
 			
-			outList << thisLabel << '\t' << itLabel->second << '\t';
+			//outList << thisLabel << '\t' << itLabel->second << '\t';
             
             RAbundVector* rabund = NULL;
+            ListVector completeList;
+            completeList.setLabel(thisLabel);
+            
             if (countfile == "") {
                 rabund = new RAbundVector();
                 rabund->setLabel(thisLabel);
@@ -858,7 +857,8 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
 			//add in singletons
 			if (listSingle != NULL) {
 				for (int j = 0; j < listSingle->getNumBins(); j++) {
-					outList << listSingle->get(j) << '\t';
+					//outList << listSingle->get(j) << '\t';
+                    completeList.push_back(listSingle->get(j));
 					if (countfile == "") { rabund->push_back(m->getNumNames(listSingle->get(j))); }
 				}
 			}
@@ -875,7 +875,8 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
 				if (list == NULL) { m->mothurOut("Error merging listvectors in file " + listNames[k]); m->mothurOutEndLine();  }	
 				else {		
 					for (int j = 0; j < list->getNumBins(); j++) {
-						outList << list->get(j) << '\t';
+						//outList << list->get(j) << '\t';
+                        completeList.push_back(list->get(j));
 						if (countfile == "") { rabund->push_back(m->getNumNames(list->get(j))); }
 					}
 					delete list;
@@ -888,7 +889,8 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
                 sabund.print(outSabund);
                 rabund->print(outRabund);
             }
-			outList << endl;
+			//outList << endl;
+            completeList.print(outList);
 			
 			if (rabund != NULL) { delete rabund; }
 		}
@@ -950,7 +952,7 @@ vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string>
             if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
         }
         
-        //not lets reverse the order of ever other process, so we balance big files running with little ones
+        //now lets reverse the order of ever other process, so we balance big files running with little ones
         for (int i = 0; i < processors; i++) {
             //cout << i << endl;
             int remainder = ((i+1) % processors);
@@ -1172,7 +1174,7 @@ string ClusterSplitCommand::clusterClassicFile(string thisDistFile, string thisN
             cluster->readPhylipFile(thisDistFile, nameMap);
 		}else if (countfile != "") {
             ct = new CountTable();
-            ct->readTable(thisNamefile);
+            ct->readTable(thisNamefile, false);
             cluster->readPhylipFile(thisDistFile, ct);
         }
         tag = cluster->getTag();
@@ -1300,7 +1302,7 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile
             read->read(nameMap);
 		}else if (countfile != "") {
             ct = new CountTable();
-            ct->readTable(thisNamefile);
+            ct->readTable(thisNamefile, false);
             read->read(ct);
         }else { read->read(nameMap); }
 		
@@ -1328,9 +1330,10 @@ string ClusterSplitCommand::clusterFile(string thisDistFile, string thisNamefile
         m->mothurOutEndLine(); m->mothurOut("Clustering " + thisDistFile); m->mothurOutEndLine();
 		
         //create cluster
-        if (method == "furthest")	{	cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method); }
-        else if(method == "nearest"){	cluster = new SingleLinkage(rabund, list, matrix, cutoff, method); }
-        else if(method == "average"){	cluster = new AverageLinkage(rabund, list, matrix, cutoff, method);	}
+        float adjust = -1.0;
+        if (method == "furthest")	{	cluster = new CompleteLinkage(rabund, list, matrix, cutoff, method, adjust); }
+        else if(method == "nearest"){	cluster = new SingleLinkage(rabund, list, matrix, cutoff, method, adjust); }
+        else if(method == "average"){	cluster = new AverageLinkage(rabund, list, matrix, cutoff, method, adjust);	}
         tag = cluster->getTag();
 		
         if (outputDir == "") { outputDir += m->hasPath(thisDistFile); }
@@ -1440,7 +1443,9 @@ int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> >
 		
 		string thisOutputDir = outputDir;
 		if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
-		string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("column");
+        map<string, string> variables; 
+        variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
+		string outputFileName = getOutputFileName("column", variables);
 		m->mothurRemove(outputFileName);