From 8ab46c171b45ae3782d839539792d049017a361b Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Fri, 28 May 2010 15:00:33 +0000
Subject: [PATCH] finished cluster.split adding classify method.

---
 clustersplitcommand.cpp | 47 ++++++++++++++++++++++++++---------------
 clustersplitcommand.h   |  2 +-
 splitmatrix.cpp         | 10 +++++----
 3 files changed, 37 insertions(+), 22 deletions(-)
diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp
index 4b3c27b..ed995fc 100644
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -27,7 +27,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 		
 		else {
 			//valid paramters for this command
-			string Array[] =  {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","showabund","timing","hard","processors","outputdir","inputdir"};
+			string Array[] =  {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","showabund","timing","hard","processors","outputdir","inputdir"};
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -128,14 +128,16 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 			temp = validParameter.validFile(parameters, "processors", false);	if (temp == "not found"){	temp = "1";				}
 			convert(temp, processors); 
 			
-			temp = validParameter.validFile(parameters, "cutoff", false);
-			if (temp == "not found") { temp = "10"; }
+			splitmethod = validParameter.validFile(parameters, "splitmethod", false);		if (splitmethod == "not found") { splitmethod = "distance"; }
+			
+			temp = validParameter.validFile(parameters, "cutoff", false);		if (temp == "not found")  { temp = "10"; }
 			convert(temp, cutoff); 
 			cutoff += (5 / (precision * 10.0));  
 			
-			method = validParameter.validFile(parameters, "method", false);		if (method == "not found") { method = "furthest"; }
+			temp = validParameter.validFile(parameters, "taxlevel", false);		if (temp == "not found")  { temp = "1"; }
+			convert(temp, taxLevelCutoff); 
 			
-			splitmethod = validParameter.validFile(parameters, "splitmethod", false);		if (splitmethod == "not found") { method = "distance"; }
+			method = validParameter.validFile(parameters, "method", false);		if (method == "not found") { method = "furthest"; }
 			
 			if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
 			else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
@@ -163,12 +165,18 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 
 void ClusterSplitCommand::help(){
 	try {
-		m->mothurOut("The cluster.split command parameter options are cutoff, splitcutoff, precision, method, splitmethod, phylip, column, name, showabund, timing. Phylip or column and name are required.\n");
+		m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, processors. Phylip or column and name are required.\n");
 		m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 		m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
+		m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
+		m->mothurOut("The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n");
+		m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n");
+		m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance or classify. \n");
+		m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
+		m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
 		m->mothurOut("The cluster.split command should be in the following format: \n");
-		m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
-		m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify) \n");	
+		m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
+		m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");	
 
 	}
 	catch(exception& e) {
@@ -223,7 +231,10 @@ int ClusterSplitCommand::execute(){
 		time_t estart = time(NULL);
 		
 		//split matrix into non-overlapping groups
-		SplitMatrix* split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod);
+		SplitMatrix* split;
+		if (splitmethod == "distance")	{	split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod);			}
+		else							{	split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod);  }
+		
 		split->split();
 		
 		if (m->control_pressed) { delete split; return 0; }
@@ -334,7 +345,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton,
 		outputNames.push_back(fileroot+ tag + ".sabund");
 		outputNames.push_back(fileroot+ tag + ".rabund");
 		outputNames.push_back(fileroot+ tag + ".list");
-			
+		
 		//read in singletons
 		ListVector* listSingle = NULL;
 		if (singleton != "none") {
@@ -365,7 +376,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton,
 				it--;
 			}
 		}
-		
+	
 		//sort order
 		sort(orderFloat.begin(), orderFloat.end());
 
@@ -381,7 +392,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton,
 			lastLabels.push_back(tempList.getLabel());
 			in.close();
 		}
-		
+	
 		ListVector* merged = NULL;
 				
 		//for each label needed
@@ -429,10 +440,12 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, string singleton,
 			}
 			
 			//add in singletons
-			for (int j = 0; j < listSingle->getNumBins(); j++) {
-				merged->push_back(listSingle->get(j));
+			if (listSingle != NULL) {
+				for (int j = 0; j < listSingle->getNumBins(); j++) {
+					merged->push_back(listSingle->get(j));
+				}
 			}
-
+			
 			//print to files
 			printData(merged);
 			
@@ -640,7 +653,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 				oldList.print(listFile);
 				if (labels.count(toString(rndPreviousDist,  length-1)) == 0) { labels.insert(toString(rndPreviousDist,  length-1)); }
 			}
-		
+	
 			delete matrix; delete list;	delete cluster; delete rabund; 
 			listFile.close();
 			
@@ -653,7 +666,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 			remove(thisNamefile.c_str());
 		}
 		
-				
+					
 		return listFileNames;
 	
 	}
diff --git a/clustersplitcommand.h b/clustersplitcommand.h
index 631a8b6..e838bc4 100644
--- a/clustersplitcommand.h
+++ b/clustersplitcommand.h
@@ -34,7 +34,7 @@ private:
 
 	string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile;
 	double cutoff, splitcutoff;
-	int precision, length, processors;
+	int precision, length, processors, taxLevelCutoff;
 	bool print_start, abort, hard;
 	time_t start;
 	ofstream outList, outRabund, outSabund;
diff --git a/splitmatrix.cpp b/splitmatrix.cpp
index 0b59fa2..1c61f31 100644
--- a/splitmatrix.cpp
+++ b/splitmatrix.cpp
@@ -241,6 +241,8 @@ int SplitMatrix::splitDistance(){
 /***********************************************************************/
 int SplitMatrix::splitClassify(){
 	try {
+		cutoff = int(cutoff);
+		
 		map<string, int> seqGroup;
 		map<string, int>::iterator it;
 		map<string, int>::iterator it2;
@@ -272,11 +274,11 @@ int SplitMatrix::splitClassify(){
 		
 			//is this node within the cutoff
 			TaxNode taxon = phylo->get(i);
-			
+		
 			if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
-				if (taxon.children.size() > 1) { //if this taxon just has one seq its a singleton
-					for (it = taxon.children.begin(); it != taxon.children.end(); it++) {
-						seqGroup[it->first] = numGroups;
+				if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
+					for (int j = 0; j < taxon.accessions.size(); j++) {
+						seqGroup[taxon.accessions[j]] = numGroups;
 					}
 					numGroups++;
 				}
-- 
2.39.5