X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=clustersplitcommand.cpp;h=9cdd7b065b857e5ddb7eaacbc27e4c3af84a43cb;hb=f816b683e586575bfe3479760a8afd5ab08e8573;hp=9999eb0fb81e461e9f168c2c81b7531a3774607a;hpb=220dc345e493cddc569521111ce32ac4d965ab7f;p=mothur.git

diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp
index 9999eb0..9cdd7b0 100644
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -8,12 +8,7 @@
  */
 
 #include "clustersplitcommand.h"
-#include "readcluster.h"
-#include "splitmatrix.h"
-#include "readphylip.h"
-#include "readcolumn.h"
-#include "readmatrix.hpp"
-#include "inputdata.h"
+
 
 
 //**********************************************************************************************************************
@@ -24,13 +19,13 @@ vector<string> ClusterSplitCommand::setParameters(){
 		CommandParameter pfasta("fasta", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "FastaTaxName",false,false); parameters.push_back(pfasta);
 		CommandParameter pname("name", "InputTypes", "", "", "none", "none", "ColumnName-FastaTaxName",false,false); parameters.push_back(pname);
 		CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumnFasta", "PhylipColumnFasta", "ColumnName",false,false); parameters.push_back(pcolumn);
-		CommandParameter ptaxlevel("taxlevel", "Number", "", "1", "", "", "",false,false); parameters.push_back(ptaxlevel);
+		CommandParameter ptaxlevel("taxlevel", "Number", "", "3", "", "", "",false,false); parameters.push_back(ptaxlevel);
 		CommandParameter psplitmethod("splitmethod", "Multiple", "classify-fasta-distance", "distance", "", "", "",false,false); parameters.push_back(psplitmethod);
 		CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
 		CommandParameter pshowabund("showabund", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pshowabund);
 		CommandParameter ptiming("timing", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(ptiming);
 		CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
-		CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
+		CommandParameter pcutoff("cutoff", "Number", "", "0.25", "", "", "",false,false); parameters.push_back(pcutoff);
 		CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
 		CommandParameter pmethod("method", "Multiple", "furthest-nearest-average-weighted", "average", "", "", "",false,false); parameters.push_back(pmethod);
 		CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
@@ -60,12 +55,12 @@ string ClusterSplitCommand::getHelpString(){
 		helpString += "The phylip and column parameter allow you to enter your distance file. \n";
 		helpString += "The fasta parameter allows you to enter your aligned fasta file. \n";
 		helpString += "The name parameter allows you to enter your name file and is required if your distance file is in column format. \n";
-		helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n";
+		helpString += "The cutoff parameter allow you to set the distance you want to cluster to, default is 0.25. \n";
 		helpString += "The precision parameter allows you specify the precision of the precision of the distances outputted, default=100, meaning 2 decimal places. \n";
 		helpString += "The method allows you to specify what clustering algorythm you want to use, default=average, option furthest, nearest, or average. \n";
 		helpString += "The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n";
 		helpString += "The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n";
-		helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n";
+		helpString += "The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=3, meaning use the first taxon in each list. \n";
 		helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n";
 #ifdef USE_MPI
 		helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n";
@@ -201,9 +196,9 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 			else { distfile = fastafile;  splitmethod = "fasta";  m->setFastaFile(fastafile); }
 			
 			taxFile = validParameter.validFile(parameters, "taxonomy", true);
-			if (taxFile == "not open") { abort = true; }	
+			if (taxFile == "not open") { taxFile = ""; abort = true; }	
 			else if (taxFile == "not found") { taxFile = ""; }
-			else {  m->setTaxonomyFile(taxFile); }
+			else {  m->setTaxonomyFile(taxFile); if (splitmethod != "fasta") { splitmethod = "classify"; } }
 			
 			if ((phylipfile == "") && (columnfile == "") && (fastafile == "")) { 
 				//is there are current file available for either of these?
@@ -264,7 +259,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 			if (temp == "not found") { temp = "100"; }
 			//saves precision legnth for formatting below
 			length = temp.length();
-			convert(temp, precision); 
+			m->mothurConvert(temp, precision); 
 			
 			temp = validParameter.validFile(parameters, "hard", false);			if (temp == "not found") { temp = "T"; }
 			hard = m->isTrue(temp);
@@ -274,24 +269,24 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 			
 			temp = validParameter.validFile(parameters, "processors", false);	if (temp == "not found"){	temp = m->getProcessors();	}
 			m->setProcessors(temp);
-			convert(temp, processors);
+			m->mothurConvert(temp, processors);
 			
 			temp = validParameter.validFile(parameters, "splitmethod", false);	
-			if (splitmethod != "fasta") {
+			if ((splitmethod != "fasta") && (splitmethod != "classify")) {
 				if (temp == "not found")  { splitmethod = "distance"; }
 				else {  splitmethod = temp; }
 			}
 			
-			temp = validParameter.validFile(parameters, "cutoff", false);		if (temp == "not found")  { temp = "10"; }
-			convert(temp, cutoff); 
+			temp = validParameter.validFile(parameters, "cutoff", false);		if (temp == "not found")  { temp = "0.25"; }
+			m->mothurConvert(temp, cutoff); 
 			cutoff += (5 / (precision * 10.0));  
 			
-			temp = validParameter.validFile(parameters, "taxlevel", false);		if (temp == "not found")  { temp = "1"; }
-			convert(temp, taxLevelCutoff); 
+			temp = validParameter.validFile(parameters, "taxlevel", false);		if (temp == "not found")  { temp = "3"; }
+			m->mothurConvert(temp, taxLevelCutoff); 
 			
 			method = validParameter.validFile(parameters, "method", false);		if (method == "not found") { method = "average"; }
 			
-			if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
+			if ((method == "furthest") || (method == "nearest") || (method == "average")) { m->mothurOut("Using splitmethod " + splitmethod + ".\n"); }
 			else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 			
 			if ((splitmethod == "distance") || (splitmethod == "classify") || (splitmethod == "fasta")) { }
@@ -555,72 +550,21 @@ int ClusterSplitCommand::execute(){
 		MPI_Barrier(MPI_COMM_WORLD);
 		
 	#else
-
-		#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+		///////////////////// WINDOWS CAN ONLY USE 1 PROCESSORS ACCESS VIOLATION UNRESOLVED ///////////////////////
+		//sanity check
+		if (processors > distName.size()) { processors = distName.size(); }
+		
+		#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 				if(processors == 1){
 					listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 				}else{
-					vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
-					dividedNames.resize(processors);
-					
-					//for each file group figure out which process will complete it
-					//want to divide the load intelligently so the big files are spread between processes
-					for (int i = 0; i < distName.size(); i++) { 
-						int processToAssign = (i+1) % processors; 
-						if (processToAssign == 0) { processToAssign = processors; }
-						
-						dividedNames[(processToAssign-1)].push_back(distName[i]);
-					}
-					
-					//not lets reverse the order of ever other process, so we balance big files running with little ones
-					for (int i = 0; i < processors; i++) {
-						int remainder = ((i+1) % processors);
-						if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
-					}
-					
-					createProcesses(dividedNames);
-							
-					if (m->control_pressed) { return 0; }
-
-					//get list of list file names from each process
-					for(int i=0;i<processors;i++){
-						string filename = toString(processIDS[i]) + ".temp";
-						ifstream in;
-						m->openInputFile(filename, in);
-						
-						in >> tag; m->gobble(in);
-						
-						while(!in.eof()) {
-							string tempName;
-							in >> tempName; m->gobble(in);
-							listFileNames.push_back(tempName);
-						}
-						in.close();
-						remove((toString(processIDS[i]) + ".temp").c_str());
-						
-						//get labels
-						filename = toString(processIDS[i]) + ".temp.labels";
-						ifstream in2;
-						m->openInputFile(filename, in2);
-						
-						float tempCutoff;
-						in2 >> tempCutoff; m->gobble(in2);
-						if (tempCutoff < cutoff) { cutoff = tempCutoff; }
-						
-						while(!in2.eof()) {
-							string tempName;
-							in2 >> tempName; m->gobble(in2);
-							if (labels.count(tempName) == 0) { labels.insert(tempName); }
-						}
-						in2.close();
-						remove((toString(processIDS[i]) + ".temp.labels").c_str());
-					}
-				}
+					listFileNames = createProcesses(distName, labels);
+                }
 		#else
 				listFileNames = cluster(distName, labels); //clusters individual files and returns names of list files
 		#endif
 	#endif	
-		if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
+		if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { m->mothurRemove(listFileNames[i]); } return 0; }
 		
 		if (saveCutoff != cutoff) { m->mothurOut("Cutoff was " + toString(saveCutoff) + " changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();  }
 		
@@ -637,11 +581,11 @@ int ClusterSplitCommand::execute(){
 		ListVector* listSingle;
 		map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 		
-		if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+		if (m->control_pressed) { if (listSingle != NULL) { delete listSingle; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 		
 		mergeLists(listFileNames, labelBins, listSingle);
 
-		if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
+		if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
 		
 		m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 		
@@ -703,7 +647,7 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
 				listSingle->push_back(secondCol);
 			}
 			in.close();
-			remove(singleton.c_str());
+			m->mothurRemove(singleton);
 			
 			numSingleBins = listSingle->getNumBins();
 		}else{  listSingle = NULL; numSingleBins = 0;  }
@@ -729,8 +673,8 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
 		for (int k = 0; k < listNames.size(); k++) {
 	
 			if (m->control_pressed) {  
-				if (listSingle != NULL) { delete listSingle; listSingle = NULL; remove(singleton.c_str());  }
-				for (int i = 0; i < listNames.size(); i++) {   remove(listNames[i].c_str());  }
+				if (listSingle != NULL) { delete listSingle; listSingle = NULL; m->mothurRemove(singleton);  }
+				for (int i = 0; i < listNames.size(); i++) {   m->mothurRemove(listNames[i]);  }
 				return labelBin;
 			}
 			
@@ -783,7 +727,7 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
 			delete input;
 			
 			outFilled.close();
-			remove(listNames[k].c_str());
+			m->mothurRemove(listNames[k]);
 			rename(filledInList.c_str(), listNames[k].c_str());
 		}
 		
@@ -833,7 +777,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
 			//get the list info from each file
 			for (int k = 0; k < listNames.size(); k++) {
 	
-				if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { remove(listNames[i].c_str());  } delete rabund; return 0; }
+				if (m->control_pressed) {  if (listSingle != NULL) { delete listSingle;   } for (int i = 0; i < listNames.size(); i++) { m->mothurRemove(listNames[i]);  } delete rabund; return 0; }
 				
 				InputData* input = new InputData(listNames[k], "list");
 				ListVector* list = input->getListVector(thisLabel);
@@ -865,7 +809,7 @@ int ClusterSplitCommand::mergeLists(vector<string> listNames, map<float, int> us
 		
 		if (listSingle != NULL) { delete listSingle;  }
 		
-		for (int i = 0; i < listNames.size(); i++) {  remove(listNames[i].c_str());  }
+		for (int i = 0; i < listNames.size(); i++) {  m->mothurRemove(listNames[i]);  }
 		
 		return 0;
 	}
@@ -897,12 +841,35 @@ void ClusterSplitCommand::printData(ListVector* oldList){
 	}
 }
 //**********************************************************************************************************************
-int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> > > dividedNames){
+vector<string>  ClusterSplitCommand::createProcesses(vector< map<string, string> > distName, set<string>& labels){
 	try {
+        
+        vector<string> listFiles;
+        vector < vector < map<string, string> > > dividedNames; //distNames[1] = vector of filenames for process 1...
+        dividedNames.resize(processors);
+        
+        //for each file group figure out which process will complete it
+        //want to divide the load intelligently so the big files are spread between processes
+        for (int i = 0; i < distName.size(); i++) { 
+            //cout << i << endl;
+            int processToAssign = (i+1) % processors; 
+            if (processToAssign == 0) { processToAssign = processors; }
+            
+            dividedNames[(processToAssign-1)].push_back(distName[i]);
+            if ((processToAssign-1) == 1) { m->mothurOut(distName[i].begin()->first + "\n"); }
+        }
+        
+        //not lets reverse the order of ever other process, so we balance big files running with little ones
+        for (int i = 0; i < processors; i++) {
+            //cout << i << endl;
+            int remainder = ((i+1) % processors);
+            if (remainder) {  reverse(dividedNames[i].begin(), dividedNames[i].end());  }
+        }
+        
+        if (m->control_pressed) { return listFiles; }
 	
-	#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-		int process = 0;
-		int exitCommand = 1;
+	#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
+		int process = 1;
 		processIDS.clear();
 		
 		//loop through and create all the processes you want
@@ -943,14 +910,99 @@ int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> >
 			}
 		}
 		
+        //do your part
+        listFiles = cluster(dividedNames[0], labels);
+        
 		//force parent to wait until all the processes are done
-		for (int i=0;i<processors;i++) { 
+		for (int i=0;i< processIDS.size();i++) { 
 			int temp = processIDS[i];
 			wait(&temp);
 		}
+        
+        //get list of list file names from each process
+        for(int i=0;i<processIDS.size();i++){
+            string filename = toString(processIDS[i]) + ".temp";
+            ifstream in;
+            m->openInputFile(filename, in);
+            
+            in >> tag; m->gobble(in);
+            
+            while(!in.eof()) {
+                string tempName;
+                in >> tempName; m->gobble(in);
+                listFiles.push_back(tempName);
+            }
+            in.close();
+            m->mothurRemove((toString(processIDS[i]) + ".temp"));
+            
+            //get labels
+            filename = toString(processIDS[i]) + ".temp.labels";
+            ifstream in2;
+            m->openInputFile(filename, in2);
+            
+            float tempCutoff;
+            in2 >> tempCutoff; m->gobble(in2);
+            if (tempCutoff < cutoff) { cutoff = tempCutoff; }
+            
+            while(!in2.eof()) {
+                string tempName;
+                in2 >> tempName; m->gobble(in2);
+                if (labels.count(tempName) == 0) { labels.insert(tempName); }
+            }
+            in2.close();
+            m->mothurRemove((toString(processIDS[i]) + ".temp.labels"));
+        }
+        
+
+    #else
+       
+        //////////////////////////////////////////////////////////////////////////////////////////////////////
+		//Windows version shared memory, so be careful when passing variables through the clusterData struct. 
+		//Above fork() will clone, so memory is separate, but that's not the case with windows, 
+		//Taking advantage of shared memory to allow both threads to add labels.
+		//////////////////////////////////////////////////////////////////////////////////////////////////////
 		
-		return exitCommand;
+		vector<clusterData*> pDataArray; 
+		DWORD   dwThreadIdArray[processors-1];
+		HANDLE  hThreadArray[processors-1]; 
+		
+		//Create processor worker threads.
+		for( int i=1; i<processors; i++ ){
+			// Allocate memory for thread data.
+			clusterData* tempCluster = new clusterData(dividedNames[i], m, cutoff, method, outputDir, hard, precision, length, i);
+			pDataArray.push_back(tempCluster);
+			processIDS.push_back(i);
+            
+			//MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
+			//default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
+			hThreadArray[i-1] = CreateThread(NULL, 0, MyClusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);  
+            
+		}
+        
+        //do your part
+        listFiles = cluster(dividedNames[0], labels);
+        
+		//Wait until all threads have terminated.
+		WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
+		
+		//Close all thread handles and free memory allocations.
+		for(int i=0; i < pDataArray.size(); i++){
+            //get tag
+            tag = pDataArray[i]->tag;
+            //get listfiles created
+            for(int j=0; j < pDataArray[i]->listFiles.size(); j++){ listFiles.push_back(pDataArray[i]->listFiles[j]); }
+            //get labels
+            set<string>::iterator it;
+            for(it = pDataArray[i]->labels.begin(); it != pDataArray[i]->labels.end(); it++){ labels.insert(*it); }
+			//check cutoff
+            if (pDataArray[i]->cutoff < cutoff) { cutoff = pDataArray[i]->cutoff; }
+			CloseHandle(hThreadArray[i]);
+			delete pDataArray[i];
+		}
+
 	#endif		
+        
+        return listFiles;
 	
 	}
 	catch(exception& e) {
@@ -962,18 +1014,19 @@ int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> >
 
 vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNames, set<string>& labels){
 	try {
-		Cluster* cluster;
-		SparseMatrix* matrix;
-		ListVector* list;
-		ListVector oldList;
-		RAbundVector* rabund;
 		
 		vector<string> listFileNames;
-		
 		double smallestCutoff = cutoff;
 		
 		//cluster each distance file
 		for (int i = 0; i < distNames.size(); i++) {
+            
+            Cluster* cluster = NULL;
+            SparseMatrix* matrix = NULL;
+            ListVector* list = NULL;
+            ListVector oldList;
+            RAbundVector* rabund = NULL;
+            
 			if (m->control_pressed) { return listFileNames; }
 			
 			string thisNamefile = distNames[i].begin()->second;
@@ -1004,8 +1057,8 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 			oldList = *list;
 			matrix = read->getMatrix();
 			
-			delete read; 
-			delete nameMap; 
+			delete read;  read = NULL;
+			delete nameMap; nameMap = NULL;
 			
 			
 			#ifdef USE_MPI
@@ -1047,7 +1100,7 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 				if (m->control_pressed) { //clean up
 					delete matrix; delete list;	delete cluster; delete rabund;
 					listFile.close();
-					for (int i = 0; i < listFileNames.size(); i++) {	remove(listFileNames[i].c_str()); 	}
+					for (int i = 0; i < listFileNames.size(); i++) {	m->mothurRemove(listFileNames[i]); 	}
 					listFileNames.clear(); return listFileNames;
 				}
 		
@@ -1090,15 +1143,16 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
 			}
 	
 			delete matrix; delete list;	delete cluster; delete rabund; 
+            matrix = NULL; list = NULL; cluster = NULL; rabund = NULL;
 			listFile.close();
 			
 			if (m->control_pressed) { //clean up
-				for (int i = 0; i < listFileNames.size(); i++) {	remove(listFileNames[i].c_str()); 	}
+				for (int i = 0; i < listFileNames.size(); i++) {	m->mothurRemove(listFileNames[i]); 	}
 				listFileNames.clear(); return listFileNames;
 			}
 			
-			remove(thisDistFile.c_str());
-			remove(thisNamefile.c_str());
+			m->mothurRemove(thisDistFile);
+			m->mothurRemove(thisNamefile);
 			
 			if (saveCutoff != cutoff) { 
 				if (hard)	{  saveCutoff = m->ceilDist(saveCutoff, precision);	}
@@ -1137,7 +1191,7 @@ int ClusterSplitCommand::createMergedDistanceFile(vector< map<string, string> >
 		string thisOutputDir = outputDir;
 		if (outputDir == "") { thisOutputDir = m->hasPath(fastafile); }
 		string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "dist";
-		remove(outputFileName.c_str());
+		m->mothurRemove(outputFileName);
 		
 		
 		for (int i = 0; i < distNames.size(); i++) {