X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=splitmatrix.cpp;h=f6b5c4d81cf5446adc5e5253808380be7ebbf28d;hp=0b59fa2e2548ac14c5baa8923510409ad56274b3;hb=d1c97b8c04bb75faca1e76ffad60b37a4d789d3d;hpb=47faf5463d44570ad66148384763db1c8238b563

diff --git a/splitmatrix.cpp b/splitmatrix.cpp
index 0b59fa2..f6b5c4d 100644
--- a/splitmatrix.cpp
+++ b/splitmatrix.cpp
@@ -9,16 +9,35 @@
 
 #include "splitmatrix.h"
 #include "phylotree.h"
+#include "distancecommand.h"
+#include "seqsummarycommand.h"
 
 /***********************************************************************/
 
-SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
+SplitMatrix::SplitMatrix(string distfile, string name, string count, string tax, float c, string t, bool l){
 	m = MothurOut::getInstance();
 	distFile = distfile;
 	cutoff = c;
 	namefile = name;
 	method = t;
 	taxFile = tax;
+    countfile = count; //when non-empty, splitNames() reads this file instead of namefile
+	large = l; processors = 1; classic = false; distCutoff = 0; //l selects splitDistanceLarge() over splitDistanceRAM(); the fasta-only members get defined values so they are never read while indeterminate
+}
+/***********************************************************************/
+
+SplitMatrix::SplitMatrix(string ffile, string name, string count, string tax, float c, float cu, string t, int p, bool cl, string output){ //constructor for splitting from a fasta file; distances are generated per group later
+	m = MothurOut::getInstance();
+	fastafile = ffile;
+	namefile = name;
+    countfile = count; //when non-empty, splitNames() reads this file instead of namefile
+	taxFile = tax;
+	cutoff = c;  //tax level cutoff
+	distCutoff = cu; //for fasta method if you are creating distance matrix you need a cutoff for that
+	method = t;
+	processors = p; //forwarded as processors= to DistanceCommand
+    classic = cl; //true: DistanceCommand is run with output=lt instead of a cutoff
+	outputDir = output; large = false; //large is read by splitDistance(); give it a defined value since this constructor has no parameter for it
 }
 
 /***********************************************************************/
@@ -28,12 +47,13 @@ int SplitMatrix::split(){
         
 		if (method == "distance") {  
 			splitDistance();
-		}else if (method == "classify") {
+		}else if ((method == "classify") || (method == "fasta")) {
 			splitClassify();
 		}else {
 			m->mothurOut("Unknown splitting method, aborting split."); m->mothurOutEndLine();
 			map<string, string> temp;
-			temp[distFile] = namefile;
+			if (namefile != "") {  temp[distFile] = namefile; }
+            else { temp[distFile] = countfile; }
 			dists.push_back(temp);
 		}
 		
@@ -48,12 +68,276 @@ int SplitMatrix::split(){
 int SplitMatrix::splitDistance(){
 	try {
         
-		vector<set<string> > groups;
+		if (large)	{ splitDistanceLarge(); } //'large' set: buffer distances per group and spill them to temp files as the buffers fill
+		else		{ splitDistanceRAM();	} //otherwise keep every group's distances in memory until the whole file has been read
+		
+		return 0; //the helpers' return values are not inspected
+			
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitMatrix", "splitDistance");
+		exit(1);
+	}
+}
+
+/***********************************************************************/
+int SplitMatrix::splitClassify(){ //groups sequences by the taxon they belong to at level 'cutoff', then passes the grouping to the distance-file or fasta-file splitter
+	try {
+		cutoff = int(cutoff); //drop any fractional part; the value is compared with taxon.level below
+				
+		map<string, int> seqGroup; //sequence name -> group index; sequences never inserted here are handled as singletons by the callees
+		map<string, int>::iterator it; //not used in this function
+		map<string, int>::iterator it2; //not used in this function
+		
 		int numGroups = 0;
+		
+		//build tree from users taxonomy file
+		PhyloTree* phylo = new PhyloTree();
+		
+        map<string, string> temp; //presumably sequence name -> taxonomy string as filled by readTax -- confirm against MothurOut
+        m->readTax(taxFile, temp);
+        
+        for (map<string, string>::iterator itTemp = temp.begin(); itTemp != temp.end();) {
+            phylo->addSeqToTree(itTemp->first, itTemp->second);
+            temp.erase(itTemp++); //erase each entry once it is in the tree; post-increment advances before the erased iterator is invalidated
+        }
+		
+		phylo->assignHeirarchyIDs(0);
 
+		//make sure the cutoff is not greater than maxlevel
+		if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
+	
+		//for each node in tree
+		for (int i = 0; i < phylo->getNumNodes(); i++) {
+		
+			//is this node within the cutoff
+			TaxNode taxon = phylo->get(i);
+	
+			if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
+				if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
+					for (int j = 0; j < taxon.accessions.size(); j++) {
+						seqGroup[taxon.accessions[j]] = numGroups;
+					}
+					numGroups++; //group ids are handed out consecutively starting at 0
+				}
+			}
+		}
+	
+		delete phylo; //the tree is only needed to build seqGroup
+		
+		if (method == "classify") {
+			splitDistanceFileByTax(seqGroup, numGroups); //an existing distance file is split by group
+		}else {
+			createDistanceFilesFromTax(seqGroup, numGroups); //reached for method "fasta" (see split()): distances are computed per group from the fasta file
+		}
+		
+		return 0;
+			
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitMatrix", "splitClassify");
+		exit(1);
+	}
+}
+/***********************************************************************/
+int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){ //writes one temp fasta file per group, runs DistanceCommand on each, then splits the name/count file to match
+	try {
+		map<string, int> copyGroups = seqGroup; //entries are erased as their sequences are seen; whatever is left is missing from the fasta file
+		map<string, int>::iterator it;
+		set<string> names; //filled when neither a name nor a count file was given; not read again in this function
+				
+		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+			m->mothurRemove((fastafile + "." + toString(i) + ".temp"));
+		}
+			
+		ifstream in;
+		m->openInputFile(fastafile, in);
+	
+		//parse fastafile
 		ofstream outFile;
+		while (!in.eof()) {
+			Sequence query(in); m->gobble(in);
+			if (query.getName() != "") {
+		
+				it = seqGroup.find(query.getName());
+				
+				//save names in case no namefile is given
+				if ((namefile == "") && (countfile == "")) {  names.insert(query.getName()); }
+			
+				if (it != seqGroup.end()) { //not singleton 
+					m->openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
+					query.printSequence(outFile); 
+					outFile.close();
+					
+					copyGroups.erase(query.getName());
+				}
+			}
+		}
+		in.close();
+		
+		//warn about sequence in groups that are not in fasta file
+		for(it = copyGroups.begin(); it != copyGroups.end(); it++) { //list every missing sequence before aborting, not only the first one
+			m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate an error."); m->mothurOutEndLine();
+		}
+		if (!copyGroups.empty()) { exit(1); } //same exit status as before, taken after all names have been reported
+		
+		copyGroups.clear();
+        
+		//process each distance file
+		for (int i = 0; i < numGroups; i++) { 
+			
+			string options = "";
+            if (classic) { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", output=lt"; }
+            else { options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors) + ", cutoff=" + toString(distCutoff); }
+			if (outputDir != "") { options += ", outputdir=" + outputDir; }
+			
+            m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+            
+			Command* command = new DistanceCommand(options);
+			
+            m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
+            
+			command->execute();
+			delete command;
+			
+			m->mothurRemove((fastafile + "." + toString(i) + ".temp")); //the per-group fasta file is no longer needed once its distances exist
+			
+			//remove old names files just in case
+			if (namefile != "") { m->mothurRemove((namefile + "." + toString(i) + ".temp")); }
+            else { m->mothurRemove((countfile + "." + toString(i) + ".temp")); }
+		}
+        
+        //restore old fasta file name since dist.seqs overwrites it with the temp files
+        m->setFastaFile(fastafile);
+        
+        vector<string> tempDistFiles;    
+        for(int i=0;i<numGroups;i++){
+            if (outputDir == "") { outputDir = m->hasPath(fastafile); } //note: this updates the member, so later uses see the fasta file's directory
+            string tempDistFile = "";
+            if (classic) { tempDistFile =  outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "phylip.dist";}
+            else { tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; }
+            tempDistFiles.push_back(tempDistFile);
+        }
+        
+        splitNames(seqGroup, numGroups, tempDistFiles);
+        
+		if (m->control_pressed)	 {  for (int i = 0; i < dists.size(); i++) { m->mothurRemove((dists[i].begin()->first)); m->mothurRemove((dists[i].begin()->second)); } dists.clear(); }
+		
+		return 0;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitMatrix", "createDistanceFilesFromTax");
+		exit(1);
+	}
+}
+/***********************************************************************/
+int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroups){ //copies each distance whose two sequences share a group into that group's temp file, then splits the name/count file to match
+	try {
+		map<string, int>::iterator it;
+		map<string, int>::iterator it2;
+		
+        ofstream outFile;
+		ifstream dFile;
+		m->openInputFile(distFile, dFile);
+		
+		
+		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+			m->mothurRemove((distFile + "." + toString(i) + ".temp"));
+		}
+		
+		//for buffering the io to improve speed
+		 //allow for 30 dists to be stored, then output.
+		vector<string> outputs;  outputs.resize(numGroups, "");
+		vector<int> numOutputs;	 numOutputs.resize(numGroups, 0);	
+		
+		//you can have a group made, but their may be no distances in the file for this group if the taxonomy file and distance file don't match
+		//this can occur if we have converted the phylip to column, since we reduce the size at that step by using the cutoff value
+		vector<bool> validDistances;   validDistances.resize(numGroups, false); 
+		
+		//for each distance
+		while(dFile){
+			string seqA, seqB;
+			float dist;
+			
+			if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { m->mothurRemove((distFile + "." + toString(i) + ".temp"));	} return 0; } //stop here on interrupt; falling through would flush the buffers back into new temp files and run splitNames()
+			
+			dFile >> seqA >> seqB >> dist;  m->gobble(dFile);
+			
+			//if both sequences are in the same group then they are within the cutoff
+			it = seqGroup.find(seqA);
+			it2 = seqGroup.find(seqB);
+			
+			if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
+				if (it->second == it2->second) { //they are from the same group so add the distance
+					if (numOutputs[it->second] > 30) { //buffer is full: append it plus the current distance to the group's file
+						m->openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
+						outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
+						outFile.close();
+						outputs[it->second] = "";
+						numOutputs[it->second] = 0;
+						validDistances[it->second] = true; //recorded here but not read anywhere in this function
+					}else{
+						outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
+						numOutputs[it->second]++;
+					}
+				}
+			}
+		}
+		dFile.close();
+        
+        string inputFile = namefile;
+        if (countfile != "") { inputFile = countfile; }
+        
+        vector<string> tempDistFiles;
+		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+            string tempDistFile = distFile + "." + toString(i) + ".temp";
+            tempDistFiles.push_back(tempDistFile);
+			m->mothurRemove((inputFile + "." + toString(i) + ".temp"));
+			
+			//write out any remaining buffers
+			if (numOutputs[i] > 0) {
+				m->openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
+				outFile << outputs[i];
+				outFile.close();
+				outputs[i] = "";
+				numOutputs[i] = 0;
+				validDistances[i] = true;
+			}
+		}
+		
+        splitNames(seqGroup, numGroups, tempDistFiles);
+        
+		if (m->control_pressed)	 {  
+			for (int i = 0; i < dists.size(); i++) { 
+				m->mothurRemove((dists[i].begin()->first));
+				m->mothurRemove((dists[i].begin()->second));
+			}
+			dists.clear();
+		}
+		
+		return 0;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitMatrix", "splitDistanceFileByTax");
+		exit(1);
+	}
+}
+/***********************************************************************/
+int SplitMatrix::splitDistanceLarge(){
+	try {
+		vector<set<string> > groups;
+		
+		//for buffering the io to improve speed
+		 //allow for 30 dists to be stored, then output.
+		vector<string> outputs;
+		vector<int> numOutputs;
+		vector<bool> wroteOutPut;
+		
+		int numGroups = 0;
+
+		//ofstream outFile;
 		ifstream dFile;
-		openInputFile(distFile, dFile);
+		m->openInputFile(distFile, dFile);
 	
 		while(dFile){
 			string seqA, seqB;
@@ -61,14 +345,13 @@ int SplitMatrix::splitDistance(){
 
 			dFile >> seqA >> seqB >> dist;
 			
-			if (m->control_pressed) {  outFile.close(); dFile.close();  for(int i=0;i<numGroups;i++){	if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
+			if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){	if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
 					
 			if(dist < cutoff){
 				//cout << "in cutoff: " << dist << endl;
 				int groupIDA = -1;
 				int groupIDB = -1;
 				int groupID = -1;
-				int prevGroupID = -1;
 				
 				for(int i=0;i<numGroups;i++){
 					set<string>::iterator aIt = groups[i].find(seqA);
@@ -116,264 +399,432 @@ int SplitMatrix::splitDistance(){
 					newGroup.insert(seqB);
 					groups.push_back(newGroup);
 									
-					outFile.close();
-					string fileName = distFile + "." + toString(numGroups) + ".temp";
-					outFile.open(fileName.c_str(), ios::ate);
-
-					outFile << seqA << '\t' << seqB << '\t' << dist << endl;
+					string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+					outputs.push_back(tempOut);
+					numOutputs.push_back(1);
+					wroteOutPut.push_back(false);
+					
 					numGroups++;
 				}
 				else{
 					string fileName = distFile + "." + toString(groupID) + ".temp";
-					if(groupID != prevGroupID){
-						outFile.close();
+											
+					//have we reached the max buffer size
+					if (numOutputs[groupID] > 60) { //write out sequence
+                        ofstream outFile;
 						outFile.open(fileName.c_str(), ios::app);
-						prevGroupID	= groupID;
+						outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
+						outFile.close();
+						
+						outputs[groupID] = "";
+						numOutputs[groupID] = 0;
+						wroteOutPut[groupID] = true;
+					}else {
+						outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
+						numOutputs[groupID]++;
 					}
-					outFile << seqA << '\t' << seqB << '\t' << dist << endl;
 					
 					if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
 						string row, column, distance;
 						if(groupIDA<groupIDB){
-							string fileName = distFile + "." + toString(groupIDB) + ".temp";
-							ifstream fileB(fileName.c_str());
-							while(fileB){
-								fileB >> row >> column >> distance;
-								outFile << row << '\t' << column << '\t' << distance << endl;
-								gobble(fileB);
-							}
-							fileB.close();
-							remove(fileName.c_str());
+							
+							//merge memory
+							numOutputs[groupID] += numOutputs[groupIDB];
+							outputs[groupID] += outputs[groupIDB];
+							
+							outputs[groupIDB] = "";
+							numOutputs[groupIDB] = 0;
+							
+							//if groupB is written to file it is above buffer size so read and write to new merged file
+							if (wroteOutPut[groupIDB]) {
+								string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
+								/*ifstream fileB(fileName2.c_str(), ios::ate);
+								
+								outFile.open(fileName.c_str(), ios::app);
+								
+								long size;
+								char* memblock;
+
+								size = fileB.tellg();
+				
+								fileB.seekg (0, ios::beg);
+								
+								int numRead = size / 1024;
+								int lastRead = size % 1024;
+
+								for (int i = 0; i < numRead; i++) {
+				
+									memblock = new char [1024];
+								
+									fileB.read (memblock, 1024);
+									
+									string temp = memblock;
+									outFile << temp.substr(0, 1024);
+									
+									delete memblock;
+								}
+								
+								memblock = new char [lastRead];
+								
+								fileB.read (memblock, lastRead);
+								
+								//not sure why but it will read more than lastRead char...??
+								string temp = memblock;
+								outFile << temp.substr(0, lastRead);
+								delete memblock;
+								
+								fileB.close();*/
+                                m->appendFiles(fileName2, fileName);
+								m->mothurRemove(fileName2);
+                        
+								
+								//write out the merged memory
+								if (numOutputs[groupID] > 60) {
+                                    ofstream tempOut;
+                                    m->openOutputFile(fileName, tempOut);
+									tempOut << outputs[groupID];
+									outputs[groupID] = "";
+									numOutputs[groupID] = 0;
+                                    tempOut.close();
+								}
+								
+								//outFile.close();
+								
+								wroteOutPut[groupID] = true;
+								wroteOutPut[groupIDB] = false;
+							}else{ } //just merge b's memory with a's memory 
 						}
 						else{
-							string fileName = distFile + "." + toString(groupIDA) + ".temp";
-							ifstream fileA(fileName.c_str());
-							while(fileA){
-								fileA >> row >> column >> distance;
-								outFile << row << '\t' << column << '\t' << distance << endl;
-								gobble(fileA);
-							}
-							fileA.close();
-							remove(fileName.c_str());
+							numOutputs[groupID] += numOutputs[groupIDA];
+							outputs[groupID] += outputs[groupIDA];
+							
+							outputs[groupIDA] = "";
+							numOutputs[groupIDA] = 0;
+							
+							if (wroteOutPut[groupIDA]) {
+								string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
+								/*ifstream fileB(fileName2.c_str(), ios::ate);
+								
+								outFile.open(fileName.c_str(), ios::app);
+								
+								long size;
+								char* memblock;
+
+								size = fileB.tellg();
+															
+								fileB.seekg (0, ios::beg);
+								
+								int numRead = size / 1024;
+								int lastRead = size % 1024;
+
+								for (int i = 0; i < numRead; i++) {
+				
+									memblock = new char [1024];
+								
+									fileB.read (memblock, 1024);
+									string temp = memblock;
+									outFile << temp.substr(0, 1024);
+									
+									delete memblock;
+								}
+								
+								memblock = new char [lastRead];
+								
+								fileB.read (memblock, lastRead);
+								
+								//not sure why but it will read more than lastRead char...??
+								string temp = memblock;
+								outFile << temp.substr(0, lastRead);
+									
+								delete memblock;
+								
+								fileB.close();*/
+                                m->appendFiles(fileName2, fileName);
+								m->mothurRemove(fileName2);
+								
+								//write out the merged memory
+								if (numOutputs[groupID] > 60) {
+                                    ofstream tempOut;
+                                    m->openOutputFile(fileName, tempOut);
+									tempOut << outputs[groupID];
+									outputs[groupID] = "";
+									numOutputs[groupID] = 0;
+                                    tempOut.close();
+								}
+								
+								//outFile.close();
+								
+								wroteOutPut[groupID] = true;
+								wroteOutPut[groupIDA] = false;
+							}else { } //just merge memory
 						}					
 					}
 				}
 			}
-			gobble(dFile);
+			m->gobble(dFile);
 		}
-		outFile.close();
 		dFile.close();
-	
-		ifstream bigNameFile(namefile.c_str());
-		if(!bigNameFile){
-			cerr << "Error: We can't open the name file\n";
-			exit(1);
-		}
-		
-		map<string, string> nameMap;
-		string name, nameList;
-		while(bigNameFile){
-			bigNameFile >> name >> nameList;
-			nameMap[name] = nameList;
-			gobble(bigNameFile);
-		}
-		bigNameFile.close();
-			
-		for(int i=0;i<numGroups;i++){  //parse names file to match distance files
-			int numSeqsInGroup = groups[i].size();
+        
+		vector<string> tempDistFiles;
+		for (int i = 0; i < numGroups; i++) {
+            string fileName = distFile + "." + toString(i) + ".temp";
+            tempDistFiles.push_back(fileName);
+            //remove old names files just in case
 			
-			if(numSeqsInGroup > 0){
-				string fileName = namefile + "." + toString(i) + ".temp";
-				ofstream smallNameFile(fileName.c_str(), ios::ate);
-				
-				for(set<string>::iterator gIt=groups[i].begin();gIt!=groups[i].end();gIt++){
-					map<string,string>::iterator nIt = nameMap.find(*gIt);
-					
-					if (nIt != nameMap.end()) {
-						smallNameFile << nIt->first << '\t' << nIt->second << endl;
-						nameMap.erase(nIt);
-					}else{
-						m->mothurOut((*gIt) + " is in your distance file and not in your namefile.  Please correct."); m->mothurOutEndLine(); exit(1);
-					}
-				}
-				smallNameFile.close();
+			if (numOutputs[i] > 0) {
+                ofstream outFile;
+				outFile.open(fileName.c_str(), ios::app);
+				outFile << outputs[i];
+				outFile.close();
 			}
 		}
-		
-		//names of singletons
-		if (nameMap.size() != 0) {
-			singleton = namefile + ".extra.temp";
-			ofstream remainingNames(singleton.c_str(), ios::ate);
-			for(map<string,string>::iterator nIt=nameMap.begin();nIt!=nameMap.end();nIt++){
-				remainingNames << nIt->first << '\t' << nIt->second << endl;
-			}
-			remainingNames.close();
-		}else { singleton = "none"; }
-			
-		for(int i=0;i<numGroups;i++){
-			if(groups[i].size() > 0){
-				string tempNameFile = namefile + "." + toString(i) + ".temp";
-				string tempDistFile = distFile + "." + toString(i) + ".temp";
+        
+        map<string, int> seqGroup;
+        for (int i = 0; i < groups.size(); i++) {
+            for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
+                seqGroup[*itNames] = i;
+                groups[i].erase(itNames++);
+            }
+        }
+        
+		splitNames(seqGroup, numGroups, tempDistFiles);
 				
+		return 0;			
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
+		exit(1);
+	}
+}
+//********************************************************************************************************************
+int SplitMatrix::splitNames(map<string, int>& seqGroup, int numGroups, vector<string>& tempDistFiles){ //splits the name (or count) file into one temp file per group, pairs each with its distance file in 'dists', and collects ungrouped names in the 'singleton' file
+	try {
+        ofstream outFile;
+        map<string, int>::iterator it;
+        
+        string inputFile = namefile;
+        if (countfile != "") { inputFile = countfile; } //the count file takes precedence when both are set
+        
+        for(int i=0;i<numGroups;i++){  m->mothurRemove((inputFile + "." + toString(i) + ".temp")); }
+
+        singleton = inputFile + ".extra.temp";
+        ofstream remainingNames;
+        m->openOutputFile(singleton, remainingNames);
+        
+        bool wroteExtra = false; //stays false when no name ends up in the singleton file
+        
+        ifstream bigNameFile;
+        m->openInputFile(inputFile, bigNameFile);
+        
+        //grab header line 
+        string headers = ""; //read to skip the count file's header line; the value is not used afterwards
+        if (countfile != "") { headers = m->getline(bigNameFile); m->gobble(bigNameFile); }
+        
+        string name, nameList; //for a count file, nameList holds the second column
+        while(!bigNameFile.eof()){
+            bigNameFile >> name >> nameList;  
+            m->getline(bigNameFile); m->gobble(bigNameFile); //extra getline is for rest of countfile line if groups are given.
+            
+            //did this sequence get assigned a group
+            it = seqGroup.find(name);
+            
+            if (it != seqGroup.end()) {  
+                m->openOutputFileAppend((inputFile + "." + toString(it->second) + ".temp"), outFile);
+                outFile << name << '\t' << nameList << endl;
+                outFile.close();
+            }else{
+                wroteExtra = true;
+                remainingNames << name << '\t' << nameList << endl;
+            }
+        }
+        bigNameFile.close();
+        
 		for(int i=0;i<numGroups;i++){
+			string tempNameFile = inputFile + "." + toString(i) + ".temp";
+			string tempDistFile = tempDistFiles[i];
+            
+            //if there are valid distances
+            ifstream fileHandle;
+            fileHandle.open(tempDistFile.c_str());
+            if(fileHandle) 	{	//NOTE(review): when the distance file cannot be opened neither branch below runs, so an existing tempNameFile for this group is neither paired nor moved to the singleton file -- confirm this is intended
+                m->gobble(fileHandle);
+                if (!fileHandle.eof()) {  //check
 				map<string, string> temp;
+                if (countfile != "") {
+                    //add header
+                    ofstream out;
+                    string newtempNameFile = tempNameFile + "2";
+                    m->openOutputFile(newtempNameFile, out);
+                    out << "Representative_Sequence\ttotal" << endl;
+                    out.close();
+                    m->appendFiles(tempNameFile, newtempNameFile);
+                    m->mothurRemove(tempNameFile);
+                    m->renameFile(newtempNameFile, tempNameFile);
+                }
 				temp[tempDistFile] = tempNameFile;
 				dists.push_back(temp);
+			}else{ //the distance file exists but holds only whitespace: move this group's names to the singleton file
+				ifstream in;
+				m->openInputFile(tempNameFile, in);
+				
+				while(!in.eof()) { 
+					in >> name >> nameList;  m->gobble(in);
+					wroteExtra = true;
+					remainingNames << name << '\t' << nameList << endl;
+				}
+				in.close();
+				m->mothurRemove(tempNameFile);
 			}
+            }
+            fileHandle.close();
 		}
 		
-		if (m->control_pressed)	 {  
-			for (int i = 0; i < dists.size(); i++) { 
-				remove((dists[i].begin()->first).c_str());
-				remove((dists[i].begin()->second).c_str());
-			}
-			dists.clear();
-		}
+		remainingNames.close();
+		
+		if (!wroteExtra) { 
+			m->mothurRemove(singleton);
+			singleton = "none"; //value used when there is no singleton file
+		}else if (countfile != "") {
+            //add header
+            ofstream out;
+            string newtempNameFile = singleton + "2";
+            m->openOutputFile(newtempNameFile, out);
+            out << "Representative_Sequence\ttotal" << endl; 
+            out.close();
+            m->appendFiles(singleton, newtempNameFile);
+            m->mothurRemove(singleton);
+            m->renameFile(newtempNameFile, singleton);
+        }
 		
 		return 0;
-			
 	}
 	catch(exception& e) {
-		m->errorOut(e, "SplitMatrix", "splitDistance");
+		m->errorOut(e, "SplitMatrix", "splitNames");
 		exit(1);
 	}
 }
-
-/***********************************************************************/
-int SplitMatrix::splitClassify(){
+//********************************************************************************************************************
+int SplitMatrix::splitDistanceRAM(){ //groups sequences linked by distances below 'cutoff', holding every group's distances in memory and writing one temp file per group at the end
 	try {
-		map<string, int> seqGroup;
-		map<string, int>::iterator it;
-		map<string, int>::iterator it2;
+		vector<set<string> > groups; //groups[i] = names in group i; emptied when the group is merged into another
+		vector<string> outputs; //outputs[i] = buffered "seqA<TAB>seqB<TAB>dist" lines for group i
 		
 		int numGroups = 0;
-		
-		//build tree from users taxonomy file
-		PhyloTree* phylo = new PhyloTree();
-		
-		ifstream in;
-		openInputFile(taxFile, in);
-			
-		//read in users taxonomy file and add sequences to tree
-		string seqname, tax;
-		while(!in.eof()){
-			in >> seqname >> tax; gobble(in);
-				
-			phylo->addSeqToTree(seqname, tax);
-		}
-		in.close();
-		
-		phylo->assignHeirarchyIDs(0);
-
-		//make sure the cutoff is not greater than maxlevel
-		if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
-		
-		//for each node in tree
-		for (int i = 0; i < phylo->getNumNodes(); i++) {
-		
-			//is this node within the cutoff
-			TaxNode taxon = phylo->get(i);
-			
-			if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
-				if (taxon.children.size() > 1) { //if this taxon just has one seq its a singleton
-					for (it = taxon.children.begin(); it != taxon.children.end(); it++) {
-						seqGroup[it->first] = numGroups;
-					}
-					numGroups++;
-				}
-			}
-		}
 
 		ifstream dFile;
-		openInputFile(distFile, dFile);
-		ofstream outFile;
-		
-		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
-			remove((distFile + "." + toString(i) + ".temp").c_str());
-		}
-		
-		//for each distance
+		m->openInputFile(distFile, dFile);
+
 		while(dFile){
 			string seqA, seqB;
 			float dist;
+
+			dFile >> seqA >> seqB >> dist;  if (!dFile) { break; } //stop on a failed read (empty file or truncated record) so 'dist' is never used without a parsed value
 			
-			if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());	} }
-			
-			dFile >> seqA >> seqB >> dist;  gobble(dFile);
-			
-			//if both sequences are in the same group then they are within the cutoff
-			it = seqGroup.find(seqA);
-			it2 = seqGroup.find(seqB);
-			
-			if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
-				if (it->second == it2->second) { //they are from the same group so add the distance
-					openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
-					outFile << seqA << '\t' << seqB << '\t' << dist << endl;
-					outFile.close();
+			if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){	if(groups[i].size() > 0){  m->mothurRemove((distFile + "." + toString(i) + ".temp")); }  } return 0; }
+					
+			if(dist < cutoff){
+				//cout << "in cutoff: " << dist << endl;
+				int groupIDA = -1; //index of the group found to contain seqA, -1 if none yet
+				int groupIDB = -1; //index of the group found to contain seqB, -1 if none yet
+				int groupID = -1; //group that receives this distance
+				
+				for(int i=0;i<numGroups;i++){
+					set<string>::iterator aIt = groups[i].find(seqA);
+					set<string>::iterator bIt = groups[i].find(seqB);
+					
+					if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
+						groups[i].insert(seqB);
+						groupIDA = i;
+						groupID = groupIDA;
+
+						//cout << "in aIt: " << groupID << endl;
+	//					break;
+					}
+					else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
+						groups[i].insert(seqA);
+						groupIDB = i;
+						groupID = groupIDB;
+
+					//	cout << "in bIt: " << groupID << endl;
+	//					break;
+					}
+				
+					if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
+						if(groupIDA < groupIDB){
+						//	cout << "A: " << groupIDA << "\t" << groupIDB << endl;
+							groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
+							groups[groupIDB].clear(); 
+							groupID = groupIDA;
+						}
+						else{
+						//	cout << "B: " << groupIDA << "\t" << groupIDB << endl;
+							groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
+							groups[groupIDA].clear();  
+							groupID = groupIDB;
+						}
+						break;
+					}
+				}
+				
+	//windows is gonna gag on the reuse of outFile, will need to make it local...
+				
+				if(groupIDA == -1 && groupIDB == -1){ //we need a new group
+					set<string> newGroup;
+					newGroup.insert(seqA);
+					newGroup.insert(seqB);
+					groups.push_back(newGroup);
+									
+					string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+					outputs.push_back(tempOut); //keeps outputs parallel to groups
+					numGroups++;
+				}
+				else{
+											
+					outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
+					
+					if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
+						string row, column, distance; //not used in this branch
+						if(groupIDA<groupIDB){
+							//merge memory
+							outputs[groupID] += outputs[groupIDB];
+							outputs[groupIDB] = "";
+						}else{
+							outputs[groupID] += outputs[groupIDA];
+							outputs[groupIDA] = "";
+						}					
+					}
 				}
 			}
+			m->gobble(dFile);
 		}
 		dFile.close();
-	
-		
-		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
-			remove((namefile + "." + toString(i) + ".temp").c_str());
-		}
 		
-		ifstream bigNameFile;
-		openInputFile(namefile, bigNameFile);
-		
-		singleton = namefile + ".extra.temp";
-		ofstream remainingNames;
-		openOutputFile(singleton, remainingNames);
-		
-		bool wroteExtra = false;
-						
-		string name, nameList;
-		while(!bigNameFile.eof()){
-			bigNameFile >> name >> nameList;  gobble(bigNameFile);
-			
-			//did this sequence get assigned a group
-			it = seqGroup.find(name);
-			
-			if (it != seqGroup.end()) {  
-				openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
-				outFile << name << '\t' << nameList << endl;
+        vector<string> tempDistFiles;
+		for (int i = 0; i < numGroups; i++) {
+            string fileName = distFile + "." + toString(i) + ".temp";
+            tempDistFiles.push_back(fileName); //every index gets a name, even if no file is written for it below
+			if (outputs[i] != "") { //groups merged into another one have an empty buffer and get no file
+				ofstream outFile;
+				outFile.open(fileName.c_str(), ios::ate);
+				outFile << outputs[i];
 				outFile.close();
-			}else{
-				wroteExtra = true;
-				remainingNames << name << '\t' << nameList << endl;
 			}
 		}
-		bigNameFile.close();
-		remainingNames.close();
-		
-		if (!wroteExtra) { 
-			remove(singleton.c_str());
-			singleton = "none";
-		}
-			
-		for(int i=0;i<numGroups;i++){
-			string tempNameFile = namefile + "." + toString(i) + ".temp";
-			string tempDistFile = distFile + "." + toString(i) + ".temp";
+        
+        map<string, int> seqGroup; //flatten the per-group sets into name -> group index for splitNames()
+        for (int i = 0; i < groups.size(); i++) {
+            for (set<string>::iterator itNames = groups[i].begin(); itNames != groups[i].end();) {
+                seqGroup[*itNames] = i;
+                groups[i].erase(itNames++); //empties each set as its names are copied
+            }
+        }
+        
+		splitNames(seqGroup, numGroups, tempDistFiles);
 				
-			map<string, string> temp;
-			temp[tempDistFile] = tempNameFile;
-			dists.push_back(temp);
-		}
-		
-		if (m->control_pressed)	 {  
-			for (int i = 0; i < dists.size(); i++) { 
-				remove((dists[i].begin()->first).c_str());
-				remove((dists[i].begin()->second).c_str());
-			}
-			dists.clear();
-		}
-		
-		return 0;
-			
+		return 0;			
 	}
 	catch(exception& e) {
-		m->errorOut(e, "SplitMatrix", "splitClassify");
+		m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
 		exit(1);
 	}
 }