From 2a7d1455e8cfe4f67a7173f3a7249762c5436217 Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Wed, 9 Jun 2010 12:07:11 +0000
Subject: [PATCH] changes to blastdb to make filenames given to blast unique,
 changes to split.abund, fixed bugs in chimera.slayer realigner, optimized
 splitmatrix and modified cluster.split command.

---
 blastdb.cpp              |  21 +-
 chimerarealigner.cpp     |  25 +-
 chimeraslayercommand.cpp |   2 -
 clustersplitcommand.cpp  |  29 +-
 clustersplitcommand.h    |   4 +-
 makefile                 |   4 +-
 maligner.cpp             |   8 +-
 readtree.cpp             |  16 +-
 splitabundcommand.cpp    | 909 +++++++++++++++++++++++++++++----------
 splitabundcommand.h      |  20 +-
 splitmatrix.cpp          | 516 ++++++++++++++--------
 splitmatrix.h            |   6 +-
 tree.cpp                 |  91 ++--
 tree.h                   |   4 +
 14 files changed, 1185 insertions(+), 470 deletions(-)
diff --git a/blastdb.cpp b/blastdb.cpp
index 35a19f4..760824d 100644
--- a/blastdb.cpp
+++ b/blastdb.cpp
@@ -54,21 +54,22 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
 		vector<int> topMatches;
 		
 		ofstream queryFile;
-		openOutputFile(queryFileName, queryFile);
+		openOutputFile((queryFileName+seq->getName()), queryFile);
 		queryFile << '>' << seq->getName() << endl;
 		queryFile << seq->getUnaligned() << endl;
 		queryFile.close();
+
 				
 		//	the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
 		//	wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
 		//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
 		
 		string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);;
-		blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+		blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
 		system(blastCommand.c_str());
 		
 		ifstream m8FileHandle;
-		openInputFile(blastFileName, m8FileHandle);
+		openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
 		
 		string dummy;
 		int templateAccession;
@@ -84,7 +85,9 @@ vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
 			topMatches.push_back(templateAccession);
 		}
 		m8FileHandle.close();
-		
+		remove((queryFileName+seq->getName()).c_str());
+		remove((blastFileName+seq->getName()).c_str());
+
 		return topMatches;
 	}
 	catch(exception& e) {
@@ -100,7 +103,7 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
 		vector<int> topMatches;
 		
 		ofstream queryFile;
-		openOutputFile(queryFileName, queryFile);
+		openOutputFile((queryFileName+seq->getName()), queryFile);
 		queryFile << '>' << seq->getName() << endl;
 		queryFile << seq->getUnaligned() << endl;
 		queryFile.close();
@@ -110,11 +113,11 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
 		//	long.  With this setting, it seems comparable in speed to the suffix tree approach.
 	
 		string blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn
-		blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+		blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName());
 		system(blastCommand.c_str());
-		
+
 		ifstream m8FileHandle;
-		openInputFile(blastFileName, m8FileHandle, "no error");
+		openInputFile(blastFileName+seq->getName(), m8FileHandle, "no error");
 	
 		string dummy;
 		int templateAccession;
@@ -131,6 +134,8 @@ vector<int> BlastDB::findClosestMegaBlast(Sequence* seq, int n) {
 //cout << templateAccession << endl;
 		}
 		m8FileHandle.close();
+		remove((queryFileName+seq->getName()).c_str());
+		remove((blastFileName+seq->getName()).c_str());
 //cout << "\n\n" ;		
 		return topMatches;
 	}
diff --git a/chimerarealigner.cpp b/chimerarealigner.cpp
index 487e7ca..1da3278 100644
--- a/chimerarealigner.cpp
+++ b/chimerarealigner.cpp
@@ -24,19 +24,21 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
 			
 			string qAligned = query->getAligned();
 			string newQuery = "";
-		
+			
 			//sort parents by region start
 			sort(parents.begin(), parents.end(), compareRegionStart);
 
 			//make sure you don't cutoff beginning of query 
 			if (parents[0].nastRegionStart > 0) {  newQuery += qAligned.substr(0, parents[0].nastRegionStart);  }
 			int longest = 0;
+
 			//take query and break apart into pieces using breakpoints given by results of parents
 			for (int i = 0; i < parents.size(); i++) {
+			
 				int length = parents[i].nastRegionEnd - parents[i].nastRegionStart+1;
 				string q = qAligned.substr(parents[i].nastRegionStart, length);
 				Sequence* queryFrag = new Sequence(query->getName(), q);
-				
+
 				queryParts.push_back(queryFrag);
 			
 				Sequence* parent = getSequence(parents[i].parent);
@@ -46,26 +48,35 @@ void ChimeraReAligner::reAlign(Sequence* query, vector<results> parents) {
 				parent->setAligned(p);
 				
 				parentParts.push_back(parent);
-				
+
 				if (q.length() > longest)	{ longest = q.length(); }
 				if (p.length() > longest)	{ longest = p.length();	}
 			}
-											
+
 			//align each peice to correct parent from results
 			for (int i = 0; i < queryParts.size(); i++) {
 				alignment = new NeedlemanOverlap(-2.0, match, misMatch, longest+1); //default gapopen, match, mismatch, longestbase
 				Nast nast(alignment, queryParts[i], parentParts[i]);
 				delete alignment;
 			}
-												
+
 			//recombine pieces to form new query sequence
 			for (int i = 0; i < queryParts.size(); i++) {
+				//sometimes the parent regions do not meet, for example region 1 may end at 1000 and region 2 starts at 1100.  
+				//we don't want to loose length so in this case we will leave query alone
+				if (i != 0) {
+					int space = parents[i].nastRegionStart - parents[i-1].nastRegionEnd - 1;
+					if (space > 0) { //they don't meet and we need to add query piece
+						string q = qAligned.substr(parents[i-1].nastRegionEnd+1, space);
+						newQuery += q;
+					}
+				}
+
 				newQuery += queryParts[i]->getAligned();
 			}
 			
-			
 			//make sure you don't cutoff end of query 
-			if (parents[parents.size()-1].nastRegionEnd < qAligned.length()) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd);  }
+			if (parents[parents.size()-1].nastRegionEnd < (qAligned.length()-1)) {  newQuery += qAligned.substr(parents[parents.size()-1].nastRegionEnd+1);  }
 		
 			//set query to new aligned string
 			query->setAligned(newQuery);
diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp
index c41e026..8a0e287 100644
--- a/chimeraslayercommand.cpp
+++ b/chimeraslayercommand.cpp
@@ -292,7 +292,6 @@ int ChimeraSlayerCommand::execute(){
 				int startIndex =  pid * numSeqsPerProcessor;
 				if(pid == (processors - 1)){	numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor; 	}
 				
-				
 				//align your part
 				driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, MPIPos);
 				
@@ -377,7 +376,6 @@ int ChimeraSlayerCommand::execute(){
 					lines.push_back(new linePair(startPos, numSeqsPerProcessor));
 				}
 				
-				
 				createProcesses(outputFileName, fastafile, accnosFileName); 
 			
 				rename((outputFileName + toString(processIDS[0]) + ".temp").c_str(), outputFileName.c_str());
diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp
index 861c4fb..c3faab7 100644
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -27,7 +27,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 		
 		else {
 			//valid paramters for this command
-			string Array[] =  {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","showabund","timing","hard","processors","outputdir","inputdir"};
+			string Array[] =  {"phylip","column","name","cutoff","precision","method","splitmethod","taxonomy","taxlevel","large","showabund","timing","hard","processors","outputdir","inputdir"};
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -125,6 +125,9 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 			temp = validParameter.validFile(parameters, "hard", false);			if (temp == "not found") { temp = "F"; }
 			hard = isTrue(temp);
 			
+			temp = validParameter.validFile(parameters, "large", false);			if (temp == "not found") { temp = "F"; }
+			large = isTrue(temp);
+			
 			temp = validParameter.validFile(parameters, "processors", false);	if (temp == "not found"){	temp = "1";				}
 			convert(temp, processors); 
 			
@@ -165,7 +168,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
 
 void ClusterSplitCommand::help(){
 	try {
-		m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, processors. Phylip or column and name are required.\n");
+		m->mothurOut("The cluster.split command parameter options are phylip, column, name, cutoff, precision, method, splitmethod, taxonomy, taxlevel, showabund, timing, hard, large, processors. Phylip or column and name are required.\n");
 		m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
 		m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
 		m->mothurOut("The cutoff parameter allow you to set the distance you want to cluster to, default is 10.0. \n");
@@ -174,6 +177,7 @@ void ClusterSplitCommand::help(){
 		m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance or classify. \n");
 		m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n");
 		m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n");
+		m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
 		m->mothurOut("The cluster.split command should be in the following format: \n");
 		m->mothurOut("cluster.split(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision, splitmethod=yourSplitmethod, taxonomy=yourTaxonomyfile, taxlevel=yourtaxlevel) \n");
 		m->mothurOut("Example: cluster.split(column=abrecovery.dist, name=abrecovery.names, method=furthest, cutoff=0.10, precision=1000, splitmethod=classify, taxonomy=abrecovery.silva.slv.taxonomy, taxlevel=5) \n");	
@@ -229,11 +233,12 @@ int ClusterSplitCommand::execute(){
 		if (m->control_pressed) { return 0; }
 		
 		time_t estart = time(NULL);
+		m->mothurOut("Splitting the distance file..."); m->mothurOutEndLine();
 		
 		//split matrix into non-overlapping groups
 		SplitMatrix* split;
-		if (splitmethod == "distance")	{	split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod);			}
-		else							{	split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod);  }
+		if (splitmethod == "distance")	{	split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);			}
+		else							{	split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);   }
 		
 		split->split();
 		
@@ -284,6 +289,8 @@ int ClusterSplitCommand::execute(){
 						ifstream in;
 						openInputFile(filename, in);
 						
+						in >> tag; gobble(in);
+						
 						while(!in.eof()) {
 							string tempName;
 							in >> tempName; gobble(in);
@@ -299,7 +306,7 @@ int ClusterSplitCommand::execute(){
 						
 						while(!in2.eof()) {
 							string tempName;
-							in2 >> tempName; gobble(in);
+							in2 >> tempName; gobble(in2);
 							if (labels.count(tempName) == 0) { labels.insert(tempName); }
 						}
 						in2.close();
@@ -312,7 +319,12 @@ int ClusterSplitCommand::execute(){
 		
 		if (m->control_pressed) { for (int i = 0; i < listFileNames.size(); i++) { remove(listFileNames[i].c_str()); } return 0; }
 		
+		m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
+		
 		//****************** merge list file and create rabund and sabund files ******************************//
+		estart = time(NULL);
+		m->mothurOut("Merging the clustered files..."); m->mothurOutEndLine();
+		
 		ListVector* listSingle;
 		map<float, int> labelBins = completeListFile(listFileNames, singletonName, labels, listSingle); //returns map of label to numBins
 		
@@ -322,7 +334,7 @@ int ClusterSplitCommand::execute(){
 
 		if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
 		
-		m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster"); m->mothurOutEndLine();
+		m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to merge."); m->mothurOutEndLine();
 		
 		m->mothurOutEndLine();
 		m->mothurOut("Output File Names: "); m->mothurOutEndLine();
@@ -337,7 +349,7 @@ int ClusterSplitCommand::execute(){
 	}
 }
 //**********************************************************************************************************************
-map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string> userLabels, ListVector*& listSingle){
+map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames, string singleton, set<string>& userLabels, ListVector*& listSingle){
 	try {
 				
 		map<float, int> labelBin;
@@ -367,7 +379,7 @@ map<float, int> ClusterSplitCommand::completeListFile(vector<string> listNames,
 
 			if ((*it != "unique") && (convertTestFloat(*it, temp) == true))	{	convert(*it, temp);	}
 			else if (*it == "unique")										{	temp = -1.0;		}
-			
+						
 			orderFloat.push_back(temp);
 			labelBin[temp] = numSingleBins; //initialize numbins 
 		}
@@ -571,6 +583,7 @@ int ClusterSplitCommand::createProcesses(vector < vector < map<string, string> >
 				string filename = toString(getpid()) + ".temp";
 				ofstream out;
 				openOutputFile(filename, out);
+				out << tag << endl;
 				for (int j = 0; j < listFileNames.size(); j++) { out << listFileNames[j] << endl;  }
 				out.close();
 				
diff --git a/clustersplitcommand.h b/clustersplitcommand.h
index 05ae8b8..c696329 100644
--- a/clustersplitcommand.h
+++ b/clustersplitcommand.h
@@ -35,7 +35,7 @@ private:
 	string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile;
 	double cutoff, splitcutoff;
 	int precision, length, processors, taxLevelCutoff;
-	bool print_start, abort, hard;
+	bool print_start, abort, hard, large;
 	time_t start;
 	ofstream outList, outRabund, outSabund;
 	
@@ -43,7 +43,7 @@ private:
 	int createProcesses(vector < vector < map<string, string> > >);
 	vector<string> cluster(vector< map<string, string> >, set<string>&);
 	int mergeLists(vector<string>, map<float, int>, ListVector*);
-	map<float, int> completeListFile(vector<string>, string, set<string>, ListVector*&);
+	map<float, int> completeListFile(vector<string>, string, set<string>&, ListVector*&);
 };
 
 #endif
diff --git a/makefile b/makefile
index a973241..91a269c 100644
--- a/makefile
+++ b/makefile
@@ -1697,11 +1697,11 @@ install : mothur
 ./splitmatrix.o : splitmatrix.cpp
 	$(CC) $(CC_OPTIONS) splitmatrix.cpp -c $(INCLUDE) -o ./splitmatrix.o
 	
-# Item # 207 -- splitmatrix --
+# Item # 207 -- clustersplit --
 ./clustersplitcommand.o : clustersplitcommand.cpp
 	$(CC) $(CC_OPTIONS) clustersplitcommand.cpp -c $(INCLUDE) -o ./clustersplitcommand.o
 	
-# Item # 207 -- splitmatrix --
+# Item # 208 -- classifyotu --
 ./classifyotucommand.o : classifyotucommand.cpp
 	$(CC) $(CC_OPTIONS) classifyotucommand.cpp -c $(INCLUDE) -o ./classifyotucommand.o
 
diff --git a/maligner.cpp b/maligner.cpp
index 5da4b9f..1d52589 100644
--- a/maligner.cpp
+++ b/maligner.cpp
@@ -502,13 +502,13 @@ vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
 		string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
 		string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
 
-		Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
-		Sequence* queryRight = new Sequence(q->getName(), rightQuery);
+		Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
+		Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
 		
 		vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
 		vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
-		
-		//if ((tempIndexesRight.size() != (num+1)) || (tempIndexesLeft.size() != (num+1)))  {  m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to porcess sequence " + q->getName()); m->mothurOutEndLine(); return refResults; }
+			
+		//if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0))  {  m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; }
 		
 		vector<int> smaller;
 		vector<int> larger;
diff --git a/readtree.cpp b/readtree.cpp
index 0d25f7e..51f44e2 100644
--- a/readtree.cpp
+++ b/readtree.cpp
@@ -314,7 +314,21 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T) {
 			}else{
 				T->tree[n].setBranchLength(0.0); 
 			}						
-		
+			
+			//to account for multifurcating trees generated by fasttree, we are forcing them to be bifurcating
+		/*	if(f.peek() == ','){
+				
+				//force this node to be left child and read new rc
+				T->tree[n].setChildren(lc,rc);
+				T->tree[lc].setParent(n);
+				T->tree[rc].setParent(n);
+				
+				n++;
+				
+
+			
+			}*/
+			
 			T->tree[n].setChildren(lc,rc);
 			T->tree[lc].setParent(n);
 			T->tree[rc].setParent(n);
diff --git a/splitabundcommand.cpp b/splitabundcommand.cpp
index cb57684..f86891b 100644
--- a/splitabundcommand.cpp
+++ b/splitabundcommand.cpp
@@ -13,8 +13,6 @@
 SplitAbundCommand::SplitAbundCommand(string option)  {
 	try {
 		abort = false;
-		wroteRareList = false;
-		wroteAbundList = false;
 		allLines = 1;
 			
 		//allow user to run help
@@ -22,7 +20,7 @@ SplitAbundCommand::SplitAbundCommand(string option)  {
 		
 		else {
 			//valid paramters for this command
-			string Array[] =  {"list","name","group","label","accnos","cutoff","outputdir","inputdir"};
+			string Array[] =  {"name","group","label","accnos","groups","fasta","cutoff","outputdir","inputdir"}; //"list",
 			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
 			
 			OptionParser parser(option);
@@ -57,6 +55,14 @@ SplitAbundCommand::SplitAbundCommand(string option)  {
 					if (path == "") {	parameters["group"] = inputDir + it->second;		}
 				}
 				
+				it = parameters.find("fasta");
+				//user has given a template file
+				if(it != parameters.end()){ 
+					path = hasPath(it->second);
+					//if the user has not given a path then, add inputdir. else leave path alone.
+					if (path == "") {	parameters["fasta"] = inputDir + it->second;		}
+				}
+				
 				it = parameters.find("name");
 				//user has given a template file
 				if(it != parameters.end()){ 
@@ -74,26 +80,42 @@ SplitAbundCommand::SplitAbundCommand(string option)  {
 			//check for required parameters
 			listfile = validParameter.validFile(parameters, "list", true);
 			if (listfile == "not open") { abort = true; }
-			else if (listfile == "not found") { listfile = ""; }	
+			else if (listfile == "not found") { listfile = ""; }
+			else{ inputFile = listfile; }	
 			
-			//check for required parameters
 			namefile = validParameter.validFile(parameters, "name", true);
 			if (namefile == "not open") { abort = true; }
 			else if (namefile == "not found") { namefile = ""; }	
+			else{ inputFile = namefile; }	
+		
+			fastafile = validParameter.validFile(parameters, "fasta", true);
+			if (fastafile == "not open") { abort = true; }
+			else if (fastafile == "not found") { fastafile = ""; m->mothurOut("fasta is a required parameter for the split.abund command. "); m->mothurOutEndLine(); abort = true;  }	
 			
 			groupfile = validParameter.validFile(parameters, "group", true);
-			if (groupfile == "not open") { abort = true; }	
+			if (groupfile == "not open") {  groupfile = ""; abort = true; }	
 			else if (groupfile == "not found") { groupfile = ""; }
 			else {  
 				groupMap = new GroupMap(groupfile);
 				
 				int error = groupMap->readMap();
 				if (error == 1) { abort = true; }
+	
 			}
 			
+			groups = validParameter.validFile(parameters, "groups", false);		
+			if (groups == "not found") { groups = ""; }
+			else if (groups == "all") { 
+				if (groupfile != "") {  Groups = groupMap->namesOfGroups;  } 
+				else {  m->mothurOut("You cannot select groups without a valid groupfile, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = "";   }
+			}else { 
+				splitAtDash(groups, Groups);
+			}
+			
+			if ((groupfile == "") && (groups != "")) {  m->mothurOut("You cannot select groups without a valid groupfile, I will disregard your groups selection. "); m->mothurOutEndLine(); groups = "";  Groups.clear(); }
+			
 			//do you have all files needed
 			if ((listfile == "") && (namefile == "")) { m->mothurOut("You must either a listfile or a namefile for the split.abund command. "); m->mothurOutEndLine(); abort = true;  }
-			if ((listfile != "") && (namefile != "")) { m->mothurOut("You must either a listfile or a namefile for the split.abund command, but NOT BOTH. "); m->mothurOutEndLine(); abort = true;  }
 			
 			//check for optional parameter and set defaults
 			// ...at some point should added some additional type checking...
@@ -123,13 +145,16 @@ SplitAbundCommand::SplitAbundCommand(string option)  {
 //**********************************************************************************************************************
 void SplitAbundCommand::help(){
 	try {
-		m->mothurOut("The split.abund command reads a list or a names file splits the sequences into rare and abundant groups.. \n");
-		m->mothurOut("The split.abund command parameters are list, name, cutoff, group, label and accnos.\n");
-		m->mothurOut("The list or name parameter is required, and you must provide a cutoff value.\n");
+		m->mothurOut("The split.abund command reads a fasta file and a list or a names file splits the sequences into rare and abundant groups. \n");
+		m->mothurOut("The split.abund command parameters are list, name, cutoff, group, label, groups and accnos.\n");
+		m->mothurOut("The fasta and a list or name parameter are required, and you must provide a cutoff value.\n");
 		m->mothurOut("The cutoff parameter is used to qualify what is abundant and rare.\n");
 		m->mothurOut("The group parameter allows you to parse a group file into rare and abundant groups.\n");
 		m->mothurOut("The label parameter is used to read specific labels in your listfile you want to use.\n");
 		m->mothurOut("The accnos parameter allows you to output a .rare.accnos and .abund.accnos files to use with the get.seqs and remove.seqs commands.\n");
+		m->mothurOut("The groups parameter allows you to parse the files into rare and abundant files by group.  \n");
+		m->mothurOut("For example if you set groups=A-B-C, you will get a .A.abund, .A.rare, .B.abund, .B.rare, .C.abund, .C.rare files.  \n");
+		m->mothurOut("If you want .abund and .rare files for all groups, set groups=all.  \n");
 		m->mothurOut("The split.abund command should be used in the following format: split.abund(list=yourListFile, group=yourGroupFile, label=yourLabels, cutoff=yourCutoff).\n");
 		m->mothurOut("Example: split.abundt(list=abrecovery.fn.list, group=abrecovery.groups, label=0.03, cutoff=2).\n");
 		m->mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListfile).\n\n");
@@ -150,13 +175,25 @@ int SplitAbundCommand::execute(){
 	
 		if (abort == true) {	return 0;	}
 		
-		if (namefile != "") {  split();  }
-		else {
+		if (listfile != "") { //you are using a listfile to determine abundance
 		
 			//remove old files so you can append later....
 			string fileroot = outputDir + getRootName(getSimpleName(listfile));
-			remove((fileroot + "rare.list").c_str());
-			remove((fileroot + "abund.list").c_str());
+			if (Groups.size() == 0) {
+				remove((fileroot + "rare.list").c_str());
+				remove((fileroot + "abund.list").c_str());
+				
+				wroteListFile["rare"] = false;
+				wroteListFile["abund"] = false;
+			}else{
+				for (int i=0; i<Groups.size(); i++) {
+					remove((fileroot + Groups[i] + ".rare.list").c_str());
+					remove((fileroot + Groups[i] + ".abund.list").c_str());
+			
+					wroteListFile[(Groups[i] + ".rare")] = false;
+					wroteListFile[(Groups[i] + ".abund")] = false;
+				}
+			}
 			
 			//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
 			set<string> processedLabels;
@@ -166,6 +203,10 @@ int SplitAbundCommand::execute(){
 			list = input->getListVector();
 			string lastLabel = list->getLabel();
 			
+			//do you have a namefile or do we need to similate one?
+			if (namefile != "") {  readNamesFile();		}
+			else				{ createNameMap(list);	}
+			
 			if (m->control_pressed) { delete input; delete list; for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str()); } return 0; }
 			
 			while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
@@ -175,7 +216,7 @@ int SplitAbundCommand::execute(){
 				if(allLines == 1 || labels.count(list->getLabel()) == 1){
 						
 						m->mothurOut(list->getLabel()); m->mothurOutEndLine();
-						split(list);
+						splitList(list);
 											
 						processedLabels.insert(list->getLabel());
 						userLabels.erase(list->getLabel());
@@ -188,7 +229,7 @@ int SplitAbundCommand::execute(){
 						list = input->getListVector(lastLabel); //get new list vector to process
 						
 						m->mothurOut(list->getLabel()); m->mothurOutEndLine();
-						split(list);
+						splitList(list);
 						
 						processedLabels.insert(list->getLabel());
 						userLabels.erase(list->getLabel());
@@ -228,20 +269,36 @@ int SplitAbundCommand::execute(){
 				list = input->getListVector(lastLabel); //get new list vector to process
 				
 				m->mothurOut(list->getLabel()); m->mothurOutEndLine();
-				split(list);		
+				splitList(list);		
 				
 				delete list;
 			}
 			
 			delete input;
 			
+			for (map<string, bool>::iterator itBool = wroteListFile.begin(); itBool != wroteListFile.end(); itBool++) {
+				string filename = fileroot + itBool->first;
+				if (itBool->second) { //we wrote to this file
+					outputNames.push_back(filename);
+				}else{
+					remove(filename.c_str());
+				}
+			}
+			
 			if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str()); }	return 0;	}
+
+									
+		}else { //you are using the namefile to determine abundance
+
+			splitNames(); 
+			writeNames();
 			
-			if (wroteAbundList) {  outputNames.push_back(fileroot + "abund.list");		}
-			if (wroteRareList)	{  outputNames.push_back(fileroot + "rare.list");		}
+			string tag = "";
+			if (groupfile != "")				{  parseGroup(tag);		}
+			if (accnos)							{  writeAccnos(tag);	}
+			if (fastafile != "")				{  parseFasta(tag);		}
 		}
-		
-		
+
 		m->mothurOutEndLine();
 		m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i]); m->mothurOutEndLine();	}
@@ -255,185 +312,217 @@ int SplitAbundCommand::execute(){
 	}
 }
 /**********************************************************************************************************************/
-int SplitAbundCommand::split(ListVector* thisList) {
+int SplitAbundCommand::splitList(ListVector* thisList) {
 	try {
-	
-		SAbundVector* sabund = new SAbundVector();
-		*sabund = thisList->getSAbundVector();
+		rareNames.clear();
+		abundNames.clear();
 		
-		//find out how many bins are rare and how many are abundant so you can process the list vector one bin at a time
-		// and don't have to store the bins until you are done with the whole vector, this save alot of space.
-		int numRareBins = 0;
-		for (int i = 0; i <= sabund->getMaxRank(); i++) {
-			if (i > cutoff) { break; }
-			numRareBins += sabund->get(i);
-		}
-		int numAbundBins = thisList->getNumBins() - numRareBins;
-		delete sabund;
+		//get rareNames and abundNames
+		for (int i = 0; i < thisList->getNumBins(); i++) {
+			if (m->control_pressed) { return 0; }
+			
+			string bin = thisList->get(i);
+						
+			vector<string> names;
+			splitAtComma(bin, names);  //parses bin into individual sequence names
+			int size = names.size();
+				
+			if (size <= cutoff) {
+				for (int j = 0; j < names.size(); j++) {  rareNames.insert(names[j]);  }
+			}else{
+				for (int j = 0; j < names.size(); j++) {  abundNames.insert(names[j]);  }
+			}
+		}//end for
+
+		writeList(thisList);
+		
+		string tag = thisList->getLabel() + ".";
+		if (groupfile != "")				{  parseGroup(tag);		}
+		if (accnos)							{  writeAccnos(tag);	}
+		if (fastafile != "")				{  parseFasta(tag);		}
+		
+		return 0;
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitAbundCommand", "splitList");
+		exit(1);
+	}
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::writeList(ListVector* thisList) { 
+	try {
+		
+		map<string, ofstream*> filehandles;
 		
-		//setup output files
-		ofstream outListAbund;
-		ofstream outListRare;
-		ofstream outGroupRare;
-		ofstream outGroupAbund;
-		ofstream outAccnosRare;
-		ofstream outAccnosAbund;
+		if (Groups.size() == 0) {
+			SAbundVector* sabund = new SAbundVector();
+			*sabund = thisList->getSAbundVector();
 		
-		string fileroot = outputDir + getRootName(getSimpleName(listfile));
-		if (numRareBins > 0) {
-			wroteRareList = true;
-			string listRareName = fileroot + "rare.list";
-			openOutputFileAppend(listRareName, outListRare);
-			outListRare << thisList->getLabel() << '\t' << numRareBins << '\t';
+			//find out how many bins are rare and how many are abundant so you can process the list vector one bin at a time
+			// and don't have to store the bins until you are done with the whole vector, this save alot of space.
+			int numRareBins = 0;
+			for (int i = 0; i <= sabund->getMaxRank(); i++) {
+				if (i > cutoff) { break; }
+				numRareBins += sabund->get(i);
+			}
+			int numAbundBins = thisList->getNumBins() - numRareBins;
+			delete sabund;
+
+			ofstream aout;
+			ofstream rout;
+			
+			if (rareNames.size() != 0) {
+				string rare = outputDir + getRootName(getSimpleName(listfile))  + ".rare.list";
+				wroteListFile["rare"] = true;
+				openOutputFileAppend(rare, rout);
+				rout << thisList->getLabel() << '\t' << numRareBins << '\t';
+			}
 			
-			if (accnos) {
-				string accnosName = fileroot + thisList->getLabel() + ".rare.accnos";
-				openOutputFile(accnosName, outAccnosRare);
-				outputNames.push_back(accnosName);
+			if (abundNames.size() != 0) {
+				string abund = outputDir + getRootName(getSimpleName(listfile))  + ".abund.list";
+				wroteListFile["abund"] = true;
+				openOutputFileAppend(abund, aout);
+				rout << thisList->getLabel() << '\t' << numAbundBins << '\t';
 			}
+
+			for (int i = 0; i < thisList->getNumBins(); i++) {
+				if (m->control_pressed) { break; }
+			
+				string bin = list->get(i); 
 			
-			if (groupfile != "") {
-				string groupFileName = outputDir + getRootName(getSimpleName(groupfile)) + thisList->getLabel() + ".rare.group";
-				openOutputFile(groupFileName, outGroupRare);
-				outputNames.push_back(groupFileName);
+				int size = getNumNames(bin);
+			
+				if (size <= cutoff) {  rout << bin << '\t';  }
+				else				{  aout << bin << '\t'; }
 			}
-		}
-		
-		if (numAbundBins > 0) {
-			wroteAbundList = true;
-			string listAbundName = fileroot + "abund.list";
-			openOutputFileAppend(listAbundName, outListAbund);
-			outListAbund << thisList->getLabel() << '\t' << numAbundBins << '\t';
 			
-			if (accnos) {
-				string accnosName = fileroot + thisList->getLabel() + ".abund.accnos";
-				openOutputFile(accnosName, outAccnosAbund);
-				outputNames.push_back(accnosName);
+			if (rareNames.size() != 0)	{ rout << endl; rout.close(); }
+			if (abundNames.size() != 0) { aout << endl; aout.close(); }
+
+		}else{ //parse names by abundance and group
+			string fileroot =  outputDir + getRootName(getSimpleName(listfile));
+			ofstream* temp;
+			ofstream* temp2;
+			map<string, bool> wroteFile;
+			map<string, ofstream*> filehandles;
+			map<string, ofstream*>::iterator it3;
+
+			for (int i=0; i<Groups.size(); i++) {
+				temp = new ofstream;
+				filehandles[Groups[i]+".rare"] = temp;
+				temp2 = new ofstream;
+				filehandles[Groups[i]+".abund"] = temp2;
+				
+				openOutputFileAppend(fileroot + Groups[i] + ".rare.list", *(filehandles[Groups[i]+".rare"]));
+				openOutputFileAppend(fileroot + Groups[i] + ".abund.list", *(filehandles[Groups[i]+".abund"]));
 			}
 			
-			if (groupfile != "") {
-				string groupFileName = outputDir + getRootName(getSimpleName(groupfile)) + thisList->getLabel() + ".abund.group";
-				openOutputFile(groupFileName, outGroupAbund);
-				outputNames.push_back(groupFileName);
+			map<string, string> groupVector;
+			map<string, string>::iterator itGroup;
+			map<string, int> groupNumBins;
+		
+			for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+				groupNumBins[it3->first] = 0;
+				groupVector[it3->first] = "";
 			}
-		}
 		
-		for (int i = 0; i < thisList->getNumBins(); i++) {
-			if (m->control_pressed) { break; }
-			
-			string bin = list->get(i); 
+			for (int i = 0; i < thisList->getNumBins(); i++) {
+				if (m->control_pressed) { break; }
 			
-			int size = getNumNames(bin);
+				map<string, string> groupBins;
+				string bin = list->get(i); 
 			
-			if (size <= cutoff) {  outListRare << bin << '\t';  }
-			else				{  outListAbund << bin << '\t'; }
-			
-			if ((groupfile != "") || (accnos)) { //you need to parse the bin...
 				vector<string> names;
 				splitAtComma(bin, names);  //parses bin into individual sequence names
-				
+			
 				//parse bin into list of sequences in each group
 				for (int j = 0; j < names.size(); j++) {
-					
-					//write to accnos file
-					if (accnos) {
-						if (size <= cutoff) {  outAccnosRare << names[j] << endl;  }
-						else				{  outAccnosAbund << names[j] << endl; }
+					string rareAbund;
+					if (rareNames.count(names[j]) != 0) { //you are a rare name
+						rareAbund = ".rare";
+					}else{ //you are a abund name
+						rareAbund = ".abund";
 					}
 					
-					//write to groupfile
-					if (groupfile != "") {
-						string group = groupMap->getGroup(names[j]);
-					
-						if (group == "not found") {  //error in groupfile so close and remove output file and disregard groupfile
-							m->mothurOut(names[j] + " is not in your groupfile. disregarding groupfile."); m->mothurOutEndLine(); 
-							delete groupMap; 
-							if (numAbundBins > 0) { 
-								outGroupAbund.close();
-								remove((outputDir + getRootName(getSimpleName(groupfile)) + thisList->getLabel() + ".abund.group").c_str()); 
-							}
-							if (numRareBins > 0) { 
-								outGroupRare.close();
-								remove((outputDir + getRootName(getSimpleName(groupfile)) + thisList->getLabel() + ".rare.group").c_str());  
-							}
-							groupfile = "";
-						}else {
-							if (size <= cutoff) {  outGroupRare << names[j] << '\t' << group << endl;  }
-							else				{  outGroupAbund << names[j] << '\t' << group << endl; }
+					string group = groupMap->getGroup(names[j]);
+				
+					if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+						itGroup = groupBins.find(group+rareAbund);
+						if(itGroup == groupBins.end()) {
+							groupBins[group+rareAbund] = names[j];  //add first name
+							groupNumBins[group+rareAbund]++;
+						}else{ //add another name
+							groupBins[group+rareAbund] +=  "," + names[j];
 						}
+					}else if(group == "not found") {
+						m->mothurOut(names[j] + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
 					}
-					
-				}//end for names
-			}//end if parse
-		}//end for list
-		
-		
-		//close files
-		if (numRareBins > 0) {	
-			outListRare << endl;
-			outListRare.close();
-			if (accnos) {	outAccnosRare.close();	}
-			if (groupfile != "") {  outGroupRare.close();	}
+				}
+			
+			
+				for (itGroup = groupBins.begin(); itGroup != groupBins.end(); itGroup++) {
+					groupVector[itGroup->first] +=  itGroup->second + '\t'; 
+				}
+			}
+			
+			//end list vector
+			for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) {
+				(*(filehandles[it3->first])) << thisList->getLabel() << '\t' << groupNumBins[it3->first] << '\t' << groupVector[it3->first] << endl;  // label numBins  listvector for that group
+				wroteListFile[it3->first] = true;
+				(*(filehandles[it3->first])).close();
+				delete it3->second;
+			}
 		}
 		
-		if (numAbundBins > 0) {	
-			outListAbund << endl;
-			outListAbund.close();
-			if (accnos) {	outAccnosAbund.close();	}
-			if (groupfile != "") {  outGroupAbund.close();	}
-		}
-
 		return 0;
 
 	}
 	catch(exception& e) {
-		m->errorOut(e, "SplitAbundCommand", "split");
+		m->errorOut(e, "SplitAbundCommand", "writeList");
 		exit(1);
 	}
 }
 /**********************************************************************************************************************/
-int SplitAbundCommand::split() { //namefile
+int SplitAbundCommand::splitNames() { //namefile
 	try {
-		//setup output files
-		ofstream outNameAbund;
-		ofstream outNameRare;
-		ofstream outGroupRare;
-		ofstream outGroupAbund;
-		ofstream outAccnosRare;
-		ofstream outAccnosAbund;
 		
-		bool wroteNameAbund = false;
-		bool wroteNameRare = false;
-		bool wroteGroupRare = false;
-		bool wroteGroupAbund = false;
-		bool wroteAccnosRare = false;
-		bool wroteAccnosAbund = false;
-		
-		//prepare output files
-		string fileroot = outputDir + getRootName(getSimpleName(namefile));
-			
-		string nameRareName = fileroot + "rare.names";
-		openOutputFile(nameRareName, outNameRare);
-		string nameAbundName = fileroot + "abund.names";
-		openOutputFile(nameAbundName, outNameAbund);
+		rareNames.clear();
+		abundNames.clear();	
 			
-		if (accnos) {
-			string accnosName = fileroot + "rare.accnos";
-			openOutputFile(accnosName, outAccnosRare);
+		//open input file
+		ifstream in;
+		openInputFile(namefile, in);
+		
+		while (!in.eof()) {
+			if (m->control_pressed) { break; }
 			
-			accnosName = fileroot + "abund.accnos";
-			openOutputFile(accnosName, outAccnosAbund);
-		}
+			string firstCol, secondCol;
+			in >> firstCol >> secondCol; gobble(in);
 			
-		if (groupfile != "") {
-			string groupFileName = outputDir + getRootName(getSimpleName(groupfile))  + ".rare.group";
-			openOutputFile(groupFileName, outGroupRare);
+			nameMap[firstCol] = secondCol;
 			
-			groupFileName = outputDir + getRootName(getSimpleName(groupfile))  + ".abund.group";
-			openOutputFile(groupFileName, outGroupAbund);
+			int size = getNumNames(secondCol);
+				
+			if (size <= cutoff) {
+				rareNames.insert(firstCol); 
+			}else{
+				abundNames.insert(firstCol); 
+			}
 		}
-		
-		
+		in.close();
+				
+		return 0;
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitAbundCommand", "splitNames");
+		exit(1);
+	}
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::readNamesFile() { 
+	try {
 		//open input file
 		ifstream in;
 		openInputFile(namefile, in);
@@ -444,86 +533,470 @@ int SplitAbundCommand::split() { //namefile
 			string firstCol, secondCol;
 			in >> firstCol >> secondCol; gobble(in);
 			
-			int size = getNumNames(secondCol);
+			nameMap[firstCol] = secondCol;
+		}
+		in.close();
+				
+		return 0;
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitAbundCommand", "readNamesFile");
+		exit(1);
+	}
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::createNameMap(ListVector* thisList) {
+	try {
+		
+		if (thisList != NULL) {
+			for (int i = 0; i < thisList->getNumBins(); i++) {
+				if (m->control_pressed) { return 0; }
+				
+				string bin = thisList->get(i);
+							
+				vector<string> names;
+				splitAtComma(bin, names);  //parses bin into individual sequence names
 				
-			if (size <= cutoff) {  outNameRare << firstCol << '\t' << secondCol << endl;  wroteNameRare = true;  }
-			else				{  outNameAbund << firstCol << '\t' << secondCol << endl; wroteNameAbund = true; }
+				for (int j = 0; j < names.size(); j++) {  nameMap[names[j]] = names[j];  }
+			}//end for
+		}
+		
+		return 0;
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitAbundCommand", "createNameMap");
+		exit(1);
+	}
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::writeNames() { //namefile
+	try {
+		
+		map<string, ofstream*> filehandles;
+
+		if (Groups.size() == 0) {
+			ofstream aout;
+			ofstream rout;
+			
+			if (rareNames.size() != 0) {
+				string rare = outputDir + getRootName(getSimpleName(namefile))  + "rare.names";
+				openOutputFile(rare, rout);
+				outputNames.push_back(rare);
+				
+				for (set<string>::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) {
+					rout << (*itRare) << '\t' << nameMap[(*itRare)] << endl;
+				}
+				rout.close();
+			}
+			
+			if (abundNames.size() != 0) {
+				string abund = outputDir + getRootName(getSimpleName(namefile))  + "abund.names";
+				openOutputFile(abund, aout);
+				outputNames.push_back(abund);
+				
+				for (set<string>::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) {
+					aout << (*itAbund) << '\t' << nameMap[(*itAbund)] << endl;
+				}
+				aout.close();
+			}
+
+		}else{ //parse names by abundance and group
+			string fileroot =  outputDir + getRootName(getSimpleName(namefile));
+			ofstream* temp;
+			ofstream* temp2;
+			map<string, bool> wroteFile;
+			map<string, ofstream*> filehandles;
+			map<string, ofstream*>::iterator it3;
 
+			for (int i=0; i<Groups.size(); i++) {
+				temp = new ofstream;
+				filehandles[Groups[i]+".rare"] = temp;
+				temp2 = new ofstream;
+				filehandles[Groups[i]+".abund"] = temp2;
+				
+				openOutputFile(fileroot + Groups[i] + ".rare.names", *(filehandles[Groups[i]+".rare"]));
+				openOutputFile(fileroot + Groups[i] + ".abund.names", *(filehandles[Groups[i]+".abund"]));
+				
+				wroteFile[Groups[i] + ".rare"] = false;
+				wroteFile[Groups[i] + ".abund"] = false;
+			}
 			
-			if ((groupfile != "") || (accnos)) { //you need to parse the bin...
+			for (map<string, string>::iterator itName = nameMap.begin(); itName != nameMap.end(); itName++) {				
 				vector<string> names;
-				splitAtComma(secondCol, names);  //parses bin into individual sequence names
+				splitAtComma(itName->second, names);  //parses bin into individual sequence names
 				
-				//parse bin into list of sequences in each group
-				for (int j = 0; j < names.size(); j++) {
+				string rareAbund;
+				if (rareNames.count(itName->first) != 0) { //you are a rare name
+						rareAbund = ".rare";
+				}else{ //you are a abund name
+						rareAbund = ".abund";
+				}
+				
+				map<string, string> outputStrings;
+				map<string, string>::iterator itout;
+				for (int i = 0; i < names.size(); i++) {
 					
-					//write to accnos file
-					if (accnos) {
-						if (size <= cutoff) {  outAccnosRare << names[j] << endl;  wroteAccnosRare = true; }
-						else				{  outAccnosAbund << names[j] << endl; wroteAccnosAbund = true; }
+					string group = groupMap->getGroup(names[i]);
+					
+					if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+						itout = outputStrings.find(group+rareAbund);
+						if (itout == outputStrings.end()) {  
+							outputStrings[group+rareAbund] = names[i] + '\t' + names[i];
+						}else {   outputStrings[group+rareAbund] += "," + names[i]; }
+					}else if(group == "not found") {
+						m->mothurOut(names[i] + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
 					}
+				}
+				
+				for (itout = outputStrings.begin(); itout != outputStrings.end(); itout++) { 
+					*(filehandles[itout->first]) << itout->second << endl;
+					wroteFile[itout->first] = true;
+				}
+			}
+			
+			
+			for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { 
+				(*(filehandles[it3->first])).close();
+				if (wroteFile[it3->first] == true) {  outputNames.push_back(fileroot + it3->first + ".names");  }
+				else { remove((it3->first).c_str()); }
+				delete it3->second;
+			}
+		}
+				
+		return 0;
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitAbundCommand", "writeNames");
+		exit(1);
+	}
+}
+/**********************************************************************************************************************/
+//just write the unique names - if a namesfile is given
+int SplitAbundCommand::writeAccnos(string tag) { 
+	try {
+		
+		map<string, ofstream*> filehandles;
+		
+		if (Groups.size() == 0) {
+			ofstream aout;
+			ofstream rout;
+			
+			if (rareNames.size() != 0) {
+				string rare = outputDir + getRootName(getSimpleName(inputFile))  + tag + "rare.accnos";
+				openOutputFile(rare, rout);
+				outputNames.push_back(rare);
+				
+				for (set<string>::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) {
+					rout << (*itRare) << endl;
+				}
+				rout.close();
+			}
+			
+			if (abundNames.size() != 0) {
+				string abund = outputDir + getRootName(getSimpleName(inputFile)) + tag  + "abund.accnos";
+				openOutputFile(abund, aout);
+				outputNames.push_back(abund);
+				
+				for (set<string>::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) {
+					aout << (*itAbund) << endl;
+				}
+				aout.close();
+			}
+		}else{ //parse names by abundance and group
+			string fileroot =  outputDir + getRootName(getSimpleName(inputFile));
+			ofstream* temp;
+			ofstream* temp2;
+			map<string, bool> wroteFile;
+			map<string, ofstream*> filehandles;
+			map<string, ofstream*>::iterator it3;
+			
+			for (int i=0; i<Groups.size(); i++) {
+				temp = new ofstream;
+				filehandles[Groups[i]+".rare"] = temp;
+				temp2 = new ofstream;
+				filehandles[Groups[i]+".abund"] = temp2;
+				
+				openOutputFile(fileroot + tag + Groups[i] + ".rare.accnos", *(filehandles[Groups[i]+".rare"]));
+				openOutputFile(fileroot + tag + Groups[i] + ".abund.accnos", *(filehandles[Groups[i]+".abund"]));
+				
+				wroteFile[Groups[i] + ".rare"] = false;
+				wroteFile[Groups[i] + ".abund"] = false;
+			}
+			
+			//write rare
+			for (set<string>::iterator itRare = rareNames.begin(); itRare != rareNames.end(); itRare++) {
+					string group = groupMap->getGroup(*itRare);
 					
-					//write to groupfile
-					if (groupfile != "") {
-						string group = groupMap->getGroup(names[j]);
+					if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+						*(filehandles[group+".rare"]) << *itRare << endl;
+						wroteFile[group+".rare"] = true;
+					}
+			}
+				
+			//write abund	
+			for (set<string>::iterator itAbund = abundNames.begin(); itAbund != abundNames.end(); itAbund++) {
+					string group = groupMap->getGroup(*itAbund);
 					
-						if (group == "not found") {  //error in groupfile so close and remove output file and disregard groupfile
-							m->mothurOut(names[j] + " is not in your groupfile. disregarding groupfile."); m->mothurOutEndLine(); 
-							delete groupMap; 
-						
-							outGroupAbund.close();
-							remove((outputDir + getRootName(getSimpleName(groupfile))  + ".abund.group").c_str()); 
-							outGroupRare.close();
-							remove((outputDir + getRootName(getSimpleName(groupfile)) + ".rare.group").c_str());  
-							
-							groupfile = "";
-							wroteGroupRare = false;
-							wroteGroupAbund = false;
-						}else {
-							if (size <= cutoff) {  outGroupRare << names[j] << '\t' << group << endl;  wroteGroupRare = true; }
-							else				{  outGroupAbund << names[j] << '\t' << group << endl; wroteGroupAbund = true; }
+					if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+						*(filehandles[group+".abund"]) << *itAbund << endl;
+						wroteFile[group+".abund"] = true;
+					}
+			}
+			
+			//close files
+			for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { 
+				(*(filehandles[it3->first])).close();
+				if (wroteFile[it3->first] == true) {  outputNames.push_back(fileroot + tag + it3->first + ".accnos");  }
+				else { remove((fileroot + tag + it3->first + ".accnos").c_str()); }
+				delete it3->second;
+			}
+		}
+				
+		return 0;
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitAbundCommand", "writeAccnos");
+		exit(1);
+	}
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::parseGroup(string tag) { //namefile
+	try {
+		
+		map<string, ofstream*> filehandles;
+	
+		if (Groups.size() == 0) {
+			ofstream aout;
+			ofstream rout;
+			
+			if (rareNames.size() != 0) {
+				string rare = outputDir + getRootName(getSimpleName(groupfile))  + tag + "rare.groups";
+				openOutputFile(rare, rout);
+				outputNames.push_back(rare);
+			}
+			
+			if (abundNames.size() != 0) {
+				string abund = outputDir + getRootName(getSimpleName(groupfile))  + tag + "abund.groups";
+				openOutputFile(abund, aout);
+				outputNames.push_back(abund);
+			}
+
+				
+			for (map<string, string>::iterator itName = nameMap.begin(); itName != nameMap.end(); itName++) {				
+				vector<string> names;
+				splitAtComma(itName->second, names);  //parses bin into individual sequence names
+				
+				for (int i = 0; i < names.size(); i++) {
+				
+					string group = groupMap->getGroup(names[i]);
+				
+					if (group == "not found") { 
+						m->mothurOut(names[i] + " is not in your groupfile, ignoring, please correct."); m->mothurOutEndLine();
+					}else {
+						if (rareNames.count(itName->first) != 0) { //you are a rare name
+							rout << names[i] << '\t' << group << endl;
+						}else{ //you are a abund name
+							rout << names[i] << '\t' << group << endl;
 						}
 					}
-					
-				}//end for names
-			}//end if parse
-		}//end while
+				}
+			}
+			
+			if (rareNames.size() != 0)	{ rout.close(); }
+			if (abundNames.size() != 0) { aout.close(); }
+
+		}else{ //parse names by abundance and group
+			string fileroot =  outputDir + getRootName(getSimpleName(groupfile));
+			ofstream* temp;
+			ofstream* temp2;
+			map<string, bool> wroteFile;
+			map<string, ofstream*> filehandles;
+			map<string, ofstream*>::iterator it3;
+
+			for (int i=0; i<Groups.size(); i++) {
+				temp = new ofstream;
+				filehandles[Groups[i]+".rare"] = temp;
+				temp2 = new ofstream;
+				filehandles[Groups[i]+".abund"] = temp2;
+				
+				openOutputFile(fileroot + tag + Groups[i] + ".rare.groups", *(filehandles[Groups[i]+".rare"]));
+				openOutputFile(fileroot + tag + Groups[i] + ".abund.groups", *(filehandles[Groups[i]+".abund"]));
+				
+				wroteFile[Groups[i] + ".rare"] = false;
+				wroteFile[Groups[i] + ".abund"] = false;
+			}
+			
+			for (map<string, string>::iterator itName = nameMap.begin(); itName != nameMap.end(); itName++) {				
+				vector<string> names;
+				splitAtComma(itName->second, names);  //parses bin into individual sequence names
+				
+				string rareAbund;
+				if (rareNames.count(itName->first) != 0) { //you are a rare name
+					rareAbund = ".rare";
+				}else{ //you are a abund name
+					rareAbund = ".abund";
+				}
+				
+				for (int i = 0; i < names.size(); i++) {
+				
+					string group = groupMap->getGroup(names[i]);
+									
+					if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+						*(filehandles[group+rareAbund]) << names[i] << '\t' << group << endl;
+						wroteFile[group+rareAbund] = true;
+					}
+				}
+			}
+			
+			for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { 
+				(*(filehandles[it3->first])).close();
+				if (wroteFile[it3->first] == true) {  outputNames.push_back(fileroot + tag + it3->first + ".groups");  }
+				else { remove((fileroot + tag + it3->first + ".groups").c_str()); }
+				delete it3->second;
+			}
+		}
+				
+		return 0;
+
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitAbundCommand", "parseGroups");
+		exit(1);
+	}
+}
+/**********************************************************************************************************************/
+int SplitAbundCommand::parseFasta(string tag) { //namefile
+	try {
 		
+		map<string, ofstream*> filehandles;
 		
-		//close files
-		in.close();
-		outNameRare.close();
-		outNameAbund.close();
-		if (!wroteNameRare) { remove((fileroot + "rare.names").c_str());  }
-		else { outputNames.push_back((fileroot + "rare.names"));  }
-		if (!wroteNameAbund) { remove((fileroot + "abund.names").c_str());  }
-		else { outputNames.push_back((fileroot + "abund.names"));  }
+		if (Groups.size() == 0) {
+			ofstream aout;
+			ofstream rout;
+			
+			if (rareNames.size() != 0) {
+				string rare = outputDir + getRootName(getSimpleName(fastafile))  + tag + "rare.fasta";
+				openOutputFile(rare, rout);
+				outputNames.push_back(rare);
+			}
+			
+			if (abundNames.size() != 0) {
+				string abund = outputDir + getRootName(getSimpleName(fastafile))  + tag + "abund.fasta";
+				openOutputFile(abund, aout);
+				outputNames.push_back(abund);
+			}
+
+				
+			//open input file
+			ifstream in;
+			openInputFile(fastafile, in);
+	
+			while (!in.eof()) {
+				if (m->control_pressed) { break; }
 		
-		if (groupfile != "") {  
-			outGroupRare.close();	 outGroupAbund.close();
-			if (!wroteGroupRare) { remove((outputDir + getRootName(getSimpleName(groupfile))  + ".rare.group").c_str());  }
-			else { outputNames.push_back((outputDir + getRootName(getSimpleName(groupfile))  + ".rare.group"));  }
-			if (!wroteGroupAbund) { remove((outputDir + getRootName(getSimpleName(groupfile))  + ".abund.group").c_str());  }
-			else { outputNames.push_back((outputDir + getRootName(getSimpleName(groupfile))  + ".abund.group"));  }
-		}
+				Sequence seq(in); gobble(in);
+				
+				if (seq.getName() != "") { 
+					
+					map<string, string>::iterator itNames;
+					
+					itNames = nameMap.find(seq.getName());
+					
+					if (itNames == nameMap.end()) {
+						m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine();
+					}else{
+						if (rareNames.count(seq.getName()) != 0) { //you are a rare name
+							seq.printSequence(rout);
+						}else{ //you are a abund name
+							seq.printSequence(aout);
+						}
+					}
+				}
+			}
+			in.close();
+			if (rareNames.size() != 0)	{ rout.close(); }
+			if (abundNames.size() != 0) { aout.close(); }
+
+		}else{ //parse names by abundance and group
+			string fileroot =  outputDir + getRootName(getSimpleName(fastafile));
+			ofstream* temp;
+			ofstream* temp2;
+			map<string, bool> wroteFile;
+			map<string, ofstream*> filehandles;
+			map<string, ofstream*>::iterator it3;
+
+			for (int i=0; i<Groups.size(); i++) {
+				temp = new ofstream;
+				filehandles[Groups[i]+".rare"] = temp;
+				temp2 = new ofstream;
+				filehandles[Groups[i]+".abund"] = temp2;
+				
+				openOutputFile(fileroot + tag + Groups[i] + ".rare.fasta", *(filehandles[Groups[i]+".rare"]));
+				openOutputFile(fileroot + tag + Groups[i] + ".abund.fasta", *(filehandles[Groups[i]+".abund"]));
+				
+				wroteFile[Groups[i] + ".rare"] = false;
+				wroteFile[Groups[i] + ".abund"] = false;
+			}
+			
+			//open input file
+			ifstream in;
+			openInputFile(fastafile, in);
 	
-		if (accnos) {	
-			outAccnosAbund.close();	outAccnosRare.close();
-			if (!wroteAccnosRare) { remove((fileroot + "rare.accnos").c_str());  }
-			else { outputNames.push_back((fileroot + "rare.accnos"));  }
-			if (!wroteAccnosAbund) { remove((fileroot + "abund.accnos").c_str());  }
-			else { outputNames.push_back((fileroot + "abund.accnos"));  }
+			while (!in.eof()) {
+				if (m->control_pressed) { break; }
+		
+				Sequence seq(in); gobble(in);
+				
+				if (seq.getName() != "") { 
+					map<string, string>::iterator itNames = nameMap.find(seq.getName());
+					
+					if (itNames == nameMap.end()) {
+						m->mothurOut(seq.getName() + " is not in your namesfile, ignoring."); m->mothurOutEndLine();
+					}else{
+						vector<string> names;
+						splitAtComma(itNames->second, names);  //parses bin into individual sequence names
+				
+						string rareAbund;
+						if (rareNames.count(itNames->first) != 0) { //you are a rare name
+							rareAbund = ".rare";
+						}else{ //you are a abund name
+							rareAbund = ".abund";
+						}
+				
+						for (int i = 0; i < names.size(); i++) {
+				
+							string group = groupMap->getGroup(seq.getName());
+					
+							if (inUsersGroups(group, Groups)) { //only add if this is in a group we want
+								seq.printSequence(*(filehandles[group+rareAbund]));
+								wroteFile[group+rareAbund] = true;
+							}else if(group == "not found") {
+								m->mothurOut(seq.getName() + " is not in your groupfile. Ignoring."); m->mothurOutEndLine();
+							}
+						}
+					}
+				}
+			}
+			in.close();
+			
+			for (it3 = filehandles.begin(); it3 != filehandles.end(); it3++) { 
+				(*(filehandles[it3->first])).close();
+				if (wroteFile[it3->first] == true) {  outputNames.push_back(fileroot + tag + it3->first + ".fasta");  }
+				else { remove((fileroot + tag + it3->first + ".fasta").c_str()); }
+				delete it3->second;
+			}
 		}
-						
+				
 		return 0;
 
 	}
 	catch(exception& e) {
-		m->errorOut(e, "SplitAbundCommand", "split");
+		m->errorOut(e, "SplitAbundCommand", "parseFasta");
 		exit(1);
 	}
 }
-
 /**********************************************************************************************************************/
 
-
diff --git a/splitabundcommand.h b/splitabundcommand.h
index 8d2ab2e..fb88380 100644
--- a/splitabundcommand.h
+++ b/splitabundcommand.h
@@ -34,18 +34,28 @@ public:
 
 	
 private:
-	int split(ListVector*);
-	int split(); //namefile
+	int splitList(ListVector*);
+	int splitNames(); //namefile
+	int writeNames(); 
+	int writeList(ListVector*); 
+	int writeAccnos(string); 
+	int parseGroup(string); 
+	int parseFasta(string); 
+	int readNamesFile(); //namefile
+	int createNameMap(ListVector*);
 	
 	vector<string> outputNames;
 	ListVector* list;
 	GroupMap* groupMap;
 	InputData* input;
 	
-	string outputDir, listfile, namefile, groupfile, label;
-	set<string> labels;
-	bool abort, allLines, accnos, wroteRareList, wroteAbundList;
+	string outputDir, listfile, namefile, groupfile, label, groups, fastafile, inputFile;
+	set<string> labels, rareNames, abundNames;
+	vector<string> Groups;
+	bool abort, allLines, accnos;
 	int cutoff;
+	map<string, bool> wroteListFile;
+	map<string, string> nameMap;
 	
 	
 
diff --git a/splitmatrix.cpp b/splitmatrix.cpp
index 718c8a3..0929cb0 100644
--- a/splitmatrix.cpp
+++ b/splitmatrix.cpp
@@ -12,13 +12,14 @@
 
 /***********************************************************************/
 
-SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t){
+SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, string t, bool l){
 	m = MothurOut::getInstance();
 	distFile = distfile;
 	cutoff = c;
 	namefile = name;
 	method = t;
 	taxFile = tax;
+	large = l;
 }
 
 /***********************************************************************/
@@ -48,10 +49,185 @@ int SplitMatrix::split(){
 int SplitMatrix::splitDistance(){
 	try {
         
-		vector<set<string> > groups;
+		if (large)	{ splitDistanceLarge(); }
+		else		{ splitDistanceRAM();	}
+			
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitMatrix", "splitDistance");
+		exit(1);
+	}
+}
+
+/***********************************************************************/
+int SplitMatrix::splitClassify(){
+	try {
+		cutoff = int(cutoff);
+		
+		map<string, int> seqGroup;
+		map<string, int>::iterator it;
+		map<string, int>::iterator it2;
+		
+		int numGroups = 0;
+		
+		//build tree from users taxonomy file
+		PhyloTree* phylo = new PhyloTree();
+		
+		ifstream in;
+		openInputFile(taxFile, in);
+			
+		//read in users taxonomy file and add sequences to tree
+		string seqname, tax;
+		while(!in.eof()){
+			in >> seqname >> tax; gobble(in);
+				
+			phylo->addSeqToTree(seqname, tax);
+		}
+		in.close();
+		
+		phylo->assignHeirarchyIDs(0);
+
+		//make sure the cutoff is not greater than maxlevel
+		if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
+		
+		//for each node in tree
+		for (int i = 0; i < phylo->getNumNodes(); i++) {
+		
+			//is this node within the cutoff
+			TaxNode taxon = phylo->get(i);
+		
+			if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
+				if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
+					for (int j = 0; j < taxon.accessions.size(); j++) {
+						seqGroup[taxon.accessions[j]] = numGroups;
+					}
+					numGroups++;
+				}
+			}
+		}
+
+		ifstream dFile;
+		openInputFile(distFile, dFile);
+		ofstream outFile;
+		
+		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+			remove((distFile + "." + toString(i) + ".temp").c_str());
+		}
+		
 		
 		//for buffering the io to improve speed
 		 //allow for 10 dists to be stored, then output.
+		vector<string> outputs;  outputs.resize(numGroups, "");
+		vector<int> numOutputs;	 numOutputs.resize(numGroups, 0);	
+		
+		//for each distance
+		while(dFile){
+			string seqA, seqB;
+			float dist;
+			
+			if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());	} }
+			
+			dFile >> seqA >> seqB >> dist;  gobble(dFile);
+			
+			//if both sequences are in the same group then they are within the cutoff
+			it = seqGroup.find(seqA);
+			it2 = seqGroup.find(seqB);
+			
+			if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
+				if (it->second == it2->second) { //they are from the same group so add the distance
+					if (numOutputs[it->second] > 10) {
+						openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
+						outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
+						outFile.close();
+						outputs[it->second] = "";
+						numOutputs[it->second] = 0;
+					}else{
+						outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
+						numOutputs[it->second]++;
+					}
+				}
+			}
+		}
+		dFile.close();
+	
+		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+			remove((namefile + "." + toString(i) + ".temp").c_str());
+			
+			//write out any remaining buffers
+			if (numOutputs[it->second] > 0) {
+				openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
+				outFile << outputs[i];
+				outFile.close();
+				outputs[i] = "";
+				numOutputs[i] = 0;
+			}
+		}
+		
+		ifstream bigNameFile;
+		openInputFile(namefile, bigNameFile);
+		
+		singleton = namefile + ".extra.temp";
+		ofstream remainingNames;
+		openOutputFile(singleton, remainingNames);
+		
+		bool wroteExtra = false;
+						
+		string name, nameList;
+		while(!bigNameFile.eof()){
+			bigNameFile >> name >> nameList;  gobble(bigNameFile);
+			
+			//did this sequence get assigned a group
+			it = seqGroup.find(name);
+			
+			if (it != seqGroup.end()) {  
+				openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
+				outFile << name << '\t' << nameList << endl;
+				outFile.close();
+			}else{
+				wroteExtra = true;
+				remainingNames << name << '\t' << nameList << endl;
+			}
+		}
+		bigNameFile.close();
+		remainingNames.close();
+		
+		if (!wroteExtra) { 
+			remove(singleton.c_str());
+			singleton = "none";
+		}
+			
+		for(int i=0;i<numGroups;i++){
+			string tempNameFile = namefile + "." + toString(i) + ".temp";
+			string tempDistFile = distFile + "." + toString(i) + ".temp";
+				
+			map<string, string> temp;
+			temp[tempDistFile] = tempNameFile;
+			dists.push_back(temp);
+		}
+		
+		if (m->control_pressed)	 {  
+			for (int i = 0; i < dists.size(); i++) { 
+				remove((dists[i].begin()->first).c_str());
+				remove((dists[i].begin()->second).c_str());
+			}
+			dists.clear();
+		}
+		
+		return 0;
+			
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitMatrix", "splitClassify");
+		exit(1);
+	}
+}
+/***********************************************************************/
+int SplitMatrix::splitDistanceLarge(){
+	try {
+		vector<set<string> > groups;
+		
+		//for buffering the io to improve speed
+		 //allow for 30 dists to be stored, then output.
 		vector<string> outputs;
 		vector<int> numOutputs;
 		vector<bool> wroteOutPut;
@@ -68,14 +244,13 @@ int SplitMatrix::splitDistance(){
 
 			dFile >> seqA >> seqB >> dist;
 			
-			if (m->control_pressed) {  outFile.close(); dFile.close();  for(int i=0;i<numGroups;i++){	if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
+			if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){	if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
 					
 			if(dist < cutoff){
 				//cout << "in cutoff: " << dist << endl;
 				int groupIDA = -1;
 				int groupIDB = -1;
 				int groupID = -1;
-				int prevGroupID = -1;
 				
 				for(int i=0;i<numGroups;i++){
 					set<string>::iterator aIt = groups[i].find(seqA);
@@ -123,10 +298,6 @@ int SplitMatrix::splitDistance(){
 					newGroup.insert(seqB);
 					groups.push_back(newGroup);
 									
-					outFile.close();
-					string fileName = distFile + "." + toString(numGroups) + ".temp";
-					outFile.open(fileName.c_str(), ios::ate);
-
 					string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
 					outputs.push_back(tempOut);
 					numOutputs.push_back(1);
@@ -136,16 +307,13 @@ int SplitMatrix::splitDistance(){
 				}
 				else{
 					string fileName = distFile + "." + toString(groupID) + ".temp";
-					
-					if(groupID != prevGroupID){
-						outFile.close();
-						outFile.open(fileName.c_str(), ios::app);
-						prevGroupID	= groupID;
-					}
-					
+											
 					//have we reached the max buffer size
-					if (numOutputs[groupID] > 10) { //write out sequence
+					if (numOutputs[groupID] > 60) { //write out sequence
+						outFile.open(fileName.c_str(), ios::app);
 						outFile << outputs[groupID] << seqA << '\t' << seqB << '\t' << dist << endl;
+						outFile.close();
+						
 						outputs[groupID] = "";
 						numOutputs[groupID] = 0;
 						wroteOutPut[groupID] = true;
@@ -157,13 +325,20 @@ int SplitMatrix::splitDistance(){
 					if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
 						string row, column, distance;
 						if(groupIDA<groupIDB){
-			
+							
+							//merge memory
 							numOutputs[groupID] += numOutputs[groupIDB];
 							outputs[groupID] += outputs[groupIDB];
 							
+							outputs[groupIDB] = "";
+							numOutputs[groupIDB] = 0;
+							
+							//if groupB is written to file it is above buffer size so read and write to new merged file
 							if (wroteOutPut[groupIDB]) {
-								string fileName = distFile + "." + toString(groupIDB) + ".temp";
-								ifstream fileB(fileName.c_str(), ios::ate);
+								string fileName2 = distFile + "." + toString(groupIDB) + ".temp";
+								ifstream fileB(fileName2.c_str(), ios::ate);
+								
+								outFile.open(fileName.c_str(), ios::app);
 								
 								long size;
 								char* memblock;
@@ -197,30 +372,33 @@ int SplitMatrix::splitDistance(){
 								delete memblock;
 								
 								fileB.close();
-								remove(fileName.c_str());
+								remove(fileName2.c_str());
+								
+								//write out the merged memory
+								if (numOutputs[groupID] > 60) {
+									outFile << outputs[groupID];
+									outputs[groupID] = "";
+									numOutputs[groupID] = 0;
+								}
+								
+								outFile.close();
 								
 								wroteOutPut[groupID] = true;
 								wroteOutPut[groupIDB] = false;
-							}
-							
-							if (numOutputs[groupID] != 0) {
-								outFile << outputs[groupID];
-								wroteOutPut[groupID] = true;
-								outputs[groupID] = "";
-								numOutputs[groupID] = 0;
-								
-								outputs[groupIDB] = "";
-								numOutputs[groupIDB] = 0;
-							}
-							
+							}else{ } //just merge b's memory with a's memory 
 						}
 						else{
 							numOutputs[groupID] += numOutputs[groupIDA];
 							outputs[groupID] += outputs[groupIDA];
 							
+							outputs[groupIDA] = "";
+							numOutputs[groupIDA] = 0;
+							
 							if (wroteOutPut[groupIDA]) {
-								string fileName = distFile + "." + toString(groupIDA) + ".temp";
-								ifstream fileB(fileName.c_str(), ios::ate);
+								string fileName2 = distFile + "." + toString(groupIDA) + ".temp";
+								ifstream fileB(fileName2.c_str(), ios::ate);
+								
+								outFile.open(fileName.c_str(), ios::app);
 								
 								long size;
 								char* memblock;
@@ -254,29 +432,26 @@ int SplitMatrix::splitDistance(){
 								delete memblock;
 								
 								fileB.close();
-								remove(fileName.c_str());
+								remove(fileName2.c_str());
+								
+								//write out the merged memory
+								if (numOutputs[groupID] > 60) {
+									outFile << outputs[groupID];
+									outputs[groupID] = "";
+									numOutputs[groupID] = 0;
+								}
+								
+								outFile.close();
 								
 								wroteOutPut[groupID] = true;
 								wroteOutPut[groupIDA] = false;
-							}
-							
-							if (numOutputs[groupID] != 0) {
-								outFile << outputs[groupID];
-								wroteOutPut[groupID] = true;
-								outputs[groupID] = "";
-								numOutputs[groupID] = 0;
-								
-								outputs[groupIDA] = "";
-								numOutputs[groupIDA] = 0;
-							}
-
+							}else { } //just merge memory
 						}					
 					}
 				}
 			}
 			gobble(dFile);
 		}
-		outFile.close();
 		dFile.close();
 		
 		for (int i = 0; i < numGroups; i++) {
@@ -287,6 +462,20 @@ int SplitMatrix::splitDistance(){
 				outFile.close();
 			}
 		}
+
+		splitNames(groups);
+				
+		return 0;			
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SplitMatrix", "splitDistanceLarge");
+		exit(1);
+	}
+}
+//********************************************************************************************************************
+int SplitMatrix::splitNames(vector<set<string> >& groups){
+	try {
+		int numGroups = groups.size();
 	
 		ifstream bigNameFile(namefile.c_str());
 		if(!bigNameFile){
@@ -353,173 +542,124 @@ int SplitMatrix::splitDistance(){
 		}
 		
 		return 0;
-			
 	}
 	catch(exception& e) {
-		m->errorOut(e, "SplitMatrix", "splitDistance");
+		m->errorOut(e, "SplitMatrix", "splitNames");
 		exit(1);
 	}
 }
-
-/***********************************************************************/
-int SplitMatrix::splitClassify(){
+//********************************************************************************************************************
+int SplitMatrix::splitDistanceRAM(){
 	try {
-		cutoff = int(cutoff);
-		
-		map<string, int> seqGroup;
-		map<string, int>::iterator it;
-		map<string, int>::iterator it2;
+		vector<set<string> > groups;
+		vector<string> outputs;
 		
 		int numGroups = 0;
-		
-		//build tree from users taxonomy file
-		PhyloTree* phylo = new PhyloTree();
-		
-		ifstream in;
-		openInputFile(taxFile, in);
-			
-		//read in users taxonomy file and add sequences to tree
-		string seqname, tax;
-		while(!in.eof()){
-			in >> seqname >> tax; gobble(in);
-				
-			phylo->addSeqToTree(seqname, tax);
-		}
-		in.close();
-		
-		phylo->assignHeirarchyIDs(0);
-
-		//make sure the cutoff is not greater than maxlevel
-		if (cutoff > phylo->getMaxLevel()) { m->mothurOut("splitcutoff is greater than the longest taxonomy, using " + toString(phylo->getMaxLevel())); m->mothurOutEndLine(); cutoff = phylo->getMaxLevel(); }
-		
-		//for each node in tree
-		for (int i = 0; i < phylo->getNumNodes(); i++) {
-		
-			//is this node within the cutoff
-			TaxNode taxon = phylo->get(i);
-		
-			if (taxon.level == cutoff) {//if yes, then create group containing this nodes sequences
-				if (taxon.accessions.size() > 1) { //if this taxon just has one seq its a singleton
-					for (int j = 0; j < taxon.accessions.size(); j++) {
-						seqGroup[taxon.accessions[j]] = numGroups;
-					}
-					numGroups++;
-				}
-			}
-		}
 
 		ifstream dFile;
 		openInputFile(distFile, dFile);
-		ofstream outFile;
-		
-		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
-			remove((distFile + "." + toString(i) + ".temp").c_str());
-		}
-		
-		
-		//for buffering the io to improve speed
-		 //allow for 10 dists to be stored, then output.
-		vector<string> outputs;  outputs.resize(numGroups, "");
-		vector<int> numOutputs;	 numOutputs.resize(numGroups, 0);	
-		
-		//for each distance
+
 		while(dFile){
 			string seqA, seqB;
 			float dist;
+
+			dFile >> seqA >> seqB >> dist;
 			
-			if (m->control_pressed) { dFile.close(); for (int i = 0; i < numGroups; i++) { remove((distFile + "." + toString(i) + ".temp").c_str());	} }
-			
-			dFile >> seqA >> seqB >> dist;  gobble(dFile);
-			
-			//if both sequences are in the same group then they are within the cutoff
-			it = seqGroup.find(seqA);
-			it2 = seqGroup.find(seqB);
-			
-			if ((it != seqGroup.end()) && (it2 != seqGroup.end())) { //they are both not singletons 
-				if (it->second == it2->second) { //they are from the same group so add the distance
-					if (numOutputs[it->second] > 10) {
-						openOutputFileAppend((distFile + "." + toString(it->second) + ".temp"), outFile);
-						outFile << outputs[it->second] << seqA << '\t' << seqB << '\t' << dist << endl;
-						outFile.close();
-						outputs[it->second] = "";
-						numOutputs[it->second] = 0;
-					}else{
-						outputs[it->second] += seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
-						numOutputs[it->second]++;
+			if (m->control_pressed) {   dFile.close();  for(int i=0;i<numGroups;i++){	if(groups[i].size() > 0){  remove((distFile + "." + toString(i) + ".temp").c_str()); }  } return 0; }
+					
+			if(dist < cutoff){
+				//cout << "in cutoff: " << dist << endl;
+				int groupIDA = -1;
+				int groupIDB = -1;
+				int groupID = -1;
+				
+				for(int i=0;i<numGroups;i++){
+					set<string>::iterator aIt = groups[i].find(seqA);
+					set<string>::iterator bIt = groups[i].find(seqB);
+					
+					if(groupIDA == -1 && aIt != groups[i].end()){//seqA is not already assigned to a group and is in group[i], so assign seqB to group[i]
+						groups[i].insert(seqB);
+						groupIDA = i;
+						groupID = groupIDA;
+
+						//cout << "in aIt: " << groupID << endl;
+	//					break;
+					}
+					else if(groupIDB == -1 && bIt != groups[i].end()){//seqB is not already assigned to a group and is in group[i], so assign seqA to group[i]
+						groups[i].insert(seqA);
+						groupIDB = i;
+						groupID = groupIDB;
+
+					//	cout << "in bIt: " << groupID << endl;
+	//					break;
+					}
+				
+					if(groupIDA != -1 && groupIDB != -1){//both ifs above have been executed, so we need to decide who to assign them to
+						if(groupIDA < groupIDB){
+						//	cout << "A: " << groupIDA << "\t" << groupIDB << endl;
+							groups[groupIDA].insert(groups[groupIDB].begin(), groups[groupIDB].end()); //merge two groups into groupIDA
+							groups[groupIDB].clear(); 
+							groupID = groupIDA;
+						}
+						else{
+						//	cout << "B: " << groupIDA << "\t" << groupIDB << endl;
+							groups[groupIDB].insert(groups[groupIDA].begin(), groups[groupIDA].end()); //merge two groups into groupIDB
+							groups[groupIDA].clear();  
+							groupID = groupIDB;
+						}
+						break;
+					}
+				}
+				
+	//windows is gonna gag on the reuse of outFile, will need to make it local...
+				
+				if(groupIDA == -1 && groupIDB == -1){ //we need a new group
+					set<string> newGroup;
+					newGroup.insert(seqA);
+					newGroup.insert(seqB);
+					groups.push_back(newGroup);
+									
+					string tempOut = seqA + '\t' + seqB + '\t' + toString(dist) + '\n';
+					outputs.push_back(tempOut);
+					numGroups++;
+				}
+				else{
+											
+					outputs[groupID] +=  seqA + '\t' + seqB + '\t' + toString(dist)  + '\n';
+					
+					if(groupIDA != -1 && groupIDB != -1){ //merge distance files of two groups you merged above
+						string row, column, distance;
+						if(groupIDA<groupIDB){
+							//merge memory
+							outputs[groupID] += outputs[groupIDB];
+							outputs[groupIDB] = "";
+						}else{
+							outputs[groupID] += outputs[groupIDA];
+							outputs[groupIDA] = "";
+						}					
 					}
 				}
 			}
+			gobble(dFile);
 		}
 		dFile.close();
-	
-		for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
-			remove((namefile + "." + toString(i) + ".temp").c_str());
-			
-			//write out any remaining buffers
-			if (numOutputs[it->second] > 0) {
-				openOutputFileAppend((distFile + "." + toString(i) + ".temp"), outFile);
-				outFile << outputs[i];
-				outFile.close();
-				outputs[i] = "";
-				numOutputs[i] = 0;
-			}
-		}
 		
-		ifstream bigNameFile;
-		openInputFile(namefile, bigNameFile);
-		
-		singleton = namefile + ".extra.temp";
-		ofstream remainingNames;
-		openOutputFile(singleton, remainingNames);
-		
-		bool wroteExtra = false;
-						
-		string name, nameList;
-		while(!bigNameFile.eof()){
-			bigNameFile >> name >> nameList;  gobble(bigNameFile);
-			
-			//did this sequence get assigned a group
-			it = seqGroup.find(name);
-			
-			if (it != seqGroup.end()) {  
-				openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
-				outFile << name << '\t' << nameList << endl;
+		for (int i = 0; i < numGroups; i++) {
+			if (outputs[i] != "") {
+				ofstream outFile;
+				string fileName = distFile + "." + toString(i) + ".temp";
+				outFile.open(fileName.c_str(), ios::ate);
+				outFile << outputs[i];
 				outFile.close();
-			}else{
-				wroteExtra = true;
-				remainingNames << name << '\t' << nameList << endl;
 			}
 		}
-		bigNameFile.close();
-		remainingNames.close();
-		
-		if (!wroteExtra) { 
-			remove(singleton.c_str());
-			singleton = "none";
-		}
-			
-		for(int i=0;i<numGroups;i++){
-			string tempNameFile = namefile + "." + toString(i) + ".temp";
-			string tempDistFile = distFile + "." + toString(i) + ".temp";
+
+		splitNames(groups);
 				
-			map<string, string> temp;
-			temp[tempDistFile] = tempNameFile;
-			dists.push_back(temp);
-		}
-		
-		if (m->control_pressed)	 {  
-			for (int i = 0; i < dists.size(); i++) { 
-				remove((dists[i].begin()->first).c_str());
-				remove((dists[i].begin()->second).c_str());
-			}
-			dists.clear();
-		}
-		
-		return 0;
-			
+		return 0;			
 	}
 	catch(exception& e) {
-		m->errorOut(e, "SplitMatrix", "splitClassify");
+		m->errorOut(e, "SplitMatrix", "splitDistanceRAM");
 		exit(1);
 	}
 }
diff --git a/splitmatrix.h b/splitmatrix.h
index e38f8fc..95fe0dc 100644
--- a/splitmatrix.h
+++ b/splitmatrix.h
@@ -19,7 +19,7 @@ class SplitMatrix  {
 	
 	public:
 
-		SplitMatrix(string, string, string, float, string); //column formatted distance file, namesfile, cutoff, method
+		SplitMatrix(string, string, string, float, string, bool); //column formatted distance file, namesfile, cutoff, method
 		~SplitMatrix();
 		int split();
 		vector< map<string, string> > getDistanceFiles();  //returns map of distance files -> namefile sorted by distance file size
@@ -31,9 +31,13 @@ class SplitMatrix  {
 		string distFile, namefile, singleton, method, taxFile;
 		vector< map< string, string> > dists;
 		float cutoff;
+		bool large;
 				
 		int splitDistance();
 		int splitClassify();
+		int splitDistanceLarge();
+		int splitDistanceRAM();
+		int splitNames(vector<set<string> >& groups);
 };
 
 /******************************************************/
diff --git a/tree.cpp b/tree.cpp
index 8e45981..400a72a 100644
--- a/tree.cpp
+++ b/tree.cpp
@@ -736,39 +736,20 @@ int Tree::readTreeString(ifstream& filehandle)	{
 //cout << " at beginning of while " <<  k << endl;			
 			if(c == ')')  {    
 				//to pass over labels in trees
-				c=filehandle.get();
-				while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ c=filehandle.get(); }
-				filehandle.putback(c);
+				string label = readLabel(filehandle);
 			}
+			
 			if(c == ';') { return 0; }
 			if(c == -1) { return 0; }
+			
 			//if you are a name
 			if((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != '\t') && (c != 32)) { //32 is space
-				name = "";
-				c = filehandle.get();
-			//k = c;
-//cout << k << endl;
-				while ((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != 32) && (c != '\t')) {			
-					name += c;
-					c = filehandle.get();
-			//k = c;
-//cout << " in name while " << k << endl;
-				}
-				
-//cout << "name = " << name << endl;
+				name = readName(filehandle);
 				globaldata->Treenames.push_back(name);
-				filehandle.putback(c);
-//k = c;
-//cout << " after putback" <<  k << endl;
 			} 
 			
 			if(c  == ':') { //read until you reach the end of the branch length
-				while ((c != '(') && (c != ')') && (c != ',') && (c != ';') && (c != '\n') && (c != '\t') && (c != 32)) {
-					c = filehandle.get();
-	//k = c;
-	//cout << " in branch while " << k << endl;
-				}
-				filehandle.putback(c);
+				string bl = readBranchLength(filehandle);
 			}
 		
 			c = filehandle.get();
@@ -789,6 +770,68 @@ int Tree::readTreeString(ifstream& filehandle)	{
 }	
 
 /*******************************************************/
+string Tree::readLabel(ifstream& filehandle)	{
+	try {
+		
+		string label = "";
+		
+		//to pass over labels in trees
+		int c=filehandle.get();
+		while((c!=',') && (c != -1) && (c!= ':') && (c!=';')){ label += c; c=filehandle.get(); }
+		filehandle.putback(c);
+		
+		return label;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "Tree", "readLabel");
+		exit(1);
+	}
+}	
+/*******************************************************/
+string Tree::readName(ifstream& filehandle)	{
+	try {
+		
+		string name = "";
+		int c = filehandle.get();
+		
+		while ((c != '(') && (c != ')') && (c != ',') && (c != ':') && (c != '\n') && (c != 32) && (c != '\t')) {			
+			name += c;
+			c = filehandle.get();
+		}
+				
+//cout << "name = " << name << endl;
+		filehandle.putback(c);
+		
+		return name;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "Tree", "readName");
+		exit(1);
+	}
+}	
+/*******************************************************/
+string Tree::readBranchLength(ifstream& filehandle)	{
+	try {
+		
+		string br = "";
+		int c;
+		while ((c != '(') && (c != ')') && (c != ',') && (c != ';') && (c != '\n') && (c != '\t') && (c != 32)) {
+			br += c;
+			c = filehandle.get();
+		}
+		filehandle.putback(c);
+		
+		return br;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "Tree", "readBranchLength");
+		exit(1);
+	}
+}	
 
 /*******************************************************/
+/*******************************************************/
 
diff --git a/tree.h b/tree.h
index 23ddf5d..99cd68f 100644
--- a/tree.h
+++ b/tree.h
@@ -62,6 +62,10 @@ private:
 							//not included in the tree. 
 							//only takes names from the first tree in the tree file and assumes that all trees use the same names.
 	int readTreeString(ifstream&);
+	string readLabel(ifstream&);
+	string readName(ifstream&);
+	string readBranchLength(ifstream&);
+	
 	MothurOut* m;
 	
 };
-- 
2.39.5