From 7e354c9abb09ea3cf5b500a16cc7f6dd79ccb6f5 Mon Sep 17 00:00:00 2001
From: westcott <westcott>
Date: Thu, 2 Jul 2009 15:11:04 +0000
Subject: [PATCH] made changes to concensus to find a better tree.  also fixed
 minor bug in venn with 4 groups

---
 Mothur.xcodeproj/project.pbxproj |   8 ++
 chimeraseqscommand.cpp           | 230 +++++++++++++++++++++++++++++++
 chimeraseqscommand.h             |  57 ++++++++
 consensuscommand.cpp             | 165 +++++++++++++++++-----
 consensuscommand.h               |   6 +-
 filters.h                        | 102 ++++++++++++++
 filterseqscommand.cpp            |  95 +++----------
 filterseqscommand.h              |   8 +-
 venn.cpp                         |  63 ++++++---
 9 files changed, 600 insertions(+), 134 deletions(-)
 create mode 100644 chimeraseqscommand.cpp
 create mode 100644 chimeraseqscommand.h
 create mode 100644 filters.h
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index 79629fc..b8a1057 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -70,6 +70,7 @@
 		3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; };
 		3799A9500FD6A58C00E33EDE /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */; };
 		3799A9510FD6A58C00E33EDE /* seqsummarycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */; };
+		379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */; };
 		37AD4CE40F28AEA300AA2D49 /* sharedlistvector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */; };
 		37AD4DBB0F28E2FE00AA2D49 /* tree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4DBA0F28E2FE00AA2D49 /* tree.cpp */; };
 		37AD4DCA0F28F3DD00AA2D49 /* readtree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4DC90F28F3DD00AA2D49 /* readtree.cpp */; };
@@ -304,6 +305,9 @@
 		3799A94D0FD6A58C00E33EDE /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = SOURCE_ROOT; };
 		3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = SOURCE_ROOT; };
 		3799A94F0FD6A58C00E33EDE /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = SOURCE_ROOT; };
+		379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = "<group>"; };
+		379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraseqscommand.cpp; sourceTree = "<group>"; };
+		379D3D6B0FF917F40068C1C0 /* filters.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = filters.h; sourceTree = "<group>"; };
 		37AD4CE20F28AEA300AA2D49 /* sharedlistvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedlistvector.h; sourceTree = SOURCE_ROOT; };
 		37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedlistvector.cpp; sourceTree = SOURCE_ROOT; };
 		37AD4DB90F28E2FE00AA2D49 /* tree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tree.h; sourceTree = SOURCE_ROOT; };
@@ -650,6 +654,7 @@
 				37C753F40FB34C0300DBD02E /* eachgapignore.h */,
 				373C69960FC1E63600137ACD /* efron.cpp */,
 				373C69970FC1E63600137ACD /* efron.h */,
+				379D3D6B0FF917F40068C1C0 /* filters.h */,
 				EB9303F70F53517300E8EF26 /* geom.h */,
 				EB9303F80F53517300E8EF26 /* geom.cpp */,
 				378C1AF10FB0644D004D63F5 /* goodscoverage.h */,
@@ -755,6 +760,8 @@
 				37D927CB0F21331F001D4494 /* collectsharedcommand.cpp */,
 				377326640FAF16E0007ABB8B /* consensuscommand.h */,
 				377326630FAF16E0007ABB8B /* consensuscommand.cpp */,
+				379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */,
+				379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */,
 				37B28F660F27590100808A62 /* deconvolutecommand.h */,
 				37B28F670F27590100808A62 /* deconvolutecommand.cpp */,
 				37C753CC0FB3415200DBD02E /* distancecommand.h */,
@@ -1103,6 +1110,7 @@
 				378598740FDD4C1500EF9D03 /* heatmapsim.cpp in Sources */,
 				378599100FDD7E8E00EF9D03 /* optionparser.cpp in Sources */,
 				7E5A17AF0FE57292003C6A03 /* mergefilecommand.cpp in Sources */,
+				379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
diff --git a/chimeraseqscommand.cpp b/chimeraseqscommand.cpp
new file mode 100644
index 0000000..b810b9f
--- /dev/null
+++ b/chimeraseqscommand.cpp
@@ -0,0 +1,230 @@
+/*
+ *  chimeraseqscommand.cpp
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 6/29/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "chimeraseqscommand.h"
+
+//***************************************************************************************************************
+
+ChimeraSeqsCommand::ChimeraSeqsCommand(string option){
+	try {
+		abort = false;
+		
+		//allow user to run help
+		if(option == "help") { help(); abort = true; }
+		
+		else {
+			//valid paramters for this command
+			string Array[] =  {"fasta", "filter", "correction", "processors", "method" };
+			vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+			
+			OptionParser parser(option);
+			map<string,string> parameters = parser.getParameters();
+			
+			ValidParameters validParameter;
+			
+			//check to make sure all parameters are valid for command
+			for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+				if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+			}
+			
+			//check for required parameters
+			fastafile = validParameter.validFile(parameters, "fasta", true);
+			if (fastafile == "not open") { abort = true; }
+			else if (fastafile == "not found") { fastafile = ""; mothurOut("fasta is a required parameter for the chimera.seqs command."); mothurOutEndLine(); abort = true;  }	
+			
+			string temp;
+			temp = validParameter.validFile(parameters, "filter", false);			if (temp == "not found") { temp = "F"; }
+			filter = isTrue(temp);
+			
+			temp = validParameter.validFile(parameters, "correction", false);		if (temp == "not found") { temp = "T"; }
+			correction = isTrue(temp);
+			
+			temp = validParameter.validFile(parameters, "processors", true);		if (temp == "not found") { temp = "1"; }
+			convert(temp, processors);
+			
+			method = validParameter.validFile(parameters, "method", false);		if (method == "not found") { method = "bellerophon"; }
+
+		}
+	}
+	catch(exception& e) {
+		errorOut(e, "ChimeraSeqsCommand", "ChimeraSeqsCommand");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+
+void ChimeraSeqsCommand::help(){
+	try {
+		mothurOut("The chimera.seqs command reads a fastafile and creates a sorted priority score list of potentially chimeric sequences (ideally, the sequences should already be aligned).\n");
+		mothurOut("The chimera.seqs command parameters are fasta, filter, correction, processors and method.  fasta is required.\n");
+		mothurOut("The filter parameter allows you to specify if you would like to apply a 50% soft filter.  The default is false. \n");
+		mothurOut("The correction parameter allows you to .....  The default is true. \n");
+		mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
+		mothurOut("The method parameter allows you to specify the method for finding chimeric sequences.  The default is bellerophon. \n");
+		mothurOut("The chimera.seqs command should be in the following format: \n");
+		mothurOut("chimera.seqs(fasta=yourFastaFile, filter=yourFilter, correction=yourCorrection, processors=yourProcessors, method=bellerophon) \n");
+		mothurOut("Example: chimera.seqs(fasta=AD.align, filter=True, correction=true, processors=2, method=yourMethod) \n");
+		mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");	
+	}
+	catch(exception& e) {
+		errorOut(e, "ChimeraSeqsCommand", "help");
+		exit(1);
+	}
+}
+
+//***************************************************************************************************************
+
+ChimeraSeqsCommand::~ChimeraSeqsCommand(){	/*	do nothing	*/	}
+
+//***************************************************************************************************************
+
+int ChimeraSeqsCommand::execute(){
+	try{
+		
+		if (abort == true) { return 0; }
+		
+		//do soft filter
+		if (filter)  {
+			string optionString = "fasta=" + fastafile + ", soft=50.0, vertical=F";
+			filterSeqs = new FilterSeqsCommand(optionString);
+			filterSeqs->execute();
+			delete filterSeqs;
+			
+			//reset fastafile to filtered file
+			fastafile = getRootName(fastafile) + "filter.fasta";
+		}
+		
+		//read in sequences
+		readSeqs();
+		
+		//int numSeqs = seqs.size();
+		
+		//find average midpoint of seqs
+		midpoint = findAverageMidPoint();
+				
+		//this should be parallelized
+		//generatePreferences();
+		
+				
+		//output results to screen						
+		mothurOutEndLine();
+		mothurOut("\t\t"); mothurOutEndLine();
+		//mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine();
+		//mothurOut("2.5%-tile:\t" + toString(startPosition[ptile0_25]) + "\t" + toString(endPosition[ptile0_25]) + "\t" + toString(seqLength[ptile0_25]) + "\t" + toString(ambigBases[ptile0_25]) + "\t"+ toString(longHomoPolymer[ptile0_25])); mothurOutEndLine();
+		//mothurOut("25%-tile:\t" + toString(startPosition[ptile25]) + "\t" + toString(endPosition[ptile25]) + "\t" + toString(seqLength[ptile25]) + "\t" + toString(ambigBases[ptile25]) + "\t" + toString(longHomoPolymer[ptile25])); mothurOutEndLine();
+		//mothurOut("Median: \t" + toString(startPosition[ptile50]) + "\t" + toString(endPosition[ptile50]) + "\t" + toString(seqLength[ptile50]) + "\t" + toString(ambigBases[ptile50]) + "\t" + toString(longHomoPolymer[ptile50])); mothurOutEndLine();
+		//mothurOut("75%-tile:\t" + toString(startPosition[ptile75]) + "\t" + toString(endPosition[ptile75]) + "\t" + toString(seqLength[ptile75]) + "\t" + toString(ambigBases[ptile75]) + "\t" + toString(longHomoPolymer[ptile75])); mothurOutEndLine();
+		//mothurOut("97.5%-tile:\t" + toString(startPosition[ptile97_5]) + "\t" + toString(endPosition[ptile97_5]) + "\t" + toString(seqLength[ptile97_5]) + "\t" + toString(ambigBases[ptile97_5]) + "\t" + toString(longHomoPolymer[ptile97_5])); mothurOutEndLine();
+		//mothurOut("Maximum:\t" + toString(startPosition[ptile100]) + "\t" + toString(endPosition[ptile100]) + "\t" + toString(seqLength[ptile100]) + "\t" + toString(ambigBases[ptile100]) + "\t" + toString(longHomoPolymer[ptile100])); mothurOutEndLine();
+		//mothurOut("# of Seqs:\t" + toString(numSeqs)); mothurOutEndLine();
+		
+		//outSummary.close();
+		return 0;
+	}
+	catch(exception& e) {
+		errorOut(e, "ChimeraSeqsCommand", "execute");
+		exit(1);
+	}
+}
+
+//***************************************************************************************************************
+void ChimeraSeqsCommand::readSeqs(){
+	try {
+		ifstream inFASTA;
+		openInputFile(fastafile, inFASTA);
+		
+		//read in seqs and store in vector
+		while(!inFASTA.eof()){
+			Sequence current(inFASTA);
+			
+			seqs.push_back(current);
+			
+			gobble(inFASTA);
+		}
+		inFASTA.close();
+
+	}
+	catch(exception& e) {
+		errorOut(e, "ChimeraSeqsCommand", "readSeqs");
+		exit(1);
+	}
+}
+
+
+//***************************************************************************************************************
+int ChimeraSeqsCommand::findAverageMidPoint(){
+	try {
+		int totalMids = 0;
+		int averageMid = 0;
+		
+		//loop through the seqs and find midpoint
+		for (int i = 0; i < seqs.size(); i++) {
+			
+			//get unaligned sequence
+			seqs[i].setUnaligned(seqs[i].getUnaligned());  //if you read an aligned file the unaligned is really aligned, so we need to make sure its unaligned
+			
+			string unaligned = seqs[i].getUnaligned();
+			string aligned = seqs[i].getAligned();
+			
+			//find midpoint of this seq
+			int count = 0;
+			int thismid = 0;
+			for (int j = 0; j < aligned.length(); j++) {
+				
+				thismid++;
+				
+				//if you are part of the unaligned sequence increment
+				if (isalpha(aligned[j])) {  count++;  }
+				
+				//if you have reached the halfway point stop
+				if (count >= (unaligned.length() / 2)) { break; }
+			}
+			
+			//add this mid to total
+			totalMids += thismid;
+		
+		}
+		
+		averageMid = (totalMids / seqs.size());
+		
+		return averageMid; 
+	
+	
+	}
+	catch(exception& e) {
+		errorOut(e, "ChimeraSeqsCommand", "findAverageMidPoint");
+		exit(1);
+	}
+}
+
+/***************************************************************************************************************
+int ChimeraSeqsCommand::createSparseMatrix(int startLine, int endLine, SparseMatrix* sparse){
+	try {
+
+		for(int i=startLine; i<endLine; i++){
+			
+			for(int j=0;j<i;j++){
+			
+				distCalculator->calcDist(seqs.get(i), seqs.get(j));
+				double dist = distCalculator->getDist();
+				
+				
+				
+			}
+			
+	
+		return 1;
+	}
+	catch(exception& e) {
+		errorOut(e, "ChimeraSeqsCommand", "createSparseMatrix");
+		exit(1);
+	}
+}
+/**************************************************************************************************/
+
diff --git a/chimeraseqscommand.h b/chimeraseqscommand.h
new file mode 100644
index 0000000..455ac92
--- /dev/null
+++ b/chimeraseqscommand.h
@@ -0,0 +1,57 @@
+#ifndef CHIMERACOMMAND_H
+#define CHIMERACOMMAND_H
+
+/*
+ *  chimeraseqscommand.h
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 6/29/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "command.hpp"
+#include "filterseqscommand.h"
+#include "sequence.hpp"
+
+
+
+/***********************************************************/
+
+class ChimeraSeqsCommand : public Command {
+public:
+	ChimeraSeqsCommand(string);
+	~ChimeraSeqsCommand();
+	int execute();
+	void help();
+	
+private:
+	//Dist* distCalculator;
+	
+	struct Preference {
+		string leftParent;
+		string rightParent;
+		float score;
+
+	};
+
+
+	bool abort;
+	string method, fastafile;
+	bool filter, correction;
+	int processors, midpoint;
+	FilterSeqsCommand* filterSeqs;
+	vector<Sequence> seqs;
+	vector<Preference> pref;
+	
+	int findAverageMidPoint();
+	void readSeqs();
+	
+
+};
+
+/***********************************************************/
+
+#endif
+
diff --git a/consensuscommand.cpp b/consensuscommand.cpp
index 9a90219..8c34068 100644
--- a/consensuscommand.cpp
+++ b/consensuscommand.cpp
@@ -17,18 +17,9 @@ ConcensusCommand::ConcensusCommand(string fileroot){
 		abort = false;
 		
 		filename = fileroot;
-		//allow user to run help
-		//if(option == "help") { help(); abort = true; }
 		
-		//else {
-			//if (option != "") { mothurOut("There are no valid parameters for the consensus command."); mothurOutEndLine(); abort = true; }
-			
-		//	//no trees were read
-		//	if (globaldata->gTree.size() == 0) {  mothurOut("You must execute the read.tree command, before you may use the consensus command."); mothurOutEndLine(); abort = true;  }
-			//else { 
-			 t = globaldata->gTree;
-			 //	}
-		//}
+		t = globaldata->gTree;
+	
 	}
 	catch(exception& e) {
 		errorOut(e, "ConcensusCommand", "ConcensusCommand");
@@ -97,9 +88,10 @@ int ConcensusCommand::execute(){
 		out2 << "Species in Order: " << endl << endl;
 		for (int i = 0; i < treeSet.size(); i++) {  out2 << i+1 << ".  " << treeSet[i] << endl; }
 		
-		vector<string> temp; 
 		//output sets included
 		out2 << endl << "Sets included in the consensus tree:" << endl << endl;
+		
+		vector<string> temp;
 		for (it2 = nodePairsInTree.begin(); it2 != nodePairsInTree.end(); it2++) {
 			//only output pairs not leaves
 			if (it2->first.size() > 1) { 
@@ -112,6 +104,7 @@ int ConcensusCommand::execute(){
 					//find spot 
 					int index = findSpot(it2->first[i]);
 					temp[index] = "*";
+					//temp[index] = it2->first[i] + "  ";
 				}
 				
 				//output temp
@@ -173,8 +166,8 @@ int ConcensusCommand::buildConcensusTree(vector<string> nodeSet) {
 		//terminate recursion
 		}else if (count == numNodes) { return 0; }
 		else {
-			leftChildSet = getNextAvailableSet(nodeSet);
-			rightChildSet = getRestSet(nodeSet, leftChildSet);
+			//finds best child pair
+			leftChildSet = getNextAvailableSet(nodeSet, rightChildSet);
 			int left = buildConcensusTree(leftChildSet);
 			int right = buildConcensusTree(rightChildSet);
 			consensusTree->tree[count].setChildren(left, right);
@@ -221,6 +214,7 @@ void ConcensusCommand::getSets() {
 			}
 		}
 		
+		
 		//add each leaf to terminate recursion in consensus
 		//you want the leaves in there but with insignifigant sightings value so it is added last
 		//for each leaf node get descendant info.
@@ -238,33 +232,70 @@ void ConcensusCommand::getSets() {
 		}
 
 		sort(treeSet.begin(), treeSet.end());
+		
+		
+		map< vector<string>, int> nodePairsCopy = nodePairs;
+		
+		
+		//set initial rating on pairs to sightings + subgroup sightings
+		while (nodePairsCopy.size() != 0) {
+		
+			vector<string> small = getSmallest(nodePairsCopy);
+			
+			int subgrouprate = getSubgroupRating(small);
+		
+			nodePairsInitialRate[small] = nodePairs[small] + subgrouprate;
+			
+			nodePairsCopy.erase(small);
+		}
+		
 	}
 	catch(exception& e) {
 		errorOut(e, "ConcensusCommand", "getSets");
 		exit(1);
 	}
 }
+//**********************************************************************************************************************
+vector<string> ConcensusCommand::getSmallest(map< vector<string>, int> nodes) {
+	try{
+		vector<string> smallest = nodes.begin()->first;
+		int smallsize = smallest.size();
+		
+		for(it2 = nodes.begin(); it2 != nodes.end(); it2++) {
+			if(it2->first.size() < smallsize) { smallsize = it2->first.size();  smallest = it2->first;  }
+		}
+		
+		return smallest;
+	}
+	catch(exception& e) {
+		errorOut(e, "ConcensusCommand", "getSmallest");
+		exit(1);
+	}
+}
 
 //**********************************************************************************************************************
-vector<string> ConcensusCommand::getNextAvailableSet(vector<string> bigset) {
+vector<string> ConcensusCommand::getNextAvailableSet(vector<string> bigset, vector<string>& rest) {
 	try {
+//cout << "new call " << endl << endl << endl;
 		vector<string> largest; largest.clear();
-		int largestSighting = -1;
-		
-		//go through the sets
-		for (it2 = nodePairs.begin(); it2 != nodePairs.end(); it2++) {
-			//are you a subset of bigset
-			if (isSubset(bigset, it2->first) == true) {
-			
-				//are you the largest. if you are the same size as current largest refer to sighting
-				if (it2->first.size() > largest.size()) { largest = it2->first;  largestSighting = it2->second; }
-				else if (it2->first.size() == largest.size()) {
-					if (it2->second > largestSighting) { largest = it2->first;  largestSighting = it2->second; }
-				}
-				
-			}
+		rest.clear();
+		
+		//if you are just 2 groups
+		if (bigset.size() == 2)  {   
+			rest.push_back(bigset[0]);
+			largest.push_back(bigset[1]);
+		}else{
+			rest = bestSplit[bigset][0];
+			largest = bestSplit[bigset][1];
 		}
 		
+		
+		//save for printing out later and for branch lengths
+		nodePairsInTree[rest] = nodePairs[rest];
+		
+		//delete whatever set you return because it is no longer available
+		nodePairs.erase(rest);
+
 		//save for printing out later and for branch lengths
 		nodePairsInTree[largest] = nodePairs[largest];
 		
@@ -280,6 +311,73 @@ vector<string> ConcensusCommand::getNextAvailableSet(vector<string> bigset) {
 	}
 }
 
+/**********************************************************************************************************************/
+int ConcensusCommand::getSubgroupRating(vector<string> group) {
+	try {
+		map< vector<string>, int>::iterator ittemp;
+		map< vector< vector<string> > , int >::iterator it3;
+		int rate = 0;
+		
+		// ***********************************************************************************//
+		//1. this function must be called passing it littlest sets to biggest 
+		//		since it the rating is made from your sighting plus you best splits rating
+		//2. it saves the top pair to use later
+		// ***********************************************************************************//
+
+		
+		if (group.size() < 3) {  return rate;  }
+		
+		map< vector<string>, int> possiblePairing;  //this is all the subsets of group
+		
+		//go through the sets
+		for (it2 = nodePairs.begin(); it2 != nodePairs.end(); it2++) {
+			//are you a subset of bigset, then save in possiblePairings
+			if (isSubset(group, it2->first) == true) {  possiblePairing[it2->first] = it2->second;  }
+		}		
+	
+		map< vector< vector<string> > , int > rating;
+		
+		while (possiblePairing.size() != 0) {
+		
+			it2 = possiblePairing.begin(); 
+			vector<string> temprest = getRestSet(group, it2->first);
+			
+			//is the rest a set available in possiblePairings
+			ittemp = possiblePairing.find(temprest);
+			if (ittemp != possiblePairing.end()) {  //if the rest is in the possible pairings then add this pair to rating map
+				vector< vector<string> > temprate;
+				temprate.push_back(it2->first);  temprate.push_back(temprest);
+				
+				rating[temprate] = (nodePairsInitialRate[it2->first] + nodePairsInitialRate[temprest]);
+				
+				//erase so you dont add 1,2 and 2,1.
+				possiblePairing.erase(temprest);
+			}
+			
+			possiblePairing.erase(it2);
+		}
+
+
+		it3 = rating.begin();
+		rate = it3->second;
+		vector< vector<string> > topPair = it3->first;
+		
+		//choose the split with the best rating
+		for (it3 = rating.begin(); it3 != rating.end(); it3++) {
+			
+			if (it3->second > rate) {  rate = it3->second;  topPair = it3->first;  }
+		}
+		
+		bestSplit[group] = topPair;
+		
+		return rate;
+	}
+	catch(exception& e) {
+		errorOut(e, "ConcensusCommand", "getSubgroupRating");
+		exit(1);
+	}
+}
+
 //**********************************************************************************************************************
 vector<string> ConcensusCommand::getRestSet(vector<string> bigset, vector<string> subset) {
 	try {
@@ -294,12 +392,6 @@ vector<string> ConcensusCommand::getRestSet(vector<string> bigset, vector<string
 			//its not in the subset so put it in the rest
 			if (inSubset == false) { rest.push_back(bigset[i]); }
 		}
-		
-		//save for printing out later and for branch lengths
-		nodePairsInTree[rest] = nodePairs[rest];
-		
-		//delete whatever set you return because it is no longer available
-		nodePairs.erase(rest);
 
 		return rest;
 	
@@ -314,6 +406,9 @@ vector<string> ConcensusCommand::getRestSet(vector<string> bigset, vector<string
 bool ConcensusCommand::isSubset(vector<string> bigset, vector<string> subset) {
 	try {
 		
+	
+		if (subset.size() > bigset.size()) { return false;  }
+		
 		//check if each guy in suset is also in bigset
 		for (int i = 0; i < subset.size(); i++) {
 			bool match = false;
diff --git a/consensuscommand.h b/consensuscommand.h
index 53b8094..145bea5 100644
--- a/consensuscommand.h
+++ b/consensuscommand.h
@@ -35,6 +35,8 @@ private:
 										// ie. combos FI and EGK would create nodePairs[vector containing F and I] = 1; nodePairs[vector containing E, G and K] = 1
 										// if you saw the combo FI again in another tree you would then update nodePairs[vector containing F and I] = 2;
 										// requires vectors to be sorted to find key.
+	map< vector<string>, vector< vector<string> > > bestSplit;  //maps a group to its best split
+	map< vector<string>, int > nodePairsInitialRate;
 	map< vector<string>, int > nodePairsInTree;
 	map<string, int>::iterator it;
 	map< vector<string>, int>::iterator it2;
@@ -43,7 +45,9 @@ private:
 	int numNodes, numLeaves, count;  //count is the next available spot in the tree vector
 									 	
 	void getSets();
-	vector<string> getNextAvailableSet(vector<string>);  //gets next largest and highest rated set that is a subset of the set passed in.
+	int getSubgroupRating(vector<string>);
+	vector<string> getSmallest(map< vector<string>, int>);
+	vector<string> getNextAvailableSet(vector<string>, vector<string>&);  
 	vector<string> getRestSet(vector<string>, vector<string>);
 	bool isSubset(vector<string>, vector<string>); 
 	int findSpot(string); 
diff --git a/filters.h b/filters.h
new file mode 100644
index 0000000..4c39607
--- /dev/null
+++ b/filters.h
@@ -0,0 +1,102 @@
+#ifndef FILTERS_H
+#define FILTERS_H
+
+/*
+ *  filters.h
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 6/29/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "sequence.hpp"
+
+
+/***********************************************************************/
+
+class Filters {
+
+public:
+	Filters() {};
+	~Filters(){};
+		
+	string getFilter()			{	return filter;		}
+	void setFilter(string s)	{  filter = s;			}
+	void setLength(int l)		{ alignmentLength = l;	}
+	void setSoft(float s)		{		soft = s;		}
+	void setTrump(float t)		{		trump = t;		}
+	void setNumSeqs(int num)	{	numSeqs = num;		}
+	
+	void initialize() {
+		a.assign(alignmentLength, 0);
+		t.assign(alignmentLength, 0);
+		g.assign(alignmentLength, 0);
+		c.assign(alignmentLength, 0);
+		gap.assign(alignmentLength, 0);
+	}
+
+	void doSoft() { 
+		int threshold = int (soft * numSeqs);
+	
+		for(int i=0;i<alignmentLength;i++){
+			if(a[i] < threshold && t[i] < threshold && g[i] < threshold && c[i] < threshold){	filter[i] = 0;	}
+		}
+	}
+	
+	void doVertical() {
+
+		for(int i=0;i<alignmentLength;i++){
+			if(gap[i] == numSeqs)	{	filter[i] = '0';	}
+		}
+	
+	}
+	
+	void doTrump(Sequence seq) {
+	
+		string curAligned = seq.getAligned();
+
+		for(int j = 0; j < alignmentLength; j++) {
+			if(curAligned[j] == trump){
+				filter[j] = '0';
+			}
+		}
+
+	}
+
+	void doHard(string hard) {
+		ifstream fileHandle;
+		openInputFile(hard, fileHandle);
+	
+		fileHandle >> filter;
+	
+		fileHandle.close();
+	}
+
+	void getFreqs(Sequence seq) {
+	
+		string curAligned = seq.getAligned();
+	
+		for(int j=0;j<alignmentLength;j++){
+			if(toupper(curAligned[j]) == 'A')										{	a[j]++;		}
+			else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[j]) == 'U')	{	t[j]++;		}
+			else if(toupper(curAligned[j]) == 'G')									{	g[j]++;		}
+			else if(toupper(curAligned[j]) == 'C')									{	c[j]++;		}
+			else if(curAligned[j] == '-' || curAligned[j] == '.')					{	gap[j]++;	}
+		}
+	}
+		
+protected:
+	string filter;
+	vector<int> a, t, g, c, gap;
+	int alignmentLength, numSeqs;
+	float soft;
+	char trump;
+
+};
+
+/***********************************************************************/
+
+#endif
+
diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp
index ad491aa..68450fb 100644
--- a/filterseqscommand.cpp
+++ b/filterseqscommand.cpp
@@ -56,6 +56,13 @@ FilterSeqsCommand::FilterSeqsCommand(string option){
 			
 			numSeqs = 0;
 			
+			if (abort == false) {
+			
+				if (soft != 0)			{  F.setSoft(soft);		}
+				if (trump != '*')		{  F.setTrump(trump);	}
+			
+			}
+						
 		}
 		
 	}
@@ -90,70 +97,6 @@ void FilterSeqsCommand::help(){
 
 /**************************************************************************************/
 
-void FilterSeqsCommand::doHard() {
-	
-	ifstream fileHandle;
-	openInputFile(hard, fileHandle);
-	
-	fileHandle >> filter;
-	
-	fileHandle.close();
-
-}
-
-/**************************************************************************************/
-
-void FilterSeqsCommand::doTrump(Sequence seq) {
-	
-	string curAligned = seq.getAligned();
-
-	for(int j = 0; j < alignmentLength; j++) {
-		if(curAligned[j] == trump){
-			filter[j] = '0';
-		}
-	}
-
-}
-
-/**************************************************************************************/
-
-void FilterSeqsCommand::doVertical() {
-
-	for(int i=0;i<alignmentLength;i++){
-		if(gap[i] == numSeqs)	{	filter[i] = '0';	}
-	}
-	
-}
-
-/**************************************************************************************/
-
-void FilterSeqsCommand::doSoft() {
-	
-	int threshold = int (soft * numSeqs);
-	
-	for(int i=0;i<alignmentLength;i++){
-		if(a[i] < threshold && t[i] < threshold && g[i] < threshold && c[i] < threshold){	filter[i] = 0;	}
-	}
-}
-
-/**************************************************************************************/
-
-void FilterSeqsCommand::getFreqs(Sequence seq) {
-
-	string curAligned = seq.getAligned();;
-	
-	for(int j=0;j<alignmentLength;j++){
-		if(toupper(curAligned[j]) == 'A')										{	a[j]++;		}
-		else if(toupper(curAligned[j]) == 'T' || toupper(curAligned[j]) == 'U')	{	t[j]++;		}
-		else if(toupper(curAligned[j]) == 'G')									{	g[j]++;		}
-		else if(toupper(curAligned[j]) == 'C')									{	c[j]++;		}
-		else if(curAligned[j] == '-' || curAligned[j] == '.')					{	gap[j]++;	}
-	}
-	
-}
-
-/**************************************************************************************/
-
 int FilterSeqsCommand::execute() {	
 	try {
 	
@@ -166,31 +109,33 @@ int FilterSeqsCommand::execute() {
 		alignmentLength = testSeq.getAlignLength();
 		inFASTA.seekg(0);
 		
+		F.setLength(alignmentLength);
+		
 		if(soft != 0 || isTrue(vertical)){
-			a.assign(alignmentLength, 0);
-			t.assign(alignmentLength, 0);
-			g.assign(alignmentLength, 0);
-			c.assign(alignmentLength, 0);
-			gap.assign(alignmentLength, 0);
+			F.initialize();
 		}
 		
-		if(hard.compare("") != 0)	{	doHard();		}
-		else						{	filter = string(alignmentLength, '1');	}
+		if(hard.compare("") != 0)	{	F.doHard(hard);		}
+		else						{	F.setFilter(string(alignmentLength, '1'));	}
 
 		if(trump != '*' || isTrue(vertical) || soft != 0){
 			while(!inFASTA.eof()){	//read through and create the filter...
 				Sequence seq(inFASTA);
-				if(trump != '*'){	doTrump(seq);	}
-				if(isTrue(vertical) || soft != 0){	getFreqs(seq);	}
+				if(trump != '*'){	F.doTrump(seq);	}
+				if(isTrue(vertical) || soft != 0){	F.getFreqs(seq);	}
 				numSeqs++;
 				cout.flush();
 			}
 		
 		}
 		inFASTA.close();
+		F.setNumSeqs(numSeqs);
+		
+		
+		if(isTrue(vertical) == 1)	{	F.doVertical();	}
+		if(soft != 0)				{	F.doSoft();		}
 		
-		if(isTrue(vertical) == 1)	{	doVertical();	}
-		if(soft != 0)	{	doSoft();		}			
+		filter = F.getFilter();
 
 		ofstream outFilter;
 		string filterFile = getRootName(fastafile) + "filter";
diff --git a/filterseqscommand.h b/filterseqscommand.h
index cfa56b3..611cbbc 100644
--- a/filterseqscommand.h
+++ b/filterseqscommand.h
@@ -11,6 +11,7 @@
  */
 
 #include "command.hpp"
+#include "filters.h"
 
 class Sequence;
 class FilterSeqsCommand : public Command {
@@ -22,11 +23,6 @@ public:
 	void help();
 	
 private:
-	void doHard();
-	void doTrump(Sequence);
-	void doVertical();
-	void doSoft();
-	void getFreqs(Sequence);
 	string vertical, filter, fastafile, hard;	
 	int alignmentLength;
 
@@ -35,6 +31,8 @@ private:
 	float soft;
 	int numSeqs;
 	
+	Filters F;
+		
 	vector<int> a, t, g, c, gap;
 
 };
diff --git a/venn.cpp b/venn.cpp
index 2c40c25..217dd35 100644
--- a/venn.cpp
+++ b/venn.cpp
@@ -380,78 +380,105 @@ void Venn::getPic(vector<SharedRAbundVector*> lookup, vector<Calculator*> vCalcs
 					//get estimates for numA
 					data = singleCalc->getValues(sabundA);
 					numA = data[0];
+	//cout << "num a = " << numA << endl;	
  			
 					//get estimates for numB
 					data = singleCalc->getValues(sabundB);
 					numB = data[0];
- 				
+ 	//cout << "num b = " << numB << endl;				
 					//get estimates for numC
 					data = singleCalc->getValues(sabundC);
 					numC = data[0];
-				
+	//cout << "num c = " << numC << endl;				
 					//get estimates for numD
 					data = singleCalc->getValues(sabundD);
 					numD = data[0];
-
+//cout << "num d = " << numD << endl;	
 
 					//get estimates for pairs
 					subset.clear();
 					subset.push_back(lookup[0]); subset.push_back(lookup[1]);
 					data = vCalcs[i]->getValues(subset);
 					sharedAB = data[0];
-				
+	//cout << "num ab = " << sharedAB << endl;			
 					subset.clear();
 					subset.push_back(lookup[0]); subset.push_back(lookup[2]);
 					data = vCalcs[i]->getValues(subset);
 					sharedAC = data[0];
-				
+	//cout << "num ac = " << sharedAC << endl;				
 					subset.clear();
 					subset.push_back(lookup[0]); subset.push_back(lookup[3]);
 					data = vCalcs[i]->getValues(subset);
 					sharedAD = data[0];
-				
+	//cout << "num ad = " << sharedAD << endl;			
 					subset.clear();
 					subset.push_back(lookup[1]); subset.push_back(lookup[2]);
 					data = vCalcs[i]->getValues(subset);
 					sharedBC = data[0];
-				
+	//cout << "num bc = " << sharedBC << endl;				
 					subset.clear();
 					subset.push_back(lookup[1]); subset.push_back(lookup[3]);
 					data = vCalcs[i]->getValues(subset);
 					sharedBD = data[0];
-				
+		//cout << "num bd = " << sharedBD << endl;						
 					subset.clear();
 					subset.push_back(lookup[2]); subset.push_back(lookup[3]);
 					data = vCalcs[i]->getValues(subset);
 					sharedCD = data[0];
-				
-				
+						
+	//cout << "num cd = " << sharedCD << endl;				
 					//get estimates for combos of 3
 					subset.clear();
 					subset.push_back(lookup[0]); subset.push_back(lookup[1]); subset.push_back(lookup[2]);
 					data = vCalcs[i]->getValues(subset);
 					sharedABC = data[0];
-				
+		//cout << "num abc = " << sharedABC << endl;					
 					subset.clear();
 					subset.push_back(lookup[0]); subset.push_back(lookup[2]); subset.push_back(lookup[3]);
 					data = vCalcs[i]->getValues(subset);
 					sharedACD = data[0];
-				
+			//cout << "num acd = " << sharedACD << endl;	
 					subset.clear();
 					subset.push_back(lookup[1]); subset.push_back(lookup[2]); subset.push_back(lookup[3]);
 					data = vCalcs[i]->getValues(subset);
 					sharedBCD = data[0];
-				
+		//cout << "num bcd = " << sharedBCD << endl;		
 					subset.clear();
 					subset.push_back(lookup[0]); subset.push_back(lookup[1]); subset.push_back(lookup[3]);
 					data = vCalcs[i]->getValues(subset);
 					sharedABD = data[0];
-
+//cout << "num abd = " << sharedABD << endl;
 					//get estimate for all four
 					data = vCalcs[i]->getValues(lookup);
 					sharedABCD = data[0];
-
-		
+		//cout << "num abcd = " << sharedABCD << endl << endl;			
+					//make adjustments
+					sharedABC = sharedABC - sharedABCD;
+			//cout << "num abc = " << sharedABC << endl;		
+					sharedABD = sharedABD - sharedABCD;
+				//cout << "num abd = " << sharedABD << endl;
+					sharedACD = sharedACD - sharedABCD;
+				//cout << "num acd = " << sharedACD << endl;
+					sharedBCD = sharedBCD - sharedABCD;
+				//cout << "num bcd = " << sharedBCD << endl;
+					
+					sharedAB = sharedAB - sharedABC - sharedABCD - sharedABD; // cout << "num ab = " << sharedAB << endl;
+					sharedAC = sharedAC - sharedABC - sharedABCD - sharedACD; // cout << "num ac = " << sharedAC << endl;
+					sharedAD = sharedAD - sharedABD - sharedABCD - sharedACD; // cout << "num ad = " << sharedAD << endl;
+					
+					sharedBC = sharedBC - sharedABC - sharedABCD - sharedBCD; // cout << "num bc = " << sharedBC << endl;
+					sharedBD = sharedBD - sharedABD - sharedABCD - sharedBCD; // cout << "num bd = " << sharedBD << endl; 
+					sharedCD = sharedCD - sharedACD - sharedABCD - sharedBCD; // cout << "num cd = " << sharedCD << endl;
+					
+					numA = numA - sharedAB - sharedAC - sharedAD - sharedABCD - sharedABC - sharedACD - sharedABD;
+			//cout << "num a = " << numA << endl;		
+					numB = numB - sharedAB - sharedBC - sharedBD - sharedABCD - sharedABC - sharedABD - sharedBCD;
+			//cout << "num b = " << numB << endl;		
+					numC = numC - sharedAC - sharedBC - sharedCD - sharedABCD - sharedABC - sharedACD - sharedBCD;
+			//cout << "num c = " << numC << endl;		
+					numD = numD - sharedAD - sharedBD - sharedCD - sharedABCD - sharedBCD - sharedACD - sharedABD;
+			//cout << "num d = " << numD << endl;		
+					
 					//image window
 					outsvg << "<svg width=\"100%\" height=\"100%\" viewBox=\"0 0 700 700\" >\n";
 					outsvg << "<g>\n";
@@ -478,8 +505,8 @@ void Venn::getPic(vector<SharedRAbundVector*> lookup, vector<Calculator*> vCalcs
 					outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(215 - ((int)toString(sharedBC).length() / 2)) + "\" y=\"190\">" + toString(sharedBC) + "</text>\n";  
 					outsvg << "<text fill=\"black\" class=\"seri\"  x=\"" + toString(150 - ((int)toString(numD).length() / 2)) + "\"   y=\"230\">" + toString(numD) + "</text>\n";  
 					outsvg << "<text fill=\"black\" class=\"seri\"  x=\"" + toString(150 - ((int)lookup[3]->getGroup().length() / 2)) + "\"   y=\"210\">" + lookup[3]->getGroup() + "</text>\n"; 
-					outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(240 - ((int)toString(sharedBC).length() / 2)) + "\" y=\"325\">" + toString(sharedAD) + "</text>\n"; 
-					outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(470 - ((int)toString(sharedBC).length() / 2)) + "\" y=\"325\">" + toString(sharedBD) + "</text>\n";
+					outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(240 - ((int)toString(sharedAD).length() / 2)) + "\" y=\"325\">" + toString(sharedAD) + "</text>\n"; 
+					outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(470 - ((int)toString(sharedBD).length() / 2)) + "\" y=\"325\">" + toString(sharedBD) + "</text>\n";
 					outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(350 - ((int)toString(sharedCD).length() / 2)) + "\" y=\"430\">" + toString(sharedCD) + "</text>\n"; 
 					outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(275 - ((int)toString(sharedABD).length() / 2)) + "\" y=\"240\">" + toString(sharedABD) + "</text>\n"; 
 					outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(400 - ((int)toString(sharedBCD).length() / 2)) + "\" y=\"360\">" + toString(sharedBCD) + "</text>\n";
-- 
2.39.2