From: pschloss <pschloss>
Date: Tue, 19 Jan 2010 14:20:22 +0000 (+0000)
Subject: pat's mods to morisitahorn and pre.cluster
X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=5b9b3e01150055e3b4bb1a911ea4c61d0b755e89

pat's mods to morisitahorn and pre.cluster
---

diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj
index 7f8568e..8e21f64 100644
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -581,8 +581,8 @@
 		A7D86C7C10FE09AB00865661 /* formatcolumn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatcolumn.cpp; sourceTree = SOURCE_ROOT; };
 		A7D86C8A10FE09FE00865661 /* formatphylip.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = formatphylip.h; sourceTree = SOURCE_ROOT; };
 		A7D86C8B10FE09FE00865661 /* formatphylip.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = formatphylip.cpp; sourceTree = SOURCE_ROOT; };
-		A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = "<group>"; };
-		A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = "<group>"; };
+		A7E0AAB410D2886D00CF407B /* mgclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mgclustercommand.h; sourceTree = SOURCE_ROOT; };
+		A7E0AAB510D2886D00CF407B /* mgclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mgclustercommand.cpp; sourceTree = SOURCE_ROOT; };
 		A7E4A781106913F900688F62 /* getsharedotucommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getsharedotucommand.h; sourceTree = SOURCE_ROOT; };
 		A7E4A782106913F900688F62 /* getsharedotucommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getsharedotucommand.cpp; sourceTree = SOURCE_ROOT; };
 		A7E4A822106A9AD700688F62 /* maligner.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = maligner.h; sourceTree = SOURCE_ROOT; };
@@ -1370,6 +1370,8 @@
 		1DEB923608733DC60010E9CD /* Debug */ = {
 			isa = XCBuildConfiguration;
 			buildSettings = {
+				ARCHS = "$(ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1)";
+				ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1 = x86_64;
 				GCC_OPTIMIZATION_LEVEL = 3;
 				GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
 				GCC_WARN_ABOUT_RETURN_TYPE = YES;
@@ -1386,10 +1388,8 @@
 		1DEB923708733DC60010E9CD /* Release */ = {
 			isa = XCBuildConfiguration;
 			buildSettings = {
-				ARCHS = (
-					ppc,
-					i386,
-				);
+				ARCHS = "$(ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1)";
+				ARCHS_STANDARD_64_BIT_PRE_XCODE_3_1 = x86_64;
 				GCC_OPTIMIZATION_LEVEL = 3;
 				GCC_WARN_ABOUT_MISSING_NEWLINE = YES;
 				GCC_WARN_ABOUT_MISSING_PROTOTYPES = YES;
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp
index 39dda9f..3e4f6ad 100644
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -82,6 +82,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option){
 			
 			namefile = validParameter.validFile(parameters, "name", false);
 			if (namefile == "not found") { namefile = "";  }
+
 			else { 
 				splitAtDash(namefile, namefileNames);
 				
diff --git a/fileoutput.cpp b/fileoutput.cpp
index 004ed83..cd3a1c7 100644
--- a/fileoutput.cpp
+++ b/fileoutput.cpp
@@ -306,7 +306,7 @@ void OneColumnFile::initFile(string label){
 		}
 		else{
 			openOutputFile(outName, outFile);
-			outFile << "numsequences\t" << label << endl;
+			outFile << "numsampled\t" << label << endl;
 		}
 	
 		outFile.setf(ios::fixed, ios::floatfield);
diff --git a/heatmapsim.cpp b/heatmapsim.cpp
index 88a49ed..c68d0cc 100644
--- a/heatmapsim.cpp
+++ b/heatmapsim.cpp
@@ -50,7 +50,8 @@ void HeatMapSim::getPic(vector<SharedRAbundVector*> lookup, vector<Calculator*>
 			}
 			
 			sims.clear();
-			double biggest = -1;
+//			double biggest = -1;
+			double biggest = 1;
 			float scaler;
 
 			//get sim for each comparison and save them so you can find the relative similairity
@@ -65,7 +66,7 @@ void HeatMapSim::getPic(vector<SharedRAbundVector*> lookup, vector<Calculator*>
 						sims.push_back(data[0]);
 					
 						//save biggest similairity to set relative sim
-						if (data[0] > biggest) { biggest = data[0]; }
+//						if (data[0] > biggest) { biggest = data[0]; }
 				}
 			}
 			
@@ -171,6 +172,7 @@ void HeatMapSim::getPic(vector< vector<double> > dists, vector<string> groups) {
 
 void HeatMapSim::printLegend(int y, float maxSim) {
 	try {
+		maxSim = 1;
 		
 		//output legend and color labels
 		//go through map and give each score a color value
@@ -180,22 +182,22 @@ void HeatMapSim::printLegend(int y, float maxSim) {
 		//prints legend
 		for (int i = 1; i < 255; i++) {
 			color = toHex(int((float)(i)));
-			outsvg << "<rect fill=\"#" + color + "0000\" stroke=\"#" + color + "0000\" x=\"" + toString(x) + "\" y=\"" + toString(y) + "\" width=\"1\" height=\"10\"/>\n";
-			x += 1;
+			outsvg << "<rect fill=\"#" + color + "0000\" stroke=\"#" + color + "0000\" x=\"" + toString(x) + "\" y=\"" + toString(y) + "\" width=\"3\" height=\"30\"/>\n";
+			x += 3;
 		}
 		
 		float scaler = maxSim / 5.0;
 		
 		//prints legend labels
-		x = 10;
-		for (int i = 1; i<=5; i++) {
+		x = 0;
+		for (int i = 0; i<=5; i++) {
 			float label = scaler*i;
 			label = int(label * 1000 + 0.5);
 			label /= 1000.0;
-			string text = toString(label, 3);
+			string text = toString(label, 1);
 			
 			outsvg << "<text fill=\"black\" class=\"seri\" x=\"" + toString(x) + "\" y=\"" + toString(y-3) + "\">" + text + "</text>\n";
-			x += 60;
+			x += 153;
 		}
 	}
 	
diff --git a/preclustercommand.cpp b/preclustercommand.cpp
index 479a862..66fd850 100644
--- a/preclustercommand.cpp
+++ b/preclustercommand.cpp
@@ -43,9 +43,10 @@ PreClusterCommand::PreClusterCommand(string option){
 			//check for optional parameter and set defaults
 			// ...at some point should added some additional type checking...
 			namefile = validParameter.validFile(parameters, "name", true);
+
 			if (namefile == "not found") { namefile =  "";  }
 			else if (namefile == "not open") { abort = true; }	
-			else {  readNameFile();  }
+//			else {  readNameFile();  }
 			
 			string temp	= validParameter.validFile(parameters, "diffs", false);				if(temp == "not found"){	temp = "1"; }
 			convert(temp, diffs); 
@@ -87,68 +88,62 @@ int PreClusterCommand::execute(){
 		if (abort == true) { return 0; }
 		
 		//reads fasta file and return number of seqs
-		int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active
+		int numSeqs = readNamesFASTA(); //fills alignSeqs and makes all seqs active
 	
 		if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0;  }
 		if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0;  }
 		
 		//clear sizes since you only needed this info to build the alignSeqs seqPNode structs
-		sizes.clear();
+//		sizes.clear();
 	
 		//sort seqs by number of identical seqs
 		sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
 	
 		//go through active list and cluster everthing you can, until all nodes are inactive
 		//taking advantage of the fact that maps are already sorted
-		map<string, bool>::iterator itActive;
-		map<string, bool>::iterator it2Active;
+//		map<string, bool>::iterator itActive;
+//		map<string, bool>::iterator it2Active;
 		int count = 0;
 		
-		for (int i = 0; i < alignSeqs.size(); i++) {
-		
-			//are you active
-			itActive = active.find(alignSeqs[i].seq.getName());
+		for (int i = 0; i < numSeqs; i++) {
 			
-			if (itActive != active.end()) {  //this sequence has not been merged yet
+			//are you active
+			//			itActive = active.find(alignSeqs[i].seq.getName());
 			
-				//try to merge it with all smaller seqs
-				for (int j = i; j < alignSeqs.size(); j++) {
-					
-					if (i != j) {
-						//are you active
-						it2Active = active.find(alignSeqs[j].seq.getName());
-						if (it2Active != active.end()) {  //this sequence has not been merged yet
-							//are you within "diff" bases
-							int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
-							
-							if (mismatch <= diffs) {
-								//merge
-								names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()];
-							
-								//remove from active list
-								active.erase(it2Active);
-								
-								//set numIdentical to 0, so you only print the representative seqs in the fasta file
-								alignSeqs[j].numIdentical = 0;
-								count++;
-							}
-						}//end if j active
-					}//end if i != j
-				}//end for loop
+			if (alignSeqs[i].active) {  //this sequence has not been merged yet
 				
-				//remove from active list 
-				active.erase(itActive);
+				//try to merge it with all smaller seqs
+				for (int j = i+1; j < numSeqs; j++) {
+					if (alignSeqs[j].active) {  //this sequence has not been merged yet
+						//are you within "diff" bases
+						int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
+						
+						if (mismatch <= diffs) {
+							//merge
+							alignSeqs[i].names += ',' + alignSeqs[j].names;
+							alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
+
+							alignSeqs[j].active = 0;
+							alignSeqs[j].numIdentical = 0;
+							count++;
+						}
+					}//end if j active
+				}//end if i != j
+			
+			//remove from active list 
+				alignSeqs[i].active = 0;
 			}//end if active i
+			if(i % 100 == 0)	{ cout << i << '\t' << numSeqs - count << '\t' << count << endl;	}
 		}
 	
 		string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile);
 		string newNamesFile = getRootName(fastafile) + "precluster.names";
 		
-		printData(newFastaFile, newNamesFile);
 		
 		mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine();
 		mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine(); 
-		
+		printData(newFastaFile, newNamesFile);
+
 		return 0;
 		
 	}
@@ -158,65 +153,85 @@ int PreClusterCommand::execute(){
 	}
 }
 /**************************************************************************************************/
-int PreClusterCommand::readSeqs(){
+int PreClusterCommand::readFASTA(){
 	try {
-		ifstream inFasta;
-		openInputFile(fastafile, inFasta);
-		length = 0;
-		map<string, string>::iterator it;
-				
-		while (!inFasta.eof()) {
-			Sequence temp(inFasta);  //read seq
-			
-			if (temp.getName() != "") {  
-				if (namefile != "") {
-					//make sure fasta and name files match
-					it = names.find(temp.getName());
-					if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
-				}else { sizes[temp.getName()] = 1; }
-				
-				seqPNode tempNode(sizes[temp.getName()], temp);
-				alignSeqs.push_back(tempNode); 
-				active[temp.getName()] = true;
-			}
-			gobble(inFasta);
-		}
-		inFasta.close();
-		
-		if (alignSeqs.size() != 0) {  length = alignSeqs[0].seq.getAligned().length();  }
-		
+//		ifstream inFasta;
+//		openInputFile(fastafile, inFasta);
+//		length = 0;
+////		map<string, string>::iterator it;
+//
+//		while (!inFasta.eof()) {
+//			Sequence temp(inFasta);  //read seq
+//			
+//			if (temp.getName() != "") {  
+//				if (namefile != "") {
+//					//make sure fasta and name files match
+//					it = names.find(temp.getName());
+//					if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
+//				}else { sizes[temp.getName()] = 1; }
+//				
+//				seqPNode tempNode(sizes[temp.getName()], temp);
+//				alignSeqs.push_back(tempNode); 
+//				active[temp.getName()] = true;
+//			}
+//			gobble(inFasta);
+//		}
+//		inFasta.close();
+//		
+//		if (alignSeqs.size() != 0) {  length = alignSeqs[0].seq.getAligned().length();  }
+//		
 		return alignSeqs.size();
 	}
 	catch(exception& e) {
-		errorOut(e, "PreClusterCommand", "readSeqs");
+		errorOut(e, "PreClusterCommand", "readFASTA");
 		exit(1);
 	}
 }
 /**************************************************************************************************/
-void PreClusterCommand::readNameFile(){
+
+int PreClusterCommand::readNamesFASTA(){
 	try {
-		ifstream in;
-		openInputFile(namefile, in);
-		string firstCol, secondCol;
-				
-		while (!in.eof()) {
-			in >> firstCol >> secondCol; gobble(in);
-			names[firstCol] = secondCol;
+		ifstream inNames;
+		ifstream inFasta;
+		
+		openInputFile(namefile, inNames);
+		openInputFile(fastafile, inFasta);
+		
+		string firstCol, secondCol, nameString;
+		length = 0;
+		
+		while (inFasta && inNames) {
+	
+			inNames >> firstCol >> secondCol;
+			nameString = secondCol;
+			
+			gobble(inNames);
 			int size = 1;
 			while (secondCol.find_first_of(',') != -1) { 
 				size++;
 				secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
 			}
-			sizes[firstCol] = size;
+			Sequence seq(inFasta);
+			if (seq.getName() != firstCol) { mothurOut(seq.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
+			else{
+				seqPNode tempNode(size, seq, nameString);
+				alignSeqs.push_back(tempNode);
+				if (seq.getAligned().length() > length) {  length = alignSeqs[0].seq.getAligned().length();  }
+			}			
 		}
-		in.close();
+		inFasta.close();
+		inNames.close();
+		return alignSeqs.size();
 	}
+	
 	catch(exception& e) {
-		errorOut(e, "PreClusterCommand", "readNameFile");
+		errorOut(e, "PreClusterCommand", "readNamesFASTA");
 		exit(1);
 	}
 }
+
 /**************************************************************************************************/
+
 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
 	try {
 		int numBad = 0;
@@ -234,23 +249,22 @@ int PreClusterCommand::calcMisMatches(string seq1, string seq2){
 		exit(1);
 	}
 }
+
 /**************************************************************************************************/
+
 void PreClusterCommand::printData(string newfasta, string newname){
 	try {
 		ofstream outFasta;
 		ofstream outNames;
+		
 		openOutputFile(newfasta, outFasta);
 		openOutputFile(newname, outNames);
 				
-		map<string, string>::iterator itNames;
 		
 		for (int i = 0; i < alignSeqs.size(); i++) {
 			if (alignSeqs[i].numIdentical != 0) {
 				alignSeqs[i].seq.printSequence(outFasta); 
-				
-				itNames = names.find(alignSeqs[i].seq.getName());
-				
-				outNames << itNames->first << '\t' << itNames->second << endl;
+				outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
 			}
 		}
 		
diff --git a/preclustercommand.h b/preclustercommand.h
index bb666cd..5113fa4 100644
--- a/preclustercommand.h
+++ b/preclustercommand.h
@@ -19,8 +19,10 @@
 struct seqPNode {
 	int numIdentical;
 	Sequence seq;
+	string names;
+	bool active;
 	seqPNode() {}
-	seqPNode(int s, Sequence q) : numIdentical(s), seq(q) {}
+	seqPNode(int n, Sequence s, string nm) : numIdentical(n), seq(s), names(nm), active(1) {}
 	~seqPNode() {}
 };
 /************************************************************/
@@ -38,13 +40,13 @@ private:
 	bool abort;
 	string fastafile, namefile;
 	vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
-	map<string, string> names; //represents the names file first column maps to second column
-	map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
-	map<string, bool> active; //maps sequence name to whether it has already been merged or not.
+//	map<string, string> names; //represents the names file first column maps to second column
+//	map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
+//	map<string, bool> active; //maps sequence name to whether it has already been merged or not.
 	
-	int readSeqs();
+	int readFASTA();
+	int readNamesFASTA();
 	int calcMisMatches(string, string);
-	void readNameFile();
 	void printData(string, string); //fasta filename, names file name
 };
 
diff --git a/screenseqscommand.cpp b/screenseqscommand.cpp
index 9225e99..32426f3 100644
--- a/screenseqscommand.cpp
+++ b/screenseqscommand.cpp
@@ -151,6 +151,7 @@ int ScreenSeqsCommand::execute(){
 			gobble(inFASTA);
 		}	
 		if(namefile != "" && groupfile != "")	{	screenNameGroupFile(badSeqNames);	}	// this screens both names and groups
+		else if(namefile != "")					{	screenNameGroupFile(badSeqNames);	}
 		else if(groupfile != "")				{	screenGroupFile(badSeqNames);		}	// this screens just the groups
 		if(alignreport != "")					{	screenAlignReport(badSeqNames);		}
 		
diff --git a/sharedmorisitahorn.cpp b/sharedmorisitahorn.cpp
index f9c1b6b..d5cdeec 100644
--- a/sharedmorisitahorn.cpp
+++ b/sharedmorisitahorn.cpp
@@ -14,7 +14,7 @@ EstOutput MorHorn::getValues(vector<SharedRAbundVector*> shared) {
 	try {	
 		data.resize(1,0);
 		
-		int Atotal, Btotal, tempA, tempB;
+		float Atotal, Btotal, tempA, tempB;
 		Atotal = 0; Btotal = 0; 
 		float  morhorn, sumSharedA, sumSharedB, a, b, d;
 		morhorn = 0.0; sumSharedA = 0.0; sumSharedB = 0.0; a = 0.0; b = 0.0; d = 0.0;
@@ -26,23 +26,21 @@ EstOutput MorHorn::getValues(vector<SharedRAbundVector*> shared) {
 			Btotal += shared[1]->getAbundance(i);
 		}
 		
-		//calculate the theta denominator sums
+		//calculate the denominator sums
 		for (int j = 0; j < shared[0]->size(); j++) {
 			//store in temps to avoid multiple repetitive function calls
 			tempA = shared[0]->getAbundance(j);
 			tempB = shared[1]->getAbundance(j);
+			float relA = tempA / Atotal;
+			float relB = tempB / Btotal;
 			
-			a += tempA * tempA;
-			b += tempB * tempB;
-			d += tempA * tempB;
+			a += relA * relA;
+			b += relB * relB;
+			d += relA * relB;
 		}
 
-		a /= double(Atotal * Atotal);
-		b /= double(Btotal * Btotal);
-		d /= double(Atotal * Btotal);
-		
 		morhorn = (2 * d) / (a + b);
-		
+
 		if (isnan(morhorn) || isinf(morhorn)) { morhorn = 0; }
 		
 		data[0] = morhorn;
diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp
index f777324..a6950ac 100644
--- a/trimseqscommand.cpp
+++ b/trimseqscommand.cpp
@@ -185,15 +185,17 @@ int TrimSeqsCommand::execute(){
 					}
 					if(!success)			{	trashCode += 'q';								}
 				}
+				
 				if(barcodes.size() != 0){
-					
 					success = stripBarcode(currSeq, group);
 					if(!success){	trashCode += 'b';	}
 				}
+				
 				if(numFPrimers != 0){
 					success = stripForward(currSeq);
 					if(!success){	trashCode += 'f';	}
 				}
+				
 				if(numRPrimers != 0){
 					success = stripReverse(currSeq);
 					if(!success){	trashCode += 'r';	}