From: westcott <westcott>
Date: Tue, 9 Nov 2010 17:40:05 +0000 (+0000)
Subject: parsimony calculator changed order of groups to reflect change in shared Utils made... 
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=4ad32478c7b1bbb464c839a6e133f5b517a3efec;p=mothur.git

parsimony calculator changed order of groups to reflect change in shared Utils made to make output of summary.shared match unifrac commands
---

diff --git a/fastamap.cpp b/fastamap.cpp
index 25860e1..bf55493 100644
--- a/fastamap.cpp
+++ b/fastamap.cpp
@@ -64,11 +64,13 @@ void FastaMap::readFastaFile(string inFastaFile, string oldNameFileName){ //prin
 	m->openInputFile(oldNameFileName, oldNameFile);
 	
 	map<string,string> oldNameMap;
+	map<string, string>::iterator itName;
 	string name, list;
 	while(!oldNameFile.eof()){
 		if (m->control_pressed) { break; }
 		
-		oldNameFile >> name >> list;
+		oldNameFile >> name; m->gobble(oldNameFile);
+		oldNameFile >> list;
 		oldNameMap[name] = list;
 		m->gobble(oldNameFile);
 	}
@@ -87,6 +89,10 @@ void FastaMap::readFastaFile(string inFastaFile, string oldNameFileName){ //prin
 			if(currSeq.getIsAligned())	{	sequence = currSeq.getAligned();	}
 			else						{	sequence = currSeq.getUnaligned();	}
 			
+			itName = seqmap.find(name);
+			if (itName == seqmap.end()) { seqmap[name] = sequence;  }
+			else { m->mothurOut("You already have a sequence named " + name + ", sequence names must be unique, please correct."); m->mothurOutEndLine(); }
+			
 			seqmap[name] = sequence;  
 			map<string,group>::iterator it = data.find(sequence);
 			if (it == data.end()) { 	//it's unique.
diff --git a/listseqscommand.cpp b/listseqscommand.cpp
index b4b4674..1488cda 100644
--- a/listseqscommand.cpp
+++ b/listseqscommand.cpp
@@ -367,7 +367,7 @@ int ListSeqsCommand::readGroup(){
 			
 			if (m->control_pressed) { in.close(); return 0; }
 			
-			in >> name;				//read from first column
+			in >> name;	m->gobble(in);			//read from first column
 			in >> group;			//read from second column
 			
 			names.push_back(name);
diff --git a/mothur b/mothur
index e33ddcd..1b53d19 100755
Binary files a/mothur and b/mothur differ
diff --git a/mothurout.cpp b/mothurout.cpp
index 9ec847c..bd1a098 100644
--- a/mothurout.cpp
+++ b/mothurout.cpp
@@ -522,7 +522,7 @@ string MothurOut::getFullPathName(string fileName){
 				newFileName = homeDir + fileName.substr(fileName.find("~")+1);
 				return newFileName;
 			}else { //find path
-				if (path.rfind("./") == -1) { return fileName; } //already complete name
+				if (path.rfind("./") == string::npos) { return fileName; } //already complete name
 				else { newFileName = fileName.substr(fileName.rfind("./")+2); } //save the complete part of the name
 				
 				//char* cwdpath = new char[1024];
@@ -542,7 +542,7 @@ string MothurOut::getFullPathName(string fileName){
 				
 				//break apart the current working directory
 				vector<string> dirs;
-				while (simpleCWD.find_first_of('/') != -1) {
+				while (simpleCWD.find_first_of('/') != string::npos) {
 					string dir = simpleCWD.substr(0,simpleCWD.find_first_of('/'));
 					simpleCWD = simpleCWD.substr(simpleCWD.find_first_of('/')+1, simpleCWD.length());
 					dirs.push_back(dir);
@@ -553,7 +553,7 @@ string MothurOut::getFullPathName(string fileName){
 			
 				int index = dirs.size()-1;
 		
-				while((pos = path.rfind("./")) != -1) { //while you don't have a complete path
+				while((pos = path.rfind("./")) != string::npos) { //while you don't have a complete path
 					if (pos == 0) { break;  //you are at the end
 					}else if (path[(pos-1)] == '.') { //you want your parent directory ../
 						path = path.substr(0, pos-1);
@@ -573,12 +573,12 @@ string MothurOut::getFullPathName(string fileName){
 				return newFileName;
 			}	
 		#else
-			if (path.find("~") != -1) { //go to home directory
+			if (path.find("~") != string::npos) { //go to home directory
 				string homeDir = getenv ("HOMEPATH");
 				newFileName = homeDir + fileName.substr(fileName.find("~")+1);
 				return newFileName;
 			}else { //find path
-				if (path.rfind(".\\") == -1) { return fileName; } //already complete name
+				if (path.rfind(".\\") == string::npos) { return fileName; } //already complete name
 				else { newFileName = fileName.substr(fileName.rfind(".\\")+2); } //save the complete part of the name
 							
 				char *cwdpath = NULL;
@@ -599,7 +599,7 @@ string MothurOut::getFullPathName(string fileName){
 					
 				int index = dirs.size()-1;
 					
-				while((pos = path.rfind(".\\")) != -1) { //while you don't have a complete path
+				while((pos = path.rfind(".\\")) != string::npos) { //while you don't have a complete path
 					if (pos == 0) { break;  //you are at the end
 					}else if (path[(pos-1)] == '.') { //you want your parent directory ../
 						path = path.substr(0, pos-1);
@@ -658,7 +658,7 @@ int MothurOut::openInputFile(string fileName, ifstream& fileHandle, string m){
 
 			fileHandle.open(completeFileName.c_str());
 			if(!fileHandle) {
-				mothurOut("[ERROR]: Could not open " + completeFileName); mothurOutEndLine();
+				//mothurOut("[ERROR]: Could not open " + completeFileName); mothurOutEndLine();
 				return 1;
 			}else {
 				//check for blank file
diff --git a/parsefastaqcommand.cpp b/parsefastaqcommand.cpp
index 59126fb..f0a7abc 100644
--- a/parsefastaqcommand.cpp
+++ b/parsefastaqcommand.cpp
@@ -175,7 +175,7 @@ int ParseFastaQCommand::execute(){
 			if (qual == "") {  m->mothurOut("[ERROR]: missing quality for " + name2); m->mothurOutEndLine(); m->control_pressed = true; break; }
 			
 			//sanity check sequence length and number of quality scores match
-			if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; break; }
+			if (name2 != "") { if (name != name2) { m->mothurOut("[ERROR]: names do not match. read " + name + " for fasta and " + name2 + " for quality."); m->mothurOutEndLine(); m->control_pressed = true; break; } }
 			if (qual.length() != sequence.length()) { m->mothurOut("[ERROR]: lengths do not match. read " + toString(sequence.length()) + " characters for fasta and " + toString(qual.length()) + " characters for quality scores."); m->mothurOutEndLine(); m->control_pressed = true; break; }
 			
 			//convert quality scores
diff --git a/parsimony.cpp b/parsimony.cpp
index 0487822..efc7d3a 100644
--- a/parsimony.cpp
+++ b/parsimony.cpp
@@ -34,7 +34,7 @@ EstOutput Parsimony::getValues(Tree* t) {
 		
 		int count = 0;
 		for (int a=0; a<numGroups; a++) { 
-			for (int l = a+1; l < numGroups; l++) {
+			for (int l = 0; l < a; l++) {
 				int score = 0;
 				
 				//groups in this combo
diff --git a/sharedlistvector.cpp b/sharedlistvector.cpp
index 0f4042f..1493ee9 100644
--- a/sharedlistvector.cpp
+++ b/sharedlistvector.cpp
@@ -29,7 +29,7 @@ SharedListVector::SharedListVector(ifstream& f) : DataVector(), maxRank(0), numB
 
 		//set up groupmap for later.
 		groupmap = new GroupMap(globaldata->getGroupFile());
-		groupmap->readMap();
+		groupmap->readMap(); 
 
 		int hold;
 		string inputData;
diff --git a/subsamplecommand.cpp b/subsamplecommand.cpp
index 036341f..3def1d1 100644
--- a/subsamplecommand.cpp
+++ b/subsamplecommand.cpp
@@ -8,6 +8,7 @@
  */
 
 #include "subsamplecommand.h"
+#include "sharedutilities.h"
 
 //**********************************************************************************************************************
 vector<string> SubSampleCommand::getValidParameters(){	
@@ -226,6 +227,9 @@ SubSampleCommand::SubSampleCommand(string option) {
 			if ((groupfile != "") && ((fastafile == "") && (listfile == ""))) { 
 				m->mothurOut("Group file only valid with listfile or fastafile."); m->mothurOutEndLine(); abort = true; }
 			
+			if ((groupfile != "") && ((fastafile != "") && (listfile != ""))) { 
+				m->mothurOut("A new group file can only be made from the subsample of a listfile or fastafile, not both. Please correct."); m->mothurOutEndLine(); abort = true; }
+			
 		}
 
 	}
@@ -270,10 +274,16 @@ int SubSampleCommand::execute(){
 		if (abort == true) { return 0; }
 		
 		if (sharedfile != "")	{   getSubSampleShared();	}
-		//if (listfile != "")		{   getSubSampleList();		}
+		if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str()); return 0; } }
+		if (listfile != "")		{   getSubSampleList();		}
+		if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str()); return 0; } }
 		//if (rabund != "")		{   getSubSampleRabund();	}
+		if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str()); return 0; } }
 		//if (sabundfile != "")	{   getSubSampleSabund();	}
+		if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str()); return 0; } }
 		//if (fastafile != "")	{   getSubSampleFasta();	}
+		if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {	remove(outputNames[i].c_str()); return 0; } }
+			
 				
 		m->mothurOutEndLine();
 		m->mothurOut("Output File Names: "); m->mothurOutEndLine();
@@ -302,31 +312,31 @@ int SubSampleCommand::getSubSampleShared() {
 		InputData* input = new InputData(sharedfile, "sharedfile");
 		vector<SharedRAbundVector*> lookup = input->getSharedRAbundVectors();
 		string lastLabel = lookup[0]->getLabel();
-	
+		
 		//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
 		set<string> processedLabels;
 		set<string> userLabels = labels;
-
-	
+		
+		
 		//as long as you are not at the end of the file or done wih the lines you want
 		while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
-			if (m->control_pressed) {  out.close(); return 0;  }
-	
+			if (m->control_pressed) {  delete input; out.close(); return 0;  }
+			
 			if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){			
-
+				
 				m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
 				
 				processShared(lookup, out);
-																
+				
 				processedLabels.insert(lookup[0]->getLabel());
 				userLabels.erase(lookup[0]->getLabel());
 			}
 			
 			if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
 				string saveLabel = lookup[0]->getLabel();
-		
+				
 				for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  
-		
+				
 				lookup = input->getSharedRAbundVectors(lastLabel);
 				m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
 				
@@ -342,14 +352,14 @@ int SubSampleCommand::getSubSampleShared() {
 			lastLabel = lookup[0]->getLabel();
 			//prevent memory leak
 			for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
-						
+			
 			//get next line to process
 			lookup = input->getSharedRAbundVectors();				
 		}
 		
 		
 		if (m->control_pressed) {  out.close(); return 0;  }
-
+		
 		//output error messages about any remaining user labels
 		set<string>::iterator it;
 		bool needToRun = false;
@@ -362,7 +372,7 @@ int SubSampleCommand::getSubSampleShared() {
 				m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
 			}
 		}
-	
+		
 		//run last label if you need to
 		if (needToRun == true)  {
 			for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }  
@@ -374,11 +384,12 @@ int SubSampleCommand::getSubSampleShared() {
 			
 			for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
 		}
-	
+		
+		delete input;
 		out.close();  
-				
+		
 		return 0;
- 
+		
 	}
 	catch(exception& e) {
 		m->errorOut(e, "SubSampleCommand", "getSubSampleShared");
@@ -389,7 +400,7 @@ int SubSampleCommand::getSubSampleShared() {
 int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup, ofstream& out) {
 	try {
 		
-		if (pickedGroups) { eliminateZeroOTUS(thislookup); }
+		//if (pickedGroups) { eliminateZeroOTUS(thislookup); }
 		
 		if (size == 0) { //user has not set size, set size = smallest samples size
 			size = thislookup[0]->getNumSeqs();
@@ -403,11 +414,11 @@ int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup, ofs
 		int numBins = thislookup[0]->getNumBins();
 		for (int i = 0; i < thislookup.size(); i++) {		
 			int thisSize = thislookup[i]->getNumSeqs();
-				
+			
 			if (thisSize != size) {
 				
 				string thisgroup = thislookup[i]->getGroup();
-	
+				
 				OrderVector* order = new OrderVector();
 				for(int p=0;p<numBins;p++){
 					for(int j=0;j<thislookup[i]->getAbundance(p);j++){
@@ -425,6 +436,9 @@ int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup, ofs
 				
 				
 				for (int j = 0; j < size; j++) {
+					
+					if (m->control_pressed) { delete order; return 0; }
+					
 					//get random number to sample from order between 0 and thisSize-1.
 					int myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
 					
@@ -440,6 +454,8 @@ int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup, ofs
 		//subsampling may have created some otus with no sequences in them
 		eliminateZeroOTUS(thislookup);
 		
+		if (m->control_pressed) { return 0; }
+		
 		for (int i = 0; i < thislookup.size(); i++) {
 			out << thislookup[i]->getLabel() << '\t' << thislookup[i]->getGroup() << '\t';
 			thislookup[i]->print(out);
@@ -449,7 +465,232 @@ int SubSampleCommand::processShared(vector<SharedRAbundVector*>& thislookup, ofs
 		
 	}
 	catch(exception& e) {
-		m->errorOut(e, "SubSampleCommand", "eliminateZeroOTUS");
+		m->errorOut(e, "SubSampleCommand", "processShared");
+		exit(1);
+	}
+}			
+//**********************************************************************************************************************
+int SubSampleCommand::getSubSampleList() {
+	try {
+		
+		string thisOutputDir = outputDir;
+		if (outputDir == "") {  thisOutputDir += m->hasPath(listfile);  }
+		string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + "subsample" + m->getExtension(listfile);
+		
+		ofstream out;
+		m->openOutputFile(outputFileName, out);
+		outputTypes["list"].push_back(outputFileName);  outputNames.push_back(outputFileName);
+		
+		InputData* input;
+		//if you have a groupfile you want to read a shared list
+		if (groupfile != "") {
+			
+			GroupMap* groupMap = new GroupMap(groupfile);
+			groupMap->readMap();
+			
+			//takes care of user setting groupNames that are invalid or setting groups=all
+			SharedUtil* util = new SharedUtil();
+			util->setGroups(Groups, groupMap->namesOfGroups);
+			delete util;
+			
+			//create outputfiles
+			string groupOutputDir = outputDir;
+			if (outputDir == "") {  groupOutputDir += m->hasPath(groupfile);  }
+			string groupOutputFileName = groupOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "subsample" + m->getExtension(groupfile);
+			
+			ofstream outGroup;
+			m->openOutputFile(groupOutputFileName, outGroup);
+			outputTypes["group"].push_back(groupOutputFileName);  outputNames.push_back(groupOutputFileName);
+			
+			globaldata->setGroupFile(groupfile); //shared list needs this
+			
+			input = new InputData(listfile, "list");
+			ListVector* list = input->getListVector();
+			string lastLabel = list->getLabel();
+			
+			//if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
+			set<string> processedLabels;
+			set<string> userLabels = labels;
+			
+			//file mismatch quit
+			if (list->getNumSeqs() != groupMap->getNumSeqs()) { 
+				m->mothurOut("[ERROR]: your list file contains " + toString(list->getNumSeqs()) + " sequences, and your groupfile contains " + toString(groupMap->getNumSeqs()) + ", please correct."); 
+				m->mothurOutEndLine();
+				delete groupMap;
+				delete list;
+				delete input;
+				out.close();
+				outGroup.close();
+				return 0;
+			}
+			
+			//as long as you are not at the end of the file or done wih the lines you want
+			while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
+				
+				if (m->control_pressed) {  delete list; delete input; delete groupMap; out.close(); outGroup.close(); return 0;  }
+				
+				if(allLines == 1 || labels.count(list->getLabel()) == 1){			
+					
+					m->mothurOut(list->getLabel()); m->mothurOutEndLine();
+					
+					processListGroup(list, groupMap, out, outGroup);
+					
+					processedLabels.insert(list->getLabel());
+					userLabels.erase(list->getLabel());
+				}
+				
+				if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
+					string saveLabel = list->getLabel();
+					
+					delete list; 
+					
+					list = input->getListVector(lastLabel);
+					m->mothurOut(list->getLabel()); m->mothurOutEndLine();
+					
+					processListGroup(list, groupMap, out, outGroup);
+					
+					processedLabels.insert(list->getLabel());
+					userLabels.erase(list->getLabel());
+					
+					//restore real lastlabel to save below
+					list->setLabel(saveLabel);
+				}
+				
+				lastLabel = list->getLabel();
+				
+				delete list; list = NULL;
+				
+				//get next line to process
+				list = input->getListVector();				
+			}
+			
+			
+			if (m->control_pressed) {  if (list != NULL) { delete list; } delete input; delete groupMap; out.close(); outGroup.close(); return 0;  }
+			
+			//output error messages about any remaining user labels
+			set<string>::iterator it;
+			bool needToRun = false;
+			for (it = userLabels.begin(); it != userLabels.end(); it++) {  
+				m->mothurOut("Your file does not include the label " + *it); 
+				if (processedLabels.count(lastLabel) != 1) {
+					m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
+					needToRun = true;
+				}else {
+					m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
+				}
+			}
+			
+			//run last label if you need to
+			if (needToRun == true)  {
+				if (list != NULL) { delete list; }
+				
+				list = input->getListVector(lastLabel);
+				
+				m->mothurOut(list->getLabel()); m->mothurOutEndLine();
+				
+				processListGroup(list, groupMap, out, outGroup);
+				
+				delete list; list = NULL;
+			}
+			
+			out.close();  outGroup.close();
+			if (list != NULL) { delete list; }
+			delete groupMap;
+			
+		}else {
+			//need to complete
+		}
+		
+		
+		delete input;
+						
+		return 0;
+ 
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SubSampleCommand", "getSubSampleList");
+		exit(1);
+	}
+}
+//**********************************************************************************************************************
+int SubSampleCommand::processListGroup(ListVector*& list, GroupMap*& groupMap, ofstream& out, ofstream& outGroup) {
+	try {
+				
+		if (size == 0) { //user has not set size, set size = 10% samples size
+			size = int (list->getNumSeqs() * 0.10);
+		}
+		
+		int numBins = list->getNumBins();
+		int thisSize = list->getNumSeqs();
+		
+		if (size > thisSize) { m->mothurOut("Your list file only contains " + toString(thisSize) + " sequences. Setting size to " + toString(thisSize) + "."); m->mothurOutEndLine();
+			size = thisSize;
+		}
+		
+		vector<nameToBin> seqs;
+		for (int i = 0; i < numBins; i++) {
+			string names = list->get(i);
+			
+			//parse names
+			string individual = "";
+			int length = names.length();
+			for(int j=0;j<length;j++){
+				if(names[j] == ','){
+					nameToBin temp(individual, i);
+					seqs.push_back(temp);
+					individual = "";				
+				}
+				else{
+					individual += names[j];
+				}
+			}
+			nameToBin temp(individual, i);
+			seqs.push_back(temp);
+		}
+		
+					
+		ListVector* temp = new ListVector(numBins);
+		temp->setLabel(list->getLabel());
+		
+		delete list; 
+		list = temp;
+		
+		set<int> alreadySelected; //dont want repeat sequence names added
+		alreadySelected.insert(-1);
+		for (int j = 0; j < size; j++) {
+			
+			if (m->control_pressed) { return 0; }
+			
+			//get random sequence to add, making sure we have not already added it
+			int myrand = -1;
+			while (alreadySelected.count(myrand) != 0) {
+				myrand = (int)((float)(rand()) / (RAND_MAX / (thisSize-1) + 1));
+			}
+			alreadySelected.insert(myrand);
+			
+			//update new list
+			string oldNames = temp->get(seqs[myrand].bin);
+			if (oldNames == "") { oldNames += seqs[myrand].name; }
+			else { oldNames += "," + seqs[myrand].name; }
+			
+			temp->set(seqs[myrand].bin, oldNames);
+			
+			//update group file
+			string group = groupMap->getGroup(seqs[myrand].name);
+			if (group == "not found") { m->mothurOut("[ERROR]: " + seqs[myrand].name + " is not in your groupfile. please correct."); m->mothurOutEndLine(); group = "NOTFOUND"; }
+			
+			outGroup << seqs[myrand].name << '\t' << group << endl;
+		}	
+
+		if (m->control_pressed) { return 0; }
+		
+		list->print(out);
+		
+		return 0;
+		
+	}
+	catch(exception& e) {
+		m->errorOut(e, "SubSampleCommand", "processListGroup");
 		exit(1);
 	}
 }
diff --git a/subsamplecommand.h b/subsamplecommand.h
index f068ac3..9f3e55b 100644
--- a/subsamplecommand.h
+++ b/subsamplecommand.h
@@ -32,6 +32,14 @@ public:
 	void help();
 	
 private:
+	
+	struct nameToBin {
+		string name;
+		int bin;
+		
+		nameToBin(string n, int b) : name(n), bin(b) {}
+	};
+	
 	GlobalData* globaldata;
 	
 	bool abort, pickedGroups, allLines;
@@ -44,7 +52,9 @@ private:
 	
 	int eliminateZeroOTUS(vector<SharedRAbundVector*>&);
 	int getSubSampleShared();
+	int getSubSampleList();
 	int processShared(vector<SharedRAbundVector*>&, ofstream&);
+	int processListGroup(ListVector*&, GroupMap*&, ofstream&, ofstream&);
 	
 };