From: westcott <westcott>
Date: Fri, 7 May 2010 17:59:09 +0000 (+0000)
Subject: changed confidence scores calculation in bayesian
X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=7a7870ab773b993d8d1fd89703b1df3beb47f8d4

changed confidence scores calculation in bayesian
---

diff --git a/bayesian.cpp b/bayesian.cpp
index 00467ff..a4229c6 100644
--- a/bayesian.cpp
+++ b/bayesian.cpp
@@ -216,17 +216,11 @@ string Bayesian::getTaxonomy(Sequence* seq) {
 /**************************************************************************************************/
 string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
 	try {
-		
-		//taxConfidenceScore.clear(); //clear out previous seqs scores
 				
-		vector< map<string, int> > confidenceScores; //you need the added vector level of confusion to account for the level that name is seen since they can be the same
-										//map of classification to confidence for all areas seen
-									   //ie. Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
-									   //ie. Bacteria -> 100, Alphaproteobacteria -> 100, Rhizobiales -> 87, Azorhizobium_et_rel. -> 78, Methylobacterium_et_rel. -> 70, Bosea -> 50
-		confidenceScores.resize(100);  //if you have more than 100 levels of classification...
-		
-		map<string, int>::iterator itBoot;
-		map<string, int>::iterator itBoot2;
+		map<int, int> confidenceScores; 
+				
+		map<int, int>::iterator itBoot;
+		map<int, int>::iterator itBoot2;
 		map<int, int>::iterator itConvert;
 		
 		for (int i = 0; i < iters; i++) {
@@ -248,14 +242,15 @@ string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
 			//add to confidence results
 			while (taxonomy.level != 0) { //while you are not at the root
 				
-				itBoot2 = confidenceScores[taxonomy.level].find(taxonomy.name); //is this a classification we already have a count on
+				itBoot2 = confidenceScores.find(newTax); //is this a classification we already have a count on
 				
-				if (itBoot2 == confidenceScores[taxonomy.level].end()) { //not already in confidence scores
-					confidenceScores[taxonomy.level][taxonomy.name] = 1;
+				if (itBoot2 == confidenceScores.end()) { //not already in confidence scores
+					confidenceScores[newTax] = 1;
 				}else{
-					confidenceScores[taxonomy.level][taxonomy.name]++;
+					confidenceScores[newTax]++;
 				}
-		
+				
+				newTax = taxonomy.parent;
 				taxonomy = phyloTree->get(taxonomy.parent);
 			}
 	
@@ -263,15 +258,17 @@ string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
 		
 		string confidenceTax = "";
 		simpleTax = "";
+		
+		int seqTaxIndex = tax;
 		TaxNode seqTax = phyloTree->get(tax);
 		
 		while (seqTax.level != 0) { //while you are not at the root
 					
-				itBoot2 = confidenceScores[seqTax.level].find(seqTax.name); //is this a classification we already have a count on
+				itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
 				
 				int confidence = 0;
-				if (itBoot2 != confidenceScores[seqTax.level].end()) { //not already in confidence scores
-					confidence = confidenceScores[seqTax.level][seqTax.name];
+				if (itBoot2 != confidenceScores.end()) { //already in confidence scores
+					confidence = confidenceScores[seqTaxIndex];
 				}
 				
 				if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
@@ -279,6 +276,7 @@ string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
 					simpleTax = seqTax.name + ";" + simpleTax;
 				}
 				
+				seqTaxIndex = seqTax.parent;
 				seqTax = phyloTree->get(seqTax.parent);
 		}
 		
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp
index bdf059c..580dd9b 100644
--- a/classifyseqscommand.cpp
+++ b/classifyseqscommand.cpp
@@ -82,47 +82,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
 			}
 			else if (templateFileName == "not open") { abort = true; }	
 			
-			groupfile = validParameter.validFile(parameters, "group", false);
-			if (groupfile == "not found") { groupfile = "";  }
-			else { 
-				splitAtDash(groupfile, groupfileNames);
-				
-				//go through files and make sure they are good, if not, then disregard them
-				for (int i = 0; i < groupfileNames.size(); i++) {
-					if (inputDir != "") {
-						string path = hasPath(groupfileNames[i]);
-						//if the user has not given a path then, add inputdir. else leave path alone.
-						if (path == "") {	groupfileNames[i] = inputDir + groupfileNames[i];		}
-					}
-					int ableToOpen;
-					
-					#ifdef USE_MPI	
-						int pid;
-						MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
-						MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-				
-						if (pid == 0) {
-					#endif
-
-					ifstream in;
-					ableToOpen = openInputFile(groupfileNames[i], in);
-					in.close();
-					
-					#ifdef USE_MPI	
-							for (int j = 1; j < processors; j++) {
-								MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); 
-							}
-						}else{
-							MPI_Status status;
-							MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
-						}
 						
-					#endif
-					if (ableToOpen == 1) {  m->mothurOut("Unable to match group file with fasta file."); m->mothurOutEndLine(); abort = true;	}
-					
-				}
-			}
-			
 			fastaFileName = validParameter.validFile(parameters, "fasta", false);
 			if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true;  }
 			else { 
@@ -230,8 +190,51 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
 				if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); }
 			}
 			
+			groupfile = validParameter.validFile(parameters, "group", false);
+			if (groupfile == "not found") { groupfile = "";  }
+			else { 
+				splitAtDash(groupfile, groupfileNames);
+				
+				//go through files and make sure they are good, if not, then disregard them
+				for (int i = 0; i < groupfileNames.size(); i++) {
+					if (inputDir != "") {
+						string path = hasPath(groupfileNames[i]);
+						//if the user has not given a path then, add inputdir. else leave path alone.
+						if (path == "") {	groupfileNames[i] = inputDir + groupfileNames[i];		}
+					}
+					int ableToOpen;
+					
+					#ifdef USE_MPI	
+						int pid;
+						MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running
+						MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+				
+						if (pid == 0) {
+					#endif
+
+					ifstream in;
+					ableToOpen = openInputFile(groupfileNames[i], in);
+					in.close();
+					
+					#ifdef USE_MPI	
+							for (int j = 1; j < processors; j++) {
+								MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); 
+							}
+						}else{
+							MPI_Status status;
+							MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status);
+						}
+						
+					#endif
+					if (ableToOpen == 1) {  m->mothurOut("Unable to match group file with fasta file."); m->mothurOutEndLine(); abort = true;	}
+					
+				}
+			}
+
 			if (groupfile != "") {
 				if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); }
+			}else {
+				for (int i = 0; i < fastaFileNames.size(); i++) {  groupfileNames.push_back("");  }
 			}
 			
 			//check for optional parameter and set defaults