X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=bayesian.cpp;h=b46f7703a971b8b5141d42faf14e26d45037cba0;hb=2a07f8664a7fd1ef0d572b88e8b5bede24cc0b3c;hp=e5543cdf0c320847a7cb9bc16957b64bb73d3b6a;hpb=cd9dbd8b53bbe32af3e9c6bead4aa6d796a278a5;p=mothur.git diff --git a/bayesian.cpp b/bayesian.cpp index e5543cd..b46f770 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -17,9 +17,11 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { try { /************calculate the probablity that each word will be in a specific taxonomy*************/ - string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train"; - string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob"; - string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero"; + string tfileroot = tfile.substr(0,tfile.find_last_of(".")+1); + string tempfileroot = getRootName(getSimpleName(tempFile)); + string phyloTreeName = tfileroot + "tree.train"; + string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob"; + string probFileName2 = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.numNonZero"; ofstream out; ofstream out2; @@ -177,6 +179,18 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { exit(1); } } +/**************************************************************************************************/ +Bayesian::~Bayesian() { + try { + delete phyloTree; + if (database != NULL) { delete database; } + } + catch(exception& e) { + m->errorOut(e, "Bayesian", "~Bayesian"); + exit(1); + } +} + /**************************************************************************************************/ string Bayesian::getTaxonomy(Sequence* seq) { try { @@ -201,7 +215,7 @@ string Bayesian::getTaxonomy(Sequence* seq) { int index = getMostProbableTaxonomy(queryKmers); if (m->control_pressed) { return tax; } - +//cout << seq->getName() << '\t' << index << endl; //bootstrap - to set confidenceScore int numToSelect = queryKmers.size() / 8; tax = bootstrapResults(queryKmers, index, numToSelect); @@ -216,17 +230,11 @@ string Bayesian::getTaxonomy(Sequence* seq) { /**************************************************************************************************/ string Bayesian::bootstrapResults(vector kmers, int tax, int numToSelect) { try { - - //taxConfidenceScore.clear(); //clear out previous seqs scores - vector< map > confidenceScores; //you need the added vector level of confusion to account for the level that name is seen since they can be the same - //map of classification to confidence for all areas seen - //ie. Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea; - //ie. Bacteria -> 100, Alphaproteobacteria -> 100, Rhizobiales -> 87, Azorhizobium_et_rel. -> 78, Methylobacterium_et_rel. -> 70, Bosea -> 50 - confidenceScores.resize(100); //if you have more than 100 levels of classification... - - map::iterator itBoot; - map::iterator itBoot2; + map confidenceScores; + + map::iterator itBoot; + map::iterator itBoot2; map::iterator itConvert; for (int i = 0; i < iters; i++) { @@ -243,42 +251,46 @@ string Bayesian::bootstrapResults(vector kmers, int tax, int numToSelect) { //get taxonomy int newTax = getMostProbableTaxonomy(temp); - TaxNode taxonomy = phyloTree->get(newTax); - + TaxNode taxonomyTemp = phyloTree->get(newTax); + //add to confidence results - while (taxonomy.level != 0) { //while you are not at the root + while (taxonomyTemp.level != 0) { //while you are not at the root - itBoot2 = confidenceScores[taxonomy.level].find(taxonomy.name); //is this a classification we already have a count on + itBoot2 = confidenceScores.find(newTax); //is this a classification we already have a count on - if (itBoot2 == confidenceScores[taxonomy.level].end()) { //not already in confidence scores - confidenceScores[taxonomy.level][taxonomy.name] = 1; + if (itBoot2 == confidenceScores.end()) { //not already in confidence scores + confidenceScores[newTax] = 1; }else{ - confidenceScores[taxonomy.level][taxonomy.name]++; + confidenceScores[newTax]++; } - - taxonomy = phyloTree->get(taxonomy.parent); + + newTax = taxonomyTemp.parent; + taxonomyTemp = phyloTree->get(newTax); } } string confidenceTax = ""; simpleTax = ""; + + int seqTaxIndex = tax; TaxNode seqTax = phyloTree->get(tax); while (seqTax.level != 0) { //while you are not at the root - itBoot2 = confidenceScores[seqTax.level].find(seqTax.name); //is this a classification we already have a count on + itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on int confidence = 0; - if (itBoot2 != confidenceScores[seqTax.level].end()) { //not already in confidence scores - confidence = confidenceScores[seqTax.level][seqTax.name]; + if (itBoot2 != confidenceScores.end()) { //already in confidence scores + confidence = confidenceScores[seqTaxIndex]; } - if (confidence >= confidenceThreshold) { + if (((confidence/(float)iters) * 100) >= confidenceThreshold) { confidenceTax = seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");" + confidenceTax; simpleTax = seqTax.name + ";" + simpleTax; } + seqTaxIndex = seqTax.parent; seqTax = phyloTree->get(seqTax.parent); } @@ -352,7 +364,7 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string #ifdef USE_MPI - int pid, num, num2; + int pid, num, num2, processors; vector positions; vector positions2; @@ -360,6 +372,8 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string MPI_File inMPI; MPI_File inMPI2; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + int tag = 2001; char inFileName[1024]; strcpy(inFileName, inNumName.c_str()); @@ -372,26 +386,24 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string if (pid == 0) { positions = setFilePosEachLine(inNumName, num); - - //send file positions to all processes - MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs - MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos - positions2 = setFilePosEachLine(inName, num2); - //send file positions to all processes - MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs - MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + for(int i = 1; i < processors; i++) { + MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + + MPI_Send(&num2, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions2[0], (num2+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } }else{ - MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs - positions.resize(num); - MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions.resize(num+1); + MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); - MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs - positions2.resize(num2); - MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions - + MPI_Recv(&num2, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions2.resize(num2+1); + MPI_Recv(&positions2[0], (num2+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); } //read numKmers @@ -463,6 +475,7 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string } MPI_File_close(&inMPI2); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else in >> numKmers; gobble(in);