try {
/************calculate the probablity that each word will be in a specific taxonomy*************/
- string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train";
- string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
- string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+ string tfileroot = tfile.substr(0,tfile.find_last_of(".")+1);
+ string tempfileroot = getRootName(getSimpleName(tempFile));
+ string phyloTreeName = tfileroot + "tree.train";
+ string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob";
+ string probFileName2 = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.numNonZero";
ofstream out;
ofstream out2;
//create search database and names vector
generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0);
- genusNodes = phyloTree->getGenusNodes();
- genusTotals = phyloTree->getGenusTotals();
-
- m->mothurOut("Calculating template taxonomy tree... "); cout.flush();
-
- phyloTree->printTreeNodes(phyloTreeName);
-
- m->mothurOut("DONE."); m->mothurOutEndLine();
-
- m->mothurOut("Calculating template probabilities... "); cout.flush();
-
- numKmers = database->getMaxKmer() + 1;
-
- //initialze probabilities
- wordGenusProb.resize(numKmers);
-
- for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
-
-
- #ifdef USE_MPI
- int pid;
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-
- if (pid == 0) {
- #endif
-
- ofstream out;
- openOutputFile(probFileName, out);
-
- out << numKmers << endl;
-
- ofstream out2;
- openOutputFile(probFileName2, out2);
+ //prevents errors caused by creating shortcut files if you had an error in the sanity check.
+ if (m->control_pressed) { remove(phyloTreeName.c_str()); remove(probFileName.c_str()); remove(probFileName2.c_str()); }
+ else{
+ genusNodes = phyloTree->getGenusNodes();
+ genusTotals = phyloTree->getGenusTotals();
+
+ m->mothurOut("Calculating template taxonomy tree... "); cout.flush();
+
+ phyloTree->printTreeNodes(phyloTreeName);
+
+ m->mothurOut("DONE."); m->mothurOutEndLine();
+
+ m->mothurOut("Calculating template probabilities... "); cout.flush();
+
+ numKmers = database->getMaxKmer() + 1;
- #ifdef USE_MPI
- }
- #endif
-
+ //initialze probabilities
+ wordGenusProb.resize(numKmers);
- //for each word
- for (int i = 0; i < numKmers; i++) {
- if (m->control_pressed) { break; }
+ for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
+
#ifdef USE_MPI
+ int pid;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
if (pid == 0) {
#endif
- out << i << '\t';
+ ofstream out;
+ openOutputFile(probFileName, out);
+
+ out << numKmers << endl;
+
+ ofstream out2;
+ openOutputFile(probFileName2, out2);
#ifdef USE_MPI
}
#endif
+
- vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
-
- map<int, int> count;
- for (int k = 0; k < genusNodes.size(); k++) { count[genusNodes[k]] = 0; }
-
- //for each sequence with that word
- for (int j = 0; j < seqsWithWordi.size(); j++) {
- int temp = phyloTree->getIndex(names[seqsWithWordi[j]]);
- count[temp]++; //increment count of seq in this genus who have this word
- }
-
- //probabilityInTemplate = (# of seqs with that word in template + 0.05) / (total number of seqs in template + 1);
- float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
-
- int numNotZero = 0;
- for (int k = 0; k < genusNodes.size(); k++) {
- //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
- wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));
- if (count[genusNodes[k]] != 0) {
- #ifdef USE_MPI
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
- if (pid == 0) {
- #endif
+ //for each word
+ for (int i = 0; i < numKmers; i++) {
+ if (m->control_pressed) { break; }
+
+ #ifdef USE_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
- out << k << '\t' << wordGenusProb[i][k] << '\t';
-
- #ifdef USE_MPI
- }
- #endif
+ if (pid == 0) {
+ #endif
- numNotZero++;
+ out << i << '\t';
+
+ #ifdef USE_MPI
+ }
+ #endif
+
+ vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
+
+ map<int, int> count;
+ for (int k = 0; k < genusNodes.size(); k++) { count[genusNodes[k]] = 0; }
+
+ //for each sequence with that word
+ for (int j = 0; j < seqsWithWordi.size(); j++) {
+ int temp = phyloTree->getIndex(names[seqsWithWordi[j]]);
+ count[temp]++; //increment count of seq in this genus who have this word
}
+
+ //probabilityInTemplate = (# of seqs with that word in template + 0.05) / (total number of seqs in template + 1);
+ float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
+
+ int numNotZero = 0;
+ for (int k = 0; k < genusNodes.size(); k++) {
+ //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
+ wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));
+ if (count[genusNodes[k]] != 0) {
+ #ifdef USE_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ if (pid == 0) {
+ #endif
+
+ out << k << '\t' << wordGenusProb[i][k] << '\t';
+
+ #ifdef USE_MPI
+ }
+ #endif
+
+ numNotZero++;
+ }
+ }
+
+ #ifdef USE_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ if (pid == 0) {
+ #endif
+
+ out << endl;
+ out2 << probabilityInTemplate << '\t' << numNotZero << endl;
+
+ #ifdef USE_MPI
+ }
+ #endif
}
#ifdef USE_MPI
if (pid == 0) {
#endif
- out << endl;
- out2 << probabilityInTemplate << '\t' << numNotZero << endl;
+ out.close();
+ out2.close();
#ifdef USE_MPI
}
#endif
+
+ //read in new phylotree with less info. - its faster
+ ifstream phyloTreeTest(phyloTreeName.c_str());
+ delete phyloTree;
+
+ phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
}
-
- #ifdef USE_MPI
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
- if (pid == 0) {
- #endif
-
- out.close();
- out2.close();
-
- #ifdef USE_MPI
- }
- #endif
-
- //read in new phylotree with less info. - its faster
- ifstream phyloTreeTest(phyloTreeName.c_str());
- delete phyloTree;
-
- phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
}
m->mothurOut("DONE."); m->mothurOutEndLine();
exit(1);
}
}
+/**************************************************************************************************/
+Bayesian::~Bayesian() {
+ try {
+ delete phyloTree;
+ if (database != NULL) { delete database; }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Bayesian", "~Bayesian");
+ exit(1);
+ }
+}
+
/**************************************************************************************************/
string Bayesian::getTaxonomy(Sequence* seq) {
try {
int index = getMostProbableTaxonomy(queryKmers);
if (m->control_pressed) { return tax; }
-
+//cout << seq->getName() << '\t' << index << endl;
//bootstrap - to set confidenceScore
int numToSelect = queryKmers.size() / 8;
tax = bootstrapResults(queryKmers, index, numToSelect);
/**************************************************************************************************/
string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
try {
-
- //taxConfidenceScore.clear(); //clear out previous seqs scores
- vector< map<string, int> > confidenceScores; //you need the added vector level of confusion to account for the level that name is seen since they can be the same
- //map of classification to confidence for all areas seen
- //ie. Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
- //ie. Bacteria -> 100, Alphaproteobacteria -> 100, Rhizobiales -> 87, Azorhizobium_et_rel. -> 78, Methylobacterium_et_rel. -> 70, Bosea -> 50
- confidenceScores.resize(100); //if you have more than 100 levels of classification...
-
- map<string, int>::iterator itBoot;
- map<string, int>::iterator itBoot2;
+ map<int, int> confidenceScores;
+
+ map<int, int>::iterator itBoot;
+ map<int, int>::iterator itBoot2;
map<int, int>::iterator itConvert;
for (int i = 0; i < iters; i++) {
//get taxonomy
int newTax = getMostProbableTaxonomy(temp);
- TaxNode taxonomy = phyloTree->get(newTax);
-
+ TaxNode taxonomyTemp = phyloTree->get(newTax);
+
//add to confidence results
- while (taxonomy.level != 0) { //while you are not at the root
+ while (taxonomyTemp.level != 0) { //while you are not at the root
- itBoot2 = confidenceScores[taxonomy.level].find(taxonomy.name); //is this a classification we already have a count on
+ itBoot2 = confidenceScores.find(newTax); //is this a classification we already have a count on
- if (itBoot2 == confidenceScores[taxonomy.level].end()) { //not already in confidence scores
- confidenceScores[taxonomy.level][taxonomy.name] = 1;
+ if (itBoot2 == confidenceScores.end()) { //not already in confidence scores
+ confidenceScores[newTax] = 1;
}else{
- confidenceScores[taxonomy.level][taxonomy.name]++;
+ confidenceScores[newTax]++;
}
-
- taxonomy = phyloTree->get(taxonomy.parent);
+
+ newTax = taxonomyTemp.parent;
+ taxonomyTemp = phyloTree->get(newTax);
}
}
string confidenceTax = "";
simpleTax = "";
+
+ int seqTaxIndex = tax;
TaxNode seqTax = phyloTree->get(tax);
while (seqTax.level != 0) { //while you are not at the root
- itBoot2 = confidenceScores[seqTax.level].find(seqTax.name); //is this a classification we already have a count on
+ itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
int confidence = 0;
- if (itBoot2 != confidenceScores[seqTax.level].end()) { //not already in confidence scores
- confidence = confidenceScores[seqTax.level][seqTax.name];
+ if (itBoot2 != confidenceScores.end()) { //already in confidence scores
+ confidence = confidenceScores[seqTaxIndex];
}
if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
simpleTax = seqTax.name + ";" + simpleTax;
}
+ seqTaxIndex = seqTax.parent;
seqTax = phyloTree->get(seqTax.parent);
}
#ifdef USE_MPI
- int pid, num, num2;
- vector<long> positions;
- vector<long> positions2;
+ int pid, num, num2, processors;
+ vector<unsigned long int> positions;
+ vector<unsigned long int> positions2;
MPI_Status status;
MPI_File inMPI;
MPI_File inMPI2;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ MPI_Comm_size(MPI_COMM_WORLD, &processors);
+ int tag = 2001;
char inFileName[1024];
strcpy(inFileName, inNumName.c_str());
if (pid == 0) {
positions = setFilePosEachLine(inNumName, num);
-
- //send file positions to all processes
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
-
positions2 = setFilePosEachLine(inName, num2);
- //send file positions to all processes
- MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
- MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+ MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+
+ MPI_Send(&num2, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+ MPI_Send(&positions2[0], (num2+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+ }
}else{
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
- positions.resize(num);
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+ MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+ positions.resize(num+1);
+ MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
- MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
- positions2.resize(num2);
- MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
-
+ MPI_Recv(&num2, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+ positions2.resize(num2+1);
+ MPI_Recv(&positions2[0], (num2+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
}
//read numKmers
}
MPI_File_close(&inMPI2);
+ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
in >> numKmers; gobble(in);