try {
/************calculate the probablity that each word will be in a specific taxonomy*************/
- string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train";
- string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
- string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+ string tfileroot = tfile.substr(0,tfile.find_last_of(".")+1);
+ string tempfileroot = getRootName(getSimpleName(tempFile));
+ string phyloTreeName = tfileroot + "tree.train";
+ string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob";
+ string probFileName2 = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.numNonZero";
ofstream out;
ofstream out2;
exit(1);
}
}
+/**************************************************************************************************/
+Bayesian::~Bayesian() {
+ try {
+ delete phyloTree;
+ if (database != NULL) { delete database; }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Bayesian", "~Bayesian");
+ exit(1);
+ }
+}
+
/**************************************************************************************************/
string Bayesian::getTaxonomy(Sequence* seq) {
try {
int index = getMostProbableTaxonomy(queryKmers);
if (m->control_pressed) { return tax; }
-
+//cout << seq->getName() << '\t' << index << endl;
//bootstrap - to set confidenceScore
int numToSelect = queryKmers.size() / 8;
tax = bootstrapResults(queryKmers, index, numToSelect);
/**************************************************************************************************/
string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
try {
-
- //taxConfidenceScore.clear(); //clear out previous seqs scores
- vector< map<string, int> > confidenceScores; //you need the added vector level of confusion to account for the level that name is seen since they can be the same
- //map of classification to confidence for all areas seen
- //ie. Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
- //ie. Bacteria -> 100, Alphaproteobacteria -> 100, Rhizobiales -> 87, Azorhizobium_et_rel. -> 78, Methylobacterium_et_rel. -> 70, Bosea -> 50
- confidenceScores.resize(100); //if you have more than 100 levels of classification...
-
- map<string, int>::iterator itBoot;
- map<string, int>::iterator itBoot2;
+ map<int, int> confidenceScores;
+
+ map<int, int>::iterator itBoot;
+ map<int, int>::iterator itBoot2;
map<int, int>::iterator itConvert;
for (int i = 0; i < iters; i++) {
//get taxonomy
int newTax = getMostProbableTaxonomy(temp);
- TaxNode taxonomy = phyloTree->get(newTax);
-
+ TaxNode taxonomyTemp = phyloTree->get(newTax);
+
//add to confidence results
- while (taxonomy.level != 0) { //while you are not at the root
+ while (taxonomyTemp.level != 0) { //while you are not at the root
- itBoot2 = confidenceScores[taxonomy.level].find(taxonomy.name); //is this a classification we already have a count on
+ itBoot2 = confidenceScores.find(newTax); //is this a classification we already have a count on
- if (itBoot2 == confidenceScores[taxonomy.level].end()) { //not already in confidence scores
- confidenceScores[taxonomy.level][taxonomy.name] = 1;
+ if (itBoot2 == confidenceScores.end()) { //not already in confidence scores
+ confidenceScores[newTax] = 1;
}else{
- confidenceScores[taxonomy.level][taxonomy.name]++;
+ confidenceScores[newTax]++;
}
-
- taxonomy = phyloTree->get(taxonomy.parent);
+
+ newTax = taxonomyTemp.parent;
+ taxonomyTemp = phyloTree->get(newTax);
}
}
string confidenceTax = "";
simpleTax = "";
+
+ int seqTaxIndex = tax;
TaxNode seqTax = phyloTree->get(tax);
while (seqTax.level != 0) { //while you are not at the root
- itBoot2 = confidenceScores[seqTax.level].find(seqTax.name); //is this a classification we already have a count on
+ itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
int confidence = 0;
- if (itBoot2 != confidenceScores[seqTax.level].end()) { //not already in confidence scores
- confidence = confidenceScores[seqTax.level][seqTax.name];
+ if (itBoot2 != confidenceScores.end()) { //already in confidence scores
+ confidence = confidenceScores[seqTaxIndex];
}
if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
simpleTax = seqTax.name + ";" + simpleTax;
}
+ seqTaxIndex = seqTax.parent;
seqTax = phyloTree->get(seqTax.parent);
}
#ifdef USE_MPI
- int pid, num, num2;
+ int pid, num, num2, processors;
vector<long> positions;
vector<long> positions2;
MPI_File inMPI;
MPI_File inMPI2;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ MPI_Comm_size(MPI_COMM_WORLD, &processors);
+ int tag = 2001;
char inFileName[1024];
strcpy(inFileName, inNumName.c_str());
if (pid == 0) {
positions = setFilePosEachLine(inNumName, num);
-
- //send file positions to all processes
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
-
positions2 = setFilePosEachLine(inName, num2);
- //send file positions to all processes
- MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
- MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+ MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+
+ MPI_Send(&num2, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+ MPI_Send(&positions2[0], (num2+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+ }
}else{
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
- positions.resize(num);
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+ MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+ positions.resize(num+1);
+ MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
- MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
- positions2.resize(num2);
- MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
-
+ MPI_Recv(&num2, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+ positions2.resize(num2+1);
+ MPI_Recv(&positions2[0], (num2+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
}
//read numKmers
}
MPI_File_close(&inMPI2);
+ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
#else
in >> numKmers; gobble(in);