From 159fd324dfecacb6617669246d85c787ae67f630 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Wed, 12 Sep 2012 11:22:14 -0400 Subject: [PATCH] optimizing classify.seqs calculating of template probabilities. --- bayesian.cpp | 13 +++++-------- classify.cpp | 3 +-- phylotree.cpp | 33 ++++++++++++++++++++------------- phylotree.h | 5 +++-- trimseqscommand.cpp | 4 ++++ 5 files changed, 33 insertions(+), 25 deletions(-) diff --git a/bayesian.cpp b/bayesian.cpp index bccf0ce..49be4af 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -64,7 +64,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { } saveIn.close(); } -FilesGood = false; + if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){ if (tempFile == "saved") { m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); } @@ -143,7 +143,6 @@ FilesGood = false; } #endif - //for each word for (int i = 0; i < numKmers; i++) { if (m->control_pressed) { break; } @@ -162,12 +161,10 @@ FilesGood = false; vector seqsWithWordi = database->getSequencesWithKmer(i); - map count; - for (int k = 0; k < genusNodes.size(); k++) { count[genusNodes[k]] = 0; } - //for each sequence with that word + vector count; count.resize(genusNodes.size(), 0); for (int j = 0; j < seqsWithWordi.size(); j++) { - int temp = phyloTree->getIndex(names[seqsWithWordi[j]]); + int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]); count[temp]++; //increment count of seq in this genus who have this word } @@ -181,9 +178,9 @@ FilesGood = false; //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1); - wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1)); + wordGenusProb[i][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1)); - if (count[genusNodes[k]] != 0) { + if (count[k] != 0) { #ifdef USE_MPI int pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are diff --git a/classify.cpp b/classify.cpp index f44e66c..ace89b9 100644 --- a/classify.cpp +++ b/classify.cpp @@ -200,8 +200,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } fastaFile.close(); - if ((method == "kmer") && (!shortcuts)) {;} //don't print - else {database->generateDB(); } + database->generateDB(); }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); diff --git a/phylotree.cpp b/phylotree.cpp index 73cb461..8a7c712 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -75,7 +75,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){ for (int i = 0; i < numGenus; i++) { iss >> gnode >> gsize; m->gobble(iss); - uniqueTaxonomies[gnode] = gnode; + uniqueTaxonomies.insert(gnode); totals.push_back(gsize); } @@ -102,7 +102,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){ for (int i = 0; i < numGenus; i++) { in >> gnode >> gsize; m->gobble(in); - uniqueTaxonomies[gnode] = gnode; + uniqueTaxonomies.insert(gnode); totals.push_back(gsize); } @@ -260,7 +260,7 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ //use print to reassign the taxa id taxon = getNextTaxon(seqTaxonomy, seqName); - if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies[currentNode] = currentNode; } break; } + if (taxon == "") { m->mothurOut(seqName + " has an error in the taxonomy. This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) { uniqueTaxonomies.insert(currentNode); } break; } childPointer = tree[currentNode].children.find(taxon); @@ -280,7 +280,7 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ name2Taxonomy[seqName] = currentNode; } - if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; } + if (seqTaxonomy == "") { uniqueTaxonomies.insert(currentNode); } } return 0; @@ -295,9 +295,16 @@ vector PhyloTree::getGenusNodes() { try { genusIndex.clear(); //generate genusIndexes - map::iterator it2; - for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); } - + set::iterator it2; + map temp; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(*it2); temp[*it2] = genusIndex.size()-1; } + + for (map::iterator itName = name2Taxonomy.begin(); itName != name2Taxonomy.end(); itName++) { + map::iterator itTemp = temp.find(itName->second); + if (itTemp != temp.end()) { name2GenusNodeIndex[itName->first] = itTemp->second; } + else { m->mothurOut("[ERROR]: trouble making name2GenusNodeIndex, aborting.\n"); m->control_pressed = true; } + } + return genusIndex; } catch(exception& e) { @@ -541,8 +548,8 @@ void PhyloTree::printTreeNodes(string treefilename) { //print genus nodes outTree << endl << uniqueTaxonomies.size() << endl; - map::iterator it2; - for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; } + set::iterator it2; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << *it2 << '\t' << tree[*it2].accessions.size() << endl; } outTree << endl; outTree.close(); @@ -594,12 +601,12 @@ string PhyloTree::getName(int i ){ } } /**************************************************************************************************/ -int PhyloTree::getIndex(string seqName){ +int PhyloTree::getGenusIndex(string seqName){ try { - map::iterator itFind = name2Taxonomy.find(seqName); + map::iterator itFind = name2GenusNodeIndex.find(seqName); - if (itFind != name2Taxonomy.end()) { return itFind->second; } - else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} + if (itFind != name2GenusNodeIndex.end()) { return itFind->second; } + else { m->mothurOut("Cannot find " + seqName + ". Could be a mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);} } catch(exception& e) { m->errorOut(e, "PhyloTree", "get"); diff --git a/phylotree.h b/phylotree.h index 7aae8f1..e000220 100644 --- a/phylotree.h +++ b/phylotree.h @@ -44,7 +44,7 @@ public: TaxNode get(int i); TaxNode get(string seqName); string getName(int i); - int getIndex(string seqName); + int getGenusIndex(string seqName); string getFullTaxonomy(string); //pass a sequence name return taxonomy int getMaxLevel() { return maxLevel; } @@ -63,7 +63,8 @@ private: vector genusIndex; //holds the indexes in tree where the genus level taxonomies are stored vector totals; //holds the numSeqs at each genus level taxonomy map name2Taxonomy; //maps name to index in tree - map uniqueTaxonomies; //map of unique taxonomies + map name2GenusNodeIndex; + set uniqueTaxonomies; //map of unique taxonomies map leafNodes; //used to create static reference taxonomy file //void print(int, ofstream&); int numNodes; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index a90e7ad..b29723c 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -704,6 +704,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } } + if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } } + if(trashCode.length() == 0){ currSeq.setAligned(currSeq.getUnaligned()); currSeq.printSequence(trimFASTAFile); @@ -732,6 +734,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string } } + if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); } + outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl; int numRedundants = 0; -- 2.39.2