]> git.donarmstrong.com Git - mothur.git/commitdiff
optimizing classify.seqs calculating of template probabilities.
authorSarah Westcott <mothur.westcott@gmail.com>
Wed, 12 Sep 2012 15:22:14 +0000 (11:22 -0400)
committerSarah Westcott <mothur.westcott@gmail.com>
Wed, 12 Sep 2012 15:22:14 +0000 (11:22 -0400)
bayesian.cpp
classify.cpp
phylotree.cpp
phylotree.h
trimseqscommand.cpp

index bccf0ce0dda18501c129b7604546f4e9fc43382b..49be4af57ff66f46ac04912535d7918518cc75d2 100644 (file)
@@ -64,7 +64,7 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
                        }
                        saveIn.close();                 
                }
-FilesGood = false;
+
                if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){       
                        if (tempFile == "saved") { m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");     m->mothurOutEndLine(); }
                        
@@ -143,7 +143,6 @@ FilesGood = false;
                                        }
                                #endif
 
-                               
                                //for each word
                                for (int i = 0; i < numKmers; i++) {
                                        if (m->control_pressed) {  break; }
@@ -162,12 +161,10 @@ FilesGood = false;
                                        
                                        vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
                                        
-                                       map<int, int> count;
-                                       for (int k = 0; k < genusNodes.size(); k++) {  count[genusNodes[k]] = 0;  }                     
-                                                       
                                        //for each sequence with that word
+                    vector<int> count; count.resize(genusNodes.size(), 0);
                                        for (int j = 0; j < seqsWithWordi.size(); j++) {
-                                               int temp = phyloTree->getIndex(names[seqsWithWordi[j]]);
+                                               int temp = phyloTree->getGenusIndex(names[seqsWithWordi[j]]);
                                                count[temp]++;  //increment count of seq in this genus who have this word
                                        }
                                        
@@ -181,9 +178,9 @@ FilesGood = false;
                                                //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
                                                
                                                
-                                               wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
+                                               wordGenusProb[i][k] = log((count[k] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
                                                                        
-                                               if (count[genusNodes[k]] != 0) { 
+                                               if (count[k] != 0) { 
                                                        #ifdef USE_MPI
                                                                int pid;
                                                                MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
index f44e66c359873f6830dac72d99fb1727a73c27ad..ace89b9cbe9a4c0350ee14396b70340b1b7bc6b3 100644 (file)
@@ -200,8 +200,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                                }
                                fastaFile.close();
 
-                if ((method == "kmer") && (!shortcuts)) {;} //don't print
-                else {database->generateDB(); }
+                database->generateDB(); 
                                
                        }else if ((method == "kmer") && (!needToGenerate)) {    
                                ifstream kmerFileTest(kmerDBName.c_str());
index 73cb461b852c3946d62140c2d6520481cff0e937..8a7c712b0f255db20da492fd95bc15d3116ce151 100644 (file)
@@ -75,7 +75,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){
                        for (int i = 0; i < numGenus; i++) {
                                iss >> gnode >> gsize; m->gobble(iss);
                                
-                               uniqueTaxonomies[gnode] = gnode;
+                               uniqueTaxonomies.insert(gnode);
                                totals.push_back(gsize);
                        }
                        
@@ -102,7 +102,7 @@ PhyloTree::PhyloTree(ifstream& in, string filename){
                        for (int i = 0; i < numGenus; i++) {
                                in >> gnode >> gsize; m->gobble(in);
                                
-                               uniqueTaxonomies[gnode] = gnode;
+                               uniqueTaxonomies.insert(gnode);
                                totals.push_back(gsize);
                        }
                        
@@ -260,7 +260,7 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
                        //use print to reassign the taxa id
                        taxon = getNextTaxon(seqTaxonomy, seqName);
                        
-                       if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies[currentNode] = currentNode; } break;  }
+                       if (taxon == "") {  m->mothurOut(seqName + " has an error in the taxonomy.  This may be due to a ;;"); m->mothurOutEndLine(); if (currentNode != 0) {  uniqueTaxonomies.insert(currentNode); } break;  }
                        
                        childPointer = tree[currentNode].children.find(taxon);
                        
@@ -280,7 +280,7 @@ int PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
                                name2Taxonomy[seqName] = currentNode;
                        }
        
-                       if (seqTaxonomy == "") {   uniqueTaxonomies[currentNode] = currentNode; }
+                       if (seqTaxonomy == "") {   uniqueTaxonomies.insert(currentNode);        }
                }
                
                return 0;
@@ -295,9 +295,16 @@ vector<int> PhyloTree::getGenusNodes()     {
        try {
                genusIndex.clear();
                //generate genusIndexes
-               map<int, int>::iterator it2;
-               for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(it2->first);     }
-               
+               set<int>::iterator it2;
+        map<int, int> temp;
+               for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  genusIndex.push_back(*it2);   temp[*it2] = genusIndex.size()-1; }
+               
+        for (map<string, int>::iterator itName = name2Taxonomy.begin(); itName != name2Taxonomy.end(); itName++) {
+            map<int, int>::iterator itTemp = temp.find(itName->second);
+            if (itTemp != temp.end()) { name2GenusNodeIndex[itName->first] = itTemp->second; }
+            else {  m->mothurOut("[ERROR]: trouble making name2GenusNodeIndex, aborting.\n"); m->control_pressed = true; }
+        }
+        
                return genusIndex;
        }
        catch(exception& e) {
@@ -541,8 +548,8 @@ void PhyloTree::printTreeNodes(string treefilename) {
                        
                        //print genus nodes
                        outTree << endl << uniqueTaxonomies.size() << endl;
-                       map<int, int>::iterator it2;
-                       for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
+                       set<int>::iterator it2;
+                       for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << *it2 << '\t' << tree[*it2].accessions.size() << endl;      }
                        outTree << endl;
                        
                        outTree.close();
@@ -594,12 +601,12 @@ string PhyloTree::getName(int i ){
        }
 }
 /**************************************************************************************************/
-int PhyloTree::getIndex(string seqName){
+int PhyloTree::getGenusIndex(string seqName){
        try {
-               map<string, int>::iterator itFind = name2Taxonomy.find(seqName);
+               map<string, int>::iterator itFind = name2GenusNodeIndex.find(seqName);
        
-               if (itFind != name2Taxonomy.end()) {  return itFind->second;  }
-               else { m->mothurOut("Cannot find " + seqName + ". Mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
+               if (itFind != name2GenusNodeIndex.end()) {  return itFind->second;  }
+               else { m->mothurOut("Cannot find " + seqName + ". Could be a mismatch with taxonomy and template files. Cannot continue."); m->mothurOutEndLine(); exit(1);}
        }
        catch(exception& e) {
                m->errorOut(e, "PhyloTree", "get");
index 7aae8f1adedcf5f1db6b94d1bbe76a968a8e3100..e0002209e4d83aaaf9468573173858d9b0602b61 100644 (file)
@@ -44,7 +44,7 @@ public:
        TaxNode get(int i);                             
        TaxNode get(string seqName);
        string getName(int i);                  
-       int getIndex(string seqName);   
+       int getGenusIndex(string seqName);      
        string getFullTaxonomy(string);  //pass a sequence name return taxonomy
        
        int getMaxLevel()               {       return maxLevel;        }
@@ -63,7 +63,8 @@ private:
        vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
        vector<int> totals; //holds the numSeqs at each genus level taxonomy
        map<string, int> name2Taxonomy;  //maps name to index in tree
-       map<int, int> uniqueTaxonomies;  //map of unique taxonomies
+    map<string, int> name2GenusNodeIndex;
+       set<int> uniqueTaxonomies;  //map of unique taxonomies
        map<int, int> leafNodes; //used to create static reference taxonomy file
        //void print(int, ofstream&);
        int numNodes;
index a90e7adc5c8cb870714c716b02bec5eb6bfddd73..b29723c032814891f2f84dc8ba437ab789908773 100644 (file)
@@ -704,6 +704,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                        }
                                }
                                
+                if (m->debug) { m->mothurOut("[DEBUG]: " + currSeq.getName() + ", trashcode= " + trashCode); if (trashCode.length() != 0) { m->mothurOutEndLine(); } }
+                
                                if(trashCode.length() == 0){
                                        currSeq.setAligned(currSeq.getUnaligned());
                                        currSeq.printSequence(trimFASTAFile);
@@ -732,6 +734,8 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string
                                                                } 
                                                        }
                                                        
+                            if (m->debug) { m->mothurOut(", group= " + thisGroup + "\n"); }
+                            
                                                        outGroupsFile << currSeq.getName() << '\t' << thisGroup << endl;
                                                        
                             int numRedundants = 0;