git.donarmstrong.com Git - mothur.git/blobdiff - classify.cpp
added zap method to classify.seqs and changed bayesian method name to wang.
[mothur.git] / classify.cpp
index 7770999e779917996a916e87836a03c3a2a8f26b..8aa3cdb381ed7a389667ce61962d47cefac15ddd 100644 (file)
@@ -47,7 +47,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                                }
                        }
                        else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-                       else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch);   }
+                       else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID);     }
                        else if(method == "distance")   {       database = new DistanceDB();    }
                        else {
                                m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
@@ -61,7 +61,8 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                                        names.push_back(temp.getName());
                                        database->addSequence(temp);    
                                }
-                               database->generateDB();
+                               if ((method == "kmer") && (!shortcuts)) {;} //don't print
+                else {database->generateDB(); }
                        }else if ((method == "kmer") && (!needToGenerate)) {    
                                ifstream kmerFileTest(kmerDBName.c_str());
                                database->readKmerDB(kmerFileTest);     
@@ -89,7 +90,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                        m->mothurOut("Generating search database...    "); cout.flush();
        #ifdef USE_MPI  
                                int pid, processors;
-                               vector<unsigned long int> positions;
+                               vector<unsigned long long> positions;
                                int tag = 2001;
                        
                                MPI_Status status; 
@@ -123,7 +124,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                                //create database
                                if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
                                else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-                               else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch);   }
+                               else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", pid);  }
                                else if(method == "distance")   {       database = new DistanceDB();    }
                                else {
                                        m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
@@ -176,7 +177,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                                }
                        }
                        else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-                       else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch);   }
+                       else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID);     }
                        else if(method == "distance")   {       database = new DistanceDB();    }
                        else {
                                m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
@@ -200,7 +201,8 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
                                }
                                fastaFile.close();
 
-                               database->generateDB();
+                if ((method == "kmer") && (!shortcuts)) {;} //don't print
+                else {database->generateDB(); } 
                                
                        }else if ((method == "kmer") && (!needToGenerate)) {    
                                ifstream kmerFileTest(kmerDBName.c_str());
@@ -238,7 +240,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
        }
 }
 /**************************************************************************************************/
-Classify::Classify() {         m = MothurOut::getInstance();   database = NULL;        }
+Classify::Classify() {         m = MothurOut::getInstance();   database = NULL;        flipped=false; } // default ctor: caches the MothurOut instance, starts with no database (NULL) and flipped == false
 /**************************************************************************************************/
 
 int Classify::readTaxonomy(string file) {
@@ -249,19 +251,17 @@ int Classify::readTaxonomy(string file) {
                
                m->mothurOutEndLine();
                m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
-
+        if (m->debug) { m->mothurOut("[DEBUG]: Taxonomies read in...\n"); }
+        
 #ifdef USE_MPI 
                int pid, num, processors;
-               vector<unsigned long int> positions;
+               vector<unsigned long long> positions;
                int tag = 2001;
                
                MPI_Status status; 
                MPI_File inMPI;
                MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
                MPI_Comm_size(MPI_COMM_WORLD, &processors);
-
-               //char* inFileName = new char[file.length()];
-               //memcpy(inFileName, file.c_str(), file.length());
                
                char inFileName[1024];
                strcpy(inFileName, file.c_str());
@@ -296,30 +296,21 @@ int Classify::readTaxonomy(string file) {
                        delete buf4;
 
                        istringstream iss (tempBuf,istringstream::in);
-                       iss >> name >> taxInfo;
+                       iss >> name; m->gobble(iss);
+            iss >> taxInfo;
+            if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); }
                        taxonomy[name] = taxInfo;
                        phyloTree->addSeqToTree(name, taxInfo);
                }
                
                MPI_File_close(&inMPI);
                MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
-#else                          
-               ifstream inTax;
-               m->openInputFile(file, inTax);
-       
-               //read template seqs and save
-               while (!inTax.eof()) {
-                       inTax >> name >> taxInfo;
-                       
-                       taxonomy[name] = taxInfo;
-                       
-                       phyloTree->addSeqToTree(name, taxInfo);
-               
-                       m->gobble(inTax);
-               }
-               inTax.close();
+#else  
+        
+        taxonomy.clear(); 
+        m->readTax(file, taxonomy);
+        for (map<string, string>::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) {  phyloTree->addSeqToTree(itTax->first, itTax->second);  }
 #endif 
-       
                phyloTree->assignHeirarchyIDs(0);
                
                phyloTree->setUp(file);
@@ -363,3 +354,37 @@ vector<string> Classify::parseTax(string tax) {
 }
 /**************************************************************************************************/
 
+double Classify::getLogExpSum(vector<double> probabilities, int& maxIndex){ // returns log(sum_i exp(probabilities[i])); maxIndex is set to the index of the largest entry
+       try {
+        // Log-sum-exp with the maximum factored out; background: http://jblevins.org/notes/log-sum-exp
+        // NOTE(review): probabilities[0] is read before any size check, so this assumes a non-empty vector - confirm callers.
+        double maxProb = probabilities[0];
+        maxIndex = 0;
+        // The vector is taken by value, so the caller's copy is never touched.
+        int numProbs = (int)probabilities.size();
+        // Find the maximum; because the test is '>=', ties resolve to the highest index holding that value.
+        for(int i=1;i<numProbs;i++){
+            if(probabilities[i] >= maxProb){
+                maxProb = probabilities[i];
+                maxIndex = i;
+            }
+        }
+        // Accumulate exp(p - maxProb): every exponent is <= 0, and the maximum entry contributes exp(0) == 1.
+        double probSum = 0.0000;
+        
+        for(int i=0;i<numProbs;i++){
+            probSum += exp(probabilities[i] - maxProb);                
+        }
+        // Add maxProb back in log space to undo the shift applied inside the loop above.
+        probSum = log(probSum) + maxProb;
+        
+        return probSum;
+       }
+       catch(exception& e) { // report through the MothurOut instance, then terminate the process
+               m->errorOut(e, "Classify", "getLogExpSum");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+