]> git.donarmstrong.com Git - mothur.git/blobdiff - classify.cpp
added modify names parameter to set.dir
[mothur.git] / classify.cpp
index 8ea9e4bcf050ce505b3913e4038fc75e1b052272..36179f471da4ac5d40b563e609feb2f8d3d32e6c 100644 (file)
 #include "suffixdb.hpp"
 #include "blastdb.hpp"
 #include "distancedb.hpp"
+#include "referencedb.h"
 
 /**************************************************************************************************/
 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch)  {            
        try {   
-               taxFile = tfile;
-               readTaxonomy(taxFile);  
+               ReferenceDB* rdb = ReferenceDB::getInstance();
                
-               templateFile = tempFile;        
+               if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); }
                
-               int start = time(NULL);
-               int numSeqs = 0;
+               taxFile = tfile;
                
-               m->mothurOut("Generating search database...    "); cout.flush();
-#ifdef USE_MPI 
-                       int pid, processors;
-                       vector<unsigned long int> positions;
-                       int tag = 2001;
+               int numSeqs = 0;
                
-                       MPI_Status status; 
-                       MPI_File inMPI;
-                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-                       MPI_Comm_size(MPI_COMM_WORLD, &processors);
-
-                       //char* inFileName = new char[tempFile.length()];
-                       //memcpy(inFileName, tempFile.c_str(), tempFile.length());
+               if (tempFile == "saved") {
+                       int start = time(NULL);
+                       m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");        m->mothurOutEndLine();
                        
-                       char inFileName[1024];
-                       strcpy(inFileName, tempFile.c_str());
-
-                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
-                       //delete inFileName;
-
-                       if (pid == 0) { //only one process needs to scan file
-                               positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
-
-                               //send file positions to all processes
-                               for(int i = 1; i < processors; i++) { 
-                                       MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
-                                       MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                       numSeqs = rdb->referenceSeqs.size();
+                       templateFile = rdb->getSavedReference();
+                       tempFile = rdb->getSavedReference();
+                       
+                       bool needToGenerate = true;
+                       string kmerDBName;
+                       if(method == "kmer")                    {       
+                               database = new KmerDB(tempFile, kmerSize);                      
+                               
+                               kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                               ifstream kmerFileTest(kmerDBName.c_str());
+                               if(kmerFileTest){       
+                                       bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
+                                       if (GoodFile) {  needToGenerate = false;        }
                                }
-                       }else{
-                               MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
-                               positions.resize(numSeqs+1);
-                               MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
                        }
-                       
-                       //create database
-                       if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
                        else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-                       else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
+                       else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID);     }
                        else if(method == "distance")   {       database = new DistanceDB();    }
                        else {
-                               m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
+                               m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+                               m->mothurOutEndLine();
                                database = new KmerDB(tempFile, 8);
                        }
-
-                       //read file 
-                       for(int i=0;i<numSeqs;i++){
-                               //read next sequence
-                               int length = positions[i+1] - positions[i];
-                               char* buf4 = new char[length];
-                               MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
-                               
-                               string tempBuf = buf4;
-                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-                               delete buf4;
-                               istringstream iss (tempBuf,istringstream::in);
-                               
-                               Sequence temp(iss);  
-                               if (temp.getName() != "") {
+                       
+                       if (needToGenerate) {
+                               for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
+                                       Sequence temp(rdb->referenceSeqs[k].getName(), rdb->referenceSeqs[k].getAligned());
                                        names.push_back(temp.getName());
                                        database->addSequence(temp);    
                                }
-                       }
+                               if ((method == "kmer") && (!shortcuts)) {;} //don't print
+                else {database->generateDB(); }
+                       }else if ((method == "kmer") && (!needToGenerate)) {    
+                               ifstream kmerFileTest(kmerDBName.c_str());
+                               database->readKmerDB(kmerFileTest);     
+                               
+                               for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
+                                       names.push_back(rdb->referenceSeqs[k].getName());
+                               }                       
+                       }       
                        
-                       database->generateDB();
-                       MPI_File_close(&inMPI);
-                       MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
-       #else
-               
-               //need to know number of template seqs for suffixdb
-               if (method == "suffix") {
-                       ifstream inFASTA;
-                       m->openInputFile(tempFile, inFASTA);
-                       m->getNumSeqs(inFASTA, numSeqs);
-                       inFASTA.close();
-               }
-
-               bool needToGenerate = true;
-               string kmerDBName;
-               if(method == "kmer")                    {       
-                       database = new KmerDB(tempFile, kmerSize);                      
+                       database->setNumSeqs(numSeqs);
                        
-                       kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
-                       ifstream kmerFileTest(kmerDBName.c_str());
-                       if(kmerFileTest){       
-                               bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
-                               if (GoodFile) {  needToGenerate = false;        }
-                       }
-               }
-               else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-               else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch);   }
-               else if(method == "distance")   {       database = new DistanceDB();    }
-               else {
-                       m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
-                       m->mothurOutEndLine();
-                       database = new KmerDB(tempFile, 8);
-               }
-               
-               if (needToGenerate) {
-                       ifstream fastaFile;
-                       m->openInputFile(tempFile, fastaFile);
+                       m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine();  
                        
-                       while (!fastaFile.eof()) {
-                               Sequence temp(fastaFile);
-                               m->gobble(fastaFile);
+               }else {
                        
-                               names.push_back(temp.getName());
-                                                       
-                               database->addSequence(temp);    
-                       }
-                       fastaFile.close();
+                       templateFile = tempFile;        
+                       
+                       int start = time(NULL);
+                       
+                       m->mothurOut("Generating search database...    "); cout.flush();
+       #ifdef USE_MPI  
+                               int pid, processors;
+                               vector<unsigned long long> positions;
+                               int tag = 2001;
+                       
+                               MPI_Status status; 
+                               MPI_File inMPI;
+                               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+                               MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+                               //char* inFileName = new char[tempFile.length()];
+                               //memcpy(inFileName, tempFile.c_str(), tempFile.length());
+                               
+                               char inFileName[1024];
+                               strcpy(inFileName, tempFile.c_str());
+
+                               MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+                               //delete inFileName;
 
-                       database->generateDB();
+                               if (pid == 0) { //only one process needs to scan file
+                                       positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
+
+                                       //send file positions to all processes
+                                       for(int i = 1; i < processors; i++) { 
+                                               MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+                                               MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                                       }
+                               }else{
+                                       MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+                                       positions.resize(numSeqs+1);
+                                       MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
+                               }
+                               
+                               //create database
+                               if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
+                               else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
+                               else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", pid);  }
+                               else if(method == "distance")   {       database = new DistanceDB();    }
+                               else {
+                                       m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
+                                       database = new KmerDB(tempFile, 8);
+                               }
+
+                               //read file 
+                               for(int i=0;i<numSeqs;i++){
+                                       //read next sequence
+                                       int length = positions[i+1] - positions[i];
+                                       char* buf4 = new char[length];
+                                       MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+                                       
+                                       string tempBuf = buf4;
+                                       if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+                                       delete buf4;
+                                       istringstream iss (tempBuf,istringstream::in);
+                                       
+                                       Sequence temp(iss);  
+                                       if (temp.getName() != "") {
+                                               if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
+                                               names.push_back(temp.getName());
+                                               database->addSequence(temp);    
+                                       }
+                               }
+                               
+                               database->generateDB();
+                               MPI_File_close(&inMPI);
+                               MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+               #else
                        
-               }else if ((method == "kmer") && (!needToGenerate)) {    
-                       ifstream kmerFileTest(kmerDBName.c_str());
-                       database->readKmerDB(kmerFileTest);     
-               
-                       ifstream fastaFile;
-                       m->openInputFile(tempFile, fastaFile);
+                       //need to know number of template seqs for suffixdb
+                       if (method == "suffix") {
+                               ifstream inFASTA;
+                               m->openInputFile(tempFile, inFASTA);
+                               m->getNumSeqs(inFASTA, numSeqs);
+                               inFASTA.close();
+                       }
+
+                       bool needToGenerate = true;
+                       string kmerDBName;
+                       if(method == "kmer")                    {       
+                               database = new KmerDB(tempFile, kmerSize);                      
+                               
+                               kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                               ifstream kmerFileTest(kmerDBName.c_str());
+                               if(kmerFileTest){       
+                                       bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
+                                       if (GoodFile) {  needToGenerate = false;        }
+                               }
+                       }
+                       else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
+                       else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID);     }
+                       else if(method == "distance")   {       database = new DistanceDB();    }
+                       else {
+                               m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+                               m->mothurOutEndLine();
+                               database = new KmerDB(tempFile, 8);
+                       }
                        
-                       while (!fastaFile.eof()) {
-                               Sequence temp(fastaFile);
-                               m->gobble(fastaFile);
+                       if (needToGenerate) {
+                               ifstream fastaFile;
+                               m->openInputFile(tempFile, fastaFile);
+                               
+                               while (!fastaFile.eof()) {
+                                       Sequence temp(fastaFile);
+                                       m->gobble(fastaFile);
+                                       
+                                       if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
+                                       
+                                       names.push_back(temp.getName());
+                                                               
+                                       database->addSequence(temp);    
+                               }
+                               fastaFile.close();
 
-                               names.push_back(temp.getName());
+                if ((method == "kmer") && (!shortcuts)) {;} //don't print
+                else {database->generateDB(); } 
+                               
+                       }else if ((method == "kmer") && (!needToGenerate)) {    
+                               ifstream kmerFileTest(kmerDBName.c_str());
+                               database->readKmerDB(kmerFileTest);     
+                       
+                               ifstream fastaFile;
+                               m->openInputFile(tempFile, fastaFile);
+                               
+                               while (!fastaFile.eof()) {
+                                       Sequence temp(fastaFile);
+                                       m->gobble(fastaFile);
+                                       
+                                       if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
+                                       names.push_back(temp.getName());
+                               }
+                               fastaFile.close();
                        }
-                       fastaFile.close();
+       #endif  
+               
+                       database->setNumSeqs(names.size());
+                       
+                       m->mothurOut("DONE."); m->mothurOutEndLine();
+                       m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
                }
-#endif 
-       
-               database->setNumSeqs(names.size());
-               
-               //sanity check
-               bool okay = phyloTree->ErrorCheck(names);
-               
-               if (!okay) { m->control_pressed = true; }
-               
-               m->mothurOut("DONE."); m->mothurOutEndLine();
-               m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
-
+        
+        readTaxonomy(taxFile);
+        
+        //sanity check
+        bool okay = phyloTree->ErrorCheck(names);
+        
+        if (!okay) { m->control_pressed = true; }
        }
        catch(exception& e) {
                m->errorOut(e, "Classify", "generateDatabaseAndNames");
@@ -172,7 +236,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me
        }
 }
 /**************************************************************************************************/
-Classify::Classify() {         m = MothurOut::getInstance();   database = NULL;        }
+Classify::Classify() {         m = MothurOut::getInstance();   database = NULL;        phyloTree=NULL; flipped=false; }
 /**************************************************************************************************/
 
 int Classify::readTaxonomy(string file) {
@@ -183,19 +247,17 @@ int Classify::readTaxonomy(string file) {
                
                m->mothurOutEndLine();
                m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
-
+        if (m->debug) { m->mothurOut("[DEBUG]: Taxonomies read in...\n"); }
+        
 #ifdef USE_MPI 
                int pid, num, processors;
-               vector<unsigned long int> positions;
+               vector<unsigned long long> positions;
                int tag = 2001;
                
                MPI_Status status; 
                MPI_File inMPI;
                MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
                MPI_Comm_size(MPI_COMM_WORLD, &processors);
-
-               //char* inFileName = new char[file.length()];
-               //memcpy(inFileName, file.c_str(), file.length());
                
                char inFileName[1024];
                strcpy(inFileName, file.c_str());
@@ -230,30 +292,34 @@ int Classify::readTaxonomy(string file) {
                        delete buf4;
 
                        istringstream iss (tempBuf,istringstream::in);
-                       iss >> name >> taxInfo;
-                       taxonomy[name] = taxInfo;
-                       phyloTree->addSeqToTree(name, taxInfo);
-               }
+                       iss >> name; m->gobble(iss);
+            iss >> taxInfo;
+            if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); }
+                       if (m->inUsersGroups(name, names)) {
+                taxonomy[name] = taxInfo;
+                phyloTree->addSeqToTree(name, taxInfo);
+            }else {
+                m->mothurOut("[WARNING]: " + name + " is in your taxonomy file and not in your reference file, ignoring.\n");
+            }          
+        }
                
                MPI_File_close(&inMPI);
                MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
-#else                          
-               ifstream inTax;
-               m->openInputFile(file, inTax);
-       
-               //read template seqs and save
-               while (!inTax.eof()) {
-                       inTax >> name >> taxInfo;
-                       
-                       taxonomy[name] = taxInfo;
-                       
-                       phyloTree->addSeqToTree(name, taxInfo);
-               
-                       m->gobble(inTax);
-               }
-               inTax.close();
+#else  
+        
+        taxonomy.clear(); 
+        m->readTax(file, taxonomy);
+        map<string, string> tempTaxonomy;
+        for (map<string, string>::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) {  
+            if (m->inUsersGroups(itTax->first, names)) {
+                phyloTree->addSeqToTree(itTax->first, itTax->second); 
+                tempTaxonomy[itTax->first] = itTax->second;
+            }else {
+                m->mothurOut("[WARNING]: " + itTax->first + " is in your taxonomy file and not in your reference file, ignoring.\n");
+            }
+        }
+        taxonomy = tempTaxonomy;
 #endif 
-       
                phyloTree->assignHeirarchyIDs(0);
                
                phyloTree->setUp(file);
@@ -297,3 +363,37 @@ vector<string> Classify::parseTax(string tax) {
 }
 /**************************************************************************************************/
 
+double Classify::getLogExpSum(vector<double> probabilities, int& maxIndex){
+       try {
+        //     http://jblevins.org/notes/log-sum-exp
+        
+        double maxProb = probabilities[0];
+        maxIndex = 0;
+        
+        int numProbs = (int)probabilities.size();
+        
+        for(int i=1;i<numProbs;i++){
+            if(probabilities[i] >= maxProb){
+                maxProb = probabilities[i];
+                maxIndex = i;
+            }
+        }
+        
+        double probSum = 0.0000;
+        
+        for(int i=0;i<numProbs;i++){
+            probSum += exp(probabilities[i] - maxProb);                
+        }
+        
+        probSum = log(probSum) + maxProb;
+        
+        return probSum;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Classify", "getLogExpSum");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+