]> git.donarmstrong.com Git - mothur.git/blobdiff - classify.cpp
added citation function to commands
[mothur.git] / classify.cpp
index 557f17c6b85c7eb61865591ad3a8481ffa38ed38..925dedacbf0edee93af4d89b4cf6386169b6cfc0 100644 (file)
 #include "distancedb.hpp"
 
 /**************************************************************************************************/
-Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {         
+void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch)  {            
        try {   
-               m = MothurOut::getInstance();                                                                   
+               taxFile = tfile;
                readTaxonomy(taxFile);  
                
+               templateFile = tempFile;        
+               
                int start = time(NULL);
                int numSeqs = 0;
                
                m->mothurOut("Generating search database...    "); cout.flush();
 #ifdef USE_MPI 
-                       int pid;
-                       vector<long> positions;
+                       int pid, processors;
+                       vector<unsigned long int> positions;
+                       int tag = 2001;
                
                        MPI_Status status; 
                        MPI_File inMPI;
                        MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-       
-                       char inFileName[tempFile.length()];
+                       MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+                       //char* inFileName = new char[tempFile.length()];
+                       //memcpy(inFileName, tempFile.c_str(), tempFile.length());
+                       
+                       char inFileName[1024];
                        strcpy(inFileName, tempFile.c_str());
-       
+
                        MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
-       
+                       //delete inFileName;
+
                        if (pid == 0) { //only one process needs to scan file
-                               positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
+                               positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
 
                                //send file positions to all processes
-                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
-                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     
+                               for(int i = 1; i < processors; i++) { 
+                                       MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+                                       MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                               }
                        }else{
-                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
-                               positions.resize(numSeqs);
-                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+                               MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+                               positions.resize(numSeqs+1);
+                               MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
                        }
                        
                        //create database
                        if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
                        else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-                       else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
+                       else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch);   }
                        else if(method == "distance")   {       database = new DistanceDB();    }
                        else {
                                m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
@@ -63,12 +73,12 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f
                        for(int i=0;i<numSeqs;i++){
                                //read next sequence
                                int length = positions[i+1] - positions[i];
-                               char buf4[length];
+                               char* buf4 = new char[length];
                                MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
                                
                                string tempBuf = buf4;
                                if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-                               
+                               delete buf4;
                                istringstream iss (tempBuf,istringstream::in);
                                
                                Sequence temp(iss);  
@@ -80,13 +90,14 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f
                        
                        database->generateDB();
                        MPI_File_close(&inMPI);
+                       MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
        #else
                
                //need to know number of template seqs for suffixdb
                if (method == "suffix") {
                        ifstream inFASTA;
-                       openInputFile(tempFile, inFASTA);
-                       numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+                       m->openInputFile(tempFile, inFASTA);
+                       m->getNumSeqs(inFASTA, numSeqs);
                        inFASTA.close();
                }
 
@@ -97,10 +108,13 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f
                        
                        kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
                        ifstream kmerFileTest(kmerDBName.c_str());
-                       if(kmerFileTest){       needToGenerate = false;         }
+                       if(kmerFileTest){       
+                               bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
+                               if (GoodFile) {  needToGenerate = false;        }
+                       }
                }
                else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-               else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
+               else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch);   }
                else if(method == "distance")   {       database = new DistanceDB();    }
                else {
                        m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
@@ -110,14 +124,14 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f
                
                if (needToGenerate) {
                        ifstream fastaFile;
-                       openInputFile(tempFile, fastaFile);
+                       m->openInputFile(tempFile, fastaFile);
                        
                        while (!fastaFile.eof()) {
                                Sequence temp(fastaFile);
-                               gobble(fastaFile);
+                               m->gobble(fastaFile);
                        
                                names.push_back(temp.getName());
-                                                               
+                                                       
                                database->addSequence(temp);    
                        }
                        fastaFile.close();
@@ -127,33 +141,41 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, f
                }else if ((method == "kmer") && (!needToGenerate)) {    
                        ifstream kmerFileTest(kmerDBName.c_str());
                        database->readKmerDB(kmerFileTest);     
-                       
+               
                        ifstream fastaFile;
-                       openInputFile(tempFile, fastaFile);
+                       m->openInputFile(tempFile, fastaFile);
                        
                        while (!fastaFile.eof()) {
                                Sequence temp(fastaFile);
-                               gobble(fastaFile);
-                       
+                               m->gobble(fastaFile);
+
                                names.push_back(temp.getName());
                        }
                        fastaFile.close();
                }
-#endif         
+#endif 
+       
                database->setNumSeqs(names.size());
                
+               //sanity check
+               bool okay = phyloTree->ErrorCheck(names);
+               
+               if (!okay) { m->control_pressed = true; }
+               
                m->mothurOut("DONE."); m->mothurOutEndLine();
                m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
 
        }
        catch(exception& e) {
-               m->errorOut(e, "Classify", "Classify");
+               m->errorOut(e, "Classify", "generateDatabaseAndNames");
                exit(1);
        }
 }
 /**************************************************************************************************/
+Classify::Classify() {         m = MothurOut::getInstance();   database = NULL;        } // Default constructor: stores the MothurOut::getInstance() handle in m and sets database to NULL; in the visible code, database is allocated later inside generateDatabaseAndNames().
+/**************************************************************************************************/
 
-void Classify::readTaxonomy(string file) {
+int Classify::readTaxonomy(string file) {
        try {
                
                phyloTree = new PhyloTree();
@@ -163,41 +185,50 @@ void Classify::readTaxonomy(string file) {
                m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
 
 #ifdef USE_MPI 
-               int pid, num;
-               vector<long> positions;
+               int pid, num, processors;
+               vector<unsigned long int> positions;
+               int tag = 2001;
                
                MPI_Status status; 
                MPI_File inMPI;
                MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+               MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+               //char* inFileName = new char[file.length()];
+               //memcpy(inFileName, file.c_str(), file.length());
                
-               char inFileName[file.length()];
+               char inFileName[1024];
                strcpy(inFileName, file.c_str());
-               
+
                MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
-               
+               //delete inFileName;
+
                if (pid == 0) {
-                       positions = setFilePosEachLine(file, num);
+                       positions = m->setFilePosEachLine(file, num);
                        
                        //send file positions to all processes
-                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
-                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
+                       for(int i = 1; i < processors; i++) { 
+                               MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+                               MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                       }
                }else{
-                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
-                       positions.resize(num);
-                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+                       MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+                       positions.resize(num+1);
+                       MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
                }
        
                //read file 
                for(int i=0;i<num;i++){
                        //read next sequence
                        int length = positions[i+1] - positions[i];
-                       char buf4[length];
+                       char* buf4 = new char[length];
 
                        MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
 
                        string tempBuf = buf4;
                        if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-                       
+                       delete buf4;
+
                        istringstream iss (tempBuf,istringstream::in);
                        iss >> name >> taxInfo;
                        taxonomy[name] = taxInfo;
@@ -205,9 +236,10 @@ void Classify::readTaxonomy(string file) {
                }
                
                MPI_File_close(&inMPI);
+               MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
 #else                          
                ifstream inTax;
-               openInputFile(file, inTax);
+               m->openInputFile(file, inTax);
        
                //read template seqs and save
                while (!inTax.eof()) {
@@ -217,15 +249,19 @@ void Classify::readTaxonomy(string file) {
                        
                        phyloTree->addSeqToTree(name, taxInfo);
                
-                       gobble(inTax);
+                       m->gobble(inTax);
                }
                inTax.close();
 #endif 
        
                phyloTree->assignHeirarchyIDs(0);
                
+               phyloTree->setUp(file);
+       
                m->mothurOut("DONE.");
                m->mothurOutEndLine();  cout.flush();
+               
+               return phyloTree->getNumSeqs();
        
        }
        catch(exception& e) {