]> git.donarmstrong.com Git - mothur.git/blobdiff - classify.cpp
1.9
[mothur.git] / classify.cpp
index 557f17c6b85c7eb61865591ad3a8481ffa38ed38..f09cbf5da8a59b9446e91542c7b48a5a1fd22bc3 100644 (file)
-/*
- *  classify.cpp
- *  Mothur
- *
- *  Created by westcott on 11/3/09.
- *  Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "classify.h"
-#include "sequence.hpp"
-#include "kmerdb.hpp"
-#include "suffixdb.hpp"
-#include "blastdb.hpp"
-#include "distancedb.hpp"
-
-/**************************************************************************************************/
-Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {         
-       try {   
-               m = MothurOut::getInstance();                                                                   
-               readTaxonomy(taxFile);  
-               
-               int start = time(NULL);
-               int numSeqs = 0;
-               
-               m->mothurOut("Generating search database...    "); cout.flush();
-#ifdef USE_MPI 
-                       int pid;
-                       vector<long> positions;
-               
-                       MPI_Status status; 
-                       MPI_File inMPI;
-                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-       
-                       char inFileName[tempFile.length()];
-                       strcpy(inFileName, tempFile.c_str());
-       
-                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
-       
-                       if (pid == 0) { //only one process needs to scan file
-                               positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
-
-                               //send file positions to all processes
-                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
-                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     
-                       }else{
-                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
-                               positions.resize(numSeqs);
-                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
-                       }
-                       
-                       //create database
-                       if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
-                       else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-                       else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
-                       else if(method == "distance")   {       database = new DistanceDB();    }
-                       else {
-                               m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
-                               database = new KmerDB(tempFile, 8);
-                       }
-
-                       //read file 
-                       for(int i=0;i<numSeqs;i++){
-                               //read next sequence
-                               int length = positions[i+1] - positions[i];
-                               char buf4[length];
-                               MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
-                               
-                               string tempBuf = buf4;
-                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-                               
-                               istringstream iss (tempBuf,istringstream::in);
-                               
-                               Sequence temp(iss);  
-                               if (temp.getName() != "") {
-                                       names.push_back(temp.getName());
-                                       database->addSequence(temp);    
-                               }
-                       }
-                       
-                       database->generateDB();
-                       MPI_File_close(&inMPI);
-       #else
-               
-               //need to know number of template seqs for suffixdb
-               if (method == "suffix") {
-                       ifstream inFASTA;
-                       openInputFile(tempFile, inFASTA);
-                       numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
-                       inFASTA.close();
-               }
-
-               bool needToGenerate = true;
-               string kmerDBName;
-               if(method == "kmer")                    {       
-                       database = new KmerDB(tempFile, kmerSize);                      
-                       
-                       kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
-                       ifstream kmerFileTest(kmerDBName.c_str());
-                       if(kmerFileTest){       needToGenerate = false;         }
-               }
-               else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
-               else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
-               else if(method == "distance")   {       database = new DistanceDB();    }
-               else {
-                       m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
-                       m->mothurOutEndLine();
-                       database = new KmerDB(tempFile, 8);
-               }
-               
-               if (needToGenerate) {
-                       ifstream fastaFile;
-                       openInputFile(tempFile, fastaFile);
-                       
-                       while (!fastaFile.eof()) {
-                               Sequence temp(fastaFile);
-                               gobble(fastaFile);
-                       
-                               names.push_back(temp.getName());
-                                                               
-                               database->addSequence(temp);    
-                       }
-                       fastaFile.close();
-
-                       database->generateDB();
-                       
-               }else if ((method == "kmer") && (!needToGenerate)) {    
-                       ifstream kmerFileTest(kmerDBName.c_str());
-                       database->readKmerDB(kmerFileTest);     
-                       
-                       ifstream fastaFile;
-                       openInputFile(tempFile, fastaFile);
-                       
-                       while (!fastaFile.eof()) {
-                               Sequence temp(fastaFile);
-                               gobble(fastaFile);
-                       
-                               names.push_back(temp.getName());
-                       }
-                       fastaFile.close();
-               }
-#endif         
-               database->setNumSeqs(names.size());
-               
-               m->mothurOut("DONE."); m->mothurOutEndLine();
-               m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
-
-       }
-       catch(exception& e) {
-               m->errorOut(e, "Classify", "Classify");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-
-void Classify::readTaxonomy(string file) {
-       try {
-               
-               phyloTree = new PhyloTree();
-               string name, taxInfo;
-               
-               m->mothurOutEndLine();
-               m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
-
-#ifdef USE_MPI 
-               int pid, num;
-               vector<long> positions;
-               
-               MPI_Status status; 
-               MPI_File inMPI;
-               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-               
-               char inFileName[file.length()];
-               strcpy(inFileName, file.c_str());
-               
-               MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
-               
-               if (pid == 0) {
-                       positions = setFilePosEachLine(file, num);
-                       
-                       //send file positions to all processes
-                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
-                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
-               }else{
-                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
-                       positions.resize(num);
-                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
-               }
-       
-               //read file 
-               for(int i=0;i<num;i++){
-                       //read next sequence
-                       int length = positions[i+1] - positions[i];
-                       char buf4[length];
-
-                       MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
-
-                       string tempBuf = buf4;
-                       if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-                       
-                       istringstream iss (tempBuf,istringstream::in);
-                       iss >> name >> taxInfo;
-                       taxonomy[name] = taxInfo;
-                       phyloTree->addSeqToTree(name, taxInfo);
-               }
-               
-               MPI_File_close(&inMPI);
-#else                          
-               ifstream inTax;
-               openInputFile(file, inTax);
-       
-               //read template seqs and save
-               while (!inTax.eof()) {
-                       inTax >> name >> taxInfo;
-                       
-                       taxonomy[name] = taxInfo;
-                       
-                       phyloTree->addSeqToTree(name, taxInfo);
-               
-                       gobble(inTax);
-               }
-               inTax.close();
-#endif 
-       
-               phyloTree->assignHeirarchyIDs(0);
-               
-               m->mothurOut("DONE.");
-               m->mothurOutEndLine();  cout.flush();
-       
-       }
-       catch(exception& e) {
-               m->errorOut(e, "Classify", "readTaxonomy");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-
-vector<string> Classify::parseTax(string tax) {
-       try {
-               vector<string> taxons;
-               
-               tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
-       
-               //parse taxonomy
-               string individual;
-               while (tax.find_first_of(';') != -1) {
-                       individual = tax.substr(0,tax.find_first_of(';'));
-                       tax = tax.substr(tax.find_first_of(';')+1, tax.length());
-                       taxons.push_back(individual);
-                       
-               }
-               //get last one
-               taxons.push_back(tax);
-               
-               return taxons;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "Classify", "parseTax");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-
+/*\r
+ *  classify.cpp\r
+ *  Mothur\r
+ *\r
+ *  Created by westcott on 11/3/09.\r
+ *  Copyright 2009 Schloss Lab. All rights reserved.\r
+ *\r
+ */\r
+\r
+#include "classify.h"\r
+#include "sequence.hpp"\r
+#include "kmerdb.hpp"\r
+#include "suffixdb.hpp"\r
+#include "blastdb.hpp"\r
+#include "distancedb.hpp"\r
+\r
+/**************************************************************************************************/\r
+Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {         \r
+       try {   \r
+               m = MothurOut::getInstance();                                                                   \r
+               readTaxonomy(taxFile);  \r
+               \r
+               int start = time(NULL);\r
+               int numSeqs = 0;\r
+               \r
+               m->mothurOut("Generating search database...    "); cout.flush();\r
+#ifdef USE_MPI \r
+                       int pid;\r
+                       vector<long> positions;\r
+               \r
+                       MPI_Status status; \r
+                       MPI_File inMPI;\r
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
+\r
+                       char* inFileName = new char[tempFile.length()];\r
+                       memcpy(inFileName, tempFile.c_str(), tempFile.length());\r
+       \r
+                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer\r
+                       delete inFileName;\r
+\r
+                       if (pid == 0) { //only one process needs to scan file\r
+                               positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs\r
+\r
+                               //send file positions to all processes\r
+                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs\r
+                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     \r
+                       }else{\r
+                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
+                               positions.resize(numSeqs);\r
+                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
+                       }\r
+                       \r
+                       //create database\r
+                       if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }\r
+                       else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }\r
+                       else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }\r
+                       else if(method == "distance")   {       database = new DistanceDB();    }\r
+                       else {\r
+                               m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();\r
+                               database = new KmerDB(tempFile, 8);\r
+                       }\r
+\r
+                       //read file \r
+                       for(int i=0;i<numSeqs;i++){\r
+                               //read next sequence\r
+                               int length = positions[i+1] - positions[i];\r
+                               char* buf4 = new char[length];\r
+                               MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
+                               \r
+                               string tempBuf = buf4;\r
+                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
+                               delete buf4;\r
+                               istringstream iss (tempBuf,istringstream::in);\r
+                               \r
+                               Sequence temp(iss);  \r
+                               if (temp.getName() != "") {\r
+                                       names.push_back(temp.getName());\r
+                                       database->addSequence(temp);    \r
+                               }\r
+                       }\r
+                       \r
+                       database->generateDB();\r
+                       MPI_File_close(&inMPI);\r
+       #else\r
+               \r
+               //need to know number of template seqs for suffixdb\r
+               if (method == "suffix") {\r
+                       ifstream inFASTA;\r
+                       openInputFile(tempFile, inFASTA);\r
+                       numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');\r
+                       inFASTA.close();\r
+               }\r
+\r
+               bool needToGenerate = true;\r
+               string kmerDBName;\r
+               if(method == "kmer")                    {       \r
+                       database = new KmerDB(tempFile, kmerSize);                      \r
+                       \r
+                       kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";\r
+                       ifstream kmerFileTest(kmerDBName.c_str());\r
+                       if(kmerFileTest){       needToGenerate = false;         }\r
+               }\r
+               else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }\r
+               else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }\r
+               else if(method == "distance")   {       database = new DistanceDB();    }\r
+               else {\r
+                       m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");\r
+                       m->mothurOutEndLine();\r
+                       database = new KmerDB(tempFile, 8);\r
+               }\r
+               \r
+               if (needToGenerate) {\r
+                       ifstream fastaFile;\r
+                       openInputFile(tempFile, fastaFile);\r
+                       \r
+                       while (!fastaFile.eof()) {\r
+                               Sequence temp(fastaFile);\r
+                               gobble(fastaFile);\r
+                       \r
+                               names.push_back(temp.getName());\r
+                                                               \r
+                               database->addSequence(temp);    \r
+                       }\r
+                       fastaFile.close();\r
+\r
+                       database->generateDB();\r
+                       \r
+               }else if ((method == "kmer") && (!needToGenerate)) {    \r
+                       ifstream kmerFileTest(kmerDBName.c_str());\r
+                       database->readKmerDB(kmerFileTest);     \r
+                       \r
+                       ifstream fastaFile;\r
+                       openInputFile(tempFile, fastaFile);\r
+                       \r
+                       while (!fastaFile.eof()) {\r
+                               Sequence temp(fastaFile);\r
+                               gobble(fastaFile);\r
+                       \r
+                               names.push_back(temp.getName());\r
+                       }\r
+                       fastaFile.close();\r
+               }\r
+#endif         \r
+               database->setNumSeqs(names.size());\r
+               \r
+               m->mothurOut("DONE."); m->mothurOutEndLine();\r
+               m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();\r
+\r
+       }\r
+       catch(exception& e) {\r
+               m->errorOut(e, "Classify", "Classify");\r
+               exit(1);\r
+       }\r
+}\r
+/**************************************************************************************************/\r
+\r
+void Classify::readTaxonomy(string file) {\r
+       try {\r
+               \r
+               phyloTree = new PhyloTree();\r
+               string name, taxInfo;\r
+               \r
+               m->mothurOutEndLine();\r
+               m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();\r
+\r
+#ifdef USE_MPI \r
+               int pid, num;\r
+               vector<long> positions;\r
+               \r
+               MPI_Status status; \r
+               MPI_File inMPI;\r
+               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
+\r
+               char* inFileName = new char[file.length()];\r
+               memcpy(inFileName, file.c_str(), file.length());\r
+               \r
+               MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer\r
+               delete inFileName;\r
+\r
+               if (pid == 0) {\r
+                       positions = setFilePosEachLine(file, num);\r
+                       \r
+                       //send file positions to all processes\r
+                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs\r
+                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
+               }else{\r
+                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
+                       positions.resize(num);\r
+                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
+               }\r
+       \r
+               //read file \r
+               for(int i=0;i<num;i++){\r
+                       //read next sequence\r
+                       int length = positions[i+1] - positions[i];\r
+                       char* buf4 = new char[length];\r
+\r
+                       MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
+\r
+                       string tempBuf = buf4;\r
+                       if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
+                       delete buf4;\r
+\r
+                       istringstream iss (tempBuf,istringstream::in);\r
+                       iss >> name >> taxInfo;\r
+                       taxonomy[name] = taxInfo;\r
+                       phyloTree->addSeqToTree(name, taxInfo);\r
+               }\r
+               \r
+               MPI_File_close(&inMPI);\r
+#else                          \r
+               ifstream inTax;\r
+               openInputFile(file, inTax);\r
+       \r
+               //read template seqs and save\r
+               while (!inTax.eof()) {\r
+                       inTax >> name >> taxInfo;\r
+                       \r
+                       taxonomy[name] = taxInfo;\r
+                       \r
+                       phyloTree->addSeqToTree(name, taxInfo);\r
+               \r
+                       gobble(inTax);\r
+               }\r
+               inTax.close();\r
+#endif \r
+       \r
+               phyloTree->assignHeirarchyIDs(0);\r
+               \r
+               m->mothurOut("DONE.");\r
+               m->mothurOutEndLine();  cout.flush();\r
+       \r
+       }\r
+       catch(exception& e) {\r
+               m->errorOut(e, "Classify", "readTaxonomy");\r
+               exit(1);\r
+       }\r
+}\r
+/**************************************************************************************************/\r
+\r
+vector<string> Classify::parseTax(string tax) {\r
+       try {\r
+               vector<string> taxons;\r
+               \r
+               tax = tax.substr(0, tax.length()-1);  //get rid of last ';'\r
+       \r
+               //parse taxonomy\r
+               string individual;\r
+               while (tax.find_first_of(';') != -1) {\r
+                       individual = tax.substr(0,tax.find_first_of(';'));\r
+                       tax = tax.substr(tax.find_first_of(';')+1, tax.length());\r
+                       taxons.push_back(individual);\r
+                       \r
+               }\r
+               //get last one\r
+               taxons.push_back(tax);\r
+               \r
+               return taxons;\r
+       }\r
+       catch(exception& e) {\r
+               m->errorOut(e, "Classify", "parseTax");\r
+               exit(1);\r
+       }\r
+}\r
+/**************************************************************************************************/\r
+\r