-/*\r
- * classify.cpp\r
- * Mothur\r
- *\r
- * Created by westcott on 11/3/09.\r
- * Copyright 2009 Schloss Lab. All rights reserved.\r
- *\r
- */\r
-\r
-#include "classify.h"\r
-#include "sequence.hpp"\r
-#include "kmerdb.hpp"\r
-#include "suffixdb.hpp"\r
-#include "blastdb.hpp"\r
-#include "distancedb.hpp"\r
-\r
-/**************************************************************************************************/\r
-Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) { \r
- try { \r
- m = MothurOut::getInstance(); \r
- readTaxonomy(taxFile); \r
- \r
- int start = time(NULL);\r
- int numSeqs = 0;\r
- \r
- m->mothurOut("Generating search database... "); cout.flush();\r
-#ifdef USE_MPI \r
- int pid;\r
- vector<long> positions;\r
- \r
- MPI_Status status; \r
- MPI_File inMPI;\r
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
-\r
- //char* inFileName = new char[tempFile.length()];\r
- //memcpy(inFileName, tempFile.c_str(), tempFile.length());\r
- \r
- char inFileName[1024];\r
- strcpy(inFileName, tempFile.c_str());\r
-\r
- MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer\r
- //delete inFileName;\r
-\r
- if (pid == 0) { //only one process needs to scan file\r
- positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs\r
-\r
- //send file positions to all processes\r
- MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs\r
- MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
- }else{\r
- MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
- positions.resize(numSeqs);\r
- MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
- }\r
- \r
- //create database\r
- if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }\r
- else if(method == "suffix") { database = new SuffixDB(numSeqs); }\r
- else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }\r
- else if(method == "distance") { database = new DistanceDB(); }\r
- else {\r
- m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();\r
- database = new KmerDB(tempFile, 8);\r
- }\r
-\r
- //read file \r
- for(int i=0;i<numSeqs;i++){\r
- //read next sequence\r
- int length = positions[i+1] - positions[i];\r
- char* buf4 = new char[length];\r
- MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
- \r
- string tempBuf = buf4;\r
- if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
- delete buf4;\r
- istringstream iss (tempBuf,istringstream::in);\r
- \r
- Sequence temp(iss); \r
- if (temp.getName() != "") {\r
- names.push_back(temp.getName());\r
- database->addSequence(temp); \r
- }\r
- }\r
- \r
- database->generateDB();\r
- MPI_File_close(&inMPI);\r
- #else\r
- \r
- //need to know number of template seqs for suffixdb\r
- if (method == "suffix") {\r
- ifstream inFASTA;\r
- openInputFile(tempFile, inFASTA);\r
- numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');\r
- inFASTA.close();\r
- }\r
-\r
- bool needToGenerate = true;\r
- string kmerDBName;\r
- if(method == "kmer") { \r
- database = new KmerDB(tempFile, kmerSize); \r
- \r
- kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";\r
- ifstream kmerFileTest(kmerDBName.c_str());\r
- if(kmerFileTest){ needToGenerate = false; }\r
- }\r
- else if(method == "suffix") { database = new SuffixDB(numSeqs); }\r
- else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }\r
- else if(method == "distance") { database = new DistanceDB(); }\r
- else {\r
- m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");\r
- m->mothurOutEndLine();\r
- database = new KmerDB(tempFile, 8);\r
- }\r
- \r
- if (needToGenerate) {\r
- ifstream fastaFile;\r
- openInputFile(tempFile, fastaFile);\r
- \r
- while (!fastaFile.eof()) {\r
- Sequence temp(fastaFile);\r
- gobble(fastaFile);\r
- \r
- names.push_back(temp.getName());\r
- \r
- database->addSequence(temp); \r
- }\r
- fastaFile.close();\r
-\r
- database->generateDB();\r
- \r
- }else if ((method == "kmer") && (!needToGenerate)) { \r
- ifstream kmerFileTest(kmerDBName.c_str());\r
- database->readKmerDB(kmerFileTest); \r
- \r
- ifstream fastaFile;\r
- openInputFile(tempFile, fastaFile);\r
- \r
- while (!fastaFile.eof()) {\r
- Sequence temp(fastaFile);\r
- gobble(fastaFile);\r
- \r
- names.push_back(temp.getName());\r
- }\r
- fastaFile.close();\r
- }\r
-#endif \r
- database->setNumSeqs(names.size());\r
- \r
- m->mothurOut("DONE."); m->mothurOutEndLine();\r
- m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();\r
-\r
- }\r
- catch(exception& e) {\r
- m->errorOut(e, "Classify", "Classify");\r
- exit(1);\r
- }\r
-}\r
-/**************************************************************************************************/\r
-\r
-void Classify::readTaxonomy(string file) {\r
- try {\r
- \r
- phyloTree = new PhyloTree();\r
- string name, taxInfo;\r
- \r
- m->mothurOutEndLine();\r
- m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();\r
-\r
-#ifdef USE_MPI \r
- int pid, num;\r
- vector<long> positions;\r
- \r
- MPI_Status status; \r
- MPI_File inMPI;\r
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
-\r
- //char* inFileName = new char[file.length()];\r
- //memcpy(inFileName, file.c_str(), file.length());\r
- \r
- char inFileName[1024];\r
- strcpy(inFileName, file.c_str());\r
-\r
- MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer\r
- //delete inFileName;\r
-\r
- if (pid == 0) {\r
- positions = setFilePosEachLine(file, num);\r
- \r
- //send file positions to all processes\r
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs\r
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
- }else{\r
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
- positions.resize(num);\r
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
- }\r
- \r
- //read file \r
- for(int i=0;i<num;i++){\r
- //read next sequence\r
- int length = positions[i+1] - positions[i];\r
- char* buf4 = new char[length];\r
-\r
- MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
-\r
- string tempBuf = buf4;\r
- if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
- delete buf4;\r
-\r
- istringstream iss (tempBuf,istringstream::in);\r
- iss >> name >> taxInfo;\r
- taxonomy[name] = taxInfo;\r
- phyloTree->addSeqToTree(name, taxInfo);\r
- }\r
- \r
- MPI_File_close(&inMPI);\r
-#else \r
- ifstream inTax;\r
- openInputFile(file, inTax);\r
- \r
- //read template seqs and save\r
- while (!inTax.eof()) {\r
- inTax >> name >> taxInfo;\r
- \r
- taxonomy[name] = taxInfo;\r
- \r
- phyloTree->addSeqToTree(name, taxInfo);\r
- \r
- gobble(inTax);\r
- }\r
- inTax.close();\r
-#endif \r
- \r
- phyloTree->assignHeirarchyIDs(0);\r
- \r
- m->mothurOut("DONE.");\r
- m->mothurOutEndLine(); cout.flush();\r
- \r
- }\r
- catch(exception& e) {\r
- m->errorOut(e, "Classify", "readTaxonomy");\r
- exit(1);\r
- }\r
-}\r
-/**************************************************************************************************/\r
-\r
-vector<string> Classify::parseTax(string tax) {\r
- try {\r
- vector<string> taxons;\r
- \r
- tax = tax.substr(0, tax.length()-1); //get rid of last ';'\r
- \r
- //parse taxonomy\r
- string individual;\r
- while (tax.find_first_of(';') != -1) {\r
- individual = tax.substr(0,tax.find_first_of(';'));\r
- tax = tax.substr(tax.find_first_of(';')+1, tax.length());\r
- taxons.push_back(individual);\r
- \r
- }\r
- //get last one\r
- taxons.push_back(tax);\r
- \r
- return taxons;\r
- }\r
- catch(exception& e) {\r
- m->errorOut(e, "Classify", "parseTax");\r
- exit(1);\r
- }\r
-}\r
-/**************************************************************************************************/\r
-\r
+/*
+ * classify.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/3/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classify.h"
+#include "sequence.hpp"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+#include "distancedb.hpp"
+
+/**************************************************************************************************/
+void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) {
+ try {
+ taxFile = tfile;
+ readTaxonomy(taxFile);
+
+ templateFile = tempFile;
+
+ int start = time(NULL);
+ int numSeqs = 0;
+
+ m->mothurOut("Generating search database... "); cout.flush();
+#ifdef USE_MPI
+ int pid, processors;
+ vector<unsigned long int> positions;
+ int tag = 2001;
+
+ MPI_Status status;
+ MPI_File inMPI;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+ //char* inFileName = new char[tempFile.length()];
+ //memcpy(inFileName, tempFile.c_str(), tempFile.length());
+
+ char inFileName[1024];
+ strcpy(inFileName, tempFile.c_str());
+
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
+ //delete inFileName;
+
+ if (pid == 0) { //only one process needs to scan file
+ positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
+
+ //send file positions to all processes
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+ MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+ }
+ }else{
+ MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+ positions.resize(numSeqs+1);
+ MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
+ }
+
+ //create database
+ if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
+ else if(method == "suffix") { database = new SuffixDB(numSeqs); }
+ else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
+ else if(method == "distance") { database = new DistanceDB(); }
+ else {
+ m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
+ database = new KmerDB(tempFile, 8);
+ }
+
+ //read file
+ for(int i=0;i<numSeqs;i++){
+ //read next sequence
+ int length = positions[i+1] - positions[i];
+ char* buf4 = new char[length];
+ MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+ string tempBuf = buf4;
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+ delete buf4;
+ istringstream iss (tempBuf,istringstream::in);
+
+ Sequence temp(iss);
+ if (temp.getName() != "") {
+ names.push_back(temp.getName());
+ database->addSequence(temp);
+ }
+ }
+
+ database->generateDB();
+ MPI_File_close(&inMPI);
+ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+ #else
+
+ //need to know number of template seqs for suffixdb
+ if (method == "suffix") {
+ ifstream inFASTA;
+ m->openInputFile(tempFile, inFASTA);
+ m->getNumSeqs(inFASTA, numSeqs);
+ inFASTA.close();
+ }
+
+ bool needToGenerate = true;
+ string kmerDBName;
+ if(method == "kmer") {
+ database = new KmerDB(tempFile, kmerSize);
+
+ kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+ ifstream kmerFileTest(kmerDBName.c_str());
+ if(kmerFileTest){
+ bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
+ if (GoodFile) { needToGenerate = false; }
+ }
+ }
+ else if(method == "suffix") { database = new SuffixDB(numSeqs); }
+ else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
+ else if(method == "distance") { database = new DistanceDB(); }
+ else {
+ m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+ m->mothurOutEndLine();
+ database = new KmerDB(tempFile, 8);
+ }
+
+ if (needToGenerate) {
+ ifstream fastaFile;
+ m->openInputFile(tempFile, fastaFile);
+
+ while (!fastaFile.eof()) {
+ Sequence temp(fastaFile);
+ m->gobble(fastaFile);
+
+ names.push_back(temp.getName());
+
+ database->addSequence(temp);
+ }
+ fastaFile.close();
+
+ database->generateDB();
+
+ }else if ((method == "kmer") && (!needToGenerate)) {
+ ifstream kmerFileTest(kmerDBName.c_str());
+ database->readKmerDB(kmerFileTest);
+
+ ifstream fastaFile;
+ m->openInputFile(tempFile, fastaFile);
+
+ while (!fastaFile.eof()) {
+ Sequence temp(fastaFile);
+ m->gobble(fastaFile);
+
+ names.push_back(temp.getName());
+ }
+ fastaFile.close();
+ }
+#endif
+
+ database->setNumSeqs(names.size());
+
+ //sanity check
+ bool okay = phyloTree->ErrorCheck(names);
+
+ if (!okay) { m->control_pressed = true; }
+
+ m->mothurOut("DONE."); m->mothurOutEndLine();
+ m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Classify", "generateDatabaseAndNames");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+Classify::Classify() { m = MothurOut::getInstance(); database = NULL; }
+/**************************************************************************************************/
+
+int Classify::readTaxonomy(string file) {
+ try {
+
+ phyloTree = new PhyloTree();
+ string name, taxInfo;
+
+ m->mothurOutEndLine();
+ m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
+
+#ifdef USE_MPI
+ int pid, num, processors;
+ vector<unsigned long int> positions;
+ int tag = 2001;
+
+ MPI_Status status;
+ MPI_File inMPI;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ MPI_Comm_size(MPI_COMM_WORLD, &processors);
+
+ //char* inFileName = new char[file.length()];
+ //memcpy(inFileName, file.c_str(), file.length());
+
+ char inFileName[1024];
+ strcpy(inFileName, file.c_str());
+
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
+ //delete inFileName;
+
+ if (pid == 0) {
+ positions = m->setFilePosEachLine(file, num);
+
+ //send file positions to all processes
+ for(int i = 1; i < processors; i++) {
+ MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+ MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+ }
+ }else{
+ MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+ positions.resize(num+1);
+ MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
+ }
+
+ //read file
+ for(int i=0;i<num;i++){
+ //read next sequence
+ int length = positions[i+1] - positions[i];
+ char* buf4 = new char[length];
+
+ MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+ string tempBuf = buf4;
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+ delete buf4;
+
+ istringstream iss (tempBuf,istringstream::in);
+ iss >> name >> taxInfo;
+ taxonomy[name] = taxInfo;
+ phyloTree->addSeqToTree(name, taxInfo);
+ }
+
+ MPI_File_close(&inMPI);
+ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
+#else
+ ifstream inTax;
+ m->openInputFile(file, inTax);
+
+ //read template seqs and save
+ while (!inTax.eof()) {
+ inTax >> name >> taxInfo;
+
+ taxonomy[name] = taxInfo;
+
+ phyloTree->addSeqToTree(name, taxInfo);
+
+ m->gobble(inTax);
+ }
+ inTax.close();
+#endif
+
+ phyloTree->assignHeirarchyIDs(0);
+
+ phyloTree->setUp(file);
+
+ m->mothurOut("DONE.");
+ m->mothurOutEndLine(); cout.flush();
+
+ return phyloTree->getNumSeqs();
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Classify", "readTaxonomy");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+vector<string> Classify::parseTax(string tax) {
+ try {
+ vector<string> taxons;
+
+ tax = tax.substr(0, tax.length()-1); //get rid of last ';'
+
+ //parse taxonomy
+ string individual;
+ while (tax.find_first_of(';') != -1) {
+ individual = tax.substr(0,tax.find_first_of(';'));
+ tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+ taxons.push_back(individual);
+
+ }
+ //get last one
+ taxons.push_back(tax);
+
+ return taxons;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Classify", "parseTax");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+