-/*
- * classify.cpp
- * Mothur
- *
- * Created by westcott on 11/3/09.
- * Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "classify.h"
-#include "sequence.hpp"
-#include "kmerdb.hpp"
-#include "suffixdb.hpp"
-#include "blastdb.hpp"
-#include "distancedb.hpp"
-
-/**************************************************************************************************/
-Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {
- try {
- m = MothurOut::getInstance();
- readTaxonomy(taxFile);
-
- int start = time(NULL);
- int numSeqs = 0;
-
- m->mothurOut("Generating search database... "); cout.flush();
-#ifdef USE_MPI
- int pid;
- vector<long> positions;
-
- MPI_Status status;
- MPI_File inMPI;
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-
- char inFileName[tempFile.length()];
- strcpy(inFileName, tempFile.c_str());
-
- MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
-
- if (pid == 0) { //only one process needs to scan file
- positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
-
- //send file positions to all processes
- MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
- MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
- }else{
- MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
- positions.resize(numSeqs);
- MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
- }
-
- //create database
- if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
- else if(method == "suffix") { database = new SuffixDB(numSeqs); }
- else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
- else if(method == "distance") { database = new DistanceDB(); }
- else {
- m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
- database = new KmerDB(tempFile, 8);
- }
-
- //read file
- for(int i=0;i<numSeqs;i++){
- //read next sequence
- int length = positions[i+1] - positions[i];
- char buf4[length];
- MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
-
- string tempBuf = buf4;
- if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-
- istringstream iss (tempBuf,istringstream::in);
-
- Sequence temp(iss);
- if (temp.getName() != "") {
- names.push_back(temp.getName());
- database->addSequence(temp);
- }
- }
-
- database->generateDB();
- MPI_File_close(&inMPI);
- #else
-
- //need to know number of template seqs for suffixdb
- if (method == "suffix") {
- ifstream inFASTA;
- openInputFile(tempFile, inFASTA);
- numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
- inFASTA.close();
- }
-
- bool needToGenerate = true;
- string kmerDBName;
- if(method == "kmer") {
- database = new KmerDB(tempFile, kmerSize);
-
- kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
- ifstream kmerFileTest(kmerDBName.c_str());
- if(kmerFileTest){ needToGenerate = false; }
- }
- else if(method == "suffix") { database = new SuffixDB(numSeqs); }
- else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
- else if(method == "distance") { database = new DistanceDB(); }
- else {
- m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
- m->mothurOutEndLine();
- database = new KmerDB(tempFile, 8);
- }
-
- if (needToGenerate) {
- ifstream fastaFile;
- openInputFile(tempFile, fastaFile);
-
- while (!fastaFile.eof()) {
- Sequence temp(fastaFile);
- gobble(fastaFile);
-
- names.push_back(temp.getName());
-
- database->addSequence(temp);
- }
- fastaFile.close();
-
- database->generateDB();
-
- }else if ((method == "kmer") && (!needToGenerate)) {
- ifstream kmerFileTest(kmerDBName.c_str());
- database->readKmerDB(kmerFileTest);
-
- ifstream fastaFile;
- openInputFile(tempFile, fastaFile);
-
- while (!fastaFile.eof()) {
- Sequence temp(fastaFile);
- gobble(fastaFile);
-
- names.push_back(temp.getName());
- }
- fastaFile.close();
- }
-#endif
- database->setNumSeqs(names.size());
-
- m->mothurOut("DONE."); m->mothurOutEndLine();
- m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
-
- }
- catch(exception& e) {
- m->errorOut(e, "Classify", "Classify");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
-void Classify::readTaxonomy(string file) {
- try {
-
- phyloTree = new PhyloTree();
- string name, taxInfo;
-
- m->mothurOutEndLine();
- m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
-
-#ifdef USE_MPI
- int pid, num;
- vector<long> positions;
-
- MPI_Status status;
- MPI_File inMPI;
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
-
- char inFileName[file.length()];
- strcpy(inFileName, file.c_str());
-
- MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
-
- if (pid == 0) {
- positions = setFilePosEachLine(file, num);
-
- //send file positions to all processes
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
- }else{
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
- positions.resize(num);
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
- }
-
- //read file
- for(int i=0;i<num;i++){
- //read next sequence
- int length = positions[i+1] - positions[i];
- char buf4[length];
-
- MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
-
- string tempBuf = buf4;
- if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
-
- istringstream iss (tempBuf,istringstream::in);
- iss >> name >> taxInfo;
- taxonomy[name] = taxInfo;
- phyloTree->addSeqToTree(name, taxInfo);
- }
-
- MPI_File_close(&inMPI);
-#else
- ifstream inTax;
- openInputFile(file, inTax);
-
- //read template seqs and save
- while (!inTax.eof()) {
- inTax >> name >> taxInfo;
-
- taxonomy[name] = taxInfo;
-
- phyloTree->addSeqToTree(name, taxInfo);
-
- gobble(inTax);
- }
- inTax.close();
-#endif
-
- phyloTree->assignHeirarchyIDs(0);
-
- m->mothurOut("DONE.");
- m->mothurOutEndLine(); cout.flush();
-
- }
- catch(exception& e) {
- m->errorOut(e, "Classify", "readTaxonomy");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
-vector<string> Classify::parseTax(string tax) {
- try {
- vector<string> taxons;
-
- tax = tax.substr(0, tax.length()-1); //get rid of last ';'
-
- //parse taxonomy
- string individual;
- while (tax.find_first_of(';') != -1) {
- individual = tax.substr(0,tax.find_first_of(';'));
- tax = tax.substr(tax.find_first_of(';')+1, tax.length());
- taxons.push_back(individual);
-
- }
- //get last one
- taxons.push_back(tax);
-
- return taxons;
- }
- catch(exception& e) {
- m->errorOut(e, "Classify", "parseTax");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
+/*\r
+ * classify.cpp\r
+ * Mothur\r
+ *\r
+ * Created by westcott on 11/3/09.\r
+ * Copyright 2009 Schloss Lab. All rights reserved.\r
+ *\r
+ */\r
+\r
+#include "classify.h"\r
+#include "sequence.hpp"\r
+#include "kmerdb.hpp"\r
+#include "suffixdb.hpp"\r
+#include "blastdb.hpp"\r
+#include "distancedb.hpp"\r
+\r
+/**************************************************************************************************/\r
+Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) { \r
+ try { \r
+ m = MothurOut::getInstance(); \r
+ readTaxonomy(taxFile); \r
+ \r
+ int start = time(NULL);\r
+ int numSeqs = 0;\r
+ \r
+ m->mothurOut("Generating search database... "); cout.flush();\r
+#ifdef USE_MPI \r
+ int pid;\r
+ vector<long> positions;\r
+ \r
+ MPI_Status status; \r
+ MPI_File inMPI;\r
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
+\r
+ char* inFileName = new char[tempFile.length()];\r
+ memcpy(inFileName, tempFile.c_str(), tempFile.length());\r
+ \r
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer\r
+ delete inFileName;\r
+\r
+ if (pid == 0) { //only one process needs to scan file\r
+ positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs\r
+\r
+ //send file positions to all processes\r
+ MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs\r
+ MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
+ }else{\r
+ MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
+ positions.resize(numSeqs);\r
+ MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
+ }\r
+ \r
+ //create database\r
+ if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }\r
+ else if(method == "suffix") { database = new SuffixDB(numSeqs); }\r
+ else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }\r
+ else if(method == "distance") { database = new DistanceDB(); }\r
+ else {\r
+ m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();\r
+ database = new KmerDB(tempFile, 8);\r
+ }\r
+\r
+ //read file \r
+ for(int i=0;i<numSeqs;i++){\r
+ //read next sequence\r
+ int length = positions[i+1] - positions[i];\r
+ char* buf4 = new char[length];\r
+ MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
+ \r
+ string tempBuf = buf4;\r
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
+ delete buf4;\r
+ istringstream iss (tempBuf,istringstream::in);\r
+ \r
+ Sequence temp(iss); \r
+ if (temp.getName() != "") {\r
+ names.push_back(temp.getName());\r
+ database->addSequence(temp); \r
+ }\r
+ }\r
+ \r
+ database->generateDB();\r
+ MPI_File_close(&inMPI);\r
+ #else\r
+ \r
+ //need to know number of template seqs for suffixdb\r
+ if (method == "suffix") {\r
+ ifstream inFASTA;\r
+ openInputFile(tempFile, inFASTA);\r
+ numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');\r
+ inFASTA.close();\r
+ }\r
+\r
+ bool needToGenerate = true;\r
+ string kmerDBName;\r
+ if(method == "kmer") { \r
+ database = new KmerDB(tempFile, kmerSize); \r
+ \r
+ kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";\r
+ ifstream kmerFileTest(kmerDBName.c_str());\r
+ if(kmerFileTest){ needToGenerate = false; }\r
+ }\r
+ else if(method == "suffix") { database = new SuffixDB(numSeqs); }\r
+ else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }\r
+ else if(method == "distance") { database = new DistanceDB(); }\r
+ else {\r
+ m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");\r
+ m->mothurOutEndLine();\r
+ database = new KmerDB(tempFile, 8);\r
+ }\r
+ \r
+ if (needToGenerate) {\r
+ ifstream fastaFile;\r
+ openInputFile(tempFile, fastaFile);\r
+ \r
+ while (!fastaFile.eof()) {\r
+ Sequence temp(fastaFile);\r
+ gobble(fastaFile);\r
+ \r
+ names.push_back(temp.getName());\r
+ \r
+ database->addSequence(temp); \r
+ }\r
+ fastaFile.close();\r
+\r
+ database->generateDB();\r
+ \r
+ }else if ((method == "kmer") && (!needToGenerate)) { \r
+ ifstream kmerFileTest(kmerDBName.c_str());\r
+ database->readKmerDB(kmerFileTest); \r
+ \r
+ ifstream fastaFile;\r
+ openInputFile(tempFile, fastaFile);\r
+ \r
+ while (!fastaFile.eof()) {\r
+ Sequence temp(fastaFile);\r
+ gobble(fastaFile);\r
+ \r
+ names.push_back(temp.getName());\r
+ }\r
+ fastaFile.close();\r
+ }\r
+#endif \r
+ database->setNumSeqs(names.size());\r
+ \r
+ m->mothurOut("DONE."); m->mothurOutEndLine();\r
+ m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();\r
+\r
+ }\r
+ catch(exception& e) {\r
+ m->errorOut(e, "Classify", "Classify");\r
+ exit(1);\r
+ }\r
+}\r
+/**************************************************************************************************/\r
+\r
+void Classify::readTaxonomy(string file) {\r
+ try {\r
+ \r
+ phyloTree = new PhyloTree();\r
+ string name, taxInfo;\r
+ \r
+ m->mothurOutEndLine();\r
+ m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();\r
+\r
+#ifdef USE_MPI \r
+ int pid, num;\r
+ vector<long> positions;\r
+ \r
+ MPI_Status status; \r
+ MPI_File inMPI;\r
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
+\r
+ char* inFileName = new char[file.length()];\r
+ memcpy(inFileName, file.c_str(), file.length());\r
+ \r
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer\r
+ delete inFileName;\r
+\r
+ if (pid == 0) {\r
+ positions = setFilePosEachLine(file, num);\r
+ \r
+ //send file positions to all processes\r
+ MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs\r
+ MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
+ }else{\r
+ MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
+ positions.resize(num);\r
+ MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
+ }\r
+ \r
+ //read file \r
+ for(int i=0;i<num;i++){\r
+ //read next sequence\r
+ int length = positions[i+1] - positions[i];\r
+ char* buf4 = new char[length];\r
+\r
+ MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
+\r
+ string tempBuf = buf4;\r
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
+ delete buf4;\r
+\r
+ istringstream iss (tempBuf,istringstream::in);\r
+ iss >> name >> taxInfo;\r
+ taxonomy[name] = taxInfo;\r
+ phyloTree->addSeqToTree(name, taxInfo);\r
+ }\r
+ \r
+ MPI_File_close(&inMPI);\r
+#else \r
+ ifstream inTax;\r
+ openInputFile(file, inTax);\r
+ \r
+ //read template seqs and save\r
+ while (!inTax.eof()) {\r
+ inTax >> name >> taxInfo;\r
+ \r
+ taxonomy[name] = taxInfo;\r
+ \r
+ phyloTree->addSeqToTree(name, taxInfo);\r
+ \r
+ gobble(inTax);\r
+ }\r
+ inTax.close();\r
+#endif \r
+ \r
+ phyloTree->assignHeirarchyIDs(0);\r
+ \r
+ m->mothurOut("DONE.");\r
+ m->mothurOutEndLine(); cout.flush();\r
+ \r
+ }\r
+ catch(exception& e) {\r
+ m->errorOut(e, "Classify", "readTaxonomy");\r
+ exit(1);\r
+ }\r
+}\r
+/**************************************************************************************************/\r
+\r
+vector<string> Classify::parseTax(string tax) {\r
+ try {\r
+ vector<string> taxons;\r
+ \r
+ tax = tax.substr(0, tax.length()-1); //get rid of last ';'\r
+ \r
+ //parse taxonomy\r
+ string individual;\r
+ while (tax.find_first_of(';') != -1) {\r
+ individual = tax.substr(0,tax.find_first_of(';'));\r
+ tax = tax.substr(tax.find_first_of(';')+1, tax.length());\r
+ taxons.push_back(individual);\r
+ \r
+ }\r
+ //get last one\r
+ taxons.push_back(tax);\r
+ \r
+ return taxons;\r
+ }\r
+ catch(exception& e) {\r
+ m->errorOut(e, "Classify", "parseTax");\r
+ exit(1);\r
+ }\r
+}\r
+/**************************************************************************************************/\r
+\r