5 * Created by westcott on 11/3/09.
\r
6 * Copyright 2009 Schloss Lab. All rights reserved.
\r
10 #include "classify.h"
\r
11 #include "sequence.hpp"
\r
12 #include "kmerdb.hpp"
\r
13 #include "suffixdb.hpp"
\r
14 #include "blastdb.hpp"
\r
15 #include "distancedb.hpp"
\r
17 /**************************************************************************************************/
\r
18 Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {
\r
20 m = MothurOut::getInstance();
\r
21 readTaxonomy(taxFile);
\r
23 int start = time(NULL);
\r
26 m->mothurOut("Generating search database... "); cout.flush();
\r
29 vector<long> positions;
\r
33 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
\r
35 //char* inFileName = new char[tempFile.length()];
\r
36 //memcpy(inFileName, tempFile.c_str(), tempFile.length());
\r
38 char inFileName[1024];
\r
39 strcpy(inFileName, tempFile.c_str());
\r
41 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
\r
42 //delete inFileName;
\r
44 if (pid == 0) { //only one process needs to scan file
\r
45 positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
\r
47 //send file positions to all processes
\r
48 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
\r
49 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
\r
51 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
\r
52 positions.resize(numSeqs);
\r
53 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
\r
57 if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
\r
58 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
\r
59 else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
\r
60 else if(method == "distance") { database = new DistanceDB(); }
\r
62 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
\r
63 database = new KmerDB(tempFile, 8);
\r
67 for(int i=0;i<numSeqs;i++){
\r
68 //read next sequence
\r
69 int length = positions[i+1] - positions[i];
\r
70 char* buf4 = new char[length];
\r
71 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
\r
73 string tempBuf = buf4;
\r
74 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
\r
76 istringstream iss (tempBuf,istringstream::in);
\r
78 Sequence temp(iss);
\r
79 if (temp.getName() != "") {
\r
80 names.push_back(temp.getName());
\r
81 database->addSequence(temp);
\r
85 database->generateDB();
\r
86 MPI_File_close(&inMPI);
\r
89 //need to know number of template seqs for suffixdb
\r
90 if (method == "suffix") {
\r
92 openInputFile(tempFile, inFASTA);
\r
93 numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
\r
97 bool needToGenerate = true;
\r
99 if(method == "kmer") {
\r
100 database = new KmerDB(tempFile, kmerSize);
\r
102 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
\r
103 ifstream kmerFileTest(kmerDBName.c_str());
\r
104 if(kmerFileTest){ needToGenerate = false; }
\r
106 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
\r
107 else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
\r
108 else if(method == "distance") { database = new DistanceDB(); }
\r
110 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
\r
111 m->mothurOutEndLine();
\r
112 database = new KmerDB(tempFile, 8);
\r
115 if (needToGenerate) {
\r
116 ifstream fastaFile;
\r
117 openInputFile(tempFile, fastaFile);
\r
119 while (!fastaFile.eof()) {
\r
120 Sequence temp(fastaFile);
\r
123 names.push_back(temp.getName());
\r
125 database->addSequence(temp);
\r
129 database->generateDB();
\r
131 }else if ((method == "kmer") && (!needToGenerate)) {
\r
132 ifstream kmerFileTest(kmerDBName.c_str());
\r
133 database->readKmerDB(kmerFileTest);
\r
135 ifstream fastaFile;
\r
136 openInputFile(tempFile, fastaFile);
\r
138 while (!fastaFile.eof()) {
\r
139 Sequence temp(fastaFile);
\r
142 names.push_back(temp.getName());
\r
147 database->setNumSeqs(names.size());
\r
149 m->mothurOut("DONE."); m->mothurOutEndLine();
\r
150 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
\r
153 catch(exception& e) {
\r
154 m->errorOut(e, "Classify", "Classify");
\r
158 /**************************************************************************************************/
\r
160 void Classify::readTaxonomy(string file) {
\r
163 phyloTree = new PhyloTree();
\r
164 string name, taxInfo;
\r
166 m->mothurOutEndLine();
\r
167 m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
\r
171 vector<long> positions;
\r
173 MPI_Status status;
\r
175 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
\r
177 //char* inFileName = new char[file.length()];
\r
178 //memcpy(inFileName, file.c_str(), file.length());
\r
180 char inFileName[1024];
\r
181 strcpy(inFileName, file.c_str());
\r
183 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
\r
184 //delete inFileName;
\r
187 positions = setFilePosEachLine(file, num);
\r
189 //send file positions to all processes
\r
190 MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
\r
191 MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
\r
193 MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
\r
194 positions.resize(num);
\r
195 MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
\r
199 for(int i=0;i<num;i++){
\r
200 //read next sequence
\r
201 int length = positions[i+1] - positions[i];
\r
202 char* buf4 = new char[length];
\r
204 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
\r
206 string tempBuf = buf4;
\r
207 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
\r
210 istringstream iss (tempBuf,istringstream::in);
\r
211 iss >> name >> taxInfo;
\r
212 taxonomy[name] = taxInfo;
\r
213 phyloTree->addSeqToTree(name, taxInfo);
\r
216 MPI_File_close(&inMPI);
\r
219 openInputFile(file, inTax);
\r
221 //read template seqs and save
\r
222 while (!inTax.eof()) {
\r
223 inTax >> name >> taxInfo;
\r
225 taxonomy[name] = taxInfo;
\r
227 phyloTree->addSeqToTree(name, taxInfo);
\r
234 phyloTree->assignHeirarchyIDs(0);
\r
236 m->mothurOut("DONE.");
\r
237 m->mothurOutEndLine(); cout.flush();
\r
240 catch(exception& e) {
\r
241 m->errorOut(e, "Classify", "readTaxonomy");
\r
245 /**************************************************************************************************/
\r
247 vector<string> Classify::parseTax(string tax) {
\r
249 vector<string> taxons;
\r
251 tax = tax.substr(0, tax.length()-1); //get rid of last ';'
\r
255 while (tax.find_first_of(';') != -1) {
\r
256 individual = tax.substr(0,tax.find_first_of(';'));
\r
257 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
\r
258 taxons.push_back(individual);
\r
262 taxons.push_back(tax);
\r
266 catch(exception& e) {
\r
267 m->errorOut(e, "Classify", "parseTax");
\r
271 /**************************************************************************************************/
\r