5 * Created by westcott on 11/3/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "sequence.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
16 #include "referencedb.h"
18 /**************************************************************************************************/
19 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) {
21 ReferenceDB* rdb = ReferenceDB::getInstance();
23 if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); }
26 readTaxonomy(taxFile);
29 if (tempFile == "saved") {
30 int start = time(NULL);
31 m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine();
33 numSeqs = rdb->referenceSeqs.size();
34 templateFile = rdb->getSavedReference();
35 tempFile = rdb->getSavedReference();
37 bool needToGenerate = true;
39 if(method == "kmer") {
40 database = new KmerDB(tempFile, kmerSize);
42 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
43 ifstream kmerFileTest(kmerDBName.c_str());
45 bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
46 if (GoodFile) { needToGenerate = false; }
49 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
50 else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, ""); }
51 else if(method == "distance") { database = new DistanceDB(); }
53 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
54 m->mothurOutEndLine();
55 database = new KmerDB(tempFile, 8);
59 for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
60 Sequence temp(rdb->referenceSeqs[k].getName(), rdb->referenceSeqs[k].getAligned());
61 names.push_back(temp.getName());
62 database->addSequence(temp);
64 database->generateDB();
65 }else if ((method == "kmer") && (!needToGenerate)) {
66 ifstream kmerFileTest(kmerDBName.c_str());
67 database->readKmerDB(kmerFileTest);
69 for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
70 names.push_back(rdb->referenceSeqs[k].getName());
74 database->setNumSeqs(numSeqs);
77 bool okay = phyloTree->ErrorCheck(names);
79 if (!okay) { m->control_pressed = true; }
81 m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine();
85 templateFile = tempFile;
87 int start = time(NULL);
89 m->mothurOut("Generating search database... "); cout.flush();
92 vector<unsigned long int> positions;
97 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
98 MPI_Comm_size(MPI_COMM_WORLD, &processors);
100 //char* inFileName = new char[tempFile.length()];
101 //memcpy(inFileName, tempFile.c_str(), tempFile.length());
103 char inFileName[1024];
104 strcpy(inFileName, tempFile.c_str());
106 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
109 if (pid == 0) { //only one process needs to scan file
110 positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
112 //send file positions to all processes
113 for(int i = 1; i < processors; i++) {
114 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
115 MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
118 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
119 positions.resize(numSeqs+1);
120 MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
124 if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
125 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
126 else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch); }
127 else if(method == "distance") { database = new DistanceDB(); }
129 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
130 database = new KmerDB(tempFile, 8);
134 for(int i=0;i<numSeqs;i++){
136 int length = positions[i+1] - positions[i];
137 char* buf4 = new char[length];
138 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
140 string tempBuf = buf4;
141 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
143 istringstream iss (tempBuf,istringstream::in);
146 if (temp.getName() != "") {
147 if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
148 names.push_back(temp.getName());
149 database->addSequence(temp);
153 database->generateDB();
154 MPI_File_close(&inMPI);
155 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
158 //need to know number of template seqs for suffixdb
159 if (method == "suffix") {
161 m->openInputFile(tempFile, inFASTA);
162 m->getNumSeqs(inFASTA, numSeqs);
166 bool needToGenerate = true;
168 if(method == "kmer") {
169 database = new KmerDB(tempFile, kmerSize);
171 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
172 ifstream kmerFileTest(kmerDBName.c_str());
174 bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
175 if (GoodFile) { needToGenerate = false; }
178 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
179 else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, ""); }
180 else if(method == "distance") { database = new DistanceDB(); }
182 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
183 m->mothurOutEndLine();
184 database = new KmerDB(tempFile, 8);
187 if (needToGenerate) {
189 m->openInputFile(tempFile, fastaFile);
191 while (!fastaFile.eof()) {
192 Sequence temp(fastaFile);
193 m->gobble(fastaFile);
195 if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
197 names.push_back(temp.getName());
199 database->addSequence(temp);
203 database->generateDB();
205 }else if ((method == "kmer") && (!needToGenerate)) {
206 ifstream kmerFileTest(kmerDBName.c_str());
207 database->readKmerDB(kmerFileTest);
210 m->openInputFile(tempFile, fastaFile);
212 while (!fastaFile.eof()) {
213 Sequence temp(fastaFile);
214 m->gobble(fastaFile);
216 if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
217 names.push_back(temp.getName());
223 database->setNumSeqs(names.size());
226 bool okay = phyloTree->ErrorCheck(names);
228 if (!okay) { m->control_pressed = true; }
230 m->mothurOut("DONE."); m->mothurOutEndLine();
231 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
235 catch(exception& e) {
236 m->errorOut(e, "Classify", "generateDatabaseAndNames");
240 /**************************************************************************************************/
241 Classify::Classify() { m = MothurOut::getInstance(); database = NULL; }
242 /**************************************************************************************************/
244 int Classify::readTaxonomy(string file) {
247 phyloTree = new PhyloTree();
248 string name, taxInfo;
250 m->mothurOutEndLine();
251 m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
254 int pid, num, processors;
255 vector<unsigned long int> positions;
260 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
261 MPI_Comm_size(MPI_COMM_WORLD, &processors);
263 //char* inFileName = new char[file.length()];
264 //memcpy(inFileName, file.c_str(), file.length());
266 char inFileName[1024];
267 strcpy(inFileName, file.c_str());
269 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
273 positions = m->setFilePosEachLine(file, num);
275 //send file positions to all processes
276 for(int i = 1; i < processors; i++) {
277 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
278 MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
281 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
282 positions.resize(num+1);
283 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
287 for(int i=0;i<num;i++){
289 int length = positions[i+1] - positions[i];
290 char* buf4 = new char[length];
292 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
294 string tempBuf = buf4;
295 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
298 istringstream iss (tempBuf,istringstream::in);
299 iss >> name >> taxInfo;
300 taxonomy[name] = taxInfo;
301 phyloTree->addSeqToTree(name, taxInfo);
304 MPI_File_close(&inMPI);
305 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
308 m->openInputFile(file, inTax);
310 //read template seqs and save
311 while (!inTax.eof()) {
312 inTax >> name >> taxInfo;
314 taxonomy[name] = taxInfo;
316 phyloTree->addSeqToTree(name, taxInfo);
323 phyloTree->assignHeirarchyIDs(0);
325 phyloTree->setUp(file);
327 m->mothurOut("DONE.");
328 m->mothurOutEndLine(); cout.flush();
330 return phyloTree->getNumSeqs();
333 catch(exception& e) {
334 m->errorOut(e, "Classify", "readTaxonomy");
338 /**************************************************************************************************/
340 vector<string> Classify::parseTax(string tax) {
342 vector<string> taxons;
344 tax = tax.substr(0, tax.length()-1); //get rid of last ';'
348 while (tax.find_first_of(';') != -1) {
349 individual = tax.substr(0,tax.find_first_of(';'));
350 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
351 taxons.push_back(individual);
355 taxons.push_back(tax);
359 catch(exception& e) {
360 m->errorOut(e, "Classify", "parseTax");
364 /**************************************************************************************************/