5 * Created by westcott on 11/3/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "sequence.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
17 /**************************************************************************************************/
18 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) {
21 readTaxonomy(taxFile);
23 templateFile = tempFile;
25 int start = time(NULL);
28 m->mothurOut("Generating search database... "); cout.flush();
31 vector<unsigned long int> positions;
36 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
37 MPI_Comm_size(MPI_COMM_WORLD, &processors);
39 //char* inFileName = new char[tempFile.length()];
40 //memcpy(inFileName, tempFile.c_str(), tempFile.length());
42 char inFileName[1024];
43 strcpy(inFileName, tempFile.c_str());
45 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
48 if (pid == 0) { //only one process needs to scan file
49 positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
51 //send file positions to all processes
52 for(int i = 1; i < processors; i++) {
53 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
54 MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
57 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
58 positions.resize(numSeqs+1);
59 MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
63 if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
64 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
65 else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
66 else if(method == "distance") { database = new DistanceDB(); }
68 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
69 database = new KmerDB(tempFile, 8);
73 for(int i=0;i<numSeqs;i++){
75 int length = positions[i+1] - positions[i];
76 char* buf4 = new char[length];
77 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
79 string tempBuf = buf4;
80 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
82 istringstream iss (tempBuf,istringstream::in);
85 if (temp.getName() != "") {
86 names.push_back(temp.getName());
87 database->addSequence(temp);
91 database->generateDB();
92 MPI_File_close(&inMPI);
93 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
96 //need to know number of template seqs for suffixdb
97 if (method == "suffix") {
99 m->openInputFile(tempFile, inFASTA);
100 m->getNumSeqs(inFASTA, numSeqs);
104 bool needToGenerate = true;
106 if(method == "kmer") {
107 database = new KmerDB(tempFile, kmerSize);
109 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
110 ifstream kmerFileTest(kmerDBName.c_str());
112 bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
113 if (GoodFile) { needToGenerate = false; }
116 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
117 else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
118 else if(method == "distance") { database = new DistanceDB(); }
120 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
121 m->mothurOutEndLine();
122 database = new KmerDB(tempFile, 8);
125 if (needToGenerate) {
127 m->openInputFile(tempFile, fastaFile);
129 while (!fastaFile.eof()) {
130 Sequence temp(fastaFile);
131 m->gobble(fastaFile);
133 names.push_back(temp.getName());
135 database->addSequence(temp);
139 database->generateDB();
141 }else if ((method == "kmer") && (!needToGenerate)) {
142 ifstream kmerFileTest(kmerDBName.c_str());
143 database->readKmerDB(kmerFileTest);
146 m->openInputFile(tempFile, fastaFile);
148 while (!fastaFile.eof()) {
149 Sequence temp(fastaFile);
150 m->gobble(fastaFile);
152 names.push_back(temp.getName());
158 database->setNumSeqs(names.size());
161 bool okay = phyloTree->ErrorCheck(names);
163 if (!okay) { m->control_pressed = true; }
165 m->mothurOut("DONE."); m->mothurOutEndLine();
166 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
169 catch(exception& e) {
170 m->errorOut(e, "Classify", "generateDatabaseAndNames");
174 /**************************************************************************************************/
175 Classify::Classify() { m = MothurOut::getInstance(); database = NULL; }
176 /**************************************************************************************************/
178 int Classify::readTaxonomy(string file) {
181 phyloTree = new PhyloTree();
182 string name, taxInfo;
184 m->mothurOutEndLine();
185 m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
188 int pid, num, processors;
189 vector<unsigned long int> positions;
194 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
195 MPI_Comm_size(MPI_COMM_WORLD, &processors);
197 //char* inFileName = new char[file.length()];
198 //memcpy(inFileName, file.c_str(), file.length());
200 char inFileName[1024];
201 strcpy(inFileName, file.c_str());
203 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
207 positions = m->setFilePosEachLine(file, num);
209 //send file positions to all processes
210 for(int i = 1; i < processors; i++) {
211 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
212 MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
215 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
216 positions.resize(num+1);
217 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
221 for(int i=0;i<num;i++){
223 int length = positions[i+1] - positions[i];
224 char* buf4 = new char[length];
226 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
228 string tempBuf = buf4;
229 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
232 istringstream iss (tempBuf,istringstream::in);
233 iss >> name >> taxInfo;
234 taxonomy[name] = taxInfo;
235 phyloTree->addSeqToTree(name, taxInfo);
238 MPI_File_close(&inMPI);
239 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
242 m->openInputFile(file, inTax);
244 //read template seqs and save
245 while (!inTax.eof()) {
246 inTax >> name >> taxInfo;
248 taxonomy[name] = taxInfo;
250 phyloTree->addSeqToTree(name, taxInfo);
257 phyloTree->assignHeirarchyIDs(0);
259 phyloTree->setUp(file);
261 m->mothurOut("DONE.");
262 m->mothurOutEndLine(); cout.flush();
264 return phyloTree->getNumSeqs();
267 catch(exception& e) {
268 m->errorOut(e, "Classify", "readTaxonomy");
272 /**************************************************************************************************/
274 vector<string> Classify::parseTax(string tax) {
276 vector<string> taxons;
278 tax = tax.substr(0, tax.length()-1); //get rid of last ';'
282 while (tax.find_first_of(';') != -1) {
283 individual = tax.substr(0,tax.find_first_of(';'));
284 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
285 taxons.push_back(individual);
289 taxons.push_back(tax);
293 catch(exception& e) {
294 m->errorOut(e, "Classify", "parseTax");
298 /**************************************************************************************************/