5 * Created by westcott on 11/3/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "sequence.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
16 #include "referencedb.h"
18 /**************************************************************************************************/
19 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) {
21 ReferenceDB* rdb = ReferenceDB::getInstance();
23 if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); }
29 if (tempFile == "saved") {
30 int start = time(NULL);
31 m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine();
33 numSeqs = rdb->referenceSeqs.size();
34 templateFile = rdb->getSavedReference();
35 tempFile = rdb->getSavedReference();
37 bool needToGenerate = true;
39 if(method == "kmer") {
40 database = new KmerDB(tempFile, kmerSize);
42 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
43 ifstream kmerFileTest(kmerDBName.c_str());
45 bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
46 if (GoodFile) { needToGenerate = false; }
49 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
50 else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID); }
51 else if(method == "distance") { database = new DistanceDB(); }
53 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
54 m->mothurOutEndLine();
55 database = new KmerDB(tempFile, 8);
59 for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
60 Sequence temp(rdb->referenceSeqs[k].getName(), rdb->referenceSeqs[k].getAligned());
61 names.push_back(temp.getName());
62 database->addSequence(temp);
64 if ((method == "kmer") && (!shortcuts)) {;} //don't print
65 else {database->generateDB(); }
66 }else if ((method == "kmer") && (!needToGenerate)) {
67 ifstream kmerFileTest(kmerDBName.c_str());
68 database->readKmerDB(kmerFileTest);
70 for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
71 names.push_back(rdb->referenceSeqs[k].getName());
75 database->setNumSeqs(numSeqs);
77 m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine();
81 templateFile = tempFile;
83 int start = time(NULL);
85 m->mothurOut("Generating search database... "); cout.flush();
88 vector<unsigned long long> positions;
93 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
94 MPI_Comm_size(MPI_COMM_WORLD, &processors);
96 //char* inFileName = new char[tempFile.length()];
97 //memcpy(inFileName, tempFile.c_str(), tempFile.length());
99 char inFileName[1024];
100 strcpy(inFileName, tempFile.c_str());
102 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
105 if (pid == 0) { //only one process needs to scan file
106 positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
108 //send file positions to all processes
109 for(int i = 1; i < processors; i++) {
110 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
111 MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
114 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
115 positions.resize(numSeqs+1);
116 MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
120 if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
121 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
122 else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", pid); }
123 else if(method == "distance") { database = new DistanceDB(); }
125 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
126 database = new KmerDB(tempFile, 8);
130 for(int i=0;i<numSeqs;i++){
132 int length = positions[i+1] - positions[i];
133 char* buf4 = new char[length];
134 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
136 string tempBuf = buf4;
137 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
139 istringstream iss (tempBuf,istringstream::in);
142 if (temp.getName() != "") {
143 if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
144 names.push_back(temp.getName());
145 database->addSequence(temp);
149 database->generateDB();
150 MPI_File_close(&inMPI);
151 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
154 //need to know number of template seqs for suffixdb
155 if (method == "suffix") {
157 m->openInputFile(tempFile, inFASTA);
158 m->getNumSeqs(inFASTA, numSeqs);
162 bool needToGenerate = true;
164 if(method == "kmer") {
165 database = new KmerDB(tempFile, kmerSize);
167 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
168 ifstream kmerFileTest(kmerDBName.c_str());
170 bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
171 if (GoodFile) { needToGenerate = false; }
174 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
175 else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID); }
176 else if(method == "distance") { database = new DistanceDB(); }
178 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
179 m->mothurOutEndLine();
180 database = new KmerDB(tempFile, 8);
183 if (needToGenerate) {
185 m->openInputFile(tempFile, fastaFile);
187 while (!fastaFile.eof()) {
188 Sequence temp(fastaFile);
189 m->gobble(fastaFile);
191 if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
193 names.push_back(temp.getName());
195 database->addSequence(temp);
199 if ((method == "kmer") && (!shortcuts)) {;} //don't print
200 else {database->generateDB(); }
202 }else if ((method == "kmer") && (!needToGenerate)) {
203 ifstream kmerFileTest(kmerDBName.c_str());
204 database->readKmerDB(kmerFileTest);
207 m->openInputFile(tempFile, fastaFile);
209 while (!fastaFile.eof()) {
210 Sequence temp(fastaFile);
211 m->gobble(fastaFile);
213 if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
214 names.push_back(temp.getName());
220 database->setNumSeqs(names.size());
222 m->mothurOut("DONE."); m->mothurOutEndLine();
223 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
226 readTaxonomy(taxFile);
229 bool okay = phyloTree->ErrorCheck(names);
231 if (!okay) { m->control_pressed = true; }
233 catch(exception& e) {
234 m->errorOut(e, "Classify", "generateDatabaseAndNames");
238 /**************************************************************************************************/
239 Classify::Classify() { m = MothurOut::getInstance(); database = NULL; phyloTree=NULL; flipped=false; }
240 /**************************************************************************************************/
242 int Classify::readTaxonomy(string file) {
245 phyloTree = new PhyloTree();
246 string name, taxInfo;
248 m->mothurOutEndLine();
249 m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
250 if (m->debug) { m->mothurOut("[DEBUG]: Taxonomies read in...\n"); }
253 int pid, num, processors;
254 vector<unsigned long long> positions;
259 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
260 MPI_Comm_size(MPI_COMM_WORLD, &processors);
262 char inFileName[1024];
263 strcpy(inFileName, file.c_str());
265 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
269 positions = m->setFilePosEachLine(file, num);
271 //send file positions to all processes
272 for(int i = 1; i < processors; i++) {
273 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
274 MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
277 MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
278 positions.resize(num+1);
279 MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
283 for(int i=0;i<num;i++){
285 int length = positions[i+1] - positions[i];
286 char* buf4 = new char[length];
288 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
290 string tempBuf = buf4;
291 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
294 istringstream iss (tempBuf,istringstream::in);
295 iss >> name; m->gobble(iss);
297 if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); }
298 //commented out to save time with large templates. 10/7/13
299 //if (m->inUsersGroups(name, names)) {
300 taxonomy[name] = taxInfo;
301 phyloTree->addSeqToTree(name, taxInfo);
303 // m->mothurOut("[WARNING]: " + name + " is in your taxonomy file and not in your reference file, ignoring.\n");
307 MPI_File_close(&inMPI);
308 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
312 m->readTax(file, taxonomy);
314 //commented out to save time with large templates. 6/12/13
315 //map<string, string> tempTaxonomy;
316 for (map<string, string>::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) {
317 //if (m->inUsersGroups(itTax->first, names)) {
318 phyloTree->addSeqToTree(itTax->first, itTax->second);
319 if (m->control_pressed) { break; }
320 //tempTaxonomy[itTax->first] = itTax->second;
322 // m->mothurOut("[WARNING]: " + itTax->first + " is in your taxonomy file and not in your reference file, ignoring.\n");
325 //taxonomy = tempTaxonomy;
327 phyloTree->assignHeirarchyIDs(0);
329 phyloTree->setUp(file);
331 m->mothurOut("DONE.");
332 m->mothurOutEndLine(); cout.flush();
334 return phyloTree->getNumSeqs();
337 catch(exception& e) {
338 m->errorOut(e, "Classify", "readTaxonomy");
342 /**************************************************************************************************/
344 vector<string> Classify::parseTax(string tax) {
346 vector<string> taxons;
347 m->splitAtChar(tax, taxons, ';');
350 catch(exception& e) {
351 m->errorOut(e, "Classify", "parseTax");
355 /**************************************************************************************************/
357 double Classify::getLogExpSum(vector<double> probabilities, int& maxIndex){
359 // http://jblevins.org/notes/log-sum-exp
361 double maxProb = probabilities[0];
364 int numProbs = (int)probabilities.size();
366 for(int i=1;i<numProbs;i++){
367 if(probabilities[i] >= maxProb){
368 maxProb = probabilities[i];
373 double probSum = 0.0000;
375 for(int i=0;i<numProbs;i++){
376 probSum += exp(probabilities[i] - maxProb);
379 probSum = log(probSum) + maxProb;
383 catch(exception& e) {
384 m->errorOut(e, "Classify", "getLogExpSum");
389 /**************************************************************************************************/