]> git.donarmstrong.com Git - mothur.git/blob - classify.cpp
added zap method to classify.seqs and changed bayesian method name to wang.
[mothur.git] / classify.cpp
1 /*
2  *  classify.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/3/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classify.h"
11 #include "sequence.hpp"
12 #include "kmerdb.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
16 #include "referencedb.h"
17
18 /**************************************************************************************************/
19 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch)  {             
20         try {   
21                 ReferenceDB* rdb = ReferenceDB::getInstance();
22                 
23                 if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); }
24                 
25                 taxFile = tfile;
26                 readTaxonomy(taxFile);  
27                 int numSeqs = 0;
28                 
29                 if (tempFile == "saved") {
30                         int start = time(NULL);
31                         m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");        m->mothurOutEndLine();
32                         
33                         numSeqs = rdb->referenceSeqs.size();
34                         templateFile = rdb->getSavedReference();
35                         tempFile = rdb->getSavedReference();
36                         
37                         bool needToGenerate = true;
38                         string kmerDBName;
39                         if(method == "kmer")                    {       
40                                 database = new KmerDB(tempFile, kmerSize);                      
41                                 
42                                 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
43                                 ifstream kmerFileTest(kmerDBName.c_str());
44                                 if(kmerFileTest){       
45                                         bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
46                                         if (GoodFile) {  needToGenerate = false;        }
47                                 }
48                         }
49                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
50                         else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID);     }
51                         else if(method == "distance")   {       database = new DistanceDB();    }
52                         else {
53                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
54                                 m->mothurOutEndLine();
55                                 database = new KmerDB(tempFile, 8);
56                         }
57                         
58                         if (needToGenerate) {
59                                 for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
60                                         Sequence temp(rdb->referenceSeqs[k].getName(), rdb->referenceSeqs[k].getAligned());
61                                         names.push_back(temp.getName());
62                                         database->addSequence(temp);    
63                                 }
64                                 if ((method == "kmer") && (!shortcuts)) {;} //don't print
65                 else {database->generateDB(); }
66                         }else if ((method == "kmer") && (!needToGenerate)) {    
67                                 ifstream kmerFileTest(kmerDBName.c_str());
68                                 database->readKmerDB(kmerFileTest);     
69                                 
70                                 for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
71                                         names.push_back(rdb->referenceSeqs[k].getName());
72                                 }                       
73                         }       
74                         
75                         database->setNumSeqs(numSeqs);
76                         
77                         //sanity check
78                         bool okay = phyloTree->ErrorCheck(names);
79                         
80                         if (!okay) { m->control_pressed = true; }
81                         
82                         m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine();  
83                         
84                 }else {
85                         
86                         templateFile = tempFile;        
87                         
88                         int start = time(NULL);
89                         
90                         m->mothurOut("Generating search database...    "); cout.flush();
91         #ifdef USE_MPI  
92                                 int pid, processors;
93                                 vector<unsigned long long> positions;
94                                 int tag = 2001;
95                         
96                                 MPI_Status status; 
97                                 MPI_File inMPI;
98                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
99                                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
100
101                                 //char* inFileName = new char[tempFile.length()];
102                                 //memcpy(inFileName, tempFile.c_str(), tempFile.length());
103                                 
104                                 char inFileName[1024];
105                                 strcpy(inFileName, tempFile.c_str());
106
107                                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
108                                 //delete inFileName;
109
110                                 if (pid == 0) { //only one process needs to scan file
111                                         positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
112
113                                         //send file positions to all processes
114                                         for(int i = 1; i < processors; i++) { 
115                                                 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
116                                                 MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
117                                         }
118                                 }else{
119                                         MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
120                                         positions.resize(numSeqs+1);
121                                         MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
122                                 }
123                                 
124                                 //create database
125                                 if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
126                                 else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
127                                 else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", pid);  }
128                                 else if(method == "distance")   {       database = new DistanceDB();    }
129                                 else {
130                                         m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
131                                         database = new KmerDB(tempFile, 8);
132                                 }
133
134                                 //read file 
135                                 for(int i=0;i<numSeqs;i++){
136                                         //read next sequence
137                                         int length = positions[i+1] - positions[i];
138                                         char* buf4 = new char[length];
139                                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
140                                         
141                                         string tempBuf = buf4;
142                                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
143                                         delete buf4;
144                                         istringstream iss (tempBuf,istringstream::in);
145                                         
146                                         Sequence temp(iss);  
147                                         if (temp.getName() != "") {
148                                                 if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
149                                                 names.push_back(temp.getName());
150                                                 database->addSequence(temp);    
151                                         }
152                                 }
153                                 
154                                 database->generateDB();
155                                 MPI_File_close(&inMPI);
156                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
157                 #else
158                         
159                         //need to know number of template seqs for suffixdb
160                         if (method == "suffix") {
161                                 ifstream inFASTA;
162                                 m->openInputFile(tempFile, inFASTA);
163                                 m->getNumSeqs(inFASTA, numSeqs);
164                                 inFASTA.close();
165                         }
166
167                         bool needToGenerate = true;
168                         string kmerDBName;
169                         if(method == "kmer")                    {       
170                                 database = new KmerDB(tempFile, kmerSize);                      
171                                 
172                                 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
173                                 ifstream kmerFileTest(kmerDBName.c_str());
174                                 if(kmerFileTest){       
175                                         bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
176                                         if (GoodFile) {  needToGenerate = false;        }
177                                 }
178                         }
179                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
180                         else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID);     }
181                         else if(method == "distance")   {       database = new DistanceDB();    }
182                         else {
183                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
184                                 m->mothurOutEndLine();
185                                 database = new KmerDB(tempFile, 8);
186                         }
187                         
188                         if (needToGenerate) {
189                                 ifstream fastaFile;
190                                 m->openInputFile(tempFile, fastaFile);
191                                 
192                                 while (!fastaFile.eof()) {
193                                         Sequence temp(fastaFile);
194                                         m->gobble(fastaFile);
195                                         
196                                         if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
197                                         
198                                         names.push_back(temp.getName());
199                                                                 
200                                         database->addSequence(temp);    
201                                 }
202                                 fastaFile.close();
203
204                 if ((method == "kmer") && (!shortcuts)) {;} //don't print
205                 else {database->generateDB(); } 
206                                 
207                         }else if ((method == "kmer") && (!needToGenerate)) {    
208                                 ifstream kmerFileTest(kmerDBName.c_str());
209                                 database->readKmerDB(kmerFileTest);     
210                         
211                                 ifstream fastaFile;
212                                 m->openInputFile(tempFile, fastaFile);
213                                 
214                                 while (!fastaFile.eof()) {
215                                         Sequence temp(fastaFile);
216                                         m->gobble(fastaFile);
217                                         
218                                         if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
219                                         names.push_back(temp.getName());
220                                 }
221                                 fastaFile.close();
222                         }
223         #endif  
224                 
225                         database->setNumSeqs(names.size());
226                         
227                         //sanity check
228                         bool okay = phyloTree->ErrorCheck(names);
229                         
230                         if (!okay) { m->control_pressed = true; }
231                         
232                         m->mothurOut("DONE."); m->mothurOutEndLine();
233                         m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
234                 }
235
236         }
237         catch(exception& e) {
238                 m->errorOut(e, "Classify", "generateDatabaseAndNames");
239                 exit(1);
240         }
241 }
242 /**************************************************************************************************/
243 Classify::Classify() {          m = MothurOut::getInstance();   database = NULL;        flipped=false; }
244 /**************************************************************************************************/
245
246 int Classify::readTaxonomy(string file) {
247         try {
248                 
249                 phyloTree = new PhyloTree();
250                 string name, taxInfo;
251                 
252                 m->mothurOutEndLine();
253                 m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
254         if (m->debug) { m->mothurOut("[DEBUG]: Taxonomies read in...\n"); }
255         
256 #ifdef USE_MPI  
257                 int pid, num, processors;
258                 vector<unsigned long long> positions;
259                 int tag = 2001;
260                 
261                 MPI_Status status; 
262                 MPI_File inMPI;
263                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
264                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
265                 
266                 char inFileName[1024];
267                 strcpy(inFileName, file.c_str());
268
269                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
270                 //delete inFileName;
271
272                 if (pid == 0) {
273                         positions = m->setFilePosEachLine(file, num);
274                         
275                         //send file positions to all processes
276                         for(int i = 1; i < processors; i++) { 
277                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
278                                 MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
279                         }
280                 }else{
281                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
282                         positions.resize(num+1);
283                         MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
284                 }
285         
286                 //read file 
287                 for(int i=0;i<num;i++){
288                         //read next sequence
289                         int length = positions[i+1] - positions[i];
290                         char* buf4 = new char[length];
291
292                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
293
294                         string tempBuf = buf4;
295                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
296                         delete buf4;
297
298                         istringstream iss (tempBuf,istringstream::in);
299                         iss >> name; m->gobble(iss);
300             iss >> taxInfo;
301             if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); }
302                         taxonomy[name] = taxInfo;
303                         phyloTree->addSeqToTree(name, taxInfo);
304                 }
305                 
306                 MPI_File_close(&inMPI);
307                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
308 #else   
309         
310         taxonomy.clear(); 
311         m->readTax(file, taxonomy);
312         for (map<string, string>::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) {  phyloTree->addSeqToTree(itTax->first, itTax->second);  }
313 #endif  
314                 phyloTree->assignHeirarchyIDs(0);
315                 
316                 phyloTree->setUp(file);
317         
318                 m->mothurOut("DONE.");
319                 m->mothurOutEndLine();  cout.flush();
320                 
321                 return phyloTree->getNumSeqs();
322         
323         }
324         catch(exception& e) {
325                 m->errorOut(e, "Classify", "readTaxonomy");
326                 exit(1);
327         }
328 }
329 /**************************************************************************************************/
330
331 vector<string> Classify::parseTax(string tax) {
332         try {
333                 vector<string> taxons;
334                 
335                 tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
336         
337                 //parse taxonomy
338                 string individual;
339                 while (tax.find_first_of(';') != -1) {
340                         individual = tax.substr(0,tax.find_first_of(';'));
341                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
342                         taxons.push_back(individual);
343                         
344                 }
345                 //get last one
346                 taxons.push_back(tax);
347                 
348                 return taxons;
349         }
350         catch(exception& e) {
351                 m->errorOut(e, "Classify", "parseTax");
352                 exit(1);
353         }
354 }
355 /**************************************************************************************************/
356
357 double Classify::getLogExpSum(vector<double> probabilities, int& maxIndex){
358         try {
359         //      http://jblevins.org/notes/log-sum-exp
360         
361         double maxProb = probabilities[0];
362         maxIndex = 0;
363         
364         int numProbs = (int)probabilities.size();
365         
366         for(int i=1;i<numProbs;i++){
367             if(probabilities[i] >= maxProb){
368                 maxProb = probabilities[i];
369                 maxIndex = i;
370             }
371         }
372         
373         double probSum = 0.0000;
374         
375         for(int i=0;i<numProbs;i++){
376             probSum += exp(probabilities[i] - maxProb);         
377         }
378         
379         probSum = log(probSum) + maxProb;
380         
381         return probSum;
382         }
383         catch(exception& e) {
384                 m->errorOut(e, "Classify", "getLogExpSum");
385                 exit(1);
386         }
387 }
388
389 /**************************************************************************************************/
390