]> git.donarmstrong.com Git - mothur.git/blob - classify.cpp
changing command name classify.shared to classifyrf.shared
[mothur.git] / classify.cpp
1 /*
2  *  classify.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/3/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classify.h"
11 #include "sequence.hpp"
12 #include "kmerdb.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
16 #include "referencedb.h"
17
18 /**************************************************************************************************/
19 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch)  {             
20         try {   
21                 ReferenceDB* rdb = ReferenceDB::getInstance();
22                 
23                 if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); }
24                 
25                 taxFile = tfile;
26                 
27                 int numSeqs = 0;
28                 
29                 if (tempFile == "saved") {
30                         int start = time(NULL);
31                         m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");        m->mothurOutEndLine();
32                         
33                         numSeqs = rdb->referenceSeqs.size();
34                         templateFile = rdb->getSavedReference();
35                         tempFile = rdb->getSavedReference();
36                         
37                         bool needToGenerate = true;
38                         string kmerDBName;
39                         if(method == "kmer")                    {       
40                                 database = new KmerDB(tempFile, kmerSize);                      
41                                 
42                                 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
43                                 ifstream kmerFileTest(kmerDBName.c_str());
44                                 if(kmerFileTest){       
45                                         bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
46                                         if (GoodFile) {  needToGenerate = false;        }
47                                 }
48                         }
49                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
50                         else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID);     }
51                         else if(method == "distance")   {       database = new DistanceDB();    }
52                         else {
53                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
54                                 m->mothurOutEndLine();
55                                 database = new KmerDB(tempFile, 8);
56                         }
57                         
58                         if (needToGenerate) {
59                                 for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
60                                         Sequence temp(rdb->referenceSeqs[k].getName(), rdb->referenceSeqs[k].getAligned());
61                                         names.push_back(temp.getName());
62                                         database->addSequence(temp);    
63                                 }
64                                 if ((method == "kmer") && (!shortcuts)) {;} //don't print
65                 else {database->generateDB(); }
66                         }else if ((method == "kmer") && (!needToGenerate)) {    
67                                 ifstream kmerFileTest(kmerDBName.c_str());
68                                 database->readKmerDB(kmerFileTest);     
69                                 
70                                 for (int k = 0; k < rdb->referenceSeqs.size(); k++) {
71                                         names.push_back(rdb->referenceSeqs[k].getName());
72                                 }                       
73                         }       
74                         
75                         database->setNumSeqs(numSeqs);
76                         
77                         m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine();  
78                         
79                 }else {
80                         
81                         templateFile = tempFile;        
82                         
83                         int start = time(NULL);
84                         
85                         m->mothurOut("Generating search database...    "); cout.flush();
86         #ifdef USE_MPI  
87                                 int pid, processors;
88                                 vector<unsigned long long> positions;
89                                 int tag = 2001;
90                         
91                                 MPI_Status status; 
92                                 MPI_File inMPI;
93                                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
94                                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
95
96                                 //char* inFileName = new char[tempFile.length()];
97                                 //memcpy(inFileName, tempFile.c_str(), tempFile.length());
98                                 
99                                 char inFileName[1024];
100                                 strcpy(inFileName, tempFile.c_str());
101
102                                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
103                                 //delete inFileName;
104
105                                 if (pid == 0) { //only one process needs to scan file
106                                         positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
107
108                                         //send file positions to all processes
109                                         for(int i = 1; i < processors; i++) { 
110                                                 MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
111                                                 MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
112                                         }
113                                 }else{
114                                         MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
115                                         positions.resize(numSeqs+1);
116                                         MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
117                                 }
118                                 
119                                 //create database
120                                 if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
121                                 else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
122                                 else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", pid);  }
123                                 else if(method == "distance")   {       database = new DistanceDB();    }
124                                 else {
125                                         m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
126                                         database = new KmerDB(tempFile, 8);
127                                 }
128
129                                 //read file 
130                                 for(int i=0;i<numSeqs;i++){
131                                         //read next sequence
132                                         int length = positions[i+1] - positions[i];
133                                         char* buf4 = new char[length];
134                                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
135                                         
136                                         string tempBuf = buf4;
137                                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
138                                         delete buf4;
139                                         istringstream iss (tempBuf,istringstream::in);
140                                         
141                                         Sequence temp(iss);  
142                                         if (temp.getName() != "") {
143                                                 if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
144                                                 names.push_back(temp.getName());
145                                                 database->addSequence(temp);    
146                                         }
147                                 }
148                                 
149                                 database->generateDB();
150                                 MPI_File_close(&inMPI);
151                                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
152                 #else
153                         
154                         //need to know number of template seqs for suffixdb
155                         if (method == "suffix") {
156                                 ifstream inFASTA;
157                                 m->openInputFile(tempFile, inFASTA);
158                                 m->getNumSeqs(inFASTA, numSeqs);
159                                 inFASTA.close();
160                         }
161
162                         bool needToGenerate = true;
163                         string kmerDBName;
164                         if(method == "kmer")                    {       
165                                 database = new KmerDB(tempFile, kmerSize);                      
166                                 
167                                 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
168                                 ifstream kmerFileTest(kmerDBName.c_str());
169                                 if(kmerFileTest){       
170                                         bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
171                                         if (GoodFile) {  needToGenerate = false;        }
172                                 }
173                         }
174                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
175                         else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID);     }
176                         else if(method == "distance")   {       database = new DistanceDB();    }
177                         else {
178                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
179                                 m->mothurOutEndLine();
180                                 database = new KmerDB(tempFile, 8);
181                         }
182                         
183                         if (needToGenerate) {
184                                 ifstream fastaFile;
185                                 m->openInputFile(tempFile, fastaFile);
186                                 
187                                 while (!fastaFile.eof()) {
188                                         Sequence temp(fastaFile);
189                                         m->gobble(fastaFile);
190                                         
191                                         if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
192                                         
193                                         names.push_back(temp.getName());
194                                                                 
195                                         database->addSequence(temp);    
196                                 }
197                                 fastaFile.close();
198
199                 if ((method == "kmer") && (!shortcuts)) {;} //don't print
200                 else {database->generateDB(); } 
201                                 
202                         }else if ((method == "kmer") && (!needToGenerate)) {    
203                                 ifstream kmerFileTest(kmerDBName.c_str());
204                                 database->readKmerDB(kmerFileTest);     
205                         
206                                 ifstream fastaFile;
207                                 m->openInputFile(tempFile, fastaFile);
208                                 
209                                 while (!fastaFile.eof()) {
210                                         Sequence temp(fastaFile);
211                                         m->gobble(fastaFile);
212                                         
213                                         if (rdb->save) { rdb->referenceSeqs.push_back(temp); }
214                                         names.push_back(temp.getName());
215                                 }
216                                 fastaFile.close();
217                         }
218         #endif  
219                 
220                         database->setNumSeqs(names.size());
221                         
222                         m->mothurOut("DONE."); m->mothurOutEndLine();
223                         m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
224                 }
225         
226         readTaxonomy(taxFile);
227         
228         //sanity check
229         bool okay = phyloTree->ErrorCheck(names);
230         
231         if (!okay) { m->control_pressed = true; }
232         }
233         catch(exception& e) {
234                 m->errorOut(e, "Classify", "generateDatabaseAndNames");
235                 exit(1);
236         }
237 }
238 /**************************************************************************************************/
239 Classify::Classify() {          m = MothurOut::getInstance();   database = NULL;        phyloTree=NULL; flipped=false; }
240 /**************************************************************************************************/
241
242 int Classify::readTaxonomy(string file) {
243         try {
244                 
245                 phyloTree = new PhyloTree();
246                 string name, taxInfo;
247                 
248                 m->mothurOutEndLine();
249                 m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
250         if (m->debug) { m->mothurOut("[DEBUG]: Taxonomies read in...\n"); }
251         
252 #ifdef USE_MPI  
253                 int pid, num, processors;
254                 vector<unsigned long long> positions;
255                 int tag = 2001;
256                 
257                 MPI_Status status; 
258                 MPI_File inMPI;
259                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
260                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
261                 
262                 char inFileName[1024];
263                 strcpy(inFileName, file.c_str());
264
265                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
266                 //delete inFileName;
267
268                 if (pid == 0) {
269                         positions = m->setFilePosEachLine(file, num);
270                         
271                         //send file positions to all processes
272                         for(int i = 1; i < processors; i++) { 
273                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
274                                 MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
275                         }
276                 }else{
277                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
278                         positions.resize(num+1);
279                         MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
280                 }
281         
282                 //read file 
283                 for(int i=0;i<num;i++){
284                         //read next sequence
285                         int length = positions[i+1] - positions[i];
286                         char* buf4 = new char[length];
287
288                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
289
290                         string tempBuf = buf4;
291                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
292                         delete buf4;
293
294                         istringstream iss (tempBuf,istringstream::in);
295                         iss >> name; m->gobble(iss);
296             iss >> taxInfo;
297             if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); }
298                         if (m->inUsersGroups(name, names)) {
299                 taxonomy[name] = taxInfo;
300                 phyloTree->addSeqToTree(name, taxInfo);
301             }else {
302                 m->mothurOut("[WARNING]: " + name + " is in your taxonomy file and not in your reference file, ignoring.\n");
303             }           
304         }
305                 
306                 MPI_File_close(&inMPI);
307                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
308 #else   
309         
310         taxonomy.clear(); 
311         m->readTax(file, taxonomy);
312         
313         //commented out to save time with large templates. 6/12/13
314         //map<string, string> tempTaxonomy;
315         for (map<string, string>::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) {  
316             //if (m->inUsersGroups(itTax->first, names)) {
317                 phyloTree->addSeqToTree(itTax->first, itTax->second);
318             if (m->control_pressed) { break; }
319                 //tempTaxonomy[itTax->first] = itTax->second;
320            // }else {
321             //    m->mothurOut("[WARNING]: " + itTax->first + " is in your taxonomy file and not in your reference file, ignoring.\n");
322             //}
323         }
324         //taxonomy = tempTaxonomy;
325 #endif  
326                 phyloTree->assignHeirarchyIDs(0);
327                 
328                 phyloTree->setUp(file);
329         
330                 m->mothurOut("DONE.");
331                 m->mothurOutEndLine();  cout.flush();
332                 
333                 return phyloTree->getNumSeqs();
334         
335         }
336         catch(exception& e) {
337                 m->errorOut(e, "Classify", "readTaxonomy");
338                 exit(1);
339         }
340 }
341 /**************************************************************************************************/
342
343 vector<string> Classify::parseTax(string tax) {
344         try {
345                 vector<string> taxons;
346                 m->splitAtChar(tax, taxons, ';');
347         return taxons;
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "Classify", "parseTax");
351                 exit(1);
352         }
353 }
354 /**************************************************************************************************/
355
356 double Classify::getLogExpSum(vector<double> probabilities, int& maxIndex){
357         try {
358         //      http://jblevins.org/notes/log-sum-exp
359         
360         double maxProb = probabilities[0];
361         maxIndex = 0;
362         
363         int numProbs = (int)probabilities.size();
364         
365         for(int i=1;i<numProbs;i++){
366             if(probabilities[i] >= maxProb){
367                 maxProb = probabilities[i];
368                 maxIndex = i;
369             }
370         }
371         
372         double probSum = 0.0000;
373         
374         for(int i=0;i<numProbs;i++){
375             probSum += exp(probabilities[i] - maxProb);         
376         }
377         
378         probSum = log(probSum) + maxProb;
379         
380         return probSum;
381         }
382         catch(exception& e) {
383                 m->errorOut(e, "Classify", "getLogExpSum");
384                 exit(1);
385         }
386 }
387
388 /**************************************************************************************************/
389