]> git.donarmstrong.com Git - mothur.git/blob - classify.cpp
changed tags on temp blast db to include template name
[mothur.git] / classify.cpp
1 /*
2  *  classify.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/3/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classify.h"
11 #include "sequence.hpp"
12 #include "kmerdb.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
16
17 /**************************************************************************************************/
18 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch)  {             
19         try {   
20                 taxFile = tfile;
21                 readTaxonomy(taxFile);  
22                 
23                 templateFile = tempFile;        
24                 
25                 int start = time(NULL);
26                 int numSeqs = 0;
27                 
28                 m->mothurOut("Generating search database...    "); cout.flush();
29 #ifdef USE_MPI  
30                         int pid, processors;
31                         vector<unsigned long int> positions;
32                         int tag = 2001;
33                 
34                         MPI_Status status; 
35                         MPI_File inMPI;
36                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
37                         MPI_Comm_size(MPI_COMM_WORLD, &processors);
38
39                         //char* inFileName = new char[tempFile.length()];
40                         //memcpy(inFileName, tempFile.c_str(), tempFile.length());
41                         
42                         char inFileName[1024];
43                         strcpy(inFileName, tempFile.c_str());
44
45                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
46                         //delete inFileName;
47
48                         if (pid == 0) { //only one process needs to scan file
49                                 positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
50
51                                 //send file positions to all processes
52                                 for(int i = 1; i < processors; i++) { 
53                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
54                                         MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
55                                 }
56                         }else{
57                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
58                                 positions.resize(numSeqs+1);
59                                 MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
60                         }
61                         
62                         //create database
63                         if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
64                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
65                         else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
66                         else if(method == "distance")   {       database = new DistanceDB();    }
67                         else {
68                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
69                                 database = new KmerDB(tempFile, 8);
70                         }
71
72                         //read file 
73                         for(int i=0;i<numSeqs;i++){
74                                 //read next sequence
75                                 int length = positions[i+1] - positions[i];
76                                 char* buf4 = new char[length];
77                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
78                                 
79                                 string tempBuf = buf4;
80                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
81                                 delete buf4;
82                                 istringstream iss (tempBuf,istringstream::in);
83                                 
84                                 Sequence temp(iss);  
85                                 if (temp.getName() != "") {
86                                         names.push_back(temp.getName());
87                                         database->addSequence(temp);    
88                                 }
89                         }
90                         
91                         database->generateDB();
92                         MPI_File_close(&inMPI);
93                         MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
94         #else
95                 
96                 //need to know number of template seqs for suffixdb
97                 if (method == "suffix") {
98                         ifstream inFASTA;
99                         m->openInputFile(tempFile, inFASTA);
100                         m->getNumSeqs(inFASTA, numSeqs);
101                         inFASTA.close();
102                 }
103
104                 bool needToGenerate = true;
105                 string kmerDBName;
106                 if(method == "kmer")                    {       
107                         database = new KmerDB(tempFile, kmerSize);                      
108                         
109                         kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
110                         ifstream kmerFileTest(kmerDBName.c_str());
111                         if(kmerFileTest){       
112                                 bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion());
113                                 if (GoodFile) {  needToGenerate = false;        }
114                         }
115                 }
116                 else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
117                 else if(method == "blast")              {       database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch);   }
118                 else if(method == "distance")   {       database = new DistanceDB();    }
119                 else {
120                         m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
121                         m->mothurOutEndLine();
122                         database = new KmerDB(tempFile, 8);
123                 }
124                 
125                 if (needToGenerate) {
126                         ifstream fastaFile;
127                         m->openInputFile(tempFile, fastaFile);
128                         
129                         while (!fastaFile.eof()) {
130                                 Sequence temp(fastaFile);
131                                 m->gobble(fastaFile);
132                         
133                                 names.push_back(temp.getName());
134                                                         
135                                 database->addSequence(temp);    
136                         }
137                         fastaFile.close();
138
139                         database->generateDB();
140                         
141                 }else if ((method == "kmer") && (!needToGenerate)) {    
142                         ifstream kmerFileTest(kmerDBName.c_str());
143                         database->readKmerDB(kmerFileTest);     
144                 
145                         ifstream fastaFile;
146                         m->openInputFile(tempFile, fastaFile);
147                         
148                         while (!fastaFile.eof()) {
149                                 Sequence temp(fastaFile);
150                                 m->gobble(fastaFile);
151
152                                 names.push_back(temp.getName());
153                         }
154                         fastaFile.close();
155                 }
156 #endif  
157         
158                 database->setNumSeqs(names.size());
159                 
160                 //sanity check
161                 bool okay = phyloTree->ErrorCheck(names);
162                 
163                 if (!okay) { m->control_pressed = true; }
164                 
165                 m->mothurOut("DONE."); m->mothurOutEndLine();
166                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
167
168         }
169         catch(exception& e) {
170                 m->errorOut(e, "Classify", "generateDatabaseAndNames");
171                 exit(1);
172         }
173 }
174 /**************************************************************************************************/
175 Classify::Classify() {          m = MothurOut::getInstance();   database = NULL;        }
176 /**************************************************************************************************/
177
178 int Classify::readTaxonomy(string file) {
179         try {
180                 
181                 phyloTree = new PhyloTree();
182                 string name, taxInfo;
183                 
184                 m->mothurOutEndLine();
185                 m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
186
187 #ifdef USE_MPI  
188                 int pid, num, processors;
189                 vector<unsigned long int> positions;
190                 int tag = 2001;
191                 
192                 MPI_Status status; 
193                 MPI_File inMPI;
194                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
195                 MPI_Comm_size(MPI_COMM_WORLD, &processors);
196
197                 //char* inFileName = new char[file.length()];
198                 //memcpy(inFileName, file.c_str(), file.length());
199                 
200                 char inFileName[1024];
201                 strcpy(inFileName, file.c_str());
202
203                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
204                 //delete inFileName;
205
206                 if (pid == 0) {
207                         positions = m->setFilePosEachLine(file, num);
208                         
209                         //send file positions to all processes
210                         for(int i = 1; i < processors; i++) { 
211                                 MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
212                                 MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
213                         }
214                 }else{
215                         MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
216                         positions.resize(num+1);
217                         MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
218                 }
219         
220                 //read file 
221                 for(int i=0;i<num;i++){
222                         //read next sequence
223                         int length = positions[i+1] - positions[i];
224                         char* buf4 = new char[length];
225
226                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
227
228                         string tempBuf = buf4;
229                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
230                         delete buf4;
231
232                         istringstream iss (tempBuf,istringstream::in);
233                         iss >> name >> taxInfo;
234                         taxonomy[name] = taxInfo;
235                         phyloTree->addSeqToTree(name, taxInfo);
236                 }
237                 
238                 MPI_File_close(&inMPI);
239                 MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case
240 #else                           
241                 ifstream inTax;
242                 m->openInputFile(file, inTax);
243         
244                 //read template seqs and save
245                 while (!inTax.eof()) {
246                         inTax >> name >> taxInfo;
247                         
248                         taxonomy[name] = taxInfo;
249                         
250                         phyloTree->addSeqToTree(name, taxInfo);
251                 
252                         m->gobble(inTax);
253                 }
254                 inTax.close();
255 #endif  
256         
257                 phyloTree->assignHeirarchyIDs(0);
258                 
259                 phyloTree->setUp(file);
260         
261                 m->mothurOut("DONE.");
262                 m->mothurOutEndLine();  cout.flush();
263                 
264                 return phyloTree->getNumSeqs();
265         
266         }
267         catch(exception& e) {
268                 m->errorOut(e, "Classify", "readTaxonomy");
269                 exit(1);
270         }
271 }
272 /**************************************************************************************************/
273
274 vector<string> Classify::parseTax(string tax) {
275         try {
276                 vector<string> taxons;
277                 
278                 tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
279         
280                 //parse taxonomy
281                 string individual;
282                 while (tax.find_first_of(';') != -1) {
283                         individual = tax.substr(0,tax.find_first_of(';'));
284                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
285                         taxons.push_back(individual);
286                         
287                 }
288                 //get last one
289                 taxons.push_back(tax);
290                 
291                 return taxons;
292         }
293         catch(exception& e) {
294                 m->errorOut(e, "Classify", "parseTax");
295                 exit(1);
296         }
297 }
298 /**************************************************************************************************/
299