]> git.donarmstrong.com Git - mothur.git/blob - classify.cpp
147f499b87357f5782b98fc882db80f3233f2aee
[mothur.git] / classify.cpp
1 /*
2  *  classify.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/3/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classify.h"
11 #include "sequence.hpp"
12 #include "kmerdb.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
16
17 /**************************************************************************************************/
18 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch)  {             
19         try {   
20                 taxFile = tfile;
21                 readTaxonomy(taxFile);  
22                 
23                 templateFile = tempFile;        
24                 
25                 int start = time(NULL);
26                 int numSeqs = 0;
27                 
28                 m->mothurOut("Generating search database...    "); cout.flush();
29 #ifdef USE_MPI  
30                         int pid;
31                         vector<long> positions;
32                 
33                         MPI_Status status; 
34                         MPI_File inMPI;
35                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
36
37                         //char* inFileName = new char[tempFile.length()];
38                         //memcpy(inFileName, tempFile.c_str(), tempFile.length());
39                         
40                         char inFileName[1024];
41                         strcpy(inFileName, tempFile.c_str());
42
43                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
44                         //delete inFileName;
45
46                         if (pid == 0) { //only one process needs to scan file
47                                 positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
48
49                                 //send file positions to all processes
50                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
51                                 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     
52                         }else{
53                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
54                                 positions.resize(numSeqs);
55                                 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
56                         }
57                         
58                         //create database
59                         if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
60                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
61                         else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
62                         else if(method == "distance")   {       database = new DistanceDB();    }
63                         else {
64                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
65                                 database = new KmerDB(tempFile, 8);
66                         }
67
68                         //read file 
69                         for(int i=0;i<numSeqs;i++){
70                                 //read next sequence
71                                 int length = positions[i+1] - positions[i];
72                                 char* buf4 = new char[length];
73                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
74                                 
75                                 string tempBuf = buf4;
76                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
77                                 delete buf4;
78                                 istringstream iss (tempBuf,istringstream::in);
79                                 
80                                 Sequence temp(iss);  
81                                 if (temp.getName() != "") {
82                                         names.push_back(temp.getName());
83                                         database->addSequence(temp);    
84                                 }
85                         }
86                         
87                         database->generateDB();
88                         MPI_File_close(&inMPI);
89         #else
90                 
91                 //need to know number of template seqs for suffixdb
92                 if (method == "suffix") {
93                         ifstream inFASTA;
94                         openInputFile(tempFile, inFASTA);
95                         numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
96                         inFASTA.close();
97                 }
98
99                 bool needToGenerate = true;
100                 string kmerDBName;
101                 if(method == "kmer")                    {       
102                         database = new KmerDB(tempFile, kmerSize);                      
103                         
104                         kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
105                         ifstream kmerFileTest(kmerDBName.c_str());
106                         if(kmerFileTest){       needToGenerate = false;         }
107                 }
108                 else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
109                 else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
110                 else if(method == "distance")   {       database = new DistanceDB();    }
111                 else {
112                         m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
113                         m->mothurOutEndLine();
114                         database = new KmerDB(tempFile, 8);
115                 }
116                 
117                 if (needToGenerate) {
118                         ifstream fastaFile;
119                         openInputFile(tempFile, fastaFile);
120                         
121                         while (!fastaFile.eof()) {
122                                 Sequence temp(fastaFile);
123                                 gobble(fastaFile);
124                         
125                                 names.push_back(temp.getName());
126                                                                 
127                                 database->addSequence(temp);    
128                         }
129                         fastaFile.close();
130
131                         database->generateDB();
132                         
133                 }else if ((method == "kmer") && (!needToGenerate)) {    
134                         ifstream kmerFileTest(kmerDBName.c_str());
135                         database->readKmerDB(kmerFileTest);     
136                         
137                         ifstream fastaFile;
138                         openInputFile(tempFile, fastaFile);
139                         
140                         while (!fastaFile.eof()) {
141                                 Sequence temp(fastaFile);
142                                 gobble(fastaFile);
143
144                                 names.push_back(temp.getName());
145                         }
146                         fastaFile.close();
147                 }
148 #endif          
149                 database->setNumSeqs(names.size());
150                 
151                 m->mothurOut("DONE."); m->mothurOutEndLine();
152                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
153
154         }
155         catch(exception& e) {
156                 m->errorOut(e, "Classify", "generateDatabaseAndNames");
157                 exit(1);
158         }
159 }
160 /**************************************************************************************************/
161 Classify::Classify() {          m = MothurOut::getInstance();   database = NULL;        }
162 /**************************************************************************************************/
163
164 int Classify::readTaxonomy(string file) {
165         try {
166                 
167                 phyloTree = new PhyloTree();
168                 string name, taxInfo;
169                 
170                 m->mothurOutEndLine();
171                 m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
172
173 #ifdef USE_MPI  
174                 int pid, num;
175                 vector<long> positions;
176                 
177                 MPI_Status status; 
178                 MPI_File inMPI;
179                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
180
181                 //char* inFileName = new char[file.length()];
182                 //memcpy(inFileName, file.c_str(), file.length());
183                 
184                 char inFileName[1024];
185                 strcpy(inFileName, file.c_str());
186
187                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
188                 //delete inFileName;
189
190                 if (pid == 0) {
191                         positions = setFilePosEachLine(file, num);
192                         
193                         //send file positions to all processes
194                         MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
195                         MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
196                 }else{
197                         MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
198                         positions.resize(num);
199                         MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
200                 }
201         
202                 //read file 
203                 for(int i=0;i<num;i++){
204                         //read next sequence
205                         int length = positions[i+1] - positions[i];
206                         char* buf4 = new char[length];
207
208                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
209
210                         string tempBuf = buf4;
211                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
212                         delete buf4;
213
214                         istringstream iss (tempBuf,istringstream::in);
215                         iss >> name >> taxInfo;
216                         taxonomy[name] = taxInfo;
217                         phyloTree->addSeqToTree(name, taxInfo);
218                 }
219                 
220                 MPI_File_close(&inMPI);
221 #else                           
222                 ifstream inTax;
223                 openInputFile(file, inTax);
224         
225                 //read template seqs and save
226                 while (!inTax.eof()) {
227                         inTax >> name >> taxInfo;
228                         
229                         taxonomy[name] = taxInfo;
230                         
231                         phyloTree->addSeqToTree(name, taxInfo);
232                 
233                         gobble(inTax);
234                 }
235                 inTax.close();
236 #endif  
237         
238                 phyloTree->assignHeirarchyIDs(0);
239                 
240                 phyloTree->setUp(file);
241                 
242                 m->mothurOut("DONE.");
243                 m->mothurOutEndLine();  cout.flush();
244                 
245                 return phyloTree->getNumSeqs();
246         
247         }
248         catch(exception& e) {
249                 m->errorOut(e, "Classify", "readTaxonomy");
250                 exit(1);
251         }
252 }
253 /**************************************************************************************************/
254
255 vector<string> Classify::parseTax(string tax) {
256         try {
257                 vector<string> taxons;
258                 
259                 tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
260         
261                 //parse taxonomy
262                 string individual;
263                 while (tax.find_first_of(';') != -1) {
264                         individual = tax.substr(0,tax.find_first_of(';'));
265                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
266                         taxons.push_back(individual);
267                         
268                 }
269                 //get last one
270                 taxons.push_back(tax);
271                 
272                 return taxons;
273         }
274         catch(exception& e) {
275                 m->errorOut(e, "Classify", "parseTax");
276                 exit(1);
277         }
278 }
279 /**************************************************************************************************/
280