]> git.donarmstrong.com Git - mothur.git/blob - classify.cpp
557f17c6b85c7eb61865591ad3a8481ffa38ed38
[mothur.git] / classify.cpp
1 /*
2  *  classify.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 11/3/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "classify.h"
11 #include "sequence.hpp"
12 #include "kmerdb.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
16
17 /**************************************************************************************************/
18 Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {          
19         try {   
20                 m = MothurOut::getInstance();                                                                   
21                 readTaxonomy(taxFile);  
22                 
23                 int start = time(NULL);
24                 int numSeqs = 0;
25                 
26                 m->mothurOut("Generating search database...    "); cout.flush();
27 #ifdef USE_MPI  
28                         int pid;
29                         vector<long> positions;
30                 
31                         MPI_Status status; 
32                         MPI_File inMPI;
33                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
34         
35                         char inFileName[tempFile.length()];
36                         strcpy(inFileName, tempFile.c_str());
37         
38                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
39         
40                         if (pid == 0) { //only one process needs to scan file
41                                 positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
42
43                                 //send file positions to all processes
44                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
45                                 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     
46                         }else{
47                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
48                                 positions.resize(numSeqs);
49                                 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
50                         }
51                         
52                         //create database
53                         if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
54                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
55                         else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
56                         else if(method == "distance")   {       database = new DistanceDB();    }
57                         else {
58                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
59                                 database = new KmerDB(tempFile, 8);
60                         }
61
62                         //read file 
63                         for(int i=0;i<numSeqs;i++){
64                                 //read next sequence
65                                 int length = positions[i+1] - positions[i];
66                                 char buf4[length];
67                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
68                                 
69                                 string tempBuf = buf4;
70                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
71                                 
72                                 istringstream iss (tempBuf,istringstream::in);
73                                 
74                                 Sequence temp(iss);  
75                                 if (temp.getName() != "") {
76                                         names.push_back(temp.getName());
77                                         database->addSequence(temp);    
78                                 }
79                         }
80                         
81                         database->generateDB();
82                         MPI_File_close(&inMPI);
83         #else
84                 
85                 //need to know number of template seqs for suffixdb
86                 if (method == "suffix") {
87                         ifstream inFASTA;
88                         openInputFile(tempFile, inFASTA);
89                         numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
90                         inFASTA.close();
91                 }
92
93                 bool needToGenerate = true;
94                 string kmerDBName;
95                 if(method == "kmer")                    {       
96                         database = new KmerDB(tempFile, kmerSize);                      
97                         
98                         kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
99                         ifstream kmerFileTest(kmerDBName.c_str());
100                         if(kmerFileTest){       needToGenerate = false;         }
101                 }
102                 else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
103                 else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
104                 else if(method == "distance")   {       database = new DistanceDB();    }
105                 else {
106                         m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
107                         m->mothurOutEndLine();
108                         database = new KmerDB(tempFile, 8);
109                 }
110                 
111                 if (needToGenerate) {
112                         ifstream fastaFile;
113                         openInputFile(tempFile, fastaFile);
114                         
115                         while (!fastaFile.eof()) {
116                                 Sequence temp(fastaFile);
117                                 gobble(fastaFile);
118                         
119                                 names.push_back(temp.getName());
120                                                                 
121                                 database->addSequence(temp);    
122                         }
123                         fastaFile.close();
124
125                         database->generateDB();
126                         
127                 }else if ((method == "kmer") && (!needToGenerate)) {    
128                         ifstream kmerFileTest(kmerDBName.c_str());
129                         database->readKmerDB(kmerFileTest);     
130                         
131                         ifstream fastaFile;
132                         openInputFile(tempFile, fastaFile);
133                         
134                         while (!fastaFile.eof()) {
135                                 Sequence temp(fastaFile);
136                                 gobble(fastaFile);
137                         
138                                 names.push_back(temp.getName());
139                         }
140                         fastaFile.close();
141                 }
142 #endif          
143                 database->setNumSeqs(names.size());
144                 
145                 m->mothurOut("DONE."); m->mothurOutEndLine();
146                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
147
148         }
149         catch(exception& e) {
150                 m->errorOut(e, "Classify", "Classify");
151                 exit(1);
152         }
153 }
154 /**************************************************************************************************/
155
156 void Classify::readTaxonomy(string file) {
157         try {
158                 
159                 phyloTree = new PhyloTree();
160                 string name, taxInfo;
161                 
162                 m->mothurOutEndLine();
163                 m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
164
165 #ifdef USE_MPI  
166                 int pid, num;
167                 vector<long> positions;
168                 
169                 MPI_Status status; 
170                 MPI_File inMPI;
171                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
172                 
173                 char inFileName[file.length()];
174                 strcpy(inFileName, file.c_str());
175                 
176                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
177                 
178                 if (pid == 0) {
179                         positions = setFilePosEachLine(file, num);
180                         
181                         //send file positions to all processes
182                         MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
183                         MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
184                 }else{
185                         MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
186                         positions.resize(num);
187                         MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
188                 }
189         
190                 //read file 
191                 for(int i=0;i<num;i++){
192                         //read next sequence
193                         int length = positions[i+1] - positions[i];
194                         char buf4[length];
195
196                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
197
198                         string tempBuf = buf4;
199                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
200                         
201                         istringstream iss (tempBuf,istringstream::in);
202                         iss >> name >> taxInfo;
203                         taxonomy[name] = taxInfo;
204                         phyloTree->addSeqToTree(name, taxInfo);
205                 }
206                 
207                 MPI_File_close(&inMPI);
208 #else                           
209                 ifstream inTax;
210                 openInputFile(file, inTax);
211         
212                 //read template seqs and save
213                 while (!inTax.eof()) {
214                         inTax >> name >> taxInfo;
215                         
216                         taxonomy[name] = taxInfo;
217                         
218                         phyloTree->addSeqToTree(name, taxInfo);
219                 
220                         gobble(inTax);
221                 }
222                 inTax.close();
223 #endif  
224         
225                 phyloTree->assignHeirarchyIDs(0);
226                 
227                 m->mothurOut("DONE.");
228                 m->mothurOutEndLine();  cout.flush();
229         
230         }
231         catch(exception& e) {
232                 m->errorOut(e, "Classify", "readTaxonomy");
233                 exit(1);
234         }
235 }
236 /**************************************************************************************************/
237
238 vector<string> Classify::parseTax(string tax) {
239         try {
240                 vector<string> taxons;
241                 
242                 tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
243         
244                 //parse taxonomy
245                 string individual;
246                 while (tax.find_first_of(';') != -1) {
247                         individual = tax.substr(0,tax.find_first_of(';'));
248                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());
249                         taxons.push_back(individual);
250                         
251                 }
252                 //get last one
253                 taxons.push_back(tax);
254                 
255                 return taxons;
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "Classify", "parseTax");
259                 exit(1);
260         }
261 }
262 /**************************************************************************************************/
263