]> git.donarmstrong.com Git - mothur.git/blob - classify.cpp
1.9
[mothur.git] / classify.cpp
1 /*\r
2  *  classify.cpp\r
3  *  Mothur\r
4  *\r
5  *  Created by westcott on 11/3/09.\r
6  *  Copyright 2009 Schloss Lab. All rights reserved.\r
7  *\r
8  */\r
9 \r
10 #include "classify.h"\r
11 #include "sequence.hpp"\r
12 #include "kmerdb.hpp"\r
13 #include "suffixdb.hpp"\r
14 #include "blastdb.hpp"\r
15 #include "distancedb.hpp"\r
16 \r
17 /**************************************************************************************************/\r
18 Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {          \r
19         try {   \r
20                 m = MothurOut::getInstance();                                                                   \r
21                 readTaxonomy(taxFile);  \r
22                 \r
23                 int start = time(NULL);\r
24                 int numSeqs = 0;\r
25                 \r
26                 m->mothurOut("Generating search database...    "); cout.flush();\r
27 #ifdef USE_MPI  \r
28                         int pid;\r
29                         vector<long> positions;\r
30                 \r
31                         MPI_Status status; \r
32                         MPI_File inMPI;\r
33                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
34 \r
35                         char* inFileName = new char[tempFile.length()];\r
36                         memcpy(inFileName, tempFile.c_str(), tempFile.length());\r
37         \r
38                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer\r
39                         delete inFileName;\r
40 \r
41                         if (pid == 0) { //only one process needs to scan file\r
42                                 positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs\r
43 \r
44                                 //send file positions to all processes\r
45                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs\r
46                                 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     \r
47                         }else{\r
48                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
49                                 positions.resize(numSeqs);\r
50                                 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
51                         }\r
52                         \r
53                         //create database\r
54                         if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }\r
55                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }\r
56                         else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }\r
57                         else if(method == "distance")   {       database = new DistanceDB();    }\r
58                         else {\r
59                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();\r
60                                 database = new KmerDB(tempFile, 8);\r
61                         }\r
62 \r
63                         //read file \r
64                         for(int i=0;i<numSeqs;i++){\r
65                                 //read next sequence\r
66                                 int length = positions[i+1] - positions[i];\r
67                                 char* buf4 = new char[length];\r
68                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
69                                 \r
70                                 string tempBuf = buf4;\r
71                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
72                                 delete buf4;\r
73                                 istringstream iss (tempBuf,istringstream::in);\r
74                                 \r
75                                 Sequence temp(iss);  \r
76                                 if (temp.getName() != "") {\r
77                                         names.push_back(temp.getName());\r
78                                         database->addSequence(temp);    \r
79                                 }\r
80                         }\r
81                         \r
82                         database->generateDB();\r
83                         MPI_File_close(&inMPI);\r
84         #else\r
85                 \r
86                 //need to know number of template seqs for suffixdb\r
87                 if (method == "suffix") {\r
88                         ifstream inFASTA;\r
89                         openInputFile(tempFile, inFASTA);\r
90                         numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');\r
91                         inFASTA.close();\r
92                 }\r
93 \r
94                 bool needToGenerate = true;\r
95                 string kmerDBName;\r
96                 if(method == "kmer")                    {       \r
97                         database = new KmerDB(tempFile, kmerSize);                      \r
98                         \r
99                         kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";\r
100                         ifstream kmerFileTest(kmerDBName.c_str());\r
101                         if(kmerFileTest){       needToGenerate = false;         }\r
102                 }\r
103                 else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }\r
104                 else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }\r
105                 else if(method == "distance")   {       database = new DistanceDB();    }\r
106                 else {\r
107                         m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");\r
108                         m->mothurOutEndLine();\r
109                         database = new KmerDB(tempFile, 8);\r
110                 }\r
111                 \r
112                 if (needToGenerate) {\r
113                         ifstream fastaFile;\r
114                         openInputFile(tempFile, fastaFile);\r
115                         \r
116                         while (!fastaFile.eof()) {\r
117                                 Sequence temp(fastaFile);\r
118                                 gobble(fastaFile);\r
119                         \r
120                                 names.push_back(temp.getName());\r
121                                                                 \r
122                                 database->addSequence(temp);    \r
123                         }\r
124                         fastaFile.close();\r
125 \r
126                         database->generateDB();\r
127                         \r
128                 }else if ((method == "kmer") && (!needToGenerate)) {    \r
129                         ifstream kmerFileTest(kmerDBName.c_str());\r
130                         database->readKmerDB(kmerFileTest);     \r
131                         \r
132                         ifstream fastaFile;\r
133                         openInputFile(tempFile, fastaFile);\r
134                         \r
135                         while (!fastaFile.eof()) {\r
136                                 Sequence temp(fastaFile);\r
137                                 gobble(fastaFile);\r
138                         \r
139                                 names.push_back(temp.getName());\r
140                         }\r
141                         fastaFile.close();\r
142                 }\r
143 #endif          \r
144                 database->setNumSeqs(names.size());\r
145                 \r
146                 m->mothurOut("DONE."); m->mothurOutEndLine();\r
147                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();\r
148 \r
149         }\r
150         catch(exception& e) {\r
151                 m->errorOut(e, "Classify", "Classify");\r
152                 exit(1);\r
153         }\r
154 }\r
155 /**************************************************************************************************/\r
156 \r
157 void Classify::readTaxonomy(string file) {\r
158         try {\r
159                 \r
160                 phyloTree = new PhyloTree();\r
161                 string name, taxInfo;\r
162                 \r
163                 m->mothurOutEndLine();\r
164                 m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();\r
165 \r
166 #ifdef USE_MPI  \r
167                 int pid, num;\r
168                 vector<long> positions;\r
169                 \r
170                 MPI_Status status; \r
171                 MPI_File inMPI;\r
172                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
173 \r
174                 char* inFileName = new char[file.length()];\r
175                 memcpy(inFileName, file.c_str(), file.length());\r
176                 \r
177                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer\r
178                 delete inFileName;\r
179 \r
180                 if (pid == 0) {\r
181                         positions = setFilePosEachLine(file, num);\r
182                         \r
183                         //send file positions to all processes\r
184                         MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs\r
185                         MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
186                 }else{\r
187                         MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
188                         positions.resize(num);\r
189                         MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
190                 }\r
191         \r
192                 //read file \r
193                 for(int i=0;i<num;i++){\r
194                         //read next sequence\r
195                         int length = positions[i+1] - positions[i];\r
196                         char* buf4 = new char[length];\r
197 \r
198                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
199 \r
200                         string tempBuf = buf4;\r
201                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
202                         delete buf4;\r
203 \r
204                         istringstream iss (tempBuf,istringstream::in);\r
205                         iss >> name >> taxInfo;\r
206                         taxonomy[name] = taxInfo;\r
207                         phyloTree->addSeqToTree(name, taxInfo);\r
208                 }\r
209                 \r
210                 MPI_File_close(&inMPI);\r
211 #else                           \r
212                 ifstream inTax;\r
213                 openInputFile(file, inTax);\r
214         \r
215                 //read template seqs and save\r
216                 while (!inTax.eof()) {\r
217                         inTax >> name >> taxInfo;\r
218                         \r
219                         taxonomy[name] = taxInfo;\r
220                         \r
221                         phyloTree->addSeqToTree(name, taxInfo);\r
222                 \r
223                         gobble(inTax);\r
224                 }\r
225                 inTax.close();\r
226 #endif  \r
227         \r
228                 phyloTree->assignHeirarchyIDs(0);\r
229                 \r
230                 m->mothurOut("DONE.");\r
231                 m->mothurOutEndLine();  cout.flush();\r
232         \r
233         }\r
234         catch(exception& e) {\r
235                 m->errorOut(e, "Classify", "readTaxonomy");\r
236                 exit(1);\r
237         }\r
238 }\r
239 /**************************************************************************************************/\r
240 \r
241 vector<string> Classify::parseTax(string tax) {\r
242         try {\r
243                 vector<string> taxons;\r
244                 \r
245                 tax = tax.substr(0, tax.length()-1);  //get rid of last ';'\r
246         \r
247                 //parse taxonomy\r
248                 string individual;\r
249                 while (tax.find_first_of(';') != -1) {\r
250                         individual = tax.substr(0,tax.find_first_of(';'));\r
251                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());\r
252                         taxons.push_back(individual);\r
253                         \r
254                 }\r
255                 //get last one\r
256                 taxons.push_back(tax);\r
257                 \r
258                 return taxons;\r
259         }\r
260         catch(exception& e) {\r
261                 m->errorOut(e, "Classify", "parseTax");\r
262                 exit(1);\r
263         }\r
264 }\r
265 /**************************************************************************************************/\r
266 \r