]> git.donarmstrong.com Git - mothur.git/blob - classify.cpp
a few modifications for 1.9
[mothur.git] / classify.cpp
1 /*\r
2  *  classify.cpp\r
3  *  Mothur\r
4  *\r
5  *  Created by westcott on 11/3/09.\r
6  *  Copyright 2009 Schloss Lab. All rights reserved.\r
7  *\r
8  */\r
9 \r
10 #include "classify.h"\r
11 #include "sequence.hpp"\r
12 #include "kmerdb.hpp"\r
13 #include "suffixdb.hpp"\r
14 #include "blastdb.hpp"\r
15 #include "distancedb.hpp"\r
16 \r
17 /**************************************************************************************************/\r
18 Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {          \r
19         try {   \r
20                 m = MothurOut::getInstance();                                                                   \r
21                 readTaxonomy(taxFile);  \r
22                 \r
23                 int start = time(NULL);\r
24                 int numSeqs = 0;\r
25                 \r
26                 m->mothurOut("Generating search database...    "); cout.flush();\r
27 #ifdef USE_MPI  \r
28                         int pid;\r
29                         vector<long> positions;\r
30                 \r
31                         MPI_Status status; \r
32                         MPI_File inMPI;\r
33                         MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
34 \r
35                         //char* inFileName = new char[tempFile.length()];\r
36                         //memcpy(inFileName, tempFile.c_str(), tempFile.length());\r
37                         \r
38                         char inFileName[1024];\r
39                         strcpy(inFileName, tempFile.c_str());\r
40 \r
41                         MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer\r
42                         //delete inFileName;\r
43 \r
44                         if (pid == 0) { //only one process needs to scan file\r
45                                 positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs\r
46 \r
47                                 //send file positions to all processes\r
48                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs\r
49                                 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     \r
50                         }else{\r
51                                 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
52                                 positions.resize(numSeqs);\r
53                                 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
54                         }\r
55                         \r
56                         //create database\r
57                         if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }\r
58                         else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }\r
59                         else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }\r
60                         else if(method == "distance")   {       database = new DistanceDB();    }\r
61                         else {\r
62                                 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();\r
63                                 database = new KmerDB(tempFile, 8);\r
64                         }\r
65 \r
66                         //read file \r
67                         for(int i=0;i<numSeqs;i++){\r
68                                 //read next sequence\r
69                                 int length = positions[i+1] - positions[i];\r
70                                 char* buf4 = new char[length];\r
71                                 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
72                                 \r
73                                 string tempBuf = buf4;\r
74                                 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
75                                 delete buf4;\r
76                                 istringstream iss (tempBuf,istringstream::in);\r
77                                 \r
78                                 Sequence temp(iss);  \r
79                                 if (temp.getName() != "") {\r
80                                         names.push_back(temp.getName());\r
81                                         database->addSequence(temp);    \r
82                                 }\r
83                         }\r
84                         \r
85                         database->generateDB();\r
86                         MPI_File_close(&inMPI);\r
87         #else\r
88                 \r
89                 //need to know number of template seqs for suffixdb\r
90                 if (method == "suffix") {\r
91                         ifstream inFASTA;\r
92                         openInputFile(tempFile, inFASTA);\r
93                         numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');\r
94                         inFASTA.close();\r
95                 }\r
96 \r
97                 bool needToGenerate = true;\r
98                 string kmerDBName;\r
99                 if(method == "kmer")                    {       \r
100                         database = new KmerDB(tempFile, kmerSize);                      \r
101                         \r
102                         kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";\r
103                         ifstream kmerFileTest(kmerDBName.c_str());\r
104                         if(kmerFileTest){       needToGenerate = false;         }\r
105                 }\r
106                 else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }\r
107                 else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }\r
108                 else if(method == "distance")   {       database = new DistanceDB();    }\r
109                 else {\r
110                         m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");\r
111                         m->mothurOutEndLine();\r
112                         database = new KmerDB(tempFile, 8);\r
113                 }\r
114                 \r
115                 if (needToGenerate) {\r
116                         ifstream fastaFile;\r
117                         openInputFile(tempFile, fastaFile);\r
118                         \r
119                         while (!fastaFile.eof()) {\r
120                                 Sequence temp(fastaFile);\r
121                                 gobble(fastaFile);\r
122                         \r
123                                 names.push_back(temp.getName());\r
124                                                                 \r
125                                 database->addSequence(temp);    \r
126                         }\r
127                         fastaFile.close();\r
128 \r
129                         database->generateDB();\r
130                         \r
131                 }else if ((method == "kmer") && (!needToGenerate)) {    \r
132                         ifstream kmerFileTest(kmerDBName.c_str());\r
133                         database->readKmerDB(kmerFileTest);     \r
134                         \r
135                         ifstream fastaFile;\r
136                         openInputFile(tempFile, fastaFile);\r
137                         \r
138                         while (!fastaFile.eof()) {\r
139                                 Sequence temp(fastaFile);\r
140                                 gobble(fastaFile);\r
141                         \r
142                                 names.push_back(temp.getName());\r
143                         }\r
144                         fastaFile.close();\r
145                 }\r
146 #endif          \r
147                 database->setNumSeqs(names.size());\r
148                 \r
149                 m->mothurOut("DONE."); m->mothurOutEndLine();\r
150                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();\r
151 \r
152         }\r
153         catch(exception& e) {\r
154                 m->errorOut(e, "Classify", "Classify");\r
155                 exit(1);\r
156         }\r
157 }\r
158 /**************************************************************************************************/\r
159 \r
160 void Classify::readTaxonomy(string file) {\r
161         try {\r
162                 \r
163                 phyloTree = new PhyloTree();\r
164                 string name, taxInfo;\r
165                 \r
166                 m->mothurOutEndLine();\r
167                 m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();\r
168 \r
169 #ifdef USE_MPI  \r
170                 int pid, num;\r
171                 vector<long> positions;\r
172                 \r
173                 MPI_Status status; \r
174                 MPI_File inMPI;\r
175                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
176 \r
177                 //char* inFileName = new char[file.length()];\r
178                 //memcpy(inFileName, file.c_str(), file.length());\r
179                 \r
180                 char inFileName[1024];\r
181                 strcpy(inFileName, file.c_str());\r
182 \r
183                 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer\r
184                 //delete inFileName;\r
185 \r
186                 if (pid == 0) {\r
187                         positions = setFilePosEachLine(file, num);\r
188                         \r
189                         //send file positions to all processes\r
190                         MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs\r
191                         MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
192                 }else{\r
193                         MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
194                         positions.resize(num);\r
195                         MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
196                 }\r
197         \r
198                 //read file \r
199                 for(int i=0;i<num;i++){\r
200                         //read next sequence\r
201                         int length = positions[i+1] - positions[i];\r
202                         char* buf4 = new char[length];\r
203 \r
204                         MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
205 \r
206                         string tempBuf = buf4;\r
207                         if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
208                         delete buf4;\r
209 \r
210                         istringstream iss (tempBuf,istringstream::in);\r
211                         iss >> name >> taxInfo;\r
212                         taxonomy[name] = taxInfo;\r
213                         phyloTree->addSeqToTree(name, taxInfo);\r
214                 }\r
215                 \r
216                 MPI_File_close(&inMPI);\r
217 #else                           \r
218                 ifstream inTax;\r
219                 openInputFile(file, inTax);\r
220         \r
221                 //read template seqs and save\r
222                 while (!inTax.eof()) {\r
223                         inTax >> name >> taxInfo;\r
224                         \r
225                         taxonomy[name] = taxInfo;\r
226                         \r
227                         phyloTree->addSeqToTree(name, taxInfo);\r
228                 \r
229                         gobble(inTax);\r
230                 }\r
231                 inTax.close();\r
232 #endif  \r
233         \r
234                 phyloTree->assignHeirarchyIDs(0);\r
235                 \r
236                 m->mothurOut("DONE.");\r
237                 m->mothurOutEndLine();  cout.flush();\r
238         \r
239         }\r
240         catch(exception& e) {\r
241                 m->errorOut(e, "Classify", "readTaxonomy");\r
242                 exit(1);\r
243         }\r
244 }\r
245 /**************************************************************************************************/\r
246 \r
247 vector<string> Classify::parseTax(string tax) {\r
248         try {\r
249                 vector<string> taxons;\r
250                 \r
251                 tax = tax.substr(0, tax.length()-1);  //get rid of last ';'\r
252         \r
253                 //parse taxonomy\r
254                 string individual;\r
255                 while (tax.find_first_of(';') != -1) {\r
256                         individual = tax.substr(0,tax.find_first_of(';'));\r
257                         tax = tax.substr(tax.find_first_of(';')+1, tax.length());\r
258                         taxons.push_back(individual);\r
259                         \r
260                 }\r
261                 //get last one\r
262                 taxons.push_back(tax);\r
263                 \r
264                 return taxons;\r
265         }\r
266         catch(exception& e) {\r
267                 m->errorOut(e, "Classify", "parseTax");\r
268                 exit(1);\r
269         }\r
270 }\r
271 /**************************************************************************************************/\r
272 \r