5 * Created by westcott on 11/3/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "sequence.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
17 /**************************************************************************************************/
18 Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {
20 m = MothurOut::getInstance();
21 readTaxonomy(taxFile);
23 int start = time(NULL);
26 m->mothurOut("Generating search database... "); cout.flush();
29 vector<long> positions;
33 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
35 char inFileName[tempFile.length()];
36 strcpy(inFileName, tempFile.c_str());
38 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
40 if (pid == 0) { //only one process needs to scan file
41 positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
43 //send file positions to all processes
44 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
45 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
47 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
48 positions.resize(numSeqs);
49 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
53 if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
54 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
55 else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
56 else if(method == "distance") { database = new DistanceDB(); }
58 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
59 database = new KmerDB(tempFile, 8);
63 for(int i=0;i<numSeqs;i++){
65 int length = positions[i+1] - positions[i];
67 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
69 string tempBuf = buf4;
70 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
72 istringstream iss (tempBuf,istringstream::in);
75 if (temp.getName() != "") {
76 names.push_back(temp.getName());
77 database->addSequence(temp);
81 database->generateDB();
82 MPI_File_close(&inMPI);
85 //need to know number of template seqs for suffixdb
86 if (method == "suffix") {
88 openInputFile(tempFile, inFASTA);
89 numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
93 bool needToGenerate = true;
95 if(method == "kmer") {
96 database = new KmerDB(tempFile, kmerSize);
98 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
99 ifstream kmerFileTest(kmerDBName.c_str());
100 if(kmerFileTest){ needToGenerate = false; }
102 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
103 else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
104 else if(method == "distance") { database = new DistanceDB(); }
106 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
107 m->mothurOutEndLine();
108 database = new KmerDB(tempFile, 8);
111 if (needToGenerate) {
113 openInputFile(tempFile, fastaFile);
115 while (!fastaFile.eof()) {
116 Sequence temp(fastaFile);
119 names.push_back(temp.getName());
121 database->addSequence(temp);
125 database->generateDB();
127 }else if ((method == "kmer") && (!needToGenerate)) {
128 ifstream kmerFileTest(kmerDBName.c_str());
129 database->readKmerDB(kmerFileTest);
132 openInputFile(tempFile, fastaFile);
134 while (!fastaFile.eof()) {
135 Sequence temp(fastaFile);
138 names.push_back(temp.getName());
143 database->setNumSeqs(names.size());
145 m->mothurOut("DONE."); m->mothurOutEndLine();
146 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
149 catch(exception& e) {
150 m->errorOut(e, "Classify", "Classify");
154 /**************************************************************************************************/
156 void Classify::readTaxonomy(string file) {
159 phyloTree = new PhyloTree();
160 string name, taxInfo;
162 m->mothurOutEndLine();
163 m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
167 vector<long> positions;
171 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
173 char inFileName[file.length()];
174 strcpy(inFileName, file.c_str());
176 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
179 positions = setFilePosEachLine(file, num);
181 //send file positions to all processes
182 MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
183 MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
185 MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
186 positions.resize(num);
187 MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
191 for(int i=0;i<num;i++){
193 int length = positions[i+1] - positions[i];
196 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
198 string tempBuf = buf4;
199 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
201 istringstream iss (tempBuf,istringstream::in);
202 iss >> name >> taxInfo;
203 taxonomy[name] = taxInfo;
204 phyloTree->addSeqToTree(name, taxInfo);
207 MPI_File_close(&inMPI);
210 openInputFile(file, inTax);
212 //read template seqs and save
213 while (!inTax.eof()) {
214 inTax >> name >> taxInfo;
216 taxonomy[name] = taxInfo;
218 phyloTree->addSeqToTree(name, taxInfo);
225 phyloTree->assignHeirarchyIDs(0);
227 m->mothurOut("DONE.");
228 m->mothurOutEndLine(); cout.flush();
231 catch(exception& e) {
232 m->errorOut(e, "Classify", "readTaxonomy");
236 /**************************************************************************************************/
238 vector<string> Classify::parseTax(string tax) {
240 vector<string> taxons;
242 tax = tax.substr(0, tax.length()-1); //get rid of last ';'
246 while (tax.find_first_of(';') != -1) {
247 individual = tax.substr(0,tax.find_first_of(';'));
248 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
249 taxons.push_back(individual);
253 taxons.push_back(tax);
257 catch(exception& e) {
258 m->errorOut(e, "Classify", "parseTax");
262 /**************************************************************************************************/