5 * Created by westcott on 11/3/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
11 #include "sequence.hpp"
13 #include "suffixdb.hpp"
14 #include "blastdb.hpp"
15 #include "distancedb.hpp"
17 /**************************************************************************************************/
18 void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) {
21 readTaxonomy(taxFile);
23 templateFile = tempFile;
25 int start = time(NULL);
28 m->mothurOut("Generating search database... "); cout.flush();
31 vector<long> positions;
35 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
37 //char* inFileName = new char[tempFile.length()];
38 //memcpy(inFileName, tempFile.c_str(), tempFile.length());
40 char inFileName[1024];
41 strcpy(inFileName, tempFile.c_str());
43 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
46 if (pid == 0) { //only one process needs to scan file
47 positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
49 //send file positions to all processes
50 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
51 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
53 MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
54 positions.resize(numSeqs);
55 MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
59 if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
60 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
61 else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
62 else if(method == "distance") { database = new DistanceDB(); }
64 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
65 database = new KmerDB(tempFile, 8);
69 for(int i=0;i<numSeqs;i++){
71 int length = positions[i+1] - positions[i];
72 char* buf4 = new char[length];
73 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
75 string tempBuf = buf4;
76 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
78 istringstream iss (tempBuf,istringstream::in);
81 if (temp.getName() != "") {
82 names.push_back(temp.getName());
83 database->addSequence(temp);
87 database->generateDB();
88 MPI_File_close(&inMPI);
91 //need to know number of template seqs for suffixdb
92 if (method == "suffix") {
94 openInputFile(tempFile, inFASTA);
95 numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
99 bool needToGenerate = true;
101 if(method == "kmer") {
102 database = new KmerDB(tempFile, kmerSize);
104 kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
105 ifstream kmerFileTest(kmerDBName.c_str());
106 if(kmerFileTest){ needToGenerate = false; }
108 else if(method == "suffix") { database = new SuffixDB(numSeqs); }
109 else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
110 else if(method == "distance") { database = new DistanceDB(); }
112 m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
113 m->mothurOutEndLine();
114 database = new KmerDB(tempFile, 8);
117 if (needToGenerate) {
119 openInputFile(tempFile, fastaFile);
121 while (!fastaFile.eof()) {
122 Sequence temp(fastaFile);
125 names.push_back(temp.getName());
127 database->addSequence(temp);
131 database->generateDB();
133 }else if ((method == "kmer") && (!needToGenerate)) {
134 ifstream kmerFileTest(kmerDBName.c_str());
135 database->readKmerDB(kmerFileTest);
138 openInputFile(tempFile, fastaFile);
140 while (!fastaFile.eof()) {
141 Sequence temp(fastaFile);
144 names.push_back(temp.getName());
149 database->setNumSeqs(names.size());
151 m->mothurOut("DONE."); m->mothurOutEndLine();
152 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
155 catch(exception& e) {
156 m->errorOut(e, "Classify", "generateDatabaseAndNames");
160 /**************************************************************************************************/
161 Classify::Classify() { m = MothurOut::getInstance(); database = NULL; }
162 /**************************************************************************************************/
164 int Classify::readTaxonomy(string file) {
167 phyloTree = new PhyloTree();
168 string name, taxInfo;
170 m->mothurOutEndLine();
171 m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
175 vector<long> positions;
179 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
181 //char* inFileName = new char[file.length()];
182 //memcpy(inFileName, file.c_str(), file.length());
184 char inFileName[1024];
185 strcpy(inFileName, file.c_str());
187 MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
191 positions = setFilePosEachLine(file, num);
193 //send file positions to all processes
194 MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
195 MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
197 MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
198 positions.resize(num);
199 MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
203 for(int i=0;i<num;i++){
205 int length = positions[i+1] - positions[i];
206 char* buf4 = new char[length];
208 MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
210 string tempBuf = buf4;
211 if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
214 istringstream iss (tempBuf,istringstream::in);
215 iss >> name >> taxInfo;
216 taxonomy[name] = taxInfo;
217 phyloTree->addSeqToTree(name, taxInfo);
220 MPI_File_close(&inMPI);
223 openInputFile(file, inTax);
225 //read template seqs and save
226 while (!inTax.eof()) {
227 inTax >> name >> taxInfo;
229 taxonomy[name] = taxInfo;
231 phyloTree->addSeqToTree(name, taxInfo);
238 phyloTree->assignHeirarchyIDs(0);
240 phyloTree->setUp(file);
242 m->mothurOut("DONE.");
243 m->mothurOutEndLine(); cout.flush();
245 return phyloTree->getNumSeqs();
248 catch(exception& e) {
249 m->errorOut(e, "Classify", "readTaxonomy");
253 /**************************************************************************************************/
255 vector<string> Classify::parseTax(string tax) {
257 vector<string> taxons;
259 tax = tax.substr(0, tax.length()-1); //get rid of last ';'
263 while (tax.find_first_of(';') != -1) {
264 individual = tax.substr(0,tax.find_first_of(';'));
265 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
266 taxons.push_back(individual);
270 taxons.push_back(tax);
274 catch(exception& e) {
275 m->errorOut(e, "Classify", "parseTax");
279 /**************************************************************************************************/