X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=classify.cpp;h=875628d85d8dbcb4caf6d712b41be517eefef733;hb=5e5253ff472de3c6349e562d2580873287be0c65;hp=f6b7e80be51f9383c2dd6599f737c24fa36a0f6d;hpb=7b3c9ca940891c1b20b3b7ec13e05d7e7b316b63;p=mothur.git diff --git a/classify.cpp b/classify.cpp index f6b7e80..875628d 100644 --- a/classify.cpp +++ b/classify.cpp @@ -12,25 +12,95 @@ #include "kmerdb.hpp" #include "suffixdb.hpp" #include "blastdb.hpp" +#include "distancedb.hpp" /**************************************************************************************************/ - -Classify::Classify(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : taxFile(tfile), templateFile(tempFile) { - try { +void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) { + try { + taxFile = tfile; readTaxonomy(taxFile); + templateFile = tempFile; + int start = time(NULL); int numSeqs = 0; + + m->mothurOut("Generating search database... "); cout.flush(); +#ifdef USE_MPI + int pid, processors; + vector positions; + int tag = 2001; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + //char* inFileName = new char[tempFile.length()]; + //memcpy(inFileName, tempFile.c_str(), tempFile.length()); + + char inFileName[1024]; + strcpy(inFileName, tempFile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + //delete inFileName; + + if (pid == 0) { //only one process needs to scan file + positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } + }else{ + MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions.resize(numSeqs+1); + MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + } + + //create database + if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else if(method == "distance") { database = new DistanceDB(); } + else { + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + istringstream iss (tempBuf,istringstream::in); + + Sequence temp(iss); + if (temp.getName() != "") { + names.push_back(temp.getName()); + database->addSequence(temp); + } + } + + database->generateDB(); + MPI_File_close(&inMPI); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case + #else + //need to know number of template seqs for suffixdb if (method == "suffix") { ifstream inFASTA; - openInputFile(tempFile, inFASTA); - numSeqs = count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + m->openInputFile(tempFile, inFASTA); + m->getNumSeqs(inFASTA, numSeqs); inFASTA.close(); } - mothurOut("Generating search database... "); cout.flush(); - bool needToGenerate = true; string kmerDBName; if(method == "kmer") { @@ -38,26 +108,30 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; ifstream kmerFileTest(kmerDBName.c_str()); - if(kmerFileTest){ needToGenerate = false; } + if(kmerFileTest){ + bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion()); + if (GoodFile) { needToGenerate = false; } + } } else if(method == "suffix") { database = new SuffixDB(numSeqs); } else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else if(method == "distance") { database = new DistanceDB(); } else { - mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); - mothurOutEndLine(); + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); + m->mothurOutEndLine(); database = new KmerDB(tempFile, 8); } if (needToGenerate) { ifstream fastaFile; - openInputFile(tempFile, fastaFile); + m->openInputFile(tempFile, fastaFile); while (!fastaFile.eof()) { Sequence temp(fastaFile); - gobble(fastaFile); + m->gobble(fastaFile); names.push_back(temp.getName()); - + database->addSequence(temp); } fastaFile.close(); @@ -67,68 +141,131 @@ Classify::Classify(string tfile, string tempFile, string method, int kmerSize, i }else if ((method == "kmer") && (!needToGenerate)) { ifstream kmerFileTest(kmerDBName.c_str()); database->readKmerDB(kmerFileTest); - + ifstream fastaFile; - openInputFile(tempFile, fastaFile); + m->openInputFile(tempFile, fastaFile); while (!fastaFile.eof()) { Sequence temp(fastaFile); - gobble(fastaFile); - + m->gobble(fastaFile); + names.push_back(temp.getName()); } fastaFile.close(); } - +#endif + database->setNumSeqs(names.size()); - mothurOut("DONE."); mothurOutEndLine(); - mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); mothurOutEndLine(); + //sanity check + bool okay = phyloTree->ErrorCheck(names); + + if (!okay) { m->control_pressed = true; } + + m->mothurOut("DONE."); m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); } catch(exception& e) { - errorOut(e, "Classify", "Classify"); + m->errorOut(e, "Classify", "generateDatabaseAndNames"); exit(1); } } /**************************************************************************************************/ +Classify::Classify() { m = MothurOut::getInstance(); database = NULL; } +/**************************************************************************************************/ -void Classify::readTaxonomy(string file) { +int Classify::readTaxonomy(string file) { try { phyloTree = new PhyloTree(); + string name, taxInfo; - ifstream inTax; - openInputFile(file, inTax); + m->mothurOutEndLine(); + m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush(); + +#ifdef USE_MPI + int pid, num, processors; + vector positions; + int tag = 2001; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); + + //char* inFileName = new char[file.length()]; + //memcpy(inFileName, file.c_str(), file.length()); + + char inFileName[1024]; + strcpy(inFileName, file.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + //delete inFileName; + + if (pid == 0) { + positions = m->setFilePosEachLine(file, num); + + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&num, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions[0], (num+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } + }else{ + MPI_Recv(&num, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions.resize(num+1); + MPI_Recv(&positions[0], (num+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + } - mothurOutEndLine(); - mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush(); + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + iss >> name >> taxInfo; + taxonomy[name] = taxInfo; + phyloTree->addSeqToTree(name, taxInfo); + } - string name, taxInfo; + MPI_File_close(&inMPI); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case +#else + ifstream inTax; + m->openInputFile(file, inTax); + //read template seqs and save while (!inTax.eof()) { inTax >> name >> taxInfo; taxonomy[name] = taxInfo; - //itTax = taxList.find(taxInfo); - //if (itTax == taxList.end()) { //this is new taxonomy - //taxList[taxInfo] = 1; - //}else { taxList[taxInfo]++; } phyloTree->addSeqToTree(name, taxInfo); - gobble(inTax); + m->gobble(inTax); } - - phyloTree->assignHeirarchyIDs(0); inTax.close(); +#endif - mothurOut("DONE."); - mothurOutEndLine(); cout.flush(); + phyloTree->assignHeirarchyIDs(0); + + phyloTree->setUp(file); + + m->mothurOut("DONE."); + m->mothurOutEndLine(); cout.flush(); + + return phyloTree->getNumSeqs(); } catch(exception& e) { - errorOut(e, "Classify", "readTaxonomy"); + m->errorOut(e, "Classify", "readTaxonomy"); exit(1); } } @@ -154,7 +291,7 @@ vector Classify::parseTax(string tax) { return taxons; } catch(exception& e) { - errorOut(e, "Classify", "parseTax"); + m->errorOut(e, "Classify", "parseTax"); exit(1); } }