X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=classify.cpp;h=36179f471da4ac5d40b563e609feb2f8d3d32e6c;hp=22728eb7a3ee52a57926e5457a33d73c9a1d7d07;hb=df7e3ff9f68ef157b0328a2d353c3258c5d45d89;hpb=cd985cf388dcc4c7de8251339206aec5f7e12f1e diff --git a/classify.cpp b/classify.cpp index 22728eb..36179f4 100644 --- a/classify.cpp +++ b/classify.cpp @@ -13,154 +13,222 @@ #include "suffixdb.hpp" #include "blastdb.hpp" #include "distancedb.hpp" +#include "referencedb.h" /**************************************************************************************************/ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) { try { - taxFile = tfile; - readTaxonomy(taxFile); + ReferenceDB* rdb = ReferenceDB::getInstance(); - templateFile = tempFile; + if (tfile == "saved") { tfile = rdb->getSavedTaxonomy(); } - int start = time(NULL); - int numSeqs = 0; + taxFile = tfile; - m->mothurOut("Generating search database... "); cout.flush(); -#ifdef USE_MPI - int pid, processors; - vector positions; - int tag = 2001; + int numSeqs = 0; - MPI_Status status; - MPI_File inMPI; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - MPI_Comm_size(MPI_COMM_WORLD, &processors); - - //char* inFileName = new char[tempFile.length()]; - //memcpy(inFileName, tempFile.c_str(), tempFile.length()); + if (tempFile == "saved") { + int start = time(NULL); + m->mothurOutEndLine(); m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory."); m->mothurOutEndLine(); - char inFileName[1024]; - strcpy(inFileName, tempFile.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - //delete inFileName; - - if (pid == 0) { //only one process needs to scan file - positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs - - //send file positions to all processes - for(int i = 1; i < processors; i++) { - MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); - MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + numSeqs = rdb->referenceSeqs.size(); + templateFile = rdb->getSavedReference(); + tempFile = rdb->getSavedReference(); + + bool needToGenerate = true; + string kmerDBName; + if(method == "kmer") { + database = new KmerDB(tempFile, kmerSize); + + kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTest(kmerDBName.c_str()); + if(kmerFileTest){ + bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion()); + if (GoodFile) { needToGenerate = false; } } - }else{ - MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); - positions.resize(numSeqs+1); - MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); } - - //create database - if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); } else if(method == "suffix") { database = new SuffixDB(numSeqs); } - else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID); } else if(method == "distance") { database = new DistanceDB(); } else { - m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); + m->mothurOutEndLine(); database = new KmerDB(tempFile, 8); } - - //read file - for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } - delete buf4; - istringstream iss (tempBuf,istringstream::in); - - Sequence temp(iss); - if (temp.getName() != "") { + + if (needToGenerate) { + for (int k = 0; k < rdb->referenceSeqs.size(); k++) { + Sequence temp(rdb->referenceSeqs[k].getName(), rdb->referenceSeqs[k].getAligned()); names.push_back(temp.getName()); database->addSequence(temp); } - } + if ((method == "kmer") && (!shortcuts)) {;} //don't print + else {database->generateDB(); } + }else if ((method == "kmer") && (!needToGenerate)) { + ifstream kmerFileTest(kmerDBName.c_str()); + database->readKmerDB(kmerFileTest); + + for (int k = 0; k < rdb->referenceSeqs.size(); k++) { + names.push_back(rdb->referenceSeqs[k].getName()); + } + } - database->generateDB(); - MPI_File_close(&inMPI); - MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case - #else - - //need to know number of template seqs for suffixdb - if (method == "suffix") { - ifstream inFASTA; - openInputFile(tempFile, inFASTA); - getNumSeqs(inFASTA, numSeqs); - inFASTA.close(); - } - - bool needToGenerate = true; - string kmerDBName; - if(method == "kmer") { - database = new KmerDB(tempFile, kmerSize); + database->setNumSeqs(numSeqs); - kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; - ifstream kmerFileTest(kmerDBName.c_str()); - if(kmerFileTest){ needToGenerate = false; } - } - else if(method == "suffix") { database = new SuffixDB(numSeqs); } - else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } - else if(method == "distance") { database = new DistanceDB(); } - else { - m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); - m->mothurOutEndLine(); - database = new KmerDB(tempFile, 8); - } - - if (needToGenerate) { - ifstream fastaFile; - openInputFile(tempFile, fastaFile); + m->mothurOut("It took " + toString(time(NULL) - start) + " to load " + toString(rdb->referenceSeqs.size()) + " sequences and generate the search databases.");m->mothurOutEndLine(); - while (!fastaFile.eof()) { - Sequence temp(fastaFile); - gobble(fastaFile); + }else { - names.push_back(temp.getName()); - - database->addSequence(temp); - } - fastaFile.close(); - - database->generateDB(); + templateFile = tempFile; - }else if ((method == "kmer") && (!needToGenerate)) { - ifstream kmerFileTest(kmerDBName.c_str()); - database->readKmerDB(kmerFileTest); + int start = time(NULL); - ifstream fastaFile; - openInputFile(tempFile, fastaFile); + m->mothurOut("Generating search database... "); cout.flush(); + #ifdef USE_MPI + int pid, processors; + vector positions; + int tag = 2001; - while (!fastaFile.eof()) { - Sequence temp(fastaFile); - gobble(fastaFile); + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + MPI_Comm_size(MPI_COMM_WORLD, &processors); - names.push_back(temp.getName()); + //char* inFileName = new char[tempFile.length()]; + //memcpy(inFileName, tempFile.c_str(), tempFile.length()); + + char inFileName[1024]; + strcpy(inFileName, tempFile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + //delete inFileName; + + if (pid == 0) { //only one process needs to scan file + positions = m->setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + for(int i = 1; i < processors; i++) { + MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&positions[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } + }else{ + MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); + positions.resize(numSeqs+1); + MPI_Recv(&positions[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); + } + + //create database + if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", pid); } + else if(method == "distance") { database = new DistanceDB(); } + else { + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + istringstream iss (tempBuf,istringstream::in); + + Sequence temp(iss); + if (temp.getName() != "") { + if (rdb->save) { rdb->referenceSeqs.push_back(temp); } + names.push_back(temp.getName()); + database->addSequence(temp); + } + } + + database->generateDB(); + MPI_File_close(&inMPI); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case + #else + + //need to know number of template seqs for suffixdb + if (method == "suffix") { + ifstream inFASTA; + m->openInputFile(tempFile, inFASTA); + m->getNumSeqs(inFASTA, numSeqs); + inFASTA.close(); } - fastaFile.close(); - } -#endif - database->setNumSeqs(names.size()); - - //sanity check - bool okay = phyloTree->ErrorCheck(names); - - if (!okay) { m->control_pressed = true; } - - m->mothurOut("DONE."); m->mothurOutEndLine(); - m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); + bool needToGenerate = true; + string kmerDBName; + if(method == "kmer") { + database = new KmerDB(tempFile, kmerSize); + + kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTest(kmerDBName.c_str()); + if(kmerFileTest){ + bool GoodFile = m->checkReleaseVersion(kmerFileTest, m->getVersion()); + if (GoodFile) { needToGenerate = false; } + } + } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(tempFile.substr(0,tempFile.find_last_of(".")+1), gapOpen, gapExtend, match, misMatch, "", threadID); } + else if(method == "distance") { database = new DistanceDB(); } + else { + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); + m->mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + if (needToGenerate) { + ifstream fastaFile; + m->openInputFile(tempFile, fastaFile); + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + m->gobble(fastaFile); + + if (rdb->save) { rdb->referenceSeqs.push_back(temp); } + + names.push_back(temp.getName()); + + database->addSequence(temp); + } + fastaFile.close(); + + if ((method == "kmer") && (!shortcuts)) {;} //don't print + else {database->generateDB(); } + + }else if ((method == "kmer") && (!needToGenerate)) { + ifstream kmerFileTest(kmerDBName.c_str()); + database->readKmerDB(kmerFileTest); + + ifstream fastaFile; + m->openInputFile(tempFile, fastaFile); + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + m->gobble(fastaFile); + + if (rdb->save) { rdb->referenceSeqs.push_back(temp); } + names.push_back(temp.getName()); + } + fastaFile.close(); + } + #endif + + database->setNumSeqs(names.size()); + + m->mothurOut("DONE."); m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); + } + + readTaxonomy(taxFile); + + //sanity check + bool okay = phyloTree->ErrorCheck(names); + + if (!okay) { m->control_pressed = true; } } catch(exception& e) { m->errorOut(e, "Classify", "generateDatabaseAndNames"); @@ -168,7 +236,7 @@ void Classify::generateDatabaseAndNames(string tfile, string tempFile, string me } } /**************************************************************************************************/ -Classify::Classify() { m = MothurOut::getInstance(); database = NULL; } +Classify::Classify() { m = MothurOut::getInstance(); database = NULL; phyloTree=NULL; flipped=false; } /**************************************************************************************************/ int Classify::readTaxonomy(string file) { @@ -179,19 +247,17 @@ int Classify::readTaxonomy(string file) { m->mothurOutEndLine(); m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush(); - + if (m->debug) { m->mothurOut("[DEBUG]: Taxonomies read in...\n"); } + #ifdef USE_MPI int pid, num, processors; - vector positions; + vector positions; int tag = 2001; MPI_Status status; MPI_File inMPI; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are MPI_Comm_size(MPI_COMM_WORLD, &processors); - - //char* inFileName = new char[file.length()]; - //memcpy(inFileName, file.c_str(), file.length()); char inFileName[1024]; strcpy(inFileName, file.c_str()); @@ -200,7 +266,7 @@ int Classify::readTaxonomy(string file) { //delete inFileName; if (pid == 0) { - positions = setFilePosEachLine(file, num); + positions = m->setFilePosEachLine(file, num); //send file positions to all processes for(int i = 1; i < processors; i++) { @@ -226,34 +292,38 @@ int Classify::readTaxonomy(string file) { delete buf4; istringstream iss (tempBuf,istringstream::in); - iss >> name >> taxInfo; - taxonomy[name] = taxInfo; - phyloTree->addSeqToTree(name, taxInfo); - } + iss >> name; m->gobble(iss); + iss >> taxInfo; + if (m->debug) { m->mothurOut("[DEBUG]: name = " + name + " tax = " + taxInfo + "\n"); } + if (m->inUsersGroups(name, names)) { + taxonomy[name] = taxInfo; + phyloTree->addSeqToTree(name, taxInfo); + }else { + m->mothurOut("[WARNING]: " + name + " is in your taxonomy file and not in your reference file, ignoring.\n"); + } + } MPI_File_close(&inMPI); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case -#else - ifstream inTax; - openInputFile(file, inTax); - - //read template seqs and save - while (!inTax.eof()) { - inTax >> name >> taxInfo; - - taxonomy[name] = taxInfo; - - phyloTree->addSeqToTree(name, taxInfo); - - gobble(inTax); - } - inTax.close(); +#else + + taxonomy.clear(); + m->readTax(file, taxonomy); + map tempTaxonomy; + for (map::iterator itTax = taxonomy.begin(); itTax != taxonomy.end(); itTax++) { + if (m->inUsersGroups(itTax->first, names)) { + phyloTree->addSeqToTree(itTax->first, itTax->second); + tempTaxonomy[itTax->first] = itTax->second; + }else { + m->mothurOut("[WARNING]: " + itTax->first + " is in your taxonomy file and not in your reference file, ignoring.\n"); + } + } + taxonomy = tempTaxonomy; #endif - phyloTree->assignHeirarchyIDs(0); phyloTree->setUp(file); - + m->mothurOut("DONE."); m->mothurOutEndLine(); cout.flush(); @@ -293,3 +363,37 @@ vector Classify::parseTax(string tax) { } /**************************************************************************************************/ +double Classify::getLogExpSum(vector probabilities, int& maxIndex){ + try { + // http://jblevins.org/notes/log-sum-exp + + double maxProb = probabilities[0]; + maxIndex = 0; + + int numProbs = (int)probabilities.size(); + + for(int i=1;i= maxProb){ + maxProb = probabilities[i]; + maxIndex = i; + } + } + + double probSum = 0.0000; + + for(int i=0;ierrorOut(e, "Classify", "getLogExpSum"); + exit(1); + } +} + +/**************************************************************************************************/ +