X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=kmerdb.cpp;h=9a5d23500be955b459413beee989601f6c0ec78f;hp=a5094f0a862b057c571798180b6c064dc77509bf;hb=b206f634aae1b4ce13978d203247fb64757d5482;hpb=bfbc55964f1977da72c2cea984288a427d370a59 diff --git a/kmerdb.cpp b/kmerdb.cpp index a5094f0..9a5d235 100644 --- a/kmerdb.cpp +++ b/kmerdb.cpp @@ -5,11 +5,23 @@ * Created by Pat Schloss on 12/16/08. * Copyright 2008 Patrick D. Schloss. All rights reserved. * - */ + * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides + * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence. + * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the + * different number of kmers and each column contains the index to sequences that use that kmer. + * + * Construction of an object of this type will first look for an appropriately named database file and if it is found + * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory + * (generateKmerDB) + * + * The search method used here is roughly the same as that used in the SimRank program that is found at the + * greengenes website. The default kmer size is 7. The speed complexity is between O(L) and O(LN). When I use 7mers + * on average a kmer is found in ~100 other sequences with a database of ~5000 sequences. If this is the case then the + * time would be on the order of O(0.1LN) -> fast. + * -using namespace std; + */ -#include "mothur.h" #include "sequence.hpp" #include "kmer.hpp" #include "database.hpp" @@ -17,119 +29,227 @@ using namespace std; /**************************************************************************************************/ -KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) { - - string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; - ifstream kmerFileTest(kmerDBName.c_str()); - - int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 }; - - maxKmer = power4s[kmerSize]; - kmerLocations.resize(maxKmer+1); +KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) { + try { - if(!kmerFileTest){ - cout << "Generating the " << kmerDBName << " database...\t"; cout.flush(); - generateKmerDB(kmerDBName); - } - else{ - cout << "Reading in the " << kmerDBName << " database...\t"; cout.flush(); - readKmerDB(kmerDBName, kmerFileTest); + kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + count = 0; + + maxKmer = power4s[kmerSize]; + kmerLocations.resize(maxKmer+1); + } - cout << "DONE." << endl << endl; cout.flush(); + catch(exception& e) { + m->errorOut(e, "KmerDB", "KmerDB"); + exit(1); + } } +/**************************************************************************************************/ +KmerDB::KmerDB() : Database() {} +/**************************************************************************************************/ + +KmerDB::~KmerDB(){} /**************************************************************************************************/ -Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){ - - vector matches(numSeqs, 0); - vector timesKmerFound(kmerLocations.size()+1, 0); - - int maxMatches = 0; - int maxSequence = 0; - - string query = candidateSeq->getUnaligned(); - - int numKmers = query.length() - kmerSize + 1; - Kmer kmer(kmerSize); - - for(int i=0;i KmerDB::findClosestSequences(Sequence* candidateSeq, int num){ + try { + if (num > numSeqs) { m->mothurOut("[WARNING]: you requested " + toString(num) + " closest sequences, but the template only contains " + toString(numSeqs) + ", adjusting."); m->mothurOutEndLine(); num = numSeqs; } + + vector topMatches; + Kmer kmer(kmerSize); + searchScore = 0; + Scores.clear(); - int kmerNumber = kmer.getKmerNumber(query, i); + vector matches(numSeqs, 0); // a record of the sequences with shared kmers + vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found - if(timesKmerFound[kmerNumber] == 0){ - for(int j=0;jgetNumBases() - kmerSize + 1; + + for(int i=0;igetUnaligned(), i); // go through the query sequence and get a kmer number + if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before... + for(int j=0;j maxMatches){ - maxMatches = matches[i]; - maxSequence = i; + if (num != 1) { + vector seqMatches; seqMatches.resize(numSeqs); + for(int i=0;i bestMatch) { + bestIndex = i; + bestMatch = matches[i]; + } + } + + searchScore = bestMatch; + searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db + topMatches.push_back(bestIndex); + Scores.push_back(searchScore); } + return topMatches; } - return templateSequences[maxSequence]; - + catch(exception& e) { + m->errorOut(e, "KmerDB", "findClosestSequences"); + exit(1); + } } /**************************************************************************************************/ -void KmerDB::generateKmerDB(string kmerDBName){ - - - Kmer kmer(kmerSize); - - for(int i=0;igetUnaligned(); - int numKmers = seq.length() - kmerSize + 1; +void KmerDB::generateDB(){ + try { - for(int j=0;jopenOutputFile(kmerDBName, kmerFile); // to a file + + //output version + kmerFile << "#" << m->getVersion() << endl; + + for(int i=0;ierrorOut(e, "KmerDB", "generateDB"); exit(1); - } + } - for(int i=0;i seenBefore(maxKmer+1,0); + for(int j=0;jerrorOut(e, "KmerDB", "addSequence"); + exit(1); + } } - /**************************************************************************************************/ -void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){ +void KmerDB::readKmerDB(ifstream& kmerDBFile){ + try { + + kmerDBFile.seekg(0); // start at the beginning of the file + + //read version + string line = m->getline(kmerDBFile); m->gobble(kmerDBFile); + + string seqName; + int seqNumber; - kmerDBFile.seekg(0); - - string seqName; - int seqNumber; + for(int i=0;i> seqName >> numValues; + + for(int j=0;j> seqNumber; // 1. number of sequences with the kmer number + kmerLocations[i].push_back(seqNumber); // 2. sequence indices + } + } + kmerDBFile.close(); + + } + catch(exception& e) { + m->errorOut(e, "KmerDB", "readKmerDB"); + exit(1); + } +} + +/**************************************************************************************************/ +int KmerDB::getCount(int kmer) { + try { + if (kmer < 0) { return 0; } //if user gives negative number + else if (kmer > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer + else { return kmerLocations[kmer].size(); } // kmer is in vector range + } + catch(exception& e) { + m->errorOut(e, "KmerDB", "getCount"); + exit(1); + } +} +/**************************************************************************************************/ +int KmerDB::getReversed(int kmerNumber) { + try { + Kmer kmer(kmerSize); + + if (kmerNumber < 0) { return 0; } //if user gives negative number + else if (kmerNumber > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer + else { return kmer.getReverseKmerNumber(kmerNumber); } // kmer is in vector range + } + catch(exception& e) { + m->errorOut(e, "KmerDB", "getReversed"); + exit(1); + } +} +/**************************************************************************************************/ +vector KmerDB::getSequencesWithKmer(int kmer) { + try { + + vector seqs; - for(int i=0;i> seqName >> numValues; + if (kmer < 0) { } //if user gives negative number + else if (kmer > maxKmer) { } //or a kmer that is bigger than maxkmer + else { seqs = kmerLocations[kmer]; } - for(int j=0;j> seqNumber; - kmerLocations[i].push_back(seqNumber); - } + return seqs; } - kmerDBFile.close(); + catch(exception& e) { + m->errorOut(e, "KmerDB", "getSequencesWithKmer"); + exit(1); + } } +/**************************************************************************************************/ + /**************************************************************************************************/ + +