X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=kmerdb.cpp;h=572acdae5d0da9b7d51b7df94cecf8680771dc57;hb=df82ee669d7eb9ae9a1a334339dfab7961cb16c6;hp=ee8168a328a07bd8dbed1da1f6b4fa3ab0aebca7;hpb=c5c7502f435e1413c19e373dab1dfebcaa67588d;p=mothur.git diff --git a/kmerdb.cpp b/kmerdb.cpp index ee8168a..572acda 100644 --- a/kmerdb.cpp +++ b/kmerdb.cpp @@ -5,10 +5,22 @@ * Created by Pat Schloss on 12/16/08. * Copyright 2008 Patrick D. Schloss. All rights reserved. * - */ - -using namespace std; + * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides + * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence. + * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the + * different number of kmers and each column contains the index to sequences that use that kmer. + * + * Construction of an object of this type will first look for an appropriately named database file and if it is found + * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory + * (generateKmerDB) + * + * The search method used here is roughly the same as that used in the SimRank program that is found at the + * greengenes website. The default kmer size is 7. The speed complexity is between O(L) and O(LN). When I use 7mers + * on average a kmer is found in ~100 other sequences with a database of ~5000 sequences. If this is the case then the + * time would be on the order of O(0.1LN) -> fast. + * + */ #include "sequence.hpp" #include "kmer.hpp" @@ -22,16 +34,16 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; ifstream kmerFileTest(kmerDBName.c_str()); - int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 }; + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; maxKmer = power4s[kmerSize]; kmerLocations.resize(maxKmer+1); - if(!kmerFileTest){ + if(!kmerFileTest){ // if we can open the kmer db file, then read it in... cout << "Generating the " << kmerDBName << " database...\t"; cout.flush(); generateKmerDB(kmerDBName); } - else{ + else{ // ...otherwise generate it. cout << "Reading in the " << kmerDBName << " database...\t"; cout.flush(); readKmerDB(kmerDBName, kmerFileTest); } @@ -43,67 +55,65 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){ - vector matches(numSeqs, 0); - vector timesKmerFound(kmerLocations.size()+1, 0); - - int maxMatches = 0; - int maxSequence = 0; - - string query = candidateSeq->getUnaligned(); - - int numKmers = query.length() - kmerSize + 1; Kmer kmer(kmerSize); + searchScore = 0; + int maxSequence = 0; + + vector matches(numSeqs, 0); // a record of the sequences with shared kmers + vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found + + int numKmers = candidateSeq->getNumBases() - kmerSize + 1; + for(int i=0;igetUnaligned(), i); // go through the query sequence and get a kmer number + if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before... + for(int j=0;j maxMatches){ - maxMatches = matches[i]; + + for(int i=0;i searchScore){ // the query sequence + searchScore = matches[i]; maxSequence = i; } } - return templateSequences[maxSequence]; - + + searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db + return templateSequences[maxSequence]; // sequence with the most shared kmers. } /**************************************************************************************************/ void KmerDB::generateKmerDB(string kmerDBName){ - Kmer kmer(kmerSize); - for(int i=0;igetUnaligned(); + string seq = templateSequences[i]->getUnaligned(); // ...take the unaligned sequence... int numKmers = seq.length() - kmerSize + 1; - for(int j=0;j seenBefore(maxKmer+1,0); + for(int j=0;j> seqName >> numValues; - for(int j=0;j> seqNumber; - kmerLocations[i].push_back(seqNumber); + for(int j=0;j> seqNumber; // 1. number of sequences with the kmer number + kmerLocations[i].push_back(seqNumber); // 2. sequence indices } } kmerDBFile.close();