5 * Created by Pat Schloss on 12/16/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
8 * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9 * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10 * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11 * different number of kmers and each column contains the index to sequences that use that kmer.
13 * Construction of an object of this type will first look for an appropriately named database file and if it is found
14 * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
17 * The search method used here is roughly the same as that used in the SimRank program that is found at the
18 * greengenes website. The default kmer size is 7. The speed complexity is between O(L) and O(LN). When I use 7mers
19 * on average a kmer is found in ~100 other sequences with a database of ~5000 sequences. If this is the case then the
20 * time would be on the order of O(0.1LN) -> fast.
28 #include "sequence.hpp"
30 #include "database.hpp"
33 /**************************************************************************************************/
35 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
37 string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
38 ifstream kmerFileTest(kmerDBName.c_str());
40 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
42 maxKmer = power4s[kmerSize];
43 kmerLocations.resize(maxKmer+1);
45 if(!kmerFileTest){ // if we can open the kmer db file, then read it in...
46 cout << "Generating the " << kmerDBName << " database...\t"; cout.flush();
47 generateKmerDB(kmerDBName);
49 else{ // ...otherwise generate it.
50 cout << "Reading in the " << kmerDBName << " database...\t"; cout.flush();
51 readKmerDB(kmerDBName, kmerFileTest);
53 cout << "DONE." << endl << endl; cout.flush();
57 /**************************************************************************************************/
59 Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
61 vector<int> matches(numSeqs, 0); // a record of the sequences with shared kmers
62 vector<int> timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found
67 string query = candidateSeq->getUnaligned(); // we want to search using an unaligned dna sequence
69 int numKmers = query.length() - kmerSize + 1;
72 for(int i=0;i<numKmers;i++){
73 int kmerNumber = kmer.getKmerNumber(query, i); // go through the query sequence and get a kmer number
74 if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before...
75 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
76 matches[kmerLocations[kmerNumber][j]]++; // that kmer
79 timesKmerFound[kmerNumber] = 1; // ok, we've seen the kmer now
82 for(int i=0;i<numSeqs;i++){ // find the db sequence that shares the most kmers with
83 if(matches[i] > searchScore){ // the query sequence
84 searchScore = matches[i];
88 searchScore = 100 * searchScore / (float)numKmers;
89 return templateSequences[maxSequence]; // return the Sequence object corresponding to the db
90 // sequence with the most shared kmers.
93 /**************************************************************************************************/
95 void KmerDB::generateKmerDB(string kmerDBName){
99 for(int i=0;i<numSeqs;i++){ // for all of the template sequences...
101 string seq = templateSequences[i]->getUnaligned(); // ...take the unaligned sequence...
102 int numKmers = seq.length() - kmerSize + 1;
104 vector<int> seenBefore(maxKmer+1,0);
105 for(int j=0;j<numKmers;j++){ // ...step though the sequence and get each kmer...
106 int kmerNumber = kmer.getKmerNumber(seq, j);
107 if(seenBefore[kmerNumber] == 0){
108 kmerLocations[kmerNumber].push_back(i); // ...insert the sequence index into kmerLocations for
109 } // the appropriate kmer number
110 seenBefore[kmerNumber] = 1;
114 ofstream kmerFile; // once we have the kmerLocations folder print it out
115 openOutputFile(kmerDBName, kmerFile); // to a file
117 for(int i=0;i<maxKmer;i++){ // step through all of the possible kmer numbers
118 kmerFile << i << ' ' << kmerLocations[i].size(); // print the kmer number and the number of sequences with
119 for(int j=0;j<kmerLocations[i].size();j++){ // that kmer. then print out the indices of the sequences
120 kmerFile << ' ' << kmerLocations[i][j]; // with that kmer.
128 /**************************************************************************************************/
130 void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
132 kmerDBFile.seekg(0); // start at the beginning of the file
137 for(int i=0;i<maxKmer;i++){
139 kmerDBFile >> seqName >> numValues;
141 for(int j=0;j<numValues;j++){ // for each kmer number get the...
142 kmerDBFile >> seqNumber; // 1. number of sequences with the kmer number
143 kmerLocations[i].push_back(seqNumber); // 2. sequence indices
149 /**************************************************************************************************/