5 * Created by Pat Schloss on 12/16/08.
6 * Copyright 2008 Patrick D. Schloss. All rights reserved.
8 * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9 * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10 * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11 * different number of kmers and each column contains the index to sequences that use that kmer.
13 * Construction of an object of this type will first look for an appropriately named database file and if it is found
14 * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
17 * The search method used here is roughly the same as that used in the SimRank program that is found at the
18 * greengenes website. The default kmer size is 7. The speed complexity is between O(L) and O(LN). When I use 7mers
19 * on average a kmer is found in ~100 other sequences with a database of ~5000 sequences. If this is the case then the
20 * time would be on the order of O(0.1LN) -> fast.
25 #include "sequence.hpp"
27 #include "database.hpp"
30 /**************************************************************************************************/
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
35 kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
37 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
40 maxKmer = power4s[kmerSize];
41 kmerLocations.resize(maxKmer+1);
45 m->errorOut(e, "KmerDB", "KmerDB");
50 /**************************************************************************************************/
51 KmerDB::KmerDB() : Database() {}
52 /**************************************************************************************************/
56 /**************************************************************************************************/
58 vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
60 if (num > numSeqs) { m->mothurOut("[WARNING]: you requested " + toString(num) + " closest sequences, but the template only contains " + toString(numSeqs) + ", adjusting."); m->mothurOutEndLine(); num = numSeqs; }
62 vector<int> topMatches;
67 vector<int> matches(numSeqs, 0); // a record of the sequences with shared kmers
68 vector<int> timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found
70 int numKmers = candidateSeq->getNumBases() - kmerSize + 1;
72 for(int i=0;i<numKmers;i++){
73 int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i); // go through the query sequence and get a kmer number
74 if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before...
75 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
76 matches[kmerLocations[kmerNumber][j]]++; // that kmer
79 timesKmerFound[kmerNumber] = 1; // ok, we've seen the kmer now
83 vector<seqMatch> seqMatches; seqMatches.resize(numSeqs);
84 for(int i=0;i<numSeqs;i++){
85 seqMatches[i].seq = i;
86 seqMatches[i].match = matches[i];
89 //sorts putting largest matches first
90 sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
92 searchScore = seqMatches[0].match;
93 searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db
96 for (int i = 0; i < num; i++) {
97 topMatches.push_back(seqMatches[i].seq);
98 float thisScore = 100 * seqMatches[i].match / (float) numKmers;
99 Scores.push_back(thisScore);
104 for(int i=0;i<numSeqs;i++){
106 if (matches[i] > bestMatch) {
108 bestMatch = matches[i];
112 searchScore = bestMatch;
113 searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db
114 topMatches.push_back(bestIndex);
115 Scores.push_back(searchScore);
119 catch(exception& e) {
120 m->errorOut(e, "KmerDB", "findClosestSequences");
125 /**************************************************************************************************/
127 void KmerDB::generateDB(){
130 ofstream kmerFile; // once we have the kmerLocations folder print it out
131 m->openOutputFile(kmerDBName, kmerFile); // to a file
134 kmerFile << "#" << m->getVersion() << endl;
136 for(int i=0;i<maxKmer;i++){ // step through all of the possible kmer numbers
137 kmerFile << i << ' ' << kmerLocations[i].size(); // print the kmer number and the number of sequences with
138 for(int j=0;j<kmerLocations[i].size();j++){ // that kmer. then print out the indices of the sequences
139 kmerFile << ' ' << kmerLocations[i][j]; // with that kmer.
146 catch(exception& e) {
147 m->errorOut(e, "KmerDB", "generateDB");
152 /**************************************************************************************************/
153 void KmerDB::addSequence(Sequence seq) {
157 string unaligned = seq.getUnaligned(); // ...take the unaligned sequence...
158 int numKmers = unaligned.length() - kmerSize + 1;
160 vector<int> seenBefore(maxKmer+1,0);
161 for(int j=0;j<numKmers;j++){ // ...step though the sequence and get each kmer...
162 int kmerNumber = kmer.getKmerNumber(unaligned, j);
163 if(seenBefore[kmerNumber] == 0){
164 kmerLocations[kmerNumber].push_back(count); // ...insert the sequence index into kmerLocations for
165 } // the appropriate kmer number
166 seenBefore[kmerNumber] = 1;
171 catch(exception& e) {
172 m->errorOut(e, "KmerDB", "addSequence");
176 /**************************************************************************************************/
178 void KmerDB::readKmerDB(ifstream& kmerDBFile){
181 kmerDBFile.seekg(0); // start at the beginning of the file
184 string line = m->getline(kmerDBFile); m->gobble(kmerDBFile);
189 for(int i=0;i<maxKmer;i++){
191 kmerDBFile >> seqName >> numValues;
193 for(int j=0;j<numValues;j++){ // for each kmer number get the...
194 kmerDBFile >> seqNumber; // 1. number of sequences with the kmer number
195 kmerLocations[i].push_back(seqNumber); // 2. sequence indices
201 catch(exception& e) {
202 m->errorOut(e, "KmerDB", "readKmerDB");
207 /**************************************************************************************************/
208 int KmerDB::getCount(int kmer) {
210 if (kmer < 0) { return 0; } //if user gives negative number
211 else if (kmer > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer
212 else { return kmerLocations[kmer].size(); } // kmer is in vector range
214 catch(exception& e) {
215 m->errorOut(e, "KmerDB", "getCount");
219 /**************************************************************************************************/
220 int KmerDB::getReversed(int kmerNumber) {
224 if (kmerNumber < 0) { return 0; } //if user gives negative number
225 else if (kmerNumber > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer
226 else { return kmer.getReverseKmerNumber(kmerNumber); } // kmer is in vector range
228 catch(exception& e) {
229 m->errorOut(e, "KmerDB", "getReversed");
233 /**************************************************************************************************/
234 vector<int> KmerDB::getSequencesWithKmer(int kmer) {
239 if (kmer < 0) { } //if user gives negative number
240 else if (kmer > maxKmer) { } //or a kmer that is bigger than maxkmer
241 else { seqs = kmerLocations[kmer]; }
245 catch(exception& e) {
246 m->errorOut(e, "KmerDB", "getSequencesWithKmer");
250 /**************************************************************************************************/
253 /**************************************************************************************************/