]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
fixed some bugs
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 #include "sequence.hpp"
26 #include "kmer.hpp"
27 #include "database.hpp"
28 #include "kmerdb.hpp"
29
30 /**************************************************************************************************/
31
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
33
34         string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
35         ifstream kmerFileTest(kmerDBName.c_str());
36         
37         int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
38         
39         maxKmer = power4s[kmerSize];
40         kmerLocations.resize(maxKmer+1);
41         
42         if(!kmerFileTest){              //      if we can open the kmer db file, then read it in...
43                 cout << "Generating the " << kmerDBName << " database...\t";    cout.flush();
44                 generateKmerDB(kmerDBName);     
45         }
46         else{                                   //      ...otherwise generate it.
47                 cout << "Reading in the " << kmerDBName << " database...\t";    cout.flush();
48                 readKmerDB(kmerDBName, kmerFileTest);
49         }
50         cout << "DONE." << endl << endl;        cout.flush();
51
52 }
53
54 /**************************************************************************************************/
55
56 Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
57         
58         Kmer kmer(kmerSize);
59         
60         searchScore = 0;
61         int maxSequence = 0;
62
63         vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
64         vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
65
66         int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
67
68         for(int i=0;i<numKmers;i++){
69                 int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
70                 if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
71                         for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
72                                 matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
73                         }
74                 }
75                 timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
76         }
77
78         for(int i=0;i<numSeqs;i++){                                                             //      find the db sequence that shares the most kmers with
79                 if(matches[i] > searchScore){                                   //      the query sequence
80                         searchScore = matches[i];
81                         maxSequence = i;
82                 }
83         }
84
85         searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
86         return templateSequences[maxSequence];                                  //      sequence with the most shared kmers.
87 }
88
89 /**************************************************************************************************/
90
91 void KmerDB::generateKmerDB(string kmerDBName){
92         
93         Kmer kmer(kmerSize);
94         
95         for(int i=0;i<numSeqs;i++){                                                             //      for all of the template sequences...
96
97                 string seq = templateSequences[i]->getUnaligned();      //      ...take the unaligned sequence...
98                 int numKmers = seq.length() - kmerSize + 1;
99                 
100                 vector<int> seenBefore(maxKmer+1,0);
101                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
102                         int kmerNumber = kmer.getKmerNumber(seq, j);
103                         if(seenBefore[kmerNumber] == 0){
104                                 kmerLocations[kmerNumber].push_back(i);         //      ...insert the sequence index into kmerLocations for
105                         }                                                                                               //      the appropriate kmer number
106                         seenBefore[kmerNumber] = 1;
107                 }                                                                                                       
108         }
109         
110         ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
111         openOutputFile(kmerDBName, kmerFile);                                   //      to a file
112         
113         for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
114                 kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
115                 for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
116                         kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
117                 }
118                 kmerFile << endl;
119         }
120         kmerFile.close();
121         
122 }
123
124 /**************************************************************************************************/
125
126 void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
127
128         kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
129         
130         string seqName;
131         int seqNumber;
132         
133         for(int i=0;i<maxKmer;i++){
134                 int numValues;  
135                 kmerDBFile >> seqName >> numValues;
136                 
137                 for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
138                         kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
139                         kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
140                 }
141         }
142         kmerDBFile.close();
143 }
144
145 /**************************************************************************************************/