]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
added alignment code
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 using namespace std;
26
27
28 #include "sequence.hpp"
29 #include "kmer.hpp"
30 #include "database.hpp"
31 #include "kmerdb.hpp"
32
33 /**************************************************************************************************/
34
35 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
36
37         string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
38         ifstream kmerFileTest(kmerDBName.c_str());
39         
40         int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
41         
42         maxKmer = power4s[kmerSize];
43         kmerLocations.resize(maxKmer+1);
44         
45         if(!kmerFileTest){              //      if we can open the kmer db file, then read it in...
46                 cout << "Generating the " << kmerDBName << " database...\t";    cout.flush();
47                 generateKmerDB(kmerDBName);     
48         }
49         else{                                   //      ...otherwise generate it.
50                 cout << "Reading in the " << kmerDBName << " database...\t";    cout.flush();
51                 readKmerDB(kmerDBName, kmerFileTest);
52         }
53         cout << "DONE." << endl << endl;        cout.flush();
54
55 }
56
57 /**************************************************************************************************/
58
59 Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
60
61         vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
62         vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
63         
64         searchScore = 0;
65         int maxSequence = 0;
66         
67         string query = candidateSeq->getUnaligned();                    //      we want to search using an unaligned dna sequence
68
69         int numKmers = query.length() - kmerSize + 1;   
70         Kmer kmer(kmerSize);
71         
72         for(int i=0;i<numKmers;i++){
73                 int kmerNumber = kmer.getKmerNumber(query, i);          //      go through the query sequence and get a kmer number
74                 if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
75                         for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
76                                 matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
77                         }
78                 }
79                 timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
80         }
81
82         for(int i=0;i<numSeqs;i++){                                                             //      find the db sequence that shares the most kmers with
83                 if(matches[i] > searchScore){                                           //      the query sequence
84                         searchScore = matches[i];
85                         maxSequence = i;
86                 }
87         }
88         searchScore = 100 * searchScore / (float)numKmers;
89         return templateSequences[maxSequence];                                  //      return the Sequence object corresponding to the db
90                                                                                                                         //      sequence with the most shared kmers.
91 }
92
93 /**************************************************************************************************/
94
95 void KmerDB::generateKmerDB(string kmerDBName){
96         
97         Kmer kmer(kmerSize);
98         
99         for(int i=0;i<numSeqs;i++){                                                             //      for all of the template sequences...
100
101                 string seq = templateSequences[i]->getUnaligned();      //      ...take the unaligned sequence...
102                 int numKmers = seq.length() - kmerSize + 1;
103                 
104                 vector<int> seenBefore(maxKmer+1,0);
105                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
106                         int kmerNumber = kmer.getKmerNumber(seq, j);
107                         if(seenBefore[kmerNumber] == 0){
108                                 kmerLocations[kmerNumber].push_back(i);         //      ...insert the sequence index into kmerLocations for
109                         }                                                                                               //      the appropriate kmer number
110                         seenBefore[kmerNumber] = 1;
111                 }                                                                                                       
112         }
113         
114         ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
115         openOutputFile(kmerDBName, kmerFile);                                   //      to a file
116         
117         for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
118                 kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
119                 for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
120                         kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
121                 }
122                 kmerFile << endl;
123         }
124         kmerFile.close();
125         
126 }
127
128 /**************************************************************************************************/
129
130 void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
131
132         kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
133         
134         string seqName;
135         int seqNumber;
136         
137         for(int i=0;i<maxKmer;i++){
138                 int numValues;  
139                 kmerDBFile >> seqName >> numValues;
140                 
141                 for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
142                         kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
143                         kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
144                 }
145         }
146         kmerDBFile.close();
147 }
148
149 /**************************************************************************************************/