]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
fixed bugs in venn and aligner
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 #include "sequence.hpp"
26 #include "kmer.hpp"
27 #include "database.hpp"
28 #include "kmerdb.hpp"
29
30 /**************************************************************************************************/
31
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
33         try { 
34         
35                 string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
36                 ifstream kmerFileTest(kmerDBName.c_str());
37                 
38                 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
39                 
40                 maxKmer = power4s[kmerSize];
41                 kmerLocations.resize(maxKmer+1);
42                 
43                 if(!kmerFileTest){              //      if we can open the kmer db file, then read it in...
44                         mothurOut("Generating the " + kmerDBName + " database...\t");   cout.flush();
45                         generateKmerDB(kmerDBName);     
46                 }
47                 else{                                   //      ...otherwise generate it.
48                         mothurOut("Reading in the " + kmerDBName + " database...\t");   cout.flush();
49                         readKmerDB(kmerDBName, kmerFileTest);
50                 }
51                 mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
52                 
53         }
54         catch(exception& e) {
55                 errorOut(e, "KmerDB", "KmerDB");
56                 exit(1);
57         }       
58
59 }
60 /**************************************************************************************************/
61
62 KmerDB::~KmerDB(){                                                                                                              
63         
64                 //for (int i = 0; i < templateSequences.size(); i++) {  delete templateSequences[i]; }
65                 // templateSequences.clear(); 
66 }
67
68 /**************************************************************************************************/
69
70 Sequence KmerDB::findClosestSequence(Sequence* candidateSeq){
71         try {
72         
73                 Kmer kmer(kmerSize);
74                 
75                 searchScore = 0;
76                 int maxSequence = 0;
77                 
78                 vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
79                 vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
80                 
81                 int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
82                 
83                 for(int i=0;i<numKmers;i++){
84                         int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
85                         if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
86                                 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
87                                         matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
88                                 }
89                         }
90                         timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
91                 }
92                 
93                 for(int i=0;i<numSeqs;i++){                                                             //      find the db sequence that shares the most kmers with
94                         if(matches[i] > searchScore){                                   //      the query sequence
95                                 searchScore = matches[i];
96                                 maxSequence = i;
97                         }
98                 }
99                 
100                 searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
101                 
102                 return templateSequences[maxSequence];                                  //      sequence with the most shared kmers.
103                 
104         }
105         catch(exception& e) {
106                 errorOut(e, "KmerDB", "findClosestSequence");
107                 exit(1);
108         }       
109 }
110
111 /**************************************************************************************************/
112
113 void KmerDB::generateKmerDB(string kmerDBName){
114         try {
115         
116                 Kmer kmer(kmerSize);
117                 
118                 for(int i=0;i<numSeqs;i++){                                                             //      for all of the template sequences...
119                         
120                         string seq = templateSequences[i].getUnaligned();       //      ...take the unaligned sequence...
121                         int numKmers = seq.length() - kmerSize + 1;
122                         
123                         vector<int> seenBefore(maxKmer+1,0);
124                         for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
125                                 int kmerNumber = kmer.getKmerNumber(seq, j);
126                                 if(seenBefore[kmerNumber] == 0){
127                                         kmerLocations[kmerNumber].push_back(i);         //      ...insert the sequence index into kmerLocations for
128                                 }                                                                                               //      the appropriate kmer number
129                                 seenBefore[kmerNumber] = 1;
130                         }                                                                                                       
131                 }
132                 
133                 ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
134                 openOutputFile(kmerDBName, kmerFile);                                   //      to a file
135                 
136                 for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
137                         kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
138                         for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
139                                 kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
140                         }
141                         kmerFile << endl;
142                 }
143                 kmerFile.close();
144                 
145         }
146         catch(exception& e) {
147                 errorOut(e, "KmerDB", "generateKmerDB");
148                 exit(1);
149         }       
150         
151 }
152
153 /**************************************************************************************************/
154
155 void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
156         try {
157         
158                 kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
159                 
160                 string seqName;
161                 int seqNumber;
162                 
163                 for(int i=0;i<maxKmer;i++){
164                         int numValues;  
165                         kmerDBFile >> seqName >> numValues;
166                         
167                         for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
168                                 kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
169                                 kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
170                         }
171                 }
172                 kmerDBFile.close();
173                 
174         }
175         catch(exception& e) {
176                 errorOut(e, "KmerDB", "readKmerDB");
177                 exit(1);
178         }       
179 }
180
181 /**************************************************************************************************/