]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
working on slayer bug
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 #include "sequence.hpp"
26 #include "kmer.hpp"
27 #include "database.hpp"
28 #include "kmerdb.hpp"
29
30 /**************************************************************************************************/
31
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
33         try { 
34         
35                 kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
36                 
37                 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
38                 count = 0;
39                 
40                 maxKmer = power4s[kmerSize];
41                 kmerLocations.resize(maxKmer+1);
42                 
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "KmerDB", "KmerDB");
46                 exit(1);
47         }       
48
49 }
50 /**************************************************************************************************/
51 KmerDB::KmerDB() : Database() {}
52 /**************************************************************************************************/
53
54 KmerDB::~KmerDB(){}
55
56 /**************************************************************************************************/
57
58 vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
59         try {
60                 vector<int> topMatches;
61                 Kmer kmer(kmerSize);
62                 searchScore = 0;
63                 Scores.clear();
64                 
65                 vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
66                 vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
67                 
68                 int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
69         
70                 for(int i=0;i<numKmers;i++){
71                         int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
72                         if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
73                                 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
74                                         matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
75                                 }
76                         }
77                         timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
78                 }
79                 
80                 if (num != 1) {
81                         vector<seqMatch> seqMatches; seqMatches.resize(numSeqs);
82                         for(int i=0;i<numSeqs;i++){             
83                                 seqMatches[i].seq = i;
84                                 seqMatches[i].match = matches[i];
85                         }
86                         
87                         //sorts putting largest matches first
88                         sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
89                         
90                         searchScore = seqMatches[0].match;
91                         searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
92                 
93                         //save top matches
94                         for (int i = 0; i < num; i++) {
95                                 topMatches.push_back(seqMatches[i].seq);
96                                 float thisScore = 100 * seqMatches[i].match / (float) numKmers;
97                                 Scores.push_back(thisScore);
98                         }
99                 }else{
100                         int bestIndex = 0;
101                         int bestMatch = -1;
102                         for(int i=0;i<numSeqs;i++){     
103                                 
104                                 if (matches[i] > bestMatch) {
105                                         bestIndex = i;
106                                         bestMatch = matches[i];
107                                 }
108                         }
109                         
110                         searchScore = bestMatch;
111                         searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
112                         topMatches.push_back(bestIndex);
113                         Scores.push_back(searchScore);
114                 }
115                 return topMatches;              
116         }
117         catch(exception& e) {
118                 m->errorOut(e, "KmerDB", "findClosestSequences");
119                 exit(1);
120         }       
121 }
122
123 /**************************************************************************************************/
124
125 void KmerDB::generateDB(){
126         try {
127                 
128                 ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
129                 m->openOutputFile(kmerDBName, kmerFile);                                        //      to a file
130                 
131                 //output version
132                 kmerFile << "#" << m->getVersion() << endl;
133                 
134                 for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
135                         kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
136                         for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
137                                 kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
138                         }
139                         kmerFile << endl;
140                 }
141                 kmerFile.close();
142                 
143         }
144         catch(exception& e) {
145                 m->errorOut(e, "KmerDB", "generateDB");
146                 exit(1);
147         }       
148         
149 }
150 /**************************************************************************************************/
151 void KmerDB::addSequence(Sequence seq) {
152         try {
153                 Kmer kmer(kmerSize);
154                 
155                 string unaligned = seq.getUnaligned();  //      ...take the unaligned sequence...
156                 int numKmers = unaligned.length() - kmerSize + 1;
157                         
158                 vector<int> seenBefore(maxKmer+1,0);
159                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
160                         int kmerNumber = kmer.getKmerNumber(unaligned, j);
161                         if(seenBefore[kmerNumber] == 0){
162                                 kmerLocations[kmerNumber].push_back(count);             //      ...insert the sequence index into kmerLocations for
163                         }                                                                                               //      the appropriate kmer number
164                         seenBefore[kmerNumber] = 1;
165                 }                                                                                                       
166         
167                 count++;
168         }
169         catch(exception& e) {
170                 m->errorOut(e, "KmerDB", "addSequence");
171                 exit(1);
172         }       
173 }
174 /**************************************************************************************************/
175
176 void KmerDB::readKmerDB(ifstream& kmerDBFile){
177         try {
178                                         
179                 kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
180                 
181                 //read version
182                 string line = m->getline(kmerDBFile); m->gobble(kmerDBFile);
183                 
184                 string seqName;
185                 int seqNumber;
186
187                 for(int i=0;i<maxKmer;i++){
188                         int numValues = 0;      
189                         kmerDBFile >> seqName >> numValues;
190                         
191                         for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
192                                 kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
193                                 kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
194                         }
195                 }
196                 kmerDBFile.close();
197                 
198         }
199         catch(exception& e) {
200                 m->errorOut(e, "KmerDB", "readKmerDB");
201                 exit(1);
202         }       
203 }
204
205 /**************************************************************************************************/
206 int KmerDB::getCount(int kmer) {
207         try {
208                 if (kmer < 0) { return 0; }  //if user gives negative number
209                 else if (kmer > maxKmer) {      return 0;       }  //or a kmer that is bigger than maxkmer
210                 else {  return kmerLocations[kmer].size();      }  // kmer is in vector range
211         }
212         catch(exception& e) {
213                 m->errorOut(e, "KmerDB", "getCount");
214                 exit(1);
215         }       
216 }
217 /**************************************************************************************************/
218 vector<int> KmerDB::getSequencesWithKmer(int kmer) {
219         try {
220                 
221                 vector<int> seqs;
222         
223                 if (kmer < 0) { }  //if user gives negative number
224                 else if (kmer > maxKmer) {      }  //or a kmer that is bigger than maxkmer
225                 else {  seqs = kmerLocations[kmer];     }
226                 
227                 return seqs;
228         }
229         catch(exception& e) {
230                 m->errorOut(e, "KmerDB", "getSequencesWithKmer");
231                 exit(1);
232         }       
233 }
234 /**************************************************************************************************/
235
236
237 /**************************************************************************************************/
238
239