]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
added versioning info to all shortcut files mothur makes.
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 #include "sequence.hpp"
26 #include "kmer.hpp"
27 #include "database.hpp"
28 #include "kmerdb.hpp"
29
30 /**************************************************************************************************/
31
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
33         try { 
34         
35                 kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
36                 
37                 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
38                 count = 0;
39                 
40                 maxKmer = power4s[kmerSize];
41                 kmerLocations.resize(maxKmer+1);
42                 
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "KmerDB", "KmerDB");
46                 exit(1);
47         }       
48
49 }
50 /**************************************************************************************************/
51 KmerDB::KmerDB() : Database() {}
52 /**************************************************************************************************/
53
54 KmerDB::~KmerDB(){}
55
56 /**************************************************************************************************/
57
58 vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
59         try {
60                 vector<int> topMatches;
61                 Kmer kmer(kmerSize);
62                 searchScore = 0;
63                 
64                 vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
65                 vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
66                 
67                 int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
68         
69                 for(int i=0;i<numKmers;i++){
70                         int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
71                         if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
72                                 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
73                                         matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
74                                 }
75                         }
76                         timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
77                 }
78         
79                 vector<seqMatch> seqMatches;
80                 for(int i=0;i<numSeqs;i++){             
81                         seqMatch temp(i, matches[i]);
82                         seqMatches.push_back(temp);
83                 }
84                 
85                 //sorts putting largest matches first
86                 sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
87                 
88                 searchScore = seqMatches[0].match;
89                 searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
90         
91                 //save top matches
92                 for (int i = 0; i < num; i++) {
93                         topMatches.push_back(seqMatches[i].seq);
94                 }
95                 
96                 return topMatches;              
97         }
98         catch(exception& e) {
99                 m->errorOut(e, "KmerDB", "findClosestSequences");
100                 exit(1);
101         }       
102 }
103
104 /**************************************************************************************************/
105
106 void KmerDB::generateDB(){
107         try {
108                 
109                 ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
110                 openOutputFile(kmerDBName, kmerFile);                                   //      to a file
111                 
112                 //output version
113                 kmerFile << m->getVersion() << endl;
114                 
115                 for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
116                         kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
117                         for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
118                                 kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
119                         }
120                         kmerFile << endl;
121                 }
122                 kmerFile.close();
123                 
124         }
125         catch(exception& e) {
126                 m->errorOut(e, "KmerDB", "generateDB");
127                 exit(1);
128         }       
129         
130 }
131 /**************************************************************************************************/
132 void KmerDB::addSequence(Sequence seq) {
133         try {
134                 Kmer kmer(kmerSize);
135                 
136                 string unaligned = seq.getUnaligned();  //      ...take the unaligned sequence...
137                 int numKmers = unaligned.length() - kmerSize + 1;
138                         
139                 vector<int> seenBefore(maxKmer+1,0);
140                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
141                         int kmerNumber = kmer.getKmerNumber(unaligned, j);
142                         if(seenBefore[kmerNumber] == 0){
143                                 kmerLocations[kmerNumber].push_back(count);             //      ...insert the sequence index into kmerLocations for
144                         }                                                                                               //      the appropriate kmer number
145                         seenBefore[kmerNumber] = 1;
146                 }                                                                                                       
147         
148                 count++;
149         }
150         catch(exception& e) {
151                 m->errorOut(e, "KmerDB", "addSequence");
152                 exit(1);
153         }       
154 }
155 /**************************************************************************************************/
156
157 void KmerDB::readKmerDB(ifstream& kmerDBFile){
158         try {
159                                         
160                 kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
161                 
162                 //read version
163                 string line = getline(kmerDBFile); gobble(kmerDBFile);
164                 
165                 string seqName;
166                 int seqNumber;
167
168                 for(int i=0;i<maxKmer;i++){
169                         int numValues = 0;      
170                         kmerDBFile >> seqName >> numValues;
171                         
172                         for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
173                                 kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
174                                 kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
175                         }
176                 }
177                 kmerDBFile.close();
178                 
179         }
180         catch(exception& e) {
181                 m->errorOut(e, "KmerDB", "readKmerDB");
182                 exit(1);
183         }       
184 }
185
186 /**************************************************************************************************/
187 int KmerDB::getCount(int kmer) {
188         try {
189                 if (kmer < 0) { return 0; }  //if user gives negative number
190                 else if (kmer > maxKmer) {      return 0;       }  //or a kmer that is bigger than maxkmer
191                 else {  return kmerLocations[kmer].size();      }  // kmer is in vector range
192         }
193         catch(exception& e) {
194                 m->errorOut(e, "KmerDB", "getCount");
195                 exit(1);
196         }       
197 }
198 /**************************************************************************************************/
199 vector<int> KmerDB::getSequencesWithKmer(int kmer) {
200         try {
201                 
202                 vector<int> seqs;
203         
204                 if (kmer < 0) { }  //if user gives negative number
205                 else if (kmer > maxKmer) {      }  //or a kmer that is bigger than maxkmer
206                 else {  seqs = kmerLocations[kmer];     }
207                 
208                 return seqs;
209         }
210         catch(exception& e) {
211                 m->errorOut(e, "KmerDB", "getSequencesWithKmer");
212                 exit(1);
213         }       
214 }
215 /**************************************************************************************************/
216
217
218 /**************************************************************************************************/
219
220