]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
fixed trim.seqs bug with qtrim parameter and added num=1 special case to database...
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 #include "sequence.hpp"
26 #include "kmer.hpp"
27 #include "database.hpp"
28 #include "kmerdb.hpp"
29
30 /**************************************************************************************************/
31
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
33         try { 
34         
35                 kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
36                 
37                 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
38                 count = 0;
39                 
40                 maxKmer = power4s[kmerSize];
41                 kmerLocations.resize(maxKmer+1);
42                 
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "KmerDB", "KmerDB");
46                 exit(1);
47         }       
48
49 }
50 /**************************************************************************************************/
51 KmerDB::KmerDB() : Database() {}
52 /**************************************************************************************************/
53
54 KmerDB::~KmerDB(){}
55
56 /**************************************************************************************************/
57
58 vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
59         try {
60                 vector<int> topMatches;
61                 Kmer kmer(kmerSize);
62                 searchScore = 0;
63                 
64                 vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
65                 vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
66                 
67                 int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
68         
69                 for(int i=0;i<numKmers;i++){
70                         int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
71                         if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
72                                 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
73                                         matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
74                                 }
75                         }
76                         timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
77                 }
78                 
79                 if (num != 1) {
80                         vector<seqMatch> seqMatches; seqMatches.resize(numSeqs);
81                         for(int i=0;i<numSeqs;i++){             
82                                 seqMatches[i].seq = i;
83                                 seqMatches[i].match = matches[i];
84                         }
85                         
86                         //sorts putting largest matches first
87                         sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
88                         
89                         searchScore = seqMatches[0].match;
90                         searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
91                 
92                         //save top matches
93                         for (int i = 0; i < num; i++) {
94                                 topMatches.push_back(seqMatches[i].seq);
95                         }
96                 }else{
97                         int bestIndex = 0;
98                         int bestMatch = -1;
99                         for(int i=0;i<numSeqs;i++){     
100                                 
101                                 if (matches[i] > bestMatch) {
102                                         bestIndex = i;
103                                         bestMatch = matches[i];
104                                 }
105                         }
106                         
107                         searchScore = bestMatch;
108                         searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
109                         topMatches.push_back(bestIndex);
110                 }
111                 return topMatches;              
112         }
113         catch(exception& e) {
114                 m->errorOut(e, "KmerDB", "findClosestSequences");
115                 exit(1);
116         }       
117 }
118
119 /**************************************************************************************************/
120
121 void KmerDB::generateDB(){
122         try {
123                 
124                 ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
125                 m->openOutputFile(kmerDBName, kmerFile);                                        //      to a file
126                 
127                 //output version
128                 kmerFile << "#" << m->getVersion() << endl;
129                 
130                 for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
131                         kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
132                         for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
133                                 kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
134                         }
135                         kmerFile << endl;
136                 }
137                 kmerFile.close();
138                 
139         }
140         catch(exception& e) {
141                 m->errorOut(e, "KmerDB", "generateDB");
142                 exit(1);
143         }       
144         
145 }
146 /**************************************************************************************************/
147 void KmerDB::addSequence(Sequence seq) {
148         try {
149                 Kmer kmer(kmerSize);
150                 
151                 string unaligned = seq.getUnaligned();  //      ...take the unaligned sequence...
152                 int numKmers = unaligned.length() - kmerSize + 1;
153                         
154                 vector<int> seenBefore(maxKmer+1,0);
155                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
156                         int kmerNumber = kmer.getKmerNumber(unaligned, j);
157                         if(seenBefore[kmerNumber] == 0){
158                                 kmerLocations[kmerNumber].push_back(count);             //      ...insert the sequence index into kmerLocations for
159                         }                                                                                               //      the appropriate kmer number
160                         seenBefore[kmerNumber] = 1;
161                 }                                                                                                       
162         
163                 count++;
164         }
165         catch(exception& e) {
166                 m->errorOut(e, "KmerDB", "addSequence");
167                 exit(1);
168         }       
169 }
170 /**************************************************************************************************/
171
172 void KmerDB::readKmerDB(ifstream& kmerDBFile){
173         try {
174                                         
175                 kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
176                 
177                 //read version
178                 string line = m->getline(kmerDBFile); m->gobble(kmerDBFile);
179                 
180                 string seqName;
181                 int seqNumber;
182
183                 for(int i=0;i<maxKmer;i++){
184                         int numValues = 0;      
185                         kmerDBFile >> seqName >> numValues;
186                         
187                         for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
188                                 kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
189                                 kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
190                         }
191                 }
192                 kmerDBFile.close();
193                 
194         }
195         catch(exception& e) {
196                 m->errorOut(e, "KmerDB", "readKmerDB");
197                 exit(1);
198         }       
199 }
200
201 /**************************************************************************************************/
202 int KmerDB::getCount(int kmer) {
203         try {
204                 if (kmer < 0) { return 0; }  //if user gives negative number
205                 else if (kmer > maxKmer) {      return 0;       }  //or a kmer that is bigger than maxkmer
206                 else {  return kmerLocations[kmer].size();      }  // kmer is in vector range
207         }
208         catch(exception& e) {
209                 m->errorOut(e, "KmerDB", "getCount");
210                 exit(1);
211         }       
212 }
213 /**************************************************************************************************/
214 vector<int> KmerDB::getSequencesWithKmer(int kmer) {
215         try {
216                 
217                 vector<int> seqs;
218         
219                 if (kmer < 0) { }  //if user gives negative number
220                 else if (kmer > maxKmer) {      }  //or a kmer that is bigger than maxkmer
221                 else {  seqs = kmerLocations[kmer];     }
222                 
223                 return seqs;
224         }
225         catch(exception& e) {
226                 m->errorOut(e, "KmerDB", "getSequencesWithKmer");
227                 exit(1);
228         }       
229 }
230 /**************************************************************************************************/
231
232
233 /**************************************************************************************************/
234
235