]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
changed random forest output filename
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 #include "sequence.hpp"
26 #include "kmer.hpp"
27 #include "database.hpp"
28 #include "kmerdb.hpp"
29
30 /**************************************************************************************************/
31
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
33         try { 
34         
35                 kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
36                 
37                 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
38                 count = 0;
39                 
40                 maxKmer = power4s[kmerSize];
41                 kmerLocations.resize(maxKmer+1);
42                 
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "KmerDB", "KmerDB");
46                 exit(1);
47         }       
48
49 }
50 /**************************************************************************************************/
51 KmerDB::KmerDB() : Database() {}
52 /**************************************************************************************************/
53
54 KmerDB::~KmerDB(){}
55
56 /**************************************************************************************************/
57
58 vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
59         try {
60                 if (num > numSeqs) { m->mothurOut("[WARNING]: you requested " + toString(num) + " closest sequences, but the template only contains " + toString(numSeqs) + ", adjusting."); m->mothurOutEndLine(); num = numSeqs; }
61                 
62                 vector<int> topMatches;
63                 Kmer kmer(kmerSize);
64                 searchScore = 0;
65                 Scores.clear();
66                 
67                 vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
68                 vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
69                 
70                 int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
71         
72                 for(int i=0;i<numKmers;i++){
73                         int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
74                         if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
75                                 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
76                                         matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
77                                 }
78                         }
79                         timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
80                 }
81                 
82                 if (num != 1) {
83                         vector<seqMatch> seqMatches; seqMatches.resize(numSeqs);
84                         for(int i=0;i<numSeqs;i++){             
85                                 seqMatches[i].seq = i;
86                                 seqMatches[i].match = matches[i];
87                         }
88                         
89                         //sorts putting largest matches first
90                         sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
91                         
92                         searchScore = seqMatches[0].match;
93                         searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
94                 
95                         //save top matches
96                         for (int i = 0; i < num; i++) {
97                                 topMatches.push_back(seqMatches[i].seq);
98                                 float thisScore = 100 * seqMatches[i].match / (float) numKmers;
99                                 Scores.push_back(thisScore);
100                         }
101                 }else{
102                         int bestIndex = 0;
103                         int bestMatch = -1;
104                         for(int i=0;i<numSeqs;i++){     
105                                 
106                                 if (matches[i] > bestMatch) {
107                                         bestIndex = i;
108                                         bestMatch = matches[i];
109                                 }
110                         }
111                         
112                         searchScore = bestMatch;
113                         searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
114                         topMatches.push_back(bestIndex);
115                         Scores.push_back(searchScore);
116                 }
117                 return topMatches;              
118         }
119         catch(exception& e) {
120                 m->errorOut(e, "KmerDB", "findClosestSequences");
121                 exit(1);
122         }       
123 }
124
125 /**************************************************************************************************/
126
127 void KmerDB::generateDB(){
128         try {
129                 
130                 ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
131                 m->openOutputFile(kmerDBName, kmerFile);                                        //      to a file
132                 
133                 //output version
134                 kmerFile << "#" << m->getVersion() << endl;
135                 
136                 for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
137                         kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
138                         for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
139                                 kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
140                         }
141                         kmerFile << endl;
142                 }
143                 kmerFile.close();
144                 
145         }
146         catch(exception& e) {
147                 m->errorOut(e, "KmerDB", "generateDB");
148                 exit(1);
149         }       
150         
151 }
152 /**************************************************************************************************/
153 void KmerDB::addSequence(Sequence seq) {
154         try {
155                 Kmer kmer(kmerSize);
156                 
157                 string unaligned = seq.getUnaligned();  //      ...take the unaligned sequence...
158                 int numKmers = unaligned.length() - kmerSize + 1;
159                         
160                 vector<int> seenBefore(maxKmer+1,0);
161                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
162                         int kmerNumber = kmer.getKmerNumber(unaligned, j);
163                         if(seenBefore[kmerNumber] == 0){
164                                 kmerLocations[kmerNumber].push_back(count);             //      ...insert the sequence index into kmerLocations for
165                         }                                                                                               //      the appropriate kmer number
166                         seenBefore[kmerNumber] = 1;
167                 }                                                                                                       
168         
169                 count++;
170         }
171         catch(exception& e) {
172                 m->errorOut(e, "KmerDB", "addSequence");
173                 exit(1);
174         }       
175 }
176 /**************************************************************************************************/
177
178 void KmerDB::readKmerDB(ifstream& kmerDBFile){
179         try {
180                                         
181                 kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
182                 
183                 //read version
184                 string line = m->getline(kmerDBFile); m->gobble(kmerDBFile);
185                 
186                 string seqName;
187                 int seqNumber;
188
189                 for(int i=0;i<maxKmer;i++){
190                         int numValues = 0;      
191                         kmerDBFile >> seqName >> numValues;
192                         
193                         for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
194                                 kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
195                                 kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
196                         }
197                 }
198                 kmerDBFile.close();
199                 
200         }
201         catch(exception& e) {
202                 m->errorOut(e, "KmerDB", "readKmerDB");
203                 exit(1);
204         }       
205 }
206
207 /**************************************************************************************************/
208 int KmerDB::getCount(int kmer) {
209         try {
210                 if (kmer < 0) { return 0; }  //if user gives negative number
211                 else if (kmer > maxKmer) {      return 0;       }  //or a kmer that is bigger than maxkmer
212                 else {  return kmerLocations[kmer].size();      }  // kmer is in vector range
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "KmerDB", "getCount");
216                 exit(1);
217         }       
218 }
219 /**************************************************************************************************/
220 int KmerDB::getReversed(int kmerNumber) {
221         try {
222                 Kmer kmer(kmerSize);
223                 
224                 if (kmerNumber < 0) { return 0; }  //if user gives negative number
225                 else if (kmerNumber > maxKmer) {        return 0;       }  //or a kmer that is bigger than maxkmer
226                 else {  return kmer.getReverseKmerNumber(kmerNumber);   }  // kmer is in vector range
227         }
228         catch(exception& e) {
229                 m->errorOut(e, "KmerDB", "getReversed");
230                 exit(1);
231         }       
232 }
233 /**************************************************************************************************/
234 vector<int> KmerDB::getSequencesWithKmer(int kmer) {
235         try {
236                 
237                 vector<int> seqs;
238         
239                 if (kmer < 0) { }  //if user gives negative number
240                 else if (kmer > maxKmer) {      }  //or a kmer that is bigger than maxkmer
241                 else {  seqs = kmerLocations[kmer];     }
242                 
243                 return seqs;
244         }
245         catch(exception& e) {
246                 m->errorOut(e, "KmerDB", "getSequencesWithKmer");
247                 exit(1);
248         }       
249 }
250 /**************************************************************************************************/
251
252
253 /**************************************************************************************************/
254
255