]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
created mothurOut class to handle logfiles
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 #include "sequence.hpp"
26 #include "kmer.hpp"
27 #include "database.hpp"
28 #include "kmerdb.hpp"
29
30 /**************************************************************************************************/
31
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
33         try { 
34         
35                 kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
36                 
37                 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
38                 count = 0;
39                 
40                 maxKmer = power4s[kmerSize];
41                 kmerLocations.resize(maxKmer+1);
42                 
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "KmerDB", "KmerDB");
46                 exit(1);
47         }       
48
49 }
50 /**************************************************************************************************/
51
52 KmerDB::~KmerDB(){}
53
54 /**************************************************************************************************/
55
56 vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
57         try {
58                 vector<int> topMatches;
59                 Kmer kmer(kmerSize);
60                 searchScore = 0;
61                 
62                 vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
63                 vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
64                 
65                 int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
66         
67                 for(int i=0;i<numKmers;i++){
68                         int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
69                         if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
70                                 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
71                                         matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
72                                 }
73                         }
74                         timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
75                 }
76         
77                 vector<seqMatch> seqMatches;
78                 for(int i=0;i<numSeqs;i++){             
79                         seqMatch temp(i, matches[i]);
80                         seqMatches.push_back(temp);
81                 }
82                 
83                 //sorts putting largest matches first
84                 sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
85                 
86                 searchScore = seqMatches[0].match;
87                 searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
88         
89                 //save top matches
90                 for (int i = 0; i < num; i++) {
91                         topMatches.push_back(seqMatches[i].seq);
92                 }
93                 
94                 return topMatches;              
95         }
96         catch(exception& e) {
97                 m->errorOut(e, "KmerDB", "findClosestSequences");
98                 exit(1);
99         }       
100 }
101
102 /**************************************************************************************************/
103
104 void KmerDB::generateDB(){
105         try {
106                 
107                 ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
108                 openOutputFile(kmerDBName, kmerFile);                                   //      to a file
109                 
110                 for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
111                         kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
112                         for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
113                                 kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
114                         }
115                         kmerFile << endl;
116                 }
117                 kmerFile.close();
118                 
119         }
120         catch(exception& e) {
121                 m->errorOut(e, "KmerDB", "generateDB");
122                 exit(1);
123         }       
124         
125 }
126 /**************************************************************************************************/
127 void KmerDB::addSequence(Sequence seq) {
128         try {
129                 Kmer kmer(kmerSize);
130                 
131                 string unaligned = seq.getUnaligned();  //      ...take the unaligned sequence...
132                 int numKmers = unaligned.length() - kmerSize + 1;
133                         
134                 vector<int> seenBefore(maxKmer+1,0);
135                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
136                         int kmerNumber = kmer.getKmerNumber(unaligned, j);
137                         if(seenBefore[kmerNumber] == 0){
138                                 kmerLocations[kmerNumber].push_back(count);             //      ...insert the sequence index into kmerLocations for
139                         }                                                                                               //      the appropriate kmer number
140                         seenBefore[kmerNumber] = 1;
141                 }                                                                                                       
142         
143                 count++;
144         }
145         catch(exception& e) {
146                 m->errorOut(e, "KmerDB", "addSequence");
147                 exit(1);
148         }       
149 }
150 /**************************************************************************************************/
151
152 void KmerDB::readKmerDB(ifstream& kmerDBFile){
153         try {
154                                         
155                 kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
156                 
157                 string seqName;
158                 int seqNumber;
159
160                 for(int i=0;i<maxKmer;i++){
161                         int numValues = 0;      
162                         kmerDBFile >> seqName >> numValues;
163                         
164                         for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
165                                 kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
166                                 kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
167                         }
168                 }
169                 kmerDBFile.close();
170                 
171         }
172         catch(exception& e) {
173                 m->errorOut(e, "KmerDB", "readKmerDB");
174                 exit(1);
175         }       
176 }
177
178 /**************************************************************************************************/
179 int KmerDB::getCount(int kmer) {
180         try {
181                 if (kmer < 0) { return 0; }  //if user gives negative number
182                 else if (kmer > maxKmer) {      return 0;       }  //or a kmer that is bigger than maxkmer
183                 else {  return kmerLocations[kmer].size();      }  // kmer is in vector range
184         }
185         catch(exception& e) {
186                 m->errorOut(e, "KmerDB", "getCount");
187                 exit(1);
188         }       
189 }
190 /**************************************************************************************************/
191 vector<int> KmerDB::getSequencesWithKmer(int kmer) {
192         try {
193                 
194                 vector<int> seqs;
195         
196                 if (kmer < 0) { }  //if user gives negative number
197                 else if (kmer > maxKmer) {      }  //or a kmer that is bigger than maxkmer
198                 else {  seqs = kmerLocations[kmer];     }
199                 
200                 return seqs;
201         }
202         catch(exception& e) {
203                 m->errorOut(e, "KmerDB", "getSequencesWithKmer");
204                 exit(1);
205         }       
206 }
207
208
209 /**************************************************************************************************/
210
211