]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
pat's edits of screen.seqs and summary.seqs
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 using namespace std;
26
27
28 #include "sequence.hpp"
29 #include "kmer.hpp"
30 #include "database.hpp"
31 #include "kmerdb.hpp"
32
33 /**************************************************************************************************/
34
35 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
36
37         string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
38         ifstream kmerFileTest(kmerDBName.c_str());
39         
40         int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
41         
42         maxKmer = power4s[kmerSize];
43         kmerLocations.resize(maxKmer+1);
44         
45         if(!kmerFileTest){              //      if we can open the kmer db file, then read it in...
46                 cout << "Generating the " << kmerDBName << " database...\t";    cout.flush();
47                 generateKmerDB(kmerDBName);     
48         }
49         else{                                   //      ...otherwise generate it.
50                 cout << "Reading in the " << kmerDBName << " database...\t";    cout.flush();
51                 readKmerDB(kmerDBName, kmerFileTest);
52         }
53         cout << "DONE." << endl << endl;        cout.flush();
54
55 }
56
57 /**************************************************************************************************/
58
59 Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
60         
61         Kmer kmer(kmerSize);
62         
63         searchScore = 0;
64         int maxSequence = 0;
65
66         vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
67         vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
68
69         int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
70
71         for(int i=0;i<numKmers;i++){
72                 int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
73                 if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
74                         for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
75                                 matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
76                         }
77                 }
78                 timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
79         }
80
81         for(int i=0;i<numSeqs;i++){                                                             //      find the db sequence that shares the most kmers with
82                 if(matches[i] > searchScore){                                   //      the query sequence
83                         searchScore = matches[i];
84                         maxSequence = i;
85                 }
86         }
87
88         searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
89         return templateSequences[maxSequence];                                  //      sequence with the most shared kmers.
90 }
91
92 /**************************************************************************************************/
93
94 void KmerDB::generateKmerDB(string kmerDBName){
95         
96         Kmer kmer(kmerSize);
97         
98         for(int i=0;i<numSeqs;i++){                                                             //      for all of the template sequences...
99
100                 string seq = templateSequences[i]->getUnaligned();      //      ...take the unaligned sequence...
101                 int numKmers = seq.length() - kmerSize + 1;
102                 
103                 vector<int> seenBefore(maxKmer+1,0);
104                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
105                         int kmerNumber = kmer.getKmerNumber(seq, j);
106                         if(seenBefore[kmerNumber] == 0){
107                                 kmerLocations[kmerNumber].push_back(i);         //      ...insert the sequence index into kmerLocations for
108                         }                                                                                               //      the appropriate kmer number
109                         seenBefore[kmerNumber] = 1;
110                 }                                                                                                       
111         }
112         
113         ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
114         openOutputFile(kmerDBName, kmerFile);                                   //      to a file
115         
116         for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
117                 kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
118                 for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
119                         kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
120                 }
121                 kmerFile << endl;
122         }
123         kmerFile.close();
124         
125 }
126
127 /**************************************************************************************************/
128
129 void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
130
131         kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
132         
133         string seqName;
134         int seqNumber;
135         
136         for(int i=0;i<maxKmer;i++){
137                 int numValues;  
138                 kmerDBFile >> seqName >> numValues;
139                 
140                 for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
141                         kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
142                         kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
143                 }
144         }
145         kmerDBFile.close();
146 }
147
148 /**************************************************************************************************/