]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
Initial revision
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  */
9
10 using namespace std;
11
12 #include <vector>
13 #include <string>
14 #include <fstream>
15 #include <iostream>
16
17 #include "sequence.hpp"
18 #include "kmer.hpp"
19 #include "database.hpp"
20 #include "kmerdb.hpp"
21
22 /**************************************************************************************************/
23
24 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
25
26         string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
27         ifstream kmerFileTest(kmerDBName.c_str());
28         
29         int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
30         
31         maxKmer = power4s[kmerSize];
32         kmerLocations.resize(maxKmer+1);
33         
34         if(!kmerFileTest){
35                 cout << "Generating the " << kmerDBName << " database...\t";    cout.flush();
36                 generateKmerDB(kmerDBName);     
37         }
38         else{
39                 cout << "Reading in the " << kmerDBName << " database...\t";    cout.flush();
40                 readKmerDB(kmerDBName, kmerFileTest);
41         }
42         cout << "DONE." << endl << endl;        cout.flush();
43
44 }
45
46 /**************************************************************************************************/
47
48 Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
49         
50         vector<int> matches(numSeqs, 0);
51         vector<int> timesKmerFound(kmerLocations.size()+1, 0);
52         
53         int maxMatches = 0;
54         int maxSequence = 0;
55         
56         string query = candidateSeq->getUnaligned();
57         
58         int numKmers = query.length() - kmerSize + 1;
59         Kmer kmer(kmerSize);
60         
61         for(int i=0;i<numKmers;i++){
62                 
63                 int kmerNumber = kmer.getKmerNumber(query, i);
64                 
65                 if(timesKmerFound[kmerNumber] == 0){
66                         for(int j=0;j<kmerLocations[kmerNumber].size();j++){
67                                 matches[kmerLocations[kmerNumber][j]]++;
68                         }
69                 }
70                 timesKmerFound[kmerNumber] = 1;
71                 
72         }
73         for(int i=0;i<numSeqs;i++){
74                 if(matches[i] > maxMatches){
75                         maxMatches = matches[i];
76                         maxSequence = i;
77                 }
78         }
79         return templateSequences[maxSequence];
80         
81 }
82
83 /**************************************************************************************************/
84
85 void KmerDB::generateKmerDB(string kmerDBName){
86         
87         
88         Kmer kmer(kmerSize);
89         
90         for(int i=0;i<numSeqs;i++){
91
92                 string seq = templateSequences[i]->getUnaligned();
93                 int numKmers = seq.length() - kmerSize + 1;
94                 
95                 for(int j=0;j<numKmers;j++){
96                         int kmerNumber = kmer.getKmerNumber(seq, j);
97                         kmerLocations[kmerNumber].push_back(i);
98                 }
99         }
100         
101         ofstream kmerFile(kmerDBName.c_str(), ios::trunc);
102         if(!kmerFile) {
103                 cerr << "Error: Could not open " << kmerDBName << endl;
104                 exit(1);
105         }
106         
107         for(int i=0;i<maxKmer;i++){
108                 kmerFile << i << ' ' << kmerLocations[i].size();
109                 for(int j=0;j<kmerLocations[i].size();j++){
110                         kmerFile << ' ' << kmerLocations[i][j];
111                 }
112                 kmerFile << endl;
113         }
114         kmerFile.close();
115         
116 }
117
118 /**************************************************************************************************/
119
120 void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
121
122         kmerDBFile.seekg(0);
123         
124         string seqName;
125         int seqNumber;
126         
127         for(int i=0;i<numSeqs;i++){
128                 int numValues;
129                 kmerDBFile >> seqName >> numValues;
130                 
131                 for(int j=0;j<numValues;j++){
132                         kmerDBFile >> seqNumber;
133                         kmerLocations[i].push_back(seqNumber);
134                 }
135         }
136         kmerDBFile.close();
137 }
138
139 /**************************************************************************************************/