]> git.donarmstrong.com Git - mothur.git/blob - kmerdb.cpp
added MPI code, broke up chimera.seqs into 5 separated commands, added parse.sff...
[mothur.git] / kmerdb.cpp
1 /*
2  *  kmerdb.cpp
3  *  
4  *
5  *  Created by Pat Schloss on 12/16/08.
6  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
7  *
8  *      This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
9  *      a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
10  *      kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
11  *      different number of kmers and each column contains the index to sequences that use that kmer.
12  *
13  *      Construction of an object of this type will first look for an appropriately named database file and if it is found
14  *      then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
15  *      (generateKmerDB)
16  *
17  *      The search method used here is roughly the same as that used in the SimRank program that is found at the
18  *      greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
19  *      on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
20  *      time would be on the order of O(0.1LN) -> fast.
21  *
22
23  */
24
25 #include "sequence.hpp"
26 #include "kmer.hpp"
27 #include "database.hpp"
28 #include "kmerdb.hpp"
29
30 /**************************************************************************************************/
31
32 KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
33         try { 
34         
35                 kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
36                 
37                 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
38                 count = 0;
39                 
40                 maxKmer = power4s[kmerSize];
41                 kmerLocations.resize(maxKmer+1);
42                 
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "KmerDB", "KmerDB");
46                 exit(1);
47         }       
48
49 }
50 /**************************************************************************************************/
51 KmerDB::KmerDB() : Database() {}
52 /**************************************************************************************************/
53
54 KmerDB::~KmerDB(){}
55
56 /**************************************************************************************************/
57
58 vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
59         try {
60                 vector<int> topMatches;
61                 Kmer kmer(kmerSize);
62                 searchScore = 0;
63                 
64                 vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
65                 vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
66                 
67                 int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
68         
69                 for(int i=0;i<numKmers;i++){
70                         int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
71                         if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
72                                 for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
73                                         matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
74                                 }
75                         }
76                         timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
77                 }
78         
79                 vector<seqMatch> seqMatches;
80                 for(int i=0;i<numSeqs;i++){             
81                         seqMatch temp(i, matches[i]);
82                         seqMatches.push_back(temp);
83                 }
84                 
85                 //sorts putting largest matches first
86                 sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
87                 
88                 searchScore = seqMatches[0].match;
89                 searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
90         
91                 //save top matches
92                 for (int i = 0; i < num; i++) {
93                         topMatches.push_back(seqMatches[i].seq);
94                 }
95                 
96                 return topMatches;              
97         }
98         catch(exception& e) {
99                 m->errorOut(e, "KmerDB", "findClosestSequences");
100                 exit(1);
101         }       
102 }
103
104 /**************************************************************************************************/
105
106 void KmerDB::generateDB(){
107         try {
108                 
109                 ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
110                 openOutputFile(kmerDBName, kmerFile);                                   //      to a file
111                 
112                 for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
113                         kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
114                         for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
115                                 kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
116                         }
117                         kmerFile << endl;
118                 }
119                 kmerFile.close();
120                 
121         }
122         catch(exception& e) {
123                 m->errorOut(e, "KmerDB", "generateDB");
124                 exit(1);
125         }       
126         
127 }
128 /**************************************************************************************************/
129 void KmerDB::addSequence(Sequence seq) {
130         try {
131                 Kmer kmer(kmerSize);
132                 
133                 string unaligned = seq.getUnaligned();  //      ...take the unaligned sequence...
134                 int numKmers = unaligned.length() - kmerSize + 1;
135                         
136                 vector<int> seenBefore(maxKmer+1,0);
137                 for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
138                         int kmerNumber = kmer.getKmerNumber(unaligned, j);
139                         if(seenBefore[kmerNumber] == 0){
140                                 kmerLocations[kmerNumber].push_back(count);             //      ...insert the sequence index into kmerLocations for
141                         }                                                                                               //      the appropriate kmer number
142                         seenBefore[kmerNumber] = 1;
143                 }                                                                                                       
144         
145                 count++;
146         }
147         catch(exception& e) {
148                 m->errorOut(e, "KmerDB", "addSequence");
149                 exit(1);
150         }       
151 }
152 /**************************************************************************************************/
153
154 void KmerDB::readKmerDB(ifstream& kmerDBFile){
155         try {
156                                         
157                 kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
158                 
159                 string seqName;
160                 int seqNumber;
161
162                 for(int i=0;i<maxKmer;i++){
163                         int numValues = 0;      
164                         kmerDBFile >> seqName >> numValues;
165                         
166                         for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
167                                 kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
168                                 kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
169                         }
170                 }
171                 kmerDBFile.close();
172                 
173         }
174         catch(exception& e) {
175                 m->errorOut(e, "KmerDB", "readKmerDB");
176                 exit(1);
177         }       
178 }
179
180 /**************************************************************************************************/
181 int KmerDB::getCount(int kmer) {
182         try {
183                 if (kmer < 0) { return 0; }  //if user gives negative number
184                 else if (kmer > maxKmer) {      return 0;       }  //or a kmer that is bigger than maxkmer
185                 else {  return kmerLocations[kmer].size();      }  // kmer is in vector range
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "KmerDB", "getCount");
189                 exit(1);
190         }       
191 }
192 /**************************************************************************************************/
193 vector<int> KmerDB::getSequencesWithKmer(int kmer) {
194         try {
195                 
196                 vector<int> seqs;
197         
198                 if (kmer < 0) { }  //if user gives negative number
199                 else if (kmer > maxKmer) {      }  //or a kmer that is bigger than maxkmer
200                 else {  seqs = kmerLocations[kmer];     }
201                 
202                 return seqs;
203         }
204         catch(exception& e) {
205                 m->errorOut(e, "KmerDB", "getSequencesWithKmer");
206                 exit(1);
207         }       
208 }
209 #ifdef USE_MPI  
210 /**************************************************************************************************/
211 int KmerDB::MPISend(int receiver) {
212         try {
213                 
214                 //send kmerSize - int
215                 MPI_Send(&kmerSize, 1, MPI_INT, receiver, 2001, MPI_COMM_WORLD); 
216                 
217                 return 0;
218         }
219         catch(exception& e) {
220                 m->errorOut(e, "KmerDB", "MPISend");
221                 exit(1);
222         }
223 }
224 /**************************************************************************************************/
225 int KmerDB::MPIRecv(int sender) {
226         try {
227                 MPI_Status status;
228                 
229                 //receive kmerSize - int
230                 MPI_Recv(&kmerSize, 1, MPI_INT, sender, 2001, MPI_COMM_WORLD, &status);
231                 
232                 //set maxKmer 
233                 int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
234                 count = 0;
235                 maxKmer = power4s[kmerSize];
236                 kmerLocations.resize(maxKmer+1);
237                 
238                 return 0;
239         }
240         catch(exception& e) {
241                 m->errorOut(e, "KmerDB", "MPIRecv");
242                 exit(1);
243         }
244 }
245 #endif
246 /**************************************************************************************************/
247
248
249 /**************************************************************************************************/
250
251