]> git.donarmstrong.com Git - mothur.git/blobdiff - kmerdb.cpp
fixes while testing 1.33.0
[mothur.git] / kmerdb.cpp
index 9652ac472fe38e14bff9707afdaba95526c35bff..9a5d23500be955b459413beee989601f6c0ec78f 100644 (file)
@@ -22,9 +22,6 @@
 
  */
 
-using namespace std;
-
-
 #include "sequence.hpp"
 #include "kmer.hpp"
 #include "database.hpp"
@@ -32,117 +29,227 @@ using namespace std;
 
 /**************************************************************************************************/
 
-KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
-
-       string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
-       ifstream kmerFileTest(kmerDBName.c_str());
-       
-       int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
-       
-       maxKmer = power4s[kmerSize];
-       kmerLocations.resize(maxKmer+1);
+KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
+       try { 
        
-       if(!kmerFileTest){              //      if we can open the kmer db file, then read it in...
-               cout << "Generating the " << kmerDBName << " database...\t";    cout.flush();
-               generateKmerDB(kmerDBName);     
-       }
-       else{                                   //      ...otherwise generate it.
-               cout << "Reading in the " << kmerDBName << " database...\t";    cout.flush();
-               readKmerDB(kmerDBName, kmerFileTest);
+               kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+               
+               int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+               count = 0;
+               
+               maxKmer = power4s[kmerSize];
+               kmerLocations.resize(maxKmer+1);
+               
        }
-       cout << "DONE." << endl << endl;        cout.flush();
+       catch(exception& e) {
+               m->errorOut(e, "KmerDB", "KmerDB");
+               exit(1);
+       }       
 
 }
-
+/**************************************************************************************************/
+KmerDB::KmerDB() : Database() {}
 /**************************************************************************************************/
 
-Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
-       
-       Kmer kmer(kmerSize);
-       
-       searchScore = 0;
-       int maxSequence = 0;
-
-       vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
-       vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
+KmerDB::~KmerDB(){}
 
-       int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
+/**************************************************************************************************/
 
-       for(int i=0;i<numKmers;i++){
-               int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
-               if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
-                       for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
-                               matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
+vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
+       try {
+               if (num > numSeqs) { m->mothurOut("[WARNING]: you requested " + toString(num) + " closest sequences, but the template only contains " + toString(numSeqs) + ", adjusting."); m->mothurOutEndLine(); num = numSeqs; }
+               
+               vector<int> topMatches;
+               Kmer kmer(kmerSize);
+               searchScore = 0;
+               Scores.clear();
+               
+               vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
+               vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
+               
+               int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
+       
+               for(int i=0;i<numKmers;i++){
+                       int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
+                       if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
+                               for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
+                                       matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
+                               }
                        }
+                       timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
                }
-               timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
-       }
-
-       for(int i=0;i<numSeqs;i++){                                                             //      find the db sequence that shares the most kmers with
-               if(matches[i] > searchScore){                                   //      the query sequence
-                       searchScore = matches[i];
-                       maxSequence = i;
+               
+               if (num != 1) {
+                       vector<seqMatch> seqMatches; seqMatches.resize(numSeqs);
+                       for(int i=0;i<numSeqs;i++){             
+                               seqMatches[i].seq = i;
+                               seqMatches[i].match = matches[i];
+                       }
+                       
+                       //sorts putting largest matches first
+                       sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
+                       
+                       searchScore = seqMatches[0].match;
+                       searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
+               
+                       //save top matches
+                       for (int i = 0; i < num; i++) {
+                               topMatches.push_back(seqMatches[i].seq);
+                               float thisScore = 100 * seqMatches[i].match / (float) numKmers;
+                               Scores.push_back(thisScore);
+                       }
+               }else{
+                       int bestIndex = 0;
+                       int bestMatch = -1;
+                       for(int i=0;i<numSeqs;i++){     
+                               
+                               if (matches[i] > bestMatch) {
+                                       bestIndex = i;
+                                       bestMatch = matches[i];
+                               }
+                       }
+                       
+                       searchScore = bestMatch;
+                       searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
+                       topMatches.push_back(bestIndex);
+                       Scores.push_back(searchScore);
                }
+               return topMatches;              
        }
-
-       searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
-       return templateSequences[maxSequence];                                  //      sequence with the most shared kmers.
+       catch(exception& e) {
+               m->errorOut(e, "KmerDB", "findClosestSequences");
+               exit(1);
+       }       
 }
 
 /**************************************************************************************************/
 
-void KmerDB::generateKmerDB(string kmerDBName){
-       
-       Kmer kmer(kmerSize);
+void KmerDB::generateDB(){
+       try {
+               
+               ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
+               m->openOutputFile(kmerDBName, kmerFile);                                        //      to a file
+               
+               //output version
+               kmerFile << "#" << m->getVersion() << endl;
+               
+               for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
+                       kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
+                       for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
+                               kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
+                       }
+                       kmerFile << endl;
+               }
+               kmerFile.close();
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerDB", "generateDB");
+               exit(1);
+       }       
        
-       for(int i=0;i<numSeqs;i++){                                                             //      for all of the template sequences...
-
-               string seq = templateSequences[i]->getUnaligned();      //      ...take the unaligned sequence...
-               int numKmers = seq.length() - kmerSize + 1;
+}
+/**************************************************************************************************/
+void KmerDB::addSequence(Sequence seq) {
+       try {
+               Kmer kmer(kmerSize);
                
+               string unaligned = seq.getUnaligned();  //      ...take the unaligned sequence...
+               int numKmers = unaligned.length() - kmerSize + 1;
+                       
                vector<int> seenBefore(maxKmer+1,0);
                for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
-                       int kmerNumber = kmer.getKmerNumber(seq, j);
+                       int kmerNumber = kmer.getKmerNumber(unaligned, j);
                        if(seenBefore[kmerNumber] == 0){
-                               kmerLocations[kmerNumber].push_back(i);         //      ...insert the sequence index into kmerLocations for
+                               kmerLocations[kmerNumber].push_back(count);             //      ...insert the sequence index into kmerLocations for
                        }                                                                                               //      the appropriate kmer number
                        seenBefore[kmerNumber] = 1;
                }                                                                                                       
-       }
        
-       ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
-       openOutputFile(kmerDBName, kmerFile);                                   //      to a file
-       
-       for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
-               kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
-               for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
-                       kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
-               }
-               kmerFile << endl;
+               count++;
        }
-       kmerFile.close();
-       
+       catch(exception& e) {
+               m->errorOut(e, "KmerDB", "addSequence");
+               exit(1);
+       }       
 }
-
 /**************************************************************************************************/
 
-void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
+void KmerDB::readKmerDB(ifstream& kmerDBFile){
+       try {
+                                       
+               kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
+               
+               //read version
+               string line = m->getline(kmerDBFile); m->gobble(kmerDBFile);
+               
+               string seqName;
+               int seqNumber;
+
+               for(int i=0;i<maxKmer;i++){
+                       int numValues = 0;      
+                       kmerDBFile >> seqName >> numValues;
+                       
+                       for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
+                               kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
+                               kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
+                       }
+               }
+               kmerDBFile.close();
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerDB", "readKmerDB");
+               exit(1);
+       }       
+}
 
-       kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
-       
-       string seqName;
-       int seqNumber;
+/**************************************************************************************************/
+int KmerDB::getCount(int kmer) {
+       try {
+               if (kmer < 0) { return 0; }  //if user gives negative number
+               else if (kmer > maxKmer) {      return 0;       }  //or a kmer that is bigger than maxkmer
+               else {  return kmerLocations[kmer].size();      }  // kmer is in vector range
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerDB", "getCount");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+int KmerDB::getReversed(int kmerNumber) {
+       try {
+               Kmer kmer(kmerSize);
+               
+               if (kmerNumber < 0) { return 0; }  //if user gives negative number
+               else if (kmerNumber > maxKmer) {        return 0;       }  //or a kmer that is bigger than maxkmer
+               else {  return kmer.getReverseKmerNumber(kmerNumber);   }  // kmer is in vector range
+       }
+       catch(exception& e) {
+               m->errorOut(e, "KmerDB", "getReversed");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+vector<int> KmerDB::getSequencesWithKmer(int kmer) {
+       try {
+               
+               vector<int> seqs;
        
-       for(int i=0;i<maxKmer;i++){
-               int numValues;  
-               kmerDBFile >> seqName >> numValues;
+               if (kmer < 0) { }  //if user gives negative number
+               else if (kmer > maxKmer) {      }  //or a kmer that is bigger than maxkmer
+               else {  seqs = kmerLocations[kmer];     }
                
-               for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
-                       kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
-                       kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
-               }
+               return seqs;
        }
-       kmerDBFile.close();
+       catch(exception& e) {
+               m->errorOut(e, "KmerDB", "getSequencesWithKmer");
+               exit(1);
+       }       
 }
+/**************************************************************************************************/
+
 
 /**************************************************************************************************/
+
+