X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=kmerdb.cpp;fp=kmerdb.cpp;h=0000000000000000000000000000000000000000;hb=4a877efa127e56e81a21f53cfdbbfd3bfbe8c4ff;hp=9a5d23500be955b459413beee989601f6c0ec78f;hpb=a6cf29fa4dac0909c7582cb1094151d34093ee76;p=mothur.git diff --git a/kmerdb.cpp b/kmerdb.cpp deleted file mode 100644 index 9a5d235..0000000 --- a/kmerdb.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/* - * kmerdb.cpp - * - * - * Created by Pat Schloss on 12/16/08. - * Copyright 2008 Patrick D. Schloss. All rights reserved. - * - * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides - * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence. - * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the - * different number of kmers and each column contains the index to sequences that use that kmer. - * - * Construction of an object of this type will first look for an appropriately named database file and if it is found - * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory - * (generateKmerDB) - * - * The search method used here is roughly the same as that used in the SimRank program that is found at the - * greengenes website. The default kmer size is 7. The speed complexity is between O(L) and O(LN). When I use 7mers - * on average a kmer is found in ~100 other sequences with a database of ~5000 sequences. If this is the case then the - * time would be on the order of O(0.1LN) -> fast. - * - - */ - -#include "sequence.hpp" -#include "kmer.hpp" -#include "database.hpp" -#include "kmerdb.hpp" - -/**************************************************************************************************/ - -KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) { - try { - - kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; - - int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; - count = 0; - - maxKmer = power4s[kmerSize]; - kmerLocations.resize(maxKmer+1); - - } - catch(exception& e) { - m->errorOut(e, "KmerDB", "KmerDB"); - exit(1); - } - -} -/**************************************************************************************************/ -KmerDB::KmerDB() : Database() {} -/**************************************************************************************************/ - -KmerDB::~KmerDB(){} - -/**************************************************************************************************/ - -vector KmerDB::findClosestSequences(Sequence* candidateSeq, int num){ - try { - if (num > numSeqs) { m->mothurOut("[WARNING]: you requested " + toString(num) + " closest sequences, but the template only contains " + toString(numSeqs) + ", adjusting."); m->mothurOutEndLine(); num = numSeqs; } - - vector topMatches; - Kmer kmer(kmerSize); - searchScore = 0; - Scores.clear(); - - vector matches(numSeqs, 0); // a record of the sequences with shared kmers - vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found - - int numKmers = candidateSeq->getNumBases() - kmerSize + 1; - - for(int i=0;igetUnaligned(), i); // go through the query sequence and get a kmer number - if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before... - for(int j=0;j seqMatches; seqMatches.resize(numSeqs); - for(int i=0;i bestMatch) { - bestIndex = i; - bestMatch = matches[i]; - } - } - - searchScore = bestMatch; - searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db - topMatches.push_back(bestIndex); - Scores.push_back(searchScore); - } - return topMatches; - } - catch(exception& e) { - m->errorOut(e, "KmerDB", "findClosestSequences"); - exit(1); - } -} - -/**************************************************************************************************/ - -void KmerDB::generateDB(){ - try { - - ofstream kmerFile; // once we have the kmerLocations folder print it out - m->openOutputFile(kmerDBName, kmerFile); // to a file - - //output version - kmerFile << "#" << m->getVersion() << endl; - - for(int i=0;ierrorOut(e, "KmerDB", "generateDB"); - exit(1); - } - -} -/**************************************************************************************************/ -void KmerDB::addSequence(Sequence seq) { - try { - Kmer kmer(kmerSize); - - string unaligned = seq.getUnaligned(); // ...take the unaligned sequence... - int numKmers = unaligned.length() - kmerSize + 1; - - vector seenBefore(maxKmer+1,0); - for(int j=0;jerrorOut(e, "KmerDB", "addSequence"); - exit(1); - } -} -/**************************************************************************************************/ - -void KmerDB::readKmerDB(ifstream& kmerDBFile){ - try { - - kmerDBFile.seekg(0); // start at the beginning of the file - - //read version - string line = m->getline(kmerDBFile); m->gobble(kmerDBFile); - - string seqName; - int seqNumber; - - for(int i=0;i> seqName >> numValues; - - for(int j=0;j> seqNumber; // 1. number of sequences with the kmer number - kmerLocations[i].push_back(seqNumber); // 2. sequence indices - } - } - kmerDBFile.close(); - - } - catch(exception& e) { - m->errorOut(e, "KmerDB", "readKmerDB"); - exit(1); - } -} - -/**************************************************************************************************/ -int KmerDB::getCount(int kmer) { - try { - if (kmer < 0) { return 0; } //if user gives negative number - else if (kmer > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer - else { return kmerLocations[kmer].size(); } // kmer is in vector range - } - catch(exception& e) { - m->errorOut(e, "KmerDB", "getCount"); - exit(1); - } -} -/**************************************************************************************************/ -int KmerDB::getReversed(int kmerNumber) { - try { - Kmer kmer(kmerSize); - - if (kmerNumber < 0) { return 0; } //if user gives negative number - else if (kmerNumber > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer - else { return kmer.getReverseKmerNumber(kmerNumber); } // kmer is in vector range - } - catch(exception& e) { - m->errorOut(e, "KmerDB", "getReversed"); - exit(1); - } -} -/**************************************************************************************************/ -vector KmerDB::getSequencesWithKmer(int kmer) { - try { - - vector seqs; - - if (kmer < 0) { } //if user gives negative number - else if (kmer > maxKmer) { } //or a kmer that is bigger than maxkmer - else { seqs = kmerLocations[kmer]; } - - return seqs; - } - catch(exception& e) { - m->errorOut(e, "KmerDB", "getSequencesWithKmer"); - exit(1); - } -} -/**************************************************************************************************/ - - -/**************************************************************************************************/ - -