*
*/
-using namespace std;
-
-#include "mothur.h"
#include "kmer.hpp"
/**************************************************************************************************/
-Kmer::Kmer(int size) : kmerSize(size) {
-
- int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
- maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1;
+Kmer::Kmer(int size) : kmerSize(size) { // The constructor sets the size of the kmer
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+ // No reason to waste the time of recalculating
+ maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1; // powers of 4 everytime through. We need an
+ // extra kmer if we get a non-ATGCU base
}
/**************************************************************************************************/
-string Kmer::getKmerString(string sequence){
- int length = sequence.length();
- int nKmers = length - kmerSize + 1;
+string Kmer::getKmerString(string sequence){ // Calculate kmer for each position in the sequence, count the freq
+ int length = sequence.length(); // of each kmer, and convert it to an ascii character with base '!'.
+ int nKmers = length - kmerSize + 1; // Export the string of characters as a string
vector<int> counts(maxKmer, 0);
- for(int i=0;i<nKmers;i++){
- int kmerNumber = getKmerNumber(sequence, i);
+ for(int i=0;i<nKmers;i++){ // Go though sequence and get the number between 0 and maxKmer for that
+ int kmerNumber = getKmerNumber(sequence, i);// kmer. Increase the count for the kmer in the counts vector
counts[kmerNumber]++;
}
- string kmerString = "";
- for(int i=0;i<maxKmer;i++){
- kmerString += getASCII(counts[i]);
+ string kmerString = "";
+ for(int i=0;i<maxKmer;i++){ // Scan through the vector of counts and convert each element of the
+ kmerString += getASCII(counts[i]); // vector to an ascii character that is equal to or greater than '!'
}
return kmerString;
}
+/**************************************************************************************************/
+
+vector< map<int, int> > Kmer::getKmerCounts(string sequence){ // Calculate kmer for each position in the sequence, save info in a map
+ int length = sequence.length(); // so you know at each spot in the sequence what kmers were found
+ int nKmers = length - kmerSize + 1; //
+ vector< map<int, int> > counts; counts.resize(nKmers); // a map kmer counts for each spot
+ map<int, int>::iterator it;
+
+ for(int i=0;i<nKmers;i++){ // Go though sequence and get the number between 0 and maxKmer for that
+ if (i == 0) {
+ int kmerNumber = getKmerNumber(sequence, i);// kmer.
+ counts[i][kmerNumber] = 1; // add this kmer if not already there
+ }else {
+ //your count is everything that came before and whatever you find now
+ counts[i] = counts[i-1];
+ int kmerNumber = getKmerNumber(sequence, i);// kmer.
+
+ it = counts[i].find(kmerNumber);
+ if (it!= counts[i].end()) { counts[i][kmerNumber]++; } //increment number of times you have seen this kmer
+ else { counts[i][kmerNumber] = 1; } // add this kmer since not already there
+ }
+ }
+
+ return counts;
+}
+
/**************************************************************************************************/
int Kmer::getKmerNumber(string sequence, int index){
- int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
+
+// Here we convert a kmer to a number between 0 and maxKmer. For example, AAAA would equal 0 and TTTT would equal 255.
+// If there's an N in the kmer, it is set to 256 (if we are looking at 4mers). The largest we can look at are 8mers,
+// this could be easily increased.
+//
+// Example: ATGCAT (kSize = 6)
+// i: 012345
+//
+// Score = 0*4^(6-0-1) + 3*4^(6-1-1) + 2*4^(6-2-1) + 1*4^(6-3-1) + 0*4^(6-4-1) + 3*4^(6-5-1)
+// = 0*4^5 + 3*4^4 + 2*4^3 + 1*4^2 + 0*4^1 + 3*4^0
+// = 0 + 3*256 + 2*64 + 1*16 + 0*4 + 3*1
+// = 0 + 768 + 128 + 16 + 0 + 3
+// = 915
+
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+
int kmer = 0;
+
for(int i=0;i<kmerSize;i++){
if(toupper(sequence[i+index]) == 'A') { kmer += (0 * power4s[kmerSize-i-1]); }
else if(toupper(sequence[i+index]) == 'C') { kmer += (1 * power4s[kmerSize-i-1]); }
/**************************************************************************************************/
string Kmer::getKmerBases(int kmerNumber){
- int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
+
+// Here we convert the kmer number into the kmer in terms of bases.
+//
+// Example: Score = 915 (for a 6-mer)
+// Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T [T]
+// Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A [AT]
+// Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C [CAT]
+// Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G [GCAT]
+// Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T [TGCAT]
+// Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A [ATGCAT] -> this checks out with the previous method
+
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
string kmer = "";
- if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){
- for(int i=0;i<kmerSize;i++){
+ if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){ // if the kmer number is the same as the maxKmer then it must
+ for(int i=0;i<kmerSize;i++){ // have had an N in it and so we'll just call it N x kmerSize
kmer += 'N';
}
}
else{
for(int i=0;i<kmerSize;i++){
- int nt = (int)(kmerNumber / (float)power4s[i]) % 4;
- if(nt == 0) { kmer = 'A' + kmer; }
+ int nt = (int)(kmerNumber / (float)power4s[i]) % 4; // the '%' operator returns the remainder
+ if(nt == 0) { kmer = 'A' + kmer; } // from int-based division ]
else if(nt == 1){ kmer = 'C' + kmer; }
else if(nt == 2){ kmer = 'G' + kmer; }
else if(nt == 3){ kmer = 'T' + kmer; }
/**************************************************************************************************/
-char Kmer::getASCII(int number) { return (char)(33+number); }
-
+char Kmer::getASCII(int number) { return (char)(33+number); } // '!' is the first printable char and
+ // has the int value of 33
/**************************************************************************************************/
-int Kmer::getNumber(char character) { return ((int)(character-'!')); }
+int Kmer::getNumber(char character) { return ((int)(character-'!')); } // '!' has the value of 33
/**************************************************************************************************/