X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=EBSeq%2FcalcClusteringInfo.cpp;fp=EBSeq%2FcalcClusteringInfo.cpp;h=2103f6169db7f27f4b636f12c62bdea9d1f054c7;hb=9c2e46183a19d661f0a618a8eabe8ce1f6a8e2d6;hp=0000000000000000000000000000000000000000;hpb=7777bfc71742d4e239d1297d2e3de058895f4ce1;p=rsem.git diff --git a/EBSeq/calcClusteringInfo.cpp b/EBSeq/calcClusteringInfo.cpp new file mode 100644 index 0000000..2103f61 --- /dev/null +++ b/EBSeq/calcClusteringInfo.cpp @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace std; + +typedef unsigned int INTEGER; + +const int STRLEN = 1005; + +INTEGER M; +int k; // k-mer size +vector names; +vector seqs; +vector effL; + +// tid starts from 1 +struct ReadType { + INTEGER tid, pos; + + ReadType(INTEGER tid, INTEGER pos) { + this->tid = tid; + this->pos = pos; + } + + bool operator< (const ReadType& o) const { + string& a = seqs[tid]; + string& b = seqs[o.tid]; + for (int i = 0; i < k; i++) { + if (a[pos + i] != b[o.pos + i]) { + return a[pos + i] < b[o.pos + i]; + } + } + return tid < o.tid; + } + + bool seq_equal(const ReadType& o) const { + string& a = seqs[tid]; + string& b = seqs[o.tid]; + for (int i = 0; i < k; i++) + if (a[pos + i] != b[o.pos + i]) return false; + return true; + } +}; + +vector cands; +vector clusteringInfo; + +string convert(const string& rawseq) { + int size = (int)rawseq.size(); + string seq = rawseq; + for (int i = 0; i < size; i++) { + seq[i] = toupper(rawseq[i]); + if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G' && seq[i] != 'T') seq[i] = 'N'; + } + return seq; +} + +void loadRef(char* inpF) { + ifstream fin(inpF); + string tag, line, rawseq; + void *pt; + + assert(fin.is_open()); + + names.clear(); names.push_back(""); + seqs.clear(); seqs.push_back(""); + + pt = getline(fin, line); + while (pt != 0 && line[0] == '>') { + tag = line.substr(1); + rawseq = ""; + while((pt = getline(fin, line)) && line[0] != '>') { + rawseq += line; + } + if (rawseq.size() <= 0) { + printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str()); + continue; + } + names.push_back(tag); + seqs.push_back(convert(rawseq)); + } + + fin.close(); + + M = names.size() - 1; + + printf("The reference is loaded.\n"); +} + +int main(int argc, char* argv[]) { + if (argc != 4) { + printf("Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file\n"); + exit(-1); + } + + k = atoi(argv[1]); + loadRef(argv[2]); + + cands.clear(); + effL.assign(M + 1, 0); + for (INTEGER i = 1; i <= M; i++) { + effL[i] = seqs[i].length() - k + 1; + if (effL[i] <= 0) effL[i] = 0; // effL should be non-negative + for (INTEGER j = 0; j < effL[i]; j++) + cands.push_back(ReadType(i, j)); + } + printf("All possbile %d mers are generated.\n", k); + + sort(cands.begin(), cands.end()); + printf("All %d mers are sorted.\n", k); + + size_t p = 0; + clusteringInfo.assign(M + 1, 0.0); + + for (size_t i = 1; i <= cands.size(); i++) + if (i == cands.size() || !cands[p].seq_equal(cands[i])) { + size_t denominator = i - p; + size_t q = p; + for (size_t j = p + 1; j <= i; j++) + if (j == i || cands[q].tid != cands[j].tid) { + size_t numerator = j - q; + //double prob = numerator * 1.0 / denominator; + //clusteringInfo[cands[q].tid] += (double)numerator * prob * (1.0 - prob); + if (numerator < denominator) clusteringInfo[cands[q].tid] += numerator; + q = j; + } + p = i; + } + + for (INTEGER i = 1; i <= M; i++) + if (effL[i] == 0) clusteringInfo[i] = -1.0; + else clusteringInfo[i] /= effL[i]; + + printf("Clustering information is calculated.\n"); + + + ofstream fout(argv[3]); + for (INTEGER i = 1; i <= M; i++) fout<