X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=calcClusteringInfo.cpp;fp=calcClusteringInfo.cpp;h=0000000000000000000000000000000000000000;hb=9c2e46183a19d661f0a618a8eabe8ce1f6a8e2d6;hp=2103f6169db7f27f4b636f12c62bdea9d1f054c7;hpb=7777bfc71742d4e239d1297d2e3de058895f4ce1;p=rsem.git diff --git a/calcClusteringInfo.cpp b/calcClusteringInfo.cpp deleted file mode 100644 index 2103f61..0000000 --- a/calcClusteringInfo.cpp +++ /dev/null @@ -1,149 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace std; - -typedef unsigned int INTEGER; - -const int STRLEN = 1005; - -INTEGER M; -int k; // k-mer size -vector names; -vector seqs; -vector effL; - -// tid starts from 1 -struct ReadType { - INTEGER tid, pos; - - ReadType(INTEGER tid, INTEGER pos) { - this->tid = tid; - this->pos = pos; - } - - bool operator< (const ReadType& o) const { - string& a = seqs[tid]; - string& b = seqs[o.tid]; - for (int i = 0; i < k; i++) { - if (a[pos + i] != b[o.pos + i]) { - return a[pos + i] < b[o.pos + i]; - } - } - return tid < o.tid; - } - - bool seq_equal(const ReadType& o) const { - string& a = seqs[tid]; - string& b = seqs[o.tid]; - for (int i = 0; i < k; i++) - if (a[pos + i] != b[o.pos + i]) return false; - return true; - } -}; - -vector cands; -vector clusteringInfo; - -string convert(const string& rawseq) { - int size = (int)rawseq.size(); - string seq = rawseq; - for (int i = 0; i < size; i++) { - seq[i] = toupper(rawseq[i]); - if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G' && seq[i] != 'T') seq[i] = 'N'; - } - return seq; -} - -void loadRef(char* inpF) { - ifstream fin(inpF); - string tag, line, rawseq; - void *pt; - - assert(fin.is_open()); - - names.clear(); names.push_back(""); - seqs.clear(); seqs.push_back(""); - - pt = getline(fin, line); - while (pt != 0 && line[0] == '>') { - tag = line.substr(1); - rawseq = ""; - while((pt = getline(fin, line)) && line[0] != '>') { - rawseq += line; - } - if (rawseq.size() <= 0) { - printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str()); - continue; - } - names.push_back(tag); - seqs.push_back(convert(rawseq)); - } - - fin.close(); - - M = names.size() - 1; - - printf("The reference is loaded.\n"); -} - -int main(int argc, char* argv[]) { - if (argc != 4) { - printf("Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file\n"); - exit(-1); - } - - k = atoi(argv[1]); - loadRef(argv[2]); - - cands.clear(); - effL.assign(M + 1, 0); - for (INTEGER i = 1; i <= M; i++) { - effL[i] = seqs[i].length() - k + 1; - if (effL[i] <= 0) effL[i] = 0; // effL should be non-negative - for (INTEGER j = 0; j < effL[i]; j++) - cands.push_back(ReadType(i, j)); - } - printf("All possbile %d mers are generated.\n", k); - - sort(cands.begin(), cands.end()); - printf("All %d mers are sorted.\n", k); - - size_t p = 0; - clusteringInfo.assign(M + 1, 0.0); - - for (size_t i = 1; i <= cands.size(); i++) - if (i == cands.size() || !cands[p].seq_equal(cands[i])) { - size_t denominator = i - p; - size_t q = p; - for (size_t j = p + 1; j <= i; j++) - if (j == i || cands[q].tid != cands[j].tid) { - size_t numerator = j - q; - //double prob = numerator * 1.0 / denominator; - //clusteringInfo[cands[q].tid] += (double)numerator * prob * (1.0 - prob); - if (numerator < denominator) clusteringInfo[cands[q].tid] += numerator; - q = j; - } - p = i; - } - - for (INTEGER i = 1; i <= M; i++) - if (effL[i] == 0) clusteringInfo[i] = -1.0; - else clusteringInfo[i] /= effL[i]; - - printf("Clustering information is calculated.\n"); - - - ofstream fout(argv[3]); - for (INTEGER i = 1; i <= M; i++) fout<