13 typedef unsigned int INTEGER;
15 const int STRLEN = 1005;
27 ReadType(INTEGER tid, INTEGER pos) {
32 bool operator< (const ReadType& o) const {
33 string& a = seqs[tid];
34 string& b = seqs[o.tid];
35 for (int i = 0; i < k; i++) {
36 if (a[pos + i] != b[o.pos + i]) {
37 return a[pos + i] < b[o.pos + i];
43 bool seq_equal(const ReadType& o) const {
44 string& a = seqs[tid];
45 string& b = seqs[o.tid];
46 for (int i = 0; i < k; i++)
47 if (a[pos + i] != b[o.pos + i]) return false;
52 vector<ReadType> cands;
53 vector<double> clusteringInfo;
55 string convert(const string& rawseq) {
56 int size = (int)rawseq.size();
58 for (int i = 0; i < size; i++) {
59 seq[i] = toupper(rawseq[i]);
60 if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G' && seq[i] != 'T') seq[i] = 'N';
65 void loadRef(char* inpF) {
67 string tag, line, rawseq;
70 assert(fin.is_open());
72 names.clear(); names.push_back("");
73 seqs.clear(); seqs.push_back("");
75 pt = getline(fin, line);
76 while (pt != 0 && line[0] == '>') {
79 while((pt = getline(fin, line)) && line[0] != '>') {
82 if (rawseq.size() <= 0) {
83 printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
87 seqs.push_back(convert(rawseq));
94 printf("The reference is loaded.\n");
97 int main(int argc, char* argv[]) {
99 printf("Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file\n");
107 effL.assign(M + 1, 0);
108 for (INTEGER i = 1; i <= M; i++) {
109 effL[i] = seqs[i].length() - k + 1;
110 if (effL[i] <= 0) effL[i] = 0; // effL should be non-negative
111 for (INTEGER j = 0; j < effL[i]; j++)
112 cands.push_back(ReadType(i, j));
114 printf("All possbile %d mers are generated.\n", k);
116 sort(cands.begin(), cands.end());
117 printf("All %d mers are sorted.\n", k);
120 clusteringInfo.assign(M + 1, 0.0);
122 for (size_t i = 1; i <= cands.size(); i++)
123 if (i == cands.size() || !cands[p].seq_equal(cands[i])) {
124 size_t denominator = i - p;
126 for (size_t j = p + 1; j <= i; j++)
127 if (j == i || cands[q].tid != cands[j].tid) {
128 size_t numerator = j - q;
129 //double prob = numerator * 1.0 / denominator;
130 //clusteringInfo[cands[q].tid] += (double)numerator * prob * (1.0 - prob);
131 if (numerator < denominator) clusteringInfo[cands[q].tid] += numerator;
137 for (INTEGER i = 1; i <= M; i++)
138 if (effL[i] == 0) clusteringInfo[i] = -1.0;
139 else clusteringInfo[i] /= effL[i];
141 printf("Clustering information is calculated.\n");
144 ofstream fout(argv[3]);
145 for (INTEGER i = 1; i <= M; i++) fout<<names[i]<<"\t"<<setprecision(6)<<clusteringInfo[i]<<endl;