13 typedef unsigned int INTEGER;
15 const int STRLEN = 1005;
27 ReadType(INTEGER tid, INTEGER pos) {
32 bool operator< (const ReadType& o) const {
33 string& a = seqs[tid];
34 string& b = seqs[o.tid];
35 for (int i = 0; i < k; i++) {
36 if (a[pos + i] != b[o.pos + i]) {
37 return a[pos + i] < b[o.pos + i];
43 bool seq_equal(const ReadType& o) const {
44 string& a = seqs[tid];
45 string& b = seqs[o.tid];
46 for (int i = 0; i < k; i++)
47 if (a[pos + i] != b[o.pos + i]) return false;
52 vector<ReadType> cands;
53 vector<double> clusteringInfo;
55 string convert(const string& rawseq) {
56 int size = (int)rawseq.size();
58 for (int i = 0; i < size; i++) {
59 seq[i] = toupper(rawseq[i]);
60 if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G' && seq[i] != 'T') seq[i] = 'N';
65 void loadRef(char* inpF) {
67 string tag, line, rawseq;
69 assert(fin.is_open());
71 names.clear(); names.push_back("");
72 seqs.clear(); seqs.push_back("");
75 while ((fin) && (line[0] == '>')) {
78 while((getline(fin, line)) && (line[0] != '>')) {
81 if (rawseq.size() <= 0) {
82 printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
86 seqs.push_back(convert(rawseq));
93 printf("The reference is loaded.\n");
96 int main(int argc, char* argv[]) {
98 printf("Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file\n");
106 effL.assign(M + 1, 0);
107 for (INTEGER i = 1; i <= M; i++) {
108 effL[i] = seqs[i].length() - k + 1;
109 if (effL[i] <= 0) effL[i] = 0; // effL should be non-negative
110 for (INTEGER j = 0; j < effL[i]; j++)
111 cands.push_back(ReadType(i, j));
113 printf("All possbile %d mers are generated.\n", k);
115 sort(cands.begin(), cands.end());
116 printf("All %d mers are sorted.\n", k);
119 clusteringInfo.assign(M + 1, 0.0);
121 for (size_t i = 1; i <= cands.size(); i++)
122 if (i == cands.size() || !cands[p].seq_equal(cands[i])) {
123 size_t denominator = i - p;
125 for (size_t j = p + 1; j <= i; j++)
126 if (j == i || cands[q].tid != cands[j].tid) {
127 size_t numerator = j - q;
128 //double prob = numerator * 1.0 / denominator;
129 //clusteringInfo[cands[q].tid] += (double)numerator * prob * (1.0 - prob);
130 if (numerator < denominator) clusteringInfo[cands[q].tid] += numerator;
136 for (INTEGER i = 1; i <= M; i++)
137 if (effL[i] == 0) clusteringInfo[i] = -1.0;
138 else clusteringInfo[i] /= effL[i];
140 printf("Clustering information is calculated.\n");
143 ofstream fout(argv[3]);
144 for (INTEGER i = 1; i <= M; i++) fout<<names[i]<<"\t"<<setprecision(6)<<clusteringInfo[i]<<endl;