13 const int STRLEN = 1005;
25 ReadType(int tid, int pos) {
30 bool operator< (const ReadType& o) const {
31 string& a = seqs[tid];
32 string& b = seqs[o.tid];
33 for (int i = 0; i < k; i++) {
34 if (a[pos + i] != b[o.pos + i]) {
35 return a[pos + i] < b[o.pos + i];
41 bool seq_equal(const ReadType& o) const {
42 string& a = seqs[tid];
43 string& b = seqs[o.tid];
44 for (int i = 0; i < k; i++)
45 if (a[pos + i] != b[o.pos + i]) return false;
50 vector<ReadType> cands;
51 vector<double> clusteringInfo;
53 string convert(const string& rawseq) {
54 int size = (int)rawseq.size();
56 for (int i = 0; i < size; i++) {
57 seq[i] = toupper(rawseq[i]);
58 if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G' && seq[i] != 'T') seq[i] = 'N';
63 void loadRef(char* inpF) {
65 string tag, line, rawseq;
67 assert(fin.is_open());
69 names.clear(); names.push_back("");
70 seqs.clear(); seqs.push_back("");
73 while ((fin) && (line[0] == '>')) {
76 while((getline(fin, line)) && (line[0] != '>')) {
79 if (rawseq.size() <= 0) {
80 printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
84 seqs.push_back(convert(rawseq));
91 printf("The reference is loaded.\n");
94 int main(int argc, char* argv[]) {
96 printf("Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file\n");
104 effL.assign(M + 1, 0);
105 for (int i = 1; i <= M; i++) {
106 effL[i] = int(seqs[i].length()) - k + 1;
107 if (effL[i] <= 0) effL[i] = 0; // effL should be non-negative
108 for (int j = 0; j < effL[i]; j++)
109 cands.push_back(ReadType(i, j));
111 printf("All possbile %d mers are generated.\n", k);
113 sort(cands.begin(), cands.end());
114 printf("All %d mers are sorted.\n", k);
117 clusteringInfo.assign(M + 1, 0.0);
119 for (size_t i = 1; i <= cands.size(); i++)
120 if (i == cands.size() || !cands[p].seq_equal(cands[i])) {
121 size_t denominator = i - p;
123 for (size_t j = p + 1; j <= i; j++)
124 if (j == i || cands[q].tid != cands[j].tid) {
125 size_t numerator = j - q;
126 //double prob = numerator * 1.0 / denominator;
127 //clusteringInfo[cands[q].tid] += (double)numerator * prob * (1.0 - prob);
128 if (numerator < denominator) clusteringInfo[cands[q].tid] += numerator;
134 for (int i = 1; i <= M; i++)
135 if (effL[i] == 0) clusteringInfo[i] = -1.0;
136 else clusteringInfo[i] /= effL[i];
138 printf("Clustering information is calculated.\n");
141 ofstream fout(argv[3]);
142 for (int i = 1; i <= M; i++) fout<<names[i]<<"\t"<<setprecision(6)<<clusteringInfo[i]<<endl;