+++ /dev/null
-#include<cstdio>
-#include<cctype>
-#include<cstring>
-#include<cstdlib>
-#include<cassert>
-#include<fstream>
-#include<iomanip>
-#include<string>
-#include<vector>
-#include<algorithm>
-using namespace std;
-
-typedef unsigned int INTEGER;
-
-const int STRLEN = 1005;
-
-INTEGER M;
-int k; // k-mer size
-vector<string> names;
-vector<string> seqs;
-vector<INTEGER> effL;
-
-// tid starts from 1
-struct ReadType {
- INTEGER tid, pos;
-
- ReadType(INTEGER tid, INTEGER pos) {
- this->tid = tid;
- this->pos = pos;
- }
-
- bool operator< (const ReadType& o) const {
- string& a = seqs[tid];
- string& b = seqs[o.tid];
- for (int i = 0; i < k; i++) {
- if (a[pos + i] != b[o.pos + i]) {
- return a[pos + i] < b[o.pos + i];
- }
- }
- return tid < o.tid;
- }
-
- bool seq_equal(const ReadType& o) const {
- string& a = seqs[tid];
- string& b = seqs[o.tid];
- for (int i = 0; i < k; i++)
- if (a[pos + i] != b[o.pos + i]) return false;
- return true;
- }
-};
-
-vector<ReadType> cands;
-vector<double> clusteringInfo;
-
-string convert(const string& rawseq) {
- int size = (int)rawseq.size();
- string seq = rawseq;
- for (int i = 0; i < size; i++) {
- seq[i] = toupper(rawseq[i]);
- if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G' && seq[i] != 'T') seq[i] = 'N';
- }
- return seq;
-}
-
-void loadRef(char* inpF) {
- ifstream fin(inpF);
- string tag, line, rawseq;
- void *pt;
-
- assert(fin.is_open());
-
- names.clear(); names.push_back("");
- seqs.clear(); seqs.push_back("");
-
- pt = getline(fin, line);
- while (pt != 0 && line[0] == '>') {
- tag = line.substr(1);
- rawseq = "";
- while((pt = getline(fin, line)) && line[0] != '>') {
- rawseq += line;
- }
- if (rawseq.size() <= 0) {
- printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
- continue;
- }
- names.push_back(tag);
- seqs.push_back(convert(rawseq));
- }
-
- fin.close();
-
- M = names.size() - 1;
-
- printf("The reference is loaded.\n");
-}
-
-int main(int argc, char* argv[]) {
- if (argc != 4) {
- printf("Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file\n");
- exit(-1);
- }
-
- k = atoi(argv[1]);
- loadRef(argv[2]);
-
- cands.clear();
- effL.assign(M + 1, 0);
- for (INTEGER i = 1; i <= M; i++) {
- effL[i] = seqs[i].length() - k + 1;
- if (effL[i] <= 0) effL[i] = 0; // effL should be non-negative
- for (INTEGER j = 0; j < effL[i]; j++)
- cands.push_back(ReadType(i, j));
- }
- printf("All possbile %d mers are generated.\n", k);
-
- sort(cands.begin(), cands.end());
- printf("All %d mers are sorted.\n", k);
-
- size_t p = 0;
- clusteringInfo.assign(M + 1, 0.0);
-
- for (size_t i = 1; i <= cands.size(); i++)
- if (i == cands.size() || !cands[p].seq_equal(cands[i])) {
- size_t denominator = i - p;
- size_t q = p;
- for (size_t j = p + 1; j <= i; j++)
- if (j == i || cands[q].tid != cands[j].tid) {
- size_t numerator = j - q;
- //double prob = numerator * 1.0 / denominator;
- //clusteringInfo[cands[q].tid] += (double)numerator * prob * (1.0 - prob);
- if (numerator < denominator) clusteringInfo[cands[q].tid] += numerator;
- q = j;
- }
- p = i;
- }
-
- for (INTEGER i = 1; i <= M; i++)
- if (effL[i] == 0) clusteringInfo[i] = -1.0;
- else clusteringInfo[i] /= effL[i];
-
- printf("Clustering information is calculated.\n");
-
-
- ofstream fout(argv[3]);
- for (INTEGER i = 1; i <= M; i++) fout<<names[i]<<"\t"<<setprecision(6)<<clusteringInfo[i]<<endl;
- fout.close();
-
- return 0;
-}