changed output format to contain FPKM etc. ; fixed a bug for paired-end reads

[rsem.git] / calcClusteringInfo.cpp
diff --git a/calcClusteringInfo.cpp b/calcClusteringInfo.cpp

deleted file mode 100644 (file)

index 2103f61..0000000
--- a/calcClusteringInfo.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-#include<cstdio>
-#include<cctype>
-#include<cstring>
-#include<cstdlib>
-#include<cassert>
-#include<fstream>
-#include<iomanip>
-#include<string>
-#include<vector>
-#include<algorithm>
-using namespace std;
-
-typedef unsigned int INTEGER;
-
-const int STRLEN = 1005;
-
-INTEGER M;
-int k; // k-mer size
-vector<string> names;
-vector<string> seqs;
-vector<INTEGER> effL;
-
-// tid starts from 1
-struct ReadType {
-  INTEGER tid, pos;
-
-  ReadType(INTEGER tid, INTEGER pos) {
-    this->tid = tid;
-    this->pos = pos;
-  }
-
-  bool operator< (const ReadType& o) const {
-    string& a = seqs[tid];
-    string& b = seqs[o.tid];
-    for (int i = 0; i < k; i++) {
-      if (a[pos + i] != b[o.pos + i]) {
-       return a[pos + i] < b[o.pos + i];
-      }
-    }
-    return tid < o.tid;
-  }
-
-  bool seq_equal(const ReadType& o) const {
-    string& a = seqs[tid];
-    string& b = seqs[o.tid];
-    for (int i = 0; i < k; i++) 
-      if (a[pos + i] != b[o.pos + i]) return false;
-    return true;
-  }
-};
-
-vector<ReadType> cands;
-vector<double> clusteringInfo; 
-
-string convert(const string& rawseq) {
-  int size = (int)rawseq.size();
-  string seq = rawseq;
-  for (int i = 0; i < size; i++) {
-    seq[i] = toupper(rawseq[i]);
-    if (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G' && seq[i] != 'T') seq[i] = 'N';
-  }
-  return seq;
-}
-
-void loadRef(char* inpF) {
-  ifstream fin(inpF);
-  string tag, line, rawseq;
-  void *pt;
-
-  assert(fin.is_open());
-
-  names.clear(); names.push_back("");
-  seqs.clear(); seqs.push_back("");
-  
-  pt = getline(fin, line);
-  while (pt != 0 && line[0] == '>') {
-    tag = line.substr(1);
-    rawseq = "";
-    while((pt = getline(fin, line)) && line[0] != '>') {
-      rawseq += line;
-    }
-    if (rawseq.size() <= 0) {
-      printf("Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
-      continue;
-    }
-    names.push_back(tag);
-    seqs.push_back(convert(rawseq));
-  }
-
-  fin.close();
-
-  M = names.size() - 1;
-
-  printf("The reference is loaded.\n");
-}
-
-int main(int argc, char* argv[]) {
-  if (argc != 4) {
-    printf("Usage: rsem-for-ebseq-calculate-clustering-info k input_reference_fasta_file output_file\n");
-    exit(-1);
-  }
-
-  k = atoi(argv[1]);
-  loadRef(argv[2]);
-
-  cands.clear();
-  effL.assign(M + 1, 0);
-  for (INTEGER i = 1; i <= M; i++) {
-    effL[i] = seqs[i].length() - k + 1;
-    if (effL[i] <= 0) effL[i] = 0; // effL should be non-negative
-    for (INTEGER j = 0; j < effL[i]; j++) 
-      cands.push_back(ReadType(i, j));
-  }
-  printf("All possbile %d mers are generated.\n", k);
-
-  sort(cands.begin(), cands.end());
-  printf("All %d mers are sorted.\n", k);
- 
-  size_t p = 0;
-  clusteringInfo.assign(M + 1, 0.0);
-
-  for (size_t i = 1; i <= cands.size(); i++)
-    if (i == cands.size() || !cands[p].seq_equal(cands[i])) {
-      size_t denominator = i - p;
-      size_t q = p; 
-      for (size_t j = p + 1; j <= i; j++)
-       if (j == i || cands[q].tid != cands[j].tid) {
-         size_t numerator = j - q;
-         //double prob = numerator * 1.0 / denominator;
-         //clusteringInfo[cands[q].tid] += (double)numerator * prob * (1.0 - prob);
-         if (numerator < denominator) clusteringInfo[cands[q].tid] += numerator;
-         q = j;
-       }
-      p = i;
-    }
-
-  for (INTEGER i = 1; i <= M; i++) 
-    if (effL[i] == 0) clusteringInfo[i] = -1.0;
-    else clusteringInfo[i] /= effL[i];
-
-  printf("Clustering information is calculated.\n");
-
-
-  ofstream fout(argv[3]);
-  for (INTEGER i = 1; i <= M; i++) fout<<names[i]<<"\t"<<setprecision(6)<<clusteringInfo[i]<<endl;
-  fout.close();
-
-  return 0;
-}