Removed Mac ._* files from repo

[rsem.git] / parseIt.cpp
diff --git a/parseIt.cpp b/parseIt.cpp

index c266715b31693686f7db773155d01e7905321102..3373ef280979428230b74d77439d61bff3a96d47 100644 (file)
--- a/parseIt.cpp
+++ b/parseIt.cpp
@@ -8,10 +8,12 @@
  #include<iostream>
  #include<fstream>
  #include<string>
+#include<map>
  
  #include "utils.h"
  
  #include "GroupInfo.h"
+#include "Transcripts.h"
  
  #include "SingleRead.h"
  #include "SingleReadQ.h"
@@ -26,14 +28,15 @@
  using namespace std;
  
  int read_type; // 0 SingleRead, 1 SingleReadQ, 2 PairedEndRead, 3 PairedEndReadQ
-int N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data
-int nHits; // # of hits
-int nUnique, nMulti, nIsoMulti;
+READ_INT_TYPE N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data
+HIT_INT_TYPE nHits; // # of hits
+READ_INT_TYPE nUnique, nMulti, nIsoMulti;
  char fn_list[STRLEN];
-char groupF[STRLEN];
+char groupF[STRLEN], tiF[STRLEN];
  char datF[STRLEN], cntF[STRLEN];
  
  GroupInfo gi;
+Transcripts transcripts;
  
  SamParser *parser;
  ofstream hit_out;
@@ -42,14 +45,14 @@ int n_os; // number of ostreams
  ostream *cat[3][2]; // cat : category  1-dim 0 N0 1 N1 2 N2; 2-dim  0 mate1 1 mate2
  char readOutFs[3][2][STRLEN];
  
-void init(const char* imdName, char alignFType, const char* alignF) {
+map<int, READ_INT_TYPE> counter;
+map<int, READ_INT_TYPE>::iterator iter;
  
-       sprintf(datF, "%s.dat", imdName);
-       sprintf(cntF, "%s.cnt", imdName);
+void init(const char* imdName, char alignFType, const char* alignF) {
  
         char* aux = 0;
         if (strcmp(fn_list, "")) aux = fn_list;
-       parser = new SamParser(alignFType, alignF, aux);
+       parser = new SamParser(alignFType, alignF, transcripts, aux);
  
         memset(cat, 0, sizeof(cat));
         memset(readOutFs, 0, sizeof(readOutFs));
@@ -64,6 +67,8 @@ void init(const char* imdName, char alignFType, const char* alignF) {
                 for (int j = 0; j < n_os; j++)
                         cat[i][j] = new ofstream(readOutFs[i][j]);
         }
+
+       counter.clear();
  }
  
  //Do not allow duplicate for unalignable reads and supressed reads in SAM input
@@ -79,7 +84,7 @@ void parseIt(SamParser *parser) {
         nUnique = nMulti = nIsoMulti = 0;
         memset(N, 0, sizeof(N));
  
-       long long cnt = 0;
+       READ_INT_TYPE cnt = 0;
  
         record_val = -2; //indicate no recorded read now
         while ((val = parser->parseNext(read, hit)) >= 0) {
@@ -88,11 +93,22 @@ void parseIt(SamParser *parser) {
                         if (record_val >= 0) {
                                 record_read.write(n_os, cat[record_val]);
                                 ++N[record_val];
+                       }
+                       // flush out the previous read's hits if that read is an alignable read
+                       if (record_val == 1) {
                                 hits.updateRI();
                                 nHits += hits.getNHits();
                                 nMulti += hits.calcNumGeneMultiReads(gi);
                                 nIsoMulti += hits.calcNumIsoformMultiReads();
                                 hits.write(hit_out);
+
+                               iter = counter.find(hits.getNHits());
+                               if (iter != counter.end()) {
+                                       iter->second++;
+                               }
+                               else {
+                                       counter[hits.getNHits()] = 1;
+                               }
                         }
  
                         hits.clear();
@@ -105,17 +121,28 @@ void parseIt(SamParser *parser) {
                 }
  
                 ++cnt;
-               if (verbose && (cnt % 1000000 == 0)) { printf("Parsed %lld entries\n", cnt); }
+               if (verbose && (cnt % 1000000 == 0)) { cout<< "Parsed "<< cnt<< " entries"<< endl; }
         }
  
         if (record_val >= 0) {
                 record_read.write(n_os, cat[record_val]);
                 ++N[record_val];
+       }
+
+       if (record_val == 1) {
                 hits.updateRI();
                 nHits += hits.getNHits();
                 nMulti += hits.calcNumGeneMultiReads(gi);
                 nIsoMulti += hits.calcNumIsoformMultiReads();
                 hits.write(hit_out);
+
+               iter = counter.find(hits.getNHits());
+               if (iter != counter.end()) {
+                       iter->second++;
+               }
+               else {
+                       counter[hits.getNHits()] = 1;
+               }
         }
  
         nUnique = N[1] - nMulti;
@@ -138,15 +165,15 @@ void release() {
  int main(int argc, char* argv[]) {
         bool quiet = false;
  
-       if (argc < 5) {
-               printf("Usage : rsem-parse-alignments refName imdName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
+       if (argc < 6) {
+               printf("Usage : rsem-parse-alignments refName imdName statName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
                 exit(-1);
         }
  
         strcpy(fn_list, "");
         read_type = 0;
-       if (argc > 5) {
-               for (int i = 5; i < argc; i++) {
+       if (argc > 6) {
+               for (int i = 6; i < argc; i++) {
                         if (!strcmp(argv[i], "-t")) {
                                 read_type = atoi(argv[i + 1]);
                         }
@@ -162,14 +189,19 @@ int main(int argc, char* argv[]) {
  
         verbose = !quiet;
  
-       init(argv[2], argv[3][0], argv[4]);
-
         sprintf(groupF, "%s.grp", argv[1]);
         gi.load(groupF);
+       sprintf(tiF, "%s.ti", argv[1]);
+       transcripts.readFrom(tiF);
+
+       sprintf(datF, "%s.dat", argv[2]);
+       sprintf(cntF, "%s.cnt", argv[3]);
+
+       init(argv[2], argv[4][0], argv[5]);
  
         hit_out.open(datF);
  
-       string firstLine(59, ' ');
+       string firstLine(99, ' ');
         firstLine.append(1, '\n');              //May be dangerous!
         hit_out<<firstLine;
  
@@ -190,6 +222,11 @@ int main(int argc, char* argv[]) {
         fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
         fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
         fout<<nHits<<" "<<read_type<<endl;
+       fout<<"0\t"<<N[0]<<endl;
+       for (iter = counter.begin(); iter != counter.end(); iter++) {
+               fout<<iter->first<<'\t'<<iter->second<<endl;
+       }
+       fout<<"Inf\t"<<N[2]<<endl;
         fout.close();
  
         release();