#include<iostream>
#include<fstream>
#include<string>
+#include<map>
#include "utils.h"
#include "GroupInfo.h"
+#include "Transcripts.h"
#include "SingleRead.h"
#include "SingleReadQ.h"
using namespace std;
// File-scope state shared by init(), the parsing routine and main().
// read_type is reset to 0 in main and overwritten by atoi() when "-t" is given.
int read_type; // 0 SingleRead, 1 SingleReadQ, 2 PairedEndRead, 3 PairedEndReadQ
-int N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data
-int nHits; // # of hits
-int nUnique, nMulti, nIsoMulti;
+READ_INT_TYPE N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data
+HIT_INT_TYPE nHits; // # of hits
+READ_INT_TYPE nUnique, nMulti, nIsoMulti;
// Optional list-file name; main sets it to "" and init() forwards it to
// SamParser only when it is non-empty (a null pointer is passed otherwise).
char fn_list[STRLEN];
// Input paths formatted in main from argv[1]: groupF gets the ".grp" suffix
// (loaded through gi.load) and tiF the ".ti" suffix (read by transcripts.readFrom).
-char groupF[STRLEN];
+char groupF[STRLEN], tiF[STRLEN];
// Output paths: datF gets the ".dat" suffix and is opened as hit_out;
// cntF gets the ".cnt" suffix.
char datF[STRLEN], cntF[STRLEN];
// Group table passed to hits.calcNumGeneMultiReads() when counting multi-reads.
GroupInfo gi;
+Transcripts transcripts;
// Alignment reader allocated with new in init().
SamParser *parser;
// Stream over datF: main writes a blank, newline-terminated first line to it,
// and hits.write() appends the hit records.
ofstream hit_out;
// Read output streams, created in init() as ofstreams on the matching readOutFs paths.
ostream *cat[3][2]; // cat : category 1-dim 0 N0 1 N1 2 N2; 2-dim 0 mate1 1 mate2
char readOutFs[3][2][STRLEN];
-void init(const char* imdName, char alignFType, const char* alignF) {
+map<int, READ_INT_TYPE> counter;
+map<int, READ_INT_TYPE>::iterator iter;
- sprintf(datF, "%s.dat", imdName);
- sprintf(cntF, "%s.cnt", imdName);
+void init(const char* imdName, char alignFType, const char* alignF) {
char* aux = 0;
if (strcmp(fn_list, "")) aux = fn_list;
- parser = new SamParser(alignFType, alignF, aux);
+ parser = new SamParser(alignFType, alignF, transcripts, aux);
memset(cat, 0, sizeof(cat));
memset(readOutFs, 0, sizeof(readOutFs));
for (int j = 0; j < n_os; j++)
cat[i][j] = new ofstream(readOutFs[i][j]);
}
+
+ counter.clear();
}
//Do not allow duplicates for unalignable reads and suppressed reads in SAM input
nUnique = nMulti = nIsoMulti = 0;
memset(N, 0, sizeof(N));
- long long cnt = 0;
+ READ_INT_TYPE cnt = 0;
record_val = -2; //indicate no recorded read now
while ((val = parser->parseNext(read, hit)) >= 0) {
if (record_val >= 0) {
record_read.write(n_os, cat[record_val]);
++N[record_val];
+ }
+ // flush out the previous read's hits if that read is alignable
+ if (record_val == 1) {
hits.updateRI();
nHits += hits.getNHits();
nMulti += hits.calcNumGeneMultiReads(gi);
nIsoMulti += hits.calcNumIsoformMultiReads();
hits.write(hit_out);
+
+ iter = counter.find(hits.getNHits());
+ if (iter != counter.end()) {
+ iter->second++;
+ }
+ else {
+ counter[hits.getNHits()] = 1;
+ }
}
hits.clear();
}
++cnt;
- if (verbose && (cnt % 1000000 == 0)) { printf("Parsed %lld entries\n", cnt); }
+ if (verbose && (cnt % 1000000 == 0)) { cout<< "Parsed "<< cnt<< " entries"<< endl; }
}
if (record_val >= 0) {
record_read.write(n_os, cat[record_val]);
++N[record_val];
+ }
+
+ if (record_val == 1) {
hits.updateRI();
nHits += hits.getNHits();
nMulti += hits.calcNumGeneMultiReads(gi);
nIsoMulti += hits.calcNumIsoformMultiReads();
hits.write(hit_out);
+
+ iter = counter.find(hits.getNHits());
+ if (iter != counter.end()) {
+ iter->second++;
+ }
+ else {
+ counter[hits.getNHits()] = 1;
+ }
}
nUnique = N[1] - nMulti;
int main(int argc, char* argv[]) {
bool quiet = false;
- if (argc < 5) {
- printf("Usage : rsem-parse-alignments refName imdName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
+ if (argc < 6) {
+ printf("Usage : rsem-parse-alignments refName imdName statName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
exit(-1);
}
strcpy(fn_list, "");
read_type = 0;
- if (argc > 5) {
- for (int i = 5; i < argc; i++) {
+ if (argc > 6) {
+ for (int i = 6; i < argc; i++) {
if (!strcmp(argv[i], "-t")) {
read_type = atoi(argv[i + 1]);
}
verbose = !quiet;
- init(argv[2], argv[3][0], argv[4]);
-
sprintf(groupF, "%s.grp", argv[1]);
gi.load(groupF);
+ sprintf(tiF, "%s.ti", argv[1]);
+ transcripts.readFrom(tiF);
+
+ sprintf(datF, "%s.dat", argv[2]);
+ sprintf(cntF, "%s.cnt", argv[3]);
+
+ init(argv[2], argv[4][0], argv[5]);
hit_out.open(datF);
- string firstLine(59, ' ');
+ string firstLine(99, ' ');
firstLine.append(1, '\n'); //May be dangerous!
hit_out<<firstLine;
fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
fout<<nHits<<" "<<read_type<<endl;
+ fout<<"0\t"<<N[0]<<endl;
+ for (iter = counter.begin(); iter != counter.end(); iter++) {
+ fout<<iter->first<<'\t'<<iter->second<<endl;
+ }
+ fout<<"Inf\t"<<N[2]<<endl;
fout.close();
release();