2 * Assume any read should have a name other than ""
15 #include "GroupInfo.h"
16 #include "Transcripts.h"
18 #include "SingleRead.h"
19 #include "SingleReadQ.h"
20 #include "PairedEndRead.h"
21 #include "PairedEndReadQ.h"
22 #include "SingleHit.h"
23 #include "PairedEndHit.h"
25 #include "HitContainer.h"
26 #include "SamParser.h"
// ---- File-scope state used by init(), parseIt() and main() ----
// read_type is set from the "-t" option in main() and selects the parseIt<> instantiation.
30 int read_type; // 0 SingleRead, 1 SingleReadQ, 2 PairedEndRead, 3 PairedEndReadQ
// Per-category read counts; zeroed at the start of parseIt() and written to the count file in main().
31 READ_INT_TYPE N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data
32 HIT_INT_TYPE nHits; // # of hits
// nMulti / nIsoMulti accumulate hits.calcNumGeneMultiReads(gi) / hits.calcNumIsoformMultiReads()
// over category-1 reads in parseIt(); nUnique is then derived there as N[1] - nMulti.
33 READ_INT_TYPE nUnique, nMulti, nIsoMulti;
// Path buffers filled in main(): groupF = "<argv[1]>.grp", tiF = "<argv[1]>.ti".
35 char groupF[STRLEN], tiF[STRLEN];
// Path buffers filled in main(): datF = "<imdName>.dat", cntF = "<argv[2]>.stat/<argv[3]>.cnt".
37 char datF[STRLEN], cntF[STRLEN];
// Loaded from tiF in main() and handed to the SamParser constructor in init().
40 Transcripts transcripts;
// n_os is produced by genReadFileNames() in init(); it bounds the second index of cat / readOutFs.
45 int n_os; // number of ostreams
// Streams are created as ofstream objects in init() and closed in parseIt().
46 ostream *cat[3][2]; // cat : category 1-dim 0 N0 1 N1 2 N2; 2-dim 0 mate1 1 mate2
// Output file names, same indexing as cat; parseIt() removes the files of a category whose N[i] is not positive.
47 char readOutFs[3][2][STRLEN];
// counter is keyed by hits.getNHits() of a category-1 read; main() prints each (key, value) pair
// to the count file. Presumably the value is the number of reads with that many hits — the
// increment statement is not visible in this listing, only the first-occurrence store of 1.
49 map<int, READ_INT_TYPE> counter;
// Scratch iterator reused by parseIt() (lookup) and main() (printing).
50 map<int, READ_INT_TYPE>::iterator iter;
// Creates the alignment parser and opens the per-category read output streams.
//   imdName    - prefix passed to genReadFileNames() to derive the output file names
//   alignFType - alignment file type flag forwarded to SamParser (main() passes argv[4][0])
//   alignF     - alignment file argument forwarded to SamParser (main() passes argv[5])
// NOTE(review): this listing skips some original lines (the embedded numbers jump, e.g. 52 -> 55),
// so the declarations of aux / tmp_n_os and the closing braces are not visible here.
52 void init(const char* imdName, char alignFType, const char* alignF) {
// aux is pointed at fn_list only when fn_list is a non-empty string (strcmp != 0).
55 if (strcmp(fn_list, "")) aux = fn_list;
56 parser = new SamParser(alignFType, alignF, transcripts, aux);
// Start from all-null stream pointers and all-empty file names.
58 memset(cat, 0, sizeof(cat));
59 memset(readOutFs, 0, sizeof(readOutFs));
// One pass per read category (first index of cat / readOutFs).
63 for (int i = 0; i < 3; i++) {
// Fills readOutFs[i] and sets n_os for this category.
64 genReadFileNames(imdName, i, read_type, n_os, readOutFs[i]);
// Every category must report the same n_os as the previous one; a negative tmp_n_os
// presumably marks "no previous category yet" (its initialisation is not visible).
66 assert(tmp_n_os < 0 || tmp_n_os == n_os); tmp_n_os = n_os;
// NOTE(review): streams are allocated with new; parseIt() closes them, but no matching delete
// and no check that the open succeeded is visible in this listing — confirm.
68 for (int j = 0; j < n_os; j++)
69 cat[i][j] = new ofstream(readOutFs[i][j]);
75 //Do not allow duplicates for unalignable reads and suppressed reads in SAM input
// Drains the parser: writes every read to the output streams of its category, accumulates the
// hit statistics (nHits, nMulti, nIsoMulti, counter) for category-1 reads, derives nUnique, then
// closes the streams and removes the output files of categories that received no reads.
//   ReadType / HitType - read and hit classes; main() instantiates this with
//                        SingleRead/SingleReadQ + SingleHit or PairedEndRead/PairedEndReadQ + PairedEndHit.
//   parser             - source of (read, hit) records via parseNext().
// NOTE(review): this listing skips some original lines (embedded numbers jump), so several
// declarations (val, record_val, hit), statements and closing braces are not visible here.
76 template<class ReadType, class HitType>
77 void parseIt(SamParser *parser) {
78 // record_val & record_read are copies of val & read for record purpose
80 ReadType read, record_read;
82 HitContainer<HitType> hits;
85 nUnique = nMulti = nIsoMulti = 0;
86 memset(N, 0, sizeof(N));
// cnt drives the progress message below; its increment is not visible in this listing.
88 READ_INT_TYPE cnt = 0;
90 record_val = -2; //indicate no recorded read now
// parseNext() fills read/hit and returns a code; a negative code ends the loop.
91 while ((val = parser->parseNext(read, hit)) >= 0) {
// Codes 0..2 are used below as the category index into cat[] (the val >= 0 half is already
// guaranteed by the loop condition). Presumably these codes mark the start of a new read.
92 if (val >= 0 && val <= 2) {
93 // flush out previous read's info if needed
94 if (record_val >= 0) {
95 record_read.write(n_os, cat[record_val]);
98 // flush out previous read's hits if the read is alignable reads
99 if (record_val == 1) {
101 nHits += hits.getNHits();
102 nMulti += hits.calcNumGeneMultiReads(gi);
103 nIsoMulti += hits.calcNumIsoformMultiReads();
// Update the per-hit-count tally: an existing entry is handled in the (not visible) branch
// body, a new hit count is stored with an initial value of 1.
106 iter = counter.find(hits.getNHits());
107 if (iter != counter.end()) {
111 counter[hits.getNHits()] = 1;
117 record_read = read; // no pointer, thus safe
// Body not visible; presumably this is where the parsed hit is added to `hits` — TODO confirm.
120 if (val == 1 || val == 5) {
// Progress report every 1,000,000 entries when verbose is set.
125 if (verbose && (cnt % 1000000 == 0)) { cout<< "Parsed "<< cnt<< " entries"<< endl; }
// After the loop: flush the last recorded read, repeating the in-loop flush sequence.
128 if (record_val >= 0) {
129 record_read.write(n_os, cat[record_val]);
133 if (record_val == 1) {
135 nHits += hits.getNHits();
136 nMulti += hits.calcNumGeneMultiReads(gi);
137 nIsoMulti += hits.calcNumIsoformMultiReads();
140 iter = counter.find(hits.getNHits());
141 if (iter != counter.end()) {
145 counter[hits.getNHits()] = 1;
// Category-1 reads that were not counted as gene-level multi reads.
149 nUnique = N[1] - nMulti;
// Close every output stream, then remove the files of categories with no reads.
153 for (int i = 0; i < 3; i++) {
154 for (int j = 0; j < n_os; j++) {
// cat holds ostream*; the cast relies on init() having created each entry as an ofstream.
155 ((ofstream*)cat[i][j])->close();
// Keep the files when the category has at least one read.
158 if (N[i] > 0) continue;
159 for (int j = 0; j < n_os; j++) {
160 remove(readOutFs[i][j]); //delete if the file is empty
// Entry point: reads the command line, derives the input/output file names, runs the parser
// for the selected read type, then writes the header of the hit file and the count file.
// Positional arguments used below: argv[1] refName, argv[2] sampleName, argv[3] sampleToken,
// argv[4] alignFType, argv[5] alignF; options start at argv[6].
// NOTE(review): this listing skips some original lines (embedded numbers jump), so the
// argument-count check, the switch header, the stream opens and closing braces are not visible.
166 int main(int argc, char* argv[]) {
170 printf("Usage : rsem-parse-alignments refName sampleName sampleToken alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
// Optional arguments.
// NOTE(review): "-t", "-l" and "-tag" read argv[i + 1] with no visible check that it exists;
// if one of them is the last argument, argv[i + 1] is the terminating null pointer — confirm.
177 for (int i = 6; i < argc; i++) {
178 if (!strcmp(argv[i], "-t")) {
179 read_type = atoi(argv[i + 1]);
181 if (!strcmp(argv[i], "-l")) {
// NOTE(review): unbounded copy; the size of fn_list is not visible in this listing.
182 strcpy(fn_list, argv[i + 1]);
184 if (!strcmp(argv[i], "-tag")) {
185 SamParser::setReadTypeTag(argv[i + 1]);
187 if (!strcmp(argv[i], "-q")) { quiet = true; }
// Derive file names from the positional arguments.
// NOTE(review): sprintf writes into fixed-size buffers (groupF/tiF/datF/cntF are STRLEN chars)
// without a length limit; long arguments could overflow them — consider snprintf.
193 sprintf(groupF, "%s.grp", argv[1]);
195 sprintf(tiF, "%s.ti", argv[1]);
196 transcripts.readFrom(tiF);
198 sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
199 sprintf(datF, "%s.dat", imdName);
200 sprintf(cntF, "%s.stat/%s.cnt", argv[2], argv[3]);
202 init(imdName, argv[4][0], argv[5]);
// 99 spaces plus a newline: a fixed-width first line. Presumably written to hit_out as a
// placeholder (the write is not visible) so the real header can be filled in after parsing.
206 string firstLine(99, ' ');
207 firstLine.append(1, '\n'); //May be dangerous!
// Dispatch on read_type (see the read_type declaration for the meaning of 0..3).
211 case 0 : parseIt<SingleRead, SingleHit>(parser); break;
212 case 1 : parseIt<SingleReadQ, SingleHit>(parser); break;
213 case 2 : parseIt<PairedEndRead, PairedEndHit>(parser); break;
214 case 3 : parseIt<PairedEndReadQ, PairedEndHit>(parser); break;
// Rewind and write the header "N1 nHits read_type" at the start of the stream. No newline is
// written here, so this relies on the text being no longer than the reserved first line —
// presumably the risk the "May be dangerous" remark refers to; confirm.
217 hit_out.seekp(0, ios_base::beg);
218 hit_out<<N[1]<<" "<<nHits<<" "<<read_type;
222 //cntF for statistics of alignments file
// Layout written to fout:
//   line 1: N0 N1 N2 (N0+N1+N2)
//   line 2: nUnique nMulti nIsoMulti
//   line 3: nHits read_type
//   then one "<key>\t<count>" line each: "0" with N0, every counter entry, "Inf" with N2.
224 fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
225 fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
226 fout<<nHits<<" "<<read_type<<endl;
227 fout<<"0\t"<<N[0]<<endl;
228 for (iter = counter.begin(); iter != counter.end(); iter++) {
229 fout<<iter->first<<'\t'<<iter->second<<endl;
231 fout<<"Inf\t"<<N[2]<<endl;
236 if (verbose) { printf("Done!\n"); }