]> git.donarmstrong.com Git - rsem.git/blob - parseIt.cpp
Updated README.md and WHAT_IS_NEW
[rsem.git] / parseIt.cpp
1 /*
2  * Assume any read should have a name other than ""
3  */
4 #include<cstdio>
5 #include<cstring>
6 #include<cstdlib>
7 #include<cassert>
8 #include<iostream>
9 #include<fstream>
10 #include<string>
11 #include<map>
12
13 #include "utils.h"
14
15 #include "GroupInfo.h"
16 #include "Transcripts.h"
17
18 #include "SingleRead.h"
19 #include "SingleReadQ.h"
20 #include "PairedEndRead.h"
21 #include "PairedEndReadQ.h"
22 #include "SingleHit.h"
23 #include "PairedEndHit.h"
24
25 #include "HitContainer.h"
26 #include "SamParser.h"
27
28 using namespace std;
29
30 int read_type; // 0 SingleRead, 1 SingleReadQ, 2 PairedEndRead, 3 PairedEndReadQ
31 READ_INT_TYPE N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data
32 HIT_INT_TYPE nHits; // # of hits
33 READ_INT_TYPE nUnique, nMulti, nIsoMulti;
34 char fn_list[STRLEN];
35 char groupF[STRLEN], tiF[STRLEN];
36 char datF[STRLEN], cntF[STRLEN];
37
38 GroupInfo gi;
39 Transcripts transcripts;
40
41 SamParser *parser;
42 ofstream hit_out;
43
44 int n_os; // number of ostreams
45 ostream *cat[3][2]; // cat : category  1-dim 0 N0 1 N1 2 N2; 2-dim  0 mate1 1 mate2
46 char readOutFs[3][2][STRLEN];
47
48 map<int, READ_INT_TYPE> counter;
49 map<int, READ_INT_TYPE>::iterator iter;
50
51 void init(const char* imdName, char alignFType, const char* alignF) {
52
53         char* aux = 0;
54         if (strcmp(fn_list, "")) aux = fn_list;
55         parser = new SamParser(alignFType, alignF, transcripts, aux);
56
57         memset(cat, 0, sizeof(cat));
58         memset(readOutFs, 0, sizeof(readOutFs));
59
60         int tmp_n_os = -1;
61
62         for (int i = 0; i < 3; i++) {
63                 genReadFileNames(imdName, i, read_type, n_os, readOutFs[i]);
64
65                 assert(tmp_n_os < 0 || tmp_n_os == n_os); tmp_n_os = n_os;
66
67                 for (int j = 0; j < n_os; j++)
68                         cat[i][j] = new ofstream(readOutFs[i][j]);
69         }
70
71         counter.clear();
72 }
73
74 //Do not allow duplicate for unalignable reads and supressed reads in SAM input
75 template<class ReadType, class HitType>
76 void parseIt(SamParser *parser) {
77         // record_val & record_read are copies of val & read for record purpose
78         int val, record_val;
79         ReadType read, record_read;
80         HitType hit;
81         HitContainer<HitType> hits;
82
83         nHits = 0;
84         nUnique = nMulti = nIsoMulti = 0;
85         memset(N, 0, sizeof(N));
86
87         READ_INT_TYPE cnt = 0;
88
89         record_val = -2; //indicate no recorded read now
90         while ((val = parser->parseNext(read, hit)) >= 0) {
91                 if (val >= 0 && val <= 2) {
92                         // flush out previous read's info if needed
93                         if (record_val >= 0) {
94                                 record_read.write(n_os, cat[record_val]);
95                                 ++N[record_val];
96                         }
97                         // flush out previous read's hits if the read is alignable reads
98                         if (record_val == 1) {
99                                 hits.updateRI();
100                                 nHits += hits.getNHits();
101                                 nMulti += hits.calcNumGeneMultiReads(gi);
102                                 nIsoMulti += hits.calcNumIsoformMultiReads();
103                                 hits.write(hit_out);
104
105                                 iter = counter.find(hits.getNHits());
106                                 if (iter != counter.end()) {
107                                         iter->second++;
108                                 }
109                                 else {
110                                         counter[hits.getNHits()] = 1;
111                                 }
112                         }
113
114                         hits.clear();
115                         record_val = val;
116                         record_read = read; // no pointer, thus safe
117                 }
118
119                 if (val == 1 || val == 5) {
120                         hits.push_back(hit);
121                 }
122
123                 ++cnt;
124                 if (verbose && (cnt % 1000000 == 0)) { cout<< "Parsed "<< cnt<< " entries"<< endl; }
125         }
126
127         if (record_val >= 0) {
128                 record_read.write(n_os, cat[record_val]);
129                 ++N[record_val];
130         }
131
132         if (record_val == 1) {
133                 hits.updateRI();
134                 nHits += hits.getNHits();
135                 nMulti += hits.calcNumGeneMultiReads(gi);
136                 nIsoMulti += hits.calcNumIsoformMultiReads();
137                 hits.write(hit_out);
138
139                 iter = counter.find(hits.getNHits());
140                 if (iter != counter.end()) {
141                         iter->second++;
142                 }
143                 else {
144                         counter[hits.getNHits()] = 1;
145                 }
146         }
147
148         nUnique = N[1] - nMulti;
149 }
150
151 void release() {
152         for (int i = 0; i < 3; i++) {
153                 for (int j = 0; j < n_os; j++) {
154                         ((ofstream*)cat[i][j])->close();
155                         delete cat[i][j];
156                 }
157                 if (N[i] > 0) continue;
158                 for (int j = 0; j < n_os; j++) {
159                         remove(readOutFs[i][j]); //delete if the file is empty
160                 }
161         }
162         delete parser;
163 }
164
165 int main(int argc, char* argv[]) {
166         bool quiet = false;
167
168         if (argc < 6) {
169                 printf("Usage : rsem-parse-alignments refName imdName statName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
170                 exit(-1);
171         }
172
173         strcpy(fn_list, "");
174         read_type = 0;
175         if (argc > 6) {
176                 for (int i = 6; i < argc; i++) {
177                         if (!strcmp(argv[i], "-t")) {
178                                 read_type = atoi(argv[i + 1]);
179                         }
180                         if (!strcmp(argv[i], "-l")) {
181                                 strcpy(fn_list, argv[i + 1]);
182                         }
183                         if (!strcmp(argv[i], "-tag")) {
184                                 SamParser::setReadTypeTag(argv[i + 1]);
185                         }
186                         if (!strcmp(argv[i], "-q")) { quiet = true; }
187                 }
188         }
189
190         verbose = !quiet;
191
192         sprintf(groupF, "%s.grp", argv[1]);
193         gi.load(groupF);
194         sprintf(tiF, "%s.ti", argv[1]);
195         transcripts.readFrom(tiF);
196
197         sprintf(datF, "%s.dat", argv[2]);
198         sprintf(cntF, "%s.cnt", argv[3]);
199
200         init(argv[2], argv[4][0], argv[5]);
201
202         hit_out.open(datF);
203
204         string firstLine(99, ' ');
205         firstLine.append(1, '\n');              //May be dangerous!
206         hit_out<<firstLine;
207
208         switch(read_type) {
209         case 0 : parseIt<SingleRead, SingleHit>(parser); break;
210         case 1 : parseIt<SingleReadQ, SingleHit>(parser); break;
211         case 2 : parseIt<PairedEndRead, PairedEndHit>(parser); break;
212         case 3 : parseIt<PairedEndReadQ, PairedEndHit>(parser); break;
213         }
214
215         hit_out.seekp(0, ios_base::beg);
216         hit_out<<N[1]<<" "<<nHits<<" "<<read_type;
217
218         hit_out.close();
219
220         //cntF for statistics of alignments file
221         ofstream fout(cntF);
222         fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
223         fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
224         fout<<nHits<<" "<<read_type<<endl;
225         fout<<"0\t"<<N[0]<<endl;
226         for (iter = counter.begin(); iter != counter.end(); iter++) {
227                 fout<<iter->first<<'\t'<<iter->second<<endl;
228         }
229         fout<<"Inf\t"<<N[2]<<endl;
230         fout.close();
231
232         release();
233
234         if (verbose) { printf("Done!\n"); }
235
236         return 0;
237 }