1 /* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT siheaderngle read or paired-end read */
15 #include "my_assert.h"
17 #include "SingleRead.h"
18 #include "SingleReadQ.h"
19 #include "PairedEndRead.h"
20 #include "PairedEndReadQ.h"
21 #include "SingleHit.h"
22 #include "PairedEndHit.h"
24 #include "Transcripts.h"
28 SamParser(char, const char*, Transcripts&, const char* = 0);
33 * -1 : no more alignment
34 * 0 : new read , type 0
35 * 1 : new read , type 1 with alignment
36 * 2 : new read , type 2
37 * 4 : discard this read
38 * 5 : new alignment but same read
40 int parseNext(SingleRead&, SingleHit&);
41 int parseNext(SingleReadQ&, SingleHit&);
42 int parseNext(PairedEndRead&, PairedEndHit&);
43 int parseNext(PairedEndReadQ&, PairedEndHit&);
45 static void setReadTypeTag(const char* tag) {
54 Transcripts& transcripts;
57 static char rtTag[STRLEN];
60 int getDir(const bam1_t* b) {
61 return ((b->core.flag & 0x0010) ? -1 : 1);
64 std::string getName(const bam1_t* b) {
65 return std::string((char*)bam1_qname(b));
68 std::string getReadSeq(const bam1_t*);
69 std::string getQScore(const bam1_t*);
71 //0 ~ N0 1 ~ N1 2 ~ N2
72 int getReadType(const bam1_t*);
73 int getReadType(const bam1_t*, const bam1_t*); // for paired-end reads
75 bool check(bam1_t *b) {
76 return (b->core.n_cigar == 1) && ((*bam1_cigar(b) & BAM_CIGAR_MASK) == BAM_CMATCH) && (b->core.l_qseq == (int32_t)(*bam1_cigar(b) >> BAM_CIGAR_SHIFT));
80 char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
82 // aux, if not 0, points to the file name of fn_list
83 SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux)
84 : transcripts(transcripts)
87 case 'b': sam_in = samopen(inpF, "rb", aux); break;
88 case 's': sam_in = samopen(inpF, "r", aux); break;
89 default: assert(false);
92 general_assert(sam_in != 0, "Cannot open " + cstrtos(inpF) + "! It may not exist.");
93 header = sam_in->header;
94 general_assert(header != 0, "Fail to parse sam header!");
96 transcripts.buildMappings(header->n_targets, header->target_name);
102 SamParser::~SamParser() {
108 // If sam_read1 returns 0 , what does it mean?
109 //Assume b.core.tid is 0-based
110 int SamParser::parseNext(SingleRead& read, SingleHit& hit) {
111 int val; // return value
112 bool canR = (samread(sam_in, b) >= 0);
113 if (!canR) return -1;
115 general_assert(!(b->core.flag & 0x0001), "Find a paired end read in the file!");
117 int readType = getReadType(b);
118 std::string name = getName(b);
120 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
122 read = SingleRead(name, getReadSeq(b));
127 if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
130 hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos);
133 hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
140 int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) {
142 bool canR = (samread(sam_in, b) >= 0);
143 if (!canR) return -1;
145 general_assert(!(b->core.flag & 0x0001), "Find a paired end read in the file!");
147 int readType = getReadType(b);
148 std::string name = getName(b);
150 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
152 read = SingleReadQ(name, getReadSeq(b), getQScore(b));
157 if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
160 hit = SingleHit(transcripts.getInternalSid(b->core.tid + 1), b->core.pos);
163 hit = SingleHit(-transcripts.getInternalSid(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
170 //Assume whether aligned or not , two mates of paired-end reads are always get together
171 int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
173 bool canR = ((samread(sam_in, b) >= 0) && (samread(sam_in, b2) >= 0));
174 if (!canR) return -1;
176 general_assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001), \
177 "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)");
179 bam1_t *mp1 = NULL, *mp2 = NULL;
181 // If lose mate info, discard. is it necessary?
182 if (!((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) && !((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))) return 4;
183 // If only one mate is mapped, discard
184 if (((b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) || (!(b->core.flag & 0x0004) && (b2->core.flag & 0x0004))) return 4;
186 if (b->core.flag & 0x0040) {
193 int readType = getReadType(mp1, mp2);
194 std::string name = getName(mp1);
196 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
198 SingleRead mate1(getName(mp1), getReadSeq(mp1));
199 SingleRead mate2(getName(mp2), getReadSeq(mp2));
200 read = PairedEndRead(mate1, mate2);
205 if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
207 if (mp1->core.tid != mp2->core.tid) {
208 fprintf(stderr, "The two reads do not come from the same pair!");
211 //assert(mp1->core.tid == mp2->core.tid);
212 if (getDir(mp1) > 0) {
213 hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
216 hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
223 int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) {
225 bool canR = ((samread(sam_in, b) >= 0) && (samread(sam_in, b2) >= 0));
226 if (!canR) return -1;
228 general_assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001), \
229 "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)");
231 bam1_t *mp1 = NULL, *mp2 = NULL;
233 // If lose mate info, discard. is it necessary?
234 if (!((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) && !((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))) return 4;
235 // If only one mate is mapped, discard
236 if (((b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) || (!(b->core.flag & 0x0004) && (b2->core.flag & 0x0004))) return 4;
238 if (b->core.flag & 0x0040) {
245 int readType = getReadType(mp1, mp2);
246 std::string name = getName(mp1);
248 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
250 SingleReadQ mate1(getName(mp1), getReadSeq(mp1), getQScore(mp1));
251 SingleReadQ mate2(getName(mp2), getReadSeq(mp2), getQScore(mp2));
252 read = PairedEndReadQ(mate1, mate2);
257 if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
259 if (mp1->core.tid != mp2->core.tid) {
260 fprintf(stderr, "The two reads do not come from the same pair!");
263 //assert(mp1->core.tid == mp2->core.tid);
264 if (getDir(mp1) > 0) {
265 hit = PairedEndHit(transcripts.getInternalSid(mp1->core.tid + 1), mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
268 hit = PairedEndHit(-transcripts.getInternalSid(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
275 inline std::string SamParser::getReadSeq(const bam1_t* b) {
276 uint8_t *p = bam1_seq(b);
277 std::string readseq = "";
281 for (int i = b->core.l_qseq - 1; i >= 0; i--) {
282 switch(bam1_seqi(p, i)) {
283 //case 0 : base = '='; break;
284 case 1 : base = 'T'; break;
285 case 2 : base = 'G'; break;
286 case 4 : base = 'C'; break;
287 case 8 : base = 'A'; break;
288 case 15 : base = 'N'; break;
289 default : assert(false);
291 readseq.append(1, base);
295 for (int i = 0; i < b->core.l_qseq; i++) {
296 switch(bam1_seqi(p, i)) {
297 //case 0 : base = '='; break;
298 case 1 : base = 'A'; break;
299 case 2 : base = 'C'; break;
300 case 4 : base = 'G'; break;
301 case 8 : base = 'T'; break;
302 case 15 : base = 'N'; break;
303 default : assert(false);
305 readseq.append(1, base);
312 inline std::string SamParser::getQScore(const bam1_t* b) {
313 uint8_t *p = bam1_qual(b);
314 std::string qscore = "";
317 for (int i = 0; i < b->core.l_qseq; i++) {
318 qscore.append(1, (char)(*p + 33));
323 p = p + b->core.l_qseq - 1;
324 for (int i = 0; i < b->core.l_qseq; i++) {
325 qscore.append(1, (char)(*p + 33));
333 //0 ~ N0 , 1 ~ N1, 2 ~ N2
334 inline int SamParser::getReadType(const bam1_t* b) {
335 if (!(b->core.flag & 0x0004)) return 1;
337 if (!strcmp(rtTag, "")) return 0;
339 uint8_t *p = bam_aux_get(b, rtTag);
340 if (p == NULL) return 0;
341 return (bam_aux2i(p) > 0 ? 2 : 0);
344 //For paired-end reads, do not print out type 2 reads
345 inline int SamParser::getReadType(const bam1_t* b, const bam1_t* b2) {
346 if (!(b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) return 1;
348 if (!strcmp(rtTag, "")) return 0;
350 uint8_t *p = bam_aux_get(b, rtTag);
351 if (p != NULL && bam_aux2i(p) > 0) return 2;
353 p = bam_aux_get(b2, rtTag);
354 if (p != NULL && bam_aux2i(p) > 0) return 2;
359 #endif /* SAMPARSER_H_ */