1 /* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT siheaderngle read or paired-end read */
19 #include "SingleRead.h"
20 #include "SingleReadQ.h"
21 #include "PairedEndRead.h"
22 #include "PairedEndReadQ.h"
23 #include "SingleHit.h"
24 #include "PairedEndHit.h"
28 SamParser(char, const char*, Refs&, const char* = 0);
33 * -1 : no more alignment
34 * 0 : new read , type 0
35 * 1 : new read , type 1 with alignment
36 * 2 : new read , type 2
37 * 4 : discard this read
38 * 5 : new alignment but same read
40 int parseNext(SingleRead&, SingleHit&);
41 int parseNext(SingleReadQ&, SingleHit&);
42 int parseNext(PairedEndRead&, PairedEndHit&);
43 int parseNext(PairedEndReadQ&, PairedEndHit&);
45 static void setReadTypeTag(const char* tag) {
56 static char rtTag[STRLEN];
59 int getDir(const bam1_t* b) {
60 return ((b->core.flag & 0x0010) ? -1 : 1);
63 std::string getName(const bam1_t* b) {
64 return std::string((char*)bam1_qname(b));
67 std::string getReadSeq(const bam1_t*);
68 std::string getQScore(const bam1_t*);
70 //0 ~ N0 1 ~ N1 2 ~ N2
71 int getReadType(const bam1_t*);
72 int getReadType(const bam1_t*, const bam1_t*); // for paired-end reads
74 bool check(bam1_t *b) {
75 return (b->core.n_cigar == 1) && ((*bam1_cigar(b) & BAM_CIGAR_MASK) == BAM_CMATCH) && (b->core.l_qseq == (int32_t)(*bam1_cigar(b) >> BAM_CIGAR_SHIFT));
79 char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
81 // aux, if not 0, points to the file name of fn_list
82 SamParser::SamParser(char inpType, const char* inpF, Refs& refs, const char* aux) {
84 case 'b': sam_in = samopen(inpF, "rb", aux); break;
85 case 's': sam_in = samopen(inpF, "r", aux); break;
86 default: assert(false);
89 if (sam_in == 0) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
90 header = sam_in->header;
91 if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); }
93 // Check if the reference used for aligner is the transcript set RSEM generated
94 if (refs.getM() != header->n_targets) {
95 fprintf(stderr, "Number of transcripts does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
98 for (int i = 0; i < header->n_targets; i++) {
99 const RefSeq& refseq = refs.getRef(i + 1);
100 // If update int to long, chance the (int) conversion
101 if (refseq.getName().compare(header->target_name[i]) != 0 || refseq.getTotLen() != (int)header->target_len[i]) {
102 fprintf(stderr, "Transcript information does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
111 SamParser::~SamParser() {
117 // If sam_read1 returns 0 , what does it mean?
118 //Assume b.core.tid is 0-based
119 int SamParser::parseNext(SingleRead& read, SingleHit& hit) {
120 int val; // return value
121 bool canR = (samread(sam_in, b) >= 0);
122 if (!canR) return -1;
124 if (b->core.flag & 0x0001) { fprintf(stderr, "Find a paired end read in the file!\n"); exit(-1); }
125 //(b->core.flag & 0x0100) && && !(b->core.flag & 0x0004)
127 int readType = getReadType(b);
128 std::string name = getName(b);
130 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
132 read = SingleRead(name, getReadSeq(b));
137 if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
140 hit = SingleHit(b->core.tid + 1, b->core.pos);
143 hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
150 int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) {
152 bool canR = (samread(sam_in, b) >= 0);
153 if (!canR) return -1;
155 if (b->core.flag & 0x0001) { fprintf(stderr, "Find a paired end read in the file!\n"); exit(-1); }
156 //assert(!(b->core.flag & 0x0001)); //(b->core.flag & 0x0100) && && !(b->core.flag & 0x0004)
158 int readType = getReadType(b);
159 std::string name = getName(b);
161 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
163 read = SingleReadQ(name, getReadSeq(b), getQScore(b));
168 if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
171 hit = SingleHit(b->core.tid + 1, b->core.pos);
174 hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
181 //Assume whether aligned or not , two mates of paired-end reads are always get together
182 int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
184 bool canR = ((samread(sam_in, b) >= 0) && (samread(sam_in, b2) >= 0));
185 if (!canR) return -1;
187 if (!((b->core.flag & 0x0001) && (b2->core.flag & 0x0001))) {
188 fprintf(stderr, "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)\n");
191 //assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
193 bam1_t *mp1 = NULL, *mp2 = NULL;
195 if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) {
198 else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
201 else return 4; // If lose mate info, discard. is it necessary?
203 int readType = getReadType(mp1, mp2);
204 std::string name = getName(mp1);
206 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
208 SingleRead mate1(getName(mp1), getReadSeq(mp1));
209 SingleRead mate2(getName(mp2), getReadSeq(mp2));
210 read = PairedEndRead(mate1, mate2);
215 if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
217 if (mp1->core.tid != mp2->core.tid) {
218 fprintf(stderr, "The two reads do not come from the same pair!");
221 //assert(mp1->core.tid == mp2->core.tid);
222 if (getDir(mp1) > 0) {
223 hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
226 hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
233 int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) {
235 bool canR = ((samread(sam_in, b) >= 0) && (samread(sam_in, b2) >= 0));
236 if (!canR) return -1;
238 if (!((b->core.flag & 0x0001) && (b2->core.flag & 0x0001))) {
239 fprintf(stderr, "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)\n");
242 //assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
244 bam1_t *mp1 = NULL, *mp2 = NULL;
246 if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) {
249 else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
254 int readType = getReadType(mp1, mp2);
255 std::string name = getName(mp1);
257 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
259 SingleReadQ mate1(getName(mp1), getReadSeq(mp1), getQScore(mp1));
260 SingleReadQ mate2(getName(mp2), getReadSeq(mp2), getQScore(mp2));
261 read = PairedEndReadQ(mate1, mate2);
266 if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
268 if (mp1->core.tid != mp2->core.tid) {
269 fprintf(stderr, "The two reads do not come from the same pair!");
272 //assert(mp1->core.tid == mp2->core.tid);
273 if (getDir(mp1) > 0) {
274 hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
277 hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
284 inline std::string SamParser::getReadSeq(const bam1_t* b) {
285 uint8_t *p = bam1_seq(b);
286 std::string readseq = "";
290 for (int i = b->core.l_qseq - 1; i >= 0; i--) {
291 switch(bam1_seqi(p, i)) {
292 //case 0 : base = '='; break;
293 case 1 : base = 'T'; break;
294 case 2 : base = 'G'; break;
295 case 4 : base = 'C'; break;
296 case 8 : base = 'A'; break;
297 case 15 : base = 'N'; break;
298 default : assert(false);
300 readseq.append(1, base);
304 for (int i = 0; i < b->core.l_qseq; i++) {
305 switch(bam1_seqi(p, i)) {
306 //case 0 : base = '='; break;
307 case 1 : base = 'A'; break;
308 case 2 : base = 'C'; break;
309 case 4 : base = 'G'; break;
310 case 8 : base = 'T'; break;
311 case 15 : base = 'N'; break;
312 default : assert(false);
314 readseq.append(1, base);
321 inline std::string SamParser::getQScore(const bam1_t* b) {
322 uint8_t *p = bam1_qual(b);
323 std::string qscore = "";
326 for (int i = 0; i < b->core.l_qseq; i++) {
327 qscore.append(1, (char)(*p + 33));
332 p = p + b->core.l_qseq - 1;
333 for (int i = 0; i < b->core.l_qseq; i++) {
334 qscore.append(1, (char)(*p + 33));
342 //0 ~ N0 , 1 ~ N1, 2 ~ N2
343 inline int SamParser::getReadType(const bam1_t* b) {
344 if (!(b->core.flag & 0x0004)) return 1;
346 if (!strcmp(rtTag, "")) return 0;
348 uint8_t *p = bam_aux_get(b, rtTag);
349 if (p == NULL) return 0;
350 return (bam_aux2i(p) > 0 ? 2 : 0);
354 //For paired-end reads, do not print out type 2 reads
355 inline int SamParser::getReadType(const bam1_t* b, const bam1_t* b2) {
356 if ((b->core.flag & 0x0002) && (b2->core.flag & 0x0002)) return 1;
361 #endif /* SAMPARSER_H_ */