1 /* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT single read or paired-end read */
14 #include "SingleRead.h"
15 #include "SingleReadQ.h"
16 #include "PairedEndRead.h"
17 #include "PairedEndReadQ.h"
18 #include "SingleHit.h"
19 #include "PairedEndHit.h"
23 SamParser(char, const char*, const char* = 0);
28 * -1 : no more alignment
29 * 0 : new read , type 0
30 * 1 : new read , type 1 with alignment
31 * 2 : new read , type 2
32 * 4 : discard this read
33 * 5 : new alignment but same read
35 int parseNext(SingleRead&, SingleHit&);
36 int parseNext(SingleReadQ&, SingleHit&);
37 int parseNext(PairedEndRead&, PairedEndHit&);
38 int parseNext(PairedEndReadQ&, PairedEndHit&);
40 static void setReadTypeTag(const char* tag) {
50 static char rtTag[STRLEN];
53 int getDir(const bam1_t* b) {
54 return ((b->core.flag & 0x0010) ? -1 : 1);
57 std::string getName(const bam1_t* b) {
58 return std::string((char*)bam1_qname(b));
61 std::string getReadSeq(const bam1_t*);
62 std::string getQScore(const bam1_t*);
64 //0 ~ N0 1 ~ N1 2 ~ N2
65 int getReadType(const bam1_t*);
66 int getReadType(const bam1_t*, const bam1_t*); // for paired-end reads
69 char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
71 // aux, if not 0, points to the file name of fn_list
72 SamParser::SamParser(char inpType, const char* inpF, const char* aux) {
74 case 'b': sam_in = samopen(inpF, "rb", aux); break;
75 case 's': sam_in = samopen(inpF, "r", aux); break;
76 default: assert(false);
79 if (sam_in == 0) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
80 header = sam_in->header;
81 if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); }
87 SamParser::~SamParser() {
93 // If sam_read1 returns 0 , what does it mean?
94 //Assume b.core.tid is 0-based
95 int SamParser::parseNext(SingleRead& read, SingleHit& hit) {
96 int val; // return value
97 bool canR = (samread(sam_in, b) >= 0);
100 if (b->core.flag & 0x0001) { fprintf(stderr, "Find a paired end read in the file!\n"); exit(-1); }
101 //(b->core.flag & 0x0100) && && !(b->core.flag & 0x0004)
103 int readType = getReadType(b);
104 std::string name = getName(b);
106 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
108 read = SingleRead(name, getReadSeq(b));
114 hit = SingleHit(b->core.tid + 1, b->core.pos);
117 hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
124 int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) {
126 bool canR = (samread(sam_in, b) >= 0);
127 if (!canR) return -1;
129 if (b->core.flag & 0x0001) { fprintf(stderr, "Find a paired end read in the file!\n"); exit(-1); }
130 //assert(!(b->core.flag & 0x0001)); //(b->core.flag & 0x0100) && && !(b->core.flag & 0x0004)
132 int readType = getReadType(b);
133 std::string name = getName(b);
135 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
137 read = SingleReadQ(name, getReadSeq(b), getQScore(b));
143 hit = SingleHit(b->core.tid + 1, b->core.pos);
146 hit = SingleHit(-(b->core.tid + 1), header->target_len[b->core.tid] - b->core.pos - b->core.l_qseq);
153 //Assume whether aligned or not , two mates of paired-end reads are always get together
154 int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
156 bool canR = ((samread(sam_in, b) >= 0) && (samread(sam_in, b2) >= 0));
157 if (!canR) return -1;
159 if (!((b->core.flag & 0x0001) && (b2->core.flag & 0x0001))) {
160 fprintf(stderr, "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)\n");
163 //assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
165 bam1_t *mp1 = NULL, *mp2 = NULL;
167 if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) {
170 else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
173 else return 4; // If lose mate info, discard. is it necessary?
175 int readType = getReadType(mp1, mp2);
176 std::string name = getName(mp1);
178 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
180 SingleRead mate1(getName(mp1), getReadSeq(mp1));
181 SingleRead mate2(getName(mp2), getReadSeq(mp2));
182 read = PairedEndRead(mate1, mate2);
187 if (mp1->core.tid != mp2->core.tid) {
188 fprintf(stderr, "The two reads do not come from the same pair!");
191 //assert(mp1->core.tid == mp2->core.tid);
192 if (getDir(mp1) > 0) {
193 hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
196 hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
203 int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) {
205 bool canR = ((samread(sam_in, b) >= 0) && (samread(sam_in, b2) >= 0));
206 if (!canR) return -1;
208 if (!((b->core.flag & 0x0001) && (b2->core.flag & 0x0001))) {
209 fprintf(stderr, "One of the mate is not paired-end! (RSEM assumes the two mates of a paired-end read should be adjacent)\n");
212 //assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
214 bam1_t *mp1 = NULL, *mp2 = NULL;
216 if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) {
219 else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
224 int readType = getReadType(mp1, mp2);
225 std::string name = getName(mp1);
227 if (readType != 1 || (readType == 1 && read.getName().compare(name) != 0)) {
229 SingleReadQ mate1(getName(mp1), getReadSeq(mp1), getQScore(mp1));
230 SingleReadQ mate2(getName(mp2), getReadSeq(mp2), getQScore(mp2));
231 read = PairedEndReadQ(mate1, mate2);
236 if (mp1->core.tid != mp2->core.tid) {
237 fprintf(stderr, "The two reads do not come from the same pair!");
240 //assert(mp1->core.tid == mp2->core.tid);
241 if (getDir(mp1) > 0) {
242 hit = PairedEndHit(mp1->core.tid + 1, mp1->core.pos, mp2->core.pos + mp2->core.l_qseq - mp1->core.pos);
245 hit = PairedEndHit(-(mp1->core.tid + 1), header->target_len[mp1->core.tid] - mp1->core.pos - mp1->core.l_qseq, mp1->core.pos + mp1->core.l_qseq - mp2->core.pos);
252 inline std::string SamParser::getReadSeq(const bam1_t* b) {
253 uint8_t *p = bam1_seq(b);
254 std::string readseq = "";
258 for (int i = b->core.l_qseq - 1; i >= 0; i--) {
259 switch(bam1_seqi(p, i)) {
260 //case 0 : base = '='; break;
261 case 1 : base = 'T'; break;
262 case 2 : base = 'G'; break;
263 case 4 : base = 'C'; break;
264 case 8 : base = 'A'; break;
265 case 15 : base = 'N'; break;
266 default : assert(false);
268 readseq.append(1, base);
272 for (int i = 0; i < b->core.l_qseq; i++) {
273 switch(bam1_seqi(p, i)) {
274 //case 0 : base = '='; break;
275 case 1 : base = 'A'; break;
276 case 2 : base = 'C'; break;
277 case 4 : base = 'G'; break;
278 case 8 : base = 'T'; break;
279 case 15 : base = 'N'; break;
280 default : assert(false);
282 readseq.append(1, base);
289 inline std::string SamParser::getQScore(const bam1_t* b) {
290 uint8_t *p = bam1_qual(b);
291 std::string qscore = "";
294 for (int i = 0; i < b->core.l_qseq; i++) {
295 qscore.append(1, (char)(*p + 33));
300 p = p + b->core.l_qseq - 1;
301 for (int i = 0; i < b->core.l_qseq; i++) {
302 qscore.append(1, (char)(*p + 33));
310 //0 ~ N0 , 1 ~ N1, 2 ~ N2
311 inline int SamParser::getReadType(const bam1_t* b) {
312 if (!(b->core.flag & 0x0004)) return 1;
314 if (!strcmp(rtTag, "")) return 0;
316 uint8_t *p = bam_aux_get(b, rtTag);
317 if (p == NULL) return 0;
318 return (bam_aux2i(p) > 0 ? 2 : 0);
322 //For paired-end reads, do not print out type 2 reads
323 inline int SamParser::getReadType(const bam1_t* b, const bam1_t* b2) {
324 if ((b->core.flag & 0x0002) && (b2->core.flag & 0x0002)) return 1;
329 #endif /* SAMPARSER_H_ */