1 #ifndef BAMCONVERTER_H_
2 #define BAMCONVERTER_H_
13 #include "sam_rsem_aux.h"
14 #include "sam_rsem_cvt.h"
17 #include "my_assert.h"
19 #include "Transcript.h"
20 #include "Transcripts.h"
24 BamConverter(const char*, const char*, const char*, Transcripts&);
30 Transcripts& transcripts;
32 std::map<std::string, int> refmap;
33 std::map<std::string, int>::iterator iter;
35 CollapseMap collapseMap;
37 void convert(bam1_t*, const Transcript&);
39 void writeCollapsedLines();
40 void flipSeq(uint8_t*, int);
41 void flipQual(uint8_t*, int);
42 void addXSTag(bam1_t*, const Transcript&);
45 BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_list, Transcripts& transcripts)
46 : transcripts(transcripts)
48 general_assert(transcripts.getType() == 0, "Genome information is not provided! RSEM cannot convert the transcript bam file!");
50 in = samopen(inpF, "rb", NULL);
53 transcripts.buildMappings(in->header->n_targets, in->header->target_name);
55 bam_header_t *out_header = sam_header_read2(chr_list);
58 for (int i = 0; i < out_header->n_targets; i++) {
59 refmap[out_header->target_name[i]] = i;
62 if (in->header->l_text > 0) {
63 char comment[] = "@CO\tThis BAM file is processed by rsem-tbam2gam to convert from transcript coordinates into genomic coordinates.\n";
64 int comment_len = strlen(comment);
66 //Filter @SQ fields if the BAM file is user provided
67 char *text = in->header->text;
68 int l_text = in->header->l_text;
69 char *new_text = new char[l_text + comment_len];
71 while (pos < l_text) {
72 if ((pos + 2 < l_text) && (text[pos] == '@') && (text[pos + 1] == 'S') && (text[pos + 2] == 'Q')) {
74 while (pos < l_text && text[pos] != '\n') ++pos;
76 else new_text[s++] = text[pos];
79 strncpy(new_text + s, comment, comment_len);
82 append_header_text(out_header, new_text, s);
86 out = samopen(outF, "wb", out_header);
89 bam_header_destroy(out_header);
92 BamConverter::~BamConverter() {
97 void BamConverter::process() {
100 bool isPaired = false;
102 HIT_INT_TYPE cnt = 0;
105 b = bam_init1(); b2 = bam_init1();
107 while (samread(in, b) >= 0) {
109 isPaired = (b->core.flag & 0x0001) > 0;
111 assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001));
112 assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
113 assert(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)));
117 if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
119 // at least one segment is not properly mapped
120 bool notgood = (b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004));
122 if (isPaired && notgood) general_assert((b->core.flag & 0x0004) && (b2->core.flag & 0x0004), cstrtos(bam1_qname(b)) + "'s two mates are not all marked as unalignable!");
127 assert(b->core.tid == b2->core.tid);
128 general_assert(b->core.tid == b2->core.tid, cstrtos(bam1_qname(b)) + "'s two mates are aligned to two different transcripts!");
129 if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
130 bam1_t *tmp = b; b = b2; b2 = tmp;
134 const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
136 convert(b, transcript);
138 convert(b2, transcript);
139 b->core.mpos = b2->core.pos;
140 b2->core.mpos = b->core.pos;
143 if (cqname != bam1_qname(b)) {
144 writeCollapsedLines();
145 cqname = bam1_qname(b);
146 collapseMap.init(isPaired);
149 uint8_t *p = bam_aux_get(b, "ZW");
150 float prb = (p != NULL? bam_aux2f(p) : 1.0);
151 collapseMap.insert(b, b2, prb);
154 assert(cqname != bam1_qname(b));
156 writeCollapsedLines();
157 cqname = bam1_qname(b);
158 collapseMap.init(isPaired);
161 if (isPaired) samwrite(out, b2);
165 writeCollapsedLines();
170 if (cnt >= 1000000) printf("\n");
173 void BamConverter::convert(bam1_t* b, const Transcript& transcript) {
174 int pos = b->core.pos;
175 int readlen = b->core.l_qseq;
177 general_assert(readlen > 0, "One alignment line has SEQ field as *. RSEM does not support this currently!");
179 iter = refmap.find(transcript.getSeqName());
180 assert(iter != refmap.end());
181 b->core.tid = iter->second;
182 if (b->core.flag & 0x0001) { b->core.mtid = b->core.tid; }
183 b->core.qual = 255; // set to not available temporarily
185 if (transcript.getStrand() == '-') {
186 b->core.flag ^= 0x0010;
187 if (b->core.flag & 0x0001) {
188 b->core.flag ^= 0x0020;
189 b->core.isize = -b->core.isize;
191 flipSeq(bam1_seq(b), readlen);
192 flipQual(bam1_qual(b), readlen);
195 std::vector<uint32_t> data;
198 int core_pos, core_n_cigar;
199 tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data);
200 assert(core_pos >= 0);
202 int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4;
203 b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len;
205 uint8_t* pt = b->data + b->core.l_qname;
206 memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
207 for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; }
209 b->core.pos = core_pos;
210 b->core.n_cigar = core_n_cigar;
211 b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
213 addXSTag(b, transcript); // check if need to add XS tag, if need, add it
216 inline void BamConverter::writeCollapsedLines() {
217 bam1_t *tmp_b = NULL,*tmp_b2 = NULL;
222 if (!collapseMap.empty(isPaired)) {
223 while (collapseMap.next(tmp_b, tmp_b2, prb)) {
224 p = bam_aux_get(tmp_b, "ZW");
226 memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
227 tmp_b->core.qual = getMAPQ(prb);
229 // otherwise, just use the MAPQ score of the orignal alignment
231 samwrite(out, tmp_b);
233 if (p != NULL) memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
234 tmp_b2->core.qual = tmp_b->core.qual;
235 samwrite(out, tmp_b2);
238 if (isPaired) bam_destroy1(tmp_b2);
243 inline void BamConverter::flipSeq(uint8_t* s, int readlen) {
245 std::vector<uint8_t> seq;
249 for (int i = 0; i < readlen; i++) {
250 switch (bam1_seqi(s, readlen - i - 1)) {
251 case 1: base = 8; break;
252 case 2: base = 4; break;
253 case 4: base = 2; break;
254 case 8: base = 1; break;
255 case 15: base = 15; break;
256 default: assert(false);
258 code |= base << (4 * (1 - i % 2));
259 if (i % 2 == 1) { seq.push_back(code); code = 0; }
261 if (readlen % 2 == 1) { seq.push_back(code); }
263 for (int i = 0; i < (int)seq.size(); i++) s[i] = seq[i];
266 inline void BamConverter::flipQual(uint8_t* q, int readlen) {
267 int32_t mid = readlen / 2;
269 for (int i = 0; i < mid; i++) {
270 tmp = q[i]; q[i] = q[readlen - i - 1]; q[readlen -i -1] = tmp;
274 inline void BamConverter::addXSTag(bam1_t* b, const Transcript& transcript) {
275 uint32_t* p = bam1_cigar(b);
277 for (int i = 0; i < (int)b->core.n_cigar; i++)
278 if ((*(p + i) & BAM_CIGAR_MASK) == BAM_CREF_SKIP) { hasN = true; break; }
280 char strand = transcript.getStrand();
281 bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand);
284 #endif /* BAMCONVERTER_H_ */