]> git.donarmstrong.com Git - rsem.git/blob - BamWriter.h
Fixed a bug in perl scripts for printing error messages
[rsem.git] / BamWriter.h
1 #ifndef BAMWRITER_H_
2 #define BAMWRITER_H_
3
4 #include<cmath>
5 #include<cstdio>
6 #include<cstring>
7 #include<cassert>
8 #include<string>
9 #include<sstream>
10 #include<iostream>
11
12 #include <stdint.h>
13 #include "sam/bam.h"
14 #include "sam/sam.h"
15 #include "sam_rsem_aux.h"
16 #include "sam_rsem_cvt.h"
17
18 #include "utils.h"
19
20 #include "SingleHit.h"
21 #include "PairedEndHit.h"
22
23 #include "HitWrapper.h"
24 #include "Transcript.h"
25 #include "Transcripts.h"
26
27 class BamWriter {
28 public:
29         BamWriter(char, const char*, const char*, const char*, Transcripts&);
30         ~BamWriter();
31
32         void work(HitWrapper<SingleHit>);
33         void work(HitWrapper<PairedEndHit>);
34 private:
35         samfile_t *in, *out;
36         Transcripts& transcripts;
37
38         //convert bam1_t
39         void convert(bam1_t*, double);
40 };
41
42 //fn_list can be NULL
43 BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const char* outF, Transcripts& transcripts)
44         : transcripts(transcripts)
45 {
46         switch(inpType) {
47         case 's': in = samopen(inpF, "r", fn_list); break;
48         case 'b': in = samopen(inpF, "rb", fn_list); break;
49         default: assert(false);
50         }
51         assert(in != 0);
52
53         //build mappings from external sid to internal sid
54         transcripts.buildMappings(in->header->n_targets, in->header->target_name);
55
56         //generate output's header
57         bam_header_t *out_header = bam_header_dwt(in->header);
58         for (int i = 0; i < out_header->n_targets; i++) {
59                 out_header->target_len[i] = transcripts.getTranscriptViaEid(i + 1).getLength();  // transcript length without poly(A) tail
60         }
61
62         std::ostringstream strout;
63         strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n";
64         std::string content = strout.str();
65         append_header_text(out_header, content.c_str(), content.length());
66
67         out = samopen(outF, "wb", out_header);
68         assert(out != 0);
69
70         bam_header_destroy(out_header);
71 }
72
73 BamWriter::~BamWriter() {
74         samclose(in);
75         samclose(out);
76 }
77
78 void BamWriter::work(HitWrapper<SingleHit> wrapper) {
79         bam1_t *b;
80         SingleHit *hit;
81
82         HIT_INT_TYPE cnt = 0;
83
84         b = bam_init1();
85
86         while (samread(in, b) >= 0) {
87                 ++cnt;
88                 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
89
90                 if (!(b->core.flag & 0x0004)) {
91                   hit = wrapper.getNextHit();
92                   assert(hit != NULL);
93
94                   assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
95                   convert(b, hit->getConPrb());
96                 }
97
98                 //if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
99                 samwrite(out, b);
100         }
101
102         assert(wrapper.getNextHit() == NULL);
103
104         bam_destroy1(b);
105         if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; }
106 }
107
108 void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
109         bam1_t *b, *b2;
110         PairedEndHit *hit;
111
112         int cnt = 0;
113
114         b = bam_init1();
115         b2 = bam_init1();
116
117         while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
118                 cnt += 2;
119                 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; }
120
121                 assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
122                 assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080) || (b->core.flag & 0x0080) && (b2->core.flag & 0x0040));
123
124                 //unalignable reads, skip               
125                 bool notgood = (b->core.flag & 0x0004) || (b2->core.flag & 0x0004);
126
127                 if (!notgood) {
128                   //swap if b is mate 2
129                   if (b->core.flag & 0x0080) {
130                     assert(b2->core.flag & 0x0040);
131                     bam1_t *tmp = b;
132                     b = b2; b2 = tmp;
133                   }
134
135                   hit = wrapper.getNextHit();
136                   assert(hit != NULL);
137
138                   assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
139                   assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
140
141                   convert(b, hit->getConPrb());
142                   convert(b2, hit->getConPrb());
143
144                   b->core.mpos = b2->core.pos;
145                   b2->core.mpos = b->core.pos;
146                 }
147
148                 /*
149                 else {
150                   // if only one mate can be aligned, mask it as unaligned and put an additional tag Z0:A:!
151                   char exclamation = '!';
152                   if (!(b->core.flag & 0x0004)) { 
153                     b->core.flag |= 0x0004;
154                     bam_aux_append(b, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
155                   }
156                   if (!(b2->core.flag & 0x0004)) {
157                     b2->core.flag |= 0x0004;
158                     bam_aux_append(b2, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
159                   }
160                 }
161                 */
162
163                 samwrite(out, b);
164                 samwrite(out, b2);
165         }
166
167         assert(wrapper.getNextHit() == NULL);
168
169         bam_destroy1(b);
170         bam_destroy1(b2);
171
172         if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; }
173 }
174
175 void BamWriter::convert(bam1_t *b, double prb) {
176         const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
177
178         int pos = b->core.pos;
179         int readlen = b->core.l_qseq;
180
181         std::vector<uint32_t> data;
182         data.clear();
183
184         int core_pos, core_n_cigar;
185         std::vector<Interval> vec;
186         vec.assign(1, Interval(1, transcript.getLength()));
187         // make an artificial chromosome coordinates for the transcript to get new CIGAR strings
188         tr2chr(Transcript("", "", "", '+', vec, ""), pos + 1, pos + readlen, core_pos, core_n_cigar, data);
189         assert(core_pos >= 0);
190
191         int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4;
192         b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len;
193         expand_data_size(b);
194         uint8_t* pt = b->data + b->core.l_qname;
195         memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
196         for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; }
197
198         b->core.pos = core_pos;
199         b->core.n_cigar = core_n_cigar;
200         b->core.qual = getMAPQ(prb);
201         b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
202
203         float val = (float)prb;
204         bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val);
205 }
206
207 #endif /* BAMWRITER_H_ */