]> git.donarmstrong.com Git - rsem.git/blob - BamWriter.h
ff11397e3539d8662f31e85d6a9e8d09309e51f8
[rsem.git] / BamWriter.h
1 #ifndef BAMWRITER_H_
2 #define BAMWRITER_H_
3
4 #include<cmath>
5 #include<cstdio>
6 #include<cstring>
7 #include<cassert>
8 #include<string>
9 #include<sstream>
10
11 #include <stdint.h>
12 #include "sam/bam.h"
13 #include "sam/sam.h"
14 #include "sam_rsem_aux.h"
15 #include "sam_rsem_cvt.h"
16
17 #include "SingleHit.h"
18 #include "PairedEndHit.h"
19
20 #include "HitWrapper.h"
21 #include "Transcript.h"
22 #include "Transcripts.h"
23
24 class BamWriter {
25 public:
26         BamWriter(char, const char*, const char*, const char*, Transcripts&);
27         ~BamWriter();
28
29         void work(HitWrapper<SingleHit>);
30         void work(HitWrapper<PairedEndHit>);
31 private:
32         samfile_t *in, *out;
33         Transcripts& transcripts;
34
35         //convert bam1_t
36         void convert(bam1_t*, double);
37 };
38
39 //fn_list can be NULL
40 BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const char* outF, Transcripts& transcripts)
41         : transcripts(transcripts)
42 {
43         switch(inpType) {
44         case 's': in = samopen(inpF, "r", fn_list); break;
45         case 'b': in = samopen(inpF, "rb", fn_list); break;
46         default: assert(false);
47         }
48         assert(in != 0);
49
50         //generate output's header
51         bam_header_t *out_header = bam_header_dwt(in->header);
52
53         if (out_header->n_targets != transcripts.getM()) {
54                 fprintf(stderr, "Number of reference sequences recorded in the header is not correct! The header contains %d sequences while there should be %d sequences\n", out_header->n_targets, transcripts.getM());
55                 exit(-1);
56         }
57
58         for (int i = 0; i < out_header->n_targets; i++) {
59                 const Transcript& transcript = transcripts.getTranscriptAt(i + 1);
60                 if (out_header->target_name[i] != transcript.getTranscriptID()) {
61                         fprintf(stderr, "Reference sequence %d's name recorded in the header is not correct! \n", i);
62                         fprintf(stderr, "Name in the header: %s\n", out_header->target_name[i]);
63                         fprintf(stderr, "Should be: %s\n", transcript.getTranscriptID().c_str());
64                         exit(-1);
65                 }
66                 out_header->target_len[i] = transcript.getLength();  // transcript length without poly(A) tail
67         }
68
69         std::ostringstream strout;
70         strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n";
71         std::string content = strout.str();
72         append_header_text(out_header, content.c_str(), content.length());
73
74         out = samopen(outF, "wb", out_header);
75         assert(out != 0);
76
77         bam_header_destroy(out_header);
78 }
79
80 BamWriter::~BamWriter() {
81         samclose(in);
82         samclose(out);
83 }
84
85 void BamWriter::work(HitWrapper<SingleHit> wrapper) {
86         bam1_t *b;
87         SingleHit *hit;
88
89         int cnt = 0;
90
91         b = bam_init1();
92
93         while (samread(in, b) >= 0) {
94                 ++cnt;
95                 if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
96
97                 if (b->core.flag & 0x0004) continue;
98
99                 hit = wrapper.getNextHit();
100                 assert(hit != NULL);
101
102                 assert(b->core.tid + 1 == hit->getSid());
103                 convert(b, hit->getConPrb());
104                 if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
105         }
106
107         assert(wrapper.getNextHit() == NULL);
108
109         bam_destroy1(b);
110         if (verbose) { printf("Bam output file is generated!\n"); }
111 }
112
113 void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
114         bam1_t *b, *b2;
115         PairedEndHit *hit;
116
117         int cnt = 0;
118
119         b = bam_init1();
120         b2 = bam_init1();
121
122         while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
123                 cnt += 2;
124                 if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
125
126                 if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue;
127
128                 //swap if b is mate 2
129                 if (b->core.flag & 0x0080) {
130                         assert(b2->core.flag & 0x0040);
131                         bam1_t *tmp = b;
132                         b = b2; b2 = tmp;
133                 }
134
135                 hit = wrapper.getNextHit();
136                 assert(hit != NULL);
137
138                 assert(b->core.tid + 1 == hit->getSid());
139                 assert(b2->core.tid + 1 == hit->getSid());
140
141                 convert(b, hit->getConPrb());
142                 convert(b2, hit->getConPrb());
143
144                 b->core.mpos = b2->core.pos;
145                 b2->core.mpos = b->core.pos;
146
147                 if (b->core.qual > 0) {
148                         samwrite(out, b);
149                         samwrite(out, b2);
150                 }
151         }
152
153         assert(wrapper.getNextHit() == NULL);
154
155         bam_destroy1(b);
156         bam_destroy1(b2);
157
158         if (verbose) { printf("Bam output file is generated!\n"); }
159 }
160
161 void BamWriter::convert(bam1_t *b, double prb) {
162         int sid = b->core.tid + 1;
163         const Transcript& transcript = transcripts.getTranscriptAt(sid);
164
165         int pos = b->core.pos;
166         int readlen = b->core.l_qseq;
167
168         std::vector<uint32_t> data;
169         data.clear();
170
171         int core_pos, core_n_cigar;
172         std::vector<Interval> vec;
173         vec.assign(1, Interval(1, transcript.getLength()));
174         // make an artificial chromosome coordinates for the transcript to get new CIGAR strings
175         tr2chr(Transcript("", "", "", '+', vec, ""), pos + 1, pos + readlen, core_pos, core_n_cigar, data);
176         assert(core_pos >= 0);
177
178         int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4;
179         b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len;
180         expand_data_size(b);
181         uint8_t* pt = b->data + b->core.l_qname;
182         memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
183         for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; }
184
185         b->core.pos = core_pos;
186         b->core.n_cigar = core_n_cigar;
187         b->core.qual = getMAPQ(prb);
188         b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
189
190         float val = (float)prb;
191         bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val);
192 }
193
194 #endif /* BAMWRITER_H_ */