]> git.donarmstrong.com Git - rsem.git/blob - BamWriter.h
Modified build rules for sam/libbam.a to enable parallel build
[rsem.git] / BamWriter.h
1 #ifndef BAMWRITER_H_
2 #define BAMWRITER_H_
3
4 #include<cmath>
5 #include<cstdio>
6 #include<cstring>
7 #include<cassert>
8 #include<string>
9 #include<sstream>
10
11 #include <stdint.h>
12 #include "sam/bam.h"
13 #include "sam/sam.h"
14 #include "sam_rsem_aux.h"
15 #include "sam_rsem_cvt.h"
16
17 #include "SingleHit.h"
18 #include "PairedEndHit.h"
19
20 #include "HitWrapper.h"
21 #include "Transcript.h"
22 #include "Transcripts.h"
23
24 class BamWriter {
25 public:
26         BamWriter(char, const char*, const char*, const char*, Transcripts&);
27         ~BamWriter();
28
29         void work(HitWrapper<SingleHit>);
30         void work(HitWrapper<PairedEndHit>);
31 private:
32         samfile_t *in, *out;
33         Transcripts& transcripts;
34
35         //convert bam1_t
36         void convert(bam1_t*, double);
37 };
38
39 //fn_list can be NULL
40 BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const char* outF, Transcripts& transcripts)
41         : transcripts(transcripts)
42 {
43         switch(inpType) {
44         case 's': in = samopen(inpF, "r", fn_list); break;
45         case 'b': in = samopen(inpF, "rb", fn_list); break;
46         default: assert(false);
47         }
48         assert(in != 0);
49
50         //build mappings from external sid to internal sid
51         transcripts.buildMappings(in->header->n_targets, in->header->target_name);
52
53         //generate output's header
54         bam_header_t *out_header = bam_header_dwt(in->header);
55         for (int i = 0; i < out_header->n_targets; i++) {
56                 out_header->target_len[i] = transcripts.getTranscriptViaEid(i + 1).getLength();  // transcript length without poly(A) tail
57         }
58
59         std::ostringstream strout;
60         strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n";
61         std::string content = strout.str();
62         append_header_text(out_header, content.c_str(), content.length());
63
64         out = samopen(outF, "wb", out_header);
65         assert(out != 0);
66
67         bam_header_destroy(out_header);
68 }
69
70 BamWriter::~BamWriter() {
71         samclose(in);
72         samclose(out);
73 }
74
75 void BamWriter::work(HitWrapper<SingleHit> wrapper) {
76         bam1_t *b;
77         SingleHit *hit;
78
79         int cnt = 0;
80
81         b = bam_init1();
82
83         while (samread(in, b) >= 0) {
84                 ++cnt;
85                 if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
86
87                 if (b->core.flag & 0x0004) continue;
88
89                 hit = wrapper.getNextHit();
90                 assert(hit != NULL);
91
92                 assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
93                 convert(b, hit->getConPrb());
94                 if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
95         }
96
97         assert(wrapper.getNextHit() == NULL);
98
99         bam_destroy1(b);
100         if (verbose) { printf("Bam output file is generated!\n"); }
101 }
102
103 void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
104         bam1_t *b, *b2;
105         PairedEndHit *hit;
106
107         int cnt = 0;
108
109         b = bam_init1();
110         b2 = bam_init1();
111
112         while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
113                 cnt += 2;
114                 if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
115
116                 if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue;
117
118                 //swap if b is mate 2
119                 if (b->core.flag & 0x0080) {
120                         assert(b2->core.flag & 0x0040);
121                         bam1_t *tmp = b;
122                         b = b2; b2 = tmp;
123                 }
124
125                 hit = wrapper.getNextHit();
126                 assert(hit != NULL);
127
128                 assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
129                 assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
130
131                 convert(b, hit->getConPrb());
132                 convert(b2, hit->getConPrb());
133
134                 b->core.mpos = b2->core.pos;
135                 b2->core.mpos = b->core.pos;
136
137                 if (b->core.qual > 0) {
138                         samwrite(out, b);
139                         samwrite(out, b2);
140                 }
141         }
142
143         assert(wrapper.getNextHit() == NULL);
144
145         bam_destroy1(b);
146         bam_destroy1(b2);
147
148         if (verbose) { printf("Bam output file is generated!\n"); }
149 }
150
151 void BamWriter::convert(bam1_t *b, double prb) {
152         const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
153
154         int pos = b->core.pos;
155         int readlen = b->core.l_qseq;
156
157         std::vector<uint32_t> data;
158         data.clear();
159
160         int core_pos, core_n_cigar;
161         std::vector<Interval> vec;
162         vec.assign(1, Interval(1, transcript.getLength()));
163         // make an artificial chromosome coordinates for the transcript to get new CIGAR strings
164         tr2chr(Transcript("", "", "", '+', vec, ""), pos + 1, pos + readlen, core_pos, core_n_cigar, data);
165         assert(core_pos >= 0);
166
167         int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4;
168         b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len;
169         expand_data_size(b);
170         uint8_t* pt = b->data + b->core.l_qname;
171         memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
172         for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; }
173
174         b->core.pos = core_pos;
175         b->core.n_cigar = core_n_cigar;
176         b->core.qual = getMAPQ(prb);
177         b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
178
179         float val = (float)prb;
180         bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val);
181 }
182
183 #endif /* BAMWRITER_H_ */