14 #include "sam_rsem_aux.h"
15 #include "sam_rsem_cvt.h"
17 #include "SingleHit.h"
18 #include "PairedEndHit.h"
20 #include "HitWrapper.h"
21 #include "Transcript.h"
22 #include "Transcripts.h"
26 BamWriter(char, const char*, const char*, const char*, Transcripts&);
29 void work(HitWrapper<SingleHit>);
30 void work(HitWrapper<PairedEndHit>);
33 Transcripts& transcripts;
36 void convert(bam1_t*, double);
40 BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const char* outF, Transcripts& transcripts)
41 : transcripts(transcripts)
44 case 's': in = samopen(inpF, "r", fn_list); break;
45 case 'b': in = samopen(inpF, "rb", fn_list); break;
46 default: assert(false);
50 //generate output's header
51 bam_header_t *out_header = bam_header_dwt(in->header);
53 if (out_header->n_targets != transcripts.getM()) {
54 fprintf(stderr, "Number of reference sequences recorded in the header is not correct! The header contains %d sequences while there should be %d sequences\n", out_header->n_targets, transcripts.getM());
58 for (int i = 0; i < out_header->n_targets; i++) {
59 const Transcript& transcript = transcripts.getTranscriptAt(i + 1);
60 if (out_header->target_name[i] != transcript.getTranscriptID()) {
61 fprintf(stderr, "Reference sequence %d's name recorded in the header is not correct! \n", i);
62 fprintf(stderr, "Name in the header: %s\n", out_header->target_name[i]);
63 fprintf(stderr, "Should be: %s\n", transcript.getTranscriptID().c_str());
66 out_header->target_len[i] = transcript.getLength(); // transcript length without poly(A) tail
69 std::ostringstream strout;
70 strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n";
71 std::string content = strout.str();
72 append_header_text(out_header, content.c_str(), content.length());
74 out = samopen(outF, "wb", out_header);
77 bam_header_destroy(out_header);
80 BamWriter::~BamWriter() {
85 void BamWriter::work(HitWrapper<SingleHit> wrapper) {
93 while (samread(in, b) >= 0) {
95 if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
97 if (b->core.flag & 0x0004) continue;
99 hit = wrapper.getNextHit();
102 assert(b->core.tid + 1 == hit->getSid());
103 convert(b, hit->getConPrb());
104 if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
107 assert(wrapper.getNextHit() == NULL);
110 if (verbose) { printf("Bam output file is generated!\n"); }
113 void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
122 while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
124 if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
126 if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue;
128 //swap if b is mate 2
129 if (b->core.flag & 0x0080) {
130 assert(b2->core.flag & 0x0040);
135 hit = wrapper.getNextHit();
138 assert(b->core.tid + 1 == hit->getSid());
139 assert(b2->core.tid + 1 == hit->getSid());
141 convert(b, hit->getConPrb());
142 convert(b2, hit->getConPrb());
144 b->core.mpos = b2->core.pos;
145 b2->core.mpos = b->core.pos;
147 if (b->core.qual > 0) {
153 assert(wrapper.getNextHit() == NULL);
158 if (verbose) { printf("Bam output file is generated!\n"); }
161 void BamWriter::convert(bam1_t *b, double prb) {
162 int sid = b->core.tid + 1;
163 const Transcript& transcript = transcripts.getTranscriptAt(sid);
165 int pos = b->core.pos;
166 int readlen = b->core.l_qseq;
168 std::vector<uint32_t> data;
171 int core_pos, core_n_cigar;
172 std::vector<Interval> vec;
173 vec.assign(1, Interval(1, transcript.getLength()));
174 // make an artificial chromosome coordinates for the transcript to get new CIGAR strings
175 tr2chr(Transcript("", "", "", '+', vec, ""), pos + 1, pos + readlen, core_pos, core_n_cigar, data);
176 assert(core_pos >= 0);
178 int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4;
179 b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len;
181 uint8_t* pt = b->data + b->core.l_qname;
182 memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
183 for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; }
185 b->core.pos = core_pos;
186 b->core.n_cigar = core_n_cigar;
187 b->core.qual = getMAPQ(prb);
188 b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
190 float val = (float)prb;
191 bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val);
194 #endif /* BAMWRITER_H_ */