#include "sam_rsem_cvt.h"
#include "utils.h"
+#include "my_assert.h"
#include "bc_aux.h"
#include "Transcript.h"
#include "Transcripts.h"
void writeCollapsedLines();
void flipSeq(uint8_t*, int);
void flipQual(uint8_t*, int);
- void addXSTag(bam1_t*, const Transcript&);
+ void modifyTags(bam1_t*, const Transcript&); // modify MD tag and XS tag if needed
};
BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_list, Transcripts& transcripts)
: transcripts(transcripts)
{
- if (transcripts.getType() != 0)
- exitWithError("Genome information is not provided! RSEM cannot convert the transcript bam file!");
+ general_assert(transcripts.getType() == 0, "Genome information is not provided! RSEM cannot convert the transcript bam file!");
in = samopen(inpF, "rb", NULL);
assert(in != 0);
+ transcripts.buildMappings(in->header->n_targets, in->header->target_name);
+
bam_header_t *out_header = sam_header_read2(chr_list);
+
refmap.clear();
for (int i = 0; i < out_header->n_targets; i++) {
refmap[out_header->target_name[i]] = i;
}
- append_header_text(out_header, in->header->text, in->header->l_text);
+ if (in->header->l_text > 0) {
+ char comment[] = "@CO\tThis BAM file is processed by rsem-tbam2gam to convert from transcript coordinates into genomic coordinates.\n";
+ int comment_len = strlen(comment);
+
+ //Filter @SQ fields if the BAM file is user provided
+ char *text = in->header->text;
+ int l_text = in->header->l_text;
+ char *new_text = new char[l_text + comment_len];
+ int pos = 0, s = 0;
+ while (pos < l_text) {
+ if ((pos + 2 < l_text) && (text[pos] == '@') && (text[pos + 1] == 'S') && (text[pos + 2] == 'Q')) {
+ pos += 3;
+ while (pos < l_text && text[pos] != '\n') ++pos;
+ }
+ else new_text[s++] = text[pos];
+ ++pos;
+ }
+ strncpy(new_text + s, comment, comment_len);
+ s += comment_len;
+
+ append_header_text(out_header, new_text, s);
+ delete[] new_text;
+ }
out = samopen(outF, "wb", out_header);
assert(out != 0);
std::string cqname;
bool isPaired = false;
- int cnt = 0;
+ HIT_INT_TYPE cnt = 0;
cqname = "";
b = bam_init1(); b2 = bam_init1();
++cnt;
isPaired = (b->core.flag & 0x0001) > 0;
if (isPaired) {
- assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001) && b->core.tid == b2->core.tid);
- assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
- ++cnt;
+ assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001));
+ assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
+ assert(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)));
+ ++cnt;
}
if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
// at least one segment is not properly mapped
- if ((b->core.flag & 0x0004) || isPaired && (b2->core.flag & 0x0004)) continue;
+ bool notgood = (b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004));
+
+ if (isPaired && notgood) general_assert((b->core.flag & 0x0004) && (b2->core.flag & 0x0004), cstrtos(bam1_qname(b)) + "'s two mates are not all marked as unalignable!");
+
+ if (!notgood) {
+ // for collapsing
+ if (isPaired) {
+ assert(b->core.tid == b2->core.tid);
+ general_assert(b->core.tid == b2->core.tid, cstrtos(bam1_qname(b)) + "'s two mates are aligned to two different transcripts!");
+ if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
+ bam1_t *tmp = b; b = b2; b2 = tmp;
+ }
+ }
- const Transcript& transcript = transcripts.getTranscriptAt(b->core.tid + 1);
+ const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
- convert(b, transcript);
- if (isPaired) {
- convert(b2, transcript);
- b->core.mpos = b2->core.pos;
- b2->core.mpos = b->core.pos;
+ convert(b, transcript);
+ if (isPaired) {
+ convert(b2, transcript);
+ b->core.mpos = b2->core.pos;
+ b2->core.mpos = b->core.pos;
+ }
+
+ if (cqname != bam1_qname(b)) {
+ writeCollapsedLines();
+ cqname = bam1_qname(b);
+ collapseMap.init(isPaired);
+ }
+
+ uint8_t *p = bam_aux_get(b, "ZW");
+ float prb = (p != NULL? bam_aux2f(p) : 1.0);
+ collapseMap.insert(b, b2, prb);
}
+ else {
+ assert(cqname != bam1_qname(b));
- if (cqname != bam1_qname(b)) {
writeCollapsedLines();
cqname = bam1_qname(b);
collapseMap.init(isPaired);
- }
- collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
+ samwrite(out, b);
+ if (isPaired) samwrite(out, b2);
+ }
}
writeCollapsedLines();
int pos = b->core.pos;
int readlen = b->core.l_qseq;
- if (readlen == 0) exitWithError("One alignment line has SEQ field as *. RSEM does not support this currently!");
+ general_assert(readlen > 0, "One alignment line has SEQ field as *. RSEM does not support this currently!");
iter = refmap.find(transcript.getSeqName());
assert(iter != refmap.end());
b->core.n_cigar = core_n_cigar;
b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
- addXSTag(b, transcript); // check if need to add XS tag, if need, add it
+ modifyTags(b, transcript); // check if need to add XS tag, if need, add it
}
inline void BamConverter::writeCollapsedLines() {
bam1_t *tmp_b = NULL,*tmp_b2 = NULL;
float prb;
bool isPaired;
+ uint8_t *p;
if (!collapseMap.empty(isPaired)) {
while (collapseMap.next(tmp_b, tmp_b2, prb)) {
- memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
- tmp_b->core.qual = getMAPQ(prb);
- if (tmp_b->core.qual > 0) {
- samwrite(out, tmp_b);
- if (isPaired) {
- memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
- tmp_b2->core.qual = tmp_b->core.qual;
- samwrite(out, tmp_b2);
- }
+ p = bam_aux_get(tmp_b, "ZW");
+ if (p != NULL) {
+ memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b->core.qual = getMAPQ(prb);
+ }
+ // otherwise, just use the MAPQ score of the orignal alignment
+
+ samwrite(out, tmp_b);
+ if (isPaired) {
+ if (p != NULL) memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b2->core.qual = tmp_b->core.qual;
+ samwrite(out, tmp_b2);
}
bam_destroy1(tmp_b);
if (isPaired) bam_destroy1(tmp_b2);
-
}
}
}
}
}
-inline void BamConverter::addXSTag(bam1_t* b, const Transcript& transcript) {
- uint32_t* p = bam1_cigar(b);
+inline void BamConverter::modifyTags(bam1_t* b, const Transcript& transcript) {
+ char strand = transcript.getStrand();
+ uint8_t *s = NULL;
+
+ if (strand == '-') {
+ s = bam_aux_get(b, "MD");
+ if ((s != NULL) && (*(s) == 'Z') && (bam_aux2Z(s) != NULL)) {
+ char *mis = bam_aux2Z(s);
+ int len = strlen(mis);
+ char *tmp = new char[len];
+ int cur_type = -1, fr = -1, type, base;
+ for (int i = 0; i < len; i++) {
+ type = (mis[i] >= '0' && mis[i] <= '9');
+ if (cur_type != type) {
+ switch(cur_type) {
+ case 0:
+ base = len - 1;
+ if (mis[fr] == '^') { tmp[len - i] = mis[fr]; ++fr; ++base; }
+ for (int j = fr; j < i; j++) tmp[base - j] = ((mis[j] == 'A' || mis[j] == 'C' || mis[j] == 'G' || mis[j] == 'T') ? getOpp(mis[j]) : mis[j]);
+ break;
+ case 1:
+ base = len - i - fr;
+ for (int j = fr; j < i; j++) tmp[base + j] = mis[j];
+ break;
+ }
+ cur_type = type;
+ fr = i;
+ }
+ }
+ switch(cur_type) {
+ case 0:
+ base = len - 1;
+ if (mis[fr] == '^') { tmp[0] = mis[fr]; ++fr; ++base; }
+ for (int j = fr; j < len; j++) tmp[base - j] = ((mis[j] == 'A' || mis[j] == 'C' || mis[j] == 'G' || mis[j] == 'T') ? getOpp(mis[j]) : mis[j]);
+ break;
+ case 1:
+ for (int j = fr; j < len; j++) tmp[j - fr] = mis[j];
+ break;
+ }
+ strncpy(mis, tmp, len);
+ delete[] tmp;
+ }
+ }
+
+ // append XS:A field if necessary
+ s = bam_aux_get(b, "XS");
+ if (s != NULL) bam_aux_del(b, s);
bool hasN = false;
+ uint32_t* p = bam1_cigar(b);
for (int i = 0; i < (int)b->core.n_cigar; i++)
if ((*(p + i) & BAM_CIGAR_MASK) == BAM_CREF_SKIP) { hasN = true; break; }
- if (!hasN) return;
- char strand = transcript.getStrand();
- bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand);
+ if (hasN) bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand);
}
#endif /* BAMCONVERTER_H_ */