From: Bo Li Date: Wed, 14 Dec 2011 22:07:06 +0000 (-0600) Subject: Merge remote-tracking branch 'origin/master' X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=fc69cf6af24c0550e55447fc82f01cb6f90c1c42;hp=2bd2fb01cd66b08761de467f03c6bbc1478db97e Merge remote-tracking branch 'origin/master' --- diff --git a/BamConverter.h b/BamConverter.h new file mode 100644 index 0000000..9b54308 --- /dev/null +++ b/BamConverter.h @@ -0,0 +1,230 @@ +#ifndef BAMCONVERTER_H_ +#define BAMCONVERTER_H_ + +#include +#include +#include +#include +#include + +#include +#include "sam/bam.h" +#include "sam/sam.h" +#include "sam_rsem_aux.h" +#include "sam_rsem_cvt.h" + +#include "utils.h" +#include "bc_aux.h" +#include "Transcript.h" +#include "Transcripts.h" + +class BamConverter { +public: + BamConverter(const char*, const char*, const char*, Transcripts&); + ~BamConverter(); + + void process(); +private: + samfile_t *in, *out; + Transcripts& transcripts; + + std::map refmap; + std::map::iterator iter; + + CollapseMap collapseMap; + + void convert(bam1_t*, const Transcript&); + + void writeCollapsedLines(); + void flipSeq(uint8_t*, int); + void flipQual(uint8_t*, int); + void addXSTag(bam1_t*, const Transcript&); +}; + +BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_list, Transcripts& transcripts) + : transcripts(transcripts) +{ + if (transcripts.getType() != 0) + exitWithError("Genome information is not provided! 
RSEM cannot convert the transcript bam file!"); + + in = samopen(inpF, "rb", NULL); + assert(in != 0); + + bam_header_t *out_header = sam_header_read2(chr_list); + refmap.clear(); + for (int i = 0; i < out_header->n_targets; i++) { + refmap[out_header->target_name[i]] = i; + } + + append_header_text(out_header, in->header->text, in->header->l_text); + + out = samopen(outF, "wb", out_header); + assert(out != 0); + + bam_header_destroy(out_header); +} + +BamConverter::~BamConverter() { + samclose(in); + samclose(out); +} + +void BamConverter::process() { + bam1_t *b, *b2; + std::string cqname; + bool isPaired = false; + + int cnt = 0; + + cqname = ""; + b = bam_init1(); b2 = bam_init1(); + + while (samread(in, b) >= 0) { + ++cnt; + isPaired = (b->core.flag & 0x0001) > 0; + if (isPaired) { + assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001) && b->core.tid == b2->core.tid); + assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing + ++cnt; + } + + if (cnt % 1000000 == 0) { printf("."); fflush(stdout); } + + // at least one segment is not properly mapped + if ((b->core.flag & 0x0004) || isPaired && (b2->core.flag & 0x0004)) continue; + + const Transcript& transcript = transcripts.getTranscriptAt(b->core.tid + 1); + + convert(b, transcript); + if (isPaired) { + convert(b2, transcript); + b->core.mpos = b2->core.pos; + b2->core.mpos = b->core.pos; + } + + if (cqname != bam1_qname(b)) { + writeCollapsedLines(); + cqname = bam1_qname(b); + collapseMap.init(isPaired); + } + + collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW"))); + } + + writeCollapsedLines(); + + bam_destroy1(b); + bam_destroy1(b2); + + if (cnt >= 1000000) printf("\n"); +} + +void BamConverter::convert(bam1_t* b, const Transcript& transcript) { + int pos = b->core.pos; + int readlen = b->core.l_qseq; + + if (readlen == 0) exitWithError("One alignment line has SEQ field as *. 
RSEM does not support this currently!"); + + iter = refmap.find(transcript.getSeqName()); + assert(iter != refmap.end()); + b->core.tid = iter->second; + if (b->core.flag & 0x0001) { b->core.mtid = b->core.tid; } + b->core.qual = 255; // set to not available temporarily + + if (transcript.getStrand() == '-') { + b->core.flag ^= 0x0010; + if (b->core.flag & 0x0001) { + b->core.flag ^= 0x0020; + b->core.isize = -b->core.isize; + } + flipSeq(bam1_seq(b), readlen); + flipQual(bam1_qual(b), readlen); + } + + std::vector data; + data.clear(); + + int core_pos, core_n_cigar; + tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data); + assert(core_pos >= 0); + + int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4; + b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len; + expand_data_size(b); + uint8_t* pt = b->data + b->core.l_qname; + memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len); + for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; } + + b->core.pos = core_pos; + b->core.n_cigar = core_n_cigar; + b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); + + addXSTag(b, transcript); // check if need to add XS tag, if need, add it +} + +inline void BamConverter::writeCollapsedLines() { + bam1_t *tmp_b = NULL,*tmp_b2 = NULL; + float prb; + bool isPaired; + + if (!collapseMap.empty(isPaired)) { + while (collapseMap.next(tmp_b, tmp_b2, prb)) { + memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); + tmp_b->core.qual = getMAPQ(prb); + if (tmp_b->core.qual > 0) { + samwrite(out, tmp_b); + if (isPaired) { + memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); + tmp_b2->core.qual = tmp_b->core.qual; + samwrite(out, tmp_b2); + } + } + bam_destroy1(tmp_b); + if (isPaired) bam_destroy1(tmp_b2); + + } + } +} + +inline void BamConverter::flipSeq(uint8_t* s, int readlen) { + uint8_t code, base; + std::vector seq; + + code = 0; 
base = 0; + seq.clear(); + for (int i = 0; i < readlen; i++) { + switch (bam1_seqi(s, readlen - i - 1)) { + case 1: base = 8; break; + case 2: base = 4; break; + case 4: base = 2; break; + case 8: base = 1; break; + case 15: base = 15; break; + default: assert(false); + } + code |= base << (4 * (1 - i % 2)); + if (i % 2 == 1) { seq.push_back(code); code = 0; } + } + if (readlen % 2 == 1) { seq.push_back(code); } + + for (int i = 0; i < (int)seq.size(); i++) s[i] = seq[i]; +} + +inline void BamConverter::flipQual(uint8_t* q, int readlen) { + int32_t mid = readlen / 2; + uint8_t tmp; + for (int i = 0; i < mid; i++) { + tmp = q[i]; q[i] = q[readlen - i - 1]; q[readlen -i -1] = tmp; + } +} + +inline void BamConverter::addXSTag(bam1_t* b, const Transcript& transcript) { + uint32_t* p = bam1_cigar(b); + bool hasN = false; + for (int i = 0; i < (int)b->core.n_cigar; i++) + if ((*(p + i) & BAM_CIGAR_MASK) == BAM_CREF_SKIP) { hasN = true; break; } + if (!hasN) return; + char strand = transcript.getStrand(); + bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand); +} + +#endif /* BAMCONVERTER_H_ */ diff --git a/BamWriter.h b/BamWriter.h index b39400f..ff11397 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -6,13 +6,14 @@ #include #include #include -#include #include +#include #include "sam/bam.h" #include "sam/sam.h" +#include "sam_rsem_aux.h" +#include "sam_rsem_cvt.h" -#include "utils.h" #include "SingleHit.h" #include "PairedEndHit.h" @@ -22,136 +23,23 @@ class BamWriter { public: - BamWriter(char, const char*, const char*, const char*, const char*); + BamWriter(char, const char*, const char*, const char*, Transcripts&); ~BamWriter(); - void work(HitWrapper, Transcripts&); - void work(HitWrapper, Transcripts&); + void work(HitWrapper); + void work(HitWrapper); private: samfile_t *in, *out; + Transcripts& transcripts; - std::map refmap; - std::map::iterator iter; - - struct SingleEndT { - bam1_t *b; - - SingleEndT(bam1_t *b = NULL) { - this->b = b; - } - - bool operator< (const 
SingleEndT& o) const { - int strand1, strand2; - uint32_t *p1, *p2; - - if (b->core.tid != o.b->core.tid) return b->core.tid < o.b->core.tid; - if (b->core.pos != o.b->core.pos) return b->core.pos < o.b->core.pos; - strand1 = b->core.flag & 0x0010; strand2 = o.b->core.flag & 0x0010; - if (strand1 != strand2) return strand1 < strand2; - if (b->core.n_cigar != o.b->core.n_cigar) return b->core.n_cigar < o.b->core.n_cigar; - p1 = bam1_cigar(b); p2 = bam1_cigar(o.b); - for (int i = 0; i < (int)b->core.n_cigar; i++) { - if (*p1 != *p2) return *p1 < *p2; - ++p1; ++p2; - } - return false; - } - }; - - //b is mate 1, b2 is mate 2 - struct PairedEndT { - bam1_t *b, *b2; - - PairedEndT() { b = NULL; b2 = NULL;} - - PairedEndT(bam1_t *b, bam1_t *b2) { - this->b = b; - this->b2 = b2; - } - - bool operator< (const PairedEndT& o) const { - int strand1, strand2; - uint32_t *p1, *p2; - - //compare b - if (b->core.tid != o.b->core.tid) return b->core.tid < o.b->core.tid; - if (b->core.pos != o.b->core.pos) return b->core.pos < o.b->core.pos; - strand1 = b->core.flag & 0x0010; strand2 = o.b->core.flag & 0x0010; - if (strand1 != strand2) return strand1 < strand2; - if (b->core.n_cigar != o.b->core.n_cigar) return b->core.n_cigar < o.b->core.n_cigar; - p1 = bam1_cigar(b); p2 = bam1_cigar(o.b); - for (int i = 0; i < (int)b->core.n_cigar; i++) { - if (*p1 != *p2) return *p1 < *p2; - ++p1; ++p2; - } - - //compare b2 - if (b2->core.tid != o.b2->core.tid) return b2->core.tid < o.b2->core.tid; - if (b2->core.pos != o.b2->core.pos) return b2->core.pos < o.b2->core.pos; - strand1 = b2->core.flag & 0x0010; strand2 = o.b2->core.flag & 0x0010; - if (strand1 != strand2) return strand1 < strand2; - if (b2->core.n_cigar != o.b2->core.n_cigar) return b2->core.n_cigar < o.b2->core.n_cigar; - p1 = bam1_cigar(b2); p2 = bam1_cigar(o.b2); - for (int i = 0; i < (int)b2->core.n_cigar; i++) { - if (*p1 != *p2) return *p1 < *p2; - ++p1; ++p2; - } - - return false; - } - }; - - uint8_t getMAPQ(double val) { - 
double err = 1.0 - val; - if (err <= 1e-10) return 100; - return (uint8_t)(-10 * log10(err) + .5); // round it - } - - void push_qname(const uint8_t* qname, int l_qname, std::vector& data) { - for (int i = 0; i < l_qname; i++) data.push_back(*(qname + i)); - } - - void push_seq(const uint8_t* seq, int readlen, char strand, std::vector& data) { - int seq_len = (readlen + 1) / 2; - - switch (strand) { - case '+': for (int i = 0; i < seq_len; i++) data.push_back(*(seq + i)); break; - case '-': - uint8_t code, base; - code = 0; base = 0; - for (int i = 0; i < readlen; i++) { - switch (bam1_seqi(seq, readlen - i - 1)) { - case 1: base = 8; break; - case 2: base = 4; break; - case 4: base = 2; break; - case 8: base = 1; break; - case 15: base = 15; break; - default: assert(false); - } - code |= base << (4 * (1 - i % 2)); - if (i % 2 == 1) { data.push_back(code); code = 0; } - } - - if (readlen % 2 == 1) { data.push_back(code); } - break; - default: assert(false); - } - } - - void push_qual(const uint8_t* qual, int readlen, char strand, std::vector& data) { - switch (strand) { - case '+': for (int i = 0; i < readlen; i++) data.push_back(*(qual + i)); break; - case '-': for (int i = readlen - 1; i >= 0; i--) data.push_back(*(qual + i)); break; - default: assert(false); - } - } - - //convert transcript coordinate to chromosome coordinate and generate CIGAR string - void tr2chr(const Transcript&, int, int, int&, int&, std::vector&); + //convert bam1_t + void convert(bam1_t*, double); }; //fn_list can be NULL -BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const char* outF, const char* chr_list) { +BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const char* outF, Transcripts& transcripts) + : transcripts(transcripts) +{ switch(inpType) { case 's': in = samopen(inpF, "r", fn_list); break; case 'b': in = samopen(inpF, "rb", fn_list); break; @@ -160,25 +48,33 @@ BamWriter::BamWriter(char inpType, const char* inpF, const char* 
fn_list, const assert(in != 0); //generate output's header - bam_header_t *out_header = NULL; - refmap.clear(); + bam_header_t *out_header = bam_header_dwt(in->header); - if (chr_list == NULL) { - out_header = in->header; + if (out_header->n_targets != transcripts.getM()) { + fprintf(stderr, "Number of reference sequences recorded in the header is not correct! The header contains %d sequences while there should be %d sequences\n", out_header->n_targets, transcripts.getM()); + exit(-1); } - else { - out_header = sam_header_read2(chr_list); - for (int i = 0; i < out_header->n_targets; i++) { - refmap[out_header->target_name[i]] = i; + for (int i = 0; i < out_header->n_targets; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i + 1); + if (out_header->target_name[i] != transcript.getTranscriptID()) { + fprintf(stderr, "Reference sequence %d's name recorded in the header is not correct! \n", i); + fprintf(stderr, "Name in the header: %s\n", out_header->target_name[i]); + fprintf(stderr, "Should be: %s\n", transcript.getTranscriptID().c_str()); + exit(-1); } + out_header->target_len[i] = transcript.getLength(); // transcript length without poly(A) tail } + std::ostringstream strout; + strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n"; + std::string content = strout.str(); + append_header_text(out_header, content.c_str(), content.length()); out = samopen(outF, "wb", out_header); assert(out != 0); - if (chr_list != NULL) { bam_header_destroy(out_header); } + bam_header_destroy(out_header); } BamWriter::~BamWriter() { @@ -186,140 +82,48 @@ BamWriter::~BamWriter() { samclose(out); } -void BamWriter::work(HitWrapper wrapper, Transcripts& transcripts) { +void BamWriter::work(HitWrapper wrapper) { bam1_t *b; - std::string cqname; // cqname : current query name - std::map hmap; - std::map::iterator hmapIter; SingleHit *hit; int cnt = 0; - cqname = ""; b = bam_init1(); - hmap.clear(); while (samread(in, b) >= 0) { - - if (verbose && cnt > 0 && cnt % 1000000 == 
0) { printf("%d entries are finished!\n", cnt); } ++cnt; + if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } if (b->core.flag & 0x0004) continue; hit = wrapper.getNextHit(); assert(hit != NULL); - int sid = b->core.tid + 1; - assert(sid == hit->getSid()); - const Transcript& transcript = transcripts.getTranscriptAt(sid); - - if (transcripts.getType() == 0) { - int pos = b->core.pos; - int readlen = b->core.l_qseq; - uint8_t *qname = b->data, *seq = bam1_seq(b), *qual = bam1_qual(b); - std::vector data; - data.clear(); - - iter = refmap.find(transcript.getSeqName()); - assert(iter != refmap.end()); - b->core.tid = iter->second; - b->core.qual = 255; - - uint16_t rstrand = b->core.flag & 0x0010; // read strand - b->core.flag -= rstrand; - rstrand = (((!rstrand && transcript.getStrand() == '+') || (rstrand && transcript.getStrand() == '-')) ? 0 : 0x0010); - b->core.flag += rstrand; - - push_qname(qname, b->core.l_qname, data); - int core_pos, core_n_cigar; - tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data); - if (core_pos < 0) b->core.tid = -1; - b->core.pos = core_pos; - b->core.n_cigar = core_n_cigar; - push_seq(seq, readlen, transcript.getStrand(), data); - push_qual(qual, readlen, transcript.getStrand(), data); - - free(b->data); - b->m_data = b->data_len = data.size() + 7; // 7 extra bytes for ZW tag - b->l_aux = 7; - b->data = (uint8_t*)malloc(b->m_data); - for (int i = 0; i < b->data_len; i++) b->data[i] = data[i]; - - b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); - } - else { - b->m_data = b->data_len = b->data_len - b->l_aux + 7; // 7 extra bytes for ZW tag - b->l_aux = 7; - b->data = (uint8_t*)realloc(b->data, b->m_data); - } - - - if (cqname != bam1_qname(b)) { - if (!hmap.empty()) { - for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { - bam1_t *tmp_b = hmapIter->first.b; - tmp_b->core.qual = getMAPQ(hmapIter->second); - uint8_t *p = 
bam1_aux(tmp_b); - *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; - float val = (float)hmapIter->second; - memcpy(p, &val, 4); - samwrite(out, tmp_b); - bam_destroy1(tmp_b); // now hmapIter->b makes no sense - } - hmap.clear(); - } - cqname = bam1_qname(b); - } - - hmapIter = hmap.find(SingleEndT(b)); - if (hmapIter == hmap.end()) { - hmap[SingleEndT(bam_dup1(b))] = hit->getConPrb(); - } - else { - hmapIter->second += hit->getConPrb(); - } + assert(b->core.tid + 1 == hit->getSid()); + convert(b, hit->getConPrb()); + if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0 } assert(wrapper.getNextHit() == NULL); - if (!hmap.empty()) { - for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { - bam1_t *tmp_b = hmapIter->first.b; - tmp_b->core.qual = getMAPQ(hmapIter->second); - uint8_t *p = bam1_aux(tmp_b); - *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; - float val = (float)hmapIter->second; - memcpy(p, &val, 4); - samwrite(out, tmp_b); - bam_destroy1(tmp_b); // now hmapIter->b makes no sense - } - hmap.clear(); - } - bam_destroy1(b); if (verbose) { printf("Bam output file is generated!\n"); } } -void BamWriter::work(HitWrapper wrapper, Transcripts& transcripts) { +void BamWriter::work(HitWrapper wrapper) { bam1_t *b, *b2; - std::string cqname; // cqname : current query name - std::map hmap; - std::map::iterator hmapIter; PairedEndHit *hit; int cnt = 0; - cqname = ""; b = bam_init1(); b2 = bam_init1(); - hmap.clear(); while (samread(in, b) >= 0 && samread(in, b2) >= 0) { + cnt += 2; + if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } - if (verbose && cnt > 0 && cnt % 1000000 == 0) { printf("%d entries are finished!\n", cnt); } - ++cnt; - - if (!((b->core.flag & 0x0002) && (b2->core.flag & 0x0002))) continue; + if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue; //swap if b is mate 2 if (b->core.flag & 0x0080) { @@ -331,236 +135,60 @@ void BamWriter::work(HitWrapper wrapper, Transcripts& 
transcripts) hit = wrapper.getNextHit(); assert(hit != NULL); - int sid = b->core.tid + 1; - assert(sid == hit->getSid()); - assert(sid == b2->core.tid + 1); - const Transcript& transcript = transcripts.getTranscriptAt(sid); - - if (transcripts.getType() == 0) { - int pos = b->core.pos, pos2 = b2->core.pos; - int readlen = b->core.l_qseq, readlen2 = b2->core.l_qseq; - uint8_t *qname = b->data, *seq = bam1_seq(b), *qual = bam1_qual(b); - uint8_t *qname2 = b2->data, *seq2 = bam1_seq(b2), *qual2 = bam1_qual(b2); - std::vector data, data2; - - data.clear(); - data2.clear(); - - iter = refmap.find(transcript.getSeqName()); - assert(iter != refmap.end()); - b->core.tid = iter->second; b->core.mtid = iter->second; - b2->core.tid = iter->second; b2->core.mtid = iter->second; - - uint16_t rstrand = b->core.flag & 0x0010; - b->core.flag = b->core.flag - (b->core.flag & 0x0010) - (b->core.flag & 0x0020); - b2->core.flag = b2->core.flag - (b2->core.flag & 0x0010) - (b2->core.flag & 0x0020); - - uint16_t add, add2; - if ((!rstrand && transcript.getStrand() == '+') || (rstrand && transcript.getStrand() == '-')) { - add = 0x0020; add2 = 0x0010; - } - else { - add = 0x0010; add2 = 0x0020; - } - b->core.flag += add; - b2->core.flag += add2; - - b->core.qual = b2->core.qual = 255; - - //Do I really need this? 
The insert size uses transcript coordinates - if (transcript.getStrand() == '-') { - b->core.isize = -b->core.isize; - b2->core.isize = -b2->core.isize; - } - - push_qname(qname, b->core.l_qname, data); - push_qname(qname2, b2->core.l_qname, data2); - int core_pos, core_n_cigar; - tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data); - if (core_pos < 0) b->core.tid = -1; - b->core.pos = core_pos; b->core.n_cigar = core_n_cigar; - tr2chr(transcript, pos2 + 1, pos2 + readlen2, core_pos, core_n_cigar, data2); - if (core_pos < 0) b2->core.tid = -1; - b2->core.pos = core_pos; b2->core.n_cigar = core_n_cigar; - b->core.mpos = b2->core.pos; - b2->core.mpos = b->core.pos; - push_seq(seq, readlen, transcript.getStrand(), data); - push_seq(seq2, readlen2, transcript.getStrand(), data2); - push_qual(qual, readlen, transcript.getStrand(), data); - push_qual(qual2, readlen2, transcript.getStrand(), data2); - - free(b->data); - b->m_data = b->data_len = data.size() + 7; // 7 extra bytes for ZW tag - b->l_aux = 7; - b->data = (uint8_t*)malloc(b->m_data); - for (int i = 0; i < b->data_len; i++) b->data[i] = data[i]; - - free(b2->data); - b2->m_data = b2->data_len = data2.size() + 7; // 7 extra bytes for ZW tag - b2->l_aux = 7; - b2->data = (uint8_t*)malloc(b2->m_data); - for (int i = 0; i < b2->data_len; i++) b2->data[i] = data2[i]; - - b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); - b2->core.bin = bam_reg2bin(b2->core.pos, bam_calend(&(b2->core), bam1_cigar(b2))); - } - else { - b->m_data = b->data_len = b->data_len - b->l_aux + 7; // 7 extra bytes for ZW tag - b->l_aux = 7; - b->data = (uint8_t*)realloc(b->data, b->m_data); - - b2->m_data = b2->data_len = b2->data_len - b2->l_aux + 7; // 7 extra bytes for ZW tag - b2->l_aux = 7; - b2->data = (uint8_t*)realloc(b2->data, b2->m_data); - } - - if (cqname != bam1_qname(b)) { - if (!hmap.empty()) { - for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { - bam1_t *tmp_b 
= hmapIter->first.b; - bam1_t *tmp_b2 = hmapIter->first.b2; - - tmp_b->core.qual = tmp_b2->core.qual = getMAPQ(hmapIter->second); + assert(b->core.tid + 1 == hit->getSid()); + assert(b2->core.tid + 1 == hit->getSid()); - uint8_t *p = bam1_aux(tmp_b), *p2 = bam1_aux(tmp_b2); - *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; - *p2 = 'Z'; ++p2; *p2 = 'W'; ++p2; *p2 = 'f'; ++p2; + convert(b, hit->getConPrb()); + convert(b2, hit->getConPrb()); - float val = (float)hmapIter->second; - memcpy(p, &val, 4); - memcpy(p2, &val, 4); + b->core.mpos = b2->core.pos; + b2->core.mpos = b->core.pos; - samwrite(out, tmp_b); - samwrite(out, tmp_b2); - - bam_destroy1(tmp_b); - bam_destroy1(tmp_b2); - } - hmap.clear(); - } - cqname = bam1_qname(b); - } - - hmapIter = hmap.find(PairedEndT(b, b2)); - if (hmapIter == hmap.end()) { - hmap[PairedEndT(bam_dup1(b), bam_dup1(b2))] = hit->getConPrb(); - } - else { - hmapIter->second += hit->getConPrb(); + if (b->core.qual > 0) { + samwrite(out, b); + samwrite(out, b2); } } assert(wrapper.getNextHit() == NULL); - if (!hmap.empty()) { - for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { - bam1_t *tmp_b = hmapIter->first.b; - bam1_t *tmp_b2 = hmapIter->first.b2; - - tmp_b->core.qual = tmp_b2->core.qual = getMAPQ(hmapIter->second); - - uint8_t *p = bam1_aux(tmp_b), *p2 = bam1_aux(tmp_b2); - *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; - *p2 = 'Z'; ++p2; *p2 = 'W'; ++p2; *p2 = 'f'; ++p2; - - float val = (float)hmapIter->second; - memcpy(p, &val, 4); - memcpy(p2, &val, 4); - - samwrite(out, tmp_b); - samwrite(out, tmp_b2); - - bam_destroy1(tmp_b); - bam_destroy1(tmp_b2); - } - hmap.clear(); - } - bam_destroy1(b); bam_destroy1(b2); if (verbose) { printf("Bam output file is generated!\n"); } } -void BamWriter::tr2chr(const Transcript& transcript, int sp, int ep, int& pos, int& n_cigar, std::vector& data) { - int length = transcript.getLength(); - char strand = transcript.getStrand(); - const std::vector& structure = 
transcript.getStructure(); - - int s, i; - int oldlen, curlen; - - uint32_t operation; - uint8_t *p; - - n_cigar = 0; - s = structure.size(); - - if (strand == '-') { - int tmp = sp; - sp = length - ep + 1; - ep = length - tmp + 1; - } - - if (ep < 1 || sp > length) { // a read which align to polyA tails totally! - pos = (sp > length ? structure[s - 1].end : structure[0].start - 1); // 0 based - - n_cigar = 1; - operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; - p = (uint8_t*)(&operation); - for (int j = 0; j < 4; j++) data.push_back(*(p + j)); - - return; - } - - if (sp < 1) { - n_cigar++; - operation = (1 - sp) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; - p = (uint8_t*)(&operation); - for (int j = 0; j < 4; j++) data.push_back(*(p + j)); - sp = 1; - } - - oldlen = curlen = 0; - - for (i = 0; i < s; i++) { - oldlen = curlen; - curlen += structure[i].end - structure[i].start + 1; - if (curlen >= sp) break; - } - assert(i < s); - pos = structure[i].start + (sp - oldlen - 1) - 1; // 0 based - - while (curlen < ep && i < s) { - n_cigar++; - operation = (curlen - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH; - p = (uint8_t*)(&operation); - for (int j = 0; j < 4; j++) data.push_back(*(p + j)); - - ++i; - if (i >= s) continue; - n_cigar++; - operation = (structure[i].start - structure[i - 1].end - 1) << BAM_CIGAR_SHIFT | BAM_CREF_SKIP; - p = (uint8_t*)(&operation); - for (int j = 0; j < 4; j++) data.push_back(*(p + j)); - - oldlen = curlen; - sp = oldlen + 1; - curlen += structure[i].end - structure[i].start + 1; - } - - if (i >= s) { - n_cigar++; - operation = (ep - length) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; - p = (uint8_t*)(&operation); - for (int j = 0; j < 4; j++) data.push_back(*(p + j)); - } - else { - n_cigar++; - operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH; - p = (uint8_t*)(&operation); - for (int j = 0; j < 4; j++) data.push_back(*(p + j)); - } +void BamWriter::convert(bam1_t *b, double prb) { + int sid = 
b->core.tid + 1; + const Transcript& transcript = transcripts.getTranscriptAt(sid); + + int pos = b->core.pos; + int readlen = b->core.l_qseq; + + std::vector data; + data.clear(); + + int core_pos, core_n_cigar; + std::vector vec; + vec.assign(1, Interval(1, transcript.getLength())); + // make an artificial chromosome coordinates for the transcript to get new CIGAR strings + tr2chr(Transcript("", "", "", '+', vec, ""), pos + 1, pos + readlen, core_pos, core_n_cigar, data); + assert(core_pos >= 0); + + int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4; + b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len; + expand_data_size(b); + uint8_t* pt = b->data + b->core.l_qname; + memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len); + for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; } + + b->core.pos = core_pos; + b->core.n_cigar = core_n_cigar; + b->core.qual = getMAPQ(prb); + b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); + + float val = (float)prb; + bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val); } #endif /* BAMWRITER_H_ */ diff --git a/EM.cpp b/EM.cpp index 322a306..f9449a8 100644 --- a/EM.cpp +++ b/EM.cpp @@ -10,6 +10,7 @@ #include #include "utils.h" +#include "sampling.h" #include "Read.h" #include "SingleRead.h" @@ -58,6 +59,7 @@ int nThreads; bool genBamF; // If user wants to generate bam file, true; otherwise, false. 
+bool bamSampling; // true if sampling from read posterior distribution when bam file is generated bool updateModel, calcExpectedWeights; bool genGibbsOut; // generate file for Gibbs sampler @@ -100,7 +102,7 @@ void init(ReadReader **&readers, HitContainer **&hitvs, doubl indices[i] = new ReadIndex(readFs[i]); } for (int i = 0; i < nThreads; i++) { - readers[i] = new ReadReader(s, readFs); + readers[i] = new ReadReader(s, readFs, refs.hasPolyA(), mparams.seedLen); // allow calculation of calc_lq() function readers[i]->setIndices(indices); } @@ -338,7 +340,7 @@ void writeResults(ModelType& model, double* counts) { fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) { const Transcript& transcript = transcripts.getTranscriptAt(i); - fprintf(fo, "%s%c", transcript.getLeft().c_str(), (i < M ? '\t' : '\n')); + fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n')); } fclose(fo); @@ -394,8 +396,6 @@ void release(ReadReader **readers, HitContainer **hitvs, doub delete[] mhps; } -int tmp_n; - inline bool doesUpdateModel(int ROUND) { // return ROUND <= 20 || ROUND % 100 == 0; return ROUND <= 10; @@ -614,15 +614,34 @@ void EM() { writeResults(model, countvs[0]); if (genBamF) { - sprintf(outBamF, "%s.bam", outName); - if (transcripts.getType() == 0) { - sprintf(chr_list, "%s.chrlist", refName); - pt_chr_list = (char*)(&chr_list); + sprintf(outBamF, "%s.transcript.bam", outName); + + if (bamSampling) { + int local_N; + int fr, to, len, id; + vector arr; + arr.clear(); + + if (verbose) printf("Begin to sample reads from their posteriors.\n"); + for (int i = 0; i < nThreads; i++) { + local_N = hitvs[i]->getN(); + for (int j = 0; j < local_N; j++) { + fr = hitvs[i]->getSAt(j); + to = hitvs[i]->getSAt(j + 1); + len = to - fr + 1; + arr.resize(len); + arr[0] = ncpvs[i][j]; + for (int k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb(); + id = (arr[len - 1] < EPSILON ? 
-1 : sample(arr, len)); // if all entries in arr are 0, let id be -1 + for (int k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0); + } + } + if (verbose) printf("Sampling is finished.\n"); } - BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, pt_chr_list); + BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, transcripts); HitWrapper wrapper(nThreads, hitvs); - writer.work(wrapper, transcripts); + writer.work(wrapper); } release(readers, hitvs, ncpvs, mhps); @@ -633,7 +652,7 @@ int main(int argc, char* argv[]) { bool quiet = false; if (argc < 5) { - printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n"); + printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n"); printf(" refName: reference name\n"); printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n"); printf(" sampleName: sample's name, including the path\n"); @@ -641,7 +660,8 @@ int main(int argc, char* argv[]) { printf(" -p: number of threads which user wants to use. (default: 1)\n"); printf(" -b: produce bam format output file. (default: off)\n"); printf(" -q: set it quiet\n"); - printf(" --gibbs-out: generate output file use by Gibbs sampler. (default: off)\n"); + printf(" --gibbs-out: generate output file used by Gibbs sampler. (default: off)\n"); + printf(" --sampling: sample each read from its posterior distribution when bam file is generated. 
(default: off)\n"); printf("// model parameters should be in imdName.mparams.\n"); exit(-1); } @@ -657,6 +677,7 @@ int main(int argc, char* argv[]) { nThreads = 1; genBamF = false; + bamSampling = false; genGibbsOut = false; pt_fn_list = pt_chr_list = NULL; @@ -673,6 +694,7 @@ int main(int argc, char* argv[]) { } if (!strcmp(argv[i], "-q")) { quiet = true; } if (!strcmp(argv[i], "--gibbs-out")) { genGibbsOut = true; } + if (!strcmp(argv[i], "--sampling")) { bamSampling = true; } } if (nThreads <= 0) { fprintf(stderr, "Number of threads should be bigger than 0!\n"); exit(-1); } //assert(nThreads > 0); diff --git a/Gibbs.cpp b/Gibbs.cpp index 5bdfb24..979860b 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -7,9 +6,8 @@ #include #include -#include "boost/random.hpp" - #include "utils.h" +#include "sampling.h" #include "Model.h" #include "SingleModel.h" @@ -53,9 +51,6 @@ bool quiet; vector arr; -boost::mt19937 rng(time(NULL)); -boost::uniform_01 rg(rng); - void load_data(char* reference_name, char* statName, char* imdName) { ifstream fin; string line; @@ -121,27 +116,6 @@ void load_data(char* reference_name, char* statName, char* imdName) { if (verbose) { printf("Loading Data is finished!\n"); } } -// arr should be cumulative! 
-// interval : [,) -// random number should be in [0, arr[len - 1]) -// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1 -int sample(vector& arr, int len) { - int l, r, mid; - double prb = rg() * arr[len - 1]; - - l = 0; r = len - 1; - while (l <= r) { - mid = (l + r) / 2; - if (arr[mid] <= prb) l = mid + 1; - else r = mid - 1; - } - - if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); } - assert(l < len); - - return l; -} - void init() { int len, fr, to; diff --git a/PairedEndModel.h b/PairedEndModel.h index c011cee..70d14c9 100644 --- a/PairedEndModel.h +++ b/PairedEndModel.h @@ -230,25 +230,25 @@ void PairedEndModel::estimateFromReads(const char* readFN) { for (int i = 0; i < 3; i++) if (N[i] > 0) { genReadFileNames(readFN, i, read_type, s, readFs); - ReadReader reader(s, readFs); + ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function int cnt = 0; while (reader.next(read)) { SingleRead mate1 = read.getMate1(); SingleRead mate2 = read.getMate2(); - if (!read.isLowQuality()) { - mld->update(mate1.getReadLength(), 1.0); - mld->update(mate2.getReadLength(), 1.0); + if (!read.isLowQuality()) { + mld->update(mate1.getReadLength(), 1.0); + mld->update(mate2.getReadLength(), 1.0); - if (i == 0) { - npro->updateC(mate1.getReadSeq()); - npro->updateC(mate2.getReadSeq()); - } - } - else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) { - printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN); - } + if (i == 0) { + npro->updateC(mate1.getReadSeq()); + npro->updateC(mate2.getReadSeq()); + } + } + else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) { + printf("Warning: Read %s is ignored due to at least one of the mates' length < seed length %d!\n", read.getName().c_str(), seedLen); + } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS 
PROCESSED\n", cnt); } @@ -407,39 +407,39 @@ void PairedEndModel::finishSimulation() { } void PairedEndModel::calcMW() { - assert(seedLen >= OLEN && mld->getMinL() >= seedLen); - - memset(mw, 0, sizeof(double) * (M + 1)); - mw[0] = 1.0; - - for (int i = 1; i <= M; i++) { - RefSeq& ref = refs->getRef(i); - int totLen = ref.getTotLen(); - int fullLen = ref.getFullLen(); - int end = std::min(fullLen, totLen - gld->getMinL() + 1); - double value = 0.0; - int minL, maxL; - int effL, pfpos; - - //seedPos is fpos here - for (int seedPos = 0; seedPos < end; seedPos++) - if (ref.getMask(seedPos)) { - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), totLen - seedPos); - pfpos = seedPos; - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - effL = std::min(fullLen, totLen - fragLen + 1); - value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen); + assert(mld->getMinL() >= seedLen); + + memset(mw, 0, sizeof(double) * (M + 1)); + mw[0] = 1.0; + + for (int i = 1; i <= M; i++) { + RefSeq& ref = refs->getRef(i); + int totLen = ref.getTotLen(); + int fullLen = ref.getFullLen(); + int end = std::min(fullLen, totLen - gld->getMinL() + 1); + double value = 0.0; + int minL, maxL; + int effL, pfpos; + + //seedPos is fpos here + for (int seedPos = 0; seedPos < end; seedPos++) + if (ref.getMask(seedPos)) { + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), totLen - seedPos); + pfpos = seedPos; + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + effL = std::min(fullLen, totLen - fragLen + 1); + value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen); + } + } + + mw[i] = 1.0 - value; + + if (mw[i] < 1e-8) { + //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); + mw[i] = 0.0; + } } - } - - mw[i] = 1.0 - value; - - if (mw[i] < 1e-8) { - //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); - mw[i] = 0.0; - } 
- } } #endif /* PAIREDENDMODEL_H_ */ diff --git a/PairedEndQModel.h b/PairedEndQModel.h index cbc5475..2c638ab 100644 --- a/PairedEndQModel.h +++ b/PairedEndQModel.h @@ -236,28 +236,28 @@ void PairedEndQModel::estimateFromReads(const char* readFN) { for (int i = 0; i < 3; i++) if (N[i] > 0) { genReadFileNames(readFN, i, read_type, s, readFs); - ReadReader reader(s, readFs); + ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function int cnt = 0; while (reader.next(read)) { SingleReadQ mate1 = read.getMate1(); SingleReadQ mate2 = read.getMate2(); - if (!read.isLowQuality()) { - mld->update(mate1.getReadLength(), 1.0); - mld->update(mate2.getReadLength(), 1.0); + if (!read.isLowQuality()) { + mld->update(mate1.getReadLength(), 1.0); + mld->update(mate2.getReadLength(), 1.0); - qd->update(mate1.getQScore()); - qd->update(mate2.getQScore()); + qd->update(mate1.getQScore()); + qd->update(mate2.getQScore()); - if (i == 0) { - nqpro->updateC(mate1.getReadSeq(), mate1.getQScore()); - nqpro->updateC(mate2.getReadSeq(), mate2.getQScore()); - } - } - else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) { - printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN); - } + if (i == 0) { + nqpro->updateC(mate1.getReadSeq(), mate1.getQScore()); + nqpro->updateC(mate2.getReadSeq(), mate2.getQScore()); + } + } + else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) { + printf("Warning: Read %s is ignored due to at least one of the mates' length < seed length %d!\n", read.getName().c_str(), seedLen); + } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } @@ -427,39 +427,39 @@ void PairedEndQModel::finishSimulation() { void PairedEndQModel::calcMW() { - assert(seedLen >= OLEN && mld->getMinL() >= seedLen); - - memset(mw, 0, sizeof(double) * (M + 1)); - mw[0] = 1.0; - - for (int i = 1; i 
<= M; i++) { - RefSeq& ref = refs->getRef(i); - int totLen = ref.getTotLen(); - int fullLen = ref.getFullLen(); - int end = std::min(fullLen, totLen - gld->getMinL() + 1); - double value = 0.0; - int minL, maxL; - int effL, pfpos; - - //seedPos is fpos here - for (int seedPos = 0; seedPos < end; seedPos++) - if (ref.getMask(seedPos)) { - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), totLen - seedPos); - pfpos = seedPos; - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - effL = std::min(fullLen, totLen - fragLen + 1); - value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen); - } - } + assert(mld->getMinL() >= seedLen); + + memset(mw, 0, sizeof(double) * (M + 1)); + mw[0] = 1.0; + + for (int i = 1; i <= M; i++) { + RefSeq& ref = refs->getRef(i); + int totLen = ref.getTotLen(); + int fullLen = ref.getFullLen(); + int end = std::min(fullLen, totLen - gld->getMinL() + 1); + double value = 0.0; + int minL, maxL; + int effL, pfpos; + + //seedPos is fpos here + for (int seedPos = 0; seedPos < end; seedPos++) + if (ref.getMask(seedPos)) { + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), totLen - seedPos); + pfpos = seedPos; + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + effL = std::min(fullLen, totLen - fragLen + 1); + value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen); + } + } - mw[i] = 1.0 - value; + mw[i] = 1.0 - value; - if (mw[i] < 1e-8) { - // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); - mw[i] = 0.0; - } - } + if (mw[i] < 1e-8) { + // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); + mw[i] = 0.0; + } + } } #endif /* PAIREDENDQMODEL_H_ */ diff --git a/PairedEndRead.h b/PairedEndRead.h index 6504e61..01d3d00 100644 --- a/PairedEndRead.h +++ b/PairedEndRead.h @@ -9,30 +9,28 @@ #include "SingleRead.h" class PairedEndRead : public Read { - public: - 
PairedEndRead() : mate1(), mate2() {} - PairedEndRead(const SingleRead& mate1, const SingleRead& mate2) { - this->mate1 = mate1; - this->mate2 = mate2; - this->name = mate1.getName(); - - calc_lq(); - } - - bool read(int argc, std::istream* argv[], int flags = 7); - void write(int argc, std::ostream* argv[]); - - const SingleRead& getMate1() const { return mate1; } - const SingleRead& getMate2() const { return mate2; } - const SingleRead& getMate(int i) const { - if (i == 1) return mate1; - else return mate2; - } - - private: - SingleRead mate1, mate2; - - void calc_lq(); +public: + PairedEndRead() : mate1(), mate2() {} + PairedEndRead(const SingleRead& mate1, const SingleRead& mate2) { + this->mate1 = mate1; + this->mate2 = mate2; + this->name = mate1.getName(); + } + + bool read(int argc, std::istream* argv[], int flags = 7); + void write(int argc, std::ostream* argv[]); + + const SingleRead& getMate1() const { return mate1; } + const SingleRead& getMate2() const { return mate2; } + const SingleRead& getMate(int i) const { + if (i == 1) return mate1; + else return mate2; + } + + void calc_lq(bool, int); // calculate if this read is low quality. 
Without calling this function, isLowQuality() will always be false + +private: + SingleRead mate1, mate2; }; bool PairedEndRead::read(int argc, std::istream* argv[], int flags) { @@ -45,8 +43,6 @@ bool PairedEndRead::read(int argc, std::istream* argv[], int flags) { name = ""; if (flags & 4) { name = mate1.getName(); } //May chop 1 char later if we want - if (flags & 1) calc_lq(); - return success; } @@ -59,9 +55,13 @@ void PairedEndRead::write(int argc, std::ostream *argv[]) { mate2.write(1, outMate2); } -void PairedEndRead::calc_lq() { - low_quality = mate1.isLowQuality() && mate2.isLowQuality(); - if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true; +//calculate if this read is low quality +void PairedEndRead::calc_lq(bool hasPolyA, int seedLen) { + low_quality = false; + mate1.calc_lq(hasPolyA, seedLen); + mate2.calc_lq(hasPolyA, seedLen); + if (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen) low_quality = true; + else low_quality = mate1.isLowQuality() && mate2.isLowQuality(); } #endif diff --git a/PairedEndReadQ.h b/PairedEndReadQ.h index 36b703b..7513820 100644 --- a/PairedEndReadQ.h +++ b/PairedEndReadQ.h @@ -15,8 +15,6 @@ public: this->mate1 = mate1; this->mate2 = mate2; this->name = mate1.getName(); - - calc_lq(); } bool read(int argc, std::istream* argv[], int flags = 7); @@ -29,10 +27,10 @@ public: else return mate2; } + void calc_lq(bool, int); // calculate if this read is low quality. 
Without calling this function, isLowQuality() will always be false + private: SingleReadQ mate1, mate2; - - void calc_lq(); }; bool PairedEndReadQ::read(int argc, std::istream* argv[], int flags) { @@ -45,8 +43,6 @@ bool PairedEndReadQ::read(int argc, std::istream* argv[], int flags) { name = ""; if (flags & 4) { name = mate1.getName(); } //May chop 1 char later if we want - if (flags & 1) calc_lq(); - return success; } @@ -59,9 +55,13 @@ void PairedEndReadQ::write(int argc, std::ostream* argv[]) { mate2.write(1, outMate2); } -void PairedEndReadQ::calc_lq() { - low_quality = mate1.isLowQuality() && mate2.isLowQuality(); - if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true; +//calculate if this read is low quality +void PairedEndReadQ::calc_lq(bool hasPolyA, int seedLen) { + low_quality = false; + mate1.calc_lq(hasPolyA, seedLen); + mate2.calc_lq(hasPolyA, seedLen); + if (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen) low_quality = true; + else low_quality = mate1.isLowQuality() && mate2.isLowQuality(); } #endif /* PAIREDENDREADQ_H_ */ diff --git a/README.md b/README.md index 7ef198c..b46826a 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,11 @@ levels from RNA-Seq data. The new RSEM package (rsem-1.x) provides an user-friendly interface, supports threads for parallel computation of the EM algorithm, single-end and paired-end read data, quality scores, variable-length reads and RSPD estimation. It can also generate -genomic-coordinate BAM files and UCSC wiggle files for visualization. In -addition, it provides posterior mean and 95% credibility interval -estimates for expression levels. +genomic-coordinate BAM files and UCSC wiggle files for +visualization. In addition, it provides posterior mean and 95% +credibility interval estimates for expression levels. For +visualization, it can also generate transcript-coordinate BAM files +and visualize them and also models learned. 
## Compilation & Installation @@ -82,8 +84,8 @@ documentation page](http://deweylab.biostat.wisc.edu/rsem/rsem-calculate-express #### Calculating expression values from single-end data For single-end models, users have the option of providing a fragment -length distribution via the --fragment-length-mean and ---fragment-length-sd options. The specification of an accurate fragment +length distribution via the '--fragment-length-mean' and +'--fragment-length-sd' options. The specification of an accurate fragment length distribution is important for the accuracy of expression level estimates from single-end data. If the fragment length mean and sd are not provided, RSEM will not take a fragment length distribution into @@ -94,12 +96,21 @@ consideration. By default, RSEM automates the alignment of reads to reference transcripts using the Bowtie alignment program. To use an alternative alignment program, align the input reads against the file -'reference_name.idx.fa' generated by rsem-prepare-reference, and format +'reference_name.idx.fa' generated by 'rsem-prepare-reference', and format the alignment output in SAM or BAM format. Then, instead of providing -reads to rsem-calculate-expression, specify the --sam or --bam option +reads to 'rsem-calculate-expression', specify the '--sam' or '--bam' option and provide the SAM or BAM file as an argument. When using an -alternative aligner, you may also want to provide the --no-bowtie option -to rsem-prepare-reference so that the Bowtie indices are not built. +alternative aligner, you may also want to provide the '--no-bowtie' option +to 'rsem-prepare-reference' so that the Bowtie indices are not built. + +Some aligners' (other than Bowtie) output might need to be converted +so that RSEM can use. For conversion, please run + + convert-sam-for-rsem --help + +to get usage information or visit the [convert-sam-for-rsem +documentation +page](http://deweylab.biostat.wisc.edu/rsem/convert-sam-for-rsem.html). 
However, please note that RSEM does ** not ** support gapped alignments. So make sure that your aligner does not produce alignments @@ -109,18 +120,24 @@ aligner's indices. ### III. Visualization -RSEM contains a version of samtools in the 'sam' subdirectory. When -users specify the --out-bam option RSEM will produce three files: -'sample_name.bam', the unsorted BAM file, 'sample_name.sorted.bam' and -'sample_name.sorted.bam.bai' the sorted BAM file and indices generated -by the samtools included. +RSEM contains a version of samtools in the 'sam' subdirectory. RSEM +will always produce three files:'sample_name.transcript.bam', the +unsorted BAM file, 'sample_name.transcript.sorted.bam' and +'sample_name.transcript.sorted.bam.bai' the sorted BAM file and +indices generated by the samtools included. All three files are in +transcript coordinates. When users specify the --output-genome-bam +option RSEM will produce three files: 'sample_name.genome.bam', the +unsorted BAM file, 'sample_name.genome.sorted.bam' and +'sample_name.genome.sorted.bam.bai' the sorted BAM file and indices +generated by the samtools included. All these files are in genomic +coordinates. #### a) Generating a UCSC Wiggle file A wiggle plot representing the expected number of reads overlapping -each position in the genome can be generated from the sorted BAM file -output. To generate the wiggle plot, run the 'rsem-bam2wig' program on -the 'sample_name.sorted.bam' file. +each position in the genome can be generated from the sorted genome +BAM file output. To generate the wiggle plot, run the 'rsem-bam2wig' +program on the 'sample_name.genome.sorted.bam' file. Usage: @@ -134,16 +151,26 @@ wiggle_name: the name the user wants to use for this wiggle plot Refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html). 
-#### c) Visualize the model learned by RSEM +#### c) Generating Transcript Wiggle Plots + +To generate transcript wiggle plots, you should run the +'rsem-plot-transcript-wiggles' program. Run + + rsem-plot-transcript-wiggles --help + +to get usage information or visit the [rsem-plot-transcript-wiggles +documentation page](http://deweylab.biostat.wisc.edu/rsem/rsem-plot-transcript-wiggles.html). + +#### d) Visualize the model learned by RSEM RSEM provides an R script, 'rsem-plot-model', for visulazing the model learned. Usage: - rsem-plot-model sample_name outF + rsem-plot-model sample_name output_plot_file sample_name: the name of the sample analyzed -outF: the file name for plots generated from the model. It is a pdf file +output_plot_file: the file name for plots generated from the model. It is a pdf file The plots generated depends on read type and user configuration. It may include fragment length distribution, mate length distribution, @@ -164,23 +191,28 @@ Histogram of reads with different number of alignments: x-axis is the number of ## Example -Suppose we download the mouse genome from UCSC Genome Browser. We will -use a reference_name of 'mm9'. We have a FASTQ-formatted file, -'mmliver.fq', containing single-end reads from one sample, which we call -'mmliver_single_quals'. We want to estimate expression values by using -the single-end model with a fragment length distribution. We know that -the fragment length distribution is approximated by a normal -distribution with a mean of 150 and a standard deviation of 35. We wish -to generate 95% credibility intervals in addition to maximum likelihood -estimates. RSEM will be allowed 1G of memory for the credibility -interval calculation. We will visualize the probabilistic read mappings -generated by RSEM. +Suppose we download the mouse genome from UCSC Genome Browser. We +will use a reference_name of 'mm9'. 
We have a FASTQ-formatted file, +'mmliver.fq', containing single-end reads from one sample, which we +call 'mmliver_single_quals'. We want to estimate expression values by +using the single-end model with a fragment length distribution. We +know that the fragment length distribution is approximated by a normal +distribution with a mean of 150 and a standard deviation of 35. We +wish to generate 95% credibility intervals in addition to maximum +likelihood estimates. RSEM will be allowed 1G of memory for the +credibility interval calculation. We will visualize the probabilistic +read mappings generated by RSEM on UCSC genome browser. We will +generate a list of genes' transcript wiggle plots in 'output.pdf'. The +list is 'gene_ids.txt'. We will visualize the models learned in +'mmliver_single_quals.models.pdf' The commands for this scenario are as follows: rsem-prepare-reference --gtf mm9.gtf --mapping knownIsoforms.txt --bowtie-path /sw/bowtie /data/mm9 /ref/mm9 - rsem-calculate-expression --bowtie-path /sw/bowtie --phred64-quals --fragment-length-mean 150.0 --fragment-length-sd 35.0 -p 8 --out-bam --calc-ci --memory-allocate 1024 /data/mmliver.fq /ref/mm9 mmliver_single_quals + rsem-calculate-expression --bowtie-path /sw/bowtie --phred64-quals --fragment-length-mean 150.0 --fragment-length-sd 35.0 -p 8 --output-genome-bam --calc-ci --memory-allocate 1024 /data/mmliver.fq /ref/mm9 mmliver_single_quals rsem-bam2wig mmliver_single_quals.sorted.bam mmliver_single_quals.sorted.wig mmliver_single_quals + rsem-plot-transcript-wiggles --gene-list --show-unique mmliver_single_quals gene_ids.txt output.pdf + rsem-plot-model mmliver_single_quals mmliver_single_quals.models.pdf ## Simulation diff --git a/ReadReader.h b/ReadReader.h index f1bd1f5..d244efa 100644 --- a/ReadReader.h +++ b/ReadReader.h @@ -19,8 +19,8 @@ template class ReadReader { public: - ReadReader() { s = 0; indices = NULL; arr = NULL; } - ReadReader(int s, char readFs[][STRLEN]); + ReadReader() { s = 0; 
indices = NULL; arr = NULL; hasPolyA = false; seedLen = -1; } + ReadReader(int s, char readFs[][STRLEN], bool hasPolyA = false, int seedLen = -1); ~ReadReader(); void setIndices(ReadIndex** indices) { @@ -31,7 +31,9 @@ public: void reset(); bool next(ReadType& read, int flags = 7) { - return read.read(s, (std::istream**)arr, flags); + bool success = read.read(s, (std::istream**)arr, flags); + if (success && seedLen > 0) { read.calc_lq(hasPolyA, seedLen); } + return success; } private: @@ -39,10 +41,13 @@ private: ReadIndex **indices; std::ifstream** arr; std::streampos *locations; + + bool hasPolyA; + int seedLen; }; template -ReadReader::ReadReader(int s, char readFs[][STRLEN]) { +ReadReader::ReadReader(int s, char readFs[][STRLEN], bool hasPolyA, int seedLen) { assert(s > 0); this->s = s; arr = new std::ifstream*[s]; @@ -53,6 +58,8 @@ ReadReader::ReadReader(int s, char readFs[][STRLEN]) { if (!arr[i]->is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", readFs[i]); exit(-1); } locations[i] = arr[i]->tellg(); } + this->hasPolyA = hasPolyA; + this->seedLen = seedLen; } template diff --git a/RefSeq.h b/RefSeq.h index 0e7f2ca..88b149e 100644 --- a/RefSeq.h +++ b/RefSeq.h @@ -10,132 +10,131 @@ //Each Object can only be used once class RefSeq { - public: - - RefSeq() { - fullLen = totLen = 0; - name = ""; seq = ""; - fmasks.clear(); +public: + RefSeq() { + fullLen = totLen = 0; + name = ""; seq = ""; + fmasks.clear(); + } + + //Constructor , seq : the forward strand of the reference + //tag does not contain ">" + //polyALen : length of polyA tail we add + RefSeq(const std::string& name, const std::string& seq, int polyALen) { + fullLen = seq.length(); + totLen = fullLen + polyALen; + this->name = name; + this->seq = seq; + this->seq.append(polyALen, 'A'); + + assert(fullLen > 0 && totLen >= fullLen); + + int len = (fullLen - 1) / NBITS + 1; + fmasks.assign(len, 0); + // set mask if poly(A) tail is added + if (polyALen > 0) { + for (int i = 
std::max(fullLen - OLEN + 1, 0); i < fullLen; i++) setMask(i); + } } - //Constructor , seq : the forward strand of the reference - //tag does not contain ">" - //polyALen : length of polyA tail we add - RefSeq(const std::string& name, const std::string& seq, int polyALen) { - fullLen = seq.length(); - totLen = fullLen + polyALen; - this->name = name; - this->seq = seq; - this->seq.append(polyALen, 'A'); - - assert(fullLen > 0 && totLen >= fullLen); - - int len = (fullLen - 1) / NBITS + 1; - fmasks.clear(); fmasks.resize(len, 0); - // ask read to be at least OLEN long! - for (int i = std::max(fullLen - OLEN + 1, 0); i < fullLen; i++) setMask(i); - } + RefSeq(const RefSeq& o) { + fullLen = o.fullLen; + totLen = o.totLen; + name = o.name; + seq = o.seq; + fmasks = o.fmasks; + } - RefSeq(const RefSeq& o) { - fullLen = o.fullLen; - totLen = o.totLen; - name = o.name; - seq = o.seq; - fmasks = o.fmasks; - } + RefSeq& operator= (const RefSeq &rhs) { + if (this != &rhs) { + fullLen = rhs.fullLen; + totLen = rhs.totLen; + name = rhs.name; + seq = rhs.seq; + fmasks = rhs.fmasks; + } - RefSeq& operator= (const RefSeq &rhs) { - if (this != &rhs) { - fullLen = rhs.fullLen; - totLen = rhs.totLen; - name = rhs.name; - seq = rhs.seq; - fmasks = rhs.fmasks; - } + return *this; + } - return *this; - } + ~RefSeq() {} - ~RefSeq() { - } + bool read(std::ifstream&, int = 0); + void write(std::ofstream&); - bool read(std::ifstream&, int = 0); - void write(std::ofstream&); + int getFullLen() const { return fullLen; } - int getFullLen() const { return fullLen; } + int getTotLen() const { return totLen; } - int getTotLen() const { return totLen; } + const std::string& getName() const { return name; } - const std::string& getName() const { return name; } + std::string getSeq() const { return seq; } - std::string getSeq() const { return seq; } - - std::string getRSeq() const { - std::string rseq = ""; - for (int i = totLen - 1; i >= 0; i--) rseq.push_back(getCharacter(get_rbase_id(seq[i]))); 
- return rseq; - } - - //get the sequence dir 0 : + 1 : - - std::string getSeq(int dir) const { - return (dir == 0 ? getSeq() : getRSeq()); - } + std::string getRSeq() const { + std::string rseq = ""; + for (int i = totLen - 1; i >= 0; i--) rseq.push_back(getCharacter(get_rbase_id(seq[i]))); + return rseq; + } + //get the sequence dir 0 : + 1 : - + std::string getSeq(int dir) const { + return (dir == 0 ? getSeq() : getRSeq()); + } - int get_id(int pos, int dir) const { - assert(pos >= 0 && pos < totLen); - return (dir == 0 ? get_base_id(seq[pos]) : get_rbase_id(seq[totLen - pos - 1])); - } - - bool getMask(int seedPos) const { - assert(seedPos >= 0 && seedPos < totLen); - return fmasks[seedPos / NBITS] & mask_codes[seedPos % NBITS]; - } - - void setMask(int seedPos) { - assert(seedPos >= 0 && seedPos < totLen); - fmasks[seedPos / NBITS] |= mask_codes[seedPos % NBITS]; - } + int get_id(int pos, int dir) const { + assert(pos >= 0 && pos < totLen); + return (dir == 0 ? get_base_id(seq[pos]) : get_rbase_id(seq[totLen - pos - 1])); + } + + bool getMask(int seedPos) const { + assert(seedPos >= 0 && seedPos < totLen); + return fmasks[seedPos / NBITS] & mask_codes[seedPos % NBITS]; + } + + void setMask(int seedPos) { + assert(seedPos >= 0 && seedPos < totLen); + fmasks[seedPos / NBITS] |= mask_codes[seedPos % NBITS]; + } - private: - int fullLen; // fullLen : the original length of an isoform - int totLen; // totLen : the total length, included polyA tails, if any - std::string name; // the tag - std::string seq; // the raw sequence, in forward strand - std::vector fmasks; // record masks for forward strand, each position occupies 1 bit +private: + int fullLen; // fullLen : the original length of an isoform + int totLen; // totLen : the total length, included polyA tails, if any + std::string name; // the tag + std::string seq; // the raw sequence, in forward strand + std::vector fmasks; // record masks for forward strand, each position occupies 1 bit }; //internal read; 
option 0 : read all 1 : do not read seqences bool RefSeq::read(std::ifstream& fin, int option) { - std::string line; + std::string line; - if (!(fin>>fullLen>>totLen)) return false; - assert(fullLen > 0 && totLen >= fullLen); - getline(fin, line); - if (!getline(fin, name)) return false; - if (!getline(fin, seq)) return false; - - int len = (fullLen - 1) / NBITS + 1; // assume each cell contains NBITS bits - fmasks.resize(len, 0); - for (int i = 0; i < len; i++) - if (!(fin>>fmasks[i])) return false; - getline(fin, line); + if (!(fin>>fullLen>>totLen)) return false; + assert(fullLen > 0 && totLen >= fullLen); + getline(fin, line); + if (!getline(fin, name)) return false; + if (!getline(fin, seq)) return false; + + int len = (fullLen - 1) / NBITS + 1; // assume each cell contains NBITS bits + fmasks.assign(len, 0); + for (int i = 0; i < len; i++) + if (!(fin>>fmasks[i])) return false; + getline(fin, line); - assert(option == 0 || option == 1); - if (option == 1) { seq = ""; } + assert(option == 0 || option == 1); + if (option == 1) { seq = ""; } - return true; + return true; } //write to file in "internal" format void RefSeq::write(std::ofstream& fout) { - fout<& getRefs() { return seqs; } // may be slow, for copying the whole thing + bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added + //lim : >=0 If mismatch > lim , return; -1 find all mismatches int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) { int nMis = 0; // number of mismatches @@ -73,7 +76,7 @@ class Refs { private: int M; // # of isoforms, id starts from 1 std::vector seqs; // reference sequences, starts from 1; 0 is for noise gene - + bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false }; //inpF in fasta format @@ -87,6 +90,7 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) { seqs.push_back(RefSeq()); // noise isoform M = 0; + has_polyA = 
false; fin.open(inpF); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); } @@ -103,6 +107,7 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) { } ++M; seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag))); + has_polyA = has_polyA || seqs[M].getFullLen() < seqs[M].getTotLen(); } fin.close(); @@ -121,6 +126,7 @@ void Refs::loadRefs(char *inpF, int option) { seqs.push_back(RefSeq()); M = 0; + has_polyA = false; bool success; do { @@ -128,6 +134,7 @@ void Refs::loadRefs(char *inpF, int option) { if (success) { seqs.push_back(seq); ++M; + has_polyA = has_polyA || seq.getFullLen() < seq.getTotLen(); } } while (success); diff --git a/SingleModel.h b/SingleModel.h index 97ffd29..59db6ec 100644 --- a/SingleModel.h +++ b/SingleModel.h @@ -269,15 +269,17 @@ void SingleModel::estimateFromReads(const char* readFN) { for (int i = 0; i < 3; i++) if (N[i] > 0) { genReadFileNames(readFN, i, read_type, s, readFs); - ReadReader reader(s, readFs); + ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function int cnt = 0; while (reader.next(read)) { if (!read.isLowQuality()) { - mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); - if (i == 0) { npro->updateC(read.getReadSeq()); } + mld != NULL ? 
mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); + if (i == 0) { npro->updateC(read.getReadSeq()); } + } + else if (verbose && read.getReadLength() < seedLen) { + printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen); } - else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } @@ -443,68 +445,67 @@ void SingleModel::finishSimulation() { } void SingleModel::calcMW() { - double probF, probR; + double probF, probR; - assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); - - memset(mw, 0, sizeof(double) * (M + 1)); - mw[0] = 1.0; - + assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); - probF = ori->getProb(0); - probR = ori->getProb(1); - - for (int i = 1; i <= M; i++) { - RefSeq& ref = refs->getRef(i); - int totLen = ref.getTotLen(); - int fullLen = ref.getFullLen(); - double value = 0.0; - int minL, maxL; - int effL, pfpos; - int end = std::min(fullLen, totLen - seedLen + 1); - double factor; - - for (int seedPos = 0; seedPos < end; seedPos++) - if (ref.getMask(seedPos)) { - //forward - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), totLen - seedPos); - pfpos = seedPos; - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 
1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - //reverse - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), seedPos + seedLen); - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - pfpos = seedPos - (fragLen - seedLen); - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - } - - //for reverse strand masking - for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) { - minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1); - maxL = std::min(gld->getMaxL(), seedPos + seedLen); - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - pfpos = seedPos - (fragLen - seedLen); - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - } + memset(mw, 0, sizeof(double) * (M + 1)); + mw[0] = 1.0; + + probF = ori->getProb(0); + probR = ori->getProb(1); + + for (int i = 1; i <= M; i++) { + RefSeq& ref = refs->getRef(i); + int totLen = ref.getTotLen(); + int fullLen = ref.getFullLen(); + double value = 0.0; + int minL, maxL; + int effL, pfpos; + int end = std::min(fullLen, totLen - seedLen + 1); + double factor; + + for (int seedPos = 0; seedPos < end; seedPos++) + if (ref.getMask(seedPos)) { + //forward + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), totLen - seedPos); + pfpos = seedPos; + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 
1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + //reverse + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), seedPos + seedLen); + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + pfpos = seedPos - (fragLen - seedLen); + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + } - mw[i] = 1.0 - value; + //for reverse strand masking + for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) { + minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1); + maxL = std::min(gld->getMaxL(), seedPos + seedLen); + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + pfpos = seedPos - (fragLen - seedLen); + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 
1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + } + + mw[i] = 1.0 - value; - if (mw[i] < 1e-8) { - // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); - mw[i] = 0.0; - } - } + if (mw[i] < 1e-8) { + // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); + mw[i] = 0.0; + } + } } #endif /* SINGLEMODEL_H_ */ diff --git a/SingleQModel.h b/SingleQModel.h index 5d4191a..786d647 100644 --- a/SingleQModel.h +++ b/SingleQModel.h @@ -279,16 +279,18 @@ void SingleQModel::estimateFromReads(const char* readFN) { for (int i = 0; i < 3; i++) if (N[i] > 0) { genReadFileNames(readFN, i, read_type, s, readFs); - ReadReader reader(s, readFs); + ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function int cnt = 0; while (reader.next(read)) { if (!read.isLowQuality()) { - mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); - qd->update(read.getQScore()); - if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } + mld != NULL ? 
mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); + qd->update(read.getQScore()); + if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } + } + else if (verbose && read.getReadLength() < seedLen) { + printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen); } - else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); } ++cnt; if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } @@ -464,67 +466,67 @@ void SingleQModel::finishSimulation() { } void SingleQModel::calcMW() { - double probF, probR; - - assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); - - memset(mw, 0, sizeof(double) * (M + 1)); - mw[0] = 1.0; - - probF = ori->getProb(0); - probR = ori->getProb(1); - - for (int i = 1; i <= M; i++) { - RefSeq& ref = refs->getRef(i); - int totLen = ref.getTotLen(); - int fullLen = ref.getFullLen(); - double value = 0.0; - int minL, maxL; - int effL, pfpos; - int end = std::min(fullLen, totLen - seedLen + 1); - double factor; - - for (int seedPos = 0; seedPos < end; seedPos++) - if (ref.getMask(seedPos)) { - //forward - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), totLen - seedPos); - pfpos = seedPos; - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - //reverse - minL = gld->getMinL(); - maxL = std::min(gld->getMaxL(), seedPos + seedLen); - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - pfpos = seedPos - (fragLen - seedLen); - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 
1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + double probF, probR; + + assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen); + + memset(mw, 0, sizeof(double) * (M + 1)); + mw[0] = 1.0; + + probF = ori->getProb(0); + probR = ori->getProb(1); + + for (int i = 1; i <= M; i++) { + RefSeq& ref = refs->getRef(i); + int totLen = ref.getTotLen(); + int fullLen = ref.getFullLen(); + double value = 0.0; + int minL, maxL; + int effL, pfpos; + int end = std::min(fullLen, totLen - seedLen + 1); + double factor; + + for (int seedPos = 0; seedPos < end; seedPos++) + if (ref.getMask(seedPos)) { + //forward + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), totLen - seedPos); + pfpos = seedPos; + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + //reverse + minL = gld->getMinL(); + maxL = std::min(gld->getMaxL(), seedPos + seedLen); + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + pfpos = seedPos - (fragLen - seedLen); + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 
1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + } + + //for reverse strand masking + for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) { + minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1); + maxL = std::min(gld->getMaxL(), seedPos + seedLen); + for (int fragLen = minL; fragLen <= maxL; fragLen++) { + pfpos = seedPos - (fragLen - seedLen); + effL = std::min(fullLen, totLen - fragLen + 1); + factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); + value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; + } + } + + mw[i] = 1.0 - value; + + if (mw[i] < 1e-8) { + // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); + mw[i] = 0.0; + } } - } - - //for reverse strand masking - for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) { - minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1); - maxL = std::min(gld->getMaxL(), seedPos + seedLen); - for (int fragLen = minL; fragLen <= maxL; fragLen++) { - pfpos = seedPos - (fragLen - seedLen); - effL = std::min(fullLen, totLen - fragLen + 1); - factor = (mld == NULL ? 
1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); - value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; - } - } - - mw[i] = 1.0 - value; - - if (mw[i] < 1e-8) { - // fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i); - mw[i] = 0.0; - } - } } #endif /* SINGLEQMODEL_H_ */ diff --git a/SingleRead.h b/SingleRead.h index 3c61242..8ea4eec 100644 --- a/SingleRead.h +++ b/SingleRead.h @@ -12,77 +12,81 @@ #include "Read.h" class SingleRead : public Read { - public: - SingleRead() { readseq = ""; len = 0; } - SingleRead(const std::string& name, const std::string& readseq) { - this->name = name; - this->readseq = readseq; - this->len = readseq.length(); - calc_lq(); - } - - bool read(int argc, std::istream* argv[], int flags = 7); - void write(int argc, std::ostream* argv[]); - - const int getReadLength() const { return len; /*readseq.length();*/ } // If need memory and .length() are guaranteed O(1), use statement in /* */ - const std::string& getReadSeq() const { return readseq; } - - private: - int len; // read length - std::string readseq; // read sequence - - void calc_lq(); -}; +public: + SingleRead() { readseq = ""; len = 0; } + SingleRead(const std::string& name, const std::string& readseq) { + this->name = name; + this->readseq = readseq; + this->len = readseq.length(); + } -//If return false, you should not trust the value of any member -bool SingleRead::read(int argc, std::istream* argv[], int flags) { - std::string line; + bool read(int argc, std::istream* argv[], int flags = 7); + void write(int argc, std::ostream* argv[]); - assert(argc == 1); - if (!getline((*argv[0]), line)) return false; - if (line[0] != '>') { fprintf(stderr, "Read file does not look like a FASTA file!"); exit(-1); } - name = ""; - if (flags & 4) { name = line.substr(1); } - if (!getline((*argv[0]), readseq)) return false; - len = readseq.length(); // set read 
length - if (!(flags & 1)) { readseq = ""; } + const int getReadLength() const { return len; /*readseq.length();*/ } // If need memory and .length() are guaranteed O(1), use statement in /* */ + const std::string& getReadSeq() const { return readseq; } - if (flags & 1) calc_lq(); + void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false - return true; +private: + int len; // read length + std::string readseq; // read sequence +}; + +//If return false, you should not trust the value of any member +bool SingleRead::read(int argc, std::istream* argv[], int flags) { + std::string line; + + assert(argc == 1); + if (!getline((*argv[0]), line)) return false; + if (line[0] != '>') { fprintf(stderr, "Read file does not look like a FASTA file!"); exit(-1); } + name = ""; + if (flags & 4) { name = line.substr(1); } + if (!getline((*argv[0]), readseq)) return false; + len = readseq.length(); // set read length + if (!(flags & 1)) { readseq = ""; } + + return true; } void SingleRead::write(int argc, std::ostream* argv[]) { - assert(argc == 1); - (*argv[0])<<">"<"<= len - OLEN) ++numTO; - } - } +//calculate if this read is low quality +void SingleRead::calc_lq(bool hasPolyA, int seedLen) { + low_quality = false; + if (len < seedLen) { low_quality = true; return; } + + // if no polyA, no need to do the following calculation + if (!hasPolyA) return; + + assert(readseq != ""); + + int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region + int threshold_1, threshold_2; + + threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5); + threshold_2 = (OLEN - 1) / 2 + 1; + for (int i = 0; i < len; i++) { + if (readseq[i] == 'A') { + ++numA; + if (i < OLEN) ++numAO; + } + if (readseq[i] == 'T') { + ++numT; + if (i >= len - OLEN) ++numTO; + } + } - if (numA >= threshold_1) { - low_quality = (numAO >= threshold_2); - } - else if (numT >= threshold_1) { - low_quality = (numTO >= 
threshold_2); - } - else low_quality = false; + if (numA >= threshold_1) { + low_quality = (numAO >= threshold_2); + } + else if (numT >= threshold_1) { + low_quality = (numTO >= threshold_2); + } + else low_quality = false; } #endif diff --git a/SingleReadQ.h b/SingleReadQ.h index 976637f..9fd3dd1 100644 --- a/SingleReadQ.h +++ b/SingleReadQ.h @@ -12,50 +12,46 @@ #include "Read.h" class SingleReadQ : public Read { - public: - SingleReadQ() { readseq = qscore = ""; len = 0; } - SingleReadQ(const std::string& name, const std::string& readseq, const std::string& qscore) { - this->name = name; - this->readseq = readseq; - this->qscore = qscore; - this->len = readseq.length(); - - calc_lq(); - } - - bool read(int argc, std::istream* argv[], int flags = 7); - void write(int argc, std::ostream* argv[]); - - int getReadLength() const { return len; } - const std::string& getReadSeq() const { return readseq; } - const std::string& getQScore() const { return qscore; } - - private: - int len; // read length - std::string readseq, qscore; // qscore : quality scores - - void calc_lq(); +public: + SingleReadQ() { readseq = qscore = ""; len = 0; } + SingleReadQ(const std::string& name, const std::string& readseq, const std::string& qscore) { + this->name = name; + this->readseq = readseq; + this->qscore = qscore; + this->len = readseq.length(); + } + + bool read(int argc, std::istream* argv[], int flags = 7); + void write(int argc, std::ostream* argv[]); + + int getReadLength() const { return len; } + const std::string& getReadSeq() const { return readseq; } + const std::string& getQScore() const { return qscore; } + + void calc_lq(bool, int); // calculate if this read is low quality. 
Without calling this function, isLowQuality() will always be false + +private: + int len; // read length + std::string readseq, qscore; // qscore : quality scores }; bool SingleReadQ::read(int argc, std::istream* argv[], int flags) { - std::string line; - - assert(argc == 1); - if (!getline((*argv[0]), line)) return false; - if (line[0] != '@') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); } - name = ""; - if (flags & 4) { name = line.substr(1); } - if (!getline((*argv[0]), readseq)) return false; - len = readseq.length(); - if (!(flags & 1)) { readseq = ""; } - if (!getline((*argv[0]), line)) return false; - if (line[0] != '+') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); } - if (!getline((*argv[0]), qscore)) return false; - if (!(flags & 2)) { qscore = ""; } - - if (flags & 1) calc_lq(); - - return true; + std::string line; + + assert(argc == 1); + if (!getline((*argv[0]), line)) return false; + if (line[0] != '@') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); } + name = ""; + if (flags & 4) { name = line.substr(1); } + if (!getline((*argv[0]), readseq)) return false; + len = readseq.length(); + if (!(flags & 1)) { readseq = ""; } + if (!getline((*argv[0]), line)) return false; + if (line[0] != '+') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); } + if (!getline((*argv[0]), qscore)) return false; + if (!(flags & 2)) { qscore = ""; } + + return true; } void SingleReadQ::write(int argc, std::ostream* argv[]) { @@ -63,32 +59,39 @@ void SingleReadQ::write(int argc, std::ostream* argv[]) { (*argv[0])<<"@"<= len - OLEN) ++numTO; - } - } - - if (numA >= threshold_1) { - low_quality = (numAO >= threshold_2); - } - else if (numT >= threshold_1) { - low_quality = (numTO >= threshold_2); - } - else low_quality = false; +//calculate if this read is low quality +void SingleReadQ::calc_lq(bool hasPolyA, int seedLen) { + low_quality = false; + if (len < 
seedLen) { low_quality = true; return; } + + // if no polyA, no need to do the following calculation + if (!hasPolyA) return; + + assert(readseq != ""); + + int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region + int threshold_1, threshold_2; + + threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5); + threshold_2 = (OLEN - 1) / 2 + 1; + for (int i = 0; i < len; i++) { + if (readseq[i] == 'A') { + ++numA; + if (i < OLEN) ++numAO; + } + if (readseq[i] == 'T') { + ++numT; + if (i >= len - OLEN) ++numTO; + } + } + + if (numA >= threshold_1) { + low_quality = (numAO >= threshold_2); + } + else if (numT >= threshold_1) { + low_quality = (numTO >= threshold_2); + } + else low_quality = false; } #endif diff --git a/bam2readdepth.cpp b/bam2readdepth.cpp new file mode 100644 index 0000000..c7f0adb --- /dev/null +++ b/bam2readdepth.cpp @@ -0,0 +1,13 @@ +#include +#include "wiggle.h" + +int main(int argc, char* argv[]) { + if (argc != 2) { + printf("Usage: rsem-bam2readdepth sorted_bam_input\n"); + std::exit(1); + } + ReadDepthWriter depth_writer(std::cout); + build_wiggles(argv[1], depth_writer); + + return 0; +} diff --git a/bam2wig.cpp b/bam2wig.cpp index fcb86b3..b03d0b3 100644 --- a/bam2wig.cpp +++ b/bam2wig.cpp @@ -1,103 +1,17 @@ -#include -#include -#include -#include -#include +#include -#include "sam/bam.h" -#include "sam/sam.h" +#include "wiggle.h" using namespace std; -samfile_t *bam_in; -bam1_t *b; - -int cur_tid; //current tid; -float *wig_arr; // wiggle array -FILE *fo; - -void generateWiggle(int tid) { - int chr_len = bam_in->header->target_len[tid]; - char *chr_name = bam_in->header->target_name[tid]; - int sp, ep; - - sp = ep = -1; - for (int i = 0; i < chr_len; i++) { - if (wig_arr[i] > 0) { - ep = i; - } - else { - if (sp < ep) { - ++sp; - fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", chr_name, sp + 1); - for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wig_arr[j]); - } - sp = i; - } - } - if (sp < ep) { 
- ++sp; - fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", chr_name, sp + 1); - for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wig_arr[j]); - } -} - int main(int argc, char* argv[]) { - int cnt = 0; - if (argc != 4) { - printf("Usage : rsem-bam2wig sorted_bam_input wig_output wiggle_name\n"); + printf("Usage: rsem-bam2wig sorted_bam_input wig_output wiggle_name\n"); exit(-1); } - bam_in = samopen(argv[1], "rb", NULL); - if (bam_in == 0) { fprintf(stderr, "Cannot open %s!\n", argv[1]); exit(-1); } - //assert(bam_in != 0); - b = bam_init1(); - - fo = fopen(argv[2], "w"); - fprintf(fo, "track type=wiggle_0 name=\"%s\" description=\"%s\" visibility=full\n", argv[3], argv[3]); - - cur_tid = -1; - wig_arr = NULL; - while (samread(bam_in, b) >= 0) { - if (b->core.tid != cur_tid) { - if (cur_tid >= 0) generateWiggle(cur_tid); - cur_tid = b->core.tid; - size_t len = sizeof(float) * bam_in->header->target_len[cur_tid]; - wig_arr = (float*)realloc(wig_arr, len); - memset(wig_arr, 0, len); - } - - float w = bam_aux2f(bam_aux_get(b, "ZW")); - int pos = b->core.pos; - uint32_t *p = bam1_cigar(b); - - for (int i = 0; i < (int)b->core.n_cigar; i++, ++p) { - int op = *p & BAM_CIGAR_MASK; - int op_len = *p >> BAM_CIGAR_SHIFT; - - switch (op) { - //case BAM_CSOFT_CLIP : pos += op_len; break; - case BAM_CINS : pos += op_len; break; - case BAM_CMATCH : - for (int j = 0; j < op_len; j++, ++pos) wig_arr[pos] += w; - break; - case BAM_CREF_SKIP : pos += op_len; break; - default : assert(false); - } - } - - ++cnt; - if (cnt % 1000000 == 0) printf("%d FIN\n", cnt); - } - if (cur_tid >= 0) generateWiggle(cur_tid); - free(wig_arr); - - samclose(bam_in); - bam_destroy1(b); - - fclose(fo); + UCSCWiggleTrackWriter track_writer(argv[2], argv[3]); + build_wiggles(argv[1], track_writer); return 0; } diff --git a/bc_aux.h b/bc_aux.h new file mode 100644 index 0000000..0527e7e --- /dev/null +++ b/bc_aux.h @@ -0,0 +1,120 @@ +#ifndef BC_AUX_H_ +#define BC_AUX_H_ + +#include + +#include 
+#include "sam/bam.h" + +struct SingleEndT { + bam1_t *b; + + SingleEndT(bam1_t *b) { + this->b = b; + } + + int getSign(bool value) const { return value ? -1 : 1; } + + int compare(const SingleEndT& o) const { + int strand1, strand2; + uint32_t *p1, *p2; + + if (b->core.tid != o.b->core.tid) return getSign(b->core.tid < o.b->core.tid); + if (b->core.pos != o.b->core.pos) return getSign(b->core.pos < o.b->core.pos); + strand1 = b->core.flag & 0x0010; strand2 = o.b->core.flag & 0x0010; + if (strand1 != strand2) return getSign(strand1 < strand2); + if (b->core.n_cigar != o.b->core.n_cigar) return getSign(b->core.n_cigar < o.b->core.n_cigar); + p1 = bam1_cigar(b); p2 = bam1_cigar(o.b); + for (int i = 0; i < (int)b->core.n_cigar; i++) { + if (*p1 != *p2) return getSign(*p1 < *p2); + ++p1; ++p2; + } + + return 0; + } + + bool operator< (const SingleEndT& o) const { + return compare(o) < 0; + } +}; + +struct PairedEndT { + SingleEndT mate1, mate2; + + PairedEndT(const SingleEndT& mate1, const SingleEndT& mate2) : mate1(mate1), mate2(mate2) { + } + + bool operator< (const PairedEndT& o) const { + int value = mate1.compare(o.mate1); + return value < 0 || value == 0 && mate2 < o.mate2; + } +}; + +class CollapseMap { +public: + CollapseMap() { isPaired = false; smap.clear(); pmap.clear(); } + + void init(bool isPaired) { + this->isPaired = isPaired; + isPaired ? 
pmap.clear() : smap.clear(); + } + + void insert(bam1_t *b, bam1_t *b2, float prb) { + if (!isPaired) { + smapIter = smap.find(SingleEndT(b)); + if (smapIter == smap.end()) { smap[SingleEndT(bam_dup1(b))] = prb; } + else smapIter->second += prb; + } + else { + pmapIter = pmap.find(PairedEndT(SingleEndT(b), SingleEndT(b2))); + if (pmapIter == pmap.end()) { pmap[PairedEndT(SingleEndT(bam_dup1(b)), SingleEndT(bam_dup1(b2)))] = prb; } + else pmapIter->second += prb; + } + } + + //once this function is called, "insert" cannot be called anymore + bool empty(bool& par) { + bool value; + + par = isPaired; + if (!isPaired) { value = smap.empty(); smapIter = smap.begin(); } + else { value = pmap.empty(); pmapIter = pmap.begin(); } + + return value; + } + + bool next(bam1_t*& b, bam1_t*& b2, float& prb) { + bool value; + + if (!isPaired) { + value = smapIter != smap.end(); + if (value) { + b = smapIter->first.b; + prb = smapIter->second; + smapIter++; + } + } + else { + value = pmapIter != pmap.end(); + if (value) { + b = pmapIter->first.mate1.b; + b2 = pmapIter->first.mate2.b; + prb = pmapIter->second; + pmapIter++; + } + } + + return value; + } + +private: + bool isPaired; + + std::map smap; + std::map::iterator smapIter; + + std::map pmap; + std::map::iterator pmapIter; +}; + +#endif /* BC_AUX_H_ */ diff --git a/convert_sam_for_rsem b/convert_sam_for_rsem new file mode 100755 index 0000000..a6bc124 --- /dev/null +++ b/convert_sam_for_rsem @@ -0,0 +1,184 @@ +#!/usr/bin/perl + +use Getopt::Long; +use Pod::Usage; +use strict; + +my $standard_output = "&STDOUT"; + +my $out_file = $standard_output; +my @tmp_dirs = (); +my $help = 0; + +GetOptions("o=s" => \$out_file, + "T|temporary-directory=s" => \@tmp_dirs, + "h|help" => \$help) or pd2usage(-exitval => 2, -verbose => 2); + + +pod2usage(-verbose => 2) if ($help == 1); +pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2); + +my $command; +my (@fields, @header) = (); +my $M; + 
+# Load fields +my @lines = (); +my $type; + +open(INPUT, "$ARGV[0].ti"); +@lines = ; +chomp(@lines); +close(INPUT); + +@fields = (); +($M, $type) = split(/ /, $lines[0]); +for (my $i = 0; $i < $M; $i++) { + push(@fields, "SN:$lines[$i * 6 + 1]"); +} + + +# Reorder header +my $line; + +open(INPUT, $ARGV[1]); +@header = (); +while (($line = ) && substr($line, 0, 1) eq '@') { + chomp($line); + push(@header, $line); +} +close(INPUT); + +my $n = scalar(@header); +if ($n > 0) { + my %hash = (); + my @ktable = (); + + my $tid = 0; + + for (my $i = 0; $i < $n; $i++) { + my @arr = split(/\t/, $header[$i]); + if ($arr[0] ne "\@SQ") { push(@ktable, ""); next; } + my $hasSN = 0; + foreach my $key (@arr) { + if (substr($key, 0, 3) eq "SN:") { + $hash{$key} = $i; + $hasSN = 1; + last; + } + } + if (!$hasSN) { print STDERR "\"$header[$i]\" does not have a SN tag!\n"; exit(-1); } + push(@ktable, $fields[$tid++]); + } + + if ($tid != $M) { print STDERR "Number of \@SQ lines is not correct!\n"; exit(-1); } + + open(OUTPUT, ">$out_file"); + for (my $i = 0; $i < $n; $i++) { + if ($ktable[$i] eq "") { print OUTPUT $header[$i]; } + else { print OUTPUT $header[$hash{$ktable[$i]}]; } + print OUTPUT "\n"; + } + close(OUTPUT); +} + + +# extract alignment section +$command = "grep ^[^@] $ARGV[1] > $ARGV[1].__temp"; +&runCommand($command); + +# sort and output the alignment section +$command = "sort -k 1,1 -s"; +if (scalar(@tmp_dirs) > 0) { $" = " -T "; $command .= " -T @tmp_dirs"; } +$command .= " $ARGV[1].__temp"; +if ($out_file ne $standard_output) { $command .= " >> $out_file"; } +&runCommand($command); + +# delete temporary files +$command = "rm -f $ARGV[1].__temp"; +&runCommand($command); + +# finish +print STDERR "Conversion is completed successfully!\n"; + +# command, {err_msg} +sub runCommand { + print STDERR $_[0]."\n"; + my $status = system($_[0]); + if ($status != 0) { + my $errmsg; + if (scalar(@_) > 1) { $errmsg = $_[1]; } + else { $errmsg = "\"$command\" failed! 
Plase check if you provide correct parameters/options for the script!"; } + print STDERR $errmsg."\n"; + exit(-1); + } + print STDERR "\n"; +} + +__END__ + +=head1 NAME + +convert_sam_for_rsem + +=head1 SYNOPSIS + +=over + + convert_sam_for_rsem [options] reference_name input_sam + +=back + +=head1 ARGUMENTS + +=over + +=item B + +The name of the reference used. This should be the same name used by 'rsem-prepare-reference'. + +=item B + +The SAM file (*.sam) generated by user's aligner. If the aligner produces a BAM file, please use samtools to convert it to a SAM file (with header information). + +=back + +=head1 OPTIONS + +=over + +=item B<-o> + +Output the converted SAM file into . (Default: STDOUT) + +=item B<-T/--temporary-directory> + +'convert_sam_for_rsem' will call 'sort' command and this is the '-T/--temporary-directory' option of 'sort' command. The following is the description from 'sort' : "use DIR for temporaries, not $TMPDIR or /tmp; multiple options specify multiple directories". + +=item B<-h/--help> + +Show help information. + +=back + +=head1 DESCRIPTION + +This program converts the SAM file generated by user's aligner into a SAM file which RSEM can process. However, users should make sure their aligners use 'reference_name.idx.fa' generated by 'rsem-prepare-reference' as their references. In addition, their aligners should output header information and make two mates of the same alignment adjacent to each other for paired-end data. This program will output the converted file into standard output by default for the purpose of piping. By setting '-o' option, users can make the converted file written into disk. + +Note: You do not need to run this script if Bowtie (not Bowtie 2) is used, or the order of @SQ lines is the same as 'reference_name.idx.fa' and the alignment lines of a same read group together and the mates of the same alignment are adjacent each other for paired-end reads. + +Note: This program can only recognize SAM files. 
See ARGUMENTS section. + +=head1 EXAMPLES + +Suppose reference_name and input_sam are set to '/ref/mouse_125' and 'input.sam'. + +1) Output to standard output and gzip the output to 'input_for_rsem.sam.gz': + + convert_sam_for_rsem /ref/mouse_125 input.sam | gzip > input_for_rsem.sam.gz + +2) Output to 'input_for_rsem.sam' directly: + + convert_sam_for_rsem /ref/mouse_125 input.sam -o input_for_rsem.sam + +=cut diff --git a/extractRef.cpp b/extractRef.cpp index 693af66..3173d65 100644 --- a/extractRef.cpp +++ b/extractRef.cpp @@ -303,7 +303,10 @@ int main(int argc, char* argv[]) { for (int i = 1; i <= M; i++) { if (seqs[i] == "") { - fprintf(stderr, "%s's sequence is empty! You must provide all chromosome files of transcripts which are presented in the .gtf file!\n", transcripts.getTranscriptAt(i).getTranscriptID().c_str()); + const Transcript& transcript = transcripts.getTranscriptAt(i); + fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s, whose information might not be provided! 
\" + "Please check if the chromosome directory is set correctly or the list of chromosome files is complete.\n", \ + transcript.getTranscriptID().c_str(), transcript.getGeneID().c_str()); exit(-1); } } diff --git a/getUnique.cpp b/getUnique.cpp new file mode 100644 index 0000000..8e2ba4e --- /dev/null +++ b/getUnique.cpp @@ -0,0 +1,72 @@ +#include +#include +#include +#include +#include +#include + +#include +#include "sam/bam.h" +#include "sam/sam.h" + +using namespace std; + +string cqname; +samfile_t *in, *out; +bam1_t *b; +vector arr; +bool unaligned; + +void output() { + if (unaligned || arr.size() == 0) return; + bool isPaired = (arr[0]->core.flag & 0x0001); + if (isPaired && arr.size() != 2 || !isPaired && arr.size() != 1) return; + for (int i = 0; i < (int)arr.size(); i++) samwrite(out, arr[i]); +} + +int main(int argc, char* argv[]) { + if (argc != 3) { + printf("Usage: rsem-get-unique unsorted_transcript_bam_input bam_output\n"); + exit(-1); + } + + in = samopen(argv[1], "rb", NULL); + assert(in != 0); + out = samopen(argv[2], "wb", in->header); + assert(out != 0); + + int cnt = 0; + + cqname = ""; + arr.clear(); + b = bam_init1(); + unaligned = false; + + while (samread(in, b) >= 0) { + if (cqname != bam1_qname(b)) { + output(); + cqname = bam1_qname(b); + for (int i = 0; i < (int)arr.size(); i++) bam_destroy1(arr[i]); + arr.clear(); + unaligned = false; + } + + unaligned = unaligned || (b->core.flag & 0x0004); + arr.push_back(bam_dup1(b)); + + ++cnt; + if (cnt % 1000000 == 0) { printf("."); fflush(stdout); } + } + + if (cnt >= 1000000) printf("\n"); + + output(); + + bam_destroy1(b); + samclose(in); + samclose(out); + + printf("done!\n"); + + return 0; +} diff --git a/makefile b/makefile index 97cba95..6b11536 100644 --- a/makefile +++ b/makefile @@ -1,8 +1,7 @@ CC = g++ -#LFLAGS = -Wall -O3 -ffast-math CFLAGS = -Wall -c -I. COFLAGS = -Wall -O3 -ffast-math -c -I. 
-PROGRAMS = rsem-bam2wig rsem-build-read-index rsem-run-em rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-parse-alignments rsem-preref rsem-simulate-reads rsem-run-gibbs rsem-calculate-credibility-intervals +PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth all : build-sam $(PROGRAMS) @@ -62,6 +61,8 @@ rsem-build-read-index : utils.h buildReadIndex.cpp $(CC) -O3 buildReadIndex.cpp -o rsem-build-read-index +simul.h : boost/random.hpp + ReadReader.h : SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h ReadIndex.h SingleModel.h : utils.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h @@ -74,16 +75,35 @@ PairedEndQModel.h : utils.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h HitWrapper.h : HitContainer.h -BamWriter.h : sam/sam.h sam/bam.h utils.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h +sam_rsem_aux.h : sam/bam.h + +sam_rsem_cvt.h : sam/bam.h Transcript.h Transcripts.h + +BamWriter.h : sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h + +sampling.h : boost/random.hpp rsem-run-em : EM.o sam/libbam.a $(CC) -o rsem-run-em EM.o sam/libbam.a -lz -lpthread -EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h EM.cpp simul.h 
+EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h sampling.h boost/random.hpp EM.cpp $(CC) $(COFLAGS) EM.cpp -rsem-bam2wig : sam/bam.h sam/sam.h sam/libbam.a bam2wig.cpp - $(CC) -O3 -Wall bam2wig.cpp sam/libbam.a -lz -o rsem-bam2wig +bc_aux.h : sam/bam.h + +BamConverter.h : utils.h sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h bc_aux.h Transcript.h Transcripts.h + +rsem-tbam2gbam : utils.h Transcripts.h Transcript.h bc_aux.h BamConverter.h sam/sam.h sam/bam.h sam/libbam.a sam_rsem_aux.h sam_rsem_cvt.h tbam2gbam.cpp sam/libbam.a + $(CC) -O3 -Wall tbam2gbam.cpp sam/libbam.a -lz -o $@ + +rsem-bam2wig : wiggle.h wiggle.o sam/libbam.a bam2wig.cpp + $(CC) -O3 -Wall bam2wig.cpp wiggle.o sam/libbam.a -lz -o $@ + +rsem-bam2readdepth : wiggle.h wiggle.o sam/libbam.a bam2readdepth.cpp + $(CC) -O3 -Wall bam2readdepth.cpp wiggle.o sam/libbam.a -lz -o $@ + +wiggle.o: sam/bam.h sam/sam.h wiggle.cpp wiggle.h + $(CC) $(COFLAGS) wiggle.cpp rsem-simulate-reads : simulation.o $(CC) -o rsem-simulate-reads simulation.o @@ -95,7 +115,7 @@ rsem-run-gibbs : Gibbs.o $(CC) -o rsem-run-gibbs Gibbs.o -lpthread #some header files are omitted -Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h boost/random.hpp Gibbs.cpp +Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h sampling.h boost/random.hpp Gibbs.cpp $(CC) $(COFLAGS) Gibbs.cpp rsem-calculate-credibility-intervals : calcCI.o 
@@ -105,6 +125,9 @@ rsem-calculate-credibility-intervals : calcCI.o calcCI.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h calcCI.cpp boost/random.hpp $(CC) $(COFLAGS) calcCI.cpp +rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a + $(CC) -O3 -Wall getUnique.cpp sam/libbam.a -lz -o $@ + clean: rm -f *.o *~ $(PROGRAMS) cd sam ; ${MAKE} clean diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 2724314..bc976c3 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -45,7 +45,9 @@ my $estRSPD = 0; my $B = 20; my $nThreads = 1; -my $genBamF = 0; +my $genBamF = 1; # default is generating transcript bam file +my $genGenomeBamF = 0; +my $sampling = 0; my $calcCI = 0; my $quiet = 0; my $help = 0; @@ -84,7 +86,8 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files, "estimate-rspd" => \$estRSPD, "num-rspd-bins=i" => \$B, "p|num-threads=i" => \$nThreads, - "out-bam" => \$genBamF, + "output-genome-bam" => \$genGenomeBamF, + "sampling-for-bam" => \$sampling, "calc-ci" => \$calcCI, "ci-memory=i" => \$NMB, "time" => \$mTime, @@ -112,7 +115,10 @@ pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -v pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL); pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1); pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1); -pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25); +pod2usage(-msg => "Seed length should be at least 5!\n", -exitval => 2, -verbose => 2) if ($L < 5); +pod2usage(-msg => "--sampling-for-bam cannot be specified if --out-bam is not specified!\n", 
-exitval => 2, -verbose => 2) if ($sampling && !$genBamF); + +if ($L < 25) { print "Warning: the seed length set is less than 25! This is only allowed if the references are not added poly(A) tails.\n"; } if ($strand_specific) { $probF = 1.0; } @@ -145,6 +151,14 @@ else { $sampleName = $ARGV[3]; } +if ($genGenomeBamF) { + open(INPUT, "$refName.ti"); + my $line = <INPUT>; chomp($line); + close(INPUT); + my ($M, $type) = split(/ /, $line); + pod2usage(-msg => "No genome information provided, so genome bam file cannot be generated!\n", -exitval => 2, -verbose => 2) if ($type != 0); +} + my $pos = rindex($sampleName, '/'); if ($pos < 0) { $sampleToken = $sampleName; } else { $sampleToken = substr($sampleName, $pos + 1); } @@ -197,20 +211,13 @@ if (!$is_sam && !$is_bam) { } $command .= " | gzip > $imdName.sam.gz"; - print "$command\n"; if ($mTime) { $time_start = time(); } - $status = system($command); + &runCommand($command); if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; } - if ($status != 0) { - print "bowtie failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); - } - print "\n"; - $inpF = "$imdName.sam.gz"; $is_sam = 1; # output of bowtie is a sam file } @@ -228,13 +235,7 @@ if ($fn_list ne "") { $command .= " -l $fn_list"; } if ($tagName ne "") { $command .= " -tag $tagName"; } if ($quiet) { $command .= " -q"; } -print "$command\n"; -$status = system($command); -if ($status != 0) { - print "rsem-parse-alignments failed! 
Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); -} -print "\n"; +&runCommand($command); $command = $dir."rsem-build-read-index $gap"; switch($read_type) { @@ -243,13 +244,7 @@ switch($read_type) { case 2 { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; } case 3 { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; } } -print "$command\n"; -$status = system($command); -if ($status != 0) { - print "rsem-build-read-index failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); -} -print "\n"; +&runCommand($command); my $doesOpen = open(OUTPUT, ">$imdName.mparams"); if ($doesOpen == 0) { print "Cannot generate $imdName.mparams!\n"; exit(-1); } @@ -267,35 +262,27 @@ if ($genBamF) { $command .= " -b $samInpType $inpF"; if ($fn_list ne "") { $command .= " 1 $fn_list"; } else { $command .= " 0"; } + if ($sampling) { $command .= " --sampling"; } } if ($calcCI) { $command .= " --gibbs-out"; } if ($quiet) { $command .= " -q"; } -print "$command\n"; -$status = system($command); -if ($status != 0) { - print "rsem-run-em failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); -} -print "\n"; +&runCommand($command); if ($genBamF) { - $command = $dir."sam/samtools sort $sampleName.bam $sampleName.sorted"; - print "$command\n"; - $status = system($command); - if ($status != 0) { - print "sam/samtools sort failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); - } - print "\n"; - $command = $dir."sam/samtools index $sampleName.sorted.bam"; - print "$command\n"; - $status = system($command); - if ($status != 0) { - print "sam/samtools index failed! 
Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); + $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted"; + &runCommand($command); + $command = $dir."sam/samtools index $sampleName.transcript.sorted.bam"; + &runCommand($command); + + if ($genGenomeBamF) { + $command = $dir."rsem-tbam2gbam $refName $sampleName.transcript.bam $sampleName.genome.bam"; + &runCommand($command); + $command = $dir."sam/samtools sort $sampleName.genome.bam $sampleName.genome.sorted"; + &runCommand($command); + $command = $dir."sam/samtools index $sampleName.genome.sorted.bam"; + &runCommand($command); } - print "\n"; } &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level @@ -309,13 +296,7 @@ if ($calcCI) { $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $CHAINLEN $SAMPLEGAP"; # $command .= " -p $nThreads"; if ($quiet) { $command .= " -q"; } - print "$command\n"; - $status = system($command); - if ($status != 0) { - print "rsem-run-gibbs failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); - } - print "\n"; + &runCommand($command); system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1"); system("mv $sampleName.genes.results $imdName.genes.results.bak1"); @@ -324,13 +305,7 @@ if ($calcCI) { $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NSPC $NMB"; if ($quiet) { $command .= " -q"; } - print "$command\n"; - $status = system($command); - if ($status != 0) { - print "rsem-calculate-credibility-intervals failed! 
Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); - } - print "\n"; + &runCommand($command); system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2"); system("mv $sampleName.genes.results $imdName.genes.results.bak2"); @@ -343,11 +318,7 @@ if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; } if ($mTime) { $time_start = time(); } if (!$keep_intermediate_files) { - $status = system("rm -rf $temp_dir"); - if ($status != 0) { - print "Fail to delete the temporary folder!\n"; - exit(-1); - } + &runCommand("rm -rf $temp_dir", "Fail to delete the temporary folder!"); } if ($mTime) { $time_end = time(); } @@ -362,11 +333,25 @@ if ($mTime) { close(OUTPUT); } +# command, {err_msg} +sub runCommand { + print $_[0]."\n"; + my $status = system($_[0]); + if ($status != 0) { + my $errmsg; + if (scalar(@_) > 1) { $errmsg = $_[1]; } + else { $errmsg = "\"$command\" failed! Please check if you provide correct parameters/options for the pipeline!"; } + print $errmsg."\n"; + exit(-1); + } + print "\n"; +} + # inpF, outF sub collectResults { my $local_status; my ($inpF, $outF); - my (@results, @comment) = (); + my (@results, @ids) = (); my $line; my $cnt; @@ -383,11 +368,11 @@ sub collectResults { ++$cnt; chomp($line); my @local_arr = split(/\t/, $line); - if ($cnt == 4) { @comment = @local_arr; } + if ($cnt == 4) { @ids = @local_arr; } else { push(@results, \@local_arr); } } - push(@results, \@comment); + push(@results, \@ids); close(INPUT); $local_status = open(OUTPUT, ">$outF"); @@ -479,17 +464,21 @@ RSEM reads header information from input by default. If this option is on, heade Number of threads to use. Both Bowtie and expression estimation will use this many threads. (Default: 1) -=item B<--out-bam> +=item B<--output-genome-bam> + +Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. 
In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off) -Generate a BAM file, 'sample_name.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' will be generated. (Default: off) +=item B<--sampling-for-bam> +When RSEM generates a BAM file, instead of outputting all alignments a read has with their posterior probabilities, one alignment is sampled and output according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is output. (Default: off) + =item B<--calc-ci> Calculate 95% credibility intervals and posterior mean estimates. (Default: off) =item B<--seed-length> -Seed length used by the read aligner. Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. Any read with its or at least one of its mates' (for paired-end reads) length less than 25 will be ignored. (Default: 25) +Seed length used by the read aligner. Providing the correct value is important for RSEM. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. Any read with its or at least one of its mates' (for paired-end reads) length less than this value will be ignored. If poly(A) tails are not added to the references, the minimum allowed value is 5; otherwise, the minimum allowed value is 25. Note that this script will only check if the value >= 5 and give a warning message if the value < 25 but >= 5. (Default: 25) =item B<--tag> @@ -520,11 +509,11 @@ The path to the bowtie executables. 
(Default: the path to the bowtie executables Input quality scores are encoded as Phred+33. (Default: on) =item B<--phred64-quals> - + Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off) =item B<--solexa-quals> - + Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off) =item B<--forward-prob> @@ -575,13 +564,15 @@ Show help information. =head1 DESCRIPTION -In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments. RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified. Users may use an alternative aligner by specifying one of the --sam and --bam options, and providing an alignment file in the specified format. However, users should make sure the alignment file satisfies the requirements mentioned in ARGUMENTS section. +In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments. RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified. Users may use an alternative aligner by specifying one of the --sam and --bam options, and providing an alignment file in the specified format. However, users should make sure that they align against the indices generated by 'rsem-prepare-reference' and the alignment file satisfies the requirements mentioned in ARGUMENTS section. + +One simple way to make the alignment file satisfying RSEM's requirements (assuming the aligner used put mates in a paired-end read adjacent) is to use 'convert-sam-for-rsem' script. This script only accept SAM format files as input. If a BAM format file is obtained, please use samtools to convert it to a SAM file first. 
For example, if '/ref/mouse_125' is the 'reference_name' and the SAM file is named 'input.sam', you can run the following command: -One simple way to make the alignment file (e.g. input.sam) satisfying RSEM's requirements (assuming the aligner used put mates in a paired-end read adjacent) is to use the following command: + convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam - sort -k 1,1 -s input.sam > input.sorted.sam +For details, please refer to 'convert-sam-for-rsem's documentation page. -The SAM/BAM format RSEM uses is v1.3. However, it is compatible with old SAM/BAM format. +The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old SAM/BAM format. However, RSEM cannot recognize 0x100 in the FLAG field. In addition, RSEM requires SEQ and QUAL are not '*'. The user must run 'rsem-prepare-reference' with the appropriate reference before using this program. @@ -591,7 +582,7 @@ Please note that some of the default values for the Bowtie parameters are not th The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified. -With the "--calc-ci" option, 95% credibility intervals and posterior mean estimates will be calculated in addition to maximum likelihood estimates. +With the '--calc-ci' option, 95% credibility intervals and posterior mean estimates will be calculated in addition to maximum likelihood estimates. =head1 OUTPUT @@ -610,26 +601,40 @@ estimation. pmc stands for posterior mean counts. ci_lower_bound(l) means the lower bound of the credibility intervals, ci_upper_bound(u) means the upper bound of the credibility intervals. So the credibility interval is [l, u]. 'transcript_id_list' is a space-separated list of -transcript_ids belonging to the gene. +transcript_ids belonging to the gene. If no gene information is +provided, this file has the same content as +'sample_name.isoforms.results'. =item B File containing isoform level expression values. 
The format of each line in this file is: -transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] other_attributes +transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id -Fields are separated by the tab character. 'other_attributes' are all -other attributes after attribute 'transcript_id' field in the GTF -file. If no other attributes are given or no GTF file is provided in -'rsem-prepare-reference', there will be no tab after the -tau_value field. +Fields are separated by the tab character. 'gene_id' is the gene_id of +the gene which this transcript belongs to. If no gene information is +provided, 'gene_id' and 'transcript_id' are the same. -=item B<sample_name.bam, sample_name.sorted.bam and sample_name.sorted.bam.bai> +=item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai> -Only generated when --out-bam is specified. +'sample_name.transcript.bam' is a BAM-formatted file of read +alignments in transcript coordinates. The MAPQ field of each alignment +is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the +posterior probability of that alignment being the true mapping of a +read. In addition, RSEM pads a new tag ZW:f:value, where value is a +single precision floating number representing the posterior +probability. -'sample_name.bam' is a BAM-formatted file of read alignments in +'sample_name.transcript.sorted.bam' and +'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and +indices generated by samtools (included in RSEM package). + +=item B<sample_name.genome.bam, sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai> + +Only generated when --output-genome-bam is specified. + +'sample_name.genome.bam' is a BAM-formatted file of read alignments in genomic coordinates. Alignments of reads that have identical genomic coordinates (i.e., alignments to different isoforms that share the same genomic region) are collapsed into one alignment. 
The MAPQ field @@ -637,9 +642,11 @@ of each alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the posterior probability of that alignment being the true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, where value is a single precision floating number -representing the posterior probability. +representing the posterior probability. If an alignment is spliced, a +XS:A:value tag is also added, where value is either '+' or '-' +indicating the strand of the transcript it aligns to. -'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' are the +'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and indices generated by samtools (included in RSEM package). =item B @@ -650,56 +657,55 @@ This is a folder instead of a file. All model related statistics are stored in t =head1 EXAMPLES -Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mm9'. +Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mouse_125'. -1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a BAM file: +1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a genome BAM file: rsem-calculate-expression --phred64-quals \ -p 8 \ - --out-bam \ + --output-genome-bam \ /data/mmliver.fq \ - /ref/mm9 \ + /ref/mouse_125 \ mmliver_single_quals -2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with quality scores. Quality scores are in SANGER format. We want to use 8 threads and do not generate a BAM file: +2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with quality scores. Quality scores are in SANGER format. 
We want to use 8 threads and do not generate a genome BAM file: rsem-calculate-expression -p 8 \ --paired-end \ /data/mmliver_1.fq \ /data/mmliver_2.fq \ - /ref/mm9 \ + /ref/mouse_125 \ mmliver_paired_end_quals -3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads and generate a BAM file: +3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads: rsem-calculate-expression -p 8 \ --no-qualities \ /data/mmliver.fa \ - /ref/mm9 \ + /ref/mouse_125 \ mmliver_single_without_quals -4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals. We allow RSEM to use 1GB of memory for CI calculation. +4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals. We allow RSEM to use 1GB of memory for CI calculation: rsem-calculate-expression --bowtie-path /sw/bowtie \ --phred64-quals \ --fragment-length-mean 150.0 \ --fragment-length-sd 35.0 \ -p 8 \ - --out-bam \ + --output-genome-bam \ --calc-ci \ --ci-memory 1024 \ /data/mmliver.fq \ - /ref/mm9 \ + /ref/mouse_125 \ mmliver_single_quals -5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores. We want to use 8 threads and do not generate a BAM file: +5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores. 
We want to use 8 threads: rsem-calculate-expression --paired-end \ --bam \ -p 8 \ /data/mmliver_paired_end_quals.bam \ - /ref/mm9 \ + /ref/mouse_125 \ mmliver_paired_end_quals =cut - diff --git a/rsem-gen-transcript-plots b/rsem-gen-transcript-plots new file mode 100755 index 0000000..9a806d6 --- /dev/null +++ b/rsem-gen-transcript-plots @@ -0,0 +1,129 @@ +#!/usr/bin/env Rscript + +nrow_per_page <- 3 # if input_list is composed of transcript ids +ncol_per_page <- 2 # if input_list is composed of transcript ids +num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript ids + + +exit_with_error <- function(errmsg) { + cat(errmsg, "\n", sep = "", file = stderr()) + quit(save = "no", status = 1) +} + + +args <- commandArgs(TRUE) +if (length(args) != 5) + exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_gene show_uniq output_plot_file") + +sample_name <- args[1] +input_list <- args[2] +is_gene <- as.numeric(args[3]) +show_uniq <- as.numeric(args[4]) +output_plot_file <- args[5] + + + +load_readdepth_file <- function(filename) { + data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE) + nrow <- dim(data)[1] + readdepth <- list() + for (i in 1:nrow) { + readdepth[[data[i, 1]]] <- data[i, c(2, 3)] + } + readdepth +} + +build_t2gmap <- function(filename) { + data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE) + t2gmap <- list() + + nrow <- dim(data)[1] + ncol <- dim(data)[2] + + gene_id <- "" + tids <- c() + for (i in 1:nrow) { + if (gene_id != data[i, ncol]) { + if (gene_id != "") { + t2gmap[[gene_id]] <- tids + } + gene_id <- data[i, ncol] + tids <- c() + } + tids <- c(tids, data[i, 1]) + } + if (gene_id != "") t2gmap[[gene_id]] <- tids + + t2gmap +} + +generate_a_page <- function(tids, gene_id = NULL) { + n <- length(tids) + ncol <- ifelse(is_gene, floor(sqrt(n)), ncol_per_page) + nrow <- ifelse(is_gene, ceiling(n / ncol), nrow_per_page) + + par(mfrow = c(nrow, ncol), 
mar = c(2, 2, 2, 2)) + if (is_gene) par(oma = c(0, 0, 3, 0)) + + for (i in 1:n) { + vec <- readdepth[[tids[i]]] + if (is.null(vec)) exit_with_error(paste("Cannot find transcript ", tids[i], sep = "")) + if (is.na(vec[[2]])) wiggle <- rep(0, vec[[1]]) else wiggle <- as.numeric(unlist(strsplit(vec[[2]], split = " "))) + len <- length(wiggle) + if (!show_uniq) { + plot(wiggle, type = "h") + } else { + vec <- readdepth_uniq[[tids[i]]] + stopifnot(!is.null(vec)) + if (is.na(vec[[2]])) wiggle_uniq <- rep(0, vec[[1]]) else wiggle_uniq <- as.numeric(unlist(strsplit(vec[[2]], split = " "))) + stopifnot(len == length(wiggle_uniq), len == sum(wiggle >= wiggle_uniq)) + heights <- rbind(wiggle_uniq, wiggle - wiggle_uniq) + barplot(heights, space = 0, border = NA, names.arg = 1:len, col = c("black", "red")) + } + title(main = tids[i]) #, xlab = "Position in transcript", ylab = "Read depth") + } + + if (is_gene) mtext(gene_id, outer = TRUE, line = 1) +} + +readdepth <- load_readdepth_file(paste(sample_name, ".transcript.readdepth", sep = "")) + +if (show_uniq) { + readdepth_uniq <- load_readdepth_file(paste(sample_name, ".uniq.transcript.readdepth", sep = "")) +} + +ids <- scan(file = input_list, what = "", sep = "\n") + +cat("Loading files is done!\n") + +if (is_gene) { + t2gmap <- build_t2gmap(paste(sample_name, ".isoforms.results", sep = "")) + cat("Building transcript to gene map is done!\n") +} + +pdf(output_plot_file) + +if (!is_gene) { + n <- length(ids) + ub <- (n - 1) %/% num_plots_per_page + 1 + for (i in 1:ub) { + fr <- (i - 1) * num_plots_per_page + 1 + to <- min(i * num_plots_per_page, n) + generate_a_page(ids[fr:to]) + } +} else { + for (gene_id in ids) { + if (is.null(t2gmap[[gene_id]])) exit_with_error(paste("Cannot find gene ", gene_id, sep = "")) + generate_a_page(t2gmap[[gene_id]], gene_id) + } +} + +cat("Plots are generated!\n") + +dev.off.output <- dev.off() + + + + + + diff --git a/rsem-plot-transcript-wiggles b/rsem-plot-transcript-wiggles new file mode 100755 
index 0000000..5054f5b --- /dev/null +++ b/rsem-plot-transcript-wiggles @@ -0,0 +1,143 @@ +#!/usr/bin/perl + +use Getopt::Long; +use Pod::Usage; +use File::Basename; +use strict; + +my $gene_list = 0; # default is 0, means input is a transcript list; 1 means input is a gene list +my $show_unique = 0; # 0, default value, means do not show unique transcript wiggles; 1 means show unique transcript wiggles +my $help = 0; + +GetOptions("gene-list" => \$gene_list, + "show-unique" => \$show_unique, + "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2); + +pod2usage(-verbose => 2) if ($help == 1); +pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3); + +my ($fn, $dir, $suf) = fileparse($0); +my $command = ""; + +unless (-e "$ARGV[0].transcript.readdepth") { + $command = $dir."rsem-bam2readdepth $ARGV[0].transcript.sorted.bam > $ARGV[0].transcript.readdepth"; + &runCommand($command); +} + +if ($show_unique) { + unless (-e "$ARGV[0].uniq.transcript.bam") { + $command = $dir."rsem-get-unique $ARGV[0].transcript.bam $ARGV[0].uniq.transcript.bam"; + &runCommand($command); + } + unless (-e "$ARGV[0].uniq.transcript.sorted.bam") { + $command = $dir."sam/samtools sort $ARGV[0].uniq.transcript.bam $ARGV[0].uniq.transcript.sorted"; + &runCommand($command); + } + unless (-e "$ARGV[0].uniq.transcript.readdepth") { + $command = $dir."rsem-bam2readdepth $ARGV[0].uniq.transcript.sorted.bam > $ARGV[0].uniq.transcript.readdepth"; + &runCommand($command); + } +} + +$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $gene_list $show_unique $ARGV[2]"; +&runCommand($command); + +# command, {err_msg} +sub runCommand { + print $_[0]."\n"; + my $status = system($_[0]); + if ($status != 0) { + my $errmsg; + if (scalar(@_) > 1) { $errmsg = $_[1]; } + else { $errmsg = "\"$command\" failed! 
Please check if you provide correct parameters/options for the pipeline!"; } + print $errmsg."\n"; + exit(-1); + } + print "\n"; +} + +__END__ + +=head1 NAME + +rsem-plot-transcript-wiggles + +=head1 SYNOPSIS + +=over + + rsem-plot-transcript-wiggles [options] sample_name input_list output_plot_file + +=back + +=head1 ARGUMENTS + +=over + +=item B<sample_name> + +The name of the sample analyzed. + +=item B<input_list> + +A list of transcript ids or gene ids. But it cannot be a mixture of transcript & gene ids. Each id occupies one line without extra spaces. + +=item B<output_plot_file> + +The file name of the pdf file which contains all plots. + +=back + +=head1 OPTIONS + +=over + +=item B<--gene-list> + +The input-list is a list of gene ids. (Default: off) + +=item B<--show-unique> + +Show the wiggle plots as stacked bar plots. See description section for details. (Default: off) + +=item B<-h/--help> + +Show help information. + +=back + +=head1 DESCRIPTION + +This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is shown in black. The read depth of multi-reads, which align to more than one place, is shown in red on top of the read depth of unique reads. This program will use some files RSEM generated previously. So please do not delete/move any file 'rsem-calculate-expression' generated. + +=head1 OUTPUT + +=over + +=item B<output_plot_file> + +This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page displays at most 6 plots in 3 rows and 2 columns. 
If gene ids are provided, each page displays a gene. The gene's id is shown at the top and all its transcripts' wiggle plots are shown on this page. The arrangement of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. + +=item B<sample_name.transcript.sorted.bam and sample_name.transcript.readdepth> + +If these files do not exist, 'rsem-plot-transcript-wiggles' will automatically generate them. + +=item B<sample_name.uniq.transcript.bam, sample_name.uniq.transcript.sorted.bam and sample_name.uniq.transcript.readdepth> + +If '--show-unique' option is specified and these files do not exist, 'rsem-plot-transcript-wiggles' will automatically generate them. + +=back + +=head1 EXAMPLES + +Suppose sample_name and output_plot_file are set to 'mmliver_single_quals' and 'output.pdf' respectively. input_list is set to 'transcript_ids.txt' if transcript ids are provided, and is set to 'gene_ids.txt' if gene ids are provided. + +1) Transcript ids are provided and we just want normal wiggle plots: + + rsem-plot-transcript-wiggles mmliver_single_quals transcript_ids.txt output.pdf + +2) Gene ids are provided and we want to show stacked bar plots: + + rsem-plot-transcript-wiggles --gene-list --show-unique mmliver_single_quals gene_ids.txt output.pdf + +=cut diff --git a/rsem-prepare-reference b/rsem-prepare-reference index 47d7d9c..78743e9 100755 --- a/rsem-prepare-reference +++ b/rsem-prepare-reference @@ -51,6 +51,8 @@ if ($size == 1 && (-d $list[0])) { $size = scalar(@list); } +pod2usage(-msg => "reference_fasta_file(s) is empty! Please check if you provide the correct folder name or file suffixes!", -exitval => 2, -verbose => 2) if ($size <= 0); + if ($no_polyA) { $polyAChoice = 1 } elsif ($subsetFile ne "") { $polyAChoice = 2; } @@ -65,13 +67,7 @@ if ($type == 0) { if ($mappingF ne "") { $command .= " 1 $mappingF"; } else { $command .= " 0"; } $command .= " @list"; - print "$command\n"; - $status = system($command); - if ($status != 0) { - print "rsem-extract-reference-transcripts failed! 
Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); - } - print "\n"; + &runCommand($command); } else { $"=" "; @@ -79,37 +75,33 @@ else { if ($mappingF ne "") { $command .= " 1 $mappingF"; } else { $command .= " 0"; } $command .= " @list"; - print "$command\n"; - $status = system($command); - if ($status != 0) { - print "rsem-synthesis-reference-transcripts failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); - } - print "\n"; + &runCommand($command); } $command = $dir."rsem-preref $ARGV[1].transcripts.fa $polyAChoice $ARGV[1] -l $polyALen"; if ($polyAChoice == 2) { $command .= " -f $subsetFile"; } if ($no_ntog) { $command .= " --no-ntog"; } if ($quiet) { $command .= " -q"; } - -print "$command\n"; -$status = system($command); -if ($status != 0) { - print "rsem-preref failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); -} -print "\n"; + +&runCommand($command); if (!$no_bowtie) { $command = $bowtie_path."bowtie-build -f"; if ($quiet) { $command .= " -q"; } $command .= " $ARGV[1].idx.fa $ARGV[1]"; - - print "$command\n"; - $status = system($command); - if ($status != 0) { - print "bowtie-build failed! Please check if you have a copy of bowtie-build in the path you specified!\n"; + + &runCommand($command); +} + +# command, {err_msg} +sub runCommand { + print $_[0]."\n"; + my $status = system($_[0]); + if ($status != 0) { + my $errmsg; + if (scalar(@_) > 1) { $errmsg = $_[1]; } + else { $errmsg = "\"$command\" failed! Please check if you provide correct parameters/options for the pipeline!"; } + print $errmsg."\n"; exit(-1); } print "\n"; @@ -184,7 +176,7 @@ The length of the poly(A) tails to be added. (Default: 125) =item B<--bowtie-path> -The path to the bowtie executables. (Default: the path to bowtie executables is assumed to be in the user's PATH environment variable) +The path to the Bowtie executables. 
(Default: the path to Bowtie executables is assumed to be in the user's PATH environment variable) =item B<--no-bowtie> @@ -219,7 +211,7 @@ B<'reference_name.transcripts.fa'> contains the extracted reference transcripts =head1 EXAMPLES -1) Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the mouse genome. We have downloaded the UCSC Genes transcript annotations in GTF format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the directory '/data/mm9'. We want to put the generated reference files under '/ref' with name 'mm9'. We'll add poly(A) tails with length 125. Please note that GTF files generated from UCSC's Table Browser do not contain isoform-gene relationship information. For the UCSC Genes annotation, this information can be obtained from the knownIsoforms.txt file. Suppose we want to build Bowtie indices and Bowtie executables are found in '/sw/bowtie'. +1) Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the mouse genome. We have downloaded the UCSC Genes transcript annotations in GTF format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the directory '/data/mm9'. We want to put the generated reference files under '/ref' with name 'mouse_125'. We'll add poly(A) tails with length 125. Please note that GTF files generated from UCSC's Table Browser do not contain isoform-gene relationship information. For the UCSC Genes annotation, this information can be obtained from the knownIsoforms.txt file. Suppose we want to build Bowtie indices and Bowtie executables are found in '/sw/bowtie'. 
There are two ways to write the command: @@ -227,7 +219,7 @@ There are two ways to write the command: --transcript-to-gene-map knownIsoforms.txt \ --bowtie-path /sw/bowtie \ /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \ - /ref/mm9 + /ref/mouse_125 OR @@ -235,14 +227,14 @@ OR --transcript-to-gene-map knownIsoforms.txt \ --bowtie-path /sw/bowtie \ /data/mm9 \ - /ref/mm9 + /ref/mouse_125 -2) Suppose we only have transcripts from EST tags in 'mm9.fasta'. In addition, we also have isoform-gene information in 'mapping.txt'. We do not want to add any poly(A) tails. The reference_name will be set to 'mm9'. In addition, we do not want to build Bowtie indices, and will use an alternative aligner to align reads against the 'mm9.idx.fa' output file: +2) Suppose we only have transcripts from EST tags in 'mm9.fasta'. In addition, we also have isoform-gene information in 'mapping.txt'. We do not want to add any poly(A) tails. The reference_name will be set to 'mouse_0'. In addition, we do not want to build Bowtie indices, and will use an alternative aligner to align reads against the 'mouse_0.idx.fa' output file: rsem-prepare-reference --transcript-to-gene-map mapping.txt \ --no-polyA \ --no-bowtie \ mm9.fasta \ - mm9 + mouse_0 =cut diff --git a/sam/ChangeLog b/sam/ChangeLog index dd62b49..a471838 100644 --- a/sam/ChangeLog +++ b/sam/ChangeLog @@ -1,3 +1,490 @@ +------------------------------------------------------------------------ +r925 | lh3lh3 | 2011-02-28 15:45:17 -0500 (Mon, 28 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/phase.c + +minor changes to a heuristic rule + +------------------------------------------------------------------------ +r924 | lh3lh3 | 2011-02-28 15:24:04 -0500 (Mon, 28 Feb 2011) | 4 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bcftools/vcfutils.pl + M /trunk/samtools/phase.c + + * 0.1.12-r924:126 + * fixed a bug in phase (due to recent changes) + * fixed a bug in vcf2fq + 
+------------------------------------------------------------------------ +r923 | lh3lh3 | 2011-02-28 12:57:39 -0500 (Mon, 28 Feb 2011) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/phase.c + + * put version number in bam.h + * write version to BCF + * in phase, change the default -q to 37 + * output a little more information during phasing + +------------------------------------------------------------------------ +r922 | lh3lh3 | 2011-02-25 16:40:09 -0500 (Fri, 25 Feb 2011) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.tex + M /trunk/samtools/bcftools/bcf2qcall.c + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/ld.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/vcf.c + M /trunk/samtools/cut_target.c + + * change the order of PL/GL according to the latest VCF spec + * change the type of SP to int32_t + +------------------------------------------------------------------------ +r921 | lh3lh3 | 2011-02-25 14:40:56 -0500 (Fri, 25 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.tex + +update the BCF spec + +------------------------------------------------------------------------ +r920 | lh3lh3 | 2011-02-25 00:59:27 -0500 (Fri, 25 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + A /trunk/samtools/cut_target.c + M /trunk/samtools/errmod.h + M /trunk/samtools/faidx.c + M /trunk/samtools/khash.h + M /trunk/samtools/kstring.c + M /trunk/samtools/kstring.h + A /trunk/samtools/phase.c + M /trunk/samtools/samtools.1 + +added the phase command + +------------------------------------------------------------------------ +r918 | lh3lh3 | 2011-02-24 10:05:54 -0500 (Thu, 
24 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + +added "const" to bcf_p1_cal() + +------------------------------------------------------------------------ +r917 | lh3lh3 | 2011-02-24 09:36:30 -0500 (Thu, 24 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bam.c + +more meaningful BAM truncation message + +------------------------------------------------------------------------ +r916 | lh3lh3 | 2011-02-24 09:35:06 -0500 (Thu, 24 Feb 2011) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/vcf.c + + * automatically fix errors in GL + * output unrecognized FORMAT as "." + +------------------------------------------------------------------------ +r913 | lh3lh3 | 2011-02-10 22:59:47 -0500 (Thu, 10 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/vcf.c + +finished VCF->BCF conversion + +------------------------------------------------------------------------ +r910 | petulda | 2011-02-03 03:13:48 -0500 (Thu, 03 Feb 2011) | 1 line +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +Prevent division by zero +------------------------------------------------------------------------ +r909 | lh3lh3 | 2011-02-02 11:29:20 -0500 (Wed, 02 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + +fixed a typo in the VCF header + +------------------------------------------------------------------------ +r908 | lh3lh3 | 2011-02-02 11:28:24 -0500 (Wed, 02 Feb 2011) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam_index.c + + * fixed an out-of-boundary bug + * improved sorting order checking in index + +------------------------------------------------------------------------ +r907 | lh3lh3 | 2011-01-29 22:59:20 -0500 (Sat, 29 Jan 2011) | 4 lines +Changed paths: + M /trunk/samtools/INSTALL + M /trunk/samtools/bam_tview.c + M 
/trunk/samtools/knetfile.c + + * avoid a segfault when network connect fails + * update INSTALL + * fixed a bug in tview on big-endian by Nathan Weeks + +------------------------------------------------------------------------ +r903 | lh3lh3 | 2011-01-27 14:50:02 -0500 (Thu, 27 Jan 2011) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_md.c + + * fixed a rare memory issue in bam_md.c + * fixed a bug in indel calling related to unmapped and refskip reads + +------------------------------------------------------------------------ +r902 | lh3lh3 | 2011-01-23 21:46:18 -0500 (Sun, 23 Jan 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/fet.c + +fixed two minor bugs in Fisher's exact test + +------------------------------------------------------------------------ +r899 | petulda | 2011-01-19 09:28:02 -0500 (Wed, 19 Jan 2011) | 1 line +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +Skip sites with unknown ref +------------------------------------------------------------------------ +r898 | lh3lh3 | 2011-01-15 12:56:05 -0500 (Sat, 15 Jan 2011) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_md.c + +move bam_nt16_nt4_table[] from bam_maqcns.c to bam_md.c + +------------------------------------------------------------------------ +r896 | lh3lh3 | 2011-01-06 10:52:15 -0500 (Thu, 06 Jan 2011) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/call1.c + + * samtools-0.1.12-10 (r896) + * allow to exclude read groups in mpileup + +------------------------------------------------------------------------ +r895 | lh3lh3 | 2011-01-04 11:31:29 -0500 (Tue, 04 Jan 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.tex + +sorry. 
It is SP not ST + +------------------------------------------------------------------------ +r894 | lh3lh3 | 2011-01-04 11:29:06 -0500 (Tue, 04 Jan 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.tex + +added ST + +------------------------------------------------------------------------ +r893 | petulda | 2011-01-04 06:55:56 -0500 (Tue, 04 Jan 2011) | 1 line +Changed paths: + M /trunk/samtools/bcftools/call1.c + +Fixed a typo in read_samples +------------------------------------------------------------------------ +r892 | jmarshall | 2010-12-28 08:06:49 -0500 (Tue, 28 Dec 2010) | 9 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/examples/Makefile + +System libraries go *after* user libraries in link commands, because +the user libraries may themselves have dependencies that are satisfied +by the system libraries. It's not rocket science! + +This makes a difference with some linkers; or with -static or --as-needed. + +The examples/Makefile fix is from Charles Plessy. 
+See also http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=606004 + +------------------------------------------------------------------------ +r891 | lh3lh3 | 2010-12-21 12:16:33 -0500 (Tue, 21 Dec 2010) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/call1.c + + * samtools-0.1.12-9 (r891) + * allow to call SNPs from a subset of samples + +------------------------------------------------------------------------ +r889 | lh3lh3 | 2010-12-15 11:28:16 -0500 (Wed, 15 Dec 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.12-12 (r889) + * set mapQ as 20 if it equals 255 + +------------------------------------------------------------------------ +r888 | lh3lh3 | 2010-12-14 22:41:09 -0500 (Tue, 14 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + +When -B is applied to mpileup, still use paired reads only unless -A is flagged. + +------------------------------------------------------------------------ +r887 | lh3lh3 | 2010-12-14 22:37:05 -0500 (Tue, 14 Dec 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.12-6 (r887) + * added a hidden option -E to mpileup/calmd. -E triggers an alternative way to apply BAQ. + +------------------------------------------------------------------------ +r886 | lh3lh3 | 2010-12-14 12:51:03 -0500 (Tue, 14 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + +(Arguably) improved the indel caller a tiny bit for lowCov data. 
+ +------------------------------------------------------------------------ +r885 | petulda | 2010-12-14 04:55:46 -0500 (Tue, 14 Dec 2010) | 1 line +Changed paths: + M /trunk/samtools/bcftools/call1.c + +Fixed the VCF header to pass validation +------------------------------------------------------------------------ +r884 | lh3lh3 | 2010-12-12 23:02:19 -0500 (Sun, 12 Dec 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/vcfutils.pl + + * samtools-0.1.12-4 (r884) + * fixed a long-existing flaw in the INDEL calling model + +------------------------------------------------------------------------ +r883 | lh3lh3 | 2010-12-11 20:05:42 -0500 (Sat, 11 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/vcfutils.pl + +compute max SP and max GQ from sample genotypes + +------------------------------------------------------------------------ +r880 | lh3lh3 | 2010-12-10 10:50:54 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + D /trunk/samtools/bcftools/bcf-fix.pl + +drop bcf-fix.pl as it is redundant by the latest changes + +------------------------------------------------------------------------ +r879 | lh3lh3 | 2010-12-10 10:50:29 -0500 (Fri, 10 Dec 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/vcf.c + + * fixed a minor issue in printing VCFs + * write bcftools specific INFO and FORMAT in the header + +------------------------------------------------------------------------ +r878 | lh3lh3 | 2010-12-10 10:09:14 -0500 (Fri, 10 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/call1.c + +Make sure that the GT genotype field is the first + +------------------------------------------------------------------------ +r877 | lh3lh3 | 2010-12-08 17:27:05 -0500 (Wed, 08 Dec 2010) | 7 
lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.12-2 (r877) + + * allow to fine control the selection of indel candidates. The current + setting is okay for lowCov and highCov with ~100 samples, but it + skips too many indels for highCov with >250 samples. + + +------------------------------------------------------------------------ +r874 | lh3lh3 | 2010-12-07 22:40:35 -0500 (Tue, 07 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + +a spelling error.. + +------------------------------------------------------------------------ +r873 | lh3lh3 | 2010-12-07 22:39:57 -0500 (Tue, 07 Dec 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.12-1 (r873) + * added a switch to allow anomalous read pairs in calling + +------------------------------------------------------------------------ +r872 | lh3lh3 | 2010-12-07 14:43:54 -0500 (Tue, 07 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +fixed a bug in vcf2fq + +------------------------------------------------------------------------ +r869 | lh3lh3 | 2010-12-05 01:18:06 -0500 (Sun, 05 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bamtk.c + +added a warning for the Windows version + +------------------------------------------------------------------------ +r868 | lh3lh3 | 2010-12-05 01:05:51 -0500 (Sun, 05 Dec 2010) | 4 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + +In ksprintf(), change "%lf" and "%lg" to "%f" and "%g", respectively. +According to the manual page, this change is valid. However, MinGW seems +to interpret "%lf" as "%Lf". 
+ +------------------------------------------------------------------------ +r867 | lh3lh3 | 2010-12-05 00:35:43 -0500 (Sun, 05 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile.mingw + M /trunk/samtools/bam_aux.c + +bring back the windows support + +------------------------------------------------------------------------ +r866 | lh3lh3 | 2010-12-04 23:33:51 -0500 (Sat, 04 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_reheader.c + M /trunk/samtools/bcftools/vcfutils.pl + +Fixed a compiling error when knetfile is not used. + +------------------------------------------------------------------------ +r865 | lh3lh3 | 2010-12-04 00:13:22 -0500 (Sat, 04 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +vcf->fastq + +------------------------------------------------------------------------ +r864 | lh3lh3 | 2010-12-03 17:12:30 -0500 (Fri, 03 Dec 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + + * remove "-f". 
Instead always compute consensus quality + * increase the upper limit of quality + +------------------------------------------------------------------------ +r863 | lh3lh3 | 2010-12-03 15:28:15 -0500 (Fri, 03 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + +more informative error message + +------------------------------------------------------------------------ +r862 | lh3lh3 | 2010-12-02 16:16:08 -0500 (Thu, 02 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + +Release samtools-0.1.12a + +------------------------------------------------------------------------ +r861 | lh3lh3 | 2010-12-02 15:55:06 -0500 (Thu, 02 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + +a possible fix to DP4=0,0,0,0; have not tested, but should have no side-effect + +------------------------------------------------------------------------ +r859 | lh3lh3 | 2010-12-02 11:39:57 -0500 (Thu, 02 Dec 2010) | 2 lines +Changed paths: + M /trunk/samtools/NEWS + M /trunk/samtools/bam_index.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + +Release samtools-0.1.12 + +------------------------------------------------------------------------ +r858 | lh3lh3 | 2010-12-02 11:24:41 -0500 (Thu, 02 Dec 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcf.c + + * samtools-0.1.11-1 (r858) + * fixed a bug in mpileup which causes segfaults + * bcftools: do not segfault when BCF contains errors + +------------------------------------------------------------------------ +r857 | lh3lh3 | 2010-11-30 23:52:50 -0500 (Tue, 30 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_index.c + +fixed a memory leak in bam_fetch() + +------------------------------------------------------------------------ +r856 | lh3lh3 | 2010-11-26 00:07:31 -0500 (Fri, 26 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M 
/trunk/samtools/bcftools/vcfutils.pl + + * fixed a memory violation + * added splitchr to vcfutils.pl + +------------------------------------------------------------------------ +r854 | lh3lh3 | 2010-11-23 09:05:08 -0500 (Tue, 23 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/ld.c + +fixed a typo/bug in r^2 computation + +------------------------------------------------------------------------ +r852 | lh3lh3 | 2010-11-21 22:20:20 -0500 (Sun, 21 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bamtk.c + +forget to change the version information + +------------------------------------------------------------------------ +r851 | lh3lh3 | 2010-11-21 22:16:52 -0500 (Sun, 21 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bcftools/bcftools.1 + M /trunk/samtools/samtools.1 + +Release samtools-0.1.11 + ------------------------------------------------------------------------ r844 | lh3lh3 | 2010-11-19 23:16:08 -0500 (Fri, 19 Nov 2010) | 3 lines Changed paths: diff --git a/sam/INSTALL b/sam/INSTALL index f1cf7aa..37d84a9 100644 --- a/sam/INSTALL +++ b/sam/INSTALL @@ -1,29 +1,30 @@ System Requirements =================== -SAMtools depends on the zlib library . The latest -version 1.2.3 is preferred and with the latest version you can compile -razip and use it to compress a FASTA file. SAMtools' faidx is able to -index a razip-compressed FASTA file to save diskspace. Older zlib also -works with SAMtools, but razip cannot be compiled. +SAMtools depends on the zlib library . Version 1.2.3+ is +preferred and with 1.2.3+ you can compile razip and use it to compress a FASTA +file. SAMtools' faidx is able to index a razip-compressed FASTA file to save +diskspace. Older zlib also works with SAMtools, but razip cannot be compiled. The text-based viewer (tview) requires the GNU ncurses library -, which comes with Mac OS X and -most of the modern Linux/Unix distributions. 
If you do not have this -library installed, you can still compile the rest of SAMtools by -manually modifying one line in Makefile. +, which comes with Mac OS X and most of +the modern Linux/Unix distributions. If you do not have this library installed, +you can still compile the rest of SAMtools by manually changing: +`-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and +comment out the line starting with `LIBCURSES='. Compilation =========== -Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can -compile razip with `make razip'. +Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can compile +razip with `make razip'. Installation ============ -Simply copy `samtools' and other executables/scripts in `misc' to a -location you want (e.g. a directory in your $PATH). No further -configurations are required. +Copy `samtools', `bcftools/bcftools' and other executables/scripts in `misc' to +a location you want (e.g. a directory in your $PATH). You may also copy +`samtools.1' and `bcftools/bcftools.1' to a directory in your $MANPATH such +that the `man' command may find the manual. 
diff --git a/sam/Makefile b/sam/Makefile index 13d4a76..db18333 100644 --- a/sam/Makefile +++ b/sam/Makefile @@ -1,13 +1,14 @@ CC= gcc CFLAGS= -g -Wall -O2 #-m64 #-arch ppc -DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1 +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ - bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ - $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o -AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ + bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \ + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o +AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o + bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ + cut_target.o phase.o bam2depth.o PROG= samtools INCLUDES= -I. SUBDIRS= . 
bcftools misc @@ -37,10 +38,10 @@ all:$(PROG) lib:libbam.a libbam.a:$(LOBJS) - $(AR) -cru $@ $(LOBJS) + $(AR) -csru $@ $(LOBJS) samtools:lib-recur $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz -Lbcftools -lbcf + $(CC) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf $(LIBCURSES) -lm -lz razip:razip.o razf.o $(KNETFILE_O) $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz @@ -53,19 +54,19 @@ bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h -bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bcftools/bcf.h bam2bcf.h +bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h -bam_tview.o:bam.h faidx.h bam_maqcns.h -bam_maqcns.o:bam.h ksort.h bam_maqcns.h kaln.h +bam_tview.o:bam.h faidx.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h -glf.o:glf.h sam_header.o:sam_header.h khash.h bcf.o:bcftools/bcf.h bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h bam2bcf_indel.o:bam2bcf.h errmod.o:errmod.h +phase.o:bam.h khash.h ksort.h +bamtk.o:bam.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h diff --git a/sam/Makefile.mingw b/sam/Makefile.mingw index 9df4b9a..7a57ffc 100644 --- a/sam/Makefile.mingw +++ b/sam/Makefile.mingw @@ -1,18 +1,22 @@ CC= gcc.exe AR= ar.exe CFLAGS= -g -Wall -O2 -DFLAGS= -D_CURSES_LIB=2 -D_USE_KNETFILE +DFLAGS= -D_USE_KNETFILE -D_CURSES_LIB=2 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ - bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o bam_sort.o \ - $(KNETFILE_O) -AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ + bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \ + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o +AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o 
kaln.o sam_header.o -PROG= samtools -INCLUDES= -Iwin32 + bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ + cut_target.o phase.o bam_cat.o bam2depth.o +BCFOBJS= bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \ + bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \ + bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o +PROG= samtools.exe bcftools.exe +INCLUDES= -I. -Iwin32 SUBDIRS= . -LIBPATH= +LIBPATH= .SUFFIXES:.c .o @@ -29,31 +33,31 @@ lib:libbam.a libbam.a:$(LOBJS) $(AR) -cru $@ $(LOBJS) -samtools:$(AOBJS) libbam.a - $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32 +samtools.exe:$(AOBJS) libbam.a $(BCFOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32 -razip:razip.o razf.o $(KNETFILE_O) - $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz - -bgzip:bgzip.o bgzf.o $(KNETFILE_O) - $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz +bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o + $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32 razip.o:razf.h -bam.o:bam.h razf.h bam_endian.h kstring.h +bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h -bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h +bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h -bam_tview.o:bam.h faidx.h bam_maqcns.h -bam_maqcns.o:bam.h ksort.h bam_maqcns.h +bam_tview.o:bam.h faidx.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h -glf.o:glf.h +sam_header.o:sam_header.h khash.h +bcf.o:bcftools/bcf.h +bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h +bam2bcf_indel.o:bam2bcf.h +errmod.o:errmod.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h clean: - rm -fr gmon.out *.o *.exe 
*.dSYM razip bgzip $(PROG) *~ *.a + rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib diff --git a/sam/NEWS b/sam/NEWS index 6b4d8aa..41a6cc8 100644 --- a/sam/NEWS +++ b/sam/NEWS @@ -1,3 +1,274 @@ +Beta Release 0.1.18 (2 September, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools: + + * Support the new =/X CIGAR operators (by Peter Cock). + + * Allow to subsample BAM while keeping the pairing intact (view -s). + + * Implemented variant distance bias as a new filter (by Petr Danecek). + + * Bugfix: huge memory usage during indexing + + * Bugfix: use of uninitialized variable in mpileup (rare) + + * Bugfix: wrong BAQ probability (rare) + +Notable changes in bcftools: + + * Support indel in the contrast caller. + + * Bugfix: LRT2=nan in rare cases + +(0.1.18: 2 September 2011, r982:295) + + + +Beta Release 0.1.17 (6 July, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With the maturity of `mpileup' and the lack of update in the `pileup' command, +the `pileup' command is now formally dropped. Most of the pileup functionality, +such as outputting mapping quality and read positions, have been added +`mpileup'. + +Since this release, `bcftools view' is able to perform contrast SNP calling +(option -T) for discovering de novo and/or somatic mutations between a pair of +samples or in a family trio. Potential mutations are scored by a log likelihood +ratio, which is very simple in math, but should be comparable to more +sophisticated methods. Note that getting the score is only the very first step. +A lot more need to be done to reduce systematical errors due to mapping and +reference errors and structural variations. + +Other notable changes in samtools: + + * Improved sorting order checking during indexing. + + * Improved region parsing. Colons in reference sequence names are parsed + properly. + + * Fixed an issue where mpileup does not apply BAQ for the first few reads when + a region is specified. 
+ + * Fixed an issue where `faidx' does not work with FASTA files with long lines. + + * Bugfix: wrong SP genotype information in the BCF output. + +Other notable changes in bcftools: + + * Output the ML estimate of the allele count. + + * Added the HWE plus F<0 filter to varFilter. For multiple samples, it + effectively filters false heterozygous calls around centromeres. + + * For association mapping, perform both 1-degree and 2-degree test. The + 2-degree test is conservative but more robust to HWE violation. + +(0.1.17: 6 July 2011, r973:277) + + + +Beta Release 0.1.16 (21 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools: + + * Support the new SAM/BAM type `B' in the latest SAM spec v1.4. + + * When the output file of `samtools merge' exists, do not overwrite it unless + a new command-line option `-f' is applied. + + * Bugfix: BED support is not working when the input BED is not sorted. + + * Bugfix: some reads without coordinates but given on the reverse strand are + lost in merging. + +Notable changes in bcftools: + + * Code cleanup: separated max-likelihood inference and Bayesian inference. + + * Test Hardy-Weinberg equilibrium with a likelihood-ratio test. + + * Provided another association test P-value by likelihood-ratio test. + + * Use Brent's method to estimate the site allele frequency when EM converges + slowly. The resulting ML estimate of allele frequency is more accurate. + + * Added the `ldpair' command, which computes r^2 between SNP pairs given in + an input file. + +Also, the `pileup' command, which has been deprecated by `mpileup' since +version 0.1.10, will be dropped in the next release. The old `pileup' command +is substandard and causing a lot of confusion.
+ +(0.1.16: 21 April 2011, r963:234) + + + +Beta Release 0.1.15 (10 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes: + + * Allow to perform variant calling or to extract information in multiple + regions specified by a BED file (`samtools mpileup -l', `samtools view -L' + and `bcftools view -l'). + + * Added the `depth' command to samtools to compute the per-base depth with a + simpler interface. File `bam2depth.c', which implements this command, is the + recommended example on how to use the mpileup APIs. + + * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg + test using this estimate. + + * For `samtools view', when `-R' is specified, drop read groups in the header + that are not contained in the specified file. + + * For `samtools flagstat', separate QC-pass and QC-fail reads. + + * Improved the command line help of `samtools mpileup' and `bcftools view'. + + * Use a global variable to control the verbose level of samtools stderr + output. Nonetheless, it has not been fully utilized. + + * Fixed an issue in association test which may report false associations, + possibly due to floating point underflow. + +(0.1.15: 10 April 2011, r949:203) + + + +Beta release 0.1.14 (21 March, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release implements a method for testing associations for case-control +data. The method does not call genotypes but instead sums over all genotype +configurations to compute a chi^2 based test statistics. It can be potentially +applied to comparing a pair of samples (e.g. a tumor-normal pair), but this +has not been evaluated on real data. + +Another new feature is to make X chromosome variant calls when female and male +samples are both present. The user needs to provide a file indicating the +ploidy of each sample (see also manual bcftools/bcftools.1).
+ +Other notable changes: + + * Added `bcftools view -F' to parse BCF files generated by samtools r921 or + older which encodes PL in a different way. + + * Changed the behavior of `bcftools view -s'. Now when a list of samples is + provided, the samples in the output will be reordered to match the ordering + in the sample list. This change is mainly designed for association test. + + * Sped up `bcftools view -v' for target sequencing given thousands of samples. + Also added a new option `view -d' to skip loci where only a few samples are + covered by reads. + + * Dropped HWE test. This feature has never been implemented properly. An EM + should be much better. To be implemented in future. + + * Added the `cat' command to samtools. This command concatenates BAMs with + identical sequence dictionaries in an efficient way. Modified from bam_cat.c + written by Chris Saunders. + + * Added `samtools view -1' to write BAMs at a low compression level but twice + faster to create. The `sort' command generates temporary files at a low + compression level as well. + + * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality + strings (strictly speaking, such a file is not BAM). + + * Added `samtools mpileup -L' to skip INDEL calling in regions with + excessively high coverage. Such regions dramatically slow down mpileup. + + * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc. + +(0.1.14: 21 March 2011, r933:170) + + + +Beta release 0.1.13 (1 March, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The most important though largely invisible modification is the change of the +order of genotypes in the PL VCF/BCF tag. This is to conform the upcoming VCF +spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF +generated by samtools older than r921 inclusive. VCF/BCF generated by the new +samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools +version number.
+ +Single Individual Haplotyping (SIH) is added as an experimental feature. It +originally aims to produce haploid consensus from fosmid pool sequencing, but +also works with short-read data. For short reads, phased blocks are usually too +short to be useful in many applications, but they can help to rule out some of the +SNPs close to INDELs or between copies of CNVs. + + +Other notable changes in samtools: + + * Construct per-sample consensus to reduce the effect of nearby SNPs in INDEL + calling. This reduces the power but improves specificity. + + * Improved sorting order checking in indexing. Now indexing is the preferred way + to check if a BAM is sorted. + + * Added a switch `-E' to mpileup and calmd. This option uses an alternative way + to apply BAQ, which increases sensitivity, especially to MNPs, at the cost of + a little loss in specificity. + + * Added `mpileup -A' to allow the use of reads in anomalous pairs in SNP calling. + + * Added `mpileup -m' to allow fine control of the collection of INDEL candidates. + + * Added `mpileup -S' to compute per-sample strand bias P-value. + + * Added `mpileup -G' to exclude read groups in variant calling. + + * Fixed segfault in indel calling related to unmapped and refskip reads. + + * Fixed an integer overflow in INDEL calling. This bug produces wrong INDEL + genotypes for longer short INDELs, typically over 10bp. + + * Fixed a bug in tview on big-endian machines. + + * Fixed a very rare memory issue in bam_md.c. + + * Fixed an out-of-boundary bug in mpileup when the read base is `N'. + + * Fixed a compiling error when the knetfile library is not used. Fixed a + library compiling error due to the lack of bam_nt16_nt4_table[] table. + Suppress a compiling warning related to the latest zlib. + + +Other notable changes in bcftools: + + * Updated the BCF spec.
+ + * Added the `FQ' VCF INFO field, which gives the phred-scaled probability + of all samples being the same (identical to the reference or all homozygous + variants). Option `view -f' has been dropped. + + * Implementated of "vcfutils.pl vcf2fq" to generate a consensus sequence + similar to "samtools.pl pileup2fq". + + * Make sure the GT FORMAT field is always the first FORMAT to conform the VCF + spec. Drop bcf-fix.pl. + + * Output bcftools specific INFO and FORMAT in the VCF header. + + * Added `view -s' to call variants from a subset of samples. + + * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless, + custom fields are still unparsed and will be stored as a missing value. + + * Fixed a minor bug in Fisher's exact test; the results are rarely changed. + + +(0.1.13: 1 March 2011, r926:134) + + + Beta release 0.1.12a (2 December, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -532,4 +803,4 @@ Beta Release 0.1.1 (22 December, 2008) The is the first public release of samtools. For more information, please check the manual page `samtools.1' and the samtools website -http://samtools.sourceforge.net \ No newline at end of file +http://samtools.sourceforge.net diff --git a/sam/bam.c b/sam/bam.c index 521c1dd..0055e84 100644 --- a/sam/bam.c +++ b/sam/bam.c @@ -7,7 +7,7 @@ #include "kstring.h" #include "sam_header.h" -int bam_is_be = 0; +int bam_is_be = 0, bam_verbose = 2; char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; /************************** @@ -32,7 +32,7 @@ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) int32_t l = 0; for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP) + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) l += cigar[k] >> BAM_CIGAR_SHIFT; } return l; @@ -79,7 +79,7 @@ bam_header_t *bam_header_read(bamFile fp) // with ESPIPE. 
Suppress the error message in this case. if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF"); } - else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n"); + else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. The input is probably truncated.\n"); // read "BAM1" magic_len = bam_read(fp, buf, 4); if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { @@ -160,6 +160,19 @@ static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + else if (type == 'B') { + int32_t n, Bsize = bam_aux_type2size(*s); + memcpy(&n, s + 1, 4); + if (1 == Bsize) { + } else if (2 == Bsize) { + for (i = 0; i < n; i += 2) + bam_swap_endian_2p(s + 5 + i); + } else if (4 == Bsize) { + for (i = 0; i < n; i += 4) + bam_swap_endian_4p(s + 5 + i); + } + bam_swap_endian_4p(s+1); + } } } @@ -255,7 +268,7 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) else { for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); - kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); + kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); } } kputc('\t', &str); @@ -289,6 +302,23 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } + else if (type == 'B') { + uint8_t sub_type = *(s++); + int32_t n; + memcpy(&n, s, 4); + s += 4; // no point to the start of the array + kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing + for (i = 0; i < n; ++i) { + kputc(',', &str); + if ('c' == 
sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; } + else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; } + else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; } + else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; } + else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; } + else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; } + else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; } + } + } } return str.s; } diff --git a/sam/bam.h b/sam/bam.h index eef2ea9..346c750 100644 --- a/sam/bam.h +++ b/sam/bam.h @@ -33,13 +33,15 @@ BAM library provides I/O and various operations on manipulating files in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) - format. It now supports importing from or exporting to TAM, sorting, + format. It now supports importing from or exporting to SAM, sorting, merging, generating pileup, and quickly retrieval of reads overlapped with a specified region. @copyright Genome Research Ltd. */ +#define BAM_VERSION "0.1.18 (r982:295)" + #include #include #include @@ -132,20 +134,25 @@ typedef struct { /* CIGAR operations. */ -/*! @abstract CIGAR: match */ +/*! @abstract CIGAR: M = match or mismatch*/ #define BAM_CMATCH 0 -/*! @abstract CIGAR: insertion to the reference */ +/*! @abstract CIGAR: I = insertion to the reference */ #define BAM_CINS 1 -/*! @abstract CIGAR: deletion from the reference */ +/*! @abstract CIGAR: D = deletion from the reference */ #define BAM_CDEL 2 -/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */ +/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ #define BAM_CREF_SKIP 3 -/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */ +/*! @abstract CIGAR: S = clip on the read with clipped sequence + present in qseq */ #define BAM_CSOFT_CLIP 4 -/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */ +/*! 
@abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ #define BAM_CHARD_CLIP 5 -/*! @abstract CIGAR: padding */ +/*! @abstract CIGAR: P = padding */ #define BAM_CPAD 6 +/*! @abstract CIGAR: equals = match */ +#define BAM_CEQUAL 7 +/*! @abstract CIGAR: X = mismatch */ +#define BAM_CDIFF 8 /*! @typedef @abstract Structure for core alignment information. @@ -262,6 +269,12 @@ typedef struct __bam_iter_t *bam_iter_t; */ extern int bam_is_be; +/*! + @abstract Verbose level between 0 and 3; 0 is supposed to disable all + debugging information, though this may not have been implemented. + */ +extern int bam_verbose; + /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ extern unsigned char bam_nt16_table[256]; @@ -738,4 +751,13 @@ static inline bam1_t *bam_dup1(const bam1_t *src) return b; } +static inline int bam_aux_type2size(int x) +{ + if (x == 'C' || x == 'c' || x == 'A') return 1; + else if (x == 'S' || x == 's') return 2; + else if (x == 'I' || x == 'i' || x == 'f') return 4; + else return 0; +} + + #endif diff --git a/sam/bam2bcf.c b/sam/bam2bcf.c index 088635c..dec3305 100644 --- a/sam/bam2bcf.c +++ b/sam/bam2bcf.c @@ -11,6 +11,7 @@ extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); #define CALL_ETA 0.03f #define CALL_MAX 256 #define CALL_DEFTHETA 0.83f +#define DEF_MAPQ 20 #define CAP_DIST 25 @@ -23,6 +24,8 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; bca->min_baseQ = min_baseQ; bca->e = errmod_init(1. - theta); + bca->min_frac = 0.002; + bca->min_support = 1; return bca; } @@ -36,6 +39,7 @@ void bcf_call_destroy(bcf_callaux_t *bca) * negative if we are looking at an indel. 
*/ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) { + static int *var_pos = NULL, nvar_pos = 0; int i, n, ref4, is_indel, ori_depth = 0; memset(r, 0, sizeof(bcf_callret1_t)); if (ref_base >= 0) { @@ -61,7 +65,8 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t seqQ = is_indel? (p->aux>>8&0xff) : 99; if (q < bca->min_baseQ) continue; if (q > seqQ) q = seqQ; - mapQ = p->b->core.qual < bca->capQ? p->b->core.qual : bca->capQ; + mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 + mapQ = mapQ < bca->capQ? mapQ : bca->capQ; if (q > mapQ) q = mapQ; if (q > 63) q = 63; if (q < 4) q = 4; @@ -75,7 +80,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t } bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; // collect annotations - r->qsum[b] += q; + if (b < 4) r->qsum[b] += q; ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)]; min_dist = p->b->core.l_qseq - 1 - p->qpos; if (min_dist > p->qpos) min_dist = p->qpos; @@ -90,9 +95,92 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t r->depth = n; r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); + + // Calculate the Variant Distance Bias (make it optional?) + if ( nvar_pos < _n ) { + nvar_pos = _n; + var_pos = realloc(var_pos,sizeof(int)*nvar_pos); + } + int alt_dp=0, read_len=0; + for (i=0; i<_n; i++) { + const bam_pileup1_t *p = pl + i; + if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) + continue; + + var_pos[alt_dp] = p->qpos; + if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 ) + var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT; + + alt_dp++; + read_len += p->b->core.l_qseq; + } + float mvd=0; + int j; + n=0; + for (i=0; imvd[0] = n ? mvd/n : 0; + r->mvd[1] = alt_dp; + r->mvd[2] = alt_dp ? 
read_len/alt_dp : 0; + return r->depth; } + +void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call) +{ + // Variant distance bias. Samples merged by means of DP-weighted average. + + float weight=0, tot_prob=0; + + int i; + for (i=0; i2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14); + } + else + { + // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths. + if ( dp>5 ) + dp = 5; + float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1)); + float norm = 1.125*sqrt(2*3.14*sigma2); + float mu = read_len/2.9; + if ( mvd < mu ) + prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm; + else + prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm; + } + + //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob); + tot_prob += prob*dp; + weight += dp; + } + tot_prob = weight ? tot_prob/weight : 1; + //fprintf(stderr,"prob=%f\n", tot_prob); + call->vdb = tot_prob; +} + int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j, qsum[4]; @@ -140,8 +228,8 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, x = call->n_alleles * (call->n_alleles + 1) / 2; // get the possible genotypes for (i = z = 0; i < call->n_alleles; ++i) - for (j = i; j < call->n_alleles; ++j) - g[z++] = call->a[i] * 5 + call->a[j]; + for (j = 0; j <= i; ++j) + g[z++] = call->a[j] * 5 + call->a[i]; for (i = 0; i < n; ++i) { uint8_t *PL = call->PL + x * i; const bcf_callret1_t *r = calls + i; @@ -166,6 +254,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, call->ori_depth += calls[i].ori_depth; for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } + + calc_vdb(n, calls, call); + return 0; } @@ -219,6 +310,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc if (i) kputc(',', &s); kputw(bc->anno[i], &s); } + if ( bc->vdb!=1 ) + { + ksprintf(&s, ";VDB=%.4f", bc->vdb); + } 
kputc('\0', &s); // FMT kputs("PL", &s); @@ -232,7 +327,7 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n); if (bcr) { uint16_t *dp = (uint16_t*)b->gi[1].data; - uint8_t *sp = is_SP? b->gi[2].data : 0; + int32_t *sp = is_SP? b->gi[2].data : 0; for (i = 0; i < bc->n; ++i) { bcf_callret1_t *p = bcr + i; dp[i] = p->depth < 0xffff? p->depth : 0xffff; diff --git a/sam/bam2bcf.h b/sam/bam2bcf.h index 26b022c..4af080c 100644 --- a/sam/bam2bcf.h +++ b/sam/bam2bcf.h @@ -9,7 +9,9 @@ typedef struct __bcf_callaux_t { int capQ, min_baseQ; - int openQ, extQ, tandemQ; + int openQ, extQ, tandemQ; // for indels + int min_support; // for collecting indel candidates + double min_frac; // for collecting indel candidates // for internal uses int max_bases; int indel_types[4]; @@ -24,6 +26,7 @@ typedef struct { int depth, ori_depth, qsum[4]; int anno[16]; float p[25]; + int mvd[3]; // mean variant distance, number of variant reads, average read length } bcf_callret1_t; typedef struct { @@ -31,6 +34,7 @@ typedef struct { int n, n_alleles, shift, ori_ref, unseen; int anno[16], depth, ori_depth; uint8_t *PL; + float vdb; // variant distance bias } bcf_call_t; #ifdef __cplusplus diff --git a/sam/bam2bcf_indel.c b/sam/bam2bcf_indel.c index 16241d0..5142b3e 100644 --- a/sam/bam2bcf_indel.c +++ b/sam/bam2bcf_indel.c @@ -3,15 +3,16 @@ #include #include "bam.h" #include "bam2bcf.h" -#include "ksort.h" #include "kaln.h" #include "kprobaln.h" #include "khash.h" KHASH_SET_INIT_STR(rg) +#include "ksort.h" +KSORT_INIT_GENERIC(uint32_t) + #define MINUS_CONST 0x10000000 #define INDEL_WINDOW_SIZE 50 -#define MIN_SUPPORT_COEF 500 void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list) { @@ -65,7 +66,7 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; int l = cigar[k] >> BAM_CIGAR_SHIFT; - if (op == 
BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { if (c->pos > tpos) return y; if (x + l > tpos) { *_tpos = tpos; @@ -111,10 +112,9 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, const void *rghash) { - extern void ks_introsort_uint32_t(int, uint32_t*); int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2; int N, K, l_run, ref_type, n_alt; - char *inscns = 0, *ref2, *query; + char *inscns = 0, *ref2, *query, **ref_sample; khash_t(rg) *hash = (khash_t(rg)*)rghash; if (ref == 0 || bca == 0) return -1; // mark filtered reads @@ -165,9 +165,15 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla // squeeze out identical types for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; - if (n_types == 1 || n_alt * MIN_SUPPORT_COEF < n_tot) { // no indels or too few supporting reads + if (n_types == 1 || (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support) { // then skip free(aux); return -1; } + if (n_types >= 64) { + free(aux); + if (bam_verbose >= 2) + fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); + return -1; + } types = (int*)calloc(n_types, sizeof(int)); t = 0; types[t++] = aux[0] - MINUS_CONST; @@ -178,7 +184,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (t = 0; t < n_types; ++t) if (types[t] == 0) break; ref_type = t; // the index of the reference type (0) - assert(n_types < 64); } { // calculate left and right boundary left = pos > INDEL_WINDOW_SIZE? 
pos - INDEL_WINDOW_SIZE : 0; @@ -189,6 +194,58 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (ref[i] == 0) break; right = i; } + /* The following block fixes a long-existing flaw in the INDEL + * calling model: the interference of nearby SNPs. However, it also + * reduces the power because sometimes, substitutions caused by + * indels are not distinguishable from true mutations. Multiple + * sequence realignment helps to increase the power. + */ + { // construct per-sample consensus + int L = right - left + 1, max_i, max2_i; + uint32_t *cns, max, max2; + char *ref0, *r; + ref_sample = calloc(n, sizeof(void*)); + cns = calloc(L, 4); + ref0 = calloc(L, 1); + for (i = 0; i < right - left; ++i) + ref0[i] = bam_nt16_table[(int)ref[i+left]]; + for (s = 0; s < n; ++s) { + r = ref_sample[s] = calloc(L, 1); + memset(cns, 0, sizeof(int) * L); + // collect ref and non-ref counts + for (i = 0; i < n_plp[s]; ++i) { + bam_pileup1_t *p = plp[s] + i; + bam1_t *b = p->b; + uint32_t *cigar = bam1_cigar(b); + uint8_t *seq = bam1_seq(b); + int x = b->core.pos, y = 0; + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k]&0xf; + int j, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (j = 0; j < l; ++j) + if (x + j >= left && x + j < right) + cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 
1 : 0x10000; + x += l; y += l; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; + } + } + // determine the consensus + for (i = 0; i < right - left; ++i) r[i] = ref0[i]; + max = max2 = 0; max_i = max2_i = -1; + for (i = 0; i < right - left; ++i) { + if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; + else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; + } + if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; + if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; + if (max_i >= 0) r[max_i] = 15; + if (max2_i >= 0) r[max2_i] = 15; +// for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); + } + free(ref0); free(cns); + } { // the length of the homopolymer run around the current position int c = bam_nt16_table[(int)ref[pos + 1]]; if (c == 15) l_run = 1; @@ -252,27 +309,29 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla else ir = est_indelreg(pos, ref, -types[t], 0); if (ir > bca->indelreg) bca->indelreg = ir; // fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir); - // write ref2 - for (k = 0, j = left; j <= pos; ++j) - ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; - if (types[t] <= 0) j += -types[t]; - else for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; - if (types[0] < 0) { // mask deleted sequences to avoid a particular error in the model. - int jj, tmp = types[t] >= 0? 
-types[0] : -types[0] + types[t]; - for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j) - ref2[k++] = 4; - } - for (; j < right && ref[j]; ++j) - ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; - for (; k < max_ref2; ++k) ref2[k] = 4; - if (j < right) right = j; - // align each read to ref2 + // realignment for (s = K = 0; s < n; ++s) { + // write ref2 + for (k = 0, j = left; j <= pos; ++j) + ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]]; + if (types[t] <= 0) j += -types[t]; + else for (l = 0; l < types[t]; ++l) + ref2[k++] = inscns[t*max_ins + l]; + for (; j < right && ref[j]; ++j) + ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]]; + for (; k < max_ref2; ++k) ref2[k] = 4; + if (j < right) right = j; + // align each read to ref2 for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; - int qbeg, qend, tbeg, tend, sc; + int qbeg, qend, tbeg, tend, sc, kk; uint8_t *seq = bam1_seq(p->b); + uint32_t *cigar = bam1_cigar(p->b); + if (p->b->core.flag&4) continue; // unmapped reads + // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. + for (kk = 0; kk < p->b->core.n_cigar; ++kk) + if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; + if (kk < p->b->core.n_cigar) continue; // FIXME: the following skips soft clips, but using them may be more sensitive. // determine the start and end of sequences for alignment qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); @@ -367,9 +426,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); // pick the smaller between indelQ1 and indelQ2 indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? 
indelQ : seqQ; -// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ); +// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@ -407,6 +468,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } free(score1); free(score2); // free + for (i = 0; i < n; ++i) free(ref_sample[i]); + free(ref_sample); free(types); free(inscns); return n_alt > 0? 0 : -1; } diff --git a/sam/bam2depth.c b/sam/bam2depth.c new file mode 100644 index 0000000..ca36b89 --- /dev/null +++ b/sam/bam2depth.c @@ -0,0 +1,112 @@ +/* This program demonstrates how to generate pileup from multiple BAMs + * simutaneously, to achieve random access and to use the BED interface. + * To compile this program separately, you may: + * + * gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz + */ +#include +#include +#include +#include +#include "bam.h" + +typedef struct { // auxiliary data structure + bamFile fp; // the file handler + bam_iter_t iter; // NULL if a region not specified + int min_mapQ; // mapQ filter +} aux_t; + +void *bed_read(const char *fn); // read a BED or position list file +void bed_destroy(void *_h); // destroy the BED data structure +int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps + +// This function reads a BAM alignment from one BAM file. +static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup +{ + aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure + int ret = aux->iter? 
bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + return ret; +} + +#ifdef _MAIN_BAM2DEPTH +int main(int argc, char *argv[]) +#else +int main_depth(int argc, char *argv[]) +#endif +{ + int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; + const bam_pileup1_t **plp; + char *reg = 0; // specified region + void *bed = 0; // BED data structure + bam_header_t *h = 0; // BAM header of the 1st input + aux_t **data; + bam_mplp_t mplp; + + // parse the command line + while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { + switch (n) { + case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header + case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now + case 'q': baseQ = atoi(optarg); break; // base quality threshold + case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold + } + } + if (optind == argc) { + fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] [...]\n"); + return 1; + } + + // initialize the auxiliary data structures + n = argc - optind; // the number of BAMs on the command line + data = calloc(n, sizeof(void*)); // data[i] for the i-th input + beg = 0; end = 1<<30; tid = -1; // set the default region + for (i = 0; i < n; ++i) { + bam_header_t *htmp; + data[i] = calloc(1, sizeof(aux_t)); + data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM + data[i]->min_mapQ = mapQ; // set the mapQ filter + htmp = bam_header_read(data[i]->fp); // read the BAM header + if (i == 0) { + h = htmp; // keep the header of the 1st BAM + if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region + } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header + if (tid >= 0) { // if a region is specified and parsed successfully + bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index + data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the 
iterator + bam_index_destroy(idx); // the index is not needed any more; phase out of the memory + } + } + + // the core multi-pileup loop + mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization + n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM + plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) + while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position + if (pos < beg || pos >= end) continue; // out of range; skip + if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip + fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster + for (i = 0; i < n; ++i) { // base level filters have to go here + int j, m = 0; + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know + if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos + else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality + } + printf("\t%d", n_plp[i] - m); // this the depth to output + } + putchar('\n'); + } + free(n_plp); free(plp); + bam_mplp_destroy(mplp); + + bam_header_destroy(h); + for (i = 0; i < n; ++i) { + bam_close(data[i]->fp); + if (data[i]->iter) bam_iter_destroy(data[i]->iter); + free(data[i]); + } + free(data); free(reg); + if (bed) bed_destroy(bed); + return 0; +} diff --git a/sam/bam_aux.c b/sam/bam_aux.c index fbcd982..28b22e3 100644 --- a/sam/bam_aux.c +++ b/sam/bam_aux.c @@ -26,14 +26,12 @@ uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) } #define __skip_tag(s) do { \ - int type = toupper(*(s)); \ - ++(s); \ - if (type == 'C' || type == 'A') ++(s); \ - else if (type == 'S') (s) += 2; \ - else if (type == 'I' || type == 'F') (s) += 4; \ - else if (type == 'D') (s) += 8; \ - else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ - } while (0) + 
int type = toupper(*(s)); \ + ++(s); \ + if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ + else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ + else (s) += bam_aux_type2size(type); \ + } while(0) uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { @@ -61,6 +59,23 @@ int bam_aux_del(bam1_t *b, uint8_t *s) return 0; } +int bam_aux_drop_other(bam1_t *b, uint8_t *s) +{ + if (s) { + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(aux, p, s - p); + b->data_len -= b->l_aux - (s - p); + b->l_aux = s - p; + } else { + b->data_len -= b->l_aux; + b->l_aux = 0; + } + return 0; +} + void bam_init_header_hash(bam_header_t *header) { if (header->hash == 0) { @@ -89,47 +104,56 @@ int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) return k == kh_end(h)? -1 : kh_value(h, k); } -int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end) +int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) { - char *s, *p; - int i, l, k; + char *s; + int i, l, k, name_end; khiter_t iter; khash_t(s) *h; bam_init_header_hash(header); h = (khash_t(s)*)header->hash; - l = strlen(str); - p = s = (char*)malloc(l+1); - /* squeeze out "," */ - for (i = k = 0; i != l; ++i) - if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; - s[k] = 0; - for (i = 0; i != k; ++i) if (s[i] == ':') break; - s[i] = 0; - iter = kh_get(s, h, s); /* get the ref_id */ - if (iter == kh_end(h)) { // name not found - *ref_id = -1; free(s); - return -1; - } - *ref_id = kh_value(h, iter); - if (i == k) { /* dump the whole sequence */ - *begin = 0; *end = 1<<29; free(s); - return 0; - } - for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; - *begin = atoi(p); - if (i < k) { - p = s + i + 1; - *end = atoi(p); - } else *end = 1<<29; - if (*begin > 0) --*begin; + *ref_id = *beg = *end = -1; + name_end = l = strlen(str); + s = (char*)malloc(l+1); + // 
remove space + for (i = k = 0; i < l; ++i) + if (!isspace(str[i])) s[k++] = str[i]; + s[k] = 0; l = k; + // determine the sequence name + for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end + if (i >= 0) name_end = i; + if (name_end < l) { // check if this is really the end + int n_hyphen = 0; + for (i = name_end + 1; i < l; ++i) { + if (s[i] == '-') ++n_hyphen; + else if (!isdigit(s[i]) && s[i] != ',') break; + } + if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name + s[name_end] = 0; + iter = kh_get(s, h, s); + if (iter == kh_end(h)) { // cannot find the sequence name + iter = kh_get(s, h, str); // try str as the name + if (iter == kh_end(h)) { + if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__); + free(s); return -1; + } else s[name_end] = ':', name_end = l; + } + } else iter = kh_get(s, h, str); + *ref_id = kh_val(h, iter); + // parse the interval + if (name_end < l) { + for (i = k = name_end + 1; i < l; ++i) + if (s[i] != ',') s[k++] = s[i]; + s[k] = 0; + *beg = atoi(s + name_end + 1); + for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; + *end = i < k? atoi(s + i + 1) : 1<<29; + if (*beg > 0) --*beg; + } else *beg = 0, *end = 1<<29; free(s); - if (*begin > *end) { - fprintf(stderr, "[bam_parse_region] invalid region.\n"); - return -1; - } - return 0; + return *beg <= *end? 0 : -1; } int32_t bam_aux2i(const uint8_t *s) @@ -180,3 +204,10 @@ char *bam_aux2Z(const uint8_t *s) if (type == 'Z' || type == 'H') return (char*)s; else return 0; } + +#ifdef _WIN32 +double drand48() +{ + return (double)rand() / RAND_MAX; +} +#endif diff --git a/sam/bam_cat.c b/sam/bam_cat.c new file mode 100644 index 0000000..0fde045 --- /dev/null +++ b/sam/bam_cat.c @@ -0,0 +1,184 @@ +/* + +bam_cat -- efficiently concatenates bam files + +bam_cat can be used to concatenate BAM files. 
Under special +circumstances, it can be used as an alternative to 'samtools merge' to +concatenate multiple sorted files into a single sorted file. For this +to work each file must be sorted, and the sorted files must be given +as command line arguments in order such that the final read in file i +is less than or equal to the first read in file i+1. + +This code is derived from the bam_reheader function in samtools 0.1.8 +and modified to perform concatenation by Chris Saunders on behalf of +Illumina. + + +########## License: + +The MIT License + +Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. +Modified SAMtools work copyright (c) 2010 Illumina, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+ +*/ + + +/* +makefile: +""" +CC=gcc +CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR) +LDFLAGS+=-L$(SAMTOOLS_DIR) +LDLIBS+=-lbam -lz + +all:bam_cat +""" +*/ + + +#include +#include +#include + +#include "bgzf.h" +#include "bam.h" + +#define BUF_SIZE 0x10000 + +#define GZIPID1 31 +#define GZIPID2 139 + +#define BGZF_EMPTY_BLOCK_SIZE 28 + + +int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) +{ + BGZF *fp; + FILE* fp_file; + uint8_t *buf; + uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; + const int es=BGZF_EMPTY_BLOCK_SIZE; + int i; + + fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); + if (fp == 0) { + fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); + return 1; + } + if (h) bam_header_write(fp, h); + + buf = (uint8_t*) malloc(BUF_SIZE); + for(i = 0; i < nfn; ++i){ + BGZF *in; + bam_header_t *old; + int len,j; + + in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r"); + if (in == 0) { + fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); + return -1; + } + if (in->open_mode != 'r') return -1; + + old = bam_header_read(in); + if (h == 0 && i == 0) bam_header_write(fp, old); + + if (in->block_offset < in->block_length) { + bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); + bgzf_flush(fp); + } + + j=0; +#ifdef _USE_KNETFILE + fp_file=fp->x.fpw; + while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { +#else + fp_file=fp->file; + while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { +#endif + if(len= 0) { + switch (c) { + case 'h': { + tamFile fph = sam_open(optarg); + if (fph == 0) { + fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]); + return 1; + } + h = sam_header_read(fph); + sam_close(fph); + break; + } + case 'o': outfn = strdup(optarg); break; + } + } + if (argc - optind < 2) { + 
fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] [...]\n"); + return 1; + } + ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); + free(outfn); + return ret; +} diff --git a/sam/bam_import.c b/sam/bam_import.c index 9d84328..5518a9c 100644 --- a/sam/bam_import.c +++ b/sam/bam_import.c @@ -14,7 +14,7 @@ #include "kseq.h" #include "khash.h" -KSTREAM_INIT(gzFile, gzread, 8192) +KSTREAM_INIT(gzFile, gzread, 16384) KHASH_MAP_INIT_STR(ref, uint64_t) void bam_init_header_hash(bam_header_t *header); @@ -292,20 +292,22 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) z += str->l + 1; if (str->s[0] != '*') { for (s = str->s; *s; ++s) { - if (isalpha(*s)) ++c->n_cigar; + if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); } b->data = alloc_data(b, doff + c->n_cigar * 4); for (i = 0, s = str->s; i != c->n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); - if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; + if (op == 'M') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; else if (op == 'S') op = BAM_CSOFT_CLIP; else if (op == 'H') op = BAM_CHARD_CLIP; else if (op == 'P') op = BAM_CPAD; + else if (op == '=') op = BAM_CEQUAL; + else if (op == 'X') op = BAM_CDIFF; else parse_error(fp->n_lines, "invalid CIGAR operation"); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; @@ -337,8 +339,11 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) z += str->l + 1; if (strcmp(str->s, "*")) { c->l_qseq = strlen(str->s); - if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) - parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { + fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", + (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, 
bam1_cigar(b))); + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + } p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; memset(p, 0, (c->l_qseq+1)/2); for (i = 0; i < c->l_qseq; ++i) @@ -427,6 +432,27 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) memcpy(s, str->s + 5, str->l - 5); s[str->l - 5] = 0; doff += size; + } else if (type == 'B') { + int32_t n = 0, Bsize, k = 0, size; + char *p; + if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B"); + Bsize = bam_aux_type2size(str->s[5]); // the size of each element + for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array + if (*p == ',') ++n; + p = str->s + 7; // now p points to the first number in the array + size = 6 + Bsize * n; // total number of bytes allocated to this tag + s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory + *s++ = 'B'; *s++ = str->s[5]; + memcpy(s, &n, 4); s += 4; // write the number of elements + if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory + else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtod(p, &p), ++p; + else parse_error(fp->n_lines, "unrecognized array type"); + s += Bsize * n; doff += size; } else parse_error(fp->n_lines, "unrecognized type"); if (dret == '\n' || dret == '\r') break; } diff --git 
a/sam/bam_index.c b/sam/bam_index.c index 328f011..9610a26 100644 --- a/sam/bam_index.c +++ b/sam/bam_index.c @@ -172,19 +172,23 @@ bam_index_t *bam_index_core(bamFile fp) save_bin = save_tid = last_tid = last_bin = 0xffffffffu; save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; - n_mapped = n_unmapped = n_no_coor = off_end = 0; + n_mapped = n_unmapped = n_no_coor = off_end = 0; off_beg = off_end = bam_tell(fp); while ((ret = bam_read1(fp, b)) >= 0) { if (c->tid < 0) ++n_no_coor; - if (last_tid != c->tid) { // change of chromosomes + if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes last_tid = c->tid; last_bin = 0xffffffffu; - } else if (last_coor > c->pos) { + } else if ((uint32_t)last_tid > (uint32_t)c->tid) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n", + bam1_qname(b), last_tid+1, c->tid+1); + return NULL; + } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) { fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", bam1_qname(b), last_coor, c->pos, c->tid+1); - exit(1); + return NULL; } - if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); @@ -203,7 +207,7 @@ bam_index_t *bam_index_core(bamFile fp) if (bam_tell(fp) <= last_off) { fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", (unsigned long long)bam_tell(fp), (unsigned long long)last_off); - exit(1); + return NULL; } if (c->flag & BAM_FUNMAP) ++n_unmapped; else ++n_mapped; @@ -217,8 +221,15 @@ bam_index_t *bam_index_core(bamFile fp) } merge_chunks(idx); fill_missing(idx); - if (ret >= 0) - while ((ret = bam_read1(fp, b)) 
>= 0) ++n_no_coor; + if (ret >= 0) { + while ((ret = bam_read1(fp, b)) >= 0) { + ++n_no_coor; + if (c->tid >= 0 && n_no_coor) { + fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n"); + return NULL; + } + } + } if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); free(b->data); free(b); idx->n_no_coor = n_no_coor; @@ -466,6 +477,10 @@ int bam_index_build2(const char *fn, const char *_fnidx) } idx = bam_index_core(fp); bam_close(fp); + if(idx == 0) { + fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n"); + return -1; + } if (_fnidx == 0) { fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); diff --git a/sam/bam_maqcns.c b/sam/bam_maqcns.c deleted file mode 100644 index 4fbc6c6..0000000 --- a/sam/bam_maqcns.c +++ /dev/null @@ -1,628 +0,0 @@ -#include -#include -#include "bam.h" -#include "bam_maqcns.h" -#include "ksort.h" -#include "errmod.h" -#include "kaln.h" -KSORT_INIT_GENERIC(uint32_t) - -#define INDEL_WINDOW_SIZE 50 -#define INDEL_EXT_DEP 0.9 - -typedef struct __bmc_aux_t { - int max; - uint32_t *info; - uint16_t *info16; - errmod_t *em; -} bmc_aux_t; - -typedef struct { - float esum[4], fsum[4]; - uint32_t c[4]; -} glf_call_aux_t; - -char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; - -/* - P() = \theta \sum_{i=1}^{N-1} 1/i - P(D|) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2] - p_k = 1/k / \sum_{i=1}^{N-1} 1/i - */ -static void cal_het(bam_maqcns_t *aa) -{ - int k, n1, n2; - double sum_harmo; // harmonic sum - double poly_rate; - - free(aa->lhet); - aa->lhet = (double*)calloc(256 * 256, sizeof(double)); - sum_harmo = 0.0; - for (k = 1; k <= aa->n_hap - 1; ++k) - sum_harmo += 1.0 / k; - for (n1 = 0; n1 < 256; ++n1) { - for (n2 = 0; n2 < 256; ++n2) { - long double sum = 0.0; - double lC = aa->errmod == BAM_ERRMOD_SOAP? 
0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); - for (k = 1; k <= aa->n_hap - 1; ++k) { - double pk = 1.0 / k / sum_harmo; - double log1 = log((double)k/aa->n_hap); - double log2 = log(1.0 - (double)k/aa->n_hap); - sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2)); - } - aa->lhet[n1<<8|n2] = lC + logl(sum); - } - } - poly_rate = aa->het_rate * sum_harmo; - aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate)); -} - -/** initialize the helper structure */ -static void cal_coef(bam_maqcns_t *aa) -{ - int k, n, q; - long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256]; - double *lC; - - if (aa->errmod == BAM_ERRMOD_MAQ2) return; // no need to do the following - // aa->lhet will be allocated and initialized - free(aa->fk); free(aa->coef); - aa->coef = 0; - aa->fk = (double*)calloc(256, sizeof(double)); - aa->fk[0] = fk2[0] = 1.0; - for (n = 1; n != 256; ++n) { - aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta; - fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands - } - if (aa->errmod == BAM_ERRMOD_SOAP) return; - aa->coef = (double*)calloc(256*256*64, sizeof(double)); - lC = (double*)calloc(256 * 256, sizeof(double)); - for (n = 1; n != 256; ++n) - for (k = 1; k <= n; ++k) - lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); - for (q = 1; q != 64; ++q) { - double e = pow(10.0, -q/10.0); - double le = log(e); - double le1 = log(1.0-e); - for (n = 1; n != 256; ++n) { - double *coef = aa->coef + (q<<16|n<<8); - sum_a[n+1] = 0.0; - for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_k \epsilon^k (1-\epsilon)^{n-k} - sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1); - b[k] = sum_a[k+1] / sum_a[k]; - if (b[k] > 0.99) b[k] = 0.99; - } - for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k}) - q_c[k] = -4.343 * fk2[k] * logl(b[k] / e); - for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i - for (k = 0; k <= n; ++k) { // 
powl() in 64-bit mode seems broken on my Mac OS X 10.4.9 - tmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k]))); - coef[k] = (k? q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk} - } - } - } - free(lC); -} - -bam_maqcns_t *bam_maqcns_init() -{ - bam_maqcns_t *bm; - bm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t)); - bm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t)); - bm->het_rate = 0.001; - bm->theta = 0.83f; - bm->n_hap = 2; - bm->eta = 0.03; - bm->cap_mapQ = 60; - bm->min_baseQ = 13; - return bm; -} - -void bam_maqcns_prepare(bam_maqcns_t *bm) -{ - if (bm->errmod == BAM_ERRMOD_MAQ2) bm->aux->em = errmod_init(1. - bm->theta); - cal_coef(bm); cal_het(bm); -} - -void bam_maqcns_destroy(bam_maqcns_t *bm) -{ - if (bm == 0) return; - free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info); free(bm->aux->info16); - if (bm->aux->em) errmod_destroy(bm->aux->em); - free(bm->aux); free(bm); -} - -glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm) -{ - glf_call_aux_t *b = 0; - int i, j, k, w[8], c, n; - glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t)); - float p[16], min_p = 1e30; - uint64_t rms; - - g->ref_base = ref_base; - if (_n == 0) return g; - - // construct aux array - if (bm->aux->max < _n) { - bm->aux->max = _n; - kroundup32(bm->aux->max); - bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max); - bm->aux->info16 = (uint16_t*)realloc(bm->aux->info16, 2 * bm->aux->max); - } - for (i = n = 0, rms = 0; i < _n; ++i) { - const bam_pileup1_t *p = pl + i; - uint32_t q, x = 0, qq; - uint16_t y = 0; - if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; - q = (uint32_t)bam1_qual(p->b)[p->qpos]; - if (q < bm->min_baseQ) continue; - x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; - y |= bam1_strand(p->b)<<4; - if (p->b->core.qual < q) q = p->b->core.qual; - c = p->b->core.qual < bm->cap_mapQ? 
p->b->core.qual : bm->cap_mapQ; - rms += c * c; - x |= q << 24; - y |= q << 5; - qq = bam1_seqi(bam1_seq(p->b), p->qpos); - q = bam_nt16_nt4_table[qq? qq : ref_base]; - if (!p->is_del && !p->is_refskip && q < 4) x |= 1 << 21 | q << 16, y |= q; - bm->aux->info16[n] = y; - bm->aux->info[n++] = x; - } - rms = (uint8_t)(sqrt((double)rms / n) + .499); - if (bm->errmod == BAM_ERRMOD_MAQ2) { - errmod_cal(bm->aux->em, n, 4, bm->aux->info16, p); - goto goto_glf; - } - ks_introsort(uint32_t, n, bm->aux->info); - // generate esum and fsum - b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t)); - for (k = 0; k != 8; ++k) w[k] = 0; - for (j = n - 1; j >= 0; --j) { // calculate esum and fsum - uint32_t info = bm->aux->info[j]; - if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); - k = info>>16&7; - if (info>>24 > 0) { - b->esum[k&3] += bm->fk[w[k]] * (info>>24); - b->fsum[k&3] += bm->fk[w[k]]; - if (w[k] < 0xff) ++w[k]; - ++b->c[k&3]; - } - } - // rescale ->c[] - for (j = c = 0; j != 4; ++j) c += b->c[j]; - if (c > 255) { - for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5); - for (j = c = 0; j != 4; ++j) c += b->c[j]; - } - if (bm->errmod == BAM_ERRMOD_MAQ) { - // generate likelihood - for (j = 0; j != 4; ++j) { - // homozygous - float tmp1, tmp3; - int tmp2, bar_e; - for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) { - if (j == k) continue; - tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k]; - } - if (tmp2) { - bar_e = (int)(tmp1 / tmp3 + 0.5); - if (bar_e < 4) bar_e = 4; // should not happen - if (bar_e > 63) bar_e = 63; - p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; - } else p[j<<2|j] = 0.0; // all the bases are j - // heterozygous - for (k = j + 1; k < 4; ++k) { - for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) { - if (i == j || i == k) continue; - tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i]; - } - if (tmp2) { - bar_e = (int)(tmp1 / tmp3 + 0.5); - if (bar_e < 4) bar_e = 4; - if (bar_e > 63) bar_e = 
63; - p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; - } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either j or k - } - // - for (k = 0; k != 4; ++k) - if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0; - } - - { // fix p[k<<2|k] - float max1, max2, min1, min2; - int max_k, min_k; - max_k = min_k = -1; - max1 = max2 = -1.0; min1 = min2 = 1e30; - for (k = 0; k < 4; ++k) { - if (b->esum[k] > max1) { - max2 = max1; max1 = b->esum[k]; max_k = k; - } else if (b->esum[k] > max2) max2 = b->esum[k]; - } - for (k = 0; k < 4; ++k) { - if (p[k<<2|k] < min1) { - min2 = min1; min1 = p[k<<2|k]; min_k = k; - } else if (p[k<<2|k] < min2) min2 = p[k<<2|k]; - } - if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2)) - p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0; - } - } else if (bm->errmod == BAM_ERRMOD_SOAP) { // apply the SOAP model - // generate likelihood - for (j = 0; j != 4; ++j) { - float tmp; - // homozygous - for (k = 0, tmp = 0.0; k != 4; ++k) - if (j != k) tmp += b->esum[k]; - p[j<<2|j] = tmp; - // heterozygous - for (k = j + 1; k < 4; ++k) { - for (i = 0, tmp = 0.0; i != 4; ++i) - if (i != j && i != k) tmp += b->esum[i]; - p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp; - } - } - } - -goto_glf: - // convert necessary information to glf1_t - g->ref_base = ref_base; g->max_mapQ = rms; - g->depth = n > 16777215? 16777215 : n; - for (j = 0; j != 4; ++j) - for (k = j; k < 4; ++k) - if (p[j<<2|k] < min_p) min_p = p[j<<2|k]; - g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5); - for (j = c = 0; j != 4; ++j) - for (k = j; k < 4; ++k) - g->lk[c++] = p[j<<2|k]-min_p > 255.0? 
255 : (int)(p[j<<2|k]-min_p + 0.5); - - free(b); - return g; -} - -uint32_t glf2cns(const glf1_t *g, int q_r) -{ - int i, j, k, p[10], ref4; - uint32_t x = 0; - ref4 = bam_nt16_nt4_table[g->ref_base]; - for (i = k = 0; i < 4; ++i) - for (j = i; j < 4; ++j) { - int prior = (i == ref4 && j == ref4? 0 : i == ref4 || j == ref4? q_r : q_r + 3); - p[k] = (g->lk[k] + prior)<<4 | i<<2 | j; - ++k; - } - for (i = 1; i < 10; ++i) // insertion sort - for (j = i; j > 0 && p[j] < p[j-1]; --j) - k = p[j], p[j] = p[j-1], p[j-1] = k; - x = (1u<<(p[0]&3) | 1u<<(p[0]>>2&3)) << 28; // the best genotype - x |= (uint32_t)g->max_mapQ << 16; // rms mapQ - x |= ((p[1]>>4) - (p[0]>>4) < 256? (p[1]>>4) - (p[0]>>4) : 255) << 8; // consensus Q - for (k = 0; k < 10; ++k) - if ((p[k]&0xf) == (ref4<<2|ref4)) break; - if (k == 10) k = 9; - x |= (p[k]>>4) - (p[0]>>4) < 256? (p[k]>>4) - (p[0]>>4) : 255; // snp Q - return x; -} - -uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) -{ - glf1_t *g; - uint32_t x; - if (n) { - g = bam_maqcns_glfgen(n, pl, 0xf, bm); - x = g->depth == 0? 
(0xfU<<28 | 0xfU<<24) : glf2cns(g, (int)(bm->q_r + 0.5)); - free(g); - } else x = 0xfU<<28 | 0xfU<<24; - return x; -} - -/************** *****************/ - -bam_maqindel_opt_t *bam_maqindel_opt_init() -{ - bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); - mi->q_indel = 40; - mi->r_indel = 0.00015; - mi->r_snp = 0.001; - // - mi->mm_penalty = 3; - mi->indel_err = 4; - mi->ambi_thres = 10; - return mi; -} - -void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) -{ - if (mir == 0) return; - free(mir->s[0]); free(mir->s[1]); free(mir); -} - -int bam_tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) -{ - int k, x = c->pos, y = 0, last_y = 0; - *_tpos = c->pos; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - int l = cigar[k] >> BAM_CIGAR_SHIFT; - if (op == BAM_CMATCH) { - if (c->pos > tpos) return y; - if (x + l > tpos) { - *_tpos = tpos; - return y + (tpos - x); - } - x += l; y += l; - last_y = y; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { - if (x + l > tpos) { - *_tpos = is_left? 
x : x + l; - return y; - } - x += l; - } - } - *_tpos = x; - return last_y; -} - -#define MINUS_CONST 0x10000000 - -bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, - int _n_types, int *_types) -{ - int i, j, n_types, *types, left, right, max_rd_len = 0; - bam_maqindel_ret_t *ret = 0; - // if there is no proposed indel, check if there is an indel from the alignment - if (_n_types == 0) { - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; - } - if (i == n) return 0; // no indel - } - { // calculate how many types of indels are available (set n_types and types) - int m; - uint32_t *aux; - aux = (uint32_t*)calloc(n + _n_types + 1, 4); - m = 0; - aux[m++] = MINUS_CONST; // zero indel is always a type - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) - aux[m++] = MINUS_CONST + p->indel; - j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); - if (j > max_rd_len) max_rd_len = j; - } - if (_n_types) // then also add this to aux[] - for (i = 0; i < _n_types; ++i) - if (_types[i]) aux[m++] = MINUS_CONST + _types[i]; - ks_introsort(uint32_t, m, aux); - // squeeze out identical types - for (i = 1, n_types = 1; i < m; ++i) - if (aux[i] != aux[i-1]) ++n_types; - types = (int*)calloc(n_types, sizeof(int)); - j = 0; - types[j++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) { - if (aux[i] != aux[i-1]) - types[j++] = aux[i] - MINUS_CONST; - } - free(aux); - } - { // calculate left and right boundary - left = pos > INDEL_WINDOW_SIZE? 
pos - INDEL_WINDOW_SIZE : 0; - right = pos + INDEL_WINDOW_SIZE; - if (types[0] < 0) right -= types[0]; - // in case the alignments stand out the reference - for (i = pos; i < right; ++i) - if (ref[i] == 0) break; - right = i; - } - { // the core part - char *ref2, *rs, *inscns = 0; - int qr_snp, k, l, *score, *pscore, max_ins = types[n_types-1]; - qr_snp = (int)(-4.343 * log(mi->r_snp) + .499); - if (max_ins > 0) { // get the consensus of inserted sequences - int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); - // count occurrences - for (i = 0; i < n_types; ++i) { - if (types[i] <= 0) continue; // not insertion - for (j = 0; j < n; ++j) { - const bam_pileup1_t *p = pl + j; - if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) { - for (k = 1; k <= p->indel; ++k) { - int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)]; - if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c]; - } - } - } - } - // construct the consensus of inserted sequence - inscns = (char*)calloc(n_types * max_ins, sizeof(char)); - for (i = 0; i < n_types; ++i) { - for (j = 0; j < types[i]; ++j) { - int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4; - for (k = 0; k < 4; ++k) { - if (ia[k] > max) { - max = ia[k]; - max_k = k; - } - } - inscns[i*max_ins + j] = max? 1<= 0? 
-types[0] : -types[0] + types[i]; - for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j) - ref2[k++] = 4; - } - for (; j < right && ref[j]; ++j) - ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; - if (j < right) right = j; - // calculate score for each read - for (j = 0; j < n; ++j) { - const bam_pileup1_t *p = pl + j; - int qbeg, qend, tbeg, tend; - if (p->b->core.flag & BAM_FUNMAP) continue; - qbeg = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); - qend = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend); - assert(tbeg >= left); - for (l = qbeg; l < qend; ++l) - rs[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), l)]; - { - int x, y, n_acigar, ps; - uint32_t *acigar; - ps = 0; - if (tend - tbeg + types[i] <= 0) { - score[i*n+j] = -(1<<20); - pscore[i*n+j] = 1<<20; - continue; - } - acigar = ka_global_core((uint8_t*)ref2 + tbeg - left, tend - tbeg + types[i], (uint8_t*)rs, qend - qbeg, &ap, &score[i*n+j], &n_acigar); - x = tbeg - left; y = 0; - for (l = 0; l < n_acigar; ++l) { - int op = acigar[l]&0xf; - int len = acigar[l]>>4; - if (op == BAM_CMATCH) { - int k; - for (k = 0; k < len; ++k) - if (ref2[x+k] != rs[y+k] && ref2[x+k] < 4) - ps += bam1_qual(p->b)[y+k] < qr_snp? 
bam1_qual(p->b)[y+k] : qr_snp; - x += len; y += len; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { - if (op == BAM_CINS && l > 0 && l < n_acigar - 1) ps += mi->q_indel * len; - y += len; - } else if (op == BAM_CDEL) { - if (l > 0 && l < n_acigar - 1) ps += mi->q_indel * len; - x += len; - } - } - pscore[i*n+j] = ps; - /*if (1) { // for debugging only - fprintf(stderr, "id=%d, pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, %d, ", - j, pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend, mi->q_indel); - for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); - fprintf(stderr, "\n"); - for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l+tbeg-left]], stderr); - fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); - fputc('\n', stderr); - }*/ - free(acigar); - } - } - } - { // get final result - int *sum, max1, max2, max1_i, max2_i; - // pick up the best two score - sum = (int*)calloc(n_types, sizeof(int)); - for (i = 0; i < n_types; ++i) - for (j = 0; j < n; ++j) - sum[i] += -pscore[i*n+j]; - max1 = max2 = -0x7fffffff; max1_i = max2_i = -1; - for (i = 0; i < n_types; ++i) { - if (sum[i] > max1) { - max2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i; - } else if (sum[i] > max2) { - max2 = sum[i]; max2_i = i; - } - } - free(sum); - // write ret - ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t)); - ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; - ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1); - ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1); - // write indel sequence - if (ret->indel1 > 0) { - ret->s[0][0] = '+'; - for (k = 0; k < ret->indel1; ++k) - ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; - } else if (ret->indel1 < 0) { - ret->s[0][0] = '-'; - for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) - ret->s[0][k+1] = ref[pos + k + 1]; - } else ret->s[0][0] = '*'; - if 
(ret->indel2 > 0) { - ret->s[1][0] = '+'; - for (k = 0; k < ret->indel2; ++k) - ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; - } else if (ret->indel2 < 0) { - ret->s[1][0] = '-'; - for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k) - ret->s[1][k+1] = ref[pos + k + 1]; - } else ret->s[1][0] = '*'; - // write count - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pl + i; - if (p->indel == ret->indel1) ++ret->cnt1; - else if (p->indel == ret->indel2) ++ret->cnt2; - else ++ret->cnt_anti; - } - { // write gl[] - int tmp, seq_err = 0; - double x = 1.0; - tmp = max1_i - max2_i; - if (tmp < 0) tmp = -tmp; - for (j = 0; j < tmp + 1; ++j) x *= INDEL_EXT_DEP; - seq_err = mi->q_indel * (1.0 - x) / (1.0 - INDEL_EXT_DEP); - ret->gl[0] = ret->gl[1] = 0; - for (j = 0; j < n; ++j) { - int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; - //fprintf(stderr, "id=%d, %d, %d, %d, %d, %d\n", j, pl[j].b->core.pos+1, types[max1_i], types[max2_i], s1, s2); - if (s1 > s2) ret->gl[0] += s1 - s2 < seq_err? s1 - s2 : seq_err; - else ret->gl[1] += s2 - s1 < seq_err? s2 - s1 : seq_err; - } - } - // write cnt_ref and cnt_ambi - if (max1_i != 0 && max2_i != 0) { - for (j = 0; j < n; ++j) { - int diff1 = score[j] - score[max1_i * n + j]; - int diff2 = score[j] - score[max2_i * n + j]; - if (diff1 > 0 && diff2 > 0) ++ret->cnt_ref; - else if (diff1 == 0 || diff2 == 0) ++ret->cnt_ambi; - } - } - } - free(score); free(pscore); free(ref2); free(rs); free(inscns); - } - { // call genotype - int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5); - int min1, min2, min1_i; - q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; - q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel; - q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 
1 : 1) * qr_indel; - min1 = min2 = 0x7fffffff; min1_i = -1; - for (i = 0; i < 3; ++i) { - if (q[i] < min1) { - min2 = min1; min1 = q[i]; min1_i = i; - } else if (q[i] < min2) min2 = q[i]; - } - ret->gt = min1_i; - ret->q_cns = min2 - min1; - // set q_ref - if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3; - else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2]; - if (ret->q_ref < 0) ret->q_ref = 0; - } - free(types); - return ret; -} diff --git a/sam/bam_maqcns.h b/sam/bam_maqcns.h deleted file mode 100644 index 291ae53..0000000 --- a/sam/bam_maqcns.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef BAM_MAQCNS_H -#define BAM_MAQCNS_H - -#include "glf.h" - -#define BAM_ERRMOD_MAQ2 0 -#define BAM_ERRMOD_MAQ 1 -#define BAM_ERRMOD_SOAP 2 - -struct __bmc_aux_t; - -typedef struct { - float het_rate, theta; - int n_hap, cap_mapQ, errmod, min_baseQ; - - float eta, q_r; - double *fk, *coef; - double *lhet; - struct __bmc_aux_t *aux; -} bam_maqcns_t; - -typedef struct { - int q_indel; // indel sequencing error, phred scaled - float r_indel; // indel prior - float r_snp; // snp prior - // hidden parameters, unchangeable from command line - int mm_penalty, indel_err, ambi_thres; -} bam_maqindel_opt_t; - -typedef struct { - int indel1, indel2; - int cnt1, cnt2, cnt_anti; - int cnt_ref, cnt_ambi; - char *s[2]; - // - int gt, gl[2]; - int q_cns, q_ref; -} bam_maqindel_ret_t; - -#ifdef __cplusplus -extern "C" { -#endif - - bam_maqcns_t *bam_maqcns_init(); - void bam_maqcns_prepare(bam_maqcns_t *bm); - void bam_maqcns_destroy(bam_maqcns_t *bm); - glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); - uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); - // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2 - uint32_t glf2cns(const glf1_t *g, int q_r); - - bam_maqindel_opt_t *bam_maqindel_opt_init(); - bam_maqindel_ret_t *bam_maqindel(int n, int pos, 
const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, - int _n_types, int *_types); - void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/sam/bam_md.c b/sam/bam_md.c index 44d46a4..d42aa8f 100644 --- a/sam/bam_md.c +++ b/sam/bam_md.c @@ -9,38 +9,46 @@ #include "kaln.h" #include "kprobaln.h" -void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) +#define USE_EQUAL 1 +#define DROP_TAG 2 +#define BIN_QUAL 4 +#define UPDATE_NM 8 +#define UPDATE_MD 16 +#define HASH_QNM 32 + +char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +int bam_aux_drop_other(bam1_t *b, uint8_t *s); + +void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; int i, x, y, u = 0; kstring_t *str; - uint8_t *old_md, *old_nm; int32_t old_nm_i = -1, nm = 0; str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; if (ref[x+j] == 0) break; // out of boundary if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match - if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f; + if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; ++u; } else { - ksprintf(str, "%d", u); - kputc(ref[x+j], str); + kputw(u, str); kputc(ref[x+j], str); u = 0; ++nm; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL) { - ksprintf(str, "%d", u); - kputc('^', str); + kputw(u, str); kputc('^', str); for (j = 0; j < l; ++j) { if (ref[x+j] == 0) break; kputc(ref[x+j], str); @@ -55,12 +63,12 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) x += l; } } - ksprintf(str, "%d", u); + kputw(u, str); // apply max_nm if (max_nm > 0 && nm >= max_nm) { for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; @@ -77,38 +85,54 @@ void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) } } // update NM - old_nm = bam_aux_get(b, "NM"); - if (c->flag & BAM_FUNMAP) return; - if (old_nm) old_nm_i = bam_aux2i(old_nm); - if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); - else if (nm != old_nm_i) { - fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); - bam_aux_del(b, old_nm); - bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + if (flag & UPDATE_NM) { + uint8_t *old_nm = bam_aux_get(b, "NM"); + if (c->flag & BAM_FUNMAP) return; + if (old_nm) old_nm_i = bam_aux2i(old_nm); + if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + else if (nm != old_nm_i) { + fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); + bam_aux_del(b, old_nm); + bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); + } } // update MD - old_md = bam_aux_get(b, "MD"); - if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); - else { - int is_diff = 0; - if (strlen((char*)old_md+1) == str->l) { - for (i = 0; i < str->l; ++i) - if 
(toupper(old_md[i+1]) != toupper(str->s[i])) - break; - if (i < str->l) is_diff = 1; - } else is_diff = 1; - if (is_diff) { - fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); - bam_aux_del(b, old_md); - bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + if (flag & UPDATE_MD) { + uint8_t *old_md = bam_aux_get(b, "MD"); + if (c->flag & BAM_FUNMAP) return; + if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + else { + int is_diff = 0; + if (strlen((char*)old_md+1) == str->l) { + for (i = 0; i < str->l; ++i) + if (toupper(old_md[i+1]) != toupper(str->s[i])) + break; + if (i < str->l) is_diff = 1; + } else is_diff = 1; + if (is_diff) { + fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); + bam_aux_del(b, old_md); + bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + } } } + // drop all tags but RG + if (flag&DROP_TAG) { + uint8_t *q = bam_aux_get(b, "RG"); + bam_aux_drop_other(b, q); + } + // reduce the resolution of base quality + if (flag&BIN_QUAL) { + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; + } free(str->s); free(str); } -void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +void bam_fillmd1(bam1_t *b, char *ref, int flag) { - bam_fillmd1_core(b, ref, is_equal, 0); + bam_fillmd1_core(b, ref, flag, 0); } int bam_cap_mapQ(bam1_t *b, char *ref, int thres) @@ -122,7 +146,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres) mm = q = len = clip_l = clip_q = 0; for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; @@ -162,14 +186,14 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres) return (int)(t + 
.499); } -int bam_prob_realn_core(bam1_t *b, const char *ref, int apply_baq) +int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) { - int k, i, bw, x, y, yb, ye, xb, xe; + int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1; uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; kpa_par_t conf = kpa_par_def; uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b); - if (c->flag & BAM_FUNMAP) return -1; // do nothing + if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing // test if BQ or ZQ is present if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; @@ -195,7 +219,7 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int apply_baq) for (k = 0; k < c->n_cigar; ++k) { int op, l; op = cigar[k]&0xf; l = cigar[k]>>4; - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { if (yb < 0) yb = y; if (xb < 0) xb = x; ye = y + l; xe = x + l; @@ -221,23 +245,47 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int apply_baq) s = calloc(c->l_qseq, 1); for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)]; r = calloc(xe - xb, 1); - for (i = xb; i < xe; ++i) + for (i = xb; i < xe; ++i) { + if (ref[i] == 0) { xe = i; break; } r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]]; + } state = calloc(c->l_qseq, sizeof(int)); q = calloc(c->l_qseq, 1); kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); - for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { - int op = cigar[k]&0xf, l = cigar[k]>>4; - if (op == BAM_CMATCH) { - for (i = y; i < y + l; ++i) { - if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; - else bq[i] = bq[i] < q[i]? 
bq[i] : q[i]; - } - x += l; y += l; - } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; - else if (op == BAM_CDEL) x += l; + if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { + int op = cigar[k]&0xf, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (i = y; i < y + l; ++i) { + if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; + else bq[i] = bq[i] < q[i]? bq[i] : q[i]; + } + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + } + for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ + } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!) + uint8_t *left, *rght; + left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { + int op = cigar[k]&0xf, l = cigar[k]>>4; + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (i = y; i < y + l; ++i) + bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; + for (left[y] = bq[y], i = y + 1; i < y + l; ++i) + left[i] = bq[i] > left[i-1]? bq[i] : left[i-1]; + for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i) + rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1]; + for (i = y; i < y + l; ++i) + bq[i] = left[i] < rght[i]? left[i] : rght[i]; + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + } + for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 
0 : qual[i] - bq[i]); // finalize BQ + free(left); free(rght); } - for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ if (apply_baq) { for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq); @@ -254,25 +302,31 @@ int bam_prob_realn(bam1_t *b, const char *ref) int bam_fillmd(int argc, char *argv[]) { - int c, is_equal, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, apply_baq; + int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; samfile_t *fp, *fpout = 0; faidx_t *fai; char *ref = 0, mode_w[8], mode_r[8]; bam1_t *b; - is_equal = is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = apply_baq = capQ = 0; + flt_flag = UPDATE_NM | UPDATE_MD; + is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; mode_w[0] = mode_r[0] = 0; strcpy(mode_r, "r"); strcpy(mode_w, "w"); - while ((c = getopt(argc, argv, "reubSC:n:A")) >= 0) { + while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) { switch (c) { case 'r': is_realn = 1; break; - case 'e': is_equal = 1; break; + case 'e': flt_flag |= USE_EQUAL; break; + case 'd': flt_flag |= DROP_TAG; break; + case 'q': flt_flag |= BIN_QUAL; break; + case 'h': flt_flag |= HASH_QNM; break; + case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': is_sam_in = 1; break; case 'n': max_nm = atoi(optarg); break; case 'C': capQ = atoi(optarg); break; - case 'A': apply_baq = 1; break; + case 'A': baq_flag |= 1; break; + case 'E': baq_flag |= 2; break; default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; } } @@ -288,7 +342,8 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, " -b compressed BAM output\n"); fprintf(stderr, " -S the input is SAM with header\n"); fprintf(stderr, " -A modify the quality 
string\n"); - fprintf(stderr, " -r read-independent local realignment\n\n"); + fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"); + fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n"); return 1; } fp = samopen(argv[optind], mode_r, 0); @@ -311,12 +366,12 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", fp->header->target_name[tid]); } - if (is_realn) bam_prob_realn_core(b, ref, apply_baq); + if (is_realn) bam_prob_realn_core(b, ref, baq_flag); if (capQ > 10) { int q = bam_cap_mapQ(b, ref, capQ); if (b->core.qual > q) b->core.qual = q; } - if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); + if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm); } samwrite(fpout, b); } diff --git a/sam/bam_pileup.c b/sam/bam_pileup.c index 3e26f74..57434e0 100644 --- a/sam/bam_pileup.c +++ b/sam/bam_pileup.c @@ -78,12 +78,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) if (s->k == -1) { // never processed is_head = 1; if (c->n_cigar == 1) { // just one operation, save a loop - if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0; + if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; } else { // find the first match or deletion for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { int op = _cop(cigar[k]); int l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == BAM_CDEL) break; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CREF_SKIP) s->x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } @@ -95,16 +95,16 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) if (pos - s->x >= l) { // jump to the next operation assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case op = 
_cop(cigar[s->k+1]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) { // jump to the next without a loop - if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; s->x += l; ++s->k; - } else { // find the next M/D/N - if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; + } else { // find the next M/D/N/=/X + if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; s->x += l; for (k = s->k + 1; k < c->n_cigar; ++k) { op = _cop(cigar[k]), l = _cln(cigar[k]); - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) break; + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } s->k = k; @@ -126,12 +126,12 @@ static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) for (k = s->k + 2; k < c->n_cigar; ++k) { op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); if (op2 == BAM_CINS) l3 += l2; - else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP) break; + else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; } if (l3 > 0) p->indel = l3; } } - if (op == BAM_CMATCH) { + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { p->qpos = s->y + (pos - s->x); } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! 
diff --git a/sam/bam_plcmd.c b/sam/bam_plcmd.c index 002297a..cbf6ae8 100644 --- a/sam/bam_plcmd.c +++ b/sam/bam_plcmd.c @@ -6,93 +6,8 @@ #include #include "sam.h" #include "faidx.h" -#include "bam_maqcns.h" -#include "khash.h" -#include "glf.h" #include "kstring.h" -typedef int *indel_list_t; -KHASH_MAP_INIT_INT64(64, indel_list_t) - -#define BAM_PLF_SIMPLE 0x01 -#define BAM_PLF_CNS 0x02 -#define BAM_PLF_INDEL_ONLY 0x04 -#define BAM_PLF_GLF 0x08 -#define BAM_PLF_VAR_ONLY 0x10 -#define BAM_PLF_2ND 0x20 -#define BAM_PLF_RANBASE 0x40 -#define BAM_PLF_1STBASE 0x80 -#define BAM_PLF_ALLBASE 0x100 -#define BAM_PLF_READPOS 0x200 -#define BAM_PLF_NOBAQ 0x400 - -typedef struct { - bam_header_t *h; - bam_maqcns_t *c; - bam_maqindel_opt_t *ido; - faidx_t *fai; - khash_t(64) *hash; - uint32_t format; - int tid, len, last_pos; - int mask; - int capQ_thres, min_baseQ; - int max_depth; // for indel calling, ignore reads with the depth too high. 0 for unlimited - char *ref; - glfFile fp_glf; // for glf output only -} pu_data_t; - -char **__bam_get_lines(const char *fn, int *_n); -void bam_init_header_hash(bam_header_t *header); -int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); - -static khash_t(64) *load_pos(const char *fn, bam_header_t *h) -{ - char **list; - int i, j, n, *fields, max_fields; - khash_t(64) *hash; - bam_init_header_hash(h); - list = __bam_get_lines(fn, &n); - hash = kh_init(64); - max_fields = 0; fields = 0; - for (i = 0; i < n; ++i) { - char *str = list[i]; - int chr, n_fields, ret; - khint_t k; - uint64_t x; - n_fields = ksplit_core(str, 0, &max_fields, &fields); - if (n_fields < 2) continue; - chr = bam_get_tid(h, str + fields[0]); - if (chr < 0) { - fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]); - continue; - } - x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1); - k = kh_put(64, hash, x, &ret); - if (ret == 0) { - fprintf(stderr, "[load_pos] position %s:%s has been loaded.\n", str+fields[0], 
str+fields[1]); - continue; - } - kh_val(hash, k) = 0; - if (n_fields > 2) { - // count - for (j = 2; j < n_fields; ++j) { - char *s = str + fields[j]; - if ((*s != '+' && *s != '-') || !isdigit(s[1])) break; - } - if (j > 2) { // update kh_val() - int *q, y, z; - q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int)); - q[0] = j - 2; z = j; y = 1; - for (j = 2; j < z; ++j) - q[y++] = atoi(str + fields[j]); - } - } - free(str); - } - free(list); free(fields); - return hash; -} - static inline int printw(int c, FILE *fp) { char buf[16]; @@ -108,75 +23,6 @@ static inline int printw(int c, FILE *fp) return 0; } -// an analogy to pileup_func() below -static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) -{ - pu_data_t *d = (pu_data_t*)data; - bam_maqindel_ret_t *r = 0; - int rb, *proposed_indels = 0; - glf1_t *g; - glf3_t *g3; - - if (d->fai == 0) { - fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n"); - exit(1); - } - if (d->hash) { // only output a list of sites - khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); - if (k == kh_end(d->hash)) return 0; - proposed_indels = kh_val(d->hash, k); - } - g3 = glf3_init1(); - if (d->fai && (int)tid != d->tid) { - if (d->ref) { // then write the end mark - g3->rtype = GLF3_RTYPE_END; - glf3_write1(d->fp_glf, g3); - } - glf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference - free(d->ref); - d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); - d->tid = tid; - d->last_pos = 0; - } - rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; - g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); - memcpy(g3, g, sizeof(glf1_t)); - g3->rtype = GLF3_RTYPE_SUB; - g3->offset = pos - d->last_pos; - d->last_pos = pos; - glf3_write1(d->fp_glf, g3); - if (pos < d->len) { - int m = (!d->max_depth || d->max_depth>n) ? 
n : d->max_depth; - if (proposed_indels) - r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); - else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0); - } - if (r) { // then write indel line - int het = 3 * n, min; - min = het; - if (min > r->gl[0]) min = r->gl[0]; - if (min > r->gl[1]) min = r->gl[1]; - g3->ref_base = 0; - g3->rtype = GLF3_RTYPE_INDEL; - memset(g3->lk, 0, 10); - g3->lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255; - g3->lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255; - g3->lk[2] = het - min < 255? het - min : 255; - g3->offset = 0; - g3->indel_len[0] = r->indel1; - g3->indel_len[1] = r->indel2; - g3->min_lk = min < 255? min : 255; - g3->max_len = (abs(r->indel1) > abs(r->indel2)? abs(r->indel1) : abs(r->indel2)) + 1; - g3->indel_seq[0] = strdup(r->s[0]+1); - g3->indel_seq[1] = strdup(r->s[1]+1); - glf3_write1(d->fp_glf, g3); - bam_maqindel_ret_destroy(r); - } - free(g); - glf3_destroy1(g3); - return 0; -} - static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) { int j; @@ -212,316 +58,6 @@ static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, cons if (p->is_tail) putchar('$'); } -static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) -{ - pu_data_t *d = (pu_data_t*)data; - bam_maqindel_ret_t *r = 0; - int i, rb, rms_mapq = -1, *proposed_indels = 0; - uint64_t rms_aux; - uint32_t cns = 0; - - // if GLF is required, suppress -c completely - if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data); - // if d->hash is initialized, only output the sites in the hash table - if (d->hash) { - khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); - if (k == kh_end(d->hash)) return 0; - proposed_indels = kh_val(d->hash, k); - } - // update d->ref if necessary - if (d->fai && (int)tid != d->tid) { - free(d->ref); - d->ref = faidx_fetch_seq(d->fai, d->h->target_name[tid], 0, 0x7fffffff, &d->len); - 
d->tid = tid; - } - rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; - // when the indel-only mode is asked for, return if no reads mapped with indels - if (d->format & BAM_PLF_INDEL_ONLY) { - for (i = 0; i < n; ++i) - if (pu[i].indel != 0) break; - if (i == n) return 0; - } - // call the consensus and indel - if (d->format & BAM_PLF_CNS) { // call consensus - if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE)) { // use a random base or the 1st base as the consensus call - const bam_pileup1_t *p = (d->format & BAM_PLF_1STBASE)? pu : pu + (int)(drand48() * n); - int q = bam1_qual(p->b)[p->qpos]; - int mapQ = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; - uint32_t b = bam1_seqi(bam1_seq(p->b), p->qpos); - cns = b<<28 | 0xf<<24 | mapQ<<16 | q<<8; - } else if (d->format & BAM_PLF_ALLBASE) { // collapse all bases - uint64_t rmsQ = 0; - uint32_t b = 0; - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pu + i; - int q = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; - b |= bam1_seqi(bam1_seq(p->b), p->qpos); - rmsQ += q * q; - } - rmsQ = (uint64_t)(sqrt((double)rmsQ / n) + .499); - cns = b<<28 | 0xf<<24 | rmsQ<<16 | 60<<8; - } else { - glf1_t *g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); - cns = g->depth == 0? (0xfu<<28 | 0xf<<24) : glf2cns(g, (int)(d->c->q_r + .499)); - free(g); - } - } - if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels - int m = (!d->max_depth || d->max_depth>n) ? 
n : d->max_depth; - if (proposed_indels) // the first element gives the size of the array - r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); - else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0); - } - // when only variant sites are asked for, test if the site is a variant - if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) { - if (!(bam_nt16_table[rb] != 15 && cns>>28 != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP - if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel - if (r) bam_maqindel_ret_destroy(r); - return 0; - } - } - } - // print the first 3 columns - fputs(d->h->target_name[tid], stdout); putchar('\t'); - printw(pos+1, stdout); putchar('\t'); putchar(rb); putchar('\t'); - // print consensus information if required - if (d->format & BAM_PLF_CNS) { - putchar(bam_nt16_rev_table[cns>>28]); putchar('\t'); - printw(cns>>8&0xff, stdout); putchar('\t'); - printw(cns&0xff, stdout); putchar('\t'); - printw(cns>>16&0xff, stdout); putchar('\t'); - } - // print pileup sequences - printw(n, stdout); putchar('\t'); - for (i = 0; i < n; ++i) - pileup_seq(pu + i, pos, d->len, d->ref); - // finalize rms_mapq - if (d->format & BAM_PLF_CNS) { - for (i = rms_aux = 0; i < n; ++i) { - const bam_pileup1_t *p = pu + i; - int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; - rms_aux += tmp * tmp; - } - rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499); - if (rms_mapq < 0) rms_mapq = rms_aux; - } - putchar('\t'); - // print quality - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pu + i; - int c = bam1_qual(p->b)[p->qpos] + 33; - if (c > 126) c = 126; - putchar(c); - } - if (d->format & BAM_PLF_2ND) { // print 2nd calls and qualities - const unsigned char *q; - putchar('\t'); - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pu + i; - q = bam_aux_get(p->b, "E2"); - putchar(q? 
q[p->qpos + 1] : 'N'); - } - putchar('\t'); - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pu + i; - q = bam_aux_get(p->b, "U2"); - putchar(q? q[p->qpos + 1] : '!'); - } - } - // print mapping quality if -s is flagged on the command line - if (d->format & BAM_PLF_SIMPLE) { - putchar('\t'); - for (i = 0; i < n; ++i) { - int c = pu[i].b->core.qual + 33; - if (c > 126) c = 126; - putchar(c); - } - } - // print read position - if (d->format & BAM_PLF_READPOS) { - putchar('\t'); - for (i = 0; i < n; ++i) { - int x = pu[i].qpos; - int l = pu[i].b->core.l_qseq; - printw(x < l/2? x+1 : -((l-1)-x+1), stdout); putchar(','); - } - } - putchar('\n'); - // print the indel line if r has been calculated. This only happens if: - // a) -c or -i are flagged, AND b) the reference sequence is available - if (r) { - printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1); - if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]); - else printf("%s/%s\t", r->s[0], r->s[1]); - printf("%d\t%d\t", r->q_cns, r->q_ref); - printf("%d\t%d\t", rms_mapq, n); - printf("%s\t%s\t", r->s[0], r->s[1]); - //printf("%d\t%d\t", r->gl[0], r->gl[1]); - printf("%d\t%d\t%d\t", r->cnt1, r->cnt2, r->cnt_anti); - printf("%d\t%d\n", r->cnt_ref, r->cnt_ambi); - bam_maqindel_ret_destroy(r); - } - return 0; -} - -int bam_pileup(int argc, char *argv[]) -{ - int c, is_SAM = 0; - char *fn_list = 0, *fn_fa = 0, *fn_pos = 0; - pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t)); - d->max_depth = 1024; d->tid = -1; d->mask = BAM_DEF_MASK; d->min_baseQ = 13; - d->c = bam_maqcns_init(); - d->c->errmod = BAM_ERRMOD_MAQ2; // change the default model - d->ido = bam_maqindel_opt_init(); - while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PAQ:C:B")) >= 0) { - switch (c) { - case 'Q': d->c->min_baseQ = atoi(optarg); break; - case 'C': d->capQ_thres = atoi(optarg); break; - case 'B': d->format |= BAM_PLF_NOBAQ; break; - case 'a': d->c->errmod = BAM_ERRMOD_SOAP; break; - case 'A': d->c->errmod = 
BAM_ERRMOD_MAQ; break; - case 's': d->format |= BAM_PLF_SIMPLE; break; - case 't': fn_list = strdup(optarg); break; - case 'l': fn_pos = strdup(optarg); break; - case 'f': fn_fa = strdup(optarg); break; - case 'T': d->c->theta = atof(optarg); break; - case 'N': d->c->n_hap = atoi(optarg); break; - case 'r': d->c->het_rate = atof(optarg); d->ido->r_snp = d->c->het_rate; break; - case 'M': d->c->cap_mapQ = atoi(optarg); break; - case 'd': d->max_depth = atoi(optarg); break; - case 'c': d->format |= BAM_PLF_CNS; break; - case 'i': d->format |= BAM_PLF_INDEL_ONLY; break; - case 'v': d->format |= BAM_PLF_VAR_ONLY; break; - case 'm': d->mask = strtol(optarg, 0, 0); break; - case 'g': d->format |= BAM_PLF_GLF; break; - case '2': d->format |= BAM_PLF_2ND; break; - case 'P': d->format |= BAM_PLF_READPOS; break; - case 'I': d->ido->q_indel = atoi(optarg); break; - case 'G': d->ido->r_indel = atof(optarg); break; - case 'S': is_SAM = 1; break; - case 'R': - if (strcmp(optarg, "random") == 0) d->format |= BAM_PLF_RANBASE; - else if (strcmp(optarg, "first") == 0) d->format |= BAM_PLF_1STBASE; - else if (strcmp(optarg, "all") == 0) d->format |= BAM_PLF_ALLBASE; - else fprintf(stderr, "[bam_pileup] unrecognized -R\n"); - break; - default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; - } - } - if (d->c->errmod != BAM_ERRMOD_MAQ2) d->c->theta += 0.02; - if (d->c->theta > 1.0) d->c->theta = 1.0; - if (fn_list) is_SAM = 1; - if (optind == argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools pileup [options] |\n\n"); - fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n"); - fprintf(stderr, " -S the input is in SAM\n"); - fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -A use the original MAQ model for SNP calling (DEPRECATED)\n"); - fprintf(stderr, " -2 output the 2nd best call and quality\n"); - fprintf(stderr, " -i only show lines/consensus with indels\n"); - fprintf(stderr, " -Q INT min base quality (possibly 
capped by BAQ) [%d]\n", d->c->min_baseQ); - fprintf(stderr, " -C INT coefficient for adjusting mapQ of poor mappings [%d]\n", d->capQ_thres); - fprintf(stderr, " -m INT filtering reads with bits in INT [0x%x]\n", d->mask); - fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ); - fprintf(stderr, " -d INT limit maximum depth for indels [%d]\n", d->max_depth); - fprintf(stderr, " -t FILE list of reference sequences (force -S)\n"); - fprintf(stderr, " -l FILE list of sites at which pileup is output\n"); - fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n"); - fprintf(stderr, " -c compute the consensus sequence\n"); - fprintf(stderr, " -v print variants only (for -c)\n"); - fprintf(stderr, " -g output in the GLFv3 format (DEPRECATED)\n"); - fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c) [%.4g]\n", d->c->theta); - fprintf(stderr, " -N INT number of haplotypes in the sample (for -c) [%d]\n", d->c->n_hap); - fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c) [%.4g]\n", d->c->het_rate); - fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c) [%.4g]\n", d->ido->r_indel); - fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. 
(for -c) [%d]\n", d->ido->q_indel); - fprintf(stderr, "\n"); - free(fn_list); free(fn_fa); free(d); - return 1; - } - if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE|BAM_PLF_ALLBASE)) d->format |= BAM_PLF_CNS; - if (fn_fa) d->fai = fai_load(fn_fa); - if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling - if (d->format & BAM_PLF_GLF) { // for glf output - glf3_header_t *h; - h = glf3_header_init(); - d->fp_glf = bgzf_fdopen(fileno(stdout), "w"); - glf3_header_write(d->fp_glf, h); - glf3_header_destroy(h); - } - if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY))) - fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n"); - if (fn_fa && is_SAM && fn_list == 0) fn_list = samfaipath(fn_fa); - - { - samfile_t *fp; - fp = is_SAM? samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0); - if (fp == 0 || fp->header == 0) { - fprintf(stderr, "[bam_pileup] fail to read the header: non-exisiting file or wrong format.\n"); - return 1; - } - d->h = fp->header; - if (fn_pos) d->hash = load_pos(fn_pos, d->h); - { // run pileup - extern int bam_prob_realn(bam1_t *b, const char *ref); - extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres); - bam1_t *b; - int ret, tid, pos, n_plp; - bam_plp_t iter; - const bam_pileup1_t *plp; - b = bam_init1(); - iter = bam_plp_init(0, 0); - bam_plp_set_mask(iter, d->mask); - while ((ret = samread(fp, b)) >= 0) { - int skip = 0; - if ((int)b->core.tid < 0) break; - // update d->ref if necessary - if (d->fai && (int)b->core.tid != d->tid) { - free(d->ref); - d->ref = faidx_fetch_seq(d->fai, d->h->target_name[b->core.tid], 0, 0x7fffffff, &d->len); - d->tid = b->core.tid; - } - if (d->ref && (d->format&BAM_PLF_CNS) && !(d->format&BAM_PLF_NOBAQ)) bam_prob_realn(b, d->ref); - if (d->ref && (d->format&BAM_PLF_CNS) && d->capQ_thres > 10) { - int q = bam_cap_mapQ(b, d->ref, d->capQ_thres); - if (q < 0) skip = 1; - else if (b->core.qual > q) b->core.qual = q; - } else 
if (b->core.flag&BAM_FUNMAP) skip = 1; - else if ((d->format&BAM_PLF_CNS) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; - if (skip) continue; - bam_plp_push(iter, b); - while ((plp = bam_plp_next(iter, &tid, &pos, &n_plp)) != 0) - pileup_func(tid, pos, n_plp, plp, d); - } - bam_plp_push(iter, 0); - while ((plp = bam_plp_next(iter, &tid, &pos, &n_plp)) != 0) - pileup_func(tid, pos, n_plp, plp, d); - bam_plp_destroy(iter); - bam_destroy1(b); - } - samclose(fp); // d->h will be destroyed here - } - - // free - if (d->format & BAM_PLF_GLF) bgzf_close(d->fp_glf); - if (fn_pos) { // free the hash table - khint_t k; - for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k) - if (kh_exist(d->hash, k)) free(kh_val(d->hash, k)); - kh_destroy(64, d->hash); - } - free(fn_pos); free(fn_list); free(fn_fa); - if (d->fai) fai_destroy(d->fai); - bam_maqcns_destroy(d->c); - free(d->ido); free(d->ref); free(d); - return 0; -} - -/*********** - * mpileup * - ***********/ - #include #include "bam2bcf.h" #include "sample.h" @@ -533,20 +69,32 @@ int bam_pileup(int argc, char *argv[]) #define MPLP_FMT_DP 0x100 #define MPLP_FMT_SP 0x200 #define MPLP_NO_INDEL 0x400 +#define MPLP_EXT_BAQ 0x800 +#define MPLP_ILLUMINA13 0x1000 +#define MPLP_IGNORE_RG 0x2000 +#define MPLP_PRINT_POS 0x4000 +#define MPLP_PRINT_MAPQ 0x8000 + +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { - int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth; - int openQ, extQ, tandemQ; - char *reg, *fn_pos, *pl_list; + int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth; + int openQ, extQ, tandemQ, min_support; // for indels + double min_frac; // for indels + char *reg, *pl_list; faidx_t *fai; - kh_64_t *hash; + void *bed, *rghash; } mplp_conf_t; typedef struct { bamFile fp; bam_iter_t iter; - int min_mq, flag, ref_id, capQ_thres; + bam_header_t *h; + int ref_id; char *ref; + const mplp_conf_t 
*conf; } mplp_aux_t; typedef struct { @@ -566,22 +114,41 @@ static int mplp_func(void *data, bam1_t *b) int has_ref; ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b); if (ret < 0) break; + if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads + skip = 1; + continue; + } + if (ma->conf->bed) { // test overlap + skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); + if (skip) continue; + } + if (ma->conf->rghash) { // exclude read groups + uint8_t *rg = bam_aux_get(b, "RG"); + skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0); + if (skip) continue; + } + if (ma->conf->flag & MPLP_ILLUMINA13) { + int i; + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) + qual[i] = qual[i] > 31? qual[i] - 31 : 0; + } has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0; skip = 0; - if (has_ref && (ma->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, 1); - if (has_ref && ma->capQ_thres > 10) { - int q = bam_cap_mapQ(b, ma->ref, ma->capQ_thres); + if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 
3 : 1); + if (has_ref && ma->conf->capQ_thres > 10) { + int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres); if (q < 0) skip = 1; else if (b->core.qual > q) b->core.qual = q; - } else if (b->core.flag&BAM_FUNMAP) skip = 1; - else if (b->core.qual < ma->min_mq) skip = 1; - else if ((ma->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; + } + else if (b->core.qual < ma->conf->min_mq) skip = 1; + else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; } while (skip); return ret; } static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, - int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp) + int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp, int ignore_rg) { int i, j; memset(m->n_plp, 0, m->n * sizeof(int)); @@ -590,10 +157,14 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, const bam_pileup1_t *p = plp[i] + j; uint8_t *q; int id = -1; - q = bam_aux_get(p->b, "RG"); + q = ignore_rg? 0 : bam_aux_get(p->b, "RG"); if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); - assert(id >= 0 && id < m->n); + if (id < 0 || id >= m->n) { + assert(q); // otherwise a bug + fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); + exit(1); + } if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? 
m->m_plp[id]<<1 : 8; m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); @@ -608,12 +179,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; - int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth; + int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; - khash_t(64) *hash = 0; void *rghash = 0; bcf_callaux_t *bca = 0; @@ -638,12 +208,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); - data[i]->min_mq = conf->min_mq; - data[i]->flag = conf->flag; - data[i]->capQ_thres = conf->capQ_thres; data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); + data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); - bam_smpl_add(sm, fn[i], h_tmp->text); + data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet + bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; @@ -657,7 +226,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); exit(1); } - if (i == 0) beg0 = beg, end0 = end; + if (i == 0) tid0 = tid, beg0 = beg, end0 = end; data[i]->iter = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } @@ -673,7 +242,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); - if (conf->fn_pos) hash = load_pos(conf->fn_pos, h); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; @@ -694,7 +262,8 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); - bh->l_txt = 0; + bh->txt = malloc(strlen(BAM_VERSION) + 64); + bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); free(s.s); bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); @@ -702,34 +271,38 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; - } - ref_tid = -1; ref = 0; + bca->min_frac = conf->min_frac; + bca->min_support = conf->min_support; + } + if (tid0 >= 0 && conf->fai) { // region is set + ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); + ref_tid = tid0; + for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; + } else ref_tid = -1, ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. 
Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; - fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth); + fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } + max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested - if (hash) { - khint_t k; - k = kh_get(64, hash, (uint64_t)tid<<32 | pos); - if (k == kh_end(hash)) continue; - } + if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; - if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len); + if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_GLF) { - int _ref0, ref16; + int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); - group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp); + for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; + group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? 
ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) @@ -740,7 +313,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bcf_write(bp, bh, b); bcf_destroy(b); // call indels - if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { + if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { @@ -756,8 +329,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) for (i = 0; i < n; ++i) { int j; printf("\t%d\t", n_plp[i]); - if (n_plp[i] == 0) printf("*\t*"); - else { + if (n_plp[i] == 0) { + printf("*\t*"); // FIXME: printf() is very slow... + if (conf->flag & MPLP_PRINT_POS) printf("\t*"); + } else { for (j = 0; j < n_plp[i]; ++j) pileup_seq(plp[i] + j, pos, ref_len, ref); putchar('\t'); @@ -767,6 +342,21 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (c > 126) c = 126; putchar(c); } + if (conf->flag & MPLP_PRINT_MAPQ) { + putchar('\t'); + for (j = 0; j < n_plp[i]; ++j) { + int c = plp[i][j].b->core.qual + 33; + if (c > 126) c = 126; + putchar(c); + } + } + if (conf->flag & MPLP_PRINT_POS) { + putchar('\t'); + for (j = 0; j < n_plp[i]; ++j) { + if (j > 0) putchar(','); + printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... 
+ } + } } } putchar('\n'); @@ -778,12 +368,6 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); - if (hash) { // free the hash table - khint_t k; - for (k = kh_begin(hash); k < kh_end(hash); ++k) - if (kh_exist(hash, k)) free(kh_val(hash, k)); - kh_destroy(64, hash); - } bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); @@ -797,7 +381,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } #define MAX_PATH_LEN 1024 -int read_file_list(const char *file_list,int *n,char **argv[]) +static int read_file_list(const char *file_list,int *n,char **argv[]) { char buf[MAX_PATH_LEN]; int len, nfiles; @@ -850,16 +434,18 @@ int bam_mpileup(int argc, char *argv[]) int c; const char *file_list = NULL; char **fn = NULL; - int nfiles = 0; + int nfiles = 0, use_orphan = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); + #define MPLP_PRINT_POS 0x4000 mplp.max_mq = 60; mplp.min_baseQ = 13; mplp.capQ_thres = 0; - mplp.max_depth = 250; + mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; + mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; - while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:b:P:o:e:h:I")) >= 0) { + while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6Os")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); @@ -867,17 +453,20 @@ int bam_mpileup(int argc, char *argv[]) break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; - case 'l': mplp.fn_pos = strdup(optarg); break; + case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; case 'a': mplp.flag |= 
MPLP_NO_ORPHAN | MPLP_REALN; break; - case 'B': mplp.flag &= ~MPLP_REALN & ~MPLP_NO_ORPHAN; break; - case 'O': mplp.flag |= MPLP_NO_ORPHAN; break; - case 'R': mplp.flag |= MPLP_REALN; break; + case 'B': mplp.flag &= ~MPLP_REALN; break; case 'D': mplp.flag |= MPLP_FMT_DP; break; case 'S': mplp.flag |= MPLP_FMT_SP; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; + case 'E': mplp.flag |= MPLP_EXT_BAQ; break; + case '6': mplp.flag |= MPLP_ILLUMINA13; break; + case 'R': mplp.flag |= MPLP_IGNORE_RG; break; + case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; + case 'O': mplp.flag |= MPLP_PRINT_POS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'M': mplp.max_mq = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; @@ -886,43 +475,72 @@ int bam_mpileup(int argc, char *argv[]) case 'o': mplp.openQ = atoi(optarg); break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; + case 'A': use_orphan = 1; break; + case 'F': mplp.min_frac = atof(optarg); break; + case 'm': mplp.min_support = atoi(optarg); break; + case 'L': mplp.max_indel_depth = atoi(optarg); break; + case 'G': { + FILE *fp_rg; + char buf[1024]; + mplp.rghash = bcf_str2id_init(); + if ((fp_rg = fopen(optarg, "r")) == 0) + fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); + while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... 
+ bcf_str2id_add(mplp.rghash, strdup(buf)); + fclose(fp_rg); + } + break; } } + if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); - fprintf(stderr, "Options: -f FILE reference sequence file [null]\n"); - fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); - fprintf(stderr, " -l FILE list of positions (format: chr pos) [null]\n"); - fprintf(stderr, " -b FILE list of input BAM files [null]\n"); - fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); - fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ); - fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq); - fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth); - fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); - fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); - fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); - fprintf(stderr, " -h INT coefficient for homopolyer errors [%d]\n", mplp.tandemQ); - fprintf(stderr, " -g generate BCF output\n"); - fprintf(stderr, " -u do not compress BCF output\n"); - fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -D output per-sample DP\n"); - fprintf(stderr, " -S output per-sample SP (strand bias P-value, slow)\n"); - fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); + fprintf(stderr, "Input options:\n\n"); + fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); + fprintf(stderr, " -A count anomalous read pairs\n"); + fprintf(stderr, " -B disable BAQ computation\n"); + fprintf(stderr, " -b FILE list of input BAM files [null]\n"); + fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to 
disable [0]\n"); + fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); + fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); + fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); + fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); + fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); + fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); + fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); + fprintf(stderr, " -R ignore RG tags\n"); + fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); + fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); + fprintf(stderr, "\nOutput options:\n\n"); + fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); + fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); + fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n"); + fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n"); + fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); + fprintf(stderr, " -u generate uncompress BCF output\n"); + fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); + fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); + fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); + fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); + fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); + fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); + fprintf(stderr, " -o INT Phred-scaled gap open 
sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; } - if ( file_list ) - { + if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mpileup(&mplp,nfiles,fn); for (c=0; cx.fpr, buf, BUF_SIZE)) > 0) + fwrite(buf, 1, len, fp->x.fpw); #else while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) + fwrite(buf, 1, len, fp->file); #endif - fwrite(buf, 1, len, fp->x.fpw); free(buf); fp->block_offset = in->block_offset = 0; bgzf_close(fp); diff --git a/sam/bam_sort.c b/sam/bam_sort.c index 01f7016..abf8d4f 100644 --- a/sam/bam_sort.c +++ b/sam/bam_sort.c @@ -70,6 +70,8 @@ static void swap_header_text(bam_header_t *h1, bam_header_t *h2) #define MERGE_RG 1 #define MERGE_UNCOMP 2 +#define MERGE_LEVEL1 4 +#define MERGE_FORCE 8 /*! @abstract Merge multiple sorted BAM. @@ -202,16 +204,14 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { - h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b); + h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); h->idx = idx++; } else h->pos = HEAP_EMPTY; } - if (flag & MERGE_UNCOMP) { - fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); - } else { - fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); - } + if (flag & MERGE_UNCOMP) fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); + else if (flag & MERGE_LEVEL1) fpout = strcmp(out, "-")? bam_open(out, "w1") : bam_dopen(fileno(stdout), "w1"); + else fpout = strcmp(out, "-")? 
bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); if (fpout == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; @@ -222,11 +222,14 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; - if ((flag & MERGE_RG) && bam_aux_get(b, "RG") == 0) + if (flag & MERGE_RG) { + uint8_t *rg = bam_aux_get(b, "RG"); + if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); + } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { - heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); + heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; @@ -254,11 +257,13 @@ int bam_merge(int argc, char *argv[]) int c, is_by_qname = 0, flag = 0, ret = 0; char *fn_headers = NULL, *reg = 0; - while ((c = getopt(argc, argv, "h:nruR:")) >= 0) { + while ((c = getopt(argc, argv, "h:nru1R:f")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; + case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; + case '1': flag |= MERGE_LEVEL1; break; case 'u': flag |= MERGE_UNCOMP; break; case 'R': reg = strdup(optarg); break; } @@ -269,6 +274,8 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, "Options: -n sort by read names\n"); fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -f overwrite the output BAM if exist\n"); + fprintf(stderr, " -1 compress level 1\n"); fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); fprintf(stderr, "Note: Samtools' merge does not 
reconstruct the @RG dictionary in the header. Users\n"); @@ -276,6 +283,14 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, " the header dictionary in merging.\n\n"); return 1; } + if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { + FILE *fp = fopen(argv[optind], "rb"); + if (fp != NULL) { + fclose(fp); + fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]); + return 1; + } + } if (bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg) < 0) ret = 1; free(reg); free(fn_headers); @@ -288,21 +303,26 @@ static inline int bam1_lt(const bam1_p a, const bam1_p b) { if (g_is_by_qname) { int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); - return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)))); - } else return (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)); + return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))))); + } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))); } KSORT_INIT(sort, bam1_p, bam1_lt) static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) { - char *name; + char *name, mode[3]; int i; bamFile fp; ks_mergesort(sort, k, buf, 0); name = (char*)calloc(strlen(prefix) + 20, 1); - if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n); - else sprintf(name, "%s.bam", prefix); - fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w"); + if (n >= 0) { + sprintf(name, "%s.%.4d.bam", prefix, n); + strcpy(mode, "w1"); + } else { + sprintf(name, "%s.bam", prefix); + strcpy(mode, "w"); + } + fp = is_stdout? 
bam_dopen(fileno(stdout), mode) : bam_open(name, mode); if (fp == 0) { fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); free(name); diff --git a/sam/bam_stat.c b/sam/bam_stat.c index ea9deee..f2de0f1 100644 --- a/sam/bam_stat.c +++ b/sam/bam_stat.c @@ -3,31 +3,31 @@ #include "bam.h" typedef struct { - long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; - long long n_sgltn, n_read1, n_read2; - long long n_qcfail, n_dup; - long long n_diffchr, n_diffhigh; + long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; + long long n_sgltn[2], n_read1[2], n_read2[2]; + long long n_dup[2]; + long long n_diffchr[2], n_diffhigh[2]; } bam_flagstat_t; #define flagstat_loop(s, c) do { \ - ++(s)->n_reads; \ + int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \ + ++(s)->n_reads[w]; \ if ((c)->flag & BAM_FPAIRED) { \ - ++(s)->n_pair_all; \ - if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ - if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ - if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ - if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \ + ++(s)->n_pair_all[w]; \ + if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \ + if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ + if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ + if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ - ++(s)->n_pair_map; \ + ++(s)->n_pair_map[w]; \ if ((c)->mtid != (c)->tid) { \ - ++(s)->n_diffchr; \ - if ((c)->qual >= 5) ++(s)->n_diffhigh; \ + ++(s)->n_diffchr[w]; \ + if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ } \ } \ } \ - if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ - if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ + if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ + if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ } while (0) bam_flagstat_t *bam_flagstat_core(bamFile fp) @@ -59,18 
+59,17 @@ int bam_flagstat(int argc, char *argv[]) assert(fp); header = bam_header_read(fp); s = bam_flagstat_core(fp); - printf("%lld in total\n", s->n_reads); - printf("%lld QC failure\n", s->n_qcfail); - printf("%lld duplicates\n", s->n_dup); - printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); - printf("%lld paired in sequencing\n", s->n_pair_all); - printf("%lld read1\n", s->n_read1); - printf("%lld read2\n", s->n_read2); - printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); - printf("%lld with itself and mate mapped\n", s->n_pair_map); - printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); - printf("%lld with mate mapped to a different chr\n", s->n_diffchr); - printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); + printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); + printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); + printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); + printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); + printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0); + printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); + printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0); + printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], 
s->n_diffchr[1]); + printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); bam_header_destroy(header); bam_close(fp); diff --git a/sam/bam_tview.c b/sam/bam_tview.c index e48afa7..4eea955 100644 --- a/sam/bam_tview.c +++ b/sam/bam_tview.c @@ -19,9 +19,10 @@ #include #include #include +#include #include "bam.h" #include "faidx.h" -#include "bam_maqcns.h" +#include "bam2bcf.h" char bam_aux_getCEi(bam1_t *b, int i); char bam_aux_getCSi(bam1_t *b, int i); @@ -50,7 +51,7 @@ typedef struct { bamFile fp; int curr_tid, left_pos; faidx_t *fai; - bam_maqcns_t *bmc; + bcf_callaux_t *bca; int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; char *ref; @@ -58,6 +59,7 @@ typedef struct { int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) { + extern unsigned char bam_nt16_table[256]; tview_t *tv = (tview_t*)data; int i, j, c, rb, attr, max_ins = 0; uint32_t call = 0; @@ -70,11 +72,26 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void mvaddch(1, tv->ccol++, c); } if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1); - // print consensus - call = bam_maqcns_call(n, pl, tv->bmc); + { // call consensus + bcf_callret1_t bcr; + int qsum[4], a1, a2, tmp; + double p[3], prior = 30; + bcf_call_glfgen(n, pl, bam_nt16_table[rb], tv->bca, &bcr); + for (i = 0; i < 4; ++i) qsum[i] = bcr.qsum[i]<<2 | i; + for (i = 1; i < 4; ++i) // insertion sort + for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j) + tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp; + a1 = qsum[0]&3; a2 = qsum[1]&3; + p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2]; + if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3; + if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3; + if (p[0] < p[1] && p[0] < p[2]) call = (1<>28&0xf]; - i = (call>>8&0xff)/10+1; + c = ",ACMGRSVTWYHKDBN"[call>>16&0xf]; + i = 
(call&0xffff)/10+1; if (i > 4) i = 4; attr |= COLOR_PAIR(i); if (c == toupper(rb)) c = '.'; @@ -183,17 +200,16 @@ tview_t *tv_init(const char *fn, const char *fn_fa) { tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t)); tv->is_dot = 1; - tv->idx = bam_index_load(fn); - if (tv->idx == 0) exit(1); tv->fp = bam_open(fn, "r"); bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); assert(tv->fp); tv->header = bam_header_read(tv->fp); + tv->idx = bam_index_load(fn); + if (tv->idx == 0) exit(1); tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); if (fn_fa) tv->fai = fai_load(fn_fa); - tv->bmc = bam_maqcns_init(); + tv->bca = bcf_call_init(0.83, 13); tv->ins = 1; - bam_maqcns_prepare(tv->bmc); initscr(); keypad(stdscr, TRUE); @@ -224,7 +240,7 @@ void tv_destroy(tview_t *tv) endwin(); bam_lplbuf_destroy(tv->lplbuf); - bam_maqcns_destroy(tv->bmc); + bcf_call_destroy(tv->bca); bam_index_destroy(tv->idx); if (tv->fai) fai_destroy(tv->fai); free(tv->ref); diff --git a/sam/bamtk.c b/sam/bamtk.c index 79635d6..8ba2581 100644 --- a/sam/bamtk.c +++ b/sam/bamtk.c @@ -8,12 +8,7 @@ #include "knetfile.h" #endif -#ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.12a (r862)" -#endif - int bam_taf2baf(int argc, char *argv[]); -int bam_pileup(int argc, char *argv[]); int bam_mpileup(int argc, char *argv[]); int bam_merge(int argc, char *argv[]); int bam_index(int argc, char *argv[]); @@ -27,60 +22,24 @@ int bam_idxstats(int argc, char *argv[]); int main_samview(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); +int main_cut_target(int argc, char *argv[]); +int main_phase(int argc, char *argv[]); +int main_cat(int argc, char *argv[]); +int main_depth(int argc, char *argv[]); +int main_bam2fq(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); -int glf3_view_main(int argc, char *argv[]); - -int bam_tagview(int argc, char *argv[]) -{ - bamFile fp; - bam_header_t *header; - bam1_t *b; - char tag[2]; - int ret; - if (argc < 3) { - 
fprintf(stderr, "Usage: samtools tagview \n"); - return 1; - } - fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); - assert(fp); - header = bam_header_read(fp); - if (header == 0) { - fprintf(stderr, "[bam_view] fail to read the BAM header. Abort!\n"); - return 1; - } - tag[0] = argv[2][0]; tag[1] = argv[2][1]; - b = (bam1_t*)calloc(1, sizeof(bam1_t)); - while ((ret = bam_read1(fp, b)) >= 0) { - uint8_t *d = bam_aux_get(b, tag); - if (d) { - printf("%s\t%d\t", bam1_qname(b), b->core.flag); - if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d)); - else if (d[0] == 'f') printf("%f\n", bam_aux2f(d)); - else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d)); - else if (d[0] == 'A') printf("%c\n", bam_aux2A(d)); - else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d)); - else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", bam_aux2i(d)); - else printf("\n"); - } - } - if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. 
(%d)\n", ret); - free(b->data); free(b); - bam_header_destroy(header); - bam_close(fp); - return 0; -} static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); - fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); + fprintf(stderr, "Version: %s\n\n", BAM_VERSION); fprintf(stderr, "Usage: samtools [options]\n\n"); fprintf(stderr, "Command: view SAM<->BAM conversion\n"); fprintf(stderr, " sort sort alignment file\n"); - fprintf(stderr, " pileup generate pileup output\n"); fprintf(stderr, " mpileup multi-way pileup\n"); + fprintf(stderr, " depth compute the depth\n"); fprintf(stderr, " faidx index/extract FASTA\n"); #if _CURSES_LIB != 0 fprintf(stderr, " tview text alignment viewer\n"); @@ -88,13 +47,21 @@ static int usage() fprintf(stderr, " index index alignment\n"); fprintf(stderr, " idxstats BAM index stats (r595 or later)\n"); fprintf(stderr, " fixmate fix mate information\n"); - fprintf(stderr, " glfview print GLFv3 file\n"); fprintf(stderr, " flagstat simple stats\n"); fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); fprintf(stderr, " merge merge sorted alignments\n"); fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, " reheader replace BAM header\n"); + fprintf(stderr, " cat concatenate BAMs\n"); + fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); + fprintf(stderr, " phase phase heterozygotes\n"); fprintf(stderr, "\n"); +#ifdef _WIN32 + fprintf(stderr, "\ +Note: The Windows version of SAMtools is mainly designed for read-only\n\ + operations, such as viewing the alignments and generating the pileup.\n\ + Binary files generated by the Windows version may be buggy.\n\n"); +#endif return 1; } @@ -110,7 +77,6 @@ int main(int argc, char *argv[]) if (argc < 2) return usage(); if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); - else if 
(strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1); else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1); else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); @@ -119,12 +85,19 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); - else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); - else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); + else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1); + else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); + else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); + else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); + else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); + else if (strcmp(argv[1], "pileup") == 0) { + fprintf(stderr, "[main] The `pileup' command has been removed. 
Please use `mpileup' instead.\n"); + return 1; + } #if _CURSES_LIB != 0 else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); #endif diff --git a/sam/bcftools/Makefile b/sam/bcftools/Makefile index 8b890ba..9b6f863 100644 --- a/sam/bcftools/Makefile +++ b/sam/bcftools/Makefile @@ -1,9 +1,9 @@ CC= gcc CFLAGS= -g -Wall -O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -LOBJS= bcf.o vcf.o bcfutils.o prob1.o ld.o kfunc.o index.o fet.o bcf2qcall.o +LOBJS= bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o mut.o bcf2qcall.o OMISC= .. -AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o +AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o PROG= bcftools INCLUDES= SUBDIRS= . @@ -28,10 +28,10 @@ all:$(PROG) lib:libbcf.a libbcf.a:$(LOBJS) - $(AR) -cru $@ $(LOBJS) + $(AR) -csru $@ $(LOBJS) bcftools:lib $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -lbcf + $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz bcf.o:bcf.h vcf.o:bcf.h diff --git a/sam/bcftools/bcf-fix.pl b/sam/bcftools/bcf-fix.pl deleted file mode 100755 index 61c6136..0000000 --- a/sam/bcftools/bcf-fix.pl +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use warnings; -use Carp; - -my $opts = parse_params(); -bcf_fix(); - -exit; - -#-------------------------------- - -sub error -{ - my (@msg) = @_; - if ( scalar @msg ) { confess @msg; } - die - "Usage: bcftools view test.bcf | bcf-fix.pl > test.vcf\n", - "Options:\n", - " -h, -?, --help This help message.\n", - "\n"; -} - - -sub parse_params -{ - my $opts = {}; - while (my $arg=shift(@ARGV)) - { - if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } - error("Unknown parameter \"$arg\". 
Run -h for help.\n"); - } - return $opts; -} - -sub bcf_fix -{ - while (my $line=) - { - if ( $line=~/^#CHROM/ ) - { - print -qq[##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##FORMAT= -##FORMAT= -##FORMAT= -]; - print $line; - } - elsif ( $line=~/^#/ ) - { - print $line; - } - else - { - my @items = split(/\t/,$line); - my @tags = split(/:/,$items[8]); # FORMAT tags - - my $nidx=2; - my @idxs; # Mapping which defines new ordering: $idxs[$inew]=$iold; GT comes first, PL second - for (my $i=0; $i<@tags; $i++) - { - if ( $tags[$i] eq 'GT' ) { $idxs[0]=$i; } - elsif ( $tags[$i] eq 'PL' ) { $idxs[1]=$i; } - else { $idxs[$nidx++]=$i; } - } - if ( !exists($tags[0]) or !exists($tags[1]) ) { error("FIXME: expected GT and PL in the format field.\n"); } - - # First fix the FORMAT column - $items[8] = 'GT:GL'; - for (my $i=2; $i<@tags; $i++) - { - $items[8] .= ':'.$tags[$idxs[$i]]; - } - - # Now all the genotype columns - for (my $iitem=9; $iitem<@items; $iitem++) - { - @tags = split(/:/,$items[$iitem]); - $items[$iitem] = $tags[$idxs[0]] .':'; - - # GL=-PL/10 - my ($a,$b,$c) = split(/,/,$tags[$idxs[1]]); - $items[$iitem] .= sprintf "%.2f,%.2f,%.2f",-$a/10.,-$b/10.,-$c/10.; - - for (my $itag=2; $itag<@tags; $itag++) - { - $items[$iitem] .= ':'.$tags[$idxs[$itag]]; - } - } - print join("\t",@items); - } - } -} - diff --git a/sam/bcftools/bcf.c b/sam/bcftools/bcf.c index 6e45695..84a8e76 100644 --- a/sam/bcftools/bcf.c +++ b/sam/bcftools/bcf.c @@ -103,10 +103,16 @@ int bcf_sync(bcf1_t *b) ks_tokaux_t aux; // set ref, alt, flt, info, fmt b->ref = b->alt = b->flt = b->info = b->fmt = 0; - for (p = b->str, n = 0; p < b->str + b->l_str; ++p) - if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1; + for (p = b->str, n = 0; p < b->str + b->l_str; ++p) { + if (*p == 0 && p+1 != b->str + b->l_str) { + if (n == 5) { + ++n; + break; + } else tmp[n++] = p + 1; + } + } if (n != 5) { - fprintf(stderr, "[%s] incorrect number of fields (%d != 5). 
Corrupted file?\n", __func__, n); + fprintf(stderr, "[%s] incorrect number of fields (%d != 5) at %d:%d\n", __func__, n, b->tid, b->pos); return -1; } b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; @@ -136,10 +142,10 @@ int bcf_sync(bcf1_t *b) b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2)) { b->gi[i].len = 2; - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2) - || b->gi[i].fmt == bcf_str2int("SP", 2)) - { + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) { b->gi[i].len = 1; + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + b->gi[i].len = 4; } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4; } @@ -240,8 +246,10 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) } } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { kputw(((uint16_t*)b->gi[i].data)[j], s); - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { kputw(((uint8_t*)b->gi[i].data)[j], s); + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + kputw(((int32_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) { int y = ((uint8_t*)b->gi[i].data)[j]; if (y>>7&1) { @@ -259,7 +267,7 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) if (k > 0) kputc(',', s); ksprintf(s, "%.2f", d[k]); } - } + } else kputc('.', s); // custom fields } } } diff --git a/sam/bcftools/bcf.h b/sam/bcftools/bcf.h index f87ac1e..822ae5c 100644 --- a/sam/bcftools/bcf.h +++ b/sam/bcftools/bcf.h @@ -28,6 +28,8 @@ #ifndef BCF_H #define BCF_H +#define BCF_VERSION "0.1.17-dev (r973:277)" + #include #include @@ -129,6 +131,8 @@ extern "C" { int vcf_close(bcf_t *bp); // read the VCF/BCF header bcf_hdr_t *vcf_hdr_read(bcf_t *bp); + // 
read the sequence dictionary from a separate file; required for VCF->BCF conversion + int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn); // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); // write the VCF header @@ -142,10 +146,21 @@ extern "C" { int bcf_gl2pl(bcf1_t *b); // if the site is an indel int bcf_is_indel(const bcf1_t *b); + bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list); + int bcf_subsam(int n_smpl, int *list, bcf1_t *b); + // move GT to the first FORMAT field + int bcf_fix_gt(bcf1_t *b); + // update PL generated by old samtools + int bcf_fix_pl(bcf1_t *b); + // convert PL to GLF-like 10-likelihood GL + int bcf_gl10(const bcf1_t *b, uint8_t *gl); + // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL + int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl); // string hash table void *bcf_build_refhash(bcf_hdr_t *h); void bcf_str2id_destroy(void *_hash); + void bcf_str2id_thorough_destroy(void *_hash); int bcf_str2id_add(void *_hash, const char *str); int bcf_str2id(void *_hash, const char *str); void *bcf_str2id_init(); diff --git a/sam/bcftools/bcf.tex b/sam/bcftools/bcf.tex index 5ca1e28..442fc2a 100644 --- a/sam/bcftools/bcf.tex +++ b/sam/bcftools/bcf.tex @@ -14,50 +14,64 @@ \begin{tabular}{|l|l|l|l|l|} \hline \multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline -\multicolumn{2}{|l|}{\tt magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline -\multicolumn{2}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\tt name} & Concatenated names, {\tt NULL} padded & {\tt char[l\_nm]} & \\\hline -\multicolumn{2}{|l|}{\tt l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\tt sname} & Concatenated sample names & {\tt char[l\_smpl]} & 
\\\hline -\multicolumn{2}{|l|}{\tt l\_txt} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline -\multicolumn{2}{|l|}{\tt text} & Meta text, {\tt NULL} terminated & {\tt char[l\_txt]} & \\\hline +\multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline +\multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf seqnm} & Concatenated names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline +\multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf smpl} & Concatenated sample names & {\tt char[{\sf l\_smpl}]} & \\\hline +\multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline +\multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline \multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5} -& {\tt seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} -& {\tt pos} & Position & {\tt int32\_t} & \\\cline{2-5} -& {\tt qual} & Variant quality & {\tt float} & \\\cline{2-5} -& {\tt l\_str} & Length of str & {\tt int32\_t} & \\\cline{2-5} -& {\tt str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[slen]} &\\\cline{2-5} +& {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} +& {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5} +& {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5} +& {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5} +& {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5} & \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\ \hline \end{tabular} \end{center} \begin{center} -\begin{tabular}{cll} +\begin{tabular}{clp{9cm}} \hline 
\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline {\tt DP} & {\tt uint16\_t[n]} & Read depth \\ -{\tt GL} & {\tt float[n*x]} & Log10 likelihood of data; $x=\frac{m(m+1)}{2}$, $m=\#\{alleles\}$\\ -{\tt GT} & {\tt uint8\_t[n]} & {\tt phase\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ +{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\ +{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ +{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set, + the allele is not present (e.g. due to different ploidy between samples).} \\ {\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\ {\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\ -{\tt PL} & {\tt uint8\_t[n*x]} & {Phred-scaled likelihood of data}\\ -\emph{misc} & {\tt int32\_t+char*} & {\tt NULL} padded concatenated strings (integer equal to the length) \\ +{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\ +{\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\ +{\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\ +{\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\ +{\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\ +%{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\ +\emph{Integer} & {\tt int32\_t[n*X]} & {Fix-sized custom Integer; $X$ defined in the header}\\ +\emph{Numeric} & {\tt double[n*X]} & {Fix-sized custom Numeric}\\ +\emph{String} & {\tt uint32\_t+char*} & {\tt NULL} padded concat. strings (int equals to the length) \\ \hline \end{tabular} \end{center} \begin{itemize} -\item The file is {\tt BGZF} compressed. -\item All integers are little-endian. +\item A BCF file is in the {\tt BGZF} format. +\item All multi-byte numbers are little-endian. \item In a string, a missing value `.' 
is an empty C string ``{\tt \char92 0}'' (not ``{\tt .\char92 0}'') \item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt - CC,CT,CA,TT,TA,AA}. -\item {\tt GL} is an extension to and is backward compatible with the - {\tt GL} genotype field in {\tt VCFv4.0}. + CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original + BCF proposal). +\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields + are required to be explicitly defined in the headers. +\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only. + It gives an alternative binary representation of the corresponding VCF field, in case + the default representation is unable to keep the genotype information, + for example, when the ploidy is not 2 or there are more than 8 alleles. \end{itemize} -\end{document} \ No newline at end of file +\end{document} diff --git a/sam/bcftools/bcf2qcall.c b/sam/bcftools/bcf2qcall.c index 8634c9e..a86bac2 100644 --- a/sam/bcftools/bcf2qcall.c +++ b/sam/bcftools/bcf2qcall.c @@ -77,8 +77,8 @@ int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b) for (k = j = 0; k < 4; ++k) { for (l = k; l < 4; ++l) { int t, x = map[k], y = map[l]; - if (x > y) t = x, x = y, y = t; - g[j++] = p[x * b->n_alleles - x * (x-1) / 2 + (y - x)]; + if (x > y) t = x, x = y, y = t; // swap + g[j++] = p[y * (y+1) / 2 + x]; } } printf("%s\t%d\t%c", h->ns[b->tid], b->pos+1, *b->ref); diff --git a/sam/bcftools/bcftools.1 b/sam/bcftools/bcftools.1 deleted file mode 100644 index 6c7403b..0000000 --- a/sam/bcftools/bcftools.1 +++ /dev/null @@ -1,120 +0,0 @@ -.TH bcftools 1 "2 October 2010" "bcftools" "Bioinformatics tools" -.SH NAME -.PP -bcftools - Utilities for the Binary Call Format (BCF) and VCF. 
-.SH SYNOPSIS -.PP -bcftools index in.bcf -.PP -bcftools view in.bcf chr2:100-200 > out.vcf -.PP -bcftools view -vc in.bcf > out.vcf 2> out.afs - -.SH DESCRIPTION -.PP -Bcftools is a toolkit for processing VCF/BCF files, calling variants and -estimating site allele frequencies and allele frequency spectrums. - -.SH COMMANDS AND OPTIONS - -.TP 10 -.B view -.B bcftools view -.RB [ \-cbuSAGgHvNQ ] -.RB [ \-1 -.IR nGroup1 ] -.RB [ \-l -.IR listFile ] -.RB [ \-t -.IR mutRate ] -.RB [ \-p -.IR varThres ] -.RB [ \-P -.IR prior ] -.I in.bcf -.RI [ region ] - -Convert between BCF and VCF, call variant candidates and estimate allele -frequencies. - -.B OPTIONS: -.RS -.TP 10 -.B -b -Output in the BCF format. The default is VCF. -.TP -.B -c -Call variants. -.TP -.B -v -Output variant sites only (force -c) -.TP -.B -g -Call per-sample genotypes at variant sites (force -c) -.TP -.B -u -Uncompressed BCF output (force -b). -.TP -.B -S -The input is VCF instead of BCF. -.TP -.B -A -Retain all possible alternate alleles at variant sites. By default, this -command discards unlikely alleles. -.TP -.B -G -Suppress all individual genotype information. -.TP -.B -H -Perform Hardy-Weiberg Equilibrium test. This will add computation time, sometimes considerably. -.TP -.B -N -Skip sites where the REF field is not A/C/G/T -.TP -.B -Q -Output the QCALL likelihood format -.TP -.B -f -Reference-free variant calling mode. In this mode, the prior will be -folded; a variant is called iff the sample(s) contains at least two -alleles; the QUAL field in the VCF/BCF output is changed accordingly. -.TP -.BI "-1 " INT -Number of group-1 samples. This option is used for dividing input into -two groups for comparing. A zero value disables this functionality. 
[0] -.TP -.BI "-l " FILE -List of sites at which information are outputted [all sites] -.TP -.BI "-t " FLOAT -Scaled muttion rate for variant calling [0.001] -.TP -.BI "-p " FLOAT -A site is considered to be a variant if P(ref|D) +#include #include "bcf.h" #include "kstring.h" #include "khash.h" KHASH_MAP_INIT_STR(str2id, int) +#ifdef _WIN32 +#define srand48(x) srand(x) +#define drand48() ((double)rand() / RAND_MAX) +#endif + +// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... void *bcf_build_refhash(bcf_hdr_t *h) { khash_t(str2id) *hash; @@ -27,6 +35,16 @@ void bcf_str2id_destroy(void *_hash) if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. } +void bcf_str2id_thorough_destroy(void *_hash) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + khint_t k; + if (hash == 0) return; + for (k = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); + kh_destroy(str2id, hash); +} + int bcf_str2id(void *_hash, const char *str) { khash_t(str2id) *hash = (khash_t(str2id)*)_hash; @@ -51,8 +69,9 @@ int bcf_str2id_add(void *_hash, const char *str) int bcf_shrink_alt(bcf1_t *b, int n) { char *p; - int i, j, k, *z, n_smpl = b->n_smpl; + int i, j, k, n_smpl = b->n_smpl; if (b->n_alleles <= n) return -1; + // update ALT if (n > 1) { for (p = b->alt, k = 1; *p; ++p) if (*p == ',' && ++k == n) break; @@ -61,10 +80,7 @@ int bcf_shrink_alt(bcf1_t *b, int n) ++p; memmove(p, b->flt, b->str + b->l_str - b->flt); b->l_str -= b->flt - p; - z = alloca(sizeof(int) / 2 * n * (n+1)); - for (i = k = 0; i < n; ++i) - for (j = 0; j < n - i; ++j) - z[k++] = i * b->n_alleles + j; + // update PL for (i = 0; i < b->n_gi; ++i) { bcf_ginfo_t *g = b->gi + i; if (g->fmt == bcf_str2int("PL", 2)) { @@ -73,7 +89,7 @@ int bcf_shrink_alt(bcf1_t *b, int n) g->len = n * (n + 1) / 2; for (l = k = 0; l < n_smpl; ++l) { uint8_t *dl = d + l * x; - for (j = 0; j < g->len; ++j) d[k++] = dl[z[j]]; + for (j = 0; j < g->len; 
++j) d[k++] = dl[j]; } } // FIXME: to add GL } @@ -107,3 +123,268 @@ int bcf_gl2pl(bcf1_t *b) } return 0; } +/* FIXME: this function will fail given AB:GTX:GT. BCFtools never + * produces such FMT, but others may do. */ +int bcf_fix_gt(bcf1_t *b) +{ + char *s; + int i; + uint32_t tmp; + bcf_ginfo_t gt; + // check the presence of the GT FMT + if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first + if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact + tmp = bcf_str2int("GT", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug... + gt = b->gi[i]; + // move GT to the first + for (; i > 0; --i) b->gi[i] = b->gi[i-1]; + b->gi[0] = gt; + memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt); + b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; + return 0; +} + +int bcf_fix_pl(bcf1_t *b) +{ + int i; + uint32_t tmp; + uint8_t *PL, *swap; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // prepare + gi = b->gi + i; + PL = (uint8_t*)gi->data; + swap = alloca(gi->len); + // loop through individuals + for (i = 0; i < b->n_smpl; ++i) { + int k, l, x; + uint8_t *PLi = PL + i * gi->len; + memcpy(swap, PLi, gi->len); + for (k = x = 0; k < b->n_alleles; ++k) + for (l = k; l < b->n_alleles; ++l) + PLi[l*(l+1)/2 + k] = swap[x++]; + } + return 0; +} + +int bcf_smpl_covered(const bcf1_t *b) +{ + int i, j, n = 0; + uint32_t tmp; + bcf_ginfo_t *gi; + // pinpoint PL + tmp = bcf_str2int("PL", 2); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + if (i == b->n_gi) return 0; + // count how many samples having PL!=[0..0] + gi = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; + for (j = 0; j < gi->len; ++j) + if (PLi[j]) break; + if (j < gi->len) ++n; + } + return n; +} + +static void *locate_field(const 
bcf1_t *b, const char *fmt, int l) +{ + int i; + uint32_t tmp; + tmp = bcf_str2int(fmt, l); + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == tmp) break; + return i == b->n_gi? 0 : b->gi[i].data; +} + +int bcf_anno_max(bcf1_t *b) +{ + int k, max_gq, max_sp, n_het; + kstring_t str; + uint8_t *gt, *gq; + int32_t *sp; + max_gq = max_sp = n_het = 0; + gt = locate_field(b, "GT", 2); + if (gt == 0) return -1; + gq = locate_field(b, "GQ", 2); + sp = locate_field(b, "SP", 2); + if (sp) + for (k = 0; k < b->n_smpl; ++k) + if (gt[k]&0x3f) + max_sp = max_sp > (int)sp[k]? max_sp : sp[k]; + if (gq) + for (k = 0; k < b->n_smpl; ++k) + if (gt[k]&0x3f) + max_gq = max_gq > (int)gq[k]? max_gq : gq[k]; + for (k = 0; k < b->n_smpl; ++k) { + int a1, a2; + a1 = gt[k]&7; a2 = gt[k]>>3&7; + if ((!a1 && a2) || (!a2 && a1)) { // a het + if (gq == 0) ++n_het; + else if (gq[k] >= 20) ++n_het; + } + } + if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499); + if (max_sp < 0) max_sp = 0; + memset(&str, 0, sizeof(kstring_t)); + if (*b->info) kputc(';', &str); + ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq); + bcf_append_info(b, str.s, str.l); + free(str.s); + return 0; +} + +// FIXME: only data are shuffled; the header is NOT +int bcf_shuffle(bcf1_t *b, int seed) +{ + int i, j, *a; + if (seed > 0) srand48(seed); + a = malloc(b->n_smpl * sizeof(int)); + for (i = 0; i < b->n_smpl; ++i) a[i] = i; + for (i = b->n_smpl; i > 1; --i) { + int tmp; + j = (int)(drand48() * i); + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; + } + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap, *data = (uint8_t*)gi->data; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < b->n_smpl; ++i) + memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); + free(gi->data); + gi->data = swap; + } + free(a); + return 0; +} + +bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) +{ + int i, ret, j; + khint_t k; + bcf_hdr_t *h; + khash_t(str2id) *hash; + 
kstring_t s; + s.l = s.m = 0; s.s = 0; + hash = kh_init(str2id); + for (i = 0; i < h0->n_smpl; ++i) { + k = kh_put(str2id, hash, h0->sns[i], &ret); + kh_val(hash, k) = i; + } + for (i = j = 0; i < n; ++i) { + k = kh_get(str2id, hash, samples[i]); + if (k != kh_end(hash)) { + list[j++] = kh_val(hash, k); + kputs(samples[i], &s); kputc('\0', &s); + } + } + if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + kh_destroy(str2id, hash); + h = calloc(1, sizeof(bcf_hdr_t)); + *h = *h0; + h->ns = 0; h->sns = 0; + h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); + h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); + h->l_smpl = s.l; h->sname = s.s; + bcf_hdr_sync(h); + return h; +} + +int bcf_subsam(int n_smpl, int *list, bcf1_t *b) +{ + int i, j; + for (j = 0; j < b->n_gi; ++j) { + bcf_ginfo_t *gi = b->gi + j; + uint8_t *swap; + swap = malloc(gi->len * b->n_smpl); + for (i = 0; i < n_smpl; ++i) + memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + free(gi->data); + gi->data = swap; + } + b->n_smpl = n_smpl; + return 0; +} + +static int8_t nt4_table[128] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4 +}; + +int bcf_gl10(const bcf1_t *b, uint8_t *gl) +{ + int a[4], k, l, map[4], k1, j, i; + const bcf_ginfo_t *PL; + char *s; + if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + a[0] = nt4_table[(int)b->ref[0]]; + if (a[0] > 3 || a[0] < 0) 
return -1; // ref is not A/C/G/T + a[1] = a[2] = a[3] = -2; // -1 has a special meaning + if (b->alt[0] == 0) return -1; // no alternate allele + map[0] = map[1] = map[2] = map[3] = -2; + map[a[0]] = 0; + for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { + if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base + a[k+1] = nt4_table[(int)*s]; + if (a[k+1] >= 0) map[a[k+1]] = k+1; + else k1 = k + 1; + if (s[1] == 0) break; // the end of the ALT string + } + for (k = 0; k < 4; ++k) + if (map[k] < 0) map[k] = k1; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = map[k], y = map[l]; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + g[j++] = p[y * (y+1) / 2 + x]; + } + } + } + return 0; +} + +int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl) +{ + int k, l, j, i; + const bcf_ginfo_t *PL; + if (b->alt[0] == 0) return -1; // no alternate allele + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual + uint8_t *g = gl + 10 * i; + for (k = j = 0; k < 4; ++k) { + for (l = k; l < 4; ++l) { + int t, x = k, y = l; + if (x > y) t = x, x = y, y = t; // make sure x is the smaller + x = y * (y+1) / 2 + x; + g[j++] = x < PL->len? 
p[x] : 255; + } + } + } + return 0; +} diff --git a/sam/bcftools/call1.c b/sam/bcftools/call1.c index f293a6c..3cc4649 100644 --- a/sam/bcftools/call1.c +++ b/sam/bcftools/call1.c @@ -6,9 +6,12 @@ #include "bcf.h" #include "prob1.h" #include "kstring.h" +#include "time.h" -#include "khash.h" -KHASH_SET_INIT_INT64(set64) +#ifdef _WIN32 +#define srand48(x) srand(x) +#define lrand48() rand() +#endif #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -19,70 +22,30 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_VARONLY 16 #define VC_VCFIN 32 #define VC_UNCOMP 64 -#define VC_HWE 128 #define VC_KEEPALT 256 #define VC_ACGT_ONLY 512 #define VC_QCALL 1024 #define VC_CALL_GT 2048 #define VC_ADJLD 4096 #define VC_NO_INDEL 8192 -#define VC_FOLDED 16384 +#define VC_ANNO_MAX 16384 +#define VC_FIX_PL 32768 +#define VC_EM 0x10000 +#define VC_PAIRCALL 0x20000 +#define VC_QCNT 0x40000 typedef struct { - int flag, prior_type, n1; - char *fn_list, *prior_file; - double theta, pref, indel_frac; + int flag, prior_type, n1, n_sub, *sublist, n_perm; + uint32_t *trio_aux; + char *prior_file, **subsam, *fn_dict; + uint8_t *ploidy; + double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt; + void *bed; } viewconf_t; -khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) -{ - void *str2id; - gzFile fp; - kstream_t *ks; - int ret, dret, lineno = 1; - kstring_t *str; - khash_t(set64) *hash = 0; - - hash = kh_init(set64); - str2id = bcf_build_refhash(_h); - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int tid = bcf_str2id(str2id, str->s); - if (tid >= 0 && dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) >= 0) { - uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); - kh_put(set64, hash, x, &ret); - } else break; - } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno); - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (dret < 0) break; - ++lineno; - } - bcf_str2id_destroy(str2id); - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - return hash; -} - -static double test_hwe(const double g[3]) -{ - extern double kf_gammaq(double p, double x); - double fexp, chi2, f[3], n; - int i; - n = g[0] + g[1] + g[2]; - fexp = (2. * g[2] + g[1]) / (2. * n); - if (fexp > 1. - 1e-10) fexp = 1. - 1e-10; - if (fexp < 1e-10) fexp = 1e-10; - f[0] = n * (1. - fexp) * (1. - fexp); - f[1] = n * 2. * fexp * (1. - fexp); - f[2] = n * fexp * fexp; - for (i = 0, chi2 = 0.; i < 3; ++i) - chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i]; - return kf_gammaq(.5, chi2 / 2.); -} +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { double p[4]; @@ -147,37 +110,70 @@ static void rm_info(bcf1_t *b, const char *key) bcf_sync(b); } -static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) +static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt) { kstring_t s; - int is_var = (pr->p_ref < pref); - double p_hwe, r = is_var? pr->p_ref : 1. - pr->p_ref; + int has_I16, is_var; + double fq, r; anno16_t a; - p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated - test16(b, &a); - rm_info(b, "I16="); + has_I16 = test16(b, &a) >= 0? 
1 : 0; + rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); kputs(b->info, &s); if (b->info[0]) kputc(';', &s); -// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); - ksprintf(&s, "AF1=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, pr->cil, pr->cih); - ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); - if (a.is_tested) { - if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%lg,%lg,%lg,%lg", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]); - ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]); + { // print EM + if (em[0] >= 0) ksprintf(&s, "AF1=%.4g", 1 - em[0]); + if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]); + if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]); + if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]); + if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]); + } + if (cons_llr > 0) { + ksprintf(&s, ";CLR=%d", cons_llr); + if (cons_gt > 0) + ksprintf(&s, ";UGT=%c%c%c;CGT=%c%c%c", cons_gt&0xff, cons_gt>>8&0xff, cons_gt>>16&0xff, + cons_gt>>32&0xff, cons_gt>>40&0xff, cons_gt>>48&0xff); } - if (pr->g[0] >= 0. && p_hwe <= .2) - ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe); + if (pr == 0) { // if pr is unset, return + kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + return 1; + } + + is_var = (pr->p_ref < pref); + r = is_var? pr->p_ref : pr->p_var; + +// ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted! 
+ ksprintf(&s, ";AC1=%d", pr->ac); + if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); + if (fq < -999) fq = -999; + if (fq > 999) fq = 999; + ksprintf(&s, ";FQ=%.3g", fq); + if (pr->cmp[0] >= 0.) { // two sample groups + int i, q[3]; + for (i = 1; i < 3; ++i) { + double x = pr->cmp[i] + pr->cmp[0]/2.; + q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499); + if (q[i] > 255) q[i] = 255; + } + if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); + // ksprintf(&s, ";LRT3=%.3g", pr->lrt); + ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", q[1], q[2], pr->p_chi2); + } + if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = r < 1e-100? 99 : -4.343 * log(r); - if (b->qual > 99) b->qual = 99; + b->qual = r < 1e-100? 999 : -4.343 * log(r); + if (b->qual > 999) b->qual = 999; bcf_sync(b); if (!is_var) bcf_shrink_alt(b, 1); else if (!(flag&VC_KEEPALT)) @@ -189,7 +185,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p b->m_str = s.m; b->l_str = s.l; b->str = s.s; bcf_sync(b); for (i = 0; i < b->n_smpl; ++i) { - x = bcf_p1_call_gt(pa, pr->f_em, i); + x = bcf_p1_call_gt(pa, pr->f_exp, i); ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 
1 : 0; ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2; } @@ -197,47 +193,174 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p return is_var; } -double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); +static char **read_samples(const char *fn, int *_n) +{ + gzFile fp; + kstream_t *ks; + kstring_t s; + int dret, n = 0, max = 0; + char **sam = 0; + *_n = 0; + s.l = s.m = 0; s.s = 0; + fp = gzopen(fn, "r"); + if (fp == 0) return 0; // fail to open file + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + int l; + if (max == n) { + max = max? max<<1 : 4; + sam = realloc(sam, sizeof(void*)*max); + } + l = s.l; + sam[n] = malloc(s.l + 2); + strcpy(sam[n], s.s); + sam[n][l+1] = 2; // by default, diploid + if (dret != '\n') { + if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2 + int x = (int)s.s[0] - '0'; + if (x == 1 || x == 2) sam[n][l+1] = x; + else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__); + } + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ++n; + } + ks_destroy(ks); + gzclose(fp); + free(s.s); + *_n = n; + return sam; +} + +static void write_header(bcf_hdr_t *h) +{ + kstring_t str; + str.l = h->l_txt? 
h->l_txt - 1 : 0; + str.m = str.l + 1; str.s = h->txt; + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); +// if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); + h->l_txt = str.l + 1; h->txt = str.s; +} + +double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); int bcfview(int argc, char *argv[]) { extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b); extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x); + extern int bcf_fix_gt(bcf1_t *b); + extern int bcf_anno_max(bcf1_t *b); + extern int bcf_shuffle(bcf1_t *b, int seed); + extern uint32_t *bcf_trio_prep(int is_x, int is_son); + extern int bcf_trio_call(uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt); + extern int bcf_pair_call(const bcf1_t *b); + extern int bcf_min_diff(const bcf1_t *b); + bcf_t *bp, *bout = 0; bcf1_t *b, *blast; - int c; - uint64_t n_processed = 0; + int c, *seeds = 0; + uint64_t n_processed = 0, qcnt[256]; viewconf_t vc; bcf_p1aux_t *p1 = 0; - bcf_hdr_t *h; + bcf_hdr_t *hin, *hout; int tid, begin, end; char moder[4], modew[4]; - khash_t(set64) *hash = 0; 
tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); - vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; - while ((c = getopt(argc, argv, "fN1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) { + vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; + memset(qcnt, 0, 8 * 256); + while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Y")) >= 0) { switch (c) { - case 'f': vc.flag |= VC_FOLDED; break; case '1': vc.n1 = atoi(optarg); break; - case 'l': vc.fn_list = strdup(optarg); break; + case 'l': vc.bed = bed_read(optarg); break; + case 'D': vc.fn_dict = strdup(optarg); break; + case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; case 'G': vc.flag |= VC_NO_GENO; break; case 'A': vc.flag |= VC_KEEPALT; break; case 'b': vc.flag |= VC_BCFOUT; break; case 'S': vc.flag |= VC_VCFIN; break; case 'c': vc.flag |= VC_CALL; break; + case 'e': vc.flag |= VC_EM; break; case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; - case 'H': vc.flag |= VC_HWE; break; case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 'I': vc.flag |= VC_NO_INDEL; break; + case 'M': vc.flag |= VC_ANNO_MAX; break; + case 'Y': vc.flag |= VC_QCNT; break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; case 'i': vc.indel_frac = atof(optarg); break; case 'Q': vc.flag |= VC_QCALL; break; case 'L': vc.flag |= VC_ADJLD; break; + case 'U': vc.n_perm = atoi(optarg); break; + case 'C': vc.min_lrt = atof(optarg); break; + case 'X': vc.min_perm_p = atof(optarg); break; + case 'd': vc.min_smpl_frac = atof(optarg); break; + case 's': vc.subsam = read_samples(optarg, &vc.n_sub); + vc.ploidy = calloc(vc.n_sub + 1, 1); + for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; + tid = -1; + break; + case 'T': + if (strcmp(optarg, 
"trioauto") == 0) vc.trio_aux = bcf_trio_prep(0, 0); + else if (strcmp(optarg, "trioxd") == 0) vc.trio_aux = bcf_trio_prep(1, 0); + else if (strcmp(optarg, "trioxs") == 0) vc.trio_aux = bcf_trio_prep(1, 1); + else if (strcmp(optarg, "pair") == 0) vc.flag |= VC_PAIRCALL; + else { + fprintf(stderr, "[%s] Option '-T' can only take value trioauto, trioxd or trioxs.\n", __func__); + return 1; + } + break; case 'P': if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; @@ -248,31 +371,52 @@ int bcfview(int argc, char *argv[]) } if (argc == optind) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); - fprintf(stderr, "Options: -c SNP calling\n"); - fprintf(stderr, " -v output potential variant sites only (force -c)\n"); - fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); - fprintf(stderr, " -b output BCF instead of VCF\n"); - fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); - fprintf(stderr, " -S input is VCF\n"); - fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -G suppress all individual genotype information\n"); - fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n"); - fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); - fprintf(stderr, " -Q output the QCALL likelihood format\n"); - fprintf(stderr, " -L calculate LD for adjacent sites\n"); - fprintf(stderr, " -I skip indels\n"); - fprintf(stderr, " -f reference-free variant calling\n"); - fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); - fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); - fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4lg]\n", vc.theta); - fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4lg]\n", vc.indel_frac); - fprintf(stderr, " -p FLOAT variant if P(ref|D) [reg]\n\n"); + fprintf(stderr, "Input/output options:\n\n"); + 
fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); + fprintf(stderr, " -b output BCF instead of VCF\n"); + fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); + fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n"); + fprintf(stderr, " -G suppress all individual genotype information\n"); + fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n"); + fprintf(stderr, " -L calculate LD for adjacent sites\n"); + fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); + fprintf(stderr, " -Q output the QCALL likelihood format\n"); + fprintf(stderr, " -s FILE list of samples to use [all samples]\n"); + fprintf(stderr, " -S input is VCF\n"); + fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); + fprintf(stderr, "\nConsensus/variant calling options:\n\n"); + fprintf(stderr, " -c SNP calling (force -e)\n"); + fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); + fprintf(stderr, " -e likelihood based analyses\n"); + fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); + fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); + fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -p FLOAT variant if P(ref|D)BCF conversion please specify the sequence dictionary with -D\n", __func__); + return 1; + } + if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here! 
+ if (vc.n_perm > 0) { + seeds = malloc(vc.n_perm * sizeof(int)); + srand48(time(0)); + for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48(); + } b = calloc(1, sizeof(bcf1_t)); blast = calloc(1, sizeof(bcf1_t)); strcpy(moder, "r"); @@ -281,27 +425,34 @@ int bcfview(int argc, char *argv[]) if (vc.flag & VC_BCFOUT) strcat(modew, "b"); if (vc.flag & VC_UNCOMP) strcat(modew, "u"); bp = vcf_open(argv[optind], moder); - h = vcf_hdr_read(bp); + hin = hout = vcf_hdr_read(bp); + if (vc.fn_dict && (vc.flag & VC_VCFIN)) + vcf_dictread(bp, hin, vc.fn_dict); bout = vcf_open("-", modew); - if (!(vc.flag & VC_QCALL)) vcf_hdr_write(bout, h); + if (!(vc.flag & VC_QCALL)) { + if (vc.n_sub) { + vc.sublist = calloc(vc.n_sub, sizeof(int)); + hout = bcf_hdr_subsam(hin, vc.n_sub, vc.subsam, vc.sublist); + } + if (vc.flag & VC_CALL) write_header(hout); + vcf_hdr_write(bout, hout); + } if (vc.flag & VC_CALL) { - p1 = bcf_p1_init(h->n_smpl); + p1 = bcf_p1_init(hout->n_smpl, vc.ploidy); if (vc.prior_file) { if (bcf_p1_read_prior(p1, vc.prior_file) < 0) { fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__); return 1; } } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta); - if (vc.n1 > 0) { + if (vc.n1 > 0 && vc.min_lrt > 0.) { // set n1 bcf_p1_set_n1(p1, vc.n1); bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); } if (vc.indel_frac > 0.) 
bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac - if (vc.flag & VC_FOLDED) bcf_p1_set_folded(p1); } - if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { - void *str2id = bcf_build_refhash(h); + void *str2id = bcf_build_refhash(hout); if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { bcf_idx_t *idx; idx = bcf_idx_load(argv[optind]); @@ -317,8 +468,19 @@ int bcfview(int argc, char *argv[]) } } } - while (vcf_read(bp, h, b) > 0) { - int is_indel = bcf_is_indel(b); + while (vcf_read(bp, hin, b) > 0) { + int is_indel, cons_llr = -1; + int64_t cons_gt = -1; + double em[10]; + if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue; + if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) { + extern int bcf_smpl_covered(const bcf1_t *b); + int n = bcf_smpl_covered(b); + if ((double)n / b->n_smpl < vc.min_smpl_frac) continue; + } + if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b); + if (vc.flag & VC_FIX_PL) bcf_fix_pl(b); + is_indel = bcf_is_indel(b); if ((vc.flag & VC_NO_INDEL) && is_indel) continue; if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { int x; @@ -326,13 +488,7 @@ int bcfview(int argc, char *argv[]) x = toupper(b->ref[0]); if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue; } - if (hash) { - uint64_t x = (uint64_t)b->tid<<32 | b->pos; - khint_t k = kh_get(set64, hash, x); - if (kh_size(hash) == 0) break; - if (k == kh_end(hash)) continue; - kh_del(set64, hash, k); - } + if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue; if (tid >= 0) { int l = strlen(b->ref); l = b->pos + (l > 0? 
l : 1); @@ -340,47 +496,91 @@ int bcfview(int argc, char *argv[]) if (!(l > begin && end > b->pos)) continue; } ++n_processed; + if ((vc.flag & VC_QCNT) && !is_indel) { // summarize the difference + int x = bcf_min_diff(b); + if (x > 255) x = 255; + if (x >= 0) ++qcnt[x]; + } if (vc.flag & VC_QCALL) { // output QCALL format; STOP here - bcf_2qcall(h, b); + bcf_2qcall(hout, b); continue; } - if (vc.flag & (VC_CALL|VC_ADJLD)) bcf_gl2pl(b); + if (vc.trio_aux) // do trio calling + bcf_trio_call(vc.trio_aux, b, &cons_llr, &cons_gt); + else if (vc.flag & VC_PAIRCALL) + cons_llr = bcf_pair_call(b); + if (vc.flag & (VC_CALL|VC_ADJLD|VC_EM)) bcf_gl2pl(b); + if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0x1ff, em); + else { + int i; + for (i = 0; i < 9; ++i) em[i] = -1.; + } if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; - bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here - if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g); + int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); if (n_processed % 100000 == 0) { fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); bcf_p1_dump_afs(p1); } if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; - update_bcf1(h->n_smpl, b, p1, &pr, vc.pref, vc.flag); - } + if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test + bcf_p1rst_t r; + int i, n = 0; + for (i = 0; i < vc.n_perm; ++i) { +#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts + double x[10]; + bcf_shuffle(b, seeds[i]); + bcf_em1(b, vc.n1, 1<<7, x); + if (x[7] < em[7]) ++n; +#else + bcf_shuffle(b, seeds[i]); + bcf_p1_cal(b, 1, p1, &r); + if (pr.p_chi2 >= r.p_chi2) ++n; +#endif + } + pr.perm_rank = n; + } + if (calret >= 0) update_bcf1(b, p1, &pr, vc.pref, vc.flag, em, cons_llr, cons_gt); + } else if (vc.flag & VC_EM) update_bcf1(b, 0, 0, 0, vc.flag, em, cons_llr, cons_gt); if (vc.flag & VC_ADJLD) { // compute LD double f[4], r2; - if ((r2 = bcf_ld_freq(blast, b, 
f)) >= 0) { + if ((r2 = bcf_pair_freq(blast, b, f)) >= 0) { kstring_t s; s.m = s.l = 0; s.s = 0; if (*b->info) kputc(';', &s); - ksprintf(&s, "NEIR=%.3lf;NEIF=%.3lf,%.3lf", r2, f[0]+f[2], f[0]+f[1]); + ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]); bcf_append_info(b, s.s, s.l); free(s.s); } bcf_cpy(blast, b); } + if (vc.flag & VC_ANNO_MAX) bcf_anno_max(b); if (vc.flag & VC_NO_GENO) { // do not output GENO fields b->n_gi = 0; b->fmt[0] = '\0'; - } - vcf_write(bout, h, b); + b->l_str = b->fmt - b->str + 1; + } else bcf_fix_gt(b); + vcf_write(bout, hout, b); } if (vc.prior_file) free(vc.prior_file); if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); - bcf_hdr_destroy(h); + if (hin != hout) bcf_hdr_destroy(hout); + bcf_hdr_destroy(hin); bcf_destroy(b); bcf_destroy(blast); vcf_close(bp); vcf_close(bout); - if (hash) kh_destroy(set64, hash); - if (vc.fn_list) free(vc.fn_list); + if (vc.fn_dict) free(vc.fn_dict); + if (vc.ploidy) free(vc.ploidy); + if (vc.trio_aux) free(vc.trio_aux); + if (vc.n_sub) { + int i; + for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); + free(vc.subsam); free(vc.sublist); + } + if (vc.bed) bed_destroy(vc.bed); + if (vc.flag & VC_QCNT) + for (c = 0; c < 256; ++c) + fprintf(stderr, "QT\t%d\t%lld\n", c, (long long)qcnt[c]); + if (seeds) free(seeds); if (p1) bcf_p1_destroy(p1); return 0; } diff --git a/sam/bcftools/em.c b/sam/bcftools/em.c new file mode 100644 index 0000000..b7dfe1a --- /dev/null +++ b/sam/bcftools/em.c @@ -0,0 +1,310 @@ +#include +#include +#include +#include "bcf.h" +#include "kmin.h" + +static double g_q2p[256]; + +#define ITER_MAX 50 +#define ITER_TRY 10 +#define EPS 1e-5 + +extern double kf_gammaq(double, double); + +/* + Generic routines + */ +// get the 3 genotype likelihoods +static double *get_pdg3(const bcf1_t *b) +{ + double *pdg; + const uint8_t *PL = 0; + int i, PL_len = 0; + // initialize g_q2p if necessary + if (g_q2p[0] == 0.) 
+ for (i = 0; i < 256; ++i) + g_q2p[i] = pow(10., -i / 10.); + // set PL and PL_len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + PL = (const uint8_t*)b->gi[i].data; + PL_len = b->gi[i].len; + break; + } + } + if (i == b->n_gi) return 0; // no PL + // fill pdg + pdg = malloc(3 * b->n_smpl * sizeof(double)); + for (i = 0; i < b->n_smpl; ++i) { + const uint8_t *pi = PL + i * PL_len; + double *p = pdg + i * 3; + p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; + } + return pdg; +} + +// estimate site allele frequency in a very naive and inaccurate way +static double est_freq(int n, const double *pdg) +{ + int i, gcnt[3], tmp1; + // get a rough estimate of the genotype frequency + gcnt[0] = gcnt[1] = gcnt[2] = 0; + for (i = 0; i < n; ++i) { + const double *p = pdg + i * 3; + if (p[0] != 1. || p[1] != 1. || p[2] != 1.) { + int which = p[0] > p[1]? 0 : 1; + which = p[which] > p[2]? which : 2; + ++gcnt[which]; + } + } + tmp1 = gcnt[0] + gcnt[1] + gcnt[2]; + return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1; +} + +/* + Single-locus EM + */ + +typedef struct { + int beg, end; + const double *pdg; +} minaux1_t; + +static double prob1(double f, void *data) +{ + minaux1_t *a = (minaux1_t*)data; + double p = 1., l = 0., f3[3]; + int i; +// printf("brent %lg\n", f); + if (f < 0 || f > 1) return 1e300; + f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f; + for (i = a->beg; i < a->end; ++i) { + const double *pdg = a->pdg + i * 3; + p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; + if (p < 1e-200) l -= log(p), p = 1.; + } + return l - log(p); +} + +// one EM iteration for allele frequency estimate +static double freq_iter(double *f, const double *_pdg, int beg, int end) +{ + double f0 = *f, f3[3], err; + int i; +// printf("em %lg\n", *f); + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + for (i = beg, f0 = 0.; i < end; ++i) { + const double *pdg = _pdg + i * 3; + f0 += (pdg[1] * f3[1] + 2. 
* pdg[2] * f3[2]) + / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); + } + f0 /= (end - beg) * 2; + err = fabs(f0 - *f); + *f = f0; + return err; +} + +/* The following function combines EM and Brent's method. When the signal from + * the data is strong, EM is faster but sometimes, EM may converge very slowly. + * When this happens, we switch to Brent's method. The idea is learned from + * Rasmus Nielsen. + */ +static double freqml(double f0, int beg, int end, const double *pdg) +{ + int i; + double f; + for (i = 0, f = f0; i < ITER_TRY; ++i) + if (freq_iter(&f, pdg, beg, end) < EPS) break; + if (i == ITER_TRY) { // haven't converged yet; try Brent's method + minaux1_t a; + a.beg = beg; a.end = end; a.pdg = pdg; + kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f); + } + return f; +} + +// one EM iteration for genotype frequency estimate +static double g3_iter(double g[3], const double *_pdg, int beg, int end) +{ + double err, gg[3]; + int i; + gg[0] = gg[1] = gg[2] = 0.; +// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]); + for (i = beg; i < end; ++i) { + double sum, tmp[3]; + const double *pdg = _pdg + i * 3; + tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2]; + sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg); + gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum; + } + err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]); + err = err > fabs(gg[2] - g[2])? 
err : fabs(gg[2] - g[2]); + g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2]; + return err; +} + +// perform likelihood ratio test +static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) +{ + double r; + int i; + for (i = 0, r = 1.; i < n1; ++i) { + const double *p = pdg + i * 3; + r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2]) + / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); + } + for (; i < n; ++i) { + const double *p = pdg + i * 3; + r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2]) + / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]); + } + return r; +} + +// x[0]: ref frequency +// x[1..3]: alt-alt, alt-ref, ref-ref frequenc +// x[4]: HWE P-value +// x[5..6]: group1 freq, group2 freq +// x[7]: 1-degree P-value +// x[8]: 2-degree P-value +int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) +{ + double *pdg; + int i, n, n2; + if (b->n_alleles < 2) return -1; // one allele only + // initialization + if (n1 < 0 || n1 > b->n_smpl) n1 = 0; + if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required + if (flag & 0xf<<1) flag |= 0xf<<1; + n = b->n_smpl; n2 = n - n1; + pdg = get_pdg3(b); + if (pdg == 0) return -1; + for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative + { + if ((x[0] = est_freq(n, pdg)) < 0.) 
{ + free(pdg); + return -1; // no data + } + x[0] = freqml(x[0], 0, n, pdg); + } + if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE + double *g = x + 1, f3[3], r; + f3[0] = g[0] = (1 - x[0]) * (1 - x[0]); + f3[1] = g[1] = 2 * x[0] * (1 - x[0]); + f3[2] = g[2] = x[0] * x[0]; + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g, pdg, 0, n) < EPS) break; + // Hardy-Weinberg equilibrium (HWE) + for (i = 0, r = 1.; i < n; ++i) { + double *p = pdg + i * 3; + r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]); + } + x[4] = kf_gammaq(.5, log(r)); + } + if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency + x[5] = freqml(x[0], 0, n1, pdg); + x[6] = freqml(x[0], n1, n, pdg); + } + if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value + double f[3], f3[3][3], tmp; + f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; + for (i = 0; i < 3; ++i) + f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; + tmp = log(lk_ratio_test(n, n1, pdg, f3)); + if (tmp < 0) tmp = 0; + x[7] = kf_gammaq(.5, tmp); + } + if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value + double g[3][3], tmp; + for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double)); + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g[1], pdg, 0, n1) < EPS) break; + for (i = 0; i < ITER_MAX; ++i) + if (g3_iter(g[2], pdg, n1, n) < EPS) break; + tmp = log(lk_ratio_test(n, n1, pdg, g)); + if (tmp < 0) tmp = 0; + x[8] = kf_gammaq(1., tmp); + } + // free + free(pdg); + return 0; +} + +/* + Two-locus EM (LD) + */ + +#define _G1(h, k) ((h>>1&1) + (k>>1&1)) +#define _G2(h, k) ((h&1) + (k&1)) + +// 0: the previous site; 1: the current site +static int pair_freq_iter(int n, double *pdg[2], double f[4]) +{ + double ff[4]; + int i, k, h; +// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]); + memset(ff, 0, 4 * sizeof(double)); + for (i = 0; i < n; ++i) { + double *p[2], sum, tmp; + p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; + for 
(k = 0, sum = 0.; k < 4; ++k) + for (h = 0; h < 4; ++h) + sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; + for (k = 0; k < 4; ++k) { + tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) + + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) + + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) + + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); + ff[k] += f[k] * tmp / sum; + } + } + for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); + return 0; +} + +double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) +{ + const bcf1_t *b[2]; + int i, j, n_smpl; + double *pdg[2], flast[4], r, f0[2]; + // initialize others + if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples + n_smpl = b0->n_smpl; + b[0] = b0; b[1] = b1; + f[0] = f[1] = f[2] = f[3] = -1.; + if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only + pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1); + if (pdg[0] == 0 || pdg[1] == 0) { + free(pdg[0]); free(pdg[1]); + return -1; + } + // set the initial value + f0[0] = est_freq(n_smpl, pdg[0]); + f0[1] = est_freq(n_smpl, pdg[1]); + f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1]; + f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]); + // iteration + for (j = 0; j < ITER_MAX; ++j) { + double eps = 0; + memcpy(flast, f, 4 * sizeof(double)); + pair_freq_iter(n_smpl, pdg, f); + for (i = 0; i < 4; ++i) { + double x = fabs(f[i] - flast[i]); + if (x > eps) eps = x; + } + if (eps < EPS) break; + } + // free + free(pdg[0]); free(pdg[1]); + { // calculate r^2 + double p[2], q[2], D; + p[0] = f[0] + f[1]; q[0] = 1 - p[0]; + p[1] = f[0] + f[2]; q[1] = 1 - p[1]; + D = f[0] * f[3] - f[1] * f[2]; + r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); +// printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r); + if (isnan(r)) r = -1.; + } + return r; +} diff --git a/sam/bcftools/fet.c b/sam/bcftools/fet.c 
index 845f8c2..5812517 100644 --- a/sam/bcftools/fet.c +++ b/sam/bcftools/fet.c @@ -64,7 +64,8 @@ double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail - min = (n1_ + n_1 - n < 0) ? 0 : (n1_ + n_1 - n < 0); // min n11, for left tail + min = n1_ + n_1 - n; + if (min < 0) min = 0; // min n11, for left tail *two = *_left = *_right = 1.; if (min == max) return 1.; // no need to do test q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table @@ -79,6 +80,7 @@ double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double p = hypergeo_acc(max, 0, 0, 0, &aux); for (right = 0., j = max - 1; p < 0.99999999 * q; --j) // loop until underflow right += p, p = hypergeo_acc(j, 0, 0, 0, &aux); + ++j; if (p < 1.00000001 * q) right += p; else ++j; // two-tail diff --git a/sam/bcftools/kmin.c b/sam/bcftools/kmin.c new file mode 100644 index 0000000..5b8193b --- /dev/null +++ b/sam/bcftools/kmin.c @@ -0,0 +1,209 @@ +/* The MIT License + + Copyright (c) 2008, 2010 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Hooke-Jeeves algorithm for nonlinear minimization + + Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and + the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the + papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM + 6(6):313-314). The original algorithm was designed by Hooke and + Jeeves (ACM 8:212-229). This program is further revised according to + Johnson's implementation at Netlib (opt/hooke.c). + + Hooke-Jeeves algorithm is very simple and it works quite well on a + few examples. However, it might fail to converge due to its heuristic + nature. A possible improvement, as is suggested by Johnson, may be to + choose a small r at the beginning to quickly approach to the minimum + and a large r at later step to hit the minimum. 
+ */ + +#include +#include +#include +#include "kmin.h" + +static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls) +{ + int k, j = *n_calls; + double ftmp; + for (k = 0; k != n; ++k) { + x1[k] += dx[k]; + ftmp = func(n, x1, data); ++j; + if (ftmp < fx1) fx1 = ftmp; + else { /* search the opposite direction */ + dx[k] = 0.0 - dx[k]; + x1[k] += dx[k] + dx[k]; + ftmp = func(n, x1, data); ++j; + if (ftmp < fx1) fx1 = ftmp; + else x1[k] -= dx[k]; /* back to the original x[k] */ + } + } + *n_calls = j; + return fx1; /* here: fx1=f(n,x1) */ +} + +double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls) +{ + double fx, fx1, *x1, *dx, radius; + int k, n_calls = 0; + x1 = (double*)calloc(n, sizeof(double)); + dx = (double*)calloc(n, sizeof(double)); + for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */ + dx[k] = fabs(x[k]) * r; + if (dx[k] == 0) dx[k] = r; + } + radius = r; + fx1 = fx = func(n, x, data); ++n_calls; + for (;;) { + memcpy(x1, x, n * sizeof(double)); /* x1 = x */ + fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls); + while (fx1 < fx) { + for (k = 0; k != n; ++k) { + double t = x[k]; + dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]); + x[k] = x1[k]; + x1[k] = x1[k] + x1[k] - t; + } + fx = fx1; + if (n_calls >= max_calls) break; + fx1 = func(n, x1, data); ++n_calls; + fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls); + if (fx1 >= fx) break; + for (k = 0; k != n; ++k) + if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break; + if (k == n) break; + } + if (radius >= eps) { + if (n_calls >= max_calls) break; + radius *= r; + for (k = 0; k != n; ++k) dx[k] *= r; + } else break; /* converge */ + } + free(x1); free(dx); + return fx1; +} + +// I copied this function somewhere several years ago with some of my modifications, but I forgot the source. 
+double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin) +{ + double bound, u, r, q, fu, tmp, fa, fb, fc, c; + const double gold1 = 1.6180339887; + const double gold2 = 0.3819660113; + const double tiny = 1e-20; + const int max_iter = 100; + + double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw; + int iter; + + fa = func(a, data); fb = func(b, data); + if (fb > fa) { // swap, such that f(a) > f(b) + tmp = a; a = b; b = tmp; + tmp = fa; fa = fb; fb = tmp; + } + c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation + while (fb > fc) { + bound = b + 100.0 * (c - b); // the farthest point where we want to go + r = (b - a) * (fb - fc); + q = (b - c) * (fb - fa); + if (fabs(q - r) < tiny) { // avoid 0 denominator + tmp = q > r? tiny : 0.0 - tiny; + } else tmp = q - r; + u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point + if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c + fu = func(u, data); + if (fu < fc) { // (b,u,c) bracket the minimum + a = b; b = u; fa = fb; fb = fu; + break; + } else if (fu > fb) { // (a,b,u) bracket the minimum + c = u; fc = fu; + break; + } + u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation + } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound + fu = func(u, data); + if (fu < fc) { // fb > fc > fu + b = c; c = u; u = c + gold1 * (c - b); + fb = fc; fc = fu; fu = func(u, data); + } else { // (b,c,u) bracket the minimum + a = b; b = c; c = u; + fa = fb; fb = fc; fc = fu; + break; + } + } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound + u = bound; fu = func(u, data); + } else { // u goes the other way around, use golden section extrapolation + u = c + gold1 * (c - b); fu = func(u, data); + } + a = b; b = c; c = u; + fa = fb; fb = fc; fc = fu; + } + if (a > c) u = a, a = c, c = u; // swap + + // now, afb and fb tol1) { + // 
related to parabolic interpolation + r = (b - w) * (fb - fv); + q = (b - v) * (fb - fw); + p = (b - v) * q - (b - w) * r; + q = 2.0 * (q - r); + if (q > 0.0) p = 0.0 - p; + else q = 0.0 - q; + eold = e; e = d; + if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) { + d = gold2 * (e = (b >= mid ? a - b : c - b)); + } else { + d = p / q; u = b + d; // actual parabolic interpolation happens here + if (u - a < tol2 || c - u < tol2) + d = (mid > b)? tol1 : 0.0 - tol1; + } + } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation + u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1); + fu = func(u, data); + if (fu <= fb) { // u is the minimum point so far + if (u >= b) a = b; + else c = b; + v = w; w = b; b = u; fv = fw; fw = fb; fb = fu; + } else { // adjust (a,c) and (u,v,w) + if (u < b) a = u; + else c = u; + if (fu <= fw || w == b) { + v = w; w = u; + fv = fw; fw = fu; + } else if (fu <= fv || v == b || v == w) { + v = u; fv = fu; + } + } + } + *xmin = b; + return fb; +} diff --git a/sam/bcftools/kmin.h b/sam/bcftools/kmin.h new file mode 100644 index 0000000..6feba45 --- /dev/null +++ b/sam/bcftools/kmin.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2008, 2010 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#ifndef KMIN_H +#define KMIN_H + +#define KMIN_RADIUS 0.5 +#define KMIN_EPS 1e-7 +#define KMIN_MAXCALL 50000 + +typedef double (*kmin_f)(int, double*, void*); +typedef double (*kmin1_f)(double, void*); + +#ifdef __cplusplus +extern "C" { +#endif + + double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls); + double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sam/bcftools/ld.c b/sam/bcftools/ld.c deleted file mode 100644 index dc84d4b..0000000 --- a/sam/bcftools/ld.c +++ /dev/null @@ -1,100 +0,0 @@ -#include -#include -#include -#include "bcf.h" - -static double g_q2p[256]; - -#define LD_ITER_MAX 50 -#define LD_ITER_EPS 1e-4 - -#define _G1(h, k) ((h>>1&1) + (k>>1&1)) -#define _G2(h, k) ((h&1) + (k&1)) - -// 0: the previous site; 1: the current site -static int freq_iter(int n, double *pdg[2], double f[4]) -{ - double ff[4]; - int i, k, h; - memset(ff, 0, 4 * sizeof(double)); - for (i = 0; i < n; ++i) { - double *p[2], sum, tmp; - p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; - for (k = 0, sum = 0.; k < 4; ++k) - for (h = 0; h < 4; ++h) - sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; - for (k = 0; k < 4; ++k) { - tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) - + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) - + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) - + 
f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); - ff[k] += f[k] * tmp / sum; - } - } - for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); - return 0; -} - -double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) -{ - const bcf1_t *b[2]; - uint8_t *PL[2]; - int i, j, PL_len[2], n_smpl; - double *pdg[2], flast[4], r; - // initialize g_q2p if necessary - if (g_q2p[0] == 0.) - for (i = 0; i < 256; ++i) - g_q2p[i] = pow(10., -i / 10.); - // initialize others - if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples - n_smpl = b0->n_smpl; - b[0] = b0; b[1] = b1; - f[0] = f[1] = f[2] = f[3] = -1.; - if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only - // set PL and PL_len - for (j = 0; j < 2; ++j) { - const bcf1_t *bj = b[j]; - for (i = 0; i < bj->n_gi; ++i) { - if (bj->gi[i].fmt == bcf_str2int("PL", 2)) { - PL[j] = (uint8_t*)bj->gi[i].data; - PL_len[j] = bj->gi[i].len; - break; - } - } - if (i == bj->n_gi) return -1; // no PL - } - // fill pdg[2] - pdg[0] = malloc(3 * n_smpl * sizeof(double)); - pdg[1] = malloc(3 * n_smpl * sizeof(double)); - for (j = 0; j < 2; ++j) { - for (i = 0; i < n_smpl; ++i) { - const uint8_t *pi = PL[j] + i * PL_len[j]; - double *p = pdg[j] + i * 3; - p[0] = g_q2p[pi[b[j]->n_alleles]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; - } - } - // iteration - f[0] = f[1] = f[2] = f[3] = 0.25; // this is a really bad guess... 
- for (j = 0; j < LD_ITER_MAX; ++j) { - double eps = 0; - memcpy(flast, f, 4 * sizeof(double)); - freq_iter(n_smpl, pdg, f); - for (i = 0; i < 4; ++i) { - double x = fabs(f[i] - flast[i]); - if (x > eps) eps = x; - } - if (eps < LD_ITER_EPS) break; - } - // free - free(pdg[0]); free(pdg[1]); - { // calculate r^2 - double p[2], q[2], D; - p[0] = f[0] + f[1]; q[0] = 1 - p[0]; - p[1] = f[0] + f[2]; q[1] = 1 - p[1]; - D = f[0] * f[3] - f[1] * f[2]; - r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); - // fprintf(stderr, "R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r2); - if (isnan(r)) r = -1.; - } - return r; -} diff --git a/sam/bcftools/main.c b/sam/bcftools/main.c index 7ffc2a0..fcd94b8 100644 --- a/sam/bcftools/main.c +++ b/sam/bcftools/main.c @@ -1,8 +1,12 @@ #include #include #include +#include #include "bcf.h" +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 0x10000) + int bcfview(int argc, char *argv[]); int bcf_main_index(int argc, char *argv[]); @@ -42,20 +46,142 @@ int bcf_cat(int n, char * const *fn) return 0; } +extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); + +int bcf_main_ldpair(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t *b0, *b1; + bcf_idx_t *idx; + kstring_t str; + void *str2id; + gzFile fplist; + kstream_t *ks; + int dret, lineno = 0; + if (argc < 3) { + fprintf(stderr, "Usage: bcftools ldpair \n"); + return 1; + } + fplist = gzopen(argv[2], "rb"); + ks = ks_init(fplist); + memset(&str, 0, sizeof(kstring_t)); + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + str2id = bcf_build_refhash(h); + idx = bcf_idx_load(argv[1]); + if (idx == 0) { + fprintf(stderr, "[%s] No bcf index is found. 
Abort!\n", __func__); + return 1; + } + b0 = calloc(1, sizeof(bcf1_t)); + b1 = calloc(1, sizeof(bcf1_t)); + while (ks_getuntil(ks, '\n', &str, &dret) >= 0) { + char *p, *q; + int k; + int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1; + ++lineno; + for (p = q = str.s, k = 0; *p; ++p) { + if (*p == ' ' || *p == '\t') { + *p = '\0'; + if (k == 0) tid0 = bcf_str2id(str2id, q); + else if (k == 1) pos0 = atoi(q) - 1; + else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0; + else if (k == 3) pos1 = atoi(q) - 1; + q = p + 1; + ++k; + } + } + if (k == 3) pos1 = atoi(q) - 1; + if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) { + uint64_t off; + double r, f[4]; + off = bcf_idx_query(idx, tid0, pos0); + bgzf_seek(fp->fp, off, SEEK_SET); + while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0); + off = bcf_idx_query(idx, tid1, pos1); + bgzf_seek(fp->fp, off, SEEK_SET); + while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1); + r = bcf_pair_freq(b0, b1, f); + r *= r; + printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1, + r, f[0], f[1], f[2], f[3]); + } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno); + } + bcf_destroy(b0); bcf_destroy(b1); + bcf_idx_destroy(idx); + bcf_str2id_destroy(str2id); + bcf_hdr_destroy(h); + bcf_close(fp); + free(str.s); + ks_destroy(ks); + gzclose(fplist); + return 0; +} + +int bcf_main_ld(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t **b, *b0; + int i, j, m, n; + double f[4]; + if (argc == 1) { + fprintf(stderr, "Usage: bcftools ld \n"); + return 1; + } + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + // read the entire BCF + m = n = 0; b = 0; + b0 = calloc(1, sizeof(bcf1_t)); + while (bcf_read(fp, h, b0) >= 0) { + if (m == n) { + m = m? 
m<<1 : 16; + b = realloc(b, sizeof(void*) * m); + } + b[n] = calloc(1, sizeof(bcf1_t)); + bcf_cpy(b[n++], b0); + } + bcf_destroy(b0); + // compute pair-wise r^2 + printf("%d\n", n); // the number of loci + for (i = 0; i < n; ++i) { + printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); + for (j = 0; j < i; ++j) { + double r = bcf_pair_freq(b[i], b[j], f); + printf("\t%.3f", r*r); + } + printf("\t1.000\n"); + } + // free + for (i = 0; i < n; ++i) bcf_destroy(b[i]); + free(b); + bcf_hdr_destroy(h); + bcf_close(fp); + return 0; +} + int main(int argc, char *argv[]) { if (argc == 1) { fprintf(stderr, "\n"); + fprintf(stderr, "Program: bcftools (Tools for data in the VCF/BCF formats)\n"); + fprintf(stderr, "Version: %s\n\n", BCF_VERSION); fprintf(stderr, "Usage: bcftools \n\n"); fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); fprintf(stderr, " index index BCF\n"); fprintf(stderr, " cat concatenate BCFs\n"); + fprintf(stderr, " ld compute all-pair r^2\n"); + fprintf(stderr, " ldpair compute r^2 between requested pairs\n"); fprintf(stderr, "\n"); return 1; } if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); - else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); + else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1); + else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1); + else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... 
else { fprintf(stderr, "[main] Unrecognized command.\n"); return 1; diff --git a/sam/bcftools/mut.c b/sam/bcftools/mut.c new file mode 100644 index 0000000..15ef265 --- /dev/null +++ b/sam/bcftools/mut.c @@ -0,0 +1,127 @@ +#include +#include +#include "bcf.h" + +#define MAX_GENO 359 + +int8_t seq_bitcnt[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; +char *seq_nt16rev = "XACMGRSVTWYHKDBN"; + +uint32_t *bcf_trio_prep(int is_x, int is_son) +{ + int i, j, k, n, map[10]; + uint32_t *ret; + ret = calloc(MAX_GENO, 4); + for (i = 0, k = 0; i < 4; ++i) + for (j = i; j < 4; ++j) + map[k++] = 1<n_smpl != 3) return -1; // not a trio + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + gl10 = alloca(10 * b->n_smpl); + if (bcf_gl10(b, gl10) < 0) { + if (bcf_gl10_indel(b, gl10) < 0) return -1; + } + PL = b->gi + i; + for (i = 0, k = 0; i < 4; ++i) + for (j = i; j < 4; ++j) + map[k++] = seq_nt16rev[1<data)[j * PL->len] != 0) break; + if (j < 3) { // we need to go through the complex procedure + uint8_t *g[3]; + int minc = 1<<30, minc_j = -1, minf = 0, gtf = 0, gtc = 0; + g[0] = gl10; + g[1] = gl10 + 10; + g[2] = gl10 + 20; + for (j = 1; j <= (int)prep[0]; ++j) { // compute LK with constraint + int sum = g[0][prep[j]&0xff] + g[1][prep[j]>>8&0xff] + g[2][prep[j]>>16&0xff]; + if (sum < minc) minc = sum, minc_j = j; + } + gtc |= map[prep[minc_j]&0xff]; gtc |= map[prep[minc_j]>>8&0xff]<<8; gtc |= map[prep[minc_j]>>16]<<16; + for (j = 0; j < 3; ++j) { // compute LK without constraint + int min = 1<<30, min_k = -1; + for (k = 0; k < 10; ++k) + if (g[j][k] < min) min = g[j][k], min_k = k; + gtf |= map[min_k]<<(j*8); + minf += min; + } + *llr = minc - minf; *gt = (int64_t)gtc<<32 | gtf; + } else *llr = 0, *gt = -1; + return 0; +} + +int bcf_pair_call(const bcf1_t *b) +{ + int i, j, k; + const bcf_ginfo_t *PL; + if (b->n_smpl != 2) return -1; // not a pair + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == 
bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (j = 0; j < 2; ++j) // check if ref hom is the most probable in all members + if (((uint8_t*)PL->data)[j * PL->len] != 0) break; + if (j < 2) { // we need to go through the complex procedure + uint8_t *g[2]; + int minc = 1<<30, minf = 0; + g[0] = PL->data; + g[1] = (uint8_t*)PL->data + PL->len; + for (j = 0; j < PL->len; ++j) // compute LK with constraint + minc = minc < g[0][j] + g[1][j]? minc : g[0][j] + g[1][j]; + for (j = 0; j < 2; ++j) { // compute LK without constraint + int min = 1<<30; + for (k = 0; k < PL->len; ++k) + min = min < g[j][k]? min : g[j][k]; + minf += min; + } + return minc - minf; + } else return 0; +} + +int bcf_min_diff(const bcf1_t *b) +{ + int i, min = 1<<30; + const bcf_ginfo_t *PL; + for (i = 0; i < b->n_gi; ++i) + if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; + if (i == b->n_gi) return -1; // no PL + PL = b->gi + i; + for (i = 0; i < b->n_smpl; ++i) { + int m1, m2, j; + const uint8_t *p = (uint8_t*)PL->data; + m1 = m2 = 1<<30; + for (j = 0; j < PL->len; ++j) { + if ((int)p[j] < m1) m2 = m1, m1 = p[j]; + else if ((int)p[j] < m2) m2 = p[j]; + } + min = min < m2 - m1? 
min : m2 - m1; + } + return min; +} diff --git a/sam/bcftools/prob1.c b/sam/bcftools/prob1.c index 8bf968f..a380484 100644 --- a/sam/bcftools/prob1.c +++ b/sam/bcftools/prob1.c @@ -3,13 +3,14 @@ #include #include #include +#include #include "prob1.h" #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) #define MC_MAX_EM_ITER 16 -#define MC_EM_EPS 1e-4 +#define MC_EM_EPS 1e-5 #define MC_DEF_INDEL 0.15 unsigned char seq_nt4_table[256] = { @@ -32,24 +33,20 @@ unsigned char seq_nt4_table[256] = { }; struct __bcf_p1aux_t { - int n, M, n1, is_indel, is_folded; + int n, M, n1, is_indel; + uint8_t *ploidy; // haploid or diploid ONLY double *q2p, *pdg; // pdg -> P(D|g) double *phi, *phi_indel; double *z, *zswap; // aux for afs double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set + double **hg; // hypergeometric distribution + double *lf; // log factorial double t, t1, t2; double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution const uint8_t *PL; // point to PL int PL_len; }; -static void fold_array(int M, double *x) -{ - int k; - for (k = 0; k < M/2; ++k) - x[k] = x[M-k] = (x[k] + x[M-k]) / 2.; -} - void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) { int i; @@ -130,27 +127,38 @@ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn) return 0; } -bcf_p1aux_t *bcf_p1_init(int n) +bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy) { bcf_p1aux_t *ma; int i; ma = calloc(1, sizeof(bcf_p1aux_t)); ma->n1 = -1; ma->n = n; ma->M = 2 * n; + if (ploidy) { + ma->ploidy = malloc(n); + memcpy(ma->ploidy, ploidy, n); + for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i]; + if (ma->M == 2 * n) { + free(ma->ploidy); + ma->ploidy = 0; + } + } ma->q2p = calloc(256, sizeof(double)); ma->pdg = calloc(3 * ma->n, sizeof(double)); ma->phi = calloc(ma->M + 1, sizeof(double)); ma->phi_indel = calloc(ma->M + 1, sizeof(double)); ma->phi1 = calloc(ma->M + 1, sizeof(double)); ma->phi2 = calloc(ma->M + 1, sizeof(double)); - ma->z = calloc(2 * ma->n + 1, 
sizeof(double)); - ma->zswap = calloc(2 * ma->n + 1, sizeof(double)); + ma->z = calloc(ma->M + 1, sizeof(double)); + ma->zswap = calloc(ma->M + 1, sizeof(double)); ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large ma->z2 = calloc(ma->M + 1, sizeof(double)); - ma->afs = calloc(2 * ma->n + 1, sizeof(double)); - ma->afs1 = calloc(2 * ma->n + 1, sizeof(double)); + ma->afs = calloc(ma->M + 1, sizeof(double)); + ma->afs1 = calloc(ma->M + 1, sizeof(double)); + ma->lf = calloc(ma->M + 1, sizeof(double)); for (i = 0; i < 256; ++i) ma->q2p[i] = pow(10., -i / 10.); + for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1); bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior return ma; } @@ -158,23 +166,24 @@ bcf_p1aux_t *bcf_p1_init(int n) int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) { if (n1 == 0 || n1 >= b->n) return -1; + if (b->M != b->n * 2) { + fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); + return -1; + } b->n1 = n1; return 0; } -void bcf_p1_set_folded(bcf_p1aux_t *p1a) -{ - if (p1a->n1 < 0) { - p1a->is_folded = 1; - fold_array(p1a->M, p1a->phi); - fold_array(p1a->M, p1a->phi_indel); - } -} - void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { - free(ma->q2p); free(ma->pdg); + int k; + free(ma->lf); + if (ma->hg && ma->n1 > 0) { + for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]); + free(ma->hg); + } + free(ma->ploidy); free(ma->q2p); free(ma->pdg); free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2); free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2); free(ma->afs); free(ma->afs1); @@ -184,18 +193,16 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) { - int i, j, k; + int i, j; long *p, tmp; p = alloca(b->n_alleles * sizeof(long)); memset(p, 0, sizeof(long) * b->n_alleles); for (j = 0; j < ma->n; ++j) { const uint8_t *pi = ma->PL + j * ma->PL_len; double *pdg = ma->pdg + j * 3; - pdg[0] = ma->q2p[pi[b->n_alleles]]; pdg[1] = 
ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; - for (i = k = 0; i < b->n_alleles; ++i) { - p[i] += (int)pi[k]; - k += b->n_alleles - i; - } + pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; + for (i = 0; i < b->n_alleles; ++i) + p[i] += (int)pi[(i+1)*(i+2)/2-1]; } for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; for (i = 1; i < b->n_alleles; ++i) // insertion sort @@ -205,28 +212,18 @@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) if ((p[i]&0xf) == 0) break; return i; } -// f0 is the reference allele frequency -static double mc_freq_iter(double f0, const bcf_p1aux_t *ma) -{ - double f, f3[3]; - int i; - f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; - for (i = 0, f = 0.; i < ma->n; ++i) { - double *pdg; - pdg = ma->pdg + i * 3; - f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) - / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); - } - f /= ma->n * 2.; - return f; -} int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; double max, f3[3], *pdg = ma->pdg + k * 3; - int q, i, max_i; - f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + int q, i, max_i, ploidy; + ploidy = ma->ploidy? ma->ploidy[k] : 2; + if (ploidy == 2) { + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + } else { + f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0; + } for (i = 0, sum = 0.; i < 3; ++i) sum += (g[i] = pdg[i] * f3[i]); for (i = 0, max = -1., max_i = 0; i < 3; ++i) { @@ -246,6 +243,7 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) { double *z[2], *tmp, *pdg; int _j, last_min, last_max; + assert(beg == 0 || ma->M == ma->n*2); z[0] = ma->z; z[1] = ma->zswap; pdg = ma->pdg; @@ -254,41 +252,81 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) z[0][0] = 1.; last_min = last_max = 0; ma->t = 0.; - for (_j = beg; _j < ma->n; ++_j) { - int k, j = _j - beg, _min = last_min, _max = last_max; - double p[3], sum; - pdg = ma->pdg + _j * 3; - p[0] = pdg[0]; p[1] = 2. 
* pdg[1]; p[2] = pdg[2]; - for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; - for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; - _max += 2; - if (_min == 0) - k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]; - if (_min <= 1) - k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1]; - for (k = _min < 2? 2 : _min; k <= _max; ++k) - z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] - + k*(2*j+2-k) * p[1] * z[0][k-1] - + k*(k-1)* p[2] * z[0][k-2]; - for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; - ma->t += log(sum / ((2. * j + 2) * (2. * j + 1))); - for (k = _min; k <= _max; ++k) z[1][k] /= sum; - if (_min >= 1) z[1][_min-1] = 0.; - if (_min >= 2) z[1][_min-2] = 0.; - if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; - if (_j == ma->n1 - 1) { // set pop1 - ma->t1 = ma->t; - memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); + if (ma->M == ma->n * 2) { + int M = 0; + for (_j = beg; _j < ma->n; ++_j) { + int k, j = _j - beg, _min = last_min, _max = last_max, M0; + double p[3], sum; + M0 = M; M += 2; + pdg = ma->pdg + _j * 3; + p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; + for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; + for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; + _max += 2; + if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; + if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; + for (k = _min < 2? 
2 : _min; k <= _max; ++k) + z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / (M * (M - 1.))); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (_min >= 2) z[1][_min-2] = 0.; + if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; + if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset + ma->t1 = ma->t; + memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); + } + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + last_min = _min; last_max = _max; + } + //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary? + //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.; + } else { // this block is very similar to the block above; these two might be merged in future + int j, M = 0; + for (j = 0; j < ma->n; ++j) { + int k, M0, _min = last_min, _max = last_max; + double p[3], sum; + pdg = ma->pdg + j * 3; + for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; + for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; + M0 = M; + M += ma->ploidy[j]; + if (ma->ploidy[j] == 1) { + p[0] = pdg[0]; p[1] = pdg[2]; + _max++; + if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k]; + for (k = _min < 1? 1 : _min; k <= _max; ++k) + z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / M); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (j < ma->n - 1) z[1][_max+1] = 0.; + } else if (ma->ploidy[j] == 2) { + p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2]; + _max += 2; + if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; + if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; + for (k = _min < 2? 
2 : _min; k <= _max; ++k) + z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / (M * (M - 1.))); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (_min >= 2) z[1][_min-2] = 0.; + if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; + } + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + last_min = _min; last_max = _max; } - tmp = z[0]; z[0] = z[1]; z[1] = tmp; - last_min = _min; last_max = _max; } if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); } static void mc_cal_y(bcf_p1aux_t *ma) { - if (ma->n1 > 0 && ma->n1 < ma->n) { + if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples int k; long double x; memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1)); @@ -304,41 +342,131 @@ static void mc_cal_y(bcf_p1aux_t *ma) } else mc_cal_y_core(ma, 0); } -static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called before hand +#define CONTRAST_TINY 1e-30 + +extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test + +static inline double chi2_test(int a, int b, int c, int d) +{ + double x, z; + x = (double)(a+b) * (c+d) * (b+d) * (a+c); + if (x == 0.) 
return 1; + z = a * d - b * c; + return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x); +} + +// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)] +static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3]) { - int k, n1 = ma->n1, n2 = ma->n - ma->n1; - long double sum1, sum2; - pc[0] = pc[1] = pc[2] = pc[3] = -1.; - if (n1 <= 0 || n2 <= 0) return; - for (k = 0, sum1 = 0.; k <= 2*n1; ++k) sum1 += ma->phi1[k] * ma->z1[k]; - for (k = 0, sum2 = 0.; k <= 2*n2; ++k) sum2 += ma->phi2[k] * ma->z2[k]; - pc[2] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1; - pc[3] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2; - for (k = 2; k < 4; ++k) { - pc[k] = pc[k] > .5? -(-4.343 * log(1. - pc[k] + TINY) + .499) : -4.343 * log(pc[k] + TINY) + .499; - pc[k] = (int)pc[k]; - if (pc[k] > 99) pc[k] = 99; - if (pc[k] < -99) pc[k] = -99; + double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2]; + int n1 = p1->n1, n2 = p1->n - p1->n1; + if (p < CONTRAST_TINY) return -1; + if (.5*k1/n1 < .5*k2/n2) x[1] += p; + else if (.5*k1/n1 > .5*k2/n2) x[2] += p; + else x[0] += p; + return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2); +} + +static double contrast2(bcf_p1aux_t *p1, double ret[3]) +{ + int k, k1, k2, k10, k20, n1, n2; + double sum; + // get n1 and n2 + n1 = p1->n1; n2 = p1->n - p1->n1; + if (n1 <= 0 || n2 <= 0) return 0.; + if (p1->hg == 0) { // initialize the hypergeometric distribution + /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way + to avoid precomputing this matrix, but it is slower and quite intricate. The following + computation in this block can be accelerated with a similar strategy, but perhaps this + is not a serious concern for now. 
*/ + double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1)); + p1->hg = calloc(2*n1+1, sizeof(void*)); + for (k1 = 0; k1 <= 2*n1; ++k1) { + p1->hg[k1] = calloc(2*n2+1, sizeof(double)); + for (k2 = 0; k2 <= 2*n2; ++k2) + p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp)); + } + } + { // compute + long double suml = 0; + for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k]; + sum = suml; + } + { // get the max k1 and k2 + double max; + int max_k; + for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) { + double x = p1->phi1[k] * p1->z1[k]; + if (x > max) max = x, max_k = k; + } + k10 = max_k; + for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) { + double x = p1->phi2[k] * p1->z2[k]; + if (x > max) max = x, max_k = k; + } + k20 = max_k; + } + { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N. + double x[3], y; + long double z = 0., L[2]; + x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0; + for (k1 = k10; k1 >= 0; --k1) { + for (k2 = k20; k2 >= 0; --k2) { + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; + else z += y; + } + for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; + else z += y; + } + } + ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2]; + x[0] = x[1] = x[2] = 0; + for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { + for (k2 = k20; k2 >= 0; --k2) { + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; + else z += y; + } + for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { + if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; + else z += y; + } + } + ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2]; + if (ret[0] + ret[1] + ret[2] < 0.95) { // in case of bad things happened + ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0; + for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) + for (k2 = 0; k2 <= 2*n2; ++k2) + if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y; 
+ if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why... + z = 1.0, ret[0] = ret[1] = ret[2] = 1./3; + } + return (double)z; } - pc[0] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2 * (1. - ma->phi1[2*n1] * ma->z1[2*n1] / sum1); - pc[1] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1 * (1. - ma->phi2[2*n2] * ma->z2[2*n2] / sum2); - pc[0] = pc[0] == 1.? 99 : (int)(-4.343 * log(1. - pc[0]) + .499); - pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499); } -static double mc_cal_afs(bcf_p1aux_t *ma) +static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) { int k; - long double sum = 0.; + long double sum = 0., sum2; double *phi = ma->is_indel? ma->phi_indel : ma->phi; memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); mc_cal_y(ma); + // compute AFS for (k = 0, sum = 0.; k <= ma->M; ++k) sum += (long double)phi[k] * ma->z[k]; for (k = 0; k <= ma->M; ++k) { ma->afs1[k] = phi[k] * ma->z[k] / sum; if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; } + // compute folded variant probability + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; + for (k = 1, sum2 = 0.; k < ma->M; ++k) + sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; + *p_var_folded = sum2 / sum; + *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum; + // the expected frequency for (k = 0, sum = 0.; k <= ma->M; ++k) { ma->afs[k] += ma->afs1[k]; sum += k * ma->afs1[k]; @@ -346,37 +474,12 @@ static double mc_cal_afs(bcf_p1aux_t *ma) return sum / ma->M; } -long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]) -{ - long double pd = 0., g2[3]; - int i, k; - memset(g2, 0, sizeof(long double) * 3); - for (k = 0; k < p1a->M; ++k) { - double f = (double)k / p1a->M, f3[3], g1[3]; - long double z = 1.; - g1[0] = g1[1] = g1[2] = 0.; - f3[0] = (1. - f) * (1. - f); f3[1] = 2. * f * (1. 
- f); f3[2] = f * f; - for (i = 0; i < p1a->n; ++i) { - double *pdg = p1a->pdg + i * 3; - double x = pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; - z *= x; - g1[0] += pdg[0] * f3[0] / x; - g1[1] += pdg[1] * f3[1] / x; - g1[2] += pdg[2] * f3[2] / x; - } - pd += p1a->phi[k] * z; - for (i = 0; i < 3; ++i) - g2[i] += p1a->phi[k] * z * g1[i]; - } - for (i = 0; i < 3; ++i) g[i] = g2[i] / pd; - return pd; -} - -int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) +int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { int i, k; long double sum = 0.; ma->is_indel = bcf_is_indel(b); + rst->perm_rank = -1; // set PL and PL_len for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) { @@ -385,11 +488,22 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) break; } } + if (i == b->n_gi) return -1; // no PL if (b->n_alleles < 2) return -1; // FIXME: find a better solution // rst->rank0 = cal_pdg(b, ma); - rst->f_exp = mc_cal_afs(ma); - rst->p_ref = ma->is_folded? 
ma->afs1[ma->M] + ma->afs1[0] : ma->afs1[ma->M]; + rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded); + rst->p_ref = ma->afs1[ma->M]; + for (k = 0, sum = 0.; k < ma->M; ++k) + sum += ma->afs1[k]; + rst->p_var = (double)sum; + { // compute the allele count + double max = -1; + rst->ac = -1; + for (k = 0; k <= ma->M; ++k) + if (max < ma->z[k]) max = ma->z[k], rst->ac = k; + rst->ac = ma->M - rst->ac; + } // calculate f_flat and f_em for (k = 0, sum = 0.; k <= ma->M; ++k) sum += (long double)ma->z[k]; @@ -399,36 +513,39 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) rst->f_flat += k * p; } rst->f_flat /= ma->M; - { // calculate f_em - double flast = rst->f_flat; - for (i = 0; i < MC_MAX_EM_ITER; ++i) { - rst->f_em = mc_freq_iter(flast, ma); - if (fabs(rst->f_em - flast) < MC_EM_EPS) break; - flast = rst->f_em; - } - } { // estimate equal-tail credible interval (95% level) int l, h; double p; - for (i = 0, p = 0.; i < ma->M; ++i) + for (i = 0, p = 0.; i <= ma->M; ++i) if (p + ma->afs1[i] > 0.025) break; else p += ma->afs1[i]; l = i; - for (i = ma->M-1, p = 0.; i >= 0; --i) + for (i = ma->M, p = 0.; i >= 0; --i) if (p + ma->afs1[i] > 0.025) break; else p += ma->afs1[i]; h = i; rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; } - rst->g[0] = rst->g[1] = rst->g[2] = -1.; - contrast(ma, rst->pc); + if (ma->n1 > 0) { // compute LRT + double max0, max1, max2; + for (k = 0, max0 = -1; k <= ma->M; ++k) + if (max0 < ma->z[k]) max0 = ma->z[k]; + for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k) + if (max1 < ma->z1[k]) max1 = ma->z1[k]; + for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k) + if (max2 < ma->z2[k]) max2 = ma->z2[k]; + rst->lrt = log(max1 * max2 / max0); + rst->lrt = rst->lrt < 0? 
1 : kf_gammaq(.5, rst->lrt); + } else rst->lrt = -1.0; + rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; + if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant + rst->p_chi2 = contrast2(ma, rst->cmp); return 0; } void bcf_p1_dump_afs(bcf_p1aux_t *ma) { int k; - if (ma->is_folded) fold_array(ma->M, ma->afs); fprintf(stderr, "[afs]"); for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); diff --git a/sam/bcftools/prob1.h b/sam/bcftools/prob1.h index 3827534..0a51a0a 100644 --- a/sam/bcftools/prob1.h +++ b/sam/bcftools/prob1.h @@ -7,11 +7,11 @@ struct __bcf_p1aux_t; typedef struct __bcf_p1aux_t bcf_p1aux_t; typedef struct { - int rank0; - double f_em, f_exp, f_flat, p_ref; + int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() + int ac; // ML alternative allele count + double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; double cil, cih; - double pc[4]; - double g[3]; + double cmp[3], p_chi2, lrt; // used by contrast2() } bcf_p1rst_t; #define MC_PTYPE_FULL 1 @@ -22,18 +22,19 @@ typedef struct { extern "C" { #endif - bcf_p1aux_t *bcf_p1_init(int n); + bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy); void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_destroy(bcf_p1aux_t *ma); - int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst); + int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); - long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]); int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called + int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]); + #ifdef __cplusplus } #endif diff 
--git a/sam/bcftools/vcf.c b/sam/bcftools/vcf.c index 9b661ff..9daa845 100644 --- a/sam/bcftools/vcf.c +++ b/sam/bcftools/vcf.c @@ -72,6 +72,33 @@ bcf_t *vcf_open(const char *fn, const char *mode) return bp; } +int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn) +{ + vcf_t *v; + gzFile fp; + kstream_t *ks; + kstring_t s, rn; + int dret; + if (bp == 0) return -1; + if (!bp->is_vcf) return 0; + s.l = s.m = 0; s.s = 0; + rn.m = rn.l = h->l_nm; rn.s = h->name; + v = (vcf_t*)bp->v; + fp = gzopen(fn, "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + bcf_str2id_add(v->refhash, strdup(s.s)); + kputs(s.s, &rn); kputc('\0', &rn); + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ks_destroy(ks); + gzclose(fp); + h->l_nm = rn.l; h->name = rn.s; + bcf_hdr_sync(h); + free(s.s); + return 0; +} + int vcf_close(bcf_t *bp) { vcf_t *v; @@ -84,7 +111,7 @@ int vcf_close(bcf_t *bp) } if (v->fpout) fclose(v->fpout); free(v->line.s); - bcf_str2id_destroy(v->refhash); + bcf_str2id_thorough_destroy(v->refhash); free(v); free(bp); return 0; @@ -93,15 +120,14 @@ int vcf_close(bcf_t *bp) int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) { vcf_t *v = (vcf_t*)bp->v; - int i, has_ref = 0, has_ver = 0; + int i, has_ver = 0; if (!bp->is_vcf) return bcf_hdr_write(bp, h); if (h->l_txt > 0) { if (strstr(h->txt, "##fileformat=")) has_ver = 1; - if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); + if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); fwrite(h->txt, 1, h->l_txt - 1, v->fpout); - if (strstr(h->txt, "##SQ=")) has_ref = 1; } - if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); + if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); for (i = 0; i < h->n_smpl; ++i) fprintf(v->fpout, "\t%s", h->sns[i]); @@ -138,7 +164,7 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) if (k == 0) { // ref int tid = bcf_str2id(v->refhash, p); if (tid < 
0) { - tid = bcf_str2id_add(v->refhash, p); + tid = bcf_str2id_add(v->refhash, strdup(p)); kputs(p, &rn); kputc('\0', &rn); sync = 1; } @@ -156,8 +182,10 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = 1<<7; - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = 0; + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + ((int32_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { ((uint16_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { @@ -173,11 +201,15 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6; - } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { double _x = strtod(q, &q); int x = (int)(_x + .499); if (x > 255) x = 255; ((uint8_t*)b->gi[i].data)[k-9] = x; + } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { + int x = strtol(q, &q, 10); + if (x > 0xffff) x = 0xffff; + ((uint32_t*)b->gi[i].data)[k-9] = x; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; @@ -198,7 +230,7 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) y = b->n_alleles * (b->n_alleles + 1) / 2; for (j = 0; j < y; ++j) { x = strtod(q, &q); - data[(k-9) * y + j] = x; + data[(k-9) * y + j] = x > 0? -x/10. 
: x; ++q; } } diff --git a/sam/bcftools/vcfutils.pl b/sam/bcftools/vcfutils.pl index cd86b0f..2b7ba0b 100755 --- a/sam/bcftools/vcfutils.pl +++ b/sam/bcftools/vcfutils.pl @@ -14,7 +14,7 @@ sub main { my $command = shift(@ARGV); my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats, - gapstats=>\&gapstats, splitchr=>\&splitchr); + gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } @@ -86,7 +86,7 @@ sub fillac { print; } else { my @t = split; - my @c = (0); + my @c = (0, 0); my $n = 0; my $s = -1; @_ = split(":", $t[8]); @@ -215,8 +215,8 @@ Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions # } sub varFilter { - my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4); - getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts); + my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4); + getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); die(qq/ Usage: vcfutils.pl varFilter [options] @@ -230,6 +230,7 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] -2 FLOAT min P-value for baseQ bias [$opts{2}] -3 FLOAT min P-value for mapQ bias [$opts{3}] -4 FLOAT min P-value for end distance bias [$opts{4}] + -e FLOAT min P-value for HWE (plus F<0) [$opts{e}] -p print filtered variants Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. @@ -246,6 +247,7 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. 
print; next; } next if ($t[4] eq '.'); # skip non-var sites + next if ($t[3] eq 'N'); # skip sites with unknown ref ('N') # check if the site is a SNP my $type = 1; # SNP if (length($t[3]) > 1) { @@ -289,6 +291,13 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); + $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S}))); + # HWE filter + if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) { + my $p = 2*$1 + $2; + my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0; + $flt = 9 if ($f < 0); + } my $score = $t[5] * 100 + $dp_alt; my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs @@ -311,7 +320,10 @@ Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. } else { # SNP or MNP for my $x (@staging) { next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); - $flt = 5; + if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1) + && length($x->[7]) - length($x->[6]) == 1) { + $x->[1] = 5; + } else { $flt = 5; } last; } # check MNP @@ -338,7 +350,7 @@ sub varFilter_aux { if ($first->[1] == 0) { print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { - print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; + print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. 
@$first-1]), "\n"; } } @@ -454,6 +466,87 @@ sub hapmap2vcf { } } +sub vcf2fq { + my %opts = (d=>3, D=>100000, Q=>10, l=>5); + getopts('d:D:Q:l:', \%opts); + die(qq/ +Usage: vcfutils.pl vcf2fq [options] + +Options: -d INT minimum depth [$opts{d}] + -D INT maximum depth [$opts{D}] + -Q INT min RMS mapQ [$opts{Q}] + -l INT INDEL filtering window [$opts{l}] +\n/) if (@ARGV == 0 && -t STDIN); + + my ($last_chr, $seq, $qual, $last_pos, @gaps); + my $_Q = $opts{Q}; + my $_d = $opts{d}; + my $_D = $opts{D}; + + my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y', + GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); + + $last_chr = ''; + while (<>) { + next if (/^#/); + my @t = split; + if ($last_chr ne $t[0]) { + &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); + ($last_chr, $last_pos) = ($t[0], 0); + $seq = $qual = ''; + @gaps = (); + } + die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0); + if ($t[1] - $last_pos > 1) { + $seq .= 'n' x ($t[1] - $last_pos - 1); + $qual .= '!' x ($t[1] - $last_pos - 1); + } + if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference + my ($ref, $alt) = ($t[3], $1); + my ($b, $q); + $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); + if ($q < 0) { + $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0; + $b = ($_ < .5 || $alt eq '.')? $ref : $alt; + $q = -$q; + } else { + $b = $het{"$ref$alt"}; + $b ||= 'N'; + } + $b = lc($b); + $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); + $q = int($q + 33 + .499); + $q = chr($q <= 126? $q : 126); + $seq .= $b; + $qual .= $q; + } elsif ($t[4] ne '.') { # an INDEL + push(@gaps, [$t[1], length($t[3])]); + } + $last_pos = $t[1]; + } + &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); +} + +sub v2q_post_process { + my ($chr, $seq, $qual, $gaps, $l) = @_; + for my $g (@$gaps) { + my $beg = $g->[0] > $l? 
$g->[0] - $l : 0; + my $end = $g->[0] + $g->[1] + $l; + $end = length($$seq) if ($end > length($$seq)); + substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg)); + } + print "\@$chr\n"; &v2q_print_str($seq); + print "+\n"; &v2q_print_str($qual); +} + +sub v2q_print_str { + my ($s) = @_; + my $l = length($$s); + for (my $i = 0; $i < $l; $i += 60) { + print substr($$s, $i, 60), "\n"; + } +} + sub usage { die(qq/ Usage: vcfutils.pl []\n @@ -461,8 +554,14 @@ Command: subsam get a subset of samples listsam list the samples fillac fill the allele count field qstats SNP stats stratified by QUAL - varFilter filtering short variants + hapmap2vcf convert the hapmap format to VCF ucscsnp2vcf convert UCSC SNP SQL dump to VCF + + varFilter filtering short variants (*) + vcf2fq VCF->fastq (**) + +Notes: Commands with description endting with (*) may need bcftools + specific annotations. \n/); } diff --git a/sam/bedidx.c b/sam/bedidx.c new file mode 100644 index 0000000..ec75a10 --- /dev/null +++ b/sam/bedidx.c @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define drand48() ((double)rand() / RAND_MAX) +#endif + +#include "ksort.h" +KSORT_INIT_GENERIC(uint64_t) + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 8192) + +typedef struct { + int n, m; + uint64_t *a; + int *idx; +} bed_reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, bed_reglist_t) + +#define LIDX_SHIFT 13 + +typedef kh_reg_t reghash_t; + +int *bed_index_core(int n, uint64_t *a, int *n_idx) +{ + int i, j, m, *idx; + m = *n_idx = 0; idx = 0; + for (i = 0; i < n; ++i) { + int beg, end; + beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT; + if (m < end + 1) { + int oldm = m; + m = end + 1; + kroundup32(m); + idx = realloc(idx, m * sizeof(int)); + for (j = oldm; j < m; ++j) idx[j] = -1; + } + if (beg == end) { + if (idx[beg] < 0) idx[beg] = i; + } else { + for (j = beg; j <= end; ++j) + if (idx[j] < 0) idx[j] = i; + } + *n_idx = end + 1; + 
} + return idx; +} + +void bed_index(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + bed_reglist_t *p = &kh_val(h, k); + if (p->idx) free(p->idx); + ks_introsort(uint64_t, p->n, p->a); + p->idx = bed_index_core(p->n, p->a, &p->m); + } + } +} + +int bed_overlap_core(const bed_reglist_t *p, int beg, int end) +{ + int i, min_off; + if (p->n == 0) return 0; + min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT]; + if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here + int n = beg>>LIDX_SHIFT; + if (n > p->n) n = p->n; + for (i = n - 1; i >= 0; --i) + if (p->idx[i] >= 0) break; + min_off = i >= 0? p->idx[i] : 0; + } + for (i = min_off; i < p->n; ++i) { + if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed + if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end) + return 1; // find the overlap; return + } + return 0; +} + +int bed_overlap(const void *_h, const char *chr, int beg, int end) +{ + const reghash_t *h = (const reghash_t*)_h; + khint_t k; + if (!h) return 0; + k = kh_get(reg, h, chr); + if (k == kh_end(h)) return 0; + return bed_overlap_core(&kh_val(h, k), beg, end); +} + +void *bed_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + if (fp == 0) return 0; + str = calloc(1, sizeof(kstring_t)); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name + int beg = -1, end = -1; + bed_reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { // absent from the hash table + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(bed_reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { // if the lines has other characters + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); // begin + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + end = atoi(str->s); // end + if (end < beg) end = -1; + } + } + } + } + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg >= 0 && end > beg) { + if (p->n == p->m) { + p->m = p->m? 
p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + bed_index(h); + return h; +} + +void bed_destroy(void *_h) +{ + reghash_t *h = (reghash_t*)_h; + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free(kh_val(h, k).idx); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} diff --git a/sam/bgzf.c b/sam/bgzf.c index 66d6b02..216cd04 100644 --- a/sam/bgzf.c +++ b/sam/bgzf.c @@ -111,6 +111,32 @@ report_error(BGZF* fp, const char* message) { fp->error = message; } +int bgzf_check_bgzf(const char *fn) +{ + BGZF *fp; + uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; + int n; + + if ((fp = bgzf_open(fn, "r")) == 0) + { + fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn); + return -1; + } + +#ifdef _USE_KNETFILE + n = knet_read(fp->x.fpr, buf, 10); +#else + n = fread(buf, 1, 10, fp->file); +#endif + bgzf_close(fp); + + if ( n!=10 ) + return -1; + + if ( !memcmp(magic, buf, 10) ) return 1; + return 0; +} + static BGZF *bgzf_read_init() { BGZF *fp; @@ -148,7 +174,7 @@ open_read(int fd) static BGZF* -open_write(int fd, bool is_uncompressed) +open_write(int fd, int compress_level) // compress_level==-1 for the default level { FILE* file = fdopen(fd, "w"); BGZF* fp; @@ -156,7 +182,9 @@ open_write(int fd, bool is_uncompressed) fp = malloc(sizeof(BGZF)); fp->file_descriptor = fd; fp->open_mode = 'w'; - fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; + fp->owned_file = 0; + fp->compress_level = compress_level < 0? 
Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; #ifdef _USE_KNETFILE fp->x.fpw = file; #else @@ -195,13 +223,20 @@ bgzf_open(const char* __restrict path, const char* __restrict mode) fp = open_read(fd); #endif } else if (strchr(mode, 'w') || strchr(mode, 'W')) { - int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; + int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; #ifdef _WIN32 oflag |= O_BINARY; #endif fd = open(path, oflag, 0666); if (fd == -1) return 0; - fp = open_write(fd, strchr(mode, 'u')? 1 : 0); + { // set compress_level + int i; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + } + fp = open_write(fd, compress_level); } if (fp != NULL) fp->owned_file = 1; return fp; @@ -214,7 +249,12 @@ bgzf_fdopen(int fd, const char * __restrict mode) if (mode[0] == 'r' || mode[0] == 'R') { return open_read(fd); } else if (mode[0] == 'w' || mode[0] == 'W') { - return open_write(fd, strstr(mode, "u")? 1 : 0); + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + return open_write(fd, compress_level); } else { return NULL; } @@ -254,7 +294,6 @@ deflate_block(BGZF* fp, int block_length) int input_length = block_length; int compressed_length = 0; while (1) { - int compress_level = fp->is_uncompressed? 
0 : Z_DEFAULT_COMPRESSION; z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; @@ -263,7 +302,7 @@ deflate_block(BGZF* fp, int block_length) zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - int status = deflateInit2(&zs, compress_level, Z_DEFLATED, + int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); if (status != Z_OK) { report_error(fp, "deflate init failed"); @@ -330,6 +369,7 @@ inflate_block(BGZF* fp, int block_length) // Inflate the block in fp->compressed_block into fp->uncompressed_block z_stream zs; + int status; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = fp->compressed_block + 18; @@ -337,7 +377,7 @@ inflate_block(BGZF* fp, int block_length) zs.next_out = fp->uncompressed_block; zs.avail_out = fp->uncompressed_block_size; - int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + status = inflateInit2(&zs, GZIP_WINDOW_BITS); if (status != Z_OK) { report_error(fp, "inflate init failed"); return -1; @@ -431,7 +471,7 @@ int bgzf_read_block(BGZF* fp) { bgzf_byte_t header[BLOCK_HEADER_LENGTH]; - int count, size = 0; + int count, size = 0, block_length, remaining; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); if (load_block_from_cache(fp, block_address)) return 0; @@ -454,10 +494,10 @@ bgzf_read_block(BGZF* fp) report_error(fp, "invalid block header"); return -1; } - int block_length = unpackInt16((uint8_t*)&header[16]) + 1; + block_length = unpackInt16((uint8_t*)&header[16]) + 1; bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - int remaining = block_length - BLOCK_HEADER_LENGTH; + remaining = block_length - BLOCK_HEADER_LENGTH; #ifdef _USE_KNETFILE count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); #else @@ -494,7 +534,8 @@ bgzf_read(BGZF* fp, void* data, int length) int bytes_read = 0; 
bgzf_byte_t* output = data; while (bytes_read < length) { - int available = fp->block_length - fp->block_offset; + int copy_length, available = fp->block_length - fp->block_offset; + bgzf_byte_t *buffer; if (available <= 0) { if (bgzf_read_block(fp) != 0) { return -1; @@ -504,8 +545,8 @@ bgzf_read(BGZF* fp, void* data, int length) break; } } - int copy_length = bgzf_min(length-bytes_read, available); - bgzf_byte_t* buffer = fp->uncompressed_block; + copy_length = bgzf_min(length-bytes_read, available); + buffer = fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; @@ -552,6 +593,8 @@ int bgzf_flush_try(BGZF *fp, int size) int bgzf_write(BGZF* fp, const void* data, int length) { + const bgzf_byte_t *input = data; + int block_length, bytes_written; if (fp->open_mode != 'w') { report_error(fp, "file not open for writing"); return -1; @@ -560,9 +603,9 @@ int bgzf_write(BGZF* fp, const void* data, int length) if (fp->uncompressed_block == NULL) fp->uncompressed_block = malloc(fp->uncompressed_block_size); - const bgzf_byte_t* input = data; - int block_length = fp->uncompressed_block_size; - int bytes_written = 0; + input = data; + block_length = fp->uncompressed_block_size; + bytes_written = 0; while (bytes_written < length) { int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); bgzf_byte_t* buffer = fp->uncompressed_block; diff --git a/sam/bgzf.h b/sam/bgzf.h index 099ae9a..7295f37 100644 --- a/sam/bgzf.h +++ b/sam/bgzf.h @@ -26,7 +26,6 @@ #include #include -#include #include #ifdef _USE_KNETFILE #include "knetfile.h" @@ -37,7 +36,7 @@ typedef struct { int file_descriptor; char open_mode; // 'r' or 'w' - bool owned_file, is_uncompressed; + int16_t owned_file, compress_level; #ifdef _USE_KNETFILE union { knetFile *fpr; @@ -129,6 +128,7 @@ int bgzf_check_EOF(BGZF *fp); int bgzf_read_block(BGZF* fp); int bgzf_flush(BGZF* fp); int bgzf_flush_try(BGZF *fp, int 
size); +int bgzf_check_bgzf(const char *fn); #ifdef __cplusplus } diff --git a/sam/cut_target.c b/sam/cut_target.c new file mode 100644 index 0000000..26f434f --- /dev/null +++ b/sam/cut_target.c @@ -0,0 +1,193 @@ +#include +#include +#include +#include "bam.h" +#include "errmod.h" +#include "faidx.h" + +#define ERR_DEP 0.83f + +typedef struct { + int e[2][3], p[2][2]; +} score_param_t; + +/* Note that although the two matrics have 10 parameters in total, only 4 + * (probably 3) are free. Changing the scoring matrices in a sort of symmetric + * way will not change the result. */ +static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} }; + +typedef struct { + int min_baseQ, tid, max_bases; + uint16_t *bases; + bamFile fp; + bam_header_t *h; + char *ref; + faidx_t *fai; + errmod_t *em; +} ct_t; + +static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp) +{ + int i, j, ret, tmp, k, sum[4], qual; + float q[16]; + if (n > g->max_bases) { // enlarge g->bases + g->max_bases = n; + kroundup32(g->max_bases); + g->bases = realloc(g->bases, g->max_bases * 2); + } + for (i = k = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint8_t *seq; + int q, baseQ, b; + if (p->is_refskip || p->is_del) continue; + baseQ = bam1_qual(p->b)[p->qpos]; + if (baseQ < g->min_baseQ) continue; + seq = bam1_seq(p->b); + b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; + if (b > 3) continue; + q = baseQ < p->b->core.qual? baseQ : p->b->core.qual; + if (q < 4) q = 4; + if (q > 63) q = 63; + g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b; + } + if (k == 0) return 0; + errmod_cal(g->em, k, 4, g->bases, q); + for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i; + for (i = 1; i < 4; ++i) // insertion sort + for (j = i; j > 0 && sum[j] < sum[j-1]; --j) + tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp; + qual = (sum[1]>>2) - (sum[0]>>2); + k = k < 256? k : 255; + ret = (qual < 63? 
qual : 63) << 2 | (sum[0]&3); + return ret<<8|k; +} + +static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns) +{ + int i, f[2][2], *prev, *curr, *swap_tmp, s; + uint8_t *b; // backtrack array + b = calloc(l, 1); + f[0][0] = f[0][1] = 0; + prev = f[0]; curr = f[1]; + // fill the backtrack matrix + for (i = 0; i < l; ++i) { + int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2; + int tmp0, tmp1; + // compute f[0] + tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0) + tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1) + if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0; + else curr[0] = tmp1, b[i] = 1; + // compute f[1] + tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0) + tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1) + if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1; + else curr[1] = tmp1, b[i] |= 1<<1; + // swap + swap_tmp = prev; prev = curr; curr = swap_tmp; + } + // backtrack + s = prev[0] > prev[1]? 0 : 1; + for (i = l - 1; i > 0; --i) { + b[i] |= s<<2; + s = b[i]>>s&1; + } + // print + for (i = 0, s = -1; i <= l; ++i) { + if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) { + if (s >= 0) { + int j; + printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s); + for (j = s; j < i; ++j) { + int c = cns[j]>>8; + if (c == 0) putchar('N'); + else putchar("ACGT"[c&3]); + } + putchar('\t'); + for (j = s; j < i; ++j) + putchar(33 + (cns[j]>>8>>2)); + putchar('\n'); + } + //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s); + s = -1; + } else if ((b[i]>>2&3) && s < 0) s = i; + } + free(b); +} + +static int read_aln(void *data, bam1_t *b) +{ + extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag); + ct_t *g = (ct_t*)data; + int ret, len; + ret = bam_read1(g->fp, b); + if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) { + if (b->core.tid != g->tid) { // then load the sequence + free(g->ref); + 
g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len); + g->tid = b->core.tid; + } + bam_prob_realn_core(b, g->ref, 1<<1|1); + } + return ret; +} + +int main_cut_target(int argc, char *argv[]) +{ + int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l; + const bam_pileup1_t *p; + bam_plp_t plp; + uint16_t *cns; + ct_t g; + + memset(&g, 0, sizeof(ct_t)); + g.min_baseQ = 13; g.tid = -1; + while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) { + switch (c) { + case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff + case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY + case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE + case '1': g_param.e[1][1] = atoi(optarg); break; + case '2': g_param.e[1][2] = atoi(optarg); break; + case 'f': g.fai = fai_load(optarg); + if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__); + break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] \n"); + return 1; + } + l = max_l = 0; cns = 0; + g.fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + g.h = bam_header_read(g.fp); + g.em = errmod_init(1 - ERR_DEP); + plp = bam_plp_init(read_aln, &g); + while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) { + if (tid < 0) break; + if (tid != lasttid) { // change of chromosome + if (cns) process_cns(g.h, lasttid, l, cns); + if (max_l < g.h->target_len[tid]) { + max_l = g.h->target_len[tid]; + kroundup32(max_l); + cns = realloc(cns, max_l * 2); + } + l = g.h->target_len[tid]; + memset(cns, 0, max_l * 2); + lasttid = tid; + } + cns[pos] = gencns(&g, n, p); + lastpos = pos; + } + process_cns(g.h, lasttid, l, cns); + free(cns); + bam_header_destroy(g.h); + bam_plp_destroy(plp); + bam_close(g.fp); + if (g.fai) { + fai_destroy(g.fai); free(g.ref); + } + errmod_destroy(g.em); + free(g.bases); + return 0; +} diff --git a/sam/errmod.h b/sam/errmod.h index e3e9a90..32c07b6 100644 --- a/sam/errmod.h +++ b/sam/errmod.h @@ -12,6 +12,13 @@ typedef struct { errmod_t *errmod_init(float depcorr); void errmod_destroy(errmod_t *em); + +/* + n: number of bases + m: maximum base + bases[i]: qual:6, strand:1, base:4 + q[i*m+j]: phred-scaled likelihood of (i,j) + */ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q); #endif diff --git a/sam/examples/Makefile b/sam/examples/Makefile index ec976ae..309399f 100644 --- a/sam/examples/Makefile +++ b/sam/examples/Makefile @@ -40,11 +40,11 @@ ex1.bcf:ex1.bam ex1.fa.fai (cd ..; make libbam.a) calDepth:../libbam.a calDepth.c - gcc -g -Wall -O2 -I.. calDepth.c -o $@ -lm -lz -L.. -lbam + gcc -g -Wall -O2 -I.. calDepth.c -o $@ -L.. 
-lbam -lm -lz clean: rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg ex1.bcf # ../samtools pileup ex1.bam|perl -ape '$_=$F[4];s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Z//,tr/a-z//);$_=join("\t",@F[0,1],@_)."\n"' -# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"' \ No newline at end of file +# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"' diff --git a/sam/examples/toy.sam b/sam/examples/toy.sam index 1aff220..33449b1 100644 --- a/sam/examples/toy.sam +++ b/sam/examples/toy.sam @@ -1,6 +1,6 @@ @SQ SN:ref LN:45 @SQ SN:ref2 LN:40 -r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * +r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * r003 0 ref 9 30 5H6M * 0 0 AGCTAA * r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * diff --git a/sam/faidx.c b/sam/faidx.c index dbd8b3e..f0798fc 100644 --- a/sam/faidx.c +++ b/sam/faidx.c @@ -2,11 +2,13 @@ #include #include #include +#include #include "faidx.h" #include "khash.h" typedef struct { - uint64_t len:32, line_len:16, line_blen:16; + int32_t line_len, line_blen; + int64_t len; uint64_t offset; } faidx1_t; KHASH_MAP_INIT_STR(s, faidx1_t) @@ -63,10 +65,11 @@ faidx_t *fai_build_core(RAZF *rz) { char c, *name; int l_name, m_name, ret; - int len, line_len, line_blen, state; + int line_len, line_blen, state; int l1, l2; faidx_t *idx; uint64_t offset; + int64_t len; idx = (faidx_t*)calloc(1, sizeof(faidx_t)); idx->hash = kh_init(s); @@ -118,11 +121,6 @@ faidx_t *fai_build_core(RAZF *rz) return 0; } ++l1; len += l2; - if (l2 >= 0x10000) { - fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name); - free(name); fai_destroy(idx); - return 0; - } if (state == 1) line_len = l1, line_blen = l2, 
state = 0; else if (state == 0) { if (l1 != line_len || l2 != line_blen) state = 2; @@ -304,8 +302,8 @@ faidx_t *fai_load(const char *fn) char *fai_fetch(const faidx_t *fai, const char *str, int *len) { - char *s, *p, c; - int i, l, k; + char *s, c; + int i, l, k, name_end; khiter_t iter; faidx1_t val; khash_t(s) *h; @@ -313,31 +311,43 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) beg = end = -1; h = fai->hash; - l = strlen(str); - p = s = (char*)malloc(l+1); - /* squeeze out "," */ - for (i = k = 0; i != l; ++i) - if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; - s[k] = 0; - for (i = 0; i != k; ++i) if (s[i] == ':') break; - s[i] = 0; - iter = kh_get(s, h, s); /* get the ref_id */ - if (iter == kh_end(h)) { - *len = 0; - free(s); return 0; - } + name_end = l = strlen(str); + s = (char*)malloc(l+1); + // remove space + for (i = k = 0; i < l; ++i) + if (!isspace(str[i])) s[k++] = str[i]; + s[k] = 0; l = k; + // determine the sequence name + for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end + if (i >= 0) name_end = i; + if (name_end < l) { // check if this is really the end + int n_hyphen = 0; + for (i = name_end + 1; i < l; ++i) { + if (s[i] == '-') ++n_hyphen; + else if (!isdigit(s[i]) && s[i] != ',') break; + } + if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name + s[name_end] = 0; + iter = kh_get(s, h, s); + if (iter == kh_end(h)) { // cannot find the sequence name + iter = kh_get(s, h, str); // try str as the name + if (iter == kh_end(h)) { + *len = 0; + free(s); return 0; + } else s[name_end] = ':', name_end = l; + } + } else iter = kh_get(s, h, str); val = kh_value(h, iter); - if (i == k) { /* dump the whole sequence */ - beg = 0; end = val.len; - } else { - for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; - beg = atoi(p); - if (i < k) { - p = s + i + 1; - end = atoi(p); - } else end = val.len; - } - if (beg > 0) --beg; + // parse the interval + 
if (name_end < l) { + for (i = k = name_end + 1; i < l; ++i) + if (s[i] != ',') s[k++] = s[i]; + s[k] = 0; + beg = atoi(s + name_end + 1); + for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; + end = i < k? atoi(s + i + 1) : val.len; + if (beg > 0) --beg; + } else beg = 0, end = val.len; if (beg >= val.len) beg = val.len; if (end >= val.len) end = val.len; if (beg > end) beg = end; diff --git a/sam/glf.c b/sam/glf.c deleted file mode 100644 index 8d5346a..0000000 --- a/sam/glf.c +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include -#include "glf.h" - -#ifdef _NO_BGZF -// then alias bgzf_*() functions -#endif - -static int glf3_is_BE = 0; - -static inline uint32_t bam_swap_endian_4(uint32_t v) -{ - v = ((v & 0x0000FFFFU) << 16) | (v >> 16); - return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); -} - -static inline uint16_t bam_swap_endian_2(uint16_t v) -{ - return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); -} - -static inline int bam_is_big_endian() -{ - long one= 1; - return !(*((char *)(&one))); -} - -glf3_header_t *glf3_header_init() -{ - glf3_is_BE = bam_is_big_endian(); - return (glf3_header_t*)calloc(1, sizeof(glf3_header_t)); -} - -glf3_header_t *glf3_header_read(glfFile fp) -{ - glf3_header_t *h; - char magic[4]; - h = glf3_header_init(); - bgzf_read(fp, magic, 4); - if (strncmp(magic, "GLF\3", 4)) { - fprintf(stderr, "[glf3_header_read] invalid magic.\n"); - glf3_header_destroy(h); - return 0; - } - bgzf_read(fp, &h->l_text, 4); - if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text); - if (h->l_text) { - h->text = (uint8_t*)calloc(h->l_text + 1, 1); - bgzf_read(fp, h->text, h->l_text); - } - return h; -} - -void glf3_header_write(glfFile fp, const glf3_header_t *h) -{ - int32_t x; - bgzf_write(fp, "GLF\3", 4); - x = glf3_is_BE? 
bam_swap_endian_4(h->l_text) : h->l_text; - bgzf_write(fp, &x, 4); - if (h->l_text) bgzf_write(fp, h->text, h->l_text); -} - -void glf3_header_destroy(glf3_header_t *h) -{ - free(h->text); - free(h); -} - -char *glf3_ref_read(glfFile fp, int *len) -{ - int32_t n, x; - char *str; - *len = 0; - if (bgzf_read(fp, &n, 4) != 4) return 0; - if (glf3_is_BE) n = bam_swap_endian_4(n); - if (n < 0) { - fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n); - return 0; - } - str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact - x = bgzf_read(fp, str, n); - x += bgzf_read(fp, len, 4); - if (x != n + 4) { - free(str); *len = -1; return 0; // truncated - } - if (glf3_is_BE) *len = bam_swap_endian_4(*len); - return str; -} - -void glf3_ref_write(glfFile fp, const char *str, int len) -{ - int32_t m, n = strlen(str) + 1; - m = glf3_is_BE? bam_swap_endian_4(n) : n; - bgzf_write(fp, &m, 4); - bgzf_write(fp, str, n); - if (glf3_is_BE) len = bam_swap_endian_4(len); - bgzf_write(fp, &len, 4); -} - -void glf3_view1(const char *ref_name, const glf3_t *g3, int pos) -{ - int j; - if (g3->rtype == GLF3_RTYPE_END) return; - printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, - g3->rtype == GLF3_RTYPE_INDEL? '*' : "XACMGRSVTWYHKDBN"[g3->ref_base], - g3->depth, g3->rms_mapQ, g3->min_lk); - if (g3->rtype == GLF3_RTYPE_SUB) - for (j = 0; j != 10; ++j) printf("\t%d", g3->lk[j]); - else { - printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1], - g3->indel_len[0]? g3->indel_seq[0] : "*", g3->indel_len[1]? 
g3->indel_seq[1] : "*"); - } - printf("\n"); -} - -int glf3_write1(glfFile fp, const glf3_t *g3) -{ - int r; - uint8_t c; - uint32_t y[2]; - c = g3->rtype<<4 | g3->ref_base; - r = bgzf_write(fp, &c, 1); - if (g3->rtype == GLF3_RTYPE_END) return r; - y[0] = g3->offset; - y[1] = g3->min_lk<<24 | g3->depth; - if (glf3_is_BE) { - y[0] = bam_swap_endian_4(y[0]); - y[1] = bam_swap_endian_4(y[1]); - } - r += bgzf_write(fp, y, 8); - r += bgzf_write(fp, &g3->rms_mapQ, 1); - if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10); - else { - int16_t x[2]; - r += bgzf_write(fp, g3->lk, 3); - x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0]; - x[1] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1]; - r += bgzf_write(fp, x, 4); - if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0])); - if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1])); - } - return r; -} - -#ifndef kv_roundup32 -#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -int glf3_read1(glfFile fp, glf3_t *g3) -{ - int r; - uint8_t c; - uint32_t y[2]; - r = bgzf_read(fp, &c, 1); - if (r == 0) return 0; - g3->ref_base = c & 0xf; - g3->rtype = c>>4; - if (g3->rtype == GLF3_RTYPE_END) return r; - r += bgzf_read(fp, y, 8); - if (glf3_is_BE) { - y[0] = bam_swap_endian_4(y[0]); - y[1] = bam_swap_endian_4(y[1]); - } - g3->offset = y[0]; - g3->min_lk = y[1]>>24; - g3->depth = y[1]<<8>>8; - r += bgzf_read(fp, &g3->rms_mapQ, 1); - if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10); - else { - int16_t x[2], max; - r += bgzf_read(fp, g3->lk, 3); - r += bgzf_read(fp, x, 4); - if (glf3_is_BE) { - x[0] = bam_swap_endian_2(x[0]); - x[1] = bam_swap_endian_2(x[1]); - } - g3->indel_len[0] = x[0]; - g3->indel_len[1] = x[1]; - x[0] = abs(x[0]); x[1] = abs(x[1]); - max = (x[0] > x[1]? 
x[0] : x[1]) + 1; - if (g3->max_len < max) { - g3->max_len = max; - kv_roundup32(g3->max_len); - g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len); - g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len); - } - r += bgzf_read(fp, g3->indel_seq[0], x[0]); - r += bgzf_read(fp, g3->indel_seq[1], x[1]); - g3->indel_seq[0][x[0]] = g3->indel_seq[1][x[1]] = 0; - } - return r; -} - -void glf3_view(glfFile fp) -{ - glf3_header_t *h; - char *name; - glf3_t *g3; - int len; - h = glf3_header_read(fp); - g3 = glf3_init1(); - while ((name = glf3_ref_read(fp, &len)) != 0) { - int pos = 0; - while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) { - pos += g3->offset; - glf3_view1(name, g3, pos); - } - free(name); - } - glf3_header_destroy(h); - glf3_destroy1(g3); -} - -int glf3_view_main(int argc, char *argv[]) -{ - glfFile fp; - if (argc == 1) { - fprintf(stderr, "Usage: glfview \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r"); - if (fp == 0) { - fprintf(stderr, "Fail to open file '%s'\n", argv[1]); - return 1; - } - glf3_view(fp); - bgzf_close(fp); - return 0; -} - -#ifdef GLFVIEW_MAIN -int main(int argc, char *argv[]) -{ - return glf3_view_main(argc, argv); -} -#endif diff --git a/sam/glf.h b/sam/glf.h deleted file mode 100644 index 12e5400..0000000 --- a/sam/glf.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef GLF_H_ -#define GLF_H_ - -typedef struct { - unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ - unsigned char max_mapQ; /** maximum mapping quality */ - unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ - unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ -} glf1_t; - -#include -#include "bgzf.h" -typedef BGZF *glfFile; - -#define GLF3_RTYPE_END 0 -#define GLF3_RTYPE_SUB 1 -#define GLF3_RTYPE_INDEL 2 - -typedef struct { - uint8_t ref_base:4, rtype:4; /** 
"XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ - uint8_t rms_mapQ; /** RMS mapping quality */ - uint8_t lk[10]; /** log likelihood ratio, capped at 255 */ - uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ - int32_t offset; /** the first base in a chromosome has offset zero. */ - // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10]) - int16_t indel_len[2]; - int32_t max_len; // maximum indel len; will be modified by glf3_read1() - char *indel_seq[2]; -} glf3_t; - -typedef struct { - int32_t l_text; - uint8_t *text; -} glf3_header_t; - -#ifdef __cplusplus -extern "C" { -#endif - -#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t))) -#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0) - - glf3_header_t *glf3_header_init(); - glf3_header_t *glf3_header_read(glfFile fp); - void glf3_header_write(glfFile fp, const glf3_header_t *h); - void glf3_header_destroy(glf3_header_t *h); - char *glf3_ref_read(glfFile fp, int *len); - void glf3_ref_write(glfFile fp, const char *name, int len); - int glf3_write1(glfFile fp, const glf3_t *g3); - int glf3_read1(glfFile fp, glf3_t *g3); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/sam/khash.h b/sam/khash.h index 1d583ef..a7e8056 100644 --- a/sam/khash.h +++ b/sam/khash.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,8 +23,6 @@ SOFTWARE. */ -/* Contact: Heng Li */ - /* An example: @@ -49,6 +47,14 @@ int main() { */ /* + 2011-02-14 (0.2.5): + + * Allow to declare global functions. 
+ + 2009-09-26 (0.2.4): + + * Improve portability + 2008-09-19 (0.2.3): * Corrected the example @@ -88,17 +94,35 @@ int main() { @copyright Heng Li */ -#define AC_VERSION_KHASH_H "0.2.2" +#define AC_VERSION_KHASH_H "0.2.5" -#include #include #include +#include + +/* compipler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifdef _MSC_VER +#define inline __inline +#endif -typedef uint32_t khint_t; +typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_HASH_PRIME_SIZE 32 -static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = +static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = { 0ul, 3ul, 11ul, 23ul, 53ul, 97ul, 193ul, 389ul, 769ul, 1543ul, @@ -119,17 +143,32 @@ static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = static const double __ac_HASH_UPPER = 0.77; -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ typedef struct { \ khint_t n_buckets, size, n_occupied, upper_bound; \ - uint32_t *flags; \ + khint32_t *flags; \ khkey_t *keys; \ khval_t 
*vals; \ } kh_##name##_t; \ - static inline kh_##name##_t *kh_init_##name() { \ + SCOPE kh_##name##_t *kh_init_##name() { \ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ - static inline void kh_destroy_##name(kh_##name##_t *h) \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ free(h->keys); free(h->flags); \ @@ -137,14 +176,14 @@ static const double __ac_HASH_UPPER = 0.77; free(h); \ } \ } \ - static inline void kh_clear_##name(kh_##name##_t *h) \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ - memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ + memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ - static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t inc, k, i, last; \ @@ -158,9 +197,9 @@ static const double __ac_HASH_UPPER = 0.77; return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ } else return 0; \ } \ - static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { \ - uint32_t *new_flags = 0; \ + khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ khint_t t = __ac_HASH_PRIME_SIZE - 1; \ @@ -168,8 +207,8 @@ static const double __ac_HASH_UPPER = 0.77; new_n_buckets = __ac_prime_list[t+1]; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ else { \ - new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ - memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ + new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ + memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) \ @@ -218,7 +257,7 @@ static const double __ac_HASH_UPPER = 0.77; h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ } \ - static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { \ @@ -256,7 +295,7 @@ static const double __ac_HASH_UPPER = 0.77; } else *ret = 0; \ return x; \ } \ - static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ @@ -264,24 +303,27 @@ static const double __ac_HASH_UPPER = 0.77; } \ } +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + /* --- BEGIN OF HASH FUNCTIONS --- */ /*! 
@function @abstract Integer hash function - @param key The integer [uint32_t] + @param key The integer [khint32_t] @return The hash value [khint_t] */ -#define kh_int_hash_func(key) (uint32_t)(key) +#define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function - @param key The integer [uint64_t] + @param key The integer [khint64_t] @return The hash value [khint_t] */ -#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11) +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ @@ -442,7 +484,7 @@ static inline khint_t __ac_X31_hash_string(const char *s) @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing integer keys @@ -450,14 +492,14 @@ static inline khint_t __ac_X31_hash_string(const char *s) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! 
@function @abstract Instantiate a hash map containing 64-bit integer keys @@ -465,7 +507,7 @@ static inline khint_t __ac_X31_hash_string(const char *s) @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) typedef const char *kh_cstr_t; /*! @function diff --git a/sam/knetfile.c b/sam/knetfile.c index 1e2c042..af09146 100644 --- a/sam/knetfile.c +++ b/sam/knetfile.c @@ -1,6 +1,7 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008 by Genome Research Ltd (GRL). + 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,11 +24,9 @@ SOFTWARE. */ -/* Contact: Heng Li */ - /* Probably I will not do socket programming in the next few years and therefore I decide to heavily annotate this file, for Linux and - Windows as well. -lh3 */ + Windows as well. 
-ac */ #include #include @@ -90,7 +89,7 @@ static int socket_connect(const char *host, const char *port) int on = 1, fd; struct linger lng = { 0, 0 }; - struct addrinfo hints, *res; + struct addrinfo hints, *res = 0; memset(&hints, 0, sizeof(struct addrinfo)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; diff --git a/sam/kprobaln.c b/sam/kprobaln.c index 5201c1a..894a2ae 100644 --- a/sam/kprobaln.c +++ b/sam/kprobaln.c @@ -161,7 +161,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer double p = 1., Pr1 = 0.; for (i = 0; i <= l_query + 1; ++i) { p *= s[i]; - if (p < 1e-100) Pr += -4.343 * log(p), p = 1.; + if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; } Pr1 += -4.343 * log(p * l_ref * l_query); Pr = (int)(Pr1 + .499); diff --git a/sam/kseq.h b/sam/kseq.h index 82face0..0bbc7dc 100644 --- a/sam/kseq.h +++ b/sam/kseq.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,13 +23,7 @@ SOFTWARE. */ -/* Contact: Heng Li */ - -/* - 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" - */ - -/* Last Modified: 12APR2009 */ +/* Last Modified: 18AUG2011 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -94,10 +88,10 @@ typedef struct __kstring_t { #endif #define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ - str->l = 0; \ + str->l = append? 
str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@ -132,13 +126,15 @@ typedef struct __kstring_t { break; \ } \ } \ - if (str->l == 0) { \ + if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } \ str->s[str->l] = '\0'; \ return str->l; \ - } + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@ -171,44 +167,45 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ +#define __KSEQ_READ \ + static int kseq_read(kseq_t *seq) \ + { \ + int c; \ + kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* the first header char has been read */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ + while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ + if (c == -1) return -1; /* end of file */ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (isgraph(c)) { /* printable non-space character */ \ - if 
(seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l++] = (char)c; \ - } \ - } \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* we should not stop here */ \ - while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ - if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ - seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq 
string */ \ - return seq->seq.l; \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ @@ -219,7 +216,7 @@ typedef struct __kstring_t { } kseq_t; #define KSEQ_INIT(type_t, __read) \ - KSTREAM_INIT(type_t, __read, 4096) \ + KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(type_t) \ __KSEQ_READ diff --git a/sam/kstring.c b/sam/kstring.c index 43d524c..b2a0dab 100644 --- a/sam/kstring.c +++ b/sam/kstring.c @@ -29,16 +29,24 @@ char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) const char *p, *start; if (sep) { // set up the table if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished - aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; - for (p = sep; *p; ++p) - aux->tab[*p/64] |= 1ull<<(*p%64); + aux->finished = 0; + if (sep[1]) { + aux->sep = -1; + aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; + for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); + } else aux->sep = sep[0]; + } + if (aux->finished) return 0; + else if (str) aux->p = str - 1, aux->finished = 0; + if (aux->sep < 0) { + for (p = start = aux->p + 1; *p; ++p) + if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; + } else { + for (p = start = aux->p + 1; *p; ++p) + if (*p == aux->sep) break; } - if (str) aux->p = str - 1, aux->tab[0] &= ~1ull; - else if (aux->tab[0]&1) return 0; - for (p = start = aux->p + 1; *p; ++p) - if (aux->tab[*p/64]>>(*p%64)&1) break; aux->p = p; // end of token - if (*p == 0) aux->tab[0] |= 1; // no more tokens + if (*p == 0) aux->finished = 1; // no more tokens return (char*)start; } diff --git a/sam/kstring.h b/sam/kstring.h index c46a62b..ec5775b 100644 --- a/sam/kstring.h +++ b/sam/kstring.h @@ -19,6 +19,7 @@ typedef struct __kstring_t { typedef struct { uint64_t tab[4]; + int sep, finished; const char *p; // end of the current token } ks_tokaux_t; diff --git a/sam/misc/Makefile 
b/sam/misc/Makefile index 6c25c78..d2f8bd8 100644 --- a/sam/misc/Makefile +++ b/sam/misc/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= -D_FILE_OFFSET_BITS=64 OBJS= -PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim +PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim seqtk INCLUDES= -I.. SUBDIRS= . @@ -27,11 +27,11 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: -afs2:afs2.o - $(CC) $(CFLAGS) -o $@ afs2.o -lm -lz -L.. -lbam +seqtk:seqtk.o + $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz wgsim:wgsim.o - $(CC) $(CFLAGS) -o $@ wgsim.o -lm + $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz md5fa:md5.o md5fa.o md5.h ../kseq.h $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz @@ -51,8 +51,11 @@ maq2sam-long:maq2sam.c md5fa.o:md5.h md5fa.c $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c -afs2.o:afs2.c ../bam.h - $(CC) $(CFLAGS) -c -I.. -o $@ afs2.c +seqtk.o:seqtk.c ../khash.h ../kseq.h + $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c + +wgsim.o:wgsim.c ../kseq.h + $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c cleanlocal: rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a diff --git a/sam/misc/export2sam.pl b/sam/misc/export2sam.pl index a2a436c..ec6dacf 100755 --- a/sam/misc/export2sam.pl +++ b/sam/misc/export2sam.pl @@ -1,461 +1,545 @@ -#!/usr/bin/env perl -# -# -# Script to convert GERALD export files to SAM format. -# -# -# -########## License: -# -# The MIT License -# -# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd. -# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# -# -# -########## ChangeLog: -# -# Version: 2.0.0 (15FEB2010) -# Script updated by Illumina in conjunction with CASAVA 1.7.0 release. -# Major changes are as follows: -# - The CIGAR string has been updated to include all gaps from ELANDv2 alignments. -# - The ELAND single read alignment score is always stored in the optional "SM" field -# and the ELAND paired read alignment score is stored in the optional "AS" field -# when it exists. -# - The MAPQ value is set to the higher of the two alignment scores, but no greater -# than 254, i.e. min(254,max(SM,AS)) -# - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's -# expected orientation and insert size criteria. -# - The default quality score translation is set for export files which contain -# Phread+64 quality values. 
An option, "--qlogodds", has been added to -# translate quality values from the Solexa+64 format used in export files prior -# to Pipeline 1.3 -# - The export match descriptor is now reverse-complemented when necessary such that -# it always corresponds to the forward strand of the reference, to be consistent -# with other information in the SAM record. It is now written to the optional -# 'XD' field (rather than 'MD') to acknowledge its minor differences from the -# samtools match descriptor (see additional detail below). -# - An option, "--nofilter", has been added to include reads which have failed -# primary analysis quality filtration. Such reads will have the corresponding -# SAM flag bit (0x0200) set. -# - Labels in the export 'contig' field are preserved by setting RNAME to -# "$export_chromosome/$export_contig" when then contig label exists. -# -# -# Contact: lh3 -# Version: 0.1.2 (03JAN2009) -# -# -# -########## Known Conversion Limitations: -# -# - Export records for reads that map to a position < 1 (allowed in export format), are converted -# to unmapped reads in the SAM record. -# - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the -# aligner could not map the read to the reference sequence set, and "QC" means that the -# aligner did not attempt to map the read due to some technical limitation. Both of these -# alignment types are collapsed to the single unmapped alignment state in the SAM record. -# - The export match descriptor is slightly different than the samtools match descriptor. For -# this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the -# export match descriptor differs from the samtools version in two respects: (1) indels -# are explicitly closed with the '$' character and (2) insertions must be enumerated in -# the match descriptor. 
For example a 35-base read with a two-base insertion is described -# as: 20^2$14 -# -# -# - -my $version = "2.0.0"; - -use strict; -use warnings; - -use File::Spec qw(splitpath); -use Getopt::Long; -use List::Util qw(min max); - - -use constant { - EXPORT_INDEX => 6, - EXPORT_READNO => 7, - EXPORT_READ => 8, - EXPORT_QUAL => 9, - EXPORT_CHROM => 10, - EXPORT_CONTIG => 11, - EXPORT_POS => 12, - EXPORT_STRAND => 13, - EXPORT_MD => 14, - EXPORT_SEMAP => 15, - EXPORT_PEMAP => 16, - EXPORT_PASSFILT => 21, -}; - - -use constant { - SAM_QNAME => 0, - SAM_FLAG => 1, - SAM_RNAME => 2, - SAM_POS => 3, - SAM_MAPQ => 4, - SAM_CIGAR => 5, - SAM_MRNM => 6, - SAM_MPOS => 7, - SAM_ISIZE => 8, - SAM_SEQ => 9, - SAM_QUAL => 10, -}; - - -# function prototypes for Richard's code -sub match_desc_to_cigar($); -sub match_desc_frag_length($); -sub reverse_compl_match_descriptor($); -sub write_header($;$;$); - - -&export2sam; -exit; - - - - -sub export2sam { - - my $cmdline = $0 . " " . join(" ",@ARGV); - my $arg_count = scalar @ARGV; - my @spval = File::Spec->splitpath($0); - my $progname = $spval[2]; - - my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. 
"solexa") quality values - my $is_nofilter = 0; - my $read1file; - my $read2file; - my $print_version = 0; - my $help = 0; - - my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, - "nofilter" => \$is_nofilter, - "read1=s" => \$read1file, - "read2=s" => \$read2file, - "version" => \$print_version, - "help" => \$help ); - - my $usage = <) { - $export_line_count++; - my (@s1, @s2); - &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); - if ($is_paired) { - my $read2line = <$fh2>; - if(not $read2line){ - die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); - } - &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); - - if (@s1 && @s2) { # then set mate coordinate - if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ - die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); - } - - my $isize = 0; - if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize - my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; - my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; - $isize = $x2 - $x1; - } - - foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ - my ($sa,$sb,$is) = @{$_}; - if ($sb->[SAM_RNAME] ne '*') { - $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? 
"=" : $sb->[SAM_RNAME]; - $sa->[SAM_MPOS] = $sb->[SAM_POS]; - $sa->[SAM_ISIZE] = $is; - $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); - } else { - $sa->[SAM_FLAG] |= 0x8; - } - } - } - } - print join("\t", @s1), "\n" if (@s1); - print join("\t", @s2), "\n" if (@s2 && $is_paired); - } - close($fh1); - if($is_paired) { - while(my $read2line = <$fh2>){ - $export_line_count++; - die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); - } - close($fh2); - } -} - -sub export2sam_aux { - my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; - chomp($line); - my @t = split("\t", $line); - @$s = (); - my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); - return if(not ($isPassFilt or $is_nofilter)); - # read name - $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; - # initial flag (will be updated later) - $s->[SAM_FLAG] = 0; - if($is_paired) { - if($t[EXPORT_READNO] != $read_no){ - die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); - } - $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); - } - $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); - - # read & quality - my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); - if ($is_export_rev) { # then reverse the sequence and quality - $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); - $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; - $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); - } else { - $s->[SAM_SEQ] = $t[EXPORT_READ]; - $s->[SAM_QUAL] = $t[EXPORT_QUAL]; - } - my @convqual = (); - foreach (unpack('C*', $s->[SAM_QUAL])){ - my $val=$ct->[$_]; - if(not defined $val){ - my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; - if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } - die($msg . 
"\n"); - } - push @convqual,$val; - } - - $s->[SAM_QUAL] = pack('C*',@convqual); # change coding - - - # coor - my $has_coor = 0; - $s->[SAM_RNAME] = "*"; - if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') { - $s->[SAM_FLAG] |= 0x4; # unmapped - } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { - $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? - push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") - } elsif ($t[EXPORT_POS] < 1) { - $s->[SAM_FLAG] |= 0x4; # unmapped - } else { - $s->[SAM_RNAME] = $t[EXPORT_CHROM]; - $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); - $has_coor = 1; - } - $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; - -# print STDERR "t[14] = " . $t[14] . "\n"; - my $matchDesc = ''; - $s->[SAM_CIGAR] = "*"; - if($has_coor){ - $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; - - if($matchDesc =~ /\^/){ - # construct CIGAR string using Richard's function - $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing - } else { - $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; - } - } - -# print STDERR "cigar_string = $cigar_string\n"; - - $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); - if($has_coor){ - my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; - my $pemap = 0; - if($is_paired) { - $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; - - # set `proper pair' bit if non-blank, non-zero PE alignment score: - $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); - } - $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); - } else { - $s->[SAM_MAPQ] = 0; - } - # mate coordinate - $s->[SAM_MRNM] = '*'; - $s->[SAM_MPOS] = 0; - $s->[SAM_ISIZE] = 0; - # aux - push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); - if($has_coor){ - # The export match descriptor differs slightly from the samtools match descriptor. 
- # In order for the converted SAM files to be as compliant as possible, - # we put the export match descriptor in optional field 'XD' rather than 'MD': - push(@$s, "XD:Z:$matchDesc"); - push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); - push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); - } -} - - - -# -# the following code is taken from Richard Shaw's sorted2sam.pl file -# -sub reverse_compl_match_descriptor($) -{ -# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; - my ($match_desc) = @_; - my $rev_compl_match_desc = reverse($match_desc); - $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; - - # Unreverse the digits of numbers. - $rev_compl_match_desc = join('', - map {($_ =~ /\d+/) - ? join('', reverse(split('', $_))) - : $_} split(/(\d+)/, - $rev_compl_match_desc)); - - return $rev_compl_match_desc; -} - - - -sub match_desc_to_cigar($) -{ - my ($match_desc) = @_; - - my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); - my $cigar_str = ''; - my $cigar_del_ch = 'D'; - my $cigar_ins_ch = 'I'; - my $cigar_match_ch = 'M'; - - foreach my $match_desc_part (@match_desc_parts) { - next if (!$match_desc_part); - - if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { - # Deletion - $cigar_str .= (length($1) . $cigar_del_ch); - } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { - # Insertion - $cigar_str .= ($1 . $cigar_ins_ch); - } else { - $cigar_str .= (match_desc_frag_length($match_desc_part) - . $cigar_match_ch); - } - } - - return $cigar_str; -} - - -#------------------------------------------------------------------------------ - -sub match_desc_frag_length($) - { - my ($match_desc_str) = @_; - my $len = 0; - - my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); - - foreach my $match_desc_field (@match_desc_fields) { - next if ($match_desc_field eq ''); - - $len += (($match_desc_field =~ /(\d+)/) - ? 
$1 : length($match_desc_field)); - } - - return $len; -} - - -# argument holds the command line -sub write_header($;$;$) -{ - my ($progname,$version,$cl) = @_; - my $complete_header = ""; - $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; - - return $complete_header; -} +#!/usr/bin/env perl +# +# +# export2sam.pl converts GERALD export files to SAM format. +# +# +# +########## License: +# +# The MIT License +# +# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. +# Modified SAMtools work copyright (c) 2010 Illumina, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# +# +# +########## ChangeLog: +# +# Version: 2.3.1 (18MAR2011) +# +# - Restore file '-' as stdin input. +# +# Version: 2.3.0 (24JAN2011) +# +# - Add support for export reserved chromosome name "CONTROL", +# which is translated to optional field "XC:Z:CONTROL". 
+# - Check for ".gz" file extension on export files and open +# these as gzip pipes when the extension is found. +# +# Version: 2.2.0 (16NOV2010) +# +# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y +# - For export records with reserved chromosome name identifiers +# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" +# to the SAM record, so that these cases can be distinguished +# from other unmatched reads. +# +# Version: 2.1.0 (21SEP2010) +# +# - Additional export record error checking. +# - Convert export records with chromosome value of "RM" to unmapped +# SAM records. +# +# Version: 2.0.0 (15FEB2010) +# +# Script updated by Illumina in conjunction with CASAVA 1.7.0 +# release. +# +# Major changes are as follows: +# - The CIGAR string has been updated to include all gaps from +# ELANDv2 alignments. +# - The ELAND single read alignment score is always stored in the +# optional "SM" field and the ELAND paired read alignment score +# is stored in the optional "AS" field when it exists. +# - The MAPQ value is set to the higher of the two alignment scores, +# but no greater than 254, i.e. min(254,max(SM,AS)) +# - The SAM "proper pair" bit (0x0002) is now set for read pairs +# meeting ELAND's expected orientation and insert size criteria. +# - The default quality score translation is set for export files +# which contain Phred+64 quality values. An option, +# "--qlogodds", has been added to translate quality values from +# the Solexa+64 format used in export files prior to Pipeline +# 1.3 +# - The export match descriptor is now reverse-complemented when +# necessary such that it always corresponds to the forward +# strand of the reference, to be consistent with other +# information in the SAM record. It is now written to the +# optional 'XD' field (rather than 'MD') to acknowledge its +# minor differences from the samtools match descriptor (see +# additional detail below).
+# - An option, "--nofilter", has been added to include reads which +# have failed primary analysis quality filtration. Such reads +# will have the corresponding SAM flag bit (0x0200) set. +# - Labels in the export 'contig' field are preserved by setting +# RNAME to "$export_chromosome/$export_contig" when the contig +# label exists. +# +# +# Contact: lh3 +# Version: 0.1.2 (03JAN2009) +# +# +# +########## Known Conversion Limitations: +# +# - Export records for reads that map to a position < 1 (allowed +# in export format), are converted to unmapped reads in the SAM +# record. +# - Export records contain the reserved chromosome names: "NM", +# "QC","RM" and "CONTROL". "NM" indicates that the aligner could +# not map the read to the reference sequence set. "QC" means that +# the aligner did not attempt to map the read due to some +# technical limitation. "RM" means that the read mapped to a set +# of 'contaminant' sequences specified in GERALD's RNA-seq +# workflow. "CONTROL" means that the read is a control. All of +# these alignment types are collapsed to the single unmapped +# alignment state in the SAM record, but the optional SAM "XC" +# field is used to record the original reserved chromosome name of +# the read for all but the "NM" case. +# - The export match descriptor is slightly different than the +# samtools match descriptor. For this reason it is stored in the +# optional SAM field 'XD' (and not 'MD'). Note that the export +# match descriptor differs from the samtools version in two +# respects: (1) indels are explicitly closed with the '$' +# character and (2) insertions must be enumerated in the match +# descriptor. 
For example a 35-base read with a two-base insertion +# is described as: 20^2$14 +# +# +# + +my $version = "2.3.1"; + +use strict; +use warnings; + +use Getopt::Long; +use File::Spec; +use List::Util qw(min max); + + +use constant { + EXPORT_MACHINE => 0, + EXPORT_RUNNO => 1, + EXPORT_LANE => 2, + EXPORT_TILE => 3, + EXPORT_X => 4, + EXPORT_Y => 5, + EXPORT_INDEX => 6, + EXPORT_READNO => 7, + EXPORT_READ => 8, + EXPORT_QUAL => 9, + EXPORT_CHROM => 10, + EXPORT_CONTIG => 11, + EXPORT_POS => 12, + EXPORT_STRAND => 13, + EXPORT_MD => 14, + EXPORT_SEMAP => 15, + EXPORT_PEMAP => 16, + EXPORT_PASSFILT => 21, + EXPORT_SIZE => 22, +}; + + +use constant { + SAM_QNAME => 0, + SAM_FLAG => 1, + SAM_RNAME => 2, + SAM_POS => 3, + SAM_MAPQ => 4, + SAM_CIGAR => 5, + SAM_MRNM => 6, + SAM_MPOS => 7, + SAM_ISIZE => 8, + SAM_SEQ => 9, + SAM_QUAL => 10, +}; + + +# function prototypes for Richard's code +sub match_desc_to_cigar($); +sub match_desc_frag_length($); +sub reverse_compl_match_descriptor($); +sub write_header($;$;$); + + +&export2sam; +exit; + + + + +sub export2sam { + + my $cmdline = $0 . " " . join(" ",@ARGV); + my $arg_count = scalar @ARGV; + my $progname = (File::Spec->splitpath($0))[2]; + + my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. 
"solexa") quality values + my $is_nofilter = 0; + my $read1file; + my $read2file; + my $print_version = 0; + my $help = 0; + + my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, + "nofilter" => \$is_nofilter, + "read1=s" => \$read1file, + "read2=s" => \$read2file, + "version" => \$print_version, + "help" => \$help ); + + my $usage = <) { + $export_line_count++; + my (@s1, @s2); + &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); + if ($is_paired) { + my $read2line = <$fh2>; + if(not $read2line){ + die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); + } + &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); + + if (@s1 && @s2) { # then set mate coordinate + if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ + die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); + } + + my $isize = 0; + if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize + my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; + my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; + $isize = $x2 - $x1; + } + + foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ + my ($sa,$sb,$is) = @{$_}; + if ($sb->[SAM_RNAME] ne '*') { + $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? 
"=" : $sb->[SAM_RNAME]; + $sa->[SAM_MPOS] = $sb->[SAM_POS]; + $sa->[SAM_ISIZE] = $is; + $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); + } else { + $sa->[SAM_FLAG] |= 0x8; + } + } + } + } + print join("\t", @s1), "\n" if (@s1); + print join("\t", @s2), "\n" if (@s2 && $is_paired); + } + close($fh1); + if($is_paired) { + while(my $read2line = <$fh2>){ + $export_line_count++; + die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); + } + close($fh2); + } +} + +sub export2sam_aux { + my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; + chomp($line); + my @t = split("\t", $line); + if(scalar(@t) < EXPORT_SIZE) { + my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n"; + $msg.="\t...erroneous export record:\n" . $line . "\n\n"; + die($msg); + } + @$s = (); + my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); + return if(not ($isPassFilt or $is_nofilter)); + # read name + my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : ""); + $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]), + int($t[EXPORT_X]), int($t[EXPORT_Y])); + # initial flag (will be updated later) + $s->[SAM_FLAG] = 0; + if($is_paired) { + if($t[EXPORT_READNO] != $read_no){ + die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . 
" on line: $line_no\n\n"); + } + $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); + } + $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); + + # read & quality + my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); + if ($is_export_rev) { # then reverse the sequence and quality + $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); + $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; + $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); + } else { + $s->[SAM_SEQ] = $t[EXPORT_READ]; + $s->[SAM_QUAL] = $t[EXPORT_QUAL]; + } + my @convqual = (); + foreach (unpack('C*', $s->[SAM_QUAL])){ + my $val=$ct->[$_]; + if(not defined $val){ + my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; + if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } + die($msg . "\n"); + } + push @convqual,$val; + } + + $s->[SAM_QUAL] = pack('C*',@convqual); # change coding + + + # coor + my $has_coor = 0; + $s->[SAM_RNAME] = "*"; + if (($t[EXPORT_CHROM] eq 'NM') or + ($t[EXPORT_CHROM] eq 'QC') or + ($t[EXPORT_CHROM] eq 'RM') or + ($t[EXPORT_CHROM] eq 'CONTROL')) { + $s->[SAM_FLAG] |= 0x4; # unmapped + push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM'); + } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { + $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? + push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") + } elsif ($t[EXPORT_POS] < 1) { + $s->[SAM_FLAG] |= 0x4; # unmapped + } else { + $s->[SAM_RNAME] = $t[EXPORT_CHROM]; + $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); + $has_coor = 1; + } + $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; + +# print STDERR "t[14] = " . $t[14] . "\n"; + my $matchDesc = ''; + $s->[SAM_CIGAR] = "*"; + if($has_coor){ + $matchDesc = ($is_export_rev) ? 
reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; + + if($matchDesc =~ /\^/){ + # construct CIGAR string using Richard's function + $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing + } else { + $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; + } + } + +# print STDERR "cigar_string = $cigar_string\n"; + + $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); + if($has_coor){ + my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; + my $pemap = 0; + if($is_paired) { + $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; + + # set `proper pair' bit if non-blank, non-zero PE alignment score: + $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); + } + $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); + } else { + $s->[SAM_MAPQ] = 0; + } + # mate coordinate + $s->[SAM_MRNM] = '*'; + $s->[SAM_MPOS] = 0; + $s->[SAM_ISIZE] = 0; + # aux + push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); + if($has_coor){ + # The export match descriptor differs slightly from the samtools match descriptor. + # In order for the converted SAM files to be as compliant as possible, + # we put the export match descriptor in optional field 'XD' rather than 'MD': + push(@$s, "XD:Z:$matchDesc"); + push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); + push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); + } +} + + + +# +# the following code is taken from Richard Shaw's sorted2sam.pl file +# +sub reverse_compl_match_descriptor($) +{ +# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; + my ($match_desc) = @_; + my $rev_compl_match_desc = reverse($match_desc); + $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; + + # Unreverse the digits of numbers. + $rev_compl_match_desc = join('', + map {($_ =~ /\d+/) + ? 
join('', reverse(split('', $_))) + : $_} split(/(\d+)/, + $rev_compl_match_desc)); + + return $rev_compl_match_desc; +} + + + +sub match_desc_to_cigar($) +{ + my ($match_desc) = @_; + + my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); + my $cigar_str = ''; + my $cigar_del_ch = 'D'; + my $cigar_ins_ch = 'I'; + my $cigar_match_ch = 'M'; + + foreach my $match_desc_part (@match_desc_parts) { + next if (!$match_desc_part); + + if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { + # Deletion + $cigar_str .= (length($1) . $cigar_del_ch); + } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { + # Insertion + $cigar_str .= ($1 . $cigar_ins_ch); + } else { + $cigar_str .= (match_desc_frag_length($match_desc_part) + . $cigar_match_ch); + } + } + + return $cigar_str; +} + + +#------------------------------------------------------------------------------ + +sub match_desc_frag_length($) + { + my ($match_desc_str) = @_; + my $len = 0; + + my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); + + foreach my $match_desc_field (@match_desc_fields) { + next if ($match_desc_field eq ''); + + $len += (($match_desc_field =~ /(\d+)/) + ? 
$1 : length($match_desc_field)); + } + + return $len; +} + + +# argument holds the command line +sub write_header($;$;$) +{ + my ($progname,$version,$cl) = @_; + my $complete_header = ""; + $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; + + return $complete_header; +} diff --git a/sam/misc/seqtk.c b/sam/misc/seqtk.c new file mode 100644 index 0000000..591ddff --- /dev/null +++ b/sam/misc/seqtk.c @@ -0,0 +1,783 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) + +typedef struct { + int n, m; + uint64_t *a; +} reglist_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(reg, reglist_t) + +typedef kh_reg_t reghash_t; + +reghash_t *stk_reg_read(const char *fn) +{ + reghash_t *h = kh_init(reg); + gzFile fp; + kstream_t *ks; + int dret; + kstring_t *str; + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int beg = -1, end = -1; + reglist_t *p; + khint_t k = kh_get(reg, h, str->s); + if (k == kh_end(h)) { + int ret; + char *s = strdup(str->s); + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + beg = atoi(str->s); + if (dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { + end = atoi(str->s); + if (end < 0) end = -1; + } + } + } + } + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column + if (beg < 0) beg = 0, end = INT_MAX; + if (p->n == p->m) { + p->m = p->m? 
p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | end; + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return h; +} + +void stk_reg_destroy(reghash_t *h) +{ + khint_t k; + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); +} + +/* constant table */ + +unsigned char seq_nt16_table[256] = { + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, + 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, + 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 +}; + +char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; +unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; +int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + +/* composition */ +int stk_comp(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, upper_only = 0; + reghash_t *h = 0; + reglist_t dummy; + while ((c = getopt(argc, argv, "ur:")) >= 0) { + switch (c) { + case 'u': upper_only = 1; break; + case 'r': h = stk_reg_read(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); + fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, 
#tv, #ts, #CpG-ts\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + seq = kseq_init(fp); + dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); + while ((l = kseq_read(seq)) >= 0) { + int i, k; + reglist_t *p = 0; + if (h) { + khint_t k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) p = &kh_val(h, k); + } else { + p = &dummy; + dummy.a[0] = l; + } + for (k = 0; p && k < p->n; ++k) { + int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; + int la, lb, lc, na, nb, nc, cnt[11]; + if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; + else la = 'a', lb = -1, lc = 0; + na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + memset(cnt, 0, 11 * sizeof(int)); + for (i = beg; i < end; ++i) { + int is_CpG = 0, a, b, c; + a = na; b = nb; c = nc; + na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; + if (b == 2 || b == 10) { // C or Y + if (nb == 4 || nb == 5) is_CpG = 1; + } else if (b == 4 || b == 5) { // G or R + if (lb == 2 || lb == 10) is_CpG = 1; + } + if (upper_only == 0 || isupper(a)) { + if (c > 1) ++cnt[c+2]; + if (c == 1) ++cnt[seq_nt16to4_table[b]]; + if (b == 10 || b == 5) ++cnt[9]; + else if (c == 2) { + ++cnt[8]; + } + if (is_CpG) { + ++cnt[7]; + if (b == 10 || b == 5) ++cnt[10]; + } + } + la = a; lb = b; lc = c; + } + if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); + else printf("%s\t%d", seq->name.s, l); + for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); + putchar('\n'); + } + fflush(stdout); + } + free(dummy.a); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_randbase(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk randbase \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + int i; + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + int c, b, a, j, k, m; + b = seq->seq.s[i]; + c = seq_nt16_table[b]; + a = bitcnt_table[c]; + if (a == 2) { + m = (drand48() < 0.5); + for (j = k = 0; j < 4; ++j) { + if ((1<seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j]; + } + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_hety(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; + char *buf; + uint32_t cnt[3]; + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk hety [options] \n\n"); + fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); + fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); + fprintf(stderr, " -m treat lowercases as masked\n"); + fprintf(stderr, "\n"); + return 1; + } + while ((c = getopt(argc, argv, "w:t:m")) >= 0) { + switch (c) { + case 'w': win_size = atoi(optarg); break; + case 't': n_start = atoi(optarg); break; + case 'm': is_lower_mask = 1; break; + } + } + fp = (strcmp(argv[optind], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + seq = kseq_init(fp); + win_step = win_size / n_start; + buf = calloc(win_size, 1); + while ((l = kseq_read(seq)) >= 0) { + int x, i, y, z, next = 0; + cnt[0] = cnt[1] = cnt[2] = 0; + for (i = 0; i <= l; ++i) { + if ((i >= win_size && i % win_step == 0) || i == l) { + if (i == l && l >= win_size) { + for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; + } + if (cnt[1] + cnt[2] > 0) + printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, + (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); + next = i; + } + if (i < l) { + y = i % win_size; + c = seq->seq.s[i]; + if (is_lower_mask && islower(c)) c = 'N'; + c = seq_nt16_table[c]; + x = bitcnt_table[c]; + if (i >= win_size) --cnt[(int)buf[y]]; + buf[y] = z = x > 2? 0 : x == 2? 2 : 1; + ++cnt[z]; + } + } + } + free(buf); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* fq2fa */ +int stk_fq2fa(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + char *buf; + int l, i, c, qual_thres = 0, linelen = 60; + while ((c = getopt(argc, argv, "q:l:")) >= 0) { + switch (c) { + case 'q': qual_thres = atoi(optarg); break; + case 'l': linelen = atoi(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: seqtk fq2fa [-q qualThres=0] [-l lineLen=60] \n"); + return 1; + } + buf = linelen > 0? malloc(linelen + 1) : 0; + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + if (seq->qual.l && qual_thres > 0) { + for (i = 0; i < l; ++i) + if (seq->qual.s[i] - 33 < qual_thres) + seq->seq.s[i] = tolower(seq->seq.s[i]); + } + putchar('>'); + if (seq->comment.l) { + fputs(seq->name.s, stdout); + putchar(' '); + puts(seq->comment.s); + } else puts(seq->name.s); + if (buf) { // multi-line + for (i = 0; i < l; i += linelen) { + int x = i + linelen < l? 
linelen : l - i; + memcpy(buf, seq->seq.s + i, x); + buf[x] = 0; + puts(buf); + } + } else puts(seq->seq.s); + } + free(buf); + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +int stk_maskseq(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + int l, i, j, c, is_complement = 0, is_lower = 0; + khint_t k; + while ((c = getopt(argc, argv, "cl")) >= 0) { + switch (c) { + case 'c': is_complement = 1; break; + case 'l': is_lower = 1; break; + } + } + if (argc - optind < 2) { + fprintf(stderr, "Usage: seqtk maskseq [-cl] \n\n"); + fprintf(stderr, "Options: -c mask the complement regions\n"); + fprintf(stderr, " -l soft mask (to lower cases)\n"); + return 1; + } + h = stk_reg_read(argv[optind+1]); + // maskseq + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) { // not found in the hash table + if (is_complement) { + for (j = 0; j < l; ++j) + seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; + } + } else { + reglist_t *p = &kh_val(h, k); + if (!is_complement) { + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) { + fprintf(stderr, "[maskseq] start position >= the sequence length.\n"); + continue; + } + if (end >= seq->seq.l) end = seq->seq.l; + if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); + else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N'; + } + } else { + int8_t *mask = calloc(seq->seq.l, 1); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (end >= seq->seq.l) end = seq->seq.l; + for (j = beg; j < end; ++j) mask[j] = 1; + } + for (j = 0; j < l; ++j) + if (mask[j] == 0) seq->seq.s[j] = is_lower? 
tolower(seq->seq.s[j]) : 'N'; + free(mask); + } + } + printf(">%s", seq->name.s); + for (j = 0; j < seq->seq.l; ++j) { + if (j%60 == 0) putchar('\n'); + putchar(seq->seq.s[j]); + } + putchar('\n'); + } + // free + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + return 0; +} + +/* subseq */ + +int stk_subseq(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + int l, i, j, c, is_tab = 0; + khint_t k; + while ((c = getopt(argc, argv, "t")) >= 0) { + switch (c) { + case 't': is_tab = 1; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: seqtk subseq [-t] \n\n"); + fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n"); + return 1; + } + h = stk_reg_read(argv[optind+1]); + // subseq + fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k == kh_end(h)) continue; + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32, end = p->a[i]; + if (beg >= seq->seq.l) { + fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); + continue; + } + if (end > seq->seq.l) end = seq->seq.l; + if (is_tab == 0) { + printf("%c%s", seq->qual.l == seq->seq.l? 
'@' : '>', seq->name.s); + if (end == INT_MAX) { + if (beg) printf(":%d", beg+1); + } else printf(":%d-%d", beg+1, end); + } else printf("%s\t%d\t", seq->name.s, beg + 1); + if (end > seq->seq.l) end = seq->seq.l; + for (j = 0; j < end - beg; ++j) { + if (is_tab == 0 && j % 60 == 0) putchar('\n'); + putchar(seq->seq.s[j + beg]); + } + putchar('\n'); + if (seq->qual.l != seq->seq.l || is_tab) continue; + printf("+"); + for (j = 0; j < end - beg; ++j) { + if (j % 60 == 0) putchar('\n'); + putchar(seq->qual.s[j + beg]); + } + putchar('\n'); + } + } + // free + kseq_destroy(seq); + gzclose(fp); + stk_reg_destroy(h); + return 0; +} + +/* mergefa */ +int stk_mergefa(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0; + while ((c = getopt(argc, argv, "himq:")) >= 0) { + switch (c) { + case 'i': is_intersect = 1; break; + case 'h': is_haploid = 1; break; + case 'm': is_mask = 1; break; + case 'q': qual = atoi(optarg); break; + } + } + if (is_mask && is_intersect) { + fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); + return 1; + } + if (optind + 2 > argc) { + fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); + fprintf(stderr, "Options: -q INT quality threshold [0]\n"); + fprintf(stderr, " -i take intersection\n"); + fprintf(stderr, " -m convert to lowercase when one of the input base is N.\n"); + fprintf(stderr, " -h suppress hets in the input\n\n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? 
gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2], is_upper; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); + if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); + if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; + else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0; + c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; + if (c[0] == 0) c[0] = 15; + if (c[1] == 0) c[1] = 15; + if (is_haploid && (bitcnt_table[c[0]] > 1 || bitcnt_table[c[1]] > 1)) is_upper = 0; + if (is_intersect) { + c[0] = c[0] & c[1]; + if (c[0] == 0) is_upper = 0; + } else if (is_mask) { + if (c[0] == 15 || c[1] == 15) is_upper = 0; + c[0] = c[0] & c[1]; + if (c[0] == 0) is_upper = 0; + } else c[0] = c[0] | c[1]; + c[0] = seq_nt16_rev_table[c[0]]; + if (!is_upper) c[0] = tolower(c[0]); + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + return 0; +} + +int stk_famask(int argc, char *argv[]) +{ + gzFile fp[2]; + kseq_t *seq[2]; + int i, l; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk famask \n"); + return 1; + } + for (i = 0; i < 2; ++i) { + fp[i] = strcmp(argv[optind+i], "-")? 
gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); + seq[i] = kseq_init(fp[i]); + } + while (kseq_read(seq[0]) >= 0) { + int min_l, c[2]; + kseq_read(seq[1]); + if (strcmp(seq[0]->name.s, seq[1]->name.s)) + fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); + if (seq[0]->seq.l != seq[1]->seq.l) + fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); + min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; + printf(">%s", seq[0]->name.s); + for (l = 0; l < min_l; ++l) { + c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; + if (c[1] == 'x') c[0] = tolower(c[0]); + else if (c[1] != 'X') c[0] = c[1]; + if (l%60 == 0) putchar('\n'); + putchar(c[0]); + } + putchar('\n'); + } + return 0; +} + +int stk_mutfa(int argc, char *argv[]) +{ + khash_t(reg) *h = kh_init(reg); + gzFile fp; + kseq_t *seq; + kstream_t *ks; + int l, i, dret; + kstring_t *str; + khint_t k; + if (argc < 3) { + fprintf(stderr, "Usage: seqtk mutfa \n\n"); + fprintf(stderr, "Note: contains at least four columns per line which are:\n"); + fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); + return 1; + } + // read the list + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + char *s = strdup(str->s); + int beg = 0, ret; + reglist_t *p; + k = kh_get(reg, h, s); + if (k == kh_end(h)) { + k = kh_put(reg, h, s, &ret); + memset(&kh_val(h, k), 0, sizeof(reglist_t)); + } + p = &kh_val(h, k); + if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col + ks_getuntil(ks, 0, str, &dret); // 3rd col + ks_getuntil(ks, 0, str, &dret); // 4th col + // skip the rest of the line + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (isalpha(str->s[0]) && str->l == 1) { + if (p->n == p->m) { + p->m = p->m? 
p->m<<1 : 4; + p->a = realloc(p->a, p->m * 8); + } + p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; + } + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + // mutfa + fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + reglist_t *p; + k = kh_get(reg, h, seq->name.s); + if (k != kh_end(h)) { + p = &kh_val(h, k); + for (i = 0; i < p->n; ++i) { + int beg = p->a[i]>>32; + if (beg < seq->seq.l) + seq->seq.s[beg] = (int)p->a[i]; + } + } + printf(">%s", seq->name.s); + for (i = 0; i < l; ++i) { + if (i%60 == 0) putchar('\n'); + putchar(seq->seq.s[i]); + } + putchar('\n'); + } + // free + kseq_destroy(seq); + gzclose(fp); + for (k = 0; k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free(kh_val(h, k).a); + free((char*)kh_key(h, k)); + } + } + kh_destroy(reg, h); + return 0; +} + +int stk_listhet(int argc, char *argv[]) +{ + gzFile fp; + kseq_t *seq; + int i, l; + if (argc == 1) { + fprintf(stderr, "Usage: seqtk listhet \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); + seq = kseq_init(fp); + while ((l = kseq_read(seq)) >= 0) { + for (i = 0; i < l; ++i) { + int b = seq->seq.s[i]; + if (bitcnt_table[seq_nt16_table[b]] == 2) + printf("%s\t%d\t%c\n", seq->name.s, i+1, b); + } + } + kseq_destroy(seq); + gzclose(fp); + return 0; +} + +/* cutN */ +static int cutN_min_N_tract = 1000; +static int cutN_nonN_penalty = 10; + +static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) +{ + int i, b, e; + while (k < ks->seq.l) { + if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { + int score, max; + score = 0; e = max = -1; + for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, e = i; + } + score = 0; b = max = -1; + for (i = e; i >= 0 && score >= 0; --i) { /* backward */ + if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; + else score -= cutN_nonN_penalty; + if (score > max) max = score, b = i; + } + if (e + 1 - b >= cutN_min_N_tract) { + *begin = b; + *end = e + 1; + return *end; + } + k = e + 1; + } else ++k; + } + return -1; +} +static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) +{ + int i; + if (begin >= end) return; // FIXME: why may this happen? Understand it! 
+ fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end); + for (i = begin; i < end && i < ks->seq.l; ++i) { + if ((i - begin)%60 == 0) fputc('\n', fpout); + fputc(ks->seq.s[i], fpout); + } + fputc('\n', fpout); +} +int stk_cutN(int argc, char *argv[]) +{ + int c, l, gap_only = 0; + gzFile fp; + kseq_t *ks; + while ((c = getopt(argc, argv, "n:p:g")) >= 0) { + switch (c) { + case 'n': cutN_min_N_tract = atoi(optarg); break; + case 'p': cutN_nonN_penalty = atoi(optarg); break; + case 'g': gap_only = 1; break; + default: return 1; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); + fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); + fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); + fprintf(stderr, " -g print gaps only, no sequence\n\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); + ks = kseq_init(fp); + while ((l = kseq_read(ks)) >= 0) { + int k = 0, begin = 0, end = 0; + while (find_next_cut(ks, k, &begin, &end) >= 0) { + if (begin != 0) { + if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); + else print_seq(stdout, ks, k, begin); + } + k = end; + } + if (!gap_only) print_seq(stdout, ks, k, l); + } + kseq_destroy(ks); + gzclose(fp); + return 0; +} + +/* main function */ +static int usage() +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: seqtk \n\n"); + fprintf(stderr, "Command: comp get the nucleotide composite of FASTA/Q\n"); + fprintf(stderr, " hety regional heterozygosity\n"); + fprintf(stderr, " fq2fa convert FASTQ to FASTA\n"); + fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); + fprintf(stderr, " maskseq mask sequences\n"); + fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); + fprintf(stderr, " mergefa merge two FASTA/Q files\n"); + fprintf(stderr, " randbase choose a random base from hets\n"); + fprintf(stderr, " cutN cut 
sequence at long N\n"); + fprintf(stderr, " listhet extract the position of each het\n"); + fprintf(stderr, "\n"); + return 1; +} + +int main(int argc, char *argv[]) +{ + if (argc == 1) return usage(); + if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); + else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); + else if (strcmp(argv[1], "fq2fa") == 0) stk_fq2fa(argc-1, argv+1); + else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); + else if (strcmp(argv[1], "maskseq") == 0) stk_maskseq(argc-1, argv+1); + else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); + else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); + else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); + else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); + else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); + else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); + else { + fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]); + return 1; + } + return 0; +} diff --git a/sam/misc/wgsim.c b/sam/misc/wgsim.c index 7b5f095..b9c513c 100644 --- a/sam/misc/wgsim.c +++ b/sam/misc/wgsim.c @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). + 2011 Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -23,11 +24,8 @@ SOFTWARE. */ -/* Contact: Heng Li */ - /* This program is separated from maq's read simulator with Colin - * Hercus' modification to allow longer indels. Colin is the chief - * developer of novoalign. */ + * Hercus' modification to allow longer indels. 
*/ #include #include @@ -38,8 +36,11 @@ #include #include #include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) -#define PACKAGE_VERSION "0.2.3" +#define PACKAGE_VERSION "0.3.0" const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -60,8 +61,6 @@ const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; -const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; - /* Simple normal random number generator, copied from genran.c */ double ran_normal() @@ -85,78 +84,6 @@ double ran_normal() } } -/* FASTA parser, copied from seq.c */ - -typedef struct { - int l, m; /* length and maximum buffer size */ - unsigned char *s; /* sequence */ -} seq_t; - -#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 - -static int SEQ_BLOCK_SIZE = 512; - -void seq_set_block_size(int size) -{ - SEQ_BLOCK_SIZE = size; -} - -int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) -{ - int c, l, max; - char *p; - - c = 0; - while (!feof(fp) && fgetc(fp) != '>'); - if (feof(fp)) return -1; - p = locus; - while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') - if (c != '\r') *p++ = c; - *p = '\0'; - if (comment) { - p = comment; - if (c != '\n') { - while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); - if (c != '\n') { - *p++ = c; - while (!feof(fp) && (c = fgetc(fp)) != '\n') - if (c != '\r') *p++ = c; - } - } - *p = '\0'; - } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); - l = 0; max = seq->m; - while (!feof(fp) && (c = fgetc(fp)) != '>') { - if (isalpha(c) || c == '-' || c == '.') { - if (l + 1 >= max) { - max += SEQ_BLOCK_SIZE; - seq->s = (unsigned char*)realloc(seq->s, sizeof(char) * max); - } - seq->s[l++] = (unsigned char)c; - } - } - if (c == '>') ungetc(c,fp); - seq->s[l] = 0; - seq->m = max; seq->l = l; - return l; -} - -/* Error-checking open, copied from utils.c */ - -#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) 
- -FILE *err_xopen_core(const char *func, const char *fn, const char *mode) -{ - FILE *fp = 0; - if (strcmp(fn, "-") == 0) - return (strstr(mode, "r"))? stdin : stdout; - if ((fp = fopen(fn, mode)) == 0) { - fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); - abort(); - } - return fp; -} - /* wgsim */ enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; @@ -170,24 +97,23 @@ typedef struct { static double ERR_RATE = 0.02; static double MUT_RATE = 0.001; -static double INDEL_FRAC = 0.1; +static double INDEL_FRAC = 0.15; static double INDEL_EXTEND = 0.3; -static int IS_SOLID = 0; -static int SHOW_MM_INFO = 1; +static double MAX_N_RATIO = 0.1; -void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) +void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2) { int i, deleting = 0; mutseq_t *ret[2]; ret[0] = hap1; ret[1] = hap2; - ret[0]->l = seq->l; ret[1]->l = seq->l; - ret[0]->m = seq->m; ret[1]->m = seq->m; - ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); - ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); - for (i = 0; i != seq->l; ++i) { + ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l; + ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m; + ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); + for (i = 0; i != ks->seq.l; ++i) { int c; - c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; + c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]]; if (deleting) { if (drand48() < INDEL_EXTEND) { if (deleting & 1) ret[0]->s[i] |= DELETE; @@ -230,12 +156,12 @@ void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) } } } -void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) +void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2) { int i; - for (i = 0; i != seq->l; ++i) { + for 
(i = 0; i != ks->seq.l; ++i) { int c[3]; - c[0] = nst_nt4_table[(int)seq->s[i]]; + c[0] = nst_nt4_table[(int)ks->seq.s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { @@ -248,8 +174,9 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins printf("-\t"); int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; - while(n > 0) { + while (n > 0) { putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; } printf("\t-\n"); @@ -266,6 +193,7 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; while (n > 0) { putchar("ACGTN"[ins & 0x3]); + ins >>= 2; n--; } printf("\t+\n"); @@ -284,46 +212,51 @@ void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq } } -void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) +void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) { - seq_t seq; + kseq_t *ks; mutseq_t rseq[2]; + gzFile fp_fa; uint64_t tot_len, ii; int i, l, n_ref; - char name[256], *qstr; - int size[2], Q; + char *qstr; + int size[2], Q, max_size; uint8_t *tmp_seq[2]; mut_t *target; - INIT_SEQ(seq); - srand48(time(0)); - seq_set_block_size(0x1000000); l = size_l > size_r? size_l : size_r; qstr = (char*)calloc(l+1, 1); tmp_seq[0] = (uint8_t*)calloc(l+2, 1); tmp_seq[1] = (uint8_t*)calloc(l+2, 1); size[0] = size_l; size[1] = size_r; + max_size = size_l > size_r? size_l : size_r; Q = (ERR_RATE == 0.0)? 
'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); tot_len = n_ref = 0; - while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__); + while ((l = kseq_read(ks)) >= 0) { tot_len += l; ++n_ref; } - fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len); - rewind(fp_fa); + fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len); + kseq_destroy(ks); + gzclose(fp_fa); - while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { + fp_fa = gzopen(fn, "r"); + ks = kseq_init(fp_fa); + while ((l = kseq_read(ks)) >= 0) { uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); if (l < dist + 3 * std_dev) { - fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev); + fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev); continue; } // generate mutations and print them out - maq_mut_diref(&seq, is_hap, rseq, rseq+1); - maq_print_mutref(name, &seq, rseq, rseq+1); + wgsim_mut_diref(ks, is_hap, rseq, rseq+1); + wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1); for (ii = 0; ii != n_pairs; ++ii) { // the core loop double ran; @@ -335,8 +268,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, ran = ran_normal(); ran = ran * std_dev + dist; d = (int)(ran + 0.5); + d = d > max_size? 
d : max_size; pos = (int)((l - d + 1) * drand48()); - } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l); + } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l); // flip or not if (drand48() < 0.5) { @@ -353,7 +287,7 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; #define __gen_read(x, start, iter) do { \ - for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ + for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \ int c = target[i], mut_type = c & mutmsk; \ if (ext_coor[x] < 0) { \ if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ @@ -374,33 +308,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, if (k != s[x]) ext_coor[x] = -10; \ } while (0) - if (!IS_SOLID) { - __gen_read(0, pos, ++i); - __gen_read(1, pos + d - 1, --i); - for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement - } else { - int c1, c2, c; - ++s[0]; ++s[1]; // temporarily increase read length by 1 - if (is_flip) { // RR pair - __gen_read(0, pos + s[0], --i); - __gen_read(1, pos + d - 1, --i); - } else { // FF pair - __gen_read(0, pos, ++i); - __gen_read(1, pos + d - 1 - s[1], ++i); - ++ext_coor[0]; ++ext_coor[1]; - } - // change to color sequence: (0,1,2,3) -> (A,C,G,T) - for (j = 0; j < 2; ++j) { - c1 = tmp_seq[j][0]; - for (i = 1; i < s[j]; ++i) { - c2 = tmp_seq[j][i]; - c = (c1 >= 4 || c2 >= 4)? 
4 : nst_color_space_table[(1<= 4) c = 4; // actually c should be never larger than 4 if everything is correct - else if (drand48() < ERR_RATE) { - c = (c + (int)(drand48() * 3.0 + 1)) & 3; + if (c >= 4) { // actually c should be never larger than 4 if everything is correct + c = 4; + ++n_n; + } else if (drand48() < ERR_RATE) { + // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors + c = (c + 1) & 3; // recurrent sequencing errors ++n_err[j]; } tmp_seq[j][i] = c; } + if ((double)n_n / s[j] > MAX_N_RATIO) break; + } + if (j < 2) { // too many ambiguous bases on one of the reads + --ii; + continue; } // print for (j = 0; j < 2; ++j) { for (i = 0; i < s[j]; ++i) qstr[i] = Q; qstr[i] = 0; - if (SHOW_MM_INFO) { - fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], - (long long)ii, j==0? is_flip+1 : 2-is_flip); - } else { - fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1, - (long long)ii, j==0? is_flip+1 : 2-is_flip, - n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]); - } + fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1, + n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], + (long long)ii, j==0? 
is_flip+1 : 2-is_flip); for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); fprintf(fpo[j], "\n+\n%s\n", qstr); @@ -439,7 +352,9 @@ void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, } free(rseq[0].s); free(rseq[1].s); } - free(seq.s); free(qstr); + kseq_destroy(ks); + gzclose(fp_fa); + free(qstr); free(tmp_seq[0]); free(tmp_seq[1]); } @@ -459,11 +374,9 @@ static int simu_usage() fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); - fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n"); - fprintf(stderr, " -C show mismatch info in comment rather than read name\n"); + fprintf(stderr, " -S INT seed for random generator [-1]\n"); fprintf(stderr, " -h haplotype mode\n"); fprintf(stderr, "\n"); - fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n"); return 1; } @@ -471,11 +384,12 @@ int main(int argc, char *argv[]) { int64_t N; int dist, std_dev, c, size_l, size_r, is_hap = 0; - FILE *fpout1, *fpout2, *fp_fa; + FILE *fpout1, *fpout2; + int seed = -1; N = 1000000; dist = 500; std_dev = 50; size_l = size_r = 70; - while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) { + while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) { switch (c) { case 'd': dist = atoi(optarg); break; case 's': std_dev = atoi(optarg); break; @@ -486,17 +400,20 @@ int main(int argc, char *argv[]) case 'r': MUT_RATE = atof(optarg); break; case 'R': INDEL_FRAC = atof(optarg); break; case 'X': INDEL_EXTEND = atof(optarg); break; - case 'c': IS_SOLID = 1; break; - case 'C': SHOW_MM_INFO = 0; break; + case 'S': seed = atoi(optarg); break; case 'h': is_hap = 1; break; } } if (argc - optind < 3) return simu_usage(); - fp_fa = (strcmp(argv[optind+0], "-") == 0)? 
stdin : xopen(argv[optind+0], "r"); - fpout1 = xopen(argv[optind+1], "w"); - fpout2 = xopen(argv[optind+2], "w"); - wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r); + fpout1 = fopen(argv[optind+1], "w"); + fpout2 = fopen(argv[optind+2], "w"); + if (!fpout1 || !fpout2) { + fprintf(stderr, "[wgsim] file open error\n"); + return 1; + } + srand48(seed > 0? seed : time(0)); + wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r); - fclose(fpout1); fclose(fpout2); fclose(fp_fa); + fclose(fpout1); fclose(fpout2); return 0; } diff --git a/sam/phase.c b/sam/phase.c new file mode 100644 index 0000000..ef4eff9 --- /dev/null +++ b/sam/phase.c @@ -0,0 +1,687 @@ +#include +#include +#include +#include +#include +#include +#include "bam.h" +#include "errmod.h" + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define MAX_VARS 256 +#define FLIP_PENALTY 2 +#define FLIP_THRES 4 +#define MASK_THRES 3 + +#define FLAG_FIX_CHIMERA 0x1 +#define FLAG_LIST_EXCL 0x4 +#define FLAG_DROP_AMBI 0x8 + +typedef struct { + // configurations, initialized in the main function + int flag, k, min_baseQ, min_varLOD, max_depth; + // other global variables + int vpos_shift; + bamFile fp; + char *pre; + bamFile out[3]; + // alignment queue + int n, m; + bam1_t **b; +} phaseg_t; + +typedef struct { + int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation! 
+ int vpos, beg, end; + uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1; + uint32_t in:16, out:16; // in-phase and out-phase +} frag_t, *frag_p; + +#define rseq_lt(a,b) ((a)->vpos < (b)->vpos) + +#include "khash.h" +KHASH_SET_INIT_INT64(set64) +KHASH_MAP_INIT_INT64(64, frag_t) + +typedef khash_t(64) nseq_t; + +#include "ksort.h" +KSORT_INIT(rseq, frag_p, rseq_lt) + +static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; + +static inline uint64_t X31_hash_string(const char *s) +{ + uint64_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} + +static void count1(int l, const uint8_t *seq, int *cnt) +{ + int i, j, n_ambi; + uint32_t z, x; + if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous + for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases + if (seq[i] == 0) ++n_ambi; + if (l - n_ambi <= 1) return; // only one SNP + for (x = 0; x < 1u<>j&1; + ++j; + } + z = z<<1 | c; + } + ++cnt[z]; + } +} + +static int **count_all(int l, int vpos, nseq_t *hash) +{ + khint_t k; + int i, j, **cnt; + uint8_t *seq; + seq = calloc(l, 1); + cnt = calloc(vpos, sizeof(void*)); + for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<vpos >= vpos || f->single) continue; // out of region; or singleton + if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right + f->single = 1; + continue; + } + for (j = 1; j < f->vlen; ++j) { + for (i = 0; i < l; ++i) + seq[i] = j < l - 1 - i? 
0 : f->seq[j - (l - 1 - i)]; + count1(l, seq, cnt[f->vpos + j]); + } + } + } + free(seq); + return cnt; +} + +// phasing +static int8_t *dynaprog(int l, int vpos, int **w) +{ + int *f[2], *curr, *prev, max, i; + int8_t **b, *h = 0; + uint32_t x, z = 1u<<(l-1), mask = (1u<>1; y1 = xc>>1; + c0 = prev[y0] + wi[x] + wi[xc]; + c1 = prev[y1] + wi[x] + wi[xc]; + if (c0 > c1) bi[x] = 0, curr[x] = c0; + else bi[x] = 1, curr[x] = c1; + } + tmp = prev; prev = curr; curr = tmp; // swap + } + { // backtrack + uint32_t max_x = 0; + int which = 0; + h = calloc(vpos, 1); + for (x = 0, max = 0, max_x = 0; x < z; ++x) + if (prev[x] > max) max = prev[x], max_x = x; + for (i = vpos - 1, x = max_x; i >= 0; --i) { + h[i] = which? (~x&1) : (x&1); + which = b[i][x]? !which : which; + x = b[i][x]? (~x&mask)>>1 : x>>1; + } + } + // free + for (i = 0; i < vpos; ++i) free(b[i]); + free(f[0]); free(f[1]); free(b); + return h; +} + +// phase each fragment +static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip) +{ + khint_t k; + uint64_t *pcnt; + uint32_t *left, *rght, max; + left = rght = 0; max = 0; + pcnt = calloc(vpos, 8); + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + int i, c[2]; + frag_t *f = &kh_val(hash, k); + if (f->vpos >= vpos) continue; + // get the phase + c[0] = c[1] = 0; + for (i = 0; i < f->vlen; ++i) { + if (f->seq[i] == 0) continue; + ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1]; + } + f->phase = c[0] > c[1]? 0 : 1; + f->in = c[f->phase]; f->out = c[1 - f->phase]; + f->phased = f->in == f->out? 0 : 1; + f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0; + // fix chimera + f->flip = 0; + if (flip && c[0] >= 3 && c[1] >= 3) { + int sum[2], m, mi, md; + if (f->vlen > max) { // enlarge the array + max = f->vlen; + kroundup32(max); + left = realloc(left, max * 4); + rght = realloc(rght, max * 4); + } + for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts + if (f->seq[i]) { + int c = f->phase? 
2 - f->seq[i] : f->seq[i] - 1; + ++sum[c == path[f->vpos + i]? 0 : 1]; + } + left[i] = sum[1]<<16 | sum[0]; + } + for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts + if (f->seq[i]) { + int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; + ++sum[c == path[f->vpos + i]? 0 : 1]; + } + rght[i] = sum[1]<<16 | sum[0]; + } + // find the best flip point + for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) { + int a[2]; + a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY; + a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY; + if (a[0] > a[1]) { + if (a[0] > m) m = a[0], md = 0, mi = i; + } else { + if (a[1] > m) m = a[1], md = 1, mi = i; + } + } + if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip + f->flip = 1; + if (md == 0) { // flip the tail + for (i = mi + 1; i < f->vlen; ++i) + if (f->seq[i] == 1) f->seq[i] = 2; + else if (f->seq[i] == 2) f->seq[i] = 1; + } else { // flip the head + for (i = 0; i <= mi; ++i) + if (f->seq[i] == 1) f->seq[i] = 2; + else if (f->seq[i] == 2) f->seq[i] = 1; + } + } + } + // update pcnt[] + if (!f->single) { + for (i = 0; i < f->vlen; ++i) { + int c; + if (f->seq[i] == 0) continue; + c = f->phase? 2 - f->seq[i] : f->seq[i] - 1; + if (c == path[f->vpos + i]) { + if (f->phase == 0) ++pcnt[f->vpos + i]; + else pcnt[f->vpos + i] += 1ull<<32; + } else { + if (f->phase == 0) pcnt[f->vpos + i] += 1<<16; + else pcnt[f->vpos + i] += 1ull<<48; + } + } + } + } + } + free(left); free(rght); + return pcnt; +} + +static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n) +{ + int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0; + uint64_t *list = 0; + for (i = 0; i < vpos; ++i) { + uint64_t x = pcnt[i]; + int c[4], pre = score, s; + c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff; + s = (c[1] + c[3] == 0)? 
-(c[0] + c[2]) : (c[1] + c[3] - 1); + if (c[3] > c[2]) s += c[3] - c[2]; + if (c[1] > c[0]) s += c[1] - c[0]; + score += s; + if (score < 0) score = 0; + if (pre == 0 && score > 0) beg = i; // change from zero to non-zero + if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) { + if (n == m) { + m = m? m<<1 : 4; + list = realloc(list, m * 8); + } + list[n++] = (uint64_t)beg<<32 | max_i; + i = max_i; // reset i to max_i + score = 0; + } else if (score > max) max = score, max_i = i; + if (score == 0) max = 0; + } + *_n = n; + return list; +} + +// trim heading and tailing ambiguous bases; mark deleted and remove sequence +static int clean_seqs(int vpos, nseq_t *hash) +{ + khint_t k; + int ret = 0; + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + int beg, end, i; + if (f->vpos >= vpos) { + ret = 1; + continue; + } + for (i = 0; i < f->vlen; ++i) + if (f->seq[i] != 0) break; + beg = i; + for (i = f->vlen - 1; i >= 0; --i) + if (f->seq[i] != 0) break; + end = i + 1; + if (end - beg <= 0) kh_del(64, hash, k); + else { + if (beg != 0) memmove(f->seq, f->seq + beg, end - beg); + f->vpos += beg; f->vlen = end - beg; + f->single = f->vlen == 1? 1 : 0; + } + } + } + return ret; +} + +static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) +{ + int i, is_flip, drop_ambi; + drop_ambi = g->flag & FLAG_DROP_AMBI; + is_flip = (drand48() < 0.5); + for (i = 0; i < g->n; ++i) { + int end, which; + uint64_t key; + khint_t k; + bam1_t *b = g->b[i]; + key = X31_hash_string(bam1_qname(b)); + end = bam_calend(&b->core, bam1_cigar(b)); + if (end > min_pos) break; + k = kh_get(64, hash, key); + if (k == kh_end(hash)) which = 3; + else { + frag_t *f = &kh_val(hash, k); + if (f->ambig) which = drop_ambi? 
2 : 3; + else if (f->phased && f->flip) which = 2; + else if (f->phased == 0) which = 3; + else { // phased and not flipped + char c = 'Y'; + which = f->phase; + bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c); + } + if (which < 2 && is_flip) which = 1 - which; // increase the randomness + } + if (which == 3) which = (drand48() < 0.5); + bam_write1(g->out[which], b); + bam_destroy1(b); + g->b[i] = 0; + } + memmove(g->b, g->b + i, (g->n - i) * sizeof(void*)); + g->n -= i; +} + +static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash) +{ + int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos; + khint_t k; + frag_t **seqs; + int8_t *path, *sitemask; + uint64_t *pcnt, *regmask; + + if (vpos == 0) return 0; + i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos + min_pos = i? cns[vpos]>>32 : 0x7fffffff; + if (vpos == 1) { + printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1); + printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1, + "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1); + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + if (f->vpos) continue; + f->flip = 0; + if (f->seq[0] == 0) f->phased = 0; + else f->phased = 1, f->phase = f->seq[0] - 1; + } + } + dump_aln(g, min_pos, hash); + ++g->vpos_shift; + return 1; + } + { // phase + int **cnt; + uint64_t *mask; + printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1); + sitemask = calloc(vpos, 1); + cnt = count_all(g->k, vpos, hash); + path = dynaprog(g->k, vpos, cnt); + for (i = 0; i < vpos; ++i) free(cnt[i]); + free(cnt); + pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking + mask = genmask(vpos, pcnt, &n_masked); + regmask = calloc(n_masked, 8); + for (i = 0; i < n_masked; ++i) { + regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32; + for (j = 
mask[i]>>32; j <= (int32_t)mask[i]; ++j) + sitemask[j] = 1; + } + free(mask); + if (g->flag & FLAG_FIX_CHIMERA) { + free(pcnt); + pcnt = fragphase(vpos, path, hash, 1); + } + } + for (i = 0; i < n_masked; ++i) + printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1); + for (i = 0; i < vpos; ++i) { + uint64_t x = pcnt[i]; + int8_t c[2]; + c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3); + c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3); + printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]], + i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff)); + } + free(path); free(pcnt); free(regmask); free(sitemask); + seqs = calloc(n_seqs, sizeof(void*)); + for (k = 0, i = 0; k < kh_end(hash); ++k) + if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single) + seqs[i++] = &kh_val(hash, k); + n_seqs = i; + ks_introsort_rseq(n_seqs, seqs); + for (i = 0; i < n_seqs; ++i) { + frag_t *f = seqs[i]; + printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen); + for (j = 0; j < f->vlen; ++j) { + uint32_t c = cns[f->vpos + j]; + if (f->seq[j] == 0) putchar('N'); + else putchar("ACGT"[f->seq[j] == 1? 
(c&3) : (c>>16&3)]); + } + printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1); + } + free(seqs); + printf("//\n"); + fflush(stdout); + g->vpos_shift += vpos; + dump_aln(g, min_pos, hash); + return vpos; +} + +static void update_vpos(int vpos, nseq_t *hash) +{ + khint_t k; + for (k = 0; k < kh_end(hash); ++k) { + if (kh_exist(hash, k)) { + frag_t *f = &kh_val(hash, k); + if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it + else f->vpos -= vpos; + } + } +} + +static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement +{ + return hash; +} + +static int readaln(void *data, bam1_t *b) +{ + phaseg_t *g = (phaseg_t*)data; + int ret; + ret = bam_read1(g->fp, b); + if (ret < 0) return ret; + if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) { + if (g->n == g->m) { + g->m = g->m? g->m<<1 : 16; + g->b = realloc(g->b, g->m * sizeof(void*)); + } + g->b[g->n++] = bam_dup1(b); + } + return ret; +} + +static khash_t(set64) *loadpos(const char *fn, bam_header_t *h) +{ + gzFile fp; + kstream_t *ks; + int ret, dret; + kstring_t *str; + khash_t(set64) *hash; + + hash = kh_init(set64); + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int tid = bam_get_tid(h, str->s); + if (tid >= 0 && dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) >= 0) { + uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); + kh_put(set64, hash, x, &ret); + } else break; + } + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (dret < 0) break; + } + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return hash; +} + +static int gl2cns(float q[16]) +{ + int i, j, min_ij; + float min, min2; + min = min2 = 1e30; min_ij = -1; + for (i = 0; i < 4; ++i) { + for (j = i; j < 4; ++j) { + if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j]; + else if (q[i<<2|j] < min2) min2 = q[i<<2|j]; + } + } + return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2; +} + +int main_phase(int argc, char *argv[]) +{ + extern void bam_init_header_hash(bam_header_t *header); + int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0; + const bam_pileup1_t *plp; + bam_plp_t iter; + bam_header_t *h; + nseq_t *seqs; + uint64_t *cns = 0; + phaseg_t g; + char *fn_list = 0; + khash_t(set64) *set = 0; + errmod_t *em; + uint16_t *bases; + + memset(&g, 0, sizeof(phaseg_t)); + g.flag = FLAG_FIX_CHIMERA; + g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; + while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) { + switch (c) { + case 'D': g.max_depth = atoi(optarg); break; + case 'q': g.min_varLOD = atoi(optarg); break; + case 'Q': g.min_baseQ = atoi(optarg); break; + case 'k': g.k = atoi(optarg); break; + case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break; + case 'e': g.flag |= FLAG_LIST_EXCL; break; + case 'A': g.flag |= FLAG_DROP_AMBI; break; + case 'b': g.pre = strdup(optarg); break; + case 'l': fn_list = strdup(optarg); break; + } + } + if (argc == optind) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools phase 
[options] \n\n"); + fprintf(stderr, "Options: -k INT block length [%d]\n", g.k); + fprintf(stderr, " -b STR prefix of BAMs to output [null]\n"); + fprintf(stderr, " -q INT min het phred-LOD [%d]\n", g.min_varLOD); + fprintf(stderr, " -Q INT min base quality in het calling [%d]\n", g.min_baseQ); + fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth); +// fprintf(stderr, " -l FILE list of sites to phase [null]\n"); + fprintf(stderr, " -F do not attempt to fix chimeras\n"); + fprintf(stderr, " -A drop reads with ambiguous phase\n"); +// fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); + fprintf(stderr, "\n"); + return 1; + } + g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + h = bam_header_read(g.fp); + if (fn_list) { // read the list of sites to phase + bam_init_header_hash(h); + set = loadpos(fn_list, h); + free(fn_list); + } else g.flag &= ~FLAG_LIST_EXCL; + if (g.pre) { // open BAMs to write + char *s = malloc(strlen(g.pre) + 20); + strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w"); + strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w"); + strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w"); + for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h); + free(s); + } + + iter = bam_plp_init(readaln, &g); + g.vpos_shift = 0; + seqs = kh_init(64); + em = errmod_init(1. 
- 0.83); + bases = calloc(g.max_depth, 2); + printf("CC\n"); + printf("CC\tDescriptions:\nCC\n"); + printf("CC\t CC comments\n"); + printf("CC\t PS start of a phase set\n"); + printf("CC\t FL filtered region\n"); + printf("CC\t M[012] markers; 0 for singletons, 1 for phased and 2 for filtered\n"); + printf("CC\t EV supporting reads; SAM format\n"); + printf("CC\t // end of a phase set\nCC\n"); + printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n"); + printf("CC\t PS chr phaseSetStart phaseSetEnd\n"); + printf("CC\t FL chr filterStart filterEnd\n"); + printf("CC\t M? chr PS pos allele0 allele1 hetIndex #supports0 #errors0 #supp1 #err1\n"); + printf("CC\nCC\n"); + fflush(stdout); + while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { + int i, k, c, tmp, dophase = 1, in_set = 0; + float q[16]; + if (tid < 0) break; + if (tid != lasttid) { // change of chromosome + g.vpos_shift = 0; + if (lasttid >= 0) { + seqs = shrink_hash(seqs); + phase(&g, h->target_name[lasttid], vpos, cns, seqs); + update_vpos(0x7fffffff, seqs); + } + lasttid = tid; + vpos = 0; + } + if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1; + if (n > g.max_depth) continue; // do not proceed if the depth is too high + // fill the bases array and check if there is a variant + for (i = k = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint8_t *seq; + int q, baseQ, b; + if (p->is_del || p->is_refskip) continue; + baseQ = bam1_qual(p->b)[p->qpos]; + if (baseQ < g.min_baseQ) continue; + seq = bam1_seq(p->b); + b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)]; + if (b > 3) continue; + q = baseQ < p->b->core.qual? 
baseQ : p->b->core.qual; + if (q < 4) q = 4; + if (q > 63) q = 63; + bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; + } + if (k == 0) continue; + errmod_cal(em, k, 4, bases, q); // compute genotype likelihood + c = gl2cns(q); // get the consensus + // tell if to proceed + if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list + if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant + // add the variant + if (vpos == max_vpos) { + max_vpos = max_vpos? max_vpos<<1 : 128; + cns = realloc(cns, max_vpos * 8); + } + cns[vpos] = (uint64_t)pos<<32 | c; + for (i = 0; i < n; ++i) { + const bam_pileup1_t *p = plp + i; + uint64_t key; + khint_t k; + uint8_t *seq = bam1_seq(p->b); + frag_t *f; + if (p->is_del || p->is_refskip) continue; + if (p->b->core.qual == 0) continue; + // get the base code + c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)]; + if (c == (cns[vpos]&3)) c = 1; + else if (c == (cns[vpos]>>16&3)) c = 2; + else c = 0; + // write to seqs + key = X31_hash_string(bam1_qname(p->b)); + k = kh_put(64, seqs, key, &tmp); + f = &kh_val(seqs, k); + if (tmp == 0) { // present in the hash table + if (vpos - f->vpos + 1 < MAX_VARS) { + f->vlen = vpos - f->vpos + 1; + f->seq[f->vlen-1] = c; + f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); + } + dophase = 0; + } else { // absent + memset(f->seq, 0, MAX_VARS); + f->beg = p->b->core.pos; + f->end = bam_calend(&p->b->core, bam1_cigar(p->b)); + f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0; + } + } + if (dophase) { + seqs = shrink_hash(seqs); + phase(&g, h->target_name[tid], vpos, cns, seqs); + update_vpos(vpos, seqs); + cns[0] = cns[vpos]; + vpos = 0; + } + ++vpos; + } + if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs); + bam_header_destroy(h); + bam_plp_destroy(iter); + bam_close(g.fp); + kh_destroy(64, seqs); + kh_destroy(set64, set); + free(cns); + errmod_destroy(em); + free(bases); + if (g.pre) { + for (c = 0; c <= 2; 
++c) bam_close(g.out[c]); + free(g.pre); free(g.b); + } + return 0; +} diff --git a/sam/sam.c b/sam/sam.c index ecdee02..f026bc8 100644 --- a/sam/sam.c +++ b/sam/sam.c @@ -40,9 +40,9 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); - if (mode[0] == 'r') { // read + if (strchr(mode, 'r')) { // read fp->type |= TYPE_READ; - if (mode[1] == 'b') { // binary + if (strchr(mode, 'b')) { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; @@ -59,15 +59,19 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } - if (fp->header->n_targets == 0) + if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); - } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); + } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } - } else if (mode[0] == 'w') { // write + } else if (strchr(mode, 'w')) { // write fp->header = bam_header_dup((const bam_header_t*)aux); - if (mode[1] == 'b') { // binary + if (strchr(mode, 'b')) { // binary char bmode[3]; - bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 'u' : 0; bmode[2] = 0; + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0; fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? 
bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); if (fp->x.bam == 0) goto open_err_ret; @@ -76,11 +80,11 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamr == 0) goto open_err_ret; - if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2; - else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2; + if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; + else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; else fp->type |= BAM_OFDEC<<2; // write header - if (strstr(mode, "h")) { + if (strchr(mode, 'h')) { int i; bam_header_t *alt; // parse the header text @@ -89,10 +93,10 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) sam_header_parse(alt); alt->l_text = 0; alt->text = 0; // check if there are @SQ lines in the header - fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); + fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} - if (alt->n_targets != fp->header->n_targets) - fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); + if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1) + fprintf(stderr, "[samopen] inconsistent number of target sequences. 
Output the text header.\n"); } else { // then dump ->target_{name,len} for (i = 0; i < fp->header->n_targets; ++i) fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); @@ -164,7 +168,7 @@ char *samfaipath(const char *fn_ref) if (access(fn_ref, R_OK) == -1) { fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); } else { - fprintf(stderr, "[samfaipath] build FASTA index...\n"); + if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n"); if (fai_build(fn_ref) == -1) { fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); free(fn_list); fn_list = 0; diff --git a/sam/sam_header.c b/sam/sam_header.c index 05d75de..f4c8a3b 100644 --- a/sam/sam_header.c +++ b/sam/sam_header.c @@ -38,7 +38,7 @@ const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; const char *r_sq_tags[] = {"SN","LN",NULL}; const char *u_sq_tags[] = {"SN",NULL}; -const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL}; +const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; const char *r_rg_tags[] = {"ID",NULL}; const char *u_rg_tags[] = {"ID",NULL}; @@ -563,6 +563,7 @@ void *sam_header_parse2(const char *headerText) const char *text; char *buf=NULL; size_t nbuf = 0; + int tovalidate = 0; if ( !headerText ) return 0; @@ -571,7 +572,7 @@ void *sam_header_parse2(const char *headerText) while ( (text=nextline(&buf, &nbuf, text)) ) { hline = sam_header_line_parse(buf); - if ( hline && sam_header_line_validate(hline) ) + if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) // With too many (~250,000) reference sequences the header parsing was too slow with list_append. 
hlines = list_append_to_end(hlines, hline); else diff --git a/sam/sam_view.c b/sam/sam_view.c index eb69449..efda4e8 100644 --- a/sam/sam_view.c +++ b/sam/sam_view.c @@ -6,6 +6,7 @@ #include "sam_header.h" #include "sam.h" #include "faidx.h" +#include "kstring.h" #include "khash.h" KHASH_SET_INIT_STR(rg) @@ -18,32 +19,28 @@ typedef struct { typedef khash_t(rg) *rghash_t; -rghash_t g_rghash = 0; +// FIXME: we'd better use no global variables... +static rghash_t g_rghash = 0; static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; +static float g_subsam = -1; static char *g_library, *g_rg; -static int g_sol2sanger_tbl[128]; +static void *g_bed; -static void sol2sanger(bam1_t *b) -{ - int l; - uint8_t *qual = bam1_qual(b); - if (g_sol2sanger_tbl[30] == 0) { - for (l = 0; l != 128; ++l) { - g_sol2sanger_tbl[l] = (int)(10.0 * log(1.0 + pow(10.0, (l - 64 + 33) / 10.0)) / log(10.0) + .499); - if (g_sol2sanger_tbl[l] >= 93) g_sol2sanger_tbl[l] = 93; - } - } - for (l = 0; l < b->core.l_qseq; ++l) { - int q = qual[l]; - if (q > 127) q = 127; - qual[l] = g_sol2sanger_tbl[q]; - } -} +void *bed_read(const char *fn); +void bed_destroy(void *_h); +int bed_overlap(const void *_h, const char *chr, int beg, int end); static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) { if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; + if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) + return 1; + if (g_subsam > 0.) 
{ + int x = (int)(g_subsam + .499); + uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + x; + if (k%1024 / 1024.0 >= g_subsam - x) return 1; + } if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { @@ -61,6 +58,37 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) return 0; } +static char *drop_rg(char *hdtxt, rghash_t h, int *len) +{ + char *p = hdtxt, *q, *r, *s; + kstring_t str; + memset(&str, 0, sizeof(kstring_t)); + while (1) { + int toprint = 0; + q = strchr(p, '\n'); + if (q == 0) q = p + strlen(p); + if (q - p < 3) break; // the line is too short; then stop + if (strncmp(p, "@RG\t", 4) == 0) { + int c; + khint_t k; + if ((r = strstr(p, "\tID:")) != 0) { + r += 4; + for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); + c = *s; *s = '\0'; + k = kh_get(rg, h, r); + *s = c; + if (k != kh_end(h)) toprint = 1; + } + } else toprint = 1; + if (toprint) { + kputsn(p, q - p, &str); kputc('\n', &str); + } + p = q + 1; + } + *len = str.l; + return str.s; +} + // callback function for bam_fetch() that prints nonskipped records static int view_func(const bam1_t *b, void *data) { @@ -82,7 +110,7 @@ static int usage(int is_long_help); int main_samview(int argc, char *argv[]) { - int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0; + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0; int of_type = BAM_OFDEC, is_long_help = 0; int count = 0; samfile_t *in = 0, *out = 0; @@ -90,10 +118,10 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { + while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:")) >= 0) { switch (c) { + case 's': g_subsam = atof(optarg); break; case 'c': is_count = 1; break; - case 'C': slx2sngr = 1; break; case 'S': is_bamin 
= 0; break; case 'b': is_bamout = 1; break; case 't': fn_list = strdup(optarg); is_bamin = 0; break; @@ -103,8 +131,10 @@ int main_samview(int argc, char *argv[]) case 'f': g_flag_on = strtol(optarg, 0, 0); break; case 'F': g_flag_off = strtol(optarg, 0, 0); break; case 'q': g_min_mapQ = atoi(optarg); break; - case 'u': is_uncompressed = 1; break; + case 'u': compress_level = 0; break; + case '1': compress_level = 1; break; case 'l': g_library = strdup(optarg); break; + case 'L': g_bed = bed_read(optarg); break; case 'r': g_rg = strdup(optarg); break; case 'R': fn_rg = strdup(optarg); break; case 'x': of_type = BAM_OFHEX; break; @@ -114,7 +144,7 @@ int main_samview(int argc, char *argv[]) default: return usage(is_long_help); } } - if (is_uncompressed) is_bamout = 1; + if (compress_level >= 0) is_bamout = 1; if (is_header_only) is_header = 1; if (is_bamout) strcat(out_mode, "b"); else { @@ -123,7 +153,11 @@ int main_samview(int argc, char *argv[]) } if (is_bamin) strcat(in_mode, "b"); if (is_header) strcat(out_mode, "h"); - if (is_uncompressed) strcat(out_mode, "u"); + if (compress_level >= 0) { + char tmp[2]; + tmp[0] = compress_level + '0'; tmp[1] = '\0'; + strcat(out_mode, tmp); + } if (argc == optind) return usage(is_long_help); // potential memory leak... // read the list of read groups @@ -151,6 +185,14 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... + char *tmp; + int l; + tmp = drop_rg(in->header->text, g_rghash, &l); + free(in->header->text); + in->header->text = tmp; + in->header->l_text = l; + } if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? 
fn_out : "standard output"); ret = 1; @@ -163,7 +205,6 @@ int main_samview(int argc, char *argv[]) int r; while ((r = samread(in, b)) >= 0) { // read one alignment from `in' if (!__g_skip_aln(in->header, b)) { - if (slx2sngr) sol2sanger(b); if (!is_count) samwrite(out, b); // write the alignment to `out' count++; } @@ -210,6 +251,7 @@ view_end: } // close files, free and return free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); + if (g_bed) bed_destroy(g_bed); if (g_rghash) { khint_t k; for (k = 0; k < kh_end(g_rghash); ++k) @@ -231,9 +273,11 @@ static int usage(int is_long_help) fprintf(stderr, " -H print header only (no alignments)\n"); fprintf(stderr, " -S input is SAM\n"); fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); + fprintf(stderr, " -1 fast compression (force -b)\n"); fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); + fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); @@ -243,6 +287,7 @@ static int usage(int is_long_help) fprintf(stderr, " -q INT minimum mapping quality [0]\n"); fprintf(stderr, " -l STR only output reads in library STR [null]\n"); fprintf(stderr, " -r STR only output reads in read group STR [null]\n"); + fprintf(stderr, " -s FLOAT fraction of templates to subsample; integer part as seed [-1]\n"); fprintf(stderr, " -? 
longer help\n"); fprintf(stderr, "\n"); if (is_long_help) @@ -293,3 +338,69 @@ int main_import(int argc, char *argv[]) free(argv2); return ret; } + +int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; + +int main_bam2fq(int argc, char *argv[]) +{ + bamFile fp; + bam_header_t *h; + bam1_t *b; + int8_t *buf; + int max_buf; + if (argc == 1) { + fprintf(stderr, "Usage: samtools bam2fq \n"); + return 1; + } + fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); + if (fp == 0) return 1; + h = bam_header_read(fp); + b = bam_init1(); + buf = 0; + max_buf = 0; + while (bam_read1(fp, b) >= 0) { + int i, qlen = b->core.l_qseq; + uint8_t *seq; + putchar('@'); fputs(bam1_qname(b), stdout); + if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1"); + else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2"); + else putchar('\n'); + if (max_buf < qlen + 1) { + max_buf = qlen + 1; + kroundup32(max_buf); + buf = realloc(buf, max_buf); + } + buf[qlen] = 0; + seq = bam1_seq(b); + for (i = 0; i < qlen; ++i) + buf[i] = bam1_seqi(seq, i); + if (b->core.flag & 16) { // reverse complement + for (i = 0; i < qlen>>1; ++i) { + int8_t t = seq_comp_table[buf[qlen - 1 - i]]; + buf[qlen - 1 - i] = seq_comp_table[buf[i]]; + buf[i] = t; + } + if (qlen&1) buf[i] = seq_comp_table[buf[i]]; + } + for (i = 0; i < qlen; ++i) + buf[i] = bam_nt16_rev_table[buf[i]]; + puts((char*)buf); + puts("+"); + seq = bam1_qual(b); + for (i = 0; i < qlen; ++i) + buf[i] = 33 + seq[i]; + if (b->core.flag & 16) { // reverse + for (i = 0; i < qlen>>1; ++i) { + int8_t t = buf[qlen - 1 - i]; + buf[qlen - 1 - i] = buf[i]; + buf[i] = t; + } + } + puts((char*)buf); + } + free(buf); + bam_destroy1(b); + bam_header_destroy(h); + bam_close(fp); + return 0; +} diff --git a/sam/sample.c b/sam/sample.c index b3d2642..830b9d1 100644 --- a/sam/sample.c +++ b/sam/sample.c @@ -52,10 +52,15 @@ static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, 
const char *key, cons int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) { const char *p = txt, *q, *r; - kstring_t buf; + kstring_t buf, first_sm; int n = 0; khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id; + if (txt == 0) { + add_pair(sm, sm2id, fn, fn); + return 0; + } memset(&buf, 0, sizeof(kstring_t)); + memset(&first_sm, 0, sizeof(kstring_t)); while ((q = strstr(p, "@RG")) != 0) { p = q + 3; r = q = 0; @@ -69,12 +74,22 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) oq = *u; or = *v; *u = *v = '\0'; buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf); add_pair(sm, sm2id, buf.s, r); + if ( !first_sm.s ) + kputs(r,&first_sm); *u = oq; *v = or; } else break; p = q > r? q : r; ++n; } if (n == 0) add_pair(sm, sm2id, fn, fn); + // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but + // use the tag instead. + else if ( n==1 && first_sm.s ) + add_pair(sm,sm2id,fn,first_sm.s); + if ( first_sm.s ) + free(first_sm.s); + +// add_pair(sm, sm2id, fn, fn); free(buf.s); return 0; } diff --git a/sam/samtools.1 b/sam/samtools.1 index 57f1aff..98ce9d0 100644 --- a/sam/samtools.1 +++ b/sam/samtools.1 @@ -1,7 +1,9 @@ -.TH samtools 1 "2 December 2010" "samtools-0.1.12" "Bioinformatics tools" +.TH samtools 1 "05 July 2011" "samtools-0.1.17" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format + +bcftools - Utilities for the Binary Call Format (BCF) and VCF .SH SYNOPSIS .PP samtools view -bt ref_list.txt -o aln.bam aln.sam.gz @@ -23,6 +25,12 @@ samtools pileup -vcf ref.fasta aln.sorted.bam samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam .PP samtools tview aln.sorted.bam ref.fasta +.PP +bcftools index in.bcf +.PP +bcftools view in.bcf chr2:100-200 > out.vcf +.PP +bcftools view -vc in.bcf > out.vcf 2> out.afs .SH DESCRIPTION .PP @@ -43,7 +51,7 @@ Samtools checks the current working directory for the index file and 
will download the index upon absence. Samtools does not retrieve the entire alignment file unless it is asked to do so. -.SH COMMANDS AND OPTIONS +.SH SAMTOOLS COMMANDS AND OPTIONS .TP 10 .B view @@ -137,21 +145,68 @@ viewing the same reference sequence. .TP .B mpileup -samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]] +.B samtools mpileup +.RB [ \-EBug ] +.RB [ \-C +.IR capQcoef ] +.RB [ \-r +.IR reg ] +.RB [ \-f +.IR in.fa ] +.RB [ \-l +.IR list ] +.RB [ \-M +.IR capMapQ ] +.RB [ \-Q +.IR minBaseQ ] +.RB [ \-q +.IR minMapQ ] +.I in.bam +.RI [ in2.bam +.RI [ ... ]] Generate BCF or pileup for one or multiple BAM files. Alignment records are grouped by sample identifiers in @RG header lines. If sample identifiers are absent, each input file is regarded as one sample. -.B OPTIONS: +In the pileup format (without +.BR -u or -g ), +each +line represents a genomic position, consisting of chromosome name, +coordinate, reference base, read bases, read qualities and alignment +mapping qualities. Information on match, mismatch, indel, strand, +mapping quality and start and end of a read are all encoded at the read +base column. At this column, a dot stands for a match to the reference +base on the forward strand, a comma for a match on the reverse strand, +a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward +strand and `acgtn' for a mismatch on the reverse strand. A pattern +`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this +reference position and the next reference position. The length of the +insertion is given by the integer in the pattern, followed by the +inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' +represents a deletion from the reference. The deleted bases will be +presented as `*' in the following lines. Also at the read base column, a +symbol `^' marks the start of a read. 
The ASCII of the character +following `^' minus 33 gives the mapping quality. A symbol `$' marks the +end of a read segment. + +.B Input Options: .RS -.TP 8 +.TP 10 +.B -6 +Assume the quality is in the Illumina 1.3+ encoding. +.B -A +Do not skip anomalous read pairs in variant calling. +.TP .B -B Disable probabilistic realignment for the computation of base alignment quality (BAQ). BAQ is the Phred-scaled probability of a read base being misaligned. Applying this option greatly helps to reduce false SNPs caused by misalignments. .TP +.BI -b \ FILE +List of input BAM files, one file per line [null] +.TP .BI -C \ INT Coefficient for downgrading mapping quality for reads containing excessive mismatches. Given a read with a phred-scaled probability q of @@ -159,17 +214,62 @@ being generated from the mapped position, the new mapping quality is about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if enabled, the recommended value for BWA is 50. [0] .TP -.BI -e \ INT -Phred-scaled gap extension sequencing error probability. Reducing +.BI -d \ INT +At a position, read maximally .I INT -leads to longer indels. [20] +reads per input BAM. [250] +.TP +.B -E +Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt +specificity a little bit. .TP .BI -f \ FILE -The reference file [null] +The +.BR faidx -indexed +reference file in the FASTA format. The file can be optionally compressed by +.BR razip . +[null] +.TP +.BI -l \ FILE +BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null] +.TP +.BI -q \ INT +Minimum mapping quality for an alignment to be used [0] +.TP +.BI -Q \ INT +Minimum base quality for a base to be considered [13] +.TP +.BI -r \ STR +Only generate pileup in region +.I STR +[all sites] +.TP +.B Output Options: + +.TP +.B -D +Output per-sample read depth .TP .B -g Compute genotype likelihoods and output them in the binary call format (BCF). 
.TP +.B -S +Output per-sample Phred-scaled strand bias P-value +.TP +.B -u +Similar to +.B -g +except that the output is uncompressed BCF, which is preferred for piping. + +.TP +.B Options for Genotype Likelihood Computation (for -g or -u): + +.TP +.BI -e \ INT +Phred-scaled gap extension sequencing error probability. Reducing +.I INT +leads to longer indels. [20] +.TP .BI -h \ INT Coefficient for modeling homopolymer errors. Given an .IR l -long @@ -180,8 +280,13 @@ is modeled as .IR INT * s / l . [100] .TP -.BI -l \ FILE -File containing a list of sites where pileup or BCF is outputted [null] +.B -I +Do not perform INDEL calling +.TP +.BI -L \ INT +Skip INDEL calling if the average per-sample depth is above +.IR INT . +[250] .TP .BI -o \ INT Phred-scaled gap open sequencing error probability. Reducing @@ -194,22 +299,6 @@ Comma dilimited list of platforms (determined by from which indel candidates are obtained. It is recommended to collect indel candidates from sequencing technologies that have low indel error rate such as ILLUMINA. [all] -.TP -.BI -q \ INT -Minimum mapping quality for an alignment to be used [0] -.TP -.BI -Q \ INT -Minimum base quality for a base to be considered [13] -.TP -.BI -r \ STR -Only generate pileup in region -.I STR -[all sites] -.TP -.B -u -Similar to -.B -g -except that the output is uncompressed BCF, which is preferred for piping. .RE .TP @@ -223,6 +312,16 @@ with the header in This command is much faster than replacing the header with a BAM->SAM->BAM conversion. +.TP +.B cat +samtools cat [-h header.sam] [-o out.bam] [ ... ] + +Concatenate BAMs. The sequence dictionary of each input BAM must be identical, +although this command does not check this. This command uses a similar trick +to +.B reheader +which enables fast BAM concatenation. + .TP .B sort samtools sort [-no] [-m maxMem] @@ -249,7 +348,7 @@ Approximately the maximum required memory. [500000000] .TP .B merge -samtools merge [-nur] [-h inh.sam] [-R reg] [...] 
+samtools merge [-nur1f] [-h inh.sam] [-R reg] [...] Merge multiple sorted alignments. The header reference lists of all the input BAM files, and the @SQ headers of @@ -266,6 +365,12 @@ and the headers of other files will be ignored. .B OPTIONS: .RS .TP 8 +.B -1 +Use zlib compression level 1 to compress the output +.TP +.B -f +Force to overwrite the output file if present. +.TP 8 .BI -h \ FILE Use the lines of .I FILE @@ -277,17 +382,18 @@ replacing any header lines that would otherwise be copied from is actually in SAM format, though any alignment records it may contain are ignored.) .TP +.B -n +The input alignments are sorted by read names rather than by chromosomal +coordinates +.TP .BI -R \ STR Merge files in the specified region indicated by .I STR +[null] .TP .B -r Attach an RG tag to each alignment. The tag value is inferred from file names. .TP -.B -n -The input alignments are sorted by read names rather than by chromosomal -coordinates -.TP .B -u Uncompressed BAM output .RE @@ -355,7 +461,7 @@ Treat paired-end reads and single-end reads. .TP .B calmd -samtools calmd [-eubSr] [-C capQcoef] +samtools calmd [-EeubSr] [-C capQcoef] Generate the MD tag. If the MD tag is already present, this command will give a warning if the MD tag generated is different from the existing @@ -388,142 +494,228 @@ Coefficient to cap mapping quality of poorly mapped reads. See the command for details. [0] .TP .B -r -Compute the BQ tag without changing the base quality. +Compute the BQ tag (without -A) or cap base quality by BAQ (with -A). +.TP +.B -E +Extended BAQ calculation. This option trades specificity for sensitivity, though the +effect is minor. 
.RE .TP -.B pileup -samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] [-l -in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N nHap] [-r -pairDiffRate] [-m mask] [-d maxIndelDepth] [-G indelPrior] -| +.B targetcut +samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] [-1 em1] [-2 em2] [-f ref] -Print the alignment in the pileup format. In the pileup format, each -line represents a genomic position, consisting of chromosome name, -coordinate, reference base, read bases, read qualities and alignment -mapping qualities. Information on match, mismatch, indel, strand, -mapping quality and start and end of a read are all encoded at the read -base column. At this column, a dot stands for a match to the reference -base on the forward strand, a comma for a match on the reverse strand, -a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward -strand and `acgtn' for a mismatch on the reverse strand. A pattern -`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this -reference position and the next reference position. The length of the -insertion is given by the integer in the pattern, followed by the -inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' -represents a deletion from the reference. The deleted bases will be -presented as `*' in the following lines. Also at the read base column, a -symbol `^' marks the start of a read. The ASCII of the character -following `^' minus 33 gives the mapping quality. A symbol `$' marks the -end of a read segment. +This command identifies target regions by examining the continuity of read depth, computes +haploid consensus sequences of targets and outputs a SAM with each sequence corresponding +to a target. When option +.B -f +is in use, BAQ will be applied. This command is +.B only +designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et al. (2010)]. 
+.RE -If option -.B -c -is applied, the consensus base, Phred-scaled consensus quality, SNP -quality (i.e. the Phred-scaled probability of the consensus being -identical to the reference) and root mean square (RMS) mapping quality -of the reads covering the site will be inserted between the `reference -base' and the `read bases' columns. An indel occupies an additional -line. Each indel line consists of chromosome name, coordinate, a star, -the genotype, consensus quality, SNP quality, RMS mapping quality, # -covering reads, the first alllele, the second allele, # reads supporting -the first allele, # reads supporting the second allele and # reads -containing indels different from the top two alleles. - -.B NOTE: -Since 0.1.10, the `pileup' command is deprecated by `mpileup'. +.TP +.B phase +samtools phase [-AF] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] +Call and phase heterozygous SNPs. .B OPTIONS: .RS -.TP 10 -.B -B -Disable the BAQ computation. See the -.B mpileup -command for details. +.TP 8 +.B -A +Drop reads with ambiguous phase. +.TP 8 +.BI -b \ STR +Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file +.BR STR .0.bam +and phase-1 reads in +.BR STR .1.bam. +Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads +with switch errors will be saved in +.BR STR .chimeric.bam. +[null] .TP -.B -c -Call the consensus sequence. Options -.BR -T ", " -N ", " -I " and " -r -are only effective when -.BR -c " or " -g -is in use. +.B -F +Do not attempt to fix chimeric reads. .TP -.BI -C \ INT -Coefficient for downgrading the mapping quality of poorly mapped -reads. See the -.B mpileup -command for details. [0] +.BI -k \ INT +Maximum length for local phasing. [13] .TP -.BI -d \ INT -Use the first -.I NUM -reads in the pileup for indel calling for speed up. Zero for unlimited. [1024] +.BI -q \ INT +Minimum Phred-scaled LOD to call a heterozygote. 
[40] .TP -.BI -f \ FILE -The reference sequence in the FASTA format. Index file -.I FILE.fai -will be created if -absent. +.BI -Q \ INT +Minimum base quality to be used in het calling. [13] +.RE + +.SH BCFTOOLS COMMANDS AND OPTIONS + +.TP 10 +.B view +.B bcftools view +.RB [ \-AbFGNQSucgv ] +.RB [ \-D +.IR seqDict ] +.RB [ \-l +.IR listLoci ] +.RB [ \-s +.IR listSample ] +.RB [ \-i +.IR gapSNPratio ] +.RB [ \-t +.IR mutRate ] +.RB [ \-p +.IR varThres ] +.RB [ \-P +.IR prior ] +.RB [ \-1 +.IR nGroup1 ] +.RB [ \-d +.IR minFrac ] +.RB [ \-U +.IR nPerm ] +.RB [ \-X +.IR permThres ] +.RB [ \-T +.IR trioType ] +.I in.bcf +.RI [ region ] + +Convert between BCF and VCF, call variant candidates and estimate allele +frequencies. + +.RS .TP -.B -g -Generate genotype likelihood in the binary GLFv3 format. This option -suppresses -c, -i and -s. This option is deprecated by the -.B mpileup -command. +.B Input/Output Options: +.TP 10 +.B -A +Retain all possible alternate alleles at variant sites. By default, the view +command discards unlikely alleles. +.TP 10 +.B -b +Output in the BCF format. The default is VCF. .TP -.B -i -Only output pileup lines containing indels. +.BI -D \ FILE +Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null] .TP -.BI -I \ INT -Phred probability of an indel in sequencing/prep. [40] +.B -F +Indicate PL is generated by r921 or before (ordering is different). +.TP +.B -G +Suppress all individual genotype information. .TP .BI -l \ FILE -List of sites at which pileup is output. This file is space -delimited. The first two columns are required to be chromosome and -1-based coordinate. Additional columns are ignored. 
It is -recommended to use option +List of sites at which information is output [all sites] .TP -.BI -m \ INT -Filter reads with flag containing bits in -.I INT -[1796] +.B -N +Skip sites where the REF field is not A/C/G/T .TP -.BI -M \ INT -Cap mapping quality at INT [60] +.B -Q +Output the QCALL likelihood format .TP -.BI -N \ INT -Number of haplotypes in the sample (>=2) [2] +.BI -s \ FILE +List of samples to use. The first column in the input gives the sample names +and the second gives the ploidy, which can only be 1 or 2. When the 2nd column +is absent, the sample ploidy is assumed to be 2. In the output, the ordering of +samples will be identical to the one in +.IR FILE . +[null] .TP -.BI -r \ FLOAT -Expected fraction of differences between a pair of haplotypes [0.001] +.B -S +The input is VCF instead of BCF. .TP -.B -s -Print the mapping quality as the last column. This option makes the -output easier to parse, although this format is not space efficient. +.B -u +Uncompressed BCF output (force -b). .TP -.B -S -The input file is in SAM. +.B Consensus/Variant Calling Options: +.TP 10 +.B -c +Call variants using Bayesian inference. This option automatically invokes option +.BR -e . .TP -.BI -t \ FILE -List of reference names ane sequence lengths, in the format described -for the -.B import -command. If this option is present, samtools assumes the input -.I -is in SAM format; otherwise it assumes in BAM format. +.BI -d \ FLOAT +When +.B -v +is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0] +.TP +.B -e +Perform max-likelihood inference only, including estimating the site allele frequency, +testing Hardy-Weinberg equilibrium and testing associations with LRT.
+.TP +.B -g +Call per-sample genotypes at variant sites (force -c) +.TP +.BI -i \ FLOAT +Ratio of INDEL-to-SNP mutation rate [0.15] +.TP +.BI -p \ FLOAT +A site is considered to be a variant if P(ref|D)| [region1 [...]] - - Extract/print all or sub alignments in SAM or BAM format. If - no region is specified, all the alignments will be printed; - otherwise only alignments overlapping the specified regions - will be output. An alignment may be given multiple times if - it is overlapping several regions. A region can be presented, - for example, in the following format: `chr2' (the whole - chr2), `chr2:1000000' (region starting from 1,000,000bp) or - `chr2:1,000,000-2,000,000' (region between 1,000,000 and - 2,000,000bp including the end points). The coordinate is - 1-based. - - OPTIONS: - - -b Output in the BAM format. - - -f INT Only output alignments with all bits in INT present - in the FLAG field. INT can be in hex in the format of - /^0x[0-9A-F]+/ [0] - - -F INT Skip alignments with bits present in INT [0] - - -h Include the header in the output. - - -H Output the header only. - - -l STR Only output reads in library STR [null] - - -o FILE Output file [stdout] - - -q INT Skip alignments with MAPQ smaller than INT [0] - - -r STR Only output reads in read group STR [null] - - -R FILE Output reads in read groups listed in FILE [null] - - -S Input is in SAM. If @SQ header lines are absent, the - `-t' option is required. - - -c Instead of printing the alignments, only count them - and print the total number. All filter options, such - as `-f', `-F' and `-q' , are taken into account. - - -t FILE This file is TAB-delimited. Each line must contain - the reference name and the length of the reference, - one line for each distinct reference; additional - fields are ignored. This file also defines the order - of the reference sequences in sorting. If you run - `samtools faidx ', the resultant index file - .fai can be used as this file. - - -u Output uncompressed BAM. 
This option saves time spent - on compression/decomprssion and is thus preferred - when the output is piped to another samtools command. - - - tview samtools tview [ref.fasta] - - Text alignment viewer (based on the ncurses library). In the - viewer, press `?' for help and press `g' to check the align- - ment start from a region in the format like - `chr10:10,000,000' or `=10,000,000' when viewing the same - reference sequence. - - - mpileup samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l - list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam - [...]] - - Generate BCF or pileup for one or multiple BAM files. Align- - ment records are grouped by sample identifiers in @RG header - lines. If sample identifiers are absent, each input file is - regarded as one sample. - - OPTIONS: - - -B Disable probabilistic realignment for the computation - of base alignment quality (BAQ). BAQ is the Phred- - scaled probability of a read base being misaligned. - Applying this option greatly helps to reduce false - SNPs caused by misalignments. - - -C INT Coefficient for downgrading mapping quality for reads - containing excessive mismatches. Given a read with a - phred-scaled probability q of being generated from - the mapped position, the new mapping quality is about - sqrt((INT-q)/INT)*INT. A zero value disables this - functionality; if enabled, the recommended value for - BWA is 50. [0] - - -e INT Phred-scaled gap extension sequencing error probabil- - ity. Reducing INT leads to longer indels. [20] - - -f FILE The reference file [null] - - -g Compute genotype likelihoods and output them in the - binary call format (BCF). - - -h INT Coefficient for modeling homopolymer errors. Given an - l-long homopolymer run, the sequencing error of an - indel of size s is modeled as INT*s/l. [100] - - -l FILE File containing a list of sites where pileup or BCF - is outputted [null] - - -o INT Phred-scaled gap open sequencing error probability. 
- Reducing INT leads to more indel calls. [40] - - -P STR Comma dilimited list of platforms (determined by @RG- - PL) from which indel candidates are obtained. It is - recommended to collect indel candidates from sequenc- - ing technologies that have low indel error rate such - as ILLUMINA. [all] - - -q INT Minimum mapping quality for an alignment to be used - [0] - - -Q INT Minimum base quality for a base to be considered [13] - - -r STR Only generate pileup in region STR [all sites] - - -u Similar to -g except that the output is uncompressed - BCF, which is preferred for piping. - - - reheader samtools reheader - - Replace the header in in.bam with the header in - in.header.sam. This command is much faster than replacing - the header with a BAM->SAM->BAM conversion. - - - sort samtools sort [-no] [-m maxMem] - - Sort alignments by leftmost coordinates. File .bam will be created. This command may also create tempo- - rary files .%d.bam when the whole alignment can- - not be fitted into memory (controlled by option -m). - - OPTIONS: - - -o Output the final alignment to the standard output. - - -n Sort by read names rather than by chromosomal coordi- - nates - - -m INT Approximately the maximum required memory. - [500000000] - - - merge samtools merge [-nur] [-h inh.sam] [-R reg] - [...] - - Merge multiple sorted alignments. The header reference lists - of all the input BAM files, and the @SQ headers of inh.sam, - if any, must all refer to the same set of reference - sequences. The header reference list and (unless overridden - by -h) `@' headers of in1.bam will be copied to out.bam, and - the headers of other files will be ignored. - - OPTIONS: - - -h FILE Use the lines of FILE as `@' headers to be copied to - out.bam, replacing any header lines that would other- - wise be copied from in1.bam. (FILE is actually in - SAM format, though any alignment records it may con- - tain are ignored.) 
- - -R STR Merge files in the specified region indicated by STR - - -r Attach an RG tag to each alignment. The tag value is - inferred from file names. - - -n The input alignments are sorted by read names rather - than by chromosomal coordinates - - -u Uncompressed BAM output - - - index samtools index - - Index sorted alignment for fast random access. Index file - .bai will be created. - - - idxstats samtools idxstats - - Retrieve and print stats in the index file. The output is TAB - delimited with each line consisting of reference sequence - name, sequence length, # mapped reads and # unmapped reads. - - - faidx samtools faidx [region1 [...]] - - Index reference sequence in the FASTA format or extract sub- - sequence from indexed reference sequence. If no region is - specified, faidx will index the file and create - .fai on the disk. If regions are speficified, the - subsequences will be retrieved and printed to stdout in the - FASTA format. The input file can be compressed in the RAZF - format. - - - fixmate samtools fixmate - - Fill in mate coordinates, ISIZE and mate related flags from a - name-sorted alignment. - - - rmdup samtools rmdup [-sS] - - Remove potential PCR duplicates: if multiple read pairs have - identical external coordinates, only retain the pair with - highest mapping quality. In the paired-end mode, this com- - mand ONLY works with FR orientation and requires ISIZE is - correctly set. It does not work for unpaired reads (e.g. two - ends mapped to different chromosomes or orphan reads). - - OPTIONS: - - -s Remove duplicate for single-end reads. By default, - the command works for paired-end reads only. - - -S Treat paired-end reads and single-end reads. - - - calmd samtools calmd [-eubSr] [-C capQcoef] - - Generate the MD tag. If the MD tag is already present, this - command will give a warning if the MD tag generated is dif- - ferent from the existing tag. Output SAM by default. 
- - OPTIONS: - - -A When used jointly with -r this option overwrites the - original base quality. - - -e Convert a the read base to = if it is identical to - the aligned reference base. Indel caller does not - support the = bases at the moment. - - -u Output uncompressed BAM - - -b Output compressed BAM - - -S The input is SAM with header lines - - -C INT Coefficient to cap mapping quality of poorly mapped - reads. See the pileup command for details. [0] - - -r Compute the BQ tag without changing the base quality. - - - pileup samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] - [-l in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N - nHap] [-r pairDiffRate] [-m mask] [-d maxIndelDepth] [-G - indelPrior] | - - Print the alignment in the pileup format. In the pileup for- - mat, each line represents a genomic position, consisting of - chromosome name, coordinate, reference base, read bases, read - qualities and alignment mapping qualities. Information on - match, mismatch, indel, strand, mapping quality and start and - end of a read are all encoded at the read base column. At - this column, a dot stands for a match to the reference base - on the forward strand, a comma for a match on the reverse - strand, a '>' or '<' for a reference skip, `ACGTN' for a mis- - match on the forward strand and `acgtn' for a mismatch on the - reverse strand. A pattern `\+[0-9]+[ACGTNacgtn]+' indicates - there is an insertion between this reference position and the - next reference position. The length of the insertion is given - by the integer in the pattern, followed by the inserted - sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' repre- - sents a deletion from the reference. The deleted bases will - be presented as `*' in the following lines. Also at the read - base column, a symbol `^' marks the start of a read. The - ASCII of the character following `^' minus 33 gives the map- - ping quality. A symbol `$' marks the end of a read segment. 
- - If option -c is applied, the consensus base, Phred-scaled - consensus quality, SNP quality (i.e. the Phred-scaled proba- - bility of the consensus being identical to the reference) and - root mean square (RMS) mapping quality of the reads covering - the site will be inserted between the `reference base' and - the `read bases' columns. An indel occupies an additional - line. Each indel line consists of chromosome name, coordi- - nate, a star, the genotype, consensus quality, SNP quality, - RMS mapping quality, # covering reads, the first alllele, the - second allele, # reads supporting the first allele, # reads - supporting the second allele and # reads containing indels - different from the top two alleles. - - NOTE: Since 0.1.10, the `pileup' command is deprecated by - `mpileup'. - - OPTIONS: - - -B Disable the BAQ computation. See the mpileup com- - mand for details. - - -c Call the consensus sequence. Options -T, -N, -I and - -r are only effective when -c or -g is in use. - - -C INT Coefficient for downgrading the mapping quality of - poorly mapped reads. See the mpileup command for - details. [0] - - -d INT Use the first NUM reads in the pileup for indel - calling for speed up. Zero for unlimited. [1024] - - -f FILE The reference sequence in the FASTA format. Index - file FILE.fai will be created if absent. - - -g Generate genotype likelihood in the binary GLFv3 - format. This option suppresses -c, -i and -s. This - option is deprecated by the mpileup command. - - -i Only output pileup lines containing indels. - - -I INT Phred probability of an indel in sequencing/prep. - [40] - - -l FILE List of sites at which pileup is output. This file - is space delimited. The first two columns are - required to be chromosome and 1-based coordinate. - Additional columns are ignored. 
It is recommended - to use option - - -m INT Filter reads with flag containing bits in INT - [1796] - - -M INT Cap mapping quality at INT [60] - - -N INT Number of haplotypes in the sample (>=2) [2] - - -r FLOAT Expected fraction of differences between a pair of - haplotypes [0.001] - - -s Print the mapping quality as the last column. This - option makes the output easier to parse, although - this format is not space efficient. - - -S The input file is in SAM. - - -t FILE List of reference names ane sequence lengths, in - the format described for the import command. If - this option is present, samtools assumes the input - is in SAM format; otherwise it - assumes in BAM format. -s together with -l as in - the default format we may not know the mapping - quality. - - -T FLOAT The theta parameter (error dependency coefficient) - in the maq consensus calling model [0.85] - - -SAM FORMAT - SAM is TAB-delimited. Apart from the header lines, which are started - with the `@' symbol, each alignment line consists of: - - - +----+-------+----------------------------------------------------------+ - |Col | Field | Description | - +----+-------+----------------------------------------------------------+ - | 1 | QNAME | Query (pair) NAME | - | 2 | FLAG | bitwise FLAG | - | 3 | RNAME | Reference sequence NAME | - | 4 | POS | 1-based leftmost POSition/coordinate of clipped sequence | - | 5 | MAPQ | MAPping Quality (Phred-scaled) | - | 6 | CIAGR | extended CIGAR string | - | 7 | MRNM | Mate Reference sequence NaMe (`=' if same as RNAME) | - | 8 | MPOS | 1-based Mate POSistion | - | 9 | ISIZE | Inferred insert SIZE | - |10 | SEQ | query SEQuence on the same strand as the reference | - |11 | QUAL | query QUALity (ASCII-33 gives the Phred base quality) | - |12 | OPT | variable OPTional fields in the format TAG:VTYPE:VALUE | - +----+-------+----------------------------------------------------------+ - - Each bit in the FLAG field is defined as: - - - 
+-------+-----+--------------------------------------------------+ - | Flag | Chr | Description | - +-------+-----+--------------------------------------------------+ - |0x0001 | p | the read is paired in sequencing | - |0x0002 | P | the read is mapped in a proper pair | - |0x0004 | u | the query sequence itself is unmapped | - |0x0008 | U | the mate is unmapped | - |0x0010 | r | strand of the query (1 for reverse) | - |0x0020 | R | strand of the mate | - |0x0040 | 1 | the read is the first read in a pair | - |0x0080 | 2 | the read is the second read in a pair | - |0x0100 | s | the alignment is not primary | - |0x0200 | f | the read fails platform/vendor quality checks | - |0x0400 | d | the read is either a PCR or an optical duplicate | - +-------+-----+--------------------------------------------------+ - -EXAMPLES - o Import SAM to BAM when @SQ lines are present in the header: - - samtools view -bS aln.sam > aln.bam - - If @SQ lines are absent: - - samtools faidx ref.fa - samtools view -bt ref.fa.fai aln.sam > aln.bam - - where ref.fa.fai is generated automatically by the faidx command. - - - o Attach the RG tag while merging sorted alignments: - - perl -e 'print "@RG\tID:ga\tSM:hs\tLB:ga\tPL:Illu- - mina\n@RG\tID:454\tSM:hs\tLB:454\tPL:454\n"' > rg.txt - samtools merge -rh rg.txt merged.bam ga.bam 454.bam - - The value in a RG tag is determined by the file name the read is com- - ing from. In this example, in the merged.bam, reads from ga.bam will - be attached RG:Z:ga, while reads from 454.bam will be attached - RG:Z:454. - - - o Call SNPs and short indels for one diploid individual: - - samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > - var.raw.bcf - bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > - var.flt.vcf - - The -D option of varFilter controls the maximum read depth, which - should be adjusted to about twice the average read depth. 
One may - consider to add -C50 to mpileup if mapping quality is overestimated - for reads containing excessive mismatches. Applying this option usu- - ally helps BWA-short but may not other mappers. - - - o Call SNPs and short indels for multiple diploid individuals: - - samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view - -bcvg - > var.raw.bcf - bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > - var.flt.vcf - - Individuals are identified from the SM tags in the @RG header lines. - Individuals can be pooled in one alignment file; one individual can - also be separated into multiple files. The -P option specifies that - indel candidates should be collected only from read groups with the - @RG-PL tag set to ILLUMINA. Collecting indel candidates from reads - sequenced by an indel-prone technology may affect the performance of - indel calling. - - - o Derive the allele frequency spectrum (AFS) on a list of sites from - multiple individuals: - - samtools mpileup -Igf ref.fa *.bam > all.bcf - bcftools view -bl sites.list all.bcf > sites.bcf - bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs - bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs - bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs - ...... - - where sites.list contains the list of sites with each line consisting - of the reference sequence name and position. The following bcftools - commands estimate AFS by EM. - - - o Dump BAQ applied alignment for other SNP callers: - - samtools calmd -bAr aln.bam > aln.baq.bam - - It adds and corrects the NM and MD tags at the same time. The calmd - command also comes with the -C option, the same as the one in pileup - and mpileup. Apply if it helps. - - -LIMITATIONS - o Unaligned words used in bam_import.c, bam_endian.h, bam.c and - bam_aux.c. - - o In merging, the input files are required to have the same number of - reference sequences. The requirement can be relaxed. 
In addition, - merging does not reconstruct the header dictionaries automatically. - Endusers have to provide the correct header. Picard is better at - merging. - - o Samtools paired-end rmdup does not work for unpaired reads (e.g. - orphan reads or ends mapped to different chromosomes). If this is a - concern, please use Picard's MarkDuplicate which correctly handles - these cases, although a little slower. - - -AUTHOR - Heng Li from the Sanger Institute wrote the C version of samtools. Bob - Handsaker from the Broad Institute implemented the BGZF library and Jue - Ruan from Beijing Genomics Institute wrote the RAZF library. John Mar- - shall and Petr Danecek contribute to the source code and various people - from the 1000 Genomes Project have contributed to the SAM format speci- - fication. - - -SEE ALSO - Samtools website: - - - -samtools-0.1.12 2 December 2010 samtools(1) diff --git a/sam_rsem_aux.h b/sam_rsem_aux.h new file mode 100644 index 0000000..5b4f9cd --- /dev/null +++ b/sam_rsem_aux.h @@ -0,0 +1,48 @@ +#ifndef SAM_RSEM_AUX_H_ +#define SAM_RSEM_AUX_H_ + +#include +#include +#include + +#include "sam/bam.h" + +// dwt: duplicate without text +bam_header_t *bam_header_dwt(const bam_header_t *ori_h) +{ + bam_header_t *h; + + h = bam_header_init(); + h->n_targets = ori_h->n_targets; + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); + for (int i = 0; i < h->n_targets; i++) { + h->target_len[i] = ori_h->target_len[i]; + h->target_name[i] = strdup(ori_h->target_name[i]); + } + + return h; +} + +void append_header_text(bam_header_t *header, const char* text, int len) +{ + int x = header->l_text + 1; + int y = header->l_text + len + 1; // 1 byte null + if (text == 0) return; + kroundup32(x); + kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. 
+ header->l_text += len; + header->text[header->l_text] = 0; +} + +void expand_data_size(bam1_t *b) { + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } +} + +#endif /* SAM_RSEM_AUX_H_ */ diff --git a/sam_rsem_cvt.h b/sam_rsem_cvt.h new file mode 100644 index 0000000..4347ef8 --- /dev/null +++ b/sam_rsem_cvt.h @@ -0,0 +1,92 @@ +#ifndef SAM_RSEM_CVT_H_ +#define SAM_RSEM_CVT_H_ + +#include + +#include "stdint.h" +#include "sam/bam.h" + +#include "Transcript.h" +#include "Transcripts.h" + +uint8_t getMAPQ(double val) { + double err = 1.0 - val; + if (err <= 1e-10) return 100; + return (uint8_t)(-10 * log10(err) + .5); // round it +} + +//convert transcript coordinate to chromosome coordinate and generate CIGAR string +void tr2chr(const Transcript& transcript, int sp, int ep, int& pos, int& n_cigar, std::vector& data) { + int length = transcript.getLength(); + char strand = transcript.getStrand(); + const std::vector& structure = transcript.getStructure(); + + int s, i; + int oldlen, curlen; + + uint32_t operation; + + n_cigar = 0; + s = structure.size(); + + if (strand == '-') { + int tmp = sp; + sp = length - ep + 1; + ep = length - tmp + 1; + } + + if (ep < 1 || sp > length) { // a read which align to polyA tails totally! + pos = (sp > length ? 
structure[s - 1].end : structure[0].start - 1); // 0 based + + n_cigar = 1; + operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; + data.push_back(operation); + + return; + } + + if (sp < 1) { + n_cigar++; + operation = (1 - sp) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; + data.push_back(operation); + sp = 1; + } + + oldlen = curlen = 0; + + for (i = 0; i < s; i++) { + oldlen = curlen; + curlen += structure[i].end - structure[i].start + 1; + if (curlen >= sp) break; + } + assert(i < s); + pos = structure[i].start + (sp - oldlen - 1) - 1; // 0 based + + while (curlen < ep && i < s) { + n_cigar++; + operation = (curlen - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH; + data.push_back(operation); + ++i; + if (i >= s) continue; + n_cigar++; + operation = (structure[i].start - structure[i - 1].end - 1) << BAM_CIGAR_SHIFT | BAM_CREF_SKIP; + data.push_back(operation); + + oldlen = curlen; + sp = oldlen + 1; + curlen += structure[i].end - structure[i].start + 1; + } + + if (i >= s) { + n_cigar++; + operation = (ep - length) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; + data.push_back(operation); + } + else { + n_cigar++; + operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH; + data.push_back(operation); + } +} + +#endif /* SAM_RSEM_CVT_H_ */ diff --git a/sampling.h b/sampling.h new file mode 100644 index 0000000..4171403 --- /dev/null +++ b/sampling.h @@ -0,0 +1,35 @@ +#ifndef SAMPLING +#define SAMPLING + +#include +#include +#include +#include + +#include "boost/random.hpp" + +boost::mt19937 rng(time(NULL)); +boost::uniform_01 rg(rng); + +// arr should be cumulative! 
+// interval : [,) +// random number should be in [0, arr[len - 1]) +// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1 +int sample(std::vector& arr, int len) { + int l, r, mid; + double prb = rg() * arr[len - 1]; + + l = 0; r = len - 1; + while (l <= r) { + mid = (l + r) / 2; + if (arr[mid] <= prb) l = mid + 1; + else r = mid - 1; + } + + if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); } + assert(l < len); + + return l; +} + +#endif diff --git a/synthesisRef.cpp b/synthesisRef.cpp index 0c6695e..8ce268c 100644 --- a/synthesisRef.cpp +++ b/synthesisRef.cpp @@ -17,7 +17,7 @@ int M; map name2seq; map::iterator iter; -Transcripts transcripts; +Transcripts transcripts(1); // no genome, just transcript set char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN], chromListF[STRLEN]; bool hasMappingFile; diff --git a/tbam2gbam.cpp b/tbam2gbam.cpp new file mode 100644 index 0000000..18688d0 --- /dev/null +++ b/tbam2gbam.cpp @@ -0,0 +1,31 @@ +#include +#include +#include + +#include "utils.h" +#include "Transcripts.h" +#include "BamConverter.h" + +using namespace std; + +char tiF[STRLEN], chr_list[STRLEN]; +Transcripts transcripts; + +int main(int argc, char* argv[]) { + if (argc != 4) { + printf("Usage: rsem-tbam2gbam reference_name unsorted_transcript_bam_input genome_bam_output\n"); + exit(-1); + } + + sprintf(tiF, "%s.ti", argv[1]); + sprintf(chr_list, "%s.chrlist", argv[1]); + transcripts.readFrom(tiF); + + printf("Start converting:\n"); + BamConverter bc(argv[2], argv[3], chr_list, transcripts); + bc.process(); + printf("Genome bam file is generated!\n"); + + return 0; +} + diff --git a/utils.h b/utils.h index 278e95e..0991fb9 100644 --- a/utils.h +++ b/utils.h @@ -156,4 +156,9 @@ void genReadFileNames(const char* readFN, int tagType, int read_type, int& s, ch } } +void exitWithError(const char* errmsg) { + fprintf(stderr, "%s\n", errmsg); + exit(-1); +} + #endif diff --git a/wiggle.cpp b/wiggle.cpp new file mode 
100644 index 0000000..65a3d4a --- /dev/null +++ b/wiggle.cpp @@ -0,0 +1,131 @@ +#include +#include +#include + +#include +#include "sam/bam.h" +#include "sam/sam.h" + +#include "wiggle.h" + +void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) { + uint8_t *p_tag = bam_aux_get(b, "ZW"); + float w = (p_tag != NULL ? bam_aux2f(p_tag) : 1.0); + int pos = b->core.pos; + uint32_t *p = bam1_cigar(b); + + for (int i = 0; i < (int)b->core.n_cigar; i++, ++p) { + int op = *p & BAM_CIGAR_MASK; + int op_len = *p >> BAM_CIGAR_SHIFT; + + switch (op) { + //case BAM_CSOFT_CLIP : pos += op_len; break; + case BAM_CINS : pos += op_len; break; + case BAM_CMATCH : + for (int j = 0; j < op_len; j++, ++pos) { + wiggle.read_depth[pos] += w; + } + break; + case BAM_CREF_SKIP : pos += op_len; break; + default : assert(false); + } + } +} + +void build_wiggles(const std::string& bam_filename, + WiggleProcessor& processor) { + samfile_t *bam_in = samopen(bam_filename.c_str(), "rb", NULL); + if (bam_in == 0) { fprintf(stderr, "Cannot open %s!\n", bam_filename.c_str()); exit(-1); } + + bam_header_t *header = bam_in->header; + bool *used = new bool[header->n_targets]; + memset(used, 0, sizeof(bool) * header->n_targets); + + int cur_tid = -1; //current tid; + int cnt = 0; + bam1_t *b = bam_init1(); + Wiggle wiggle; + while (samread(bam_in, b) >= 0) { + if (b->core.flag & 0x0004) continue; + + if (b->core.tid != cur_tid) { + if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); } + cur_tid = b->core.tid; + wiggle.name = header->target_name[cur_tid]; + wiggle.length = header->target_len[cur_tid]; + wiggle.read_depth.assign(wiggle.length, 0.0); + } + add_bam_record_to_wiggle(b, wiggle); + ++cnt; + if (cnt % 1000000 == 0) fprintf(stderr, "%d FIN\n", cnt); + } + if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); } + + for (int32_t i = 0; i < header->n_targets; i++) + if (!used[i]) { + wiggle.name = header->target_name[i]; + wiggle.length = 
header->target_len[i]; + wiggle.read_depth.clear(); + processor.process(wiggle); + } + + samclose(bam_in); + bam_destroy1(b); + delete[] used; +} + +UCSCWiggleTrackWriter::UCSCWiggleTrackWriter(const std::string& output_filename, + const std::string& track_name) { + fo = fopen(output_filename.c_str(), "w"); + fprintf(fo, "track type=wiggle_0 name=\"%s\" description=\"%s\" visibility=full\n", + track_name.c_str(), + track_name.c_str()); +} + +UCSCWiggleTrackWriter::~UCSCWiggleTrackWriter() { + fclose(fo); +} + +void UCSCWiggleTrackWriter::process(const Wiggle& wiggle) { + int sp, ep; + + if (wiggle.read_depth.empty()) return; + + sp = ep = -1; + for (size_t i = 0; i < wiggle.length; i++) { + if (wiggle.read_depth[i] > 0) { + ep = i; + } + else { + if (sp < ep) { + ++sp; + fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", wiggle.name.c_str(), sp + 1); + for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wiggle.read_depth[j]); + } + sp = i; + } + } + if (sp < ep) { + ++sp; + fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", wiggle.name.c_str(), sp + 1); + for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wiggle.read_depth[j]); + } +} + +ReadDepthWriter::ReadDepthWriter(std::ostream& stream) + : stream_(stream) { +} + +void ReadDepthWriter::process(const Wiggle& wiggle) { + + stream_ << wiggle.name << '\t' + << wiggle.length << '\t'; + + if (wiggle.read_depth.empty()) { stream_ << "NA\n"; return; } + + for (size_t i = 0; i < wiggle.length; ++i) { + if (i > 0) stream_ << ' '; + stream_ << wiggle.read_depth[i]; + } + stream_ << '\n'; +} diff --git a/wiggle.h b/wiggle.h new file mode 100644 index 0000000..1f37592 --- /dev/null +++ b/wiggle.h @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +struct Wiggle { + std::string name; + std::vector read_depth; + size_t length; +}; + +class WiggleProcessor { +public: + virtual ~WiggleProcessor() {} + virtual void process(const Wiggle& wiggle) = 0; +}; + +class UCSCWiggleTrackWriter : public WiggleProcessor { 
+public: + UCSCWiggleTrackWriter(const std::string& output_filename, + const std::string& track_name); + + ~UCSCWiggleTrackWriter(); + + void process(const Wiggle& wiggle); + +private: + FILE *fo; +}; + +class ReadDepthWriter : public WiggleProcessor { +public: + ReadDepthWriter(std::ostream& stream); + + void process(const Wiggle& wiggle); + +private: + std::ostream& stream_; +}; + +void build_wiggles(const std::string& bam_filename, + WiggleProcessor& processor);