From: Bo Li Date: Mon, 28 Nov 2011 23:21:26 +0000 (-0600) Subject: tested version for tbam2gbam X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=946f9a6adb2a82048c8453d44693cd3838d32939 tested version for tbam2gbam --- 946f9a6adb2a82048c8453d44693cd3838d32939 diff --cc BamWriter.h index 558a44a,e33a07f..b0aeee0 --- a/BamWriter.h +++ b/BamWriter.h @@@ -6,13 -6,13 +6,14 @@@ #include #include #include --#include #include ++#include #include "sam/bam.h" #include "sam/sam.h" ++#include "sam_rsem_aux.h" ++#include "sam_rsem_cvt.h" --#include "utils.h" #include "SingleHit.h" #include "PairedEndHit.h" @@@ -22,136 -22,136 +23,23 @@@ class BamWriter { public: -- BamWriter(char, const char*, const char*, const char*, const char*); ++ BamWriter(char, const char*, const char*, const char*, Transcripts&); ~BamWriter(); -- void work(HitWrapper, Transcripts&); -- void work(HitWrapper, Transcripts&); ++ void work(HitWrapper); ++ void work(HitWrapper); private: samfile_t *in, *out; ++ Transcripts& transcripts; -- std::map refmap; -- std::map::iterator iter; -- -- struct SingleEndT { -- bam1_t *b; -- -- SingleEndT(bam1_t *b = NULL) { -- this->b = b; -- } -- -- bool operator< (const SingleEndT& o) const { -- int strand1, strand2; -- uint32_t *p1, *p2; -- -- if (b->core.tid != o.b->core.tid) return b->core.tid < o.b->core.tid; -- if (b->core.pos != o.b->core.pos) return b->core.pos < o.b->core.pos; -- strand1 = b->core.flag & 0x0010; strand2 = o.b->core.flag & 0x0010; -- if (strand1 != strand2) return strand1 < strand2; -- if (b->core.n_cigar != o.b->core.n_cigar) return b->core.n_cigar < o.b->core.n_cigar; -- p1 = bam1_cigar(b); p2 = bam1_cigar(o.b); -- for (int i = 0; i < (int)b->core.n_cigar; i++) { -- if (*p1 != *p2) return *p1 < *p2; -- ++p1; ++p2; -- } -- return false; -- } -- }; -- -- //b is mate 1, b2 is mate 2 -- struct PairedEndT { -- bam1_t *b, *b2; -- -- PairedEndT() { b = NULL; b2 = NULL;} -- -- PairedEndT(bam1_t *b, bam1_t *b2) { -- this->b = b; -- 
this->b2 = b2; -- } -- -- bool operator< (const PairedEndT& o) const { -- int strand1, strand2; -- uint32_t *p1, *p2; -- -- //compare b -- if (b->core.tid != o.b->core.tid) return b->core.tid < o.b->core.tid; -- if (b->core.pos != o.b->core.pos) return b->core.pos < o.b->core.pos; -- strand1 = b->core.flag & 0x0010; strand2 = o.b->core.flag & 0x0010; -- if (strand1 != strand2) return strand1 < strand2; -- if (b->core.n_cigar != o.b->core.n_cigar) return b->core.n_cigar < o.b->core.n_cigar; -- p1 = bam1_cigar(b); p2 = bam1_cigar(o.b); -- for (int i = 0; i < (int)b->core.n_cigar; i++) { -- if (*p1 != *p2) return *p1 < *p2; -- ++p1; ++p2; -- } -- -- //compare b2 -- if (b2->core.tid != o.b2->core.tid) return b2->core.tid < o.b2->core.tid; -- if (b2->core.pos != o.b2->core.pos) return b2->core.pos < o.b2->core.pos; -- strand1 = b2->core.flag & 0x0010; strand2 = o.b2->core.flag & 0x0010; -- if (strand1 != strand2) return strand1 < strand2; -- if (b2->core.n_cigar != o.b2->core.n_cigar) return b2->core.n_cigar < o.b2->core.n_cigar; -- p1 = bam1_cigar(b2); p2 = bam1_cigar(o.b2); -- for (int i = 0; i < (int)b2->core.n_cigar; i++) { -- if (*p1 != *p2) return *p1 < *p2; -- ++p1; ++p2; -- } -- -- return false; -- } -- }; -- -- uint8_t getMAPQ(double val) { -- double err = 1.0 - val; -- if (err <= 1e-10) return 100; -- return (uint8_t)(-10 * log10(err) + .5); // round it -- } -- -- void push_qname(const uint8_t* qname, int l_qname, std::vector& data) { -- for (int i = 0; i < l_qname; i++) data.push_back(*(qname + i)); -- } -- -- void push_seq(const uint8_t* seq, int readlen, char strand, std::vector& data) { -- int seq_len = (readlen + 1) / 2; -- -- switch (strand) { -- case '+': for (int i = 0; i < seq_len; i++) data.push_back(*(seq + i)); break; -- case '-': -- uint8_t code, base; -- code = 0; base = 0; -- for (int i = 0; i < readlen; i++) { -- switch (bam1_seqi(seq, readlen - i - 1)) { -- case 1: base = 8; break; -- case 2: base = 4; break; -- case 4: base = 2; break; -- 
case 8: base = 1; break; -- case 15: base = 15; break; -- default: assert(false); -- } -- code |= base << (4 * (1 - i % 2)); -- if (i % 2 == 1) { data.push_back(code); code = 0; } -- } -- -- if (readlen % 2 == 1) { data.push_back(code); } -- break; -- default: assert(false); -- } -- } -- -- void push_qual(const uint8_t* qual, int readlen, char strand, std::vector& data) { -- switch (strand) { -- case '+': for (int i = 0; i < readlen; i++) data.push_back(*(qual + i)); break; -- case '-': for (int i = readlen - 1; i >= 0; i--) data.push_back(*(qual + i)); break; -- default: assert(false); -- } -- } -- -- //convert transcript coordinate to chromosome coordinate and generate CIGAR string -- void tr2chr(const Transcript&, int, int, int&, int&, std::vector&); ++ //convert bam1_t ++ void convert(bam1_t*, double); }; //fn_list can be NULL --BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const char* outF, const char* chr_list) { ++BamWriter::BamWriter(char inpType, const char* inpF, const char* fn_list, const char* outF, Transcripts& transcripts) ++ : transcripts(transcripts) ++{ switch(inpType) { case 's': in = samopen(inpF, "r", fn_list); break; case 'b': in = samopen(inpF, "rb", fn_list); break; @@@ -160,25 -160,25 +48,33 @@@ assert(in != 0); //generate output's header -- bam_header_t *out_header = NULL; -- refmap.clear(); ++ bam_header_t *out_header = bam_header_dwt(in->header); -- if (chr_list == NULL) { -- out_header = in->header; ++ if (out_header->n_targets != transcripts.getM()) { ++ fprintf(stderr, "Number of reference sequences recorded in the header is not correct! 
The header contains %d sequences while there should be %d sequences\n", out_header->n_targets, transcripts.getM()); ++ exit(-1); } -- else { -- out_header = sam_header_read2(chr_list); -- for (int i = 0; i < out_header->n_targets; i++) { -- refmap[out_header->target_name[i]] = i; ++ for (int i = 0; i < out_header->n_targets; i++) { ++ const Transcript& transcript = transcripts.getTranscriptAt(i + 1); ++ if (out_header->target_name[i] != transcript.getTranscriptID()) { ++ fprintf(stderr, "Reference sequence %d's name recorded in the header is not correct! \n", i); ++ fprintf(stderr, "Name in the header: %s\n", out_header->target_name[i]); ++ fprintf(stderr, "Should be: %s\n", transcript.getTranscriptID().c_str()); ++ exit(-1); } ++ out_header->target_len[i] = transcript.getLength(); // transcript length without poly(A) tail } ++ std::ostringstream strout; ++ strout<<"@HD\tVN:1.4\tSO:unknown\n@PG\tID:RSEM\n"; ++ std::string content = strout.str(); ++ append_header_text(out_header, content.c_str(), content.length()); out = samopen(outF, "wb", out_header); assert(out != 0); -- if (chr_list != NULL) { bam_header_destroy(out_header); } ++ bam_header_destroy(out_header); } BamWriter::~BamWriter() { @@@ -186,137 -186,138 +82,46 @@@ samclose(out); } --void BamWriter::work(HitWrapper wrapper, Transcripts& transcripts) { ++void BamWriter::work(HitWrapper wrapper) { bam1_t *b; -- std::string cqname; // cqname : current query name -- std::map hmap; -- std::map::iterator hmapIter; SingleHit *hit; int cnt = 0; -- cqname = ""; b = bam_init1(); -- hmap.clear(); while (samread(in, b) >= 0) { -- -- if (verbose && cnt > 0 && cnt % 1000000 == 0) { printf("%d entries are finished!\n", cnt); } ++cnt; ++ if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } if (b->core.flag & 0x0004) continue; hit = wrapper.getNextHit(); assert(hit != NULL); -- int sid = b->core.tid + 1; -- assert(sid == hit->getSid()); -- const Transcript& transcript = 
transcripts.getTranscriptAt(sid); -- -- if (transcripts.getType() == 0) { -- int pos = b->core.pos; -- int readlen = b->core.l_qseq; -- uint8_t *qname = b->data, *seq = bam1_seq(b), *qual = bam1_qual(b); -- std::vector data; -- data.clear(); -- -- iter = refmap.find(transcript.getSeqName()); -- assert(iter != refmap.end()); -- b->core.tid = iter->second; -- b->core.qual = 255; -- -- uint16_t rstrand = b->core.flag & 0x0010; // read strand -- b->core.flag -= rstrand; - rstrand = (((!rstrand && transcript.getStrand() == '+') || (rstrand && transcript.getStrand() == '-')) ? 0 : 0x0010); - rstrand = (!rstrand && transcript.getStrand() == '+' || rstrand && transcript.getStrand() == '-' ? 0 : 0x0010); -- b->core.flag += rstrand; -- -- push_qname(qname, b->core.l_qname, data); -- int core_pos, core_n_cigar; -- tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data); -- if (core_pos < 0) b->core.tid = -1; -- b->core.pos = core_pos; -- b->core.n_cigar = core_n_cigar; -- push_seq(seq, readlen, transcript.getStrand(), data); -- push_qual(qual, readlen, transcript.getStrand(), data); -- -- free(b->data); - b->m_data = b->data_len = data.size(); - b->l_aux = 0; - b->m_data = b->data_len = data.size() + 7; // 7 extra bytes for ZW tag - b->l_aux = 7; -- b->data = (uint8_t*)malloc(b->m_data); -- for (int i = 0; i < b->data_len; i++) b->data[i] = data[i]; -- -- b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); - - char strand = transcript.getStrand(); - bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand); - -- } -- else { - b->m_data = b->data_len = b->data_len - b->l_aux; - b->l_aux = 0; - b->m_data = b->data_len = b->data_len - b->l_aux + 7; // 7 extra bytes for ZW tag - b->l_aux = 7; -- b->data = (uint8_t*)realloc(b->data, b->m_data); -- } - - - -- if (cqname != bam1_qname(b)) { -- if (!hmap.empty()) { -- for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { -- bam1_t *tmp_b = hmapIter->first.b; -- tmp_b->core.qual = 
getMAPQ(hmapIter->second); - uint8_t *p = bam1_aux(tmp_b); - *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; -- float val = (float)hmapIter->second; - bam_aux_append(tmp_b, "ZW", 'f', 4, (uint8_t*)&val); - if (tmp_b->core.qual > 0) samwrite(out, tmp_b); // output only when MAPQ > 0 - memcpy(p, &val, 4); - samwrite(out, tmp_b); -- bam_destroy1(tmp_b); // now hmapIter->b makes no sense -- } -- hmap.clear(); -- } -- cqname = bam1_qname(b); -- } -- -- hmapIter = hmap.find(SingleEndT(b)); -- if (hmapIter == hmap.end()) { -- hmap[SingleEndT(bam_dup1(b))] = hit->getConPrb(); -- } -- else { -- hmapIter->second += hit->getConPrb(); -- } ++ assert(b->core.tid + 1 == hit->getSid()); ++ convert(b, hit->getConPrb()); ++ if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0 } assert(wrapper.getNextHit() == NULL); -- if (!hmap.empty()) { -- for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { -- bam1_t *tmp_b = hmapIter->first.b; -- tmp_b->core.qual = getMAPQ(hmapIter->second); - uint8_t *p = bam1_aux(tmp_b); - *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; -- float val = (float)hmapIter->second; - bam_aux_append(tmp_b, "ZW", 'f', 4, (uint8_t*)&val); - if (tmp_b->core.qual > 0) samwrite(out, tmp_b); // If MAPQ is equal to 0, do not output this alignment - memcpy(p, &val, 4); - samwrite(out, tmp_b); -- bam_destroy1(tmp_b); // now hmapIter->b makes no sense -- } -- hmap.clear(); -- } -- bam_destroy1(b); if (verbose) { printf("Bam output file is generated!\n"); } } --void BamWriter::work(HitWrapper wrapper, Transcripts& transcripts) { ++void BamWriter::work(HitWrapper wrapper) { bam1_t *b, *b2; -- std::string cqname; // cqname : current query name -- std::map hmap; -- std::map::iterator hmapIter; PairedEndHit *hit; int cnt = 0; -- cqname = ""; b = bam_init1(); b2 = bam_init1(); -- hmap.clear(); while (samread(in, b) >= 0 && samread(in, b2) >= 0) { -- -- if (verbose && cnt > 0 && cnt % 1000000 == 0) { printf("%d entries are finished!\n", cnt); } -- ++cnt; ++ 
cnt += 2; ++ if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } if (!((b->core.flag & 0x0002) && (b2->core.flag & 0x0002))) continue; @@@ -330,236 -331,236 +135,60 @@@ hit = wrapper.getNextHit(); assert(hit != NULL); -- int sid = b->core.tid + 1; -- assert(sid == hit->getSid()); -- assert(sid == b2->core.tid + 1); -- const Transcript& transcript = transcripts.getTranscriptAt(sid); -- -- if (transcripts.getType() == 0) { -- int pos = b->core.pos, pos2 = b2->core.pos; -- int readlen = b->core.l_qseq, readlen2 = b2->core.l_qseq; -- uint8_t *qname = b->data, *seq = bam1_seq(b), *qual = bam1_qual(b); -- uint8_t *qname2 = b2->data, *seq2 = bam1_seq(b2), *qual2 = bam1_qual(b2); -- std::vector data, data2; -- -- data.clear(); -- data2.clear(); -- -- iter = refmap.find(transcript.getSeqName()); -- assert(iter != refmap.end()); -- b->core.tid = iter->second; b->core.mtid = iter->second; -- b2->core.tid = iter->second; b2->core.mtid = iter->second; -- -- uint16_t rstrand = b->core.flag & 0x0010; -- b->core.flag = b->core.flag - (b->core.flag & 0x0010) - (b->core.flag & 0x0020); -- b2->core.flag = b2->core.flag - (b2->core.flag & 0x0010) - (b2->core.flag & 0x0020); -- -- uint16_t add, add2; - if ((!rstrand && transcript.getStrand() == '+') || (rstrand && transcript.getStrand() == '-')) { - if (!rstrand && transcript.getStrand() == '+' || rstrand && transcript.getStrand() == '-') { -- add = 0x0020; add2 = 0x0010; -- } -- else { -- add = 0x0010; add2 = 0x0020; -- } -- b->core.flag += add; -- b2->core.flag += add2; -- -- b->core.qual = b2->core.qual = 255; -- - //Do I really need this? 
The insert size uses transcript coordinates -- if (transcript.getStrand() == '-') { -- b->core.isize = -b->core.isize; -- b2->core.isize = -b2->core.isize; -- } -- -- push_qname(qname, b->core.l_qname, data); -- push_qname(qname2, b2->core.l_qname, data2); -- int core_pos, core_n_cigar; -- tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data); -- if (core_pos < 0) b->core.tid = -1; -- b->core.pos = core_pos; b->core.n_cigar = core_n_cigar; -- tr2chr(transcript, pos2 + 1, pos2 + readlen2, core_pos, core_n_cigar, data2); -- if (core_pos < 0) b2->core.tid = -1; -- b2->core.pos = core_pos; b2->core.n_cigar = core_n_cigar; -- b->core.mpos = b2->core.pos; -- b2->core.mpos = b->core.pos; -- push_seq(seq, readlen, transcript.getStrand(), data); -- push_seq(seq2, readlen2, transcript.getStrand(), data2); -- push_qual(qual, readlen, transcript.getStrand(), data); -- push_qual(qual2, readlen2, transcript.getStrand(), data2); -- -- free(b->data); - b->m_data = b->data_len = data.size(); - b->l_aux = 0; - b->m_data = b->data_len = data.size() + 7; // 7 extra bytes for ZW tag - b->l_aux = 7; -- b->data = (uint8_t*)malloc(b->m_data); -- for (int i = 0; i < b->data_len; i++) b->data[i] = data[i]; -- -- free(b2->data); - b2->m_data = b2->data_len = data2.size(); - b2->l_aux = 0; - b2->m_data = b2->data_len = data2.size() + 7; // 7 extra bytes for ZW tag - b2->l_aux = 7; -- b2->data = (uint8_t*)malloc(b2->m_data); -- for (int i = 0; i < b2->data_len; i++) b2->data[i] = data2[i]; -- -- b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); -- b2->core.bin = bam_reg2bin(b2->core.pos, bam_calend(&(b2->core), bam1_cigar(b2))); - - char strand = transcript.getStrand(); - bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand); - bam_aux_append(b2, "XS", 'A', 1, (uint8_t*)&strand); -- } -- else { - b->m_data = b->data_len = b->data_len - b->l_aux; - b->l_aux = 0; - b->m_data = b->data_len = b->data_len - b->l_aux + 7; // 7 extra bytes for ZW tag - 
b->l_aux = 7; -- b->data = (uint8_t*)realloc(b->data, b->m_data); -- - b2->m_data = b2->data_len = b2->data_len - b2->l_aux; - b2->l_aux = 0; - b2->m_data = b2->data_len = b2->data_len - b2->l_aux + 7; // 7 extra bytes for ZW tag - b2->l_aux = 7; -- b2->data = (uint8_t*)realloc(b2->data, b2->m_data); -- } - - if (cqname != bam1_qname(b)) { - if (!hmap.empty()) { - for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { - bam1_t *tmp_b = hmapIter->first.b; - bam1_t *tmp_b2 = hmapIter->first.b2; - - tmp_b->core.qual = tmp_b2->core.qual = getMAPQ(hmapIter->second); - - uint8_t *p = bam1_aux(tmp_b), *p2 = bam1_aux(tmp_b2); - *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; - *p2 = 'Z'; ++p2; *p2 = 'W'; ++p2; *p2 = 'f'; ++p2; ++ assert(b->core.tid + 1 == hit->getSid()); ++ assert(b2->core.tid + 1 == hit->getSid()); - if (cqname != bam1_qname(b)) { - if (!hmap.empty()) { - for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { - bam1_t *tmp_b = hmapIter->first.b; - bam1_t *tmp_b2 = hmapIter->first.b2; - - tmp_b->core.qual = tmp_b2->core.qual = getMAPQ(hmapIter->second); - -- float val = (float)hmapIter->second; - bam_aux_append(tmp_b, "ZW", 'f', 4, (uint8_t*)&val); - bam_aux_append(tmp_b2, "ZW", 'f', 4, (uint8_t*)&val); - - // If MAPQ is equal to 0, do not output this alignment pair - if (tmp_b->core.qual > 0) { - samwrite(out, tmp_b); - samwrite(out, tmp_b2); - } - - bam_destroy1(tmp_b); - bam_destroy1(tmp_b2); - } - hmap.clear(); - } - cqname = bam1_qname(b); - } - memcpy(p, &val, 4); - memcpy(p2, &val, 4); ++ convert(b, hit->getConPrb()); ++ convert(b2, hit->getConPrb()); - hmapIter = hmap.find(PairedEndT(b, b2)); - if (hmapIter == hmap.end()) { - hmap[PairedEndT(bam_dup1(b), bam_dup1(b2))] = hit->getConPrb(); - } - else { - hmapIter->second += hit->getConPrb(); - samwrite(out, tmp_b); - samwrite(out, tmp_b2); ++ b->core.mpos = b2->core.pos; ++ b2->core.mpos = b->core.pos; + - bam_destroy1(tmp_b); - bam_destroy1(tmp_b2); - } - hmap.clear(); - } - 
cqname = bam1_qname(b); - } - - hmapIter = hmap.find(PairedEndT(b, b2)); - if (hmapIter == hmap.end()) { - hmap[PairedEndT(bam_dup1(b), bam_dup1(b2))] = hit->getConPrb(); - } - else { - hmapIter->second += hit->getConPrb(); ++ if (b->core.qual > 0) { ++ samwrite(out, b); ++ samwrite(out, b2); } } assert(wrapper.getNextHit() == NULL); -- if (!hmap.empty()) { -- for (hmapIter = hmap.begin(); hmapIter != hmap.end(); hmapIter++) { -- bam1_t *tmp_b = hmapIter->first.b; -- bam1_t *tmp_b2 = hmapIter->first.b2; -- -- tmp_b->core.qual = tmp_b2->core.qual = getMAPQ(hmapIter->second); - - uint8_t *p = bam1_aux(tmp_b), *p2 = bam1_aux(tmp_b2); - *p = 'Z'; ++p; *p = 'W'; ++p; *p = 'f'; ++p; - *p2 = 'Z'; ++p2; *p2 = 'W'; ++p2; *p2 = 'f'; ++p2; -- -- float val = (float)hmapIter->second; - bam_aux_append(tmp_b, "ZW", 'f', 4, (uint8_t*)&val); - bam_aux_append(tmp_b2, "ZW", 'f', 4, (uint8_t*)&val); - memcpy(p, &val, 4); - memcpy(p2, &val, 4); -- - if (tmp_b->core.qual > 0) { - samwrite(out, tmp_b); - samwrite(out, tmp_b2); - } - samwrite(out, tmp_b); - samwrite(out, tmp_b2); -- -- bam_destroy1(tmp_b); -- bam_destroy1(tmp_b2); -- } -- hmap.clear(); -- } -- bam_destroy1(b); bam_destroy1(b2); - if (verbose) { printf("Bam output file is generated!"); } + if (verbose) { printf("Bam output file is generated!\n"); } } --void BamWriter::tr2chr(const Transcript& transcript, int sp, int ep, int& pos, int& n_cigar, std::vector& data) { -- int length = transcript.getLength(); -- char strand = transcript.getStrand(); -- const std::vector& structure = transcript.getStructure(); -- -- int s, i; -- int oldlen, curlen; -- -- uint32_t operation; -- uint8_t *p; -- -- n_cigar = 0; -- s = structure.size(); -- -- if (strand == '-') { -- int tmp = sp; -- sp = length - ep + 1; -- ep = length - tmp + 1; -- } -- -- if (ep < 1 || sp > length) { // a read which align to polyA tails totally! -- pos = (sp > length ? 
structure[s - 1].end : structure[0].start - 1); // 0 based -- -- n_cigar = 1; -- operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; -- p = (uint8_t*)(&operation); -- for (int j = 0; j < 4; j++) data.push_back(*(p + j)); -- -- return; -- } -- -- if (sp < 1) { -- n_cigar++; -- operation = (1 - sp) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; -- p = (uint8_t*)(&operation); -- for (int j = 0; j < 4; j++) data.push_back(*(p + j)); -- sp = 1; -- } -- -- oldlen = curlen = 0; -- -- for (i = 0; i < s; i++) { -- oldlen = curlen; -- curlen += structure[i].end - structure[i].start + 1; -- if (curlen >= sp) break; -- } -- assert(i < s); -- pos = structure[i].start + (sp - oldlen - 1) - 1; // 0 based -- -- while (curlen < ep && i < s) { -- n_cigar++; -- operation = (curlen - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH; -- p = (uint8_t*)(&operation); -- for (int j = 0; j < 4; j++) data.push_back(*(p + j)); -- -- ++i; -- if (i >= s) continue; -- n_cigar++; -- operation = (structure[i].start - structure[i - 1].end - 1) << BAM_CIGAR_SHIFT | BAM_CREF_SKIP; -- p = (uint8_t*)(&operation); -- for (int j = 0; j < 4; j++) data.push_back(*(p + j)); -- -- oldlen = curlen; -- sp = oldlen + 1; -- curlen += structure[i].end - structure[i].start + 1; -- } -- -- if (i >= s) { -- n_cigar++; -- operation = (ep - length) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; -- p = (uint8_t*)(&operation); -- for (int j = 0; j < 4; j++) data.push_back(*(p + j)); -- } -- else { -- n_cigar++; -- operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH; -- p = (uint8_t*)(&operation); -- for (int j = 0; j < 4; j++) data.push_back(*(p + j)); -- } ++void BamWriter::convert(bam1_t *b, double prb) { ++ int sid = b->core.tid + 1; ++ const Transcript& transcript = transcripts.getTranscriptAt(sid); ++ ++ int pos = b->core.pos; ++ int readlen = b->core.l_qseq; ++ ++ std::vector data; ++ data.clear(); ++ ++ int core_pos, core_n_cigar; ++ std::vector vec; ++ vec.assign(1, Interval(1, 
transcript.getLength())); ++ // make an artificial chromosome coordinates for the transcript to get new CIGAR strings ++ tr2chr(Transcript("", "", "", '+', vec, ""), pos + 1, pos + readlen, core_pos, core_n_cigar, data); ++ assert(core_pos >= 0); ++ ++ int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4; ++ b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len; ++ expand_data_size(b); ++ uint8_t* pt = b->data + b->core.l_qname; ++ memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len); ++ for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; } ++ ++ b->core.pos = core_pos; ++ b->core.n_cigar = core_n_cigar; ++ b->core.qual = getMAPQ(prb); ++ b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); ++ ++ float val = (float)prb; ++ bam_aux_append(b, "ZW", 'f', bam_aux_type2size('f'), (uint8_t*)&val); } #endif /* BAMWRITER_H_ */ diff --cc EM.cpp index 107be3c,08fc8c7..2921ad3 --- a/EM.cpp +++ b/EM.cpp @@@ -614,38 -611,15 +614,34 @@@ void EM() writeResults(model, countvs[0]); if (genBamF) { -- sprintf(outBamF, "%s.bam", outName); -- if (transcripts.getType() == 0) { -- sprintf(chr_list, "%s.chrlist", refName); -- pt_chr_list = (char*)(&chr_list); - } ++ sprintf(outBamF, "%s.transcript.bam", outName); + + if (bamSampling) { - int local_N; - int fr, to, len, id; - vector arr; - arr.clear(); - - if (verbose) printf("Begin to sample reads from their posteriors.\n"); - for (int i = 0; i < nThreads; i++) { - local_N = hitvs[i]->getN(); - for (int j = 0; j < local_N; j++) { - fr = hitvs[i]->getSAt(j); - to = hitvs[i]->getSAt(j + 1); - len = to - fr + 1; - arr.resize(len); - arr[0] = ncpvs[i][j]; - for (int k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb(); - id = (arr[len - 1] < EPSILON ? -1 : sample(arr, len)); // if all entries in arr are 0, let id be -1 - for (int k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 
1.0 : 0.0); - } - } ++ int local_N; ++ int fr, to, len, id; ++ vector arr; ++ arr.clear(); ++ ++ if (verbose) printf("Begin to sample reads from their posteriors.\n"); ++ for (int i = 0; i < nThreads; i++) { ++ local_N = hitvs[i]->getN(); ++ for (int j = 0; j < local_N; j++) { ++ fr = hitvs[i]->getSAt(j); ++ to = hitvs[i]->getSAt(j + 1); ++ len = to - fr + 1; ++ arr.resize(len); ++ arr[0] = ncpvs[i][j]; ++ for (int k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb(); ++ id = (arr[len - 1] < EPSILON ? -1 : sample(arr, len)); // if all entries in arr are 0, let id be -1 ++ for (int k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0); ++ } ++ } ++ if (verbose) printf("Sampling is finished.\n"); } - if (verbose) printf("Sampling is finished.\n"); -- BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, pt_chr_list); ++ BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, transcripts); HitWrapper wrapper(nThreads, hitvs); -- writer.work(wrapper, transcripts); ++ writer.work(wrapper); } release(readers, hitvs, ncpvs, mhps); diff --cc bam2wig.cpp index e46fa67,cb02bc2..b03d0b3 --- a/bam2wig.cpp +++ b/bam2wig.cpp @@@ -1,108 -1,15 +1,17 @@@ - #include - #include - #include - #include - #include + #include + - #include - - #include "sam/bam.h" - #include "sam/sam.h" + #include "wiggle.h" using namespace std; - samfile_t *bam_in; - bam1_t *b; - - int cur_tid; //current tid; - float *wig_arr; // wiggle array - FILE *fo; - - void generateWiggle(int tid) { - int chr_len = bam_in->header->target_len[tid]; - char *chr_name = bam_in->header->target_name[tid]; - int sp, ep; - - sp = ep = -1; - for (int i = 0; i < chr_len; i++) { - if (wig_arr[i] > 0) { - ep = i; - } - else { - if (sp < ep) { - ++sp; - fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", chr_name, sp + 1); - for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wig_arr[j]); - } - sp = i; - } - } - if (sp < ep) { - ++sp; - fprintf(fo, 
"fixedStep chrom=%s start=%d step=1\n", chr_name, sp + 1); - for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wig_arr[j]); - } - } - int main(int argc, char* argv[]) { - int cnt = 0; - if (argc != 4) { - printf("Usage : rsem-bam2wig sorted_bam_input wig_output wiggle_name\n"); + printf("Usage: rsem-bam2wig sorted_bam_input wig_output wiggle_name\n"); exit(-1); } - UCSCWiggleTrackWriter track_writer(argv[2], argv[3]); - build_wiggles(argv[1], track_writer); + - bam_in = samopen(argv[1], "rb", NULL); - if (bam_in == 0) { fprintf(stderr, "Cannot open %s!\n", argv[1]); exit(-1); } - //assert(bam_in != 0); - b = bam_init1(); - - fo = fopen(argv[2], "w"); - fprintf(fo, "track type=wiggle_0 name=\"%s\" description=\"%s\" visibility=full\n", argv[3], argv[3]); - - cur_tid = -1; - wig_arr = NULL; - while (samread(bam_in, b) >= 0) { - if (b->core.flag & 0x0004) continue; - - if (b->core.tid != cur_tid) { - if (cur_tid >= 0) generateWiggle(cur_tid); - cur_tid = b->core.tid; - size_t len = sizeof(float) * bam_in->header->target_len[cur_tid]; - wig_arr = (float*)realloc(wig_arr, len); - memset(wig_arr, 0, len); - } - - uint8_t *p_tag = bam_aux_get(b, "ZW"); - float w = (p_tag != NULL ? 
bam_aux2f(p_tag) : 1.0); - int pos = b->core.pos; - uint32_t *p = bam1_cigar(b); - - for (int i = 0; i < (int)b->core.n_cigar; i++, ++p) { - int op = *p & BAM_CIGAR_MASK; - int op_len = *p >> BAM_CIGAR_SHIFT; - - switch (op) { - //case BAM_CSOFT_CLIP : pos += op_len; break; - case BAM_CINS : pos += op_len; break; - case BAM_CMATCH : - for (int j = 0; j < op_len; j++, ++pos) wig_arr[pos] += w; - break; - case BAM_CREF_SKIP : pos += op_len; break; - default : assert(false); - } - } - - ++cnt; - if (cnt % 1000000 == 0) printf("%d FIN\n", cnt); - } - if (cur_tid >= 0) generateWiggle(cur_tid); - free(wig_arr); - - samclose(bam_in); - bam_destroy1(b); - - fclose(fo); ++ UCSCWiggleTrackWriter track_writer(argv[2], argv[3]); ++ build_wiggles(argv[1], track_writer); return 0; } diff --cc makefile index a28caa7,c5c5271..e8b1318 --- a/makefile +++ b/makefile @@@ -2,7 -2,7 +2,7 @@@ CC = g+ #LFLAGS = -Wall -O3 -ffast-math CFLAGS = -Wall -c -I. COFLAGS = -Wall -O3 -ffast-math -c -I. - PROGRAMS = rsem-bam2wig rsem-build-read-index rsem-run-em rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-parse-alignments rsem-preref rsem-simulate-reads rsem-run-gibbs rsem-calculate-credibility-intervals -PROGRAMS = rsem-bam2wig rsem-bam2readdepth rsem-build-read-index rsem-run-em rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-parse-alignments rsem-preref rsem-simulate-reads rsem-run-gibbs rsem-calculate-credibility-intervals ++PROGRAMS = rsem-tbam2gbam rsem-bam2wig rsem-bam2readdepth rsem-build-read-index rsem-run-em rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-parse-alignments rsem-preref rsem-simulate-reads rsem-run-gibbs rsem-calculate-credibility-intervals all : build-sam $(PROGRAMS) @@@ -76,18 -74,22 +76,35 @@@ PairedEndQModel.h : utils.h Orientation HitWrapper.h : HitContainer.h --BamWriter.h : sam/sam.h sam/bam.h utils.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h 
++sam_rsem_aux.h : sam/bam.h ++ ++sam_rsem_cvt.h : sam/bam.h Transcript.h Transcripts.h ++ ++BamWriter.h : sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h SingleHit.h PairedEndHit.h HitWrapper.h Transcript.h Transcripts.h + +sampling.h : boost/random.hpp rsem-run-em : EM.o sam/libbam.a $(CC) -o rsem-run-em EM.o sam/libbam.a -lz -lpthread - EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sampling.h boost/random.hpp EM.cpp -EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h EM.cpp simul.h ++EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h simul.h sam_rsem_aux.h sampling.h boost/random.hpp EM.cpp $(CC) $(COFLAGS) EM.cpp - rsem-bam2wig : sam/bam.h sam/sam.h sam/libbam.a bam2wig.cpp - $(CC) -O3 -Wall bam2wig.cpp sam/libbam.a -lz -o rsem-bam2wig ++bc_aux.h : 
sam/bam.h ++ ++BamConverter.h : utils.h sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h bc_aux.h Transcript.h Transcripts.h ++ ++rsem-tbam2gbam : utils.h Transcripts.h Transcript.h bc_aux.h BamConverter.h sam/sam.h sam/bam.h sam/libbam.a sam_rsem_aux.h sam_rsem_cvt.h tbam2gbam.cpp ++ $(CC) -O3 -Wall tbam2gbam.cpp sam/libbam.a -lz -o $@ ++ + rsem-bam2wig : wiggle.h wiggle.o sam/libbam.a bam2wig.cpp + $(CC) -O3 -Wall bam2wig.cpp wiggle.o sam/libbam.a -lz -o $@ + + rsem-bam2readdepth : wiggle.h wiggle.o sam/libbam.a bam2readdepth.cpp + $(CC) -O3 -Wall bam2readdepth.cpp wiggle.o sam/libbam.a -lz -o $@ + + wiggle.o: sam/bam.h sam/sam.h wiggle.cpp wiggle.h + $(CC) $(COFLAGS) wiggle.cpp rsem-simulate-reads : simulation.o $(CC) -o rsem-simulate-reads simulation.o diff --cc rsem-calculate-expression index 7a9092b,06604d9..6f9706e --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@@ -44,8 -44,7 +44,9 @@@ my $estRSPD = 0 my $B = 20; my $nThreads = 1; --my $genBamF = 0; ++my $genBamF = 1; # default is generating transcript bam file ++my $genGenomeBamF = 0; +my $sampling = 0; my $calcCI = 0; my $quiet = 0; my $help = 0; @@@ -83,11 -79,9 +84,11 @@@ GetOptions("keep-intermediate-files" = "estimate-rspd" => \$estRSPD, "num-rspd-bins=i" => \$B, "p|num-threads=i" => \$nThreads, -- "out-bam" => \$genBamF, ++ "output-genome-bam" => \$genGenomeBamF, + "sampling-for-bam" => \$sampling, "calc-ci" => \$calcCI, "ci-memory=i" => \$NMB, + "time" => \$mTime, "q|quiet" => \$quiet, "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2); @@@ -146,6 -138,6 +147,14 @@@ else $sampleName = $ARGV[3]; } ++if ($genGenomeBamF) { ++ open(INPUT, "$refName.ti"); ++ my $line = ; chomp($line); ++ close(INPUT); ++ my ($M, $type) = split(/ /, $line); ++ pod2usage(-msg => "No genome information provided, so genome bam file cannot be generated!\n", -exitval => 2, -verbose => 2) if ($type != 0); ++} ++ my $pos = rindex($sampleName, '/'); if ($pos < 0) { $sampleToken = $sampleName; } else 
{ $sampleToken = substr($sampleName, $pos + 1); } @@@ -196,20 -188,14 +205,13 @@@ if (!$is_sam && !$is_bam) } $command .= " | gzip > $imdName.sam.gz"; -- print "$command\n"; - $status = system($command); - if ($status != 0) { - print "bowtie failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); - } - print "\n"; + + if ($mTime) { $time_start = time(); } + - $status = system($command); ++ &runCommand($command); + + if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; } - if ($status != 0) { - print "bowtie failed! Please check if you provide correct parameters/options for the pipeline!\n"; - exit(-1); - } - print "\n"; - $inpF = "$imdName.sam.gz"; $is_sam = 1; # output of bowtie is a sam file } @@@ -227,13 -211,13 +229,7 @@@ if ($fn_list ne "") { $command .= " -l if ($tagName ne "") { $command .= " -tag $tagName"; } if ($quiet) { $command .= " -q"; } --print "$command\n"; --$status = system($command); --if ($status != 0) { -- print "rsem-parse-alignments failed! Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); --} --print "\n"; ++&runCommand($command); $command = $dir."rsem-build-read-index $gap"; switch($read_type) { @@@ -242,16 -226,16 +238,10 @@@ case 2 { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; } case 3 { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; } } --print "$command\n"; --$status = system($command); --if ($status != 0) { -- print "rsem-build-read-index failed! 
Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); --} --print "\n"; ++&runCommand($command); -$status = open(OUTPUT, ">$imdName.mparams"); -if ($status == 0) { print "Cannot generate $imdName.mparams!\n"; exit(-1); } +my $doesOpen = open(OUTPUT, ">$imdName.mparams"); +if ($doesOpen == 0) { print "Cannot generate $imdName.mparams!\n"; exit(-1); } print OUTPUT "$minL $maxL\n"; print OUTPUT "$probF\n"; print OUTPUT "$estRSPD\n"; @@@ -271,31 -254,31 +261,22 @@@ if ($genBamF) if ($calcCI) { $command .= " --gibbs-out"; } if ($quiet) { $command .= " -q"; } --print "$command\n"; --$status = system($command); --if ($status != 0) { -- print "rsem-run-em failed! Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); --} --print "\n"; ++&runCommand($command); if ($genBamF) { -- $command = $dir."sam/samtools sort $sampleName.bam $sampleName.sorted"; -- print "$command\n"; -- $status = system($command); -- if ($status != 0) { -- print "sam/samtools sort failed! Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); -- } -- print "\n"; -- $command = $dir."sam/samtools index $sampleName.sorted.bam"; -- print "$command\n"; -- $status = system($command); -- if ($status != 0) { -- print "sam/samtools index failed! 
Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); ++ $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted"; ++ &runCommand($command); ++ $command = $dir."sam/samtools index $sampleName.transcript.sorted.bam"; ++ &runCommand($command); ++ ++ if ($genGenomeBamF) { ++ $command = $dir."rsem-tbam2gbam $refName $sampleName.transcript.bam $sampleName.genome.bam"; ++ &runCommand($command); ++ $command = $dir."sam/samtools sort $sampleName.genome.bam $sampleName.genome.sorted"; ++ &runCommand($command); ++ $command = $dir."sam/samtools index $sampleName.genome.sorted.bam"; ++ &runCommand($command); } -- print "\n"; } &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level @@@ -309,13 -288,13 +290,7 @@@ if ($calcCI) $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $CHAINLEN $SAMPLEGAP"; # $command .= " -p $nThreads"; if ($quiet) { $command .= " -q"; } -- print "$command\n"; -- $status = system($command); -- if ($status != 0) { -- print "rsem-run-gibbs failed! Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); -- } -- print "\n"; ++ &runCommand($command); system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1"); system("mv $sampleName.genes.results $imdName.genes.results.bak1"); @@@ -324,13 -303,13 +299,7 @@@ $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NSPC $NMB"; if ($quiet) { $command .= " -q"; } -- print "$command\n"; -- $status = system($command); -- if ($status != 0) { -- print "rsem-calculate-credibility-intervals failed! 
Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); -- } -- print "\n"; ++ &runCommand($command); system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2"); system("mv $sampleName.genes.results $imdName.genes.results.bak2"); @@@ -338,30 -317,14 +307,40 @@@ &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level } +if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; } + +if ($mTime) { $time_start = time(); } + if (!$keep_intermediate_files) { - $status = system("rm -rf $temp_dir"); - $status = system ("rm -rf $temp_dir"); -- if ($status != 0) { -- print "Fail to delete the temporary folder!\n"; - exit(-1); - } ++ &runCommand("rm -rf $temp_dir", "Fail to delete the temporary folder!"); +} + +if ($mTime) { $time_end = time(); } + +if ($mTime) { + open(OUTPUT, ">$sampleName.time"); + print OUTPUT "Alignment: $time_alignment s.\n"; + print OUTPUT "RSEM: $time_rsem s.\n"; + print OUTPUT "CI: $time_ci s.\n"; + my $time_del = $time_end - $time_start; + print OUTPUT "Delete: $time_del s.\n"; + close(OUTPUT); +} + ++# command, {err_msg} ++sub runCommand { ++ print $_[0]."\n"; ++ my $status = system($_[0]); ++ if ($status != 0) { ++ my $errmsg; ++ if (scalar(@_) > 1) { $errmsg = $_[1]; } ++ else { $errmsg = "$command failed! Plase check if you provide correct parameters/options for the pipeline!"; } ++ print $errmsg."\n"; + exit(-1); + } ++ print "\n"; + } + # inpF, outF sub collectResults { my $local_status; @@@ -479,14 -442,10 +458,14 @@@ RSEM reads header information from inpu Number of threads to use. Both Bowtie and expression estimation will use this many threads. (Default: 1) --=item B<--out-bam> ++=item B<--output-genome-bam> + - Generate a BAM file, 'sample_name.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 
'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' will be generated. (Default: off) ++Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off) -Generate a BAM file, 'sample_name.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' will be generated. (Default: off) +=item B<--sampling-for-bam> - When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled and outputed according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is outputed. It cannot be specified unless --out-bam is specified. (Default: off) ++When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled and outputed according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is outputed. (Default: off) + =item B<--calc-ci> Calculate 95% credibility intervals and posterior mean estimates. (Default: off) @@@ -575,13 -534,13 +554,13 @@@ Show help information =head1 DESCRIPTION --In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments. RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified. 
Users may use an alternative aligner by specifying one of the --sam and --bam options, and providing an alignment file in the specified format. However, users should make sure the alignment file satisfies the requirements mentioned in ARGUMENTS section. ++In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments. RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified. Users may use an alternative aligner by specifying one of the --sam and --bam options, and providing an alignment file in the specified format. However, users should make sure that they align against the indices generated by 'rsem-prepare-reference' and the alignment file satisfies the requirements mentioned in ARGUMENTS section. One simple way to make the alignment file (e.g. input.sam) satisfying RSEM's requirements (assuming the aligner used put mates in a paired-end read adjacent) is to use the following command: sort -k 1,1 -s input.sam > input.sorted.sam --The SAM/BAM format RSEM uses is v1.3. However, it is compatible with old SAM/BAM format. ++The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old SAM/BAM format. The user must run 'rsem-prepare-reference' with the appropriate reference before using this program. @@@ -625,11 -584,11 +604,25 @@@ file. If no other attributes are given 'rsem-prepare-reference', there will be no tab after the tau_value field. --=item B<sample_name.bam, sample_name.sorted.bam and sample_name.sorted.bam.bai> ++=item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai> ++ ++'sample_name.transcript.bam' is a BAM-formatted file of read ++alignments in transcript coordinates. The MAPQ field of each alignment ++is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the ++posterior probability of that alignment being the true mapping of a ++read. In addition, RSEM pads a new tag ZW:f:value, where value is a ++single precision floating number representing the posterior ++probability.
++ ++'sample_name.transcript.sorted.bam' and ++'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and ++indices generated by samtools (included in RSEM package). ++ ++=item B<sample_name.genome.bam, sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai> --Only generated when --out-bam is specified. ++Only generated when --output-genome-bam is specified. --'sample_name.bam' is a BAM-formatted file of read alignments in ++'sample_name.genome.bam' is a BAM-formatted file of read alignments in genomic coordinates. Alignments of reads that have identical genomic coordinates (i.e., alignments to different isoforms that share the same genomic region) are collapsed into one alignment. The MAPQ field @@@ -639,7 -598,7 +632,7 @@@ the true mapping of a read. In additio ZW:f:value, where value is a single precision floating number representing the posterior probability. --'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' are the ++'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and indices generated by samtools (included in RSEM package). =item B @@@ -652,16 -611,16 +645,16 @@@ This is a folder instead of a file. Al Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mm9'. --1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a BAM file: ++1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a genome BAM file: rsem-calculate-expression --phred64-quals \ -p 8 \ -- --out-bam \ ++ --output-genome-bam \ /data/mmliver.fq \ /ref/mm9 \ mmliver_single_quals --2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with quality scores. Quality scores are in SANGER format.
We want to use 8 threads and do not generate a BAM file: ++2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with quality scores. Quality scores are in SANGER format. We want to use 8 threads and do not generate a genome BAM file: rsem-calculate-expression -p 8 \ --paired-end \ @@@ -670,7 -629,7 +663,7 @@@ /ref/mm9 \ mmliver_paired_end_quals --3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads and generate a BAM file: ++3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads: rsem-calculate-expression -p 8 \ --no-qualities \ @@@ -678,21 -637,21 +671,21 @@@ /ref/mm9 \ mmliver_single_without_quals --4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals. We allow RSEM to use 1GB of memory for CI calculation. ++4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals. We allow RSEM to use 1GB of memory for CI calculation: rsem-calculate-expression --bowtie-path /sw/bowtie \ --phred64-quals \ --fragment-length-mean 150.0 \ --fragment-length-sd 35.0 \ -p 8 \ -- --out-bam \ ++ --output-genome-bam \ --calc-ci \ --ci-memory 1024 \ /data/mmliver.fq \ /ref/mm9 \ mmliver_single_quals --5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores. We want to use 8 threads and do not generate a BAM file: ++5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores. 
We want to use 8 threads: rsem-calculate-expression --paired-end \ --bam \ diff --cc rsem-prepare-reference index 47d7d9c,ecea8d7..6eb6b60 --- a/rsem-prepare-reference +++ b/rsem-prepare-reference @@@ -51,6 -51,6 +51,8 @@@ if ($size == 1 && (-d $list[0])) $size = scalar(@list); } ++pod2usage(-msg => "reference_fasta_file(s) is empty! Please check if you provide the correct folder name or file suffixes!", -exitval => 2, -verbose => 2) if ($size <= 0); ++ if ($no_polyA) { $polyAChoice = 1 } elsif ($subsetFile ne "") { $polyAChoice = 2; } @@@ -65,13 -65,13 +67,7 @@@ if ($type == 0) if ($mappingF ne "") { $command .= " 1 $mappingF"; } else { $command .= " 0"; } $command .= " @list"; -- print "$command\n"; -- $status = system($command); -- if ($status != 0) { -- print "rsem-extract-reference-transcripts failed! Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); -- } -- print "\n"; ++ &runCommand($command); } else { $"=" "; @@@ -79,37 -79,37 +75,33 @@@ if ($mappingF ne "") { $command .= " 1 $mappingF"; } else { $command .= " 0"; } $command .= " @list"; -- print "$command\n"; -- $status = system($command); -- if ($status != 0) { -- print "rsem-synthesis-reference-transcripts failed! Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); -- } -- print "\n"; ++ &runCommand($command); } $command = $dir."rsem-preref $ARGV[1].transcripts.fa $polyAChoice $ARGV[1] -l $polyALen"; if ($polyAChoice == 2) { $command .= " -f $subsetFile"; } if ($no_ntog) { $command .= " --no-ntog"; } if ($quiet) { $command .= " -q"; } -- --print "$command\n"; --$status = system($command); --if ($status != 0) { -- print "rsem-preref failed! 
Please check if you provide correct parameters/options for the pipeline!\n"; -- exit(-1); --} --print "\n"; ++ ++&runCommand($command); if (!$no_bowtie) { $command = $bowtie_path."bowtie-build -f"; if ($quiet) { $command .= " -q"; } $command .= " $ARGV[1].idx.fa $ARGV[1]"; -- -- print "$command\n"; -- $status = system($command); -- if ($status != 0) { - print "bowtie-build failed! Please check if you have a copy of bowtie-build in the path you specified!\n"; - print "bowtie-build failed! Please check if you provide correct parameters/options for the pipeline!\n"; ++ ++ &runCommand($command); ++} ++ ++# command, {err_msg} ++sub runCommand { ++ print $_[0]."\n"; ++ my $status = system($_[0]); ++ if ($status != 0) { ++ my $errmsg; ++ if (scalar(@_) > 1) { $errmsg = $_[1]; } ++ else { $errmsg = "$command failed! Plase check if you provide correct parameters/options for the pipeline!"; } ++ print $errmsg."\n"; exit(-1); } print "\n"; diff --cc sam/ChangeLog index dd62b49,dd62b49..a471838 --- a/sam/ChangeLog +++ b/sam/ChangeLog @@@ -1,3 -1,3 +1,490 @@@ ++------------------------------------------------------------------------ ++r925 | lh3lh3 | 2011-02-28 15:45:17 -0500 (Mon, 28 Feb 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/phase.c ++ ++minor changes to a heuristic rule ++ ++------------------------------------------------------------------------ ++r924 | lh3lh3 | 2011-02-28 15:24:04 -0500 (Mon, 28 Feb 2011) | 4 lines ++Changed paths: ++ M /trunk/samtools/bam.h ++ M /trunk/samtools/bcftools/vcfutils.pl ++ M /trunk/samtools/phase.c ++ ++ * 0.1.12-r924:126 ++ * fixed a bug in phase (due to recent changes) ++ * fixed a bug in vcf2fq ++ ++------------------------------------------------------------------------ ++r923 | lh3lh3 | 2011-02-28 12:57:39 -0500 (Mon, 28 Feb 2011) | 5 lines ++Changed paths: ++ M /trunk/samtools/Makefile ++ M /trunk/samtools/bam.h ++ M /trunk/samtools/bam_plcmd.c ++ M /trunk/samtools/bamtk.c ++ M /trunk/samtools/phase.c ++ ++ * put version 
number in bam.h ++ * write version to BCF ++ * in phase, change the default -q to 37 ++ * output a little more information during phasing ++ ++------------------------------------------------------------------------ ++r922 | lh3lh3 | 2011-02-25 16:40:09 -0500 (Fri, 25 Feb 2011) | 3 lines ++Changed paths: ++ M /trunk/samtools/bam2bcf.c ++ M /trunk/samtools/bamtk.c ++ M /trunk/samtools/bcftools/bcf.c ++ M /trunk/samtools/bcftools/bcf.tex ++ M /trunk/samtools/bcftools/bcf2qcall.c ++ M /trunk/samtools/bcftools/bcfutils.c ++ M /trunk/samtools/bcftools/ld.c ++ M /trunk/samtools/bcftools/prob1.c ++ M /trunk/samtools/bcftools/vcf.c ++ M /trunk/samtools/cut_target.c ++ ++ * change the order of PL/GL according to the latest VCF spec ++ * change the type of SP to int32_t ++ ++------------------------------------------------------------------------ ++r921 | lh3lh3 | 2011-02-25 14:40:56 -0500 (Fri, 25 Feb 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/bcf.tex ++ ++update the BCF spec ++ ++------------------------------------------------------------------------ ++r920 | lh3lh3 | 2011-02-25 00:59:27 -0500 (Fri, 25 Feb 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/Makefile ++ M /trunk/samtools/bam_md.c ++ M /trunk/samtools/bam_plcmd.c ++ M /trunk/samtools/bam_sort.c ++ M /trunk/samtools/bamtk.c ++ A /trunk/samtools/cut_target.c ++ M /trunk/samtools/errmod.h ++ M /trunk/samtools/faidx.c ++ M /trunk/samtools/khash.h ++ M /trunk/samtools/kstring.c ++ M /trunk/samtools/kstring.h ++ A /trunk/samtools/phase.c ++ M /trunk/samtools/samtools.1 ++ ++added the phase command ++ ++------------------------------------------------------------------------ ++r918 | lh3lh3 | 2011-02-24 10:05:54 -0500 (Thu, 24 Feb 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/prob1.c ++ M /trunk/samtools/bcftools/prob1.h ++ ++added "const" to bcf_p1_cal() ++ ++------------------------------------------------------------------------ ++r917 | lh3lh3 | 2011-02-24 09:36:30 -0500 
(Thu, 24 Feb 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/bam.c ++ ++more meaningful BAM truncation message ++ ++------------------------------------------------------------------------ ++r916 | lh3lh3 | 2011-02-24 09:35:06 -0500 (Thu, 24 Feb 2011) | 3 lines ++Changed paths: ++ M /trunk/samtools/bcftools/bcf.c ++ M /trunk/samtools/bcftools/vcf.c ++ ++ * automatically fix errors in GL ++ * output unrecognized FORMAT as "." ++ ++------------------------------------------------------------------------ ++r913 | lh3lh3 | 2011-02-10 22:59:47 -0500 (Thu, 10 Feb 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/bcf.h ++ M /trunk/samtools/bcftools/call1.c ++ M /trunk/samtools/bcftools/vcf.c ++ ++finished VCF->BCF conversion ++ ++------------------------------------------------------------------------ ++r910 | petulda | 2011-02-03 03:13:48 -0500 (Thu, 03 Feb 2011) | 1 line ++Changed paths: ++ M /trunk/samtools/bcftools/vcfutils.pl ++ ++Prevent division by zero ++------------------------------------------------------------------------ ++r909 | lh3lh3 | 2011-02-02 11:29:20 -0500 (Wed, 02 Feb 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/call1.c ++ ++fixed a typo in the VCF header ++ ++------------------------------------------------------------------------ ++r908 | lh3lh3 | 2011-02-02 11:28:24 -0500 (Wed, 02 Feb 2011) | 3 lines ++Changed paths: ++ M /trunk/samtools/bam2bcf.c ++ M /trunk/samtools/bam_index.c ++ ++ * fixed an out-of-boundary bug ++ * improved sorting order checking in index ++ ++------------------------------------------------------------------------ ++r907 | lh3lh3 | 2011-01-29 22:59:20 -0500 (Sat, 29 Jan 2011) | 4 lines ++Changed paths: ++ M /trunk/samtools/INSTALL ++ M /trunk/samtools/bam_tview.c ++ M /trunk/samtools/knetfile.c ++ ++ * avoid a segfault when network connect fails ++ * update INSTALL ++ * fixed a bug in tview on big-endian by Nathan Weeks ++ 
++------------------------------------------------------------------------ ++r903 | lh3lh3 | 2011-01-27 14:50:02 -0500 (Thu, 27 Jan 2011) | 3 lines ++Changed paths: ++ M /trunk/samtools/bam2bcf_indel.c ++ M /trunk/samtools/bam_md.c ++ ++ * fixed a rare memory issue in bam_md.c ++ * fixed a bug in indel calling related to unmapped and refskip reads ++ ++------------------------------------------------------------------------ ++r902 | lh3lh3 | 2011-01-23 21:46:18 -0500 (Sun, 23 Jan 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/fet.c ++ ++fixed two minor bugs in Fisher's exact test ++ ++------------------------------------------------------------------------ ++r899 | petulda | 2011-01-19 09:28:02 -0500 (Wed, 19 Jan 2011) | 1 line ++Changed paths: ++ M /trunk/samtools/bcftools/vcfutils.pl ++ ++Skip sites with unknown ref ++------------------------------------------------------------------------ ++r898 | lh3lh3 | 2011-01-15 12:56:05 -0500 (Sat, 15 Jan 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/ChangeLog ++ M /trunk/samtools/bam_maqcns.c ++ M /trunk/samtools/bam_md.c ++ ++move bam_nt16_nt4_table[] from bam_maqcns.c to bam_md.c ++ ++------------------------------------------------------------------------ ++r896 | lh3lh3 | 2011-01-06 10:52:15 -0500 (Thu, 06 Jan 2011) | 3 lines ++Changed paths: ++ M /trunk/samtools/bam_plcmd.c ++ M /trunk/samtools/bamtk.c ++ M /trunk/samtools/bcftools/bcf.h ++ M /trunk/samtools/bcftools/bcfutils.c ++ M /trunk/samtools/bcftools/call1.c ++ ++ * samtools-0.1.12-10 (r896) ++ * allow to exclude read groups in mpileup ++ ++------------------------------------------------------------------------ ++r895 | lh3lh3 | 2011-01-04 11:31:29 -0500 (Tue, 04 Jan 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/bcf.tex ++ ++sorry. 
It is SP not ST ++ ++------------------------------------------------------------------------ ++r894 | lh3lh3 | 2011-01-04 11:29:06 -0500 (Tue, 04 Jan 2011) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/bcf.tex ++ ++added ST ++ ++------------------------------------------------------------------------ ++r893 | petulda | 2011-01-04 06:55:56 -0500 (Tue, 04 Jan 2011) | 1 line ++Changed paths: ++ M /trunk/samtools/bcftools/call1.c ++ ++Fixed a typo in read_samples ++------------------------------------------------------------------------ ++r892 | jmarshall | 2010-12-28 08:06:49 -0500 (Tue, 28 Dec 2010) | 9 lines ++Changed paths: ++ M /trunk/samtools/Makefile ++ M /trunk/samtools/bcftools/Makefile ++ M /trunk/samtools/examples/Makefile ++ ++System libraries go *after* user libraries in link commands, because ++the user libraries may themselves have dependencies that are satisfied ++by the system libraries. It's not rocket science! ++ ++This makes a difference with some linkers; or with -static or --as-needed. ++ ++The examples/Makefile fix is from Charles Plessy. 
++See also http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=606004 ++ ++------------------------------------------------------------------------ ++r891 | lh3lh3 | 2010-12-21 12:16:33 -0500 (Tue, 21 Dec 2010) | 3 lines ++Changed paths: ++ M /trunk/samtools/bamtk.c ++ M /trunk/samtools/bcftools/bcf.h ++ M /trunk/samtools/bcftools/bcfutils.c ++ M /trunk/samtools/bcftools/call1.c ++ ++ * samtools-0.1.12-9 (r891) ++ * allow to call SNPs from a subset of samples ++ ++------------------------------------------------------------------------ ++r889 | lh3lh3 | 2010-12-15 11:28:16 -0500 (Wed, 15 Dec 2010) | 3 lines ++Changed paths: ++ M /trunk/samtools/bam2bcf.c ++ M /trunk/samtools/bamtk.c ++ ++ * samtools-0.1.12-12 (r889) ++ * set mapQ as 20 if it equals 255 ++ ++------------------------------------------------------------------------ ++r888 | lh3lh3 | 2010-12-14 22:41:09 -0500 (Tue, 14 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bam_plcmd.c ++ M /trunk/samtools/bamtk.c ++ ++When -B is applied to mpileup, still use paired reads only unless -A is flagged. ++ ++------------------------------------------------------------------------ ++r887 | lh3lh3 | 2010-12-14 22:37:05 -0500 (Tue, 14 Dec 2010) | 3 lines ++Changed paths: ++ M /trunk/samtools/bam_md.c ++ M /trunk/samtools/bam_plcmd.c ++ M /trunk/samtools/bamtk.c ++ ++ * samtools-0.1.12-6 (r887) ++ * added a hidden option -E to mpileup/calmd. -E triggers an alternative way to apply BAQ. ++ ++------------------------------------------------------------------------ ++r886 | lh3lh3 | 2010-12-14 12:51:03 -0500 (Tue, 14 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bam2bcf_indel.c ++ M /trunk/samtools/bamtk.c ++ ++(Arguably) improved the indel caller a tiny bit for lowCov data. 
++ ++------------------------------------------------------------------------ ++r885 | petulda | 2010-12-14 04:55:46 -0500 (Tue, 14 Dec 2010) | 1 line ++Changed paths: ++ M /trunk/samtools/bcftools/call1.c ++ ++Fixed the VCF header to pass validation ++------------------------------------------------------------------------ ++r884 | lh3lh3 | 2010-12-12 23:02:19 -0500 (Sun, 12 Dec 2010) | 3 lines ++Changed paths: ++ M /trunk/samtools/bam2bcf_indel.c ++ M /trunk/samtools/bamtk.c ++ M /trunk/samtools/bcftools/vcfutils.pl ++ ++ * samtools-0.1.12-4 (r884) ++ * fixed a long-existing flaw in the INDEL calling model ++ ++------------------------------------------------------------------------ ++r883 | lh3lh3 | 2010-12-11 20:05:42 -0500 (Sat, 11 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/bcfutils.c ++ M /trunk/samtools/bcftools/call1.c ++ M /trunk/samtools/bcftools/vcfutils.pl ++ ++compute max SP and max GQ from sample genotypes ++ ++------------------------------------------------------------------------ ++r880 | lh3lh3 | 2010-12-10 10:50:54 -0500 (Fri, 10 Dec 2010) | 2 lines ++Changed paths: ++ D /trunk/samtools/bcftools/bcf-fix.pl ++ ++drop bcf-fix.pl as it is redundant by the latest changes ++ ++------------------------------------------------------------------------ ++r879 | lh3lh3 | 2010-12-10 10:50:29 -0500 (Fri, 10 Dec 2010) | 3 lines ++Changed paths: ++ M /trunk/samtools/bcftools/call1.c ++ M /trunk/samtools/bcftools/vcf.c ++ ++ * fixed a minor issue in printing VCFs ++ * write bcftools specific INFO and FORMAT in the header ++ ++------------------------------------------------------------------------ ++r878 | lh3lh3 | 2010-12-10 10:09:14 -0500 (Fri, 10 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bamtk.c ++ M /trunk/samtools/bcftools/bcfutils.c ++ M /trunk/samtools/bcftools/call1.c ++ ++Make sure that the GT genotype field is the first ++ ++------------------------------------------------------------------------ ++r877 | lh3lh3 
| 2010-12-08 17:27:05 -0500 (Wed, 08 Dec 2010) | 7 lines ++Changed paths: ++ M /trunk/samtools/bam2bcf.c ++ M /trunk/samtools/bam2bcf.h ++ M /trunk/samtools/bam2bcf_indel.c ++ M /trunk/samtools/bam_plcmd.c ++ M /trunk/samtools/bamtk.c ++ ++ * samtools-0.1.12-2 (r877) ++ ++ * allow to fine control the selection of indel candidates. The current ++ setting is okay for lowCov and highCov with ~100 samples, but it ++ skips too many indels for highCov with >250 samples. ++ ++ ++------------------------------------------------------------------------ ++r874 | lh3lh3 | 2010-12-07 22:40:35 -0500 (Tue, 07 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bam_plcmd.c ++ ++a spelling error.. ++ ++------------------------------------------------------------------------ ++r873 | lh3lh3 | 2010-12-07 22:39:57 -0500 (Tue, 07 Dec 2010) | 3 lines ++Changed paths: ++ M /trunk/samtools/bam_plcmd.c ++ M /trunk/samtools/bamtk.c ++ ++ * samtools-0.1.12-1 (r873) ++ * added a switch to allow anomalous read pairs in calling ++ ++------------------------------------------------------------------------ ++r872 | lh3lh3 | 2010-12-07 14:43:54 -0500 (Tue, 07 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/vcfutils.pl ++ ++fixed a bug in vcf2fq ++ ++------------------------------------------------------------------------ ++r869 | lh3lh3 | 2010-12-05 01:18:06 -0500 (Sun, 05 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bamtk.c ++ ++added a warning for the Windows version ++ ++------------------------------------------------------------------------ ++r868 | lh3lh3 | 2010-12-05 01:05:51 -0500 (Sun, 05 Dec 2010) | 4 lines ++Changed paths: ++ M /trunk/samtools/bcftools/call1.c ++ ++In ksprintf(), change "%lf" and "%lg" to "%f" and "%g", respectively. ++According to the manual page, this change is valid. However, MinGW seems ++to interpret "%lf" as "%Lf". 
++ ++------------------------------------------------------------------------ ++r867 | lh3lh3 | 2010-12-05 00:35:43 -0500 (Sun, 05 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/Makefile.mingw ++ M /trunk/samtools/bam_aux.c ++ ++bring back the windows support ++ ++------------------------------------------------------------------------ ++r866 | lh3lh3 | 2010-12-04 23:33:51 -0500 (Sat, 04 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bam_reheader.c ++ M /trunk/samtools/bcftools/vcfutils.pl ++ ++Fixed a compiling error when knetfile is not used. ++ ++------------------------------------------------------------------------ ++r865 | lh3lh3 | 2010-12-04 00:13:22 -0500 (Sat, 04 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/vcfutils.pl ++ ++vcf->fastq ++ ++------------------------------------------------------------------------ ++r864 | lh3lh3 | 2010-12-03 17:12:30 -0500 (Fri, 03 Dec 2010) | 3 lines ++Changed paths: ++ M /trunk/samtools/bcftools/call1.c ++ M /trunk/samtools/bcftools/prob1.c ++ M /trunk/samtools/bcftools/prob1.h ++ ++ * remove "-f". 
Instead always compute consensus quality ++ * increase the upper limit of quality ++ ++------------------------------------------------------------------------ ++r863 | lh3lh3 | 2010-12-03 15:28:15 -0500 (Fri, 03 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/bcf.c ++ ++more informative error message ++ ++------------------------------------------------------------------------ ++r862 | lh3lh3 | 2010-12-02 16:16:08 -0500 (Thu, 02 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/NEWS ++ M /trunk/samtools/bamtk.c ++ ++Release samtools-0.1.12a ++ ++------------------------------------------------------------------------ ++r861 | lh3lh3 | 2010-12-02 15:55:06 -0500 (Thu, 02 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/call1.c ++ ++a possible fix to DP4=0,0,0,0; have not tested, but should have no side-effect ++ ++------------------------------------------------------------------------ ++r859 | lh3lh3 | 2010-12-02 11:39:57 -0500 (Thu, 02 Dec 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/NEWS ++ M /trunk/samtools/bam_index.c ++ M /trunk/samtools/bamtk.c ++ M /trunk/samtools/samtools.1 ++ ++Release samtools-0.1.12 ++ ++------------------------------------------------------------------------ ++r858 | lh3lh3 | 2010-12-02 11:24:41 -0500 (Thu, 02 Dec 2010) | 4 lines ++Changed paths: ++ M /trunk/samtools/bam_plcmd.c ++ M /trunk/samtools/bamtk.c ++ M /trunk/samtools/bcftools/bcf.c ++ ++ * samtools-0.1.11-1 (r858) ++ * fixed a bug in mpileup which causes segfaults ++ * bcftools: do not segfault when BCF contains errors ++ ++------------------------------------------------------------------------ ++r857 | lh3lh3 | 2010-11-30 23:52:50 -0500 (Tue, 30 Nov 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bam_index.c ++ ++fixed a memory leak in bam_fetch() ++ ++------------------------------------------------------------------------ ++r856 | lh3lh3 | 2010-11-26 00:07:31 -0500 (Fri, 26 Nov 2010) | 3 lines ++Changed paths: ++ 
M /trunk/samtools/bam2bcf_indel.c ++ M /trunk/samtools/bcftools/vcfutils.pl ++ ++ * fixed a memory violation ++ * added splitchr to vcfutils.pl ++ ++------------------------------------------------------------------------ ++r854 | lh3lh3 | 2010-11-23 09:05:08 -0500 (Tue, 23 Nov 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bcftools/ld.c ++ ++fixed a typo/bug in r^2 computation ++ ++------------------------------------------------------------------------ ++r852 | lh3lh3 | 2010-11-21 22:20:20 -0500 (Sun, 21 Nov 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/bamtk.c ++ ++forget to change the version information ++ ++------------------------------------------------------------------------ ++r851 | lh3lh3 | 2010-11-21 22:16:52 -0500 (Sun, 21 Nov 2010) | 2 lines ++Changed paths: ++ M /trunk/samtools/ChangeLog ++ M /trunk/samtools/NEWS ++ M /trunk/samtools/bcftools/bcftools.1 ++ M /trunk/samtools/samtools.1 ++ ++Release samtools-0.1.11 ++ ------------------------------------------------------------------------ r844 | lh3lh3 | 2010-11-19 23:16:08 -0500 (Fri, 19 Nov 2010) | 3 lines Changed paths: diff --cc sam/INSTALL index f1cf7aa,f1cf7aa..37d84a9 --- a/sam/INSTALL +++ b/sam/INSTALL @@@ -1,29 -1,29 +1,30 @@@ System Requirements =================== --SAMtools depends on the zlib library . The latest --version 1.2.3 is preferred and with the latest version you can compile --razip and use it to compress a FASTA file. SAMtools' faidx is able to --index a razip-compressed FASTA file to save diskspace. Older zlib also --works with SAMtools, but razip cannot be compiled. ++SAMtools depends on the zlib library . Version 1.2.3+ is ++preferred and with 1.2.3+ you can compile razip and use it to compress a FASTA ++file. SAMtools' faidx is able to index a razip-compressed FASTA file to save ++diskspace. Older zlib also works with SAMtools, but razip cannot be compiled. 
The text-based viewer (tview) requires the GNU ncurses library --, which comes with Mac OS X and --most of the modern Linux/Unix distributions. If you do not have this --library installed, you can still compile the rest of SAMtools by --manually modifying one line in Makefile. ++, which comes with Mac OS X and most of ++the modern Linux/Unix distributions. If you do not have this library installed, ++you can still compile the rest of SAMtools by manually changing: ++`-D_CURSES_LIB=1' to `-D_CURSES_LIB=0' at the line starting with `DFLAGS=', and ++comment out the line starting with `LIBCURSES='. Compilation =========== --Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can --compile razip with `make razip'. ++Type `make' to compile samtools. If you have zlib >= 1.2.2.1, you can compile ++razip with `make razip'. Installation ============ --Simply copy `samtools' and other executables/scripts in `misc' to a --location you want (e.g. a directory in your $PATH). No further --configurations are required. ++Copy `samtools', `bcftools/bcftools' and other executables/scripts in `misc' to ++a location you want (e.g. a directory in your $PATH). You may also copy ++`samtools.1' and `bcftools/bcftools.1' to a directory in your $MANPATH such ++that the `man' command may find the manual. 
diff --cc sam/Makefile index 13d4a76,13d4a76..db18333 --- a/sam/Makefile +++ b/sam/Makefile @@@ -1,13 -1,13 +1,14 @@@ CC= gcc CFLAGS= -g -Wall -O2 #-m64 #-arch ppc --DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1 ++DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ -- bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ -- $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o --AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ ++ bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \ ++ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o ++AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ -- bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o ++ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ ++ cut_target.o phase.o bam2depth.o PROG= samtools INCLUDES= -I. SUBDIRS= . 
bcftools misc @@@ -37,10 -37,10 +38,10 @@@ all:$(PROG lib:libbam.a libbam.a:$(LOBJS) -- $(AR) -cru $@ $(LOBJS) ++ $(AR) -csru $@ $(LOBJS) samtools:lib-recur $(AOBJS) -- $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz -Lbcftools -lbcf ++ $(CC) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf $(LIBCURSES) -lm -lz razip:razip.o razf.o $(KNETFILE_O) $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz @@@ -53,19 -53,19 +54,19 @@@ bam.o:bam.h razf.h bam_endian.h kstring sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h --bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bcftools/bcf.h bam2bcf.h ++bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h --bam_tview.o:bam.h faidx.h bam_maqcns.h --bam_maqcns.o:bam.h ksort.h bam_maqcns.h kaln.h ++bam_tview.o:bam.h faidx.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h --glf.o:glf.h sam_header.o:sam_header.h khash.h bcf.o:bcftools/bcf.h bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h bam2bcf_indel.o:bam2bcf.h errmod.o:errmod.h ++phase.o:bam.h khash.h ksort.h ++bamtk.o:bam.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h diff --cc sam/Makefile.mingw index 9df4b9a,9df4b9a..7a57ffc --- a/sam/Makefile.mingw +++ b/sam/Makefile.mingw @@@ -1,18 -1,18 +1,22 @@@ CC= gcc.exe AR= ar.exe CFLAGS= -g -Wall -O2 --DFLAGS= -D_CURSES_LIB=2 -D_USE_KNETFILE ++DFLAGS= -D_USE_KNETFILE -D_CURSES_LIB=2 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ -- bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o bam_sort.o \ -- $(KNETFILE_O) --AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ ++ bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o \ ++ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bedidx.o ++AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ -- 
bamtk.o kaln.o sam_header.o --PROG= samtools --INCLUDES= -Iwin32 ++ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ ++ cut_target.o phase.o bam_cat.o bam2depth.o ++BCFOBJS= bcftools/bcf.o bcftools/fet.o bcftools/bcf2qcall.o bcftools/bcfutils.o \ ++ bcftools/call1.o bcftools/index.o bcftools/kfunc.o bcftools/em.o \ ++ bcftools/kmin.o bcftools/prob1.o bcftools/vcf.o bcftools/mut.o ++PROG= samtools.exe bcftools.exe ++INCLUDES= -I. -Iwin32 SUBDIRS= . --LIBPATH= ++LIBPATH= .SUFFIXES:.c .o @@@ -29,31 -29,31 +33,31 @@@ lib:libbam. libbam.a:$(LOBJS) $(AR) -cru $@ $(LOBJS) --samtools:$(AOBJS) libbam.a -- $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32 ++samtools.exe:$(AOBJS) libbam.a $(BCFOBJS) ++ $(CC) $(CFLAGS) -o $@ $(AOBJS) $(BCFOBJS) $(LIBPATH) -lm -L. -lbam -Lwin32 -lz -lcurses -lws2_32 --razip:razip.o razf.o $(KNETFILE_O) -- $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz -- --bgzip:bgzip.o bgzf.o $(KNETFILE_O) -- $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz ++bcftools.exe:$(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o ++ $(CC) $(CFLAGS) -o $@ $(BCFOBJS) bcftools/main.o kstring.o bgzf.o knetfile.o bedidx.o -lm -Lwin32 -lz -lws2_32 razip.o:razf.h --bam.o:bam.h razf.h bam_endian.h kstring.h ++bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h --bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h ++bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h --bam_tview.o:bam.h faidx.h bam_maqcns.h --bam_maqcns.o:bam.h ksort.h bam_maqcns.h ++bam_tview.o:bam.h faidx.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h --glf.o:glf.h ++sam_header.o:sam_header.h khash.h ++bcf.o:bcftools/bcf.h ++bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h ++bam2bcf_indel.o:bam2bcf.h ++errmod.o:errmod.h faidx.o:faidx.h razf.h khash.h 
faidx_main.o:faidx.h razf.h clean: -- rm -fr gmon.out *.o *.exe *.dSYM razip bgzip $(PROG) *~ *.a ++ rm -fr gmon.out *.o a.out *.exe *.dSYM razip bgzip $(PROG) *~ *.a *.so.* *.so *.dylib diff --cc sam/NEWS index 6b4d8aa,6b4d8aa..41a6cc8 --- a/sam/NEWS +++ b/sam/NEWS @@@ -1,3 -1,3 +1,274 @@@ ++Beta Release 0.1.18 (2 September, 2011) ++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Notable changes in samtools: ++ ++ * Support the new =/X CIGAR operators (by Peter Cock). ++ ++ * Allow to subsample BAM while keeping the pairing intact (view -s). ++ ++ * Implemented variant distance bias as a new filter (by Petr Danecek). ++ ++ * Bugfix: huge memory usage during indexing ++ ++ * Bugfix: use of uninitialized variable in mpileup (rare) ++ ++ * Bugfix: wrong BAQ probability (rare) ++ ++Notable changes in bcftools: ++ ++ * Support indel in the contrast caller. ++ ++ * Bugfix: LRT2=nan in rare cases ++ ++(0.1.18: 2 September 2011, r982:295) ++ ++ ++ ++Beta Release 0.1.17 (6 July, 2011) ++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++With the maturity of `mpileup' and the lack of update in the `pileup' command, ++the `pileup' command is now formally dropped. Most of the pileup functionality, ++such as outputting mapping quality and read positions, have been added ++`mpileup'. ++ ++Since this release, `bcftools view' is able to perform contrast SNP calling ++(option -T) for discovering de novo and/or somatic mutations between a pair of ++samples or in a family trio. Potential mutations are scored by a log likelihood ++ratio, which is very simple in math, but should be comparable to more ++sophisticated methods. Note that getting the score is only the very first step. ++A lot more need to be done to reduce systematical errors due to mapping and ++reference errors and structural variations. ++ ++Other notable changes in samtools: ++ ++ * Improved sorting order checking during indexing. ++ ++ * Improved region parsing. Colons in reference sequence names are parsed ++ properly. 
++ ++ * Fixed an issue where mpileup does not apply BAQ for the first few reads when ++ a region is specified. ++ ++ * Fixed an issue where `faidx' does not work with FASTA files with long lines. ++ ++ * Bugfix: wrong SP genotype information in the BCF output. ++ ++Other notable changes in bcftools: ++ ++ * Output the ML estimate of the allele count. ++ ++ * Added the HWE plus F<0 filter to varFilter. For multiple samples, it ++ effectively filters false heterozygous calls around centromeres. ++ ++ * For association mapping, perform both 1-degree and 2-degree test. The ++ 2-degree test is conservative but more robust to HWE violation. ++ ++(0.1.17: 6 July 2011, r973:277) ++ ++ ++ ++Beta Release 0.1.16 (21 April, 2011) ++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Notable changes in samtools: ++ ++ * Support the new SAM/BAM type `B' in the latest SAM spec v1.4. ++ ++ * When the output file of `samtools merge' exists, do not overwrite it unless ++ a new command-line option `-f' is applied. ++ ++ * Bugfix: BED support is not working when the input BED is not sorted. ++ ++ * Bugfix: some reads without coordinates but given on the reverse strand are ++ lost in merging. ++ ++Notable changes in bcftools: ++ ++ * Code cleanup: separated max-likelihood inference and Bayesian inference. ++ ++ * Test Hardy-Weinberg equilibrium with a likelihood-ratio test. ++ ++ * Provided another association test P-value by likelihood-ratio test. ++ ++ * Use Brent's method to estimate the site allele frequency when EM converges ++ slowly. The resulting ML estimate of allele frequency is more accurate. ++ ++ * Added the `ldpair' command, which computes r^2 between SNP pairs given in ++ an input file. ++ ++Also, the `pileup' command, which has been deprecated by `mpileup' since ++version 0.1.10, will be dropped in the next release. The old `pileup' command ++is substandard and causing a lot of confusion. 
++ ++(0.1.16: 21 April 2011, r963:234) ++ ++ ++ ++Beta Release 0.1.15 (10 April, 2011) ++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++Notable changes: ++ ++ * Allow to perform variant calling or to extract information in multiple ++ regions specified by a BED file (`samtools mpileup -l', `samtools view -L' ++ and `bcftools view -l'). ++ ++ * Added the `depth' command to samtools to compute the per-base depth with a ++ simpler interface. File `bam2depth.c', which implements this command, is the ++ recommended example on how to use the mpileup APIs. ++ ++ * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg ++ test using this estimate. ++ ++ * For `samtools view', when `-R' is specified, drop read groups in the header ++ that are not contained in the specified file. ++ ++ * For `samtools flagstat', separate QC-pass and QC-fail reads. ++ ++ * Improved the command line help of `samtools mpileup' and `bcftools view'. ++ ++ * Use a global variable to control the verbose level of samtools stderr ++ output. Nonetheless, it has not been fully utilized. ++ ++ * Fixed an issue in association test which may report false associations, ++ possibly due to floating point underflow. ++ ++(0.1.15: 10 April 2011, r949:203) ++ ++ ++ ++Beta release 0.1.14 (21 March, 2011) ++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++This release implements a method for testing associations for case-control ++data. The method does not call genotypes but instead sums over all genotype ++configurations to compute a chi^2 based test statistic. It can be potentially ++applied to comparing a pair of samples (e.g. a tumor-normal pair), but this ++has not been evaluated on real data. ++ ++Another new feature is to make X chromosome variant calls when female and male ++samples are both present. The user needs to provide a file indicating the ++ploidy of each sample (see also manual bcftools/bcftools.1). 
++ ++Other notable changes: ++ ++ * Added `bcftools view -F' to parse BCF files generated by samtools r921 or ++ older which encodes PL in a different way. ++ ++ * Changed the behavior of `bcftools view -s'. Now when a list of samples is ++ provided, the samples in the output will be reordered to match the ordering ++ in the sample list. This change is mainly designed for association test. ++ ++ * Sped up `bcftools view -v' for target sequencing given thousands of samples. ++ Also added a new option `view -d' to skip loci where only a few samples are ++ covered by reads. ++ ++ * Dropped HWE test. This feature has never been implemented properly. An EM ++ should be much better. To be implemented in future. ++ ++ * Added the `cat' command to samtools. This command concatenates BAMs with ++ identical sequence dictionaries in an efficient way. Modified from bam_cat.c ++ written by Chris Saunders. ++ ++ * Added `samtools view -1' to write BAMs at a low compression level but twice ++ faster to create. The `sort' command generates temporary files at a low ++ compression level as well. ++ ++ * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality ++ strings (strictly speaking, such a file is not BAM). ++ ++ * Added `samtools mpileup -L' to skip INDEL calling in regions with ++ excessively high coverage. Such regions dramatically slow down mpileup. ++ ++ * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc. ++ ++(0.1.14: 21 March 2011, r933:170) ++ ++ ++ ++Beta release 0.1.13 (1 March, 2011) ++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ++ ++The most important though largely invisible modification is the change of the ++order of genotypes in the PL VCF/BCF tag. This is to conform to the upcoming VCF ++spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF ++generated by samtools older than r921 inclusive. 
VCF/BCF generated by the new ++samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools ++version number. ++ ++Single Individual Haplotyping (SIH) is added as an experimental feature. It ++originally aims to produce haploid consensus from fosmid pool sequencing, but ++also works with short-read data. For short reads, phased blocks are usually too ++short to be useful in many applications, but they can help to rule out part of ++SNPs close to INDELs or between copies of CNVs. ++ ++ ++Other notable changes in samtools: ++ ++ * Construct per-sample consensus to reduce the effect of nearby SNPs in INDEL ++ calling. This reduces the power but improves specificity. ++ ++ * Improved sorting order checking in indexing. Now indexing is the preferred way ++ to check if a BAM is sorted. ++ ++ * Added a switch `-E' to mpileup and calmd. This option uses an alternative way ++ to apply BAQ, which increases sensitivity, especially to MNPs, at the cost of ++ a little loss in specificity. ++ ++ * Added `mpileup -A' to allow to use reads in anomalous pairs in SNP calling. ++ ++ * Added `mpileup -m' to allow fine control of the collection of INDEL candidates. ++ ++ * Added `mpileup -S' to compute per-sample strand bias P-value. ++ ++ * Added `mpileup -G' to exclude read groups in variant calling. ++ ++ * Fixed segfault in indel calling related to unmapped and refskip reads. ++ ++ * Fixed an integer overflow in INDEL calling. This bug produces wrong INDEL ++ genotypes for longer short INDELs, typically over 10bp. ++ ++ * Fixed a bug in tview on big-endian machines. ++ ++ * Fixed a very rare memory issue in bam_md.c ++ ++ * Fixed an out-of-boundary bug in mpileup when the read base is `N'. ++ ++ * Fixed a compiling error when the knetfile library is not used. Fixed a ++ library compiling error due to the lack of bam_nt16_nt4_table[] table. ++ Suppress a compiling warning related to the latest zlib. 
++ ++ ++Other notable changes in bcftools: ++ ++ * Updated the BCF spec. ++ ++ * Added the `FQ' VCF INFO field, which gives the phred-scaled probability ++ of all samples being the same (identical to the reference or all homozygous ++ variants). Option `view -f' has been dropped. ++ ++ * Implemented "vcfutils.pl vcf2fq" to generate a consensus sequence ++ similar to "samtools.pl pileup2fq". ++ ++ * Make sure the GT FORMAT field is always the first FORMAT to conform the VCF ++ spec. Drop bcf-fix.pl. ++ ++ * Output bcftools specific INFO and FORMAT in the VCF header. ++ ++ * Added `view -s' to call variants from a subset of samples. ++ ++ * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless, ++ custom fields are still unparsed and will be stored as a missing value. ++ ++ * Fixed a minor bug in Fisher's exact test; the results are rarely changed. ++ ++ ++(0.1.13: 1 March 2011, r926:134) ++ ++ ++ Beta release 0.1.12a (2 December, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@@ -532,4 -532,4 +803,4 @@@ Beta Release 0.1.1 (22 December, 2008 This is the first public release of samtools. 
For more information, please check the manual page `samtools.1' and the samtools website --http://samtools.sourceforge.net ++http://samtools.sourceforge.net diff --cc sam/bam.c index 521c1dd,521c1dd..0055e84 --- a/sam/bam.c +++ b/sam/bam.c @@@ -7,7 -7,7 +7,7 @@@ #include "kstring.h" #include "sam_header.h" --int bam_is_be = 0; ++int bam_is_be = 0, bam_verbose = 2; char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; /************************** @@@ -32,7 -32,7 +32,7 @@@ int32_t bam_cigar2qlen(const bam1_core_ int32_t l = 0; for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; -- if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP) ++ if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) l += cigar[k] >> BAM_CIGAR_SHIFT; } return l; @@@ -79,7 -79,7 +79,7 @@@ bam_header_t *bam_header_read(bamFile f // with ESPIPE. Suppress the error message in this case. if (errno != ESPIPE) perror("[bam_header_read] bgzf_check_EOF"); } -- else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent.\n"); ++ else if (i == 0) fprintf(stderr, "[bam_header_read] EOF marker is absent. 
The input is probably truncated.\n"); // read "BAM1" magic_len = bam_read(fp, buf, 4); if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { @@@ -160,6 -160,6 +160,19 @@@ static void swap_endian_data(const bam1 else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } ++ else if (type == 'B') { ++ int32_t n, Bsize = bam_aux_type2size(*s); ++ memcpy(&n, s + 1, 4); ++ if (1 == Bsize) { ++ } else if (2 == Bsize) { ++ for (i = 0; i < n; i += 2) ++ bam_swap_endian_2p(s + 5 + i); ++ } else if (4 == Bsize) { ++ for (i = 0; i < n; i += 4) ++ bam_swap_endian_4p(s + 5 + i); ++ } ++ bam_swap_endian_4p(s+1); ++ } } } @@@ -255,7 -255,7 +268,7 @@@ char *bam_format1_core(const bam_header else { for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); -- kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); ++ kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); } } kputc('\t', &str); @@@ -289,6 -289,6 +302,23 @@@ else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } ++ else if (type == 'B') { ++ uint8_t sub_type = *(s++); ++ int32_t n; ++ memcpy(&n, s, 4); ++ s += 4; // no point to the start of the array ++ kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing ++ for (i = 0; i < n; ++i) { ++ kputc(',', &str); ++ if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; } ++ else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; } ++ else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; } ++ else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; } ++ else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; } ++ else if ('I' == sub_type) { 
kputuw(*(uint32_t*)s, &str); s += 4; } ++ else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; } ++ } ++ } } return str.s; } diff --cc sam/bam.h index eef2ea9,eef2ea9..346c750 --- a/sam/bam.h +++ b/sam/bam.h @@@ -33,13 -33,13 +33,15 @@@ BAM library provides I/O and various operations on manipulating files in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) -- format. It now supports importing from or exporting to TAM, sorting, ++ format. It now supports importing from or exporting to SAM, sorting, merging, generating pileup, and quickly retrieval of reads overlapped with a specified region. @copyright Genome Research Ltd. */ ++#define BAM_VERSION "0.1.18 (r982:295)" ++ #include #include #include @@@ -132,20 -132,20 +134,25 @@@ typedef struct /* CIGAR operations. */ --/*! @abstract CIGAR: match */ ++/*! @abstract CIGAR: M = match or mismatch*/ #define BAM_CMATCH 0 --/*! @abstract CIGAR: insertion to the reference */ ++/*! @abstract CIGAR: I = insertion to the reference */ #define BAM_CINS 1 --/*! @abstract CIGAR: deletion from the reference */ ++/*! @abstract CIGAR: D = deletion from the reference */ #define BAM_CDEL 2 --/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */ ++/*! @abstract CIGAR: N = skip on the reference (e.g. spliced alignment) */ #define BAM_CREF_SKIP 3 --/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */ ++/*! @abstract CIGAR: S = clip on the read with clipped sequence ++ present in qseq */ #define BAM_CSOFT_CLIP 4 --/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */ ++/*! @abstract CIGAR: H = clip on the read with clipped sequence trimmed off */ #define BAM_CHARD_CLIP 5 --/*! @abstract CIGAR: padding */ ++/*! @abstract CIGAR: P = padding */ #define BAM_CPAD 6 ++/*! @abstract CIGAR: equals = match */ ++#define BAM_CEQUAL 7 ++/*! @abstract CIGAR: X = mismatch */ ++#define BAM_CDIFF 8 /*! 
@typedef @abstract Structure for core alignment information. @@@ -262,6 -262,6 +269,12 @@@ typedef struct __bam_iter_t *bam_iter_t */ extern int bam_is_be; ++/*! ++ @abstract Verbose level between 0 and 3; 0 is supposed to disable all ++ debugging information, though this may not have been implemented. ++ */ ++extern int bam_verbose; ++ /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ extern unsigned char bam_nt16_table[256]; @@@ -738,4 -738,4 +751,13 @@@ static inline bam1_t *bam_dup1(const ba return b; } ++static inline int bam_aux_type2size(int x) ++{ ++ if (x == 'C' || x == 'c' || x == 'A') return 1; ++ else if (x == 'S' || x == 's') return 2; ++ else if (x == 'I' || x == 'i' || x == 'f') return 4; ++ else return 0; ++} ++ ++ #endif diff --cc sam/bam2bcf.c index 088635c,088635c..dec3305 --- a/sam/bam2bcf.c +++ b/sam/bam2bcf.c @@@ -11,6 -11,6 +11,7 @@@ extern void ks_introsort_uint32_t(size_ #define CALL_ETA 0.03f #define CALL_MAX 256 #define CALL_DEFTHETA 0.83f ++#define DEF_MAPQ 20 #define CAP_DIST 25 @@@ -23,6 -23,6 +24,8 @@@ bcf_callaux_t *bcf_call_init(double the bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100; bca->min_baseQ = min_baseQ; bca->e = errmod_init(1. - theta); ++ bca->min_frac = 0.002; ++ bca->min_support = 1; return bca; } @@@ -36,6 -36,6 +39,7 @@@ void bcf_call_destroy(bcf_callaux_t *bc * negative if we are looking at an indel. */ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) { ++ static int *var_pos = NULL, nvar_pos = 0; int i, n, ref4, is_indel, ori_depth = 0; memset(r, 0, sizeof(bcf_callret1_t)); if (ref_base >= 0) { @@@ -61,7 -61,7 +65,8 @@@ seqQ = is_indel? (p->aux>>8&0xff) : 99; if (q < bca->min_baseQ) continue; if (q > seqQ) q = seqQ; -- mapQ = p->b->core.qual < bca->capQ? p->b->core.qual : bca->capQ; ++ mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255 ++ mapQ = mapQ < bca->capQ? 
mapQ : bca->capQ; if (q > mapQ) q = mapQ; if (q > 63) q = 63; if (q < 4) q = 4; @@@ -75,7 -75,7 +80,7 @@@ } bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; // collect annotations -- r->qsum[b] += q; ++ if (b < 4) r->qsum[b] += q; ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)]; min_dist = p->b->core.l_qseq - 1 - p->qpos; if (min_dist > p->qpos) min_dist = p->qpos; @@@ -90,9 -90,9 +95,92 @@@ r->depth = n; r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); ++ ++ // Calculate the Variant Distance Bias (make it optional?) ++ if ( nvar_pos < _n ) { ++ nvar_pos = _n; ++ var_pos = realloc(var_pos,sizeof(int)*nvar_pos); ++ } ++ int alt_dp=0, read_len=0; ++ for (i=0; i<_n; i++) { ++ const bam_pileup1_t *p = pl + i; ++ if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) ++ continue; ++ ++ var_pos[alt_dp] = p->qpos; ++ if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 ) ++ var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT; ++ ++ alt_dp++; ++ read_len += p->b->core.l_qseq; ++ } ++ float mvd=0; ++ int j; ++ n=0; ++ for (i=0; imvd[0] = n ? mvd/n : 0; ++ r->mvd[1] = alt_dp; ++ r->mvd[2] = alt_dp ? read_len/alt_dp : 0; ++ return r->depth; } ++ ++void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call) ++{ ++ // Variant distance bias. Samples merged by means of DP-weighted average. ++ ++ float weight=0, tot_prob=0; ++ ++ int i; ++ for (i=0; i2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14); ++ } ++ else ++ { ++ // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths. 
++ if ( dp>5 ) ++ dp = 5; ++ float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1)); ++ float norm = 1.125*sqrt(2*3.14*sigma2); ++ float mu = read_len/2.9; ++ if ( mvd < mu ) ++ prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm; ++ else ++ prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm; ++ } ++ ++ //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob); ++ tot_prob += prob*dp; ++ weight += dp; ++ } ++ tot_prob = weight ? tot_prob/weight : 1; ++ //fprintf(stderr,"prob=%f\n", tot_prob); ++ call->vdb = tot_prob; ++} ++ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j, qsum[4]; @@@ -140,8 -140,8 +228,8 @@@ x = call->n_alleles * (call->n_alleles + 1) / 2; // get the possible genotypes for (i = z = 0; i < call->n_alleles; ++i) -- for (j = i; j < call->n_alleles; ++j) -- g[z++] = call->a[i] * 5 + call->a[j]; ++ for (j = 0; j <= i; ++j) ++ g[z++] = call->a[j] * 5 + call->a[i]; for (i = 0; i < n; ++i) { uint8_t *PL = call->PL + x * i; const bcf_callret1_t *r = calls + i; @@@ -166,6 -166,6 +254,9 @@@ call->ori_depth += calls[i].ori_depth; for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } ++ ++ calc_vdb(n, calls, call); ++ return 0; } @@@ -219,6 -219,6 +310,10 @@@ int bcf_call2bcf(int tid, int pos, bcf_ if (i) kputc(',', &s); kputw(bc->anno[i], &s); } ++ if ( bc->vdb!=1 ) ++ { ++ ksprintf(&s, ";VDB=%.4f", bc->vdb); ++ } kputc('\0', &s); // FMT kputs("PL", &s); @@@ -232,7 -232,7 +327,7 @@@ memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n); if (bcr) { uint16_t *dp = (uint16_t*)b->gi[1].data; -- uint8_t *sp = is_SP? b->gi[2].data : 0; ++ int32_t *sp = is_SP? b->gi[2].data : 0; for (i = 0; i < bc->n; ++i) { bcf_callret1_t *p = bcr + i; dp[i] = p->depth < 0xffff? 
p->depth : 0xffff; diff --cc sam/bam2bcf.h index 26b022c,26b022c..4af080c --- a/sam/bam2bcf.h +++ b/sam/bam2bcf.h @@@ -9,7 -9,7 +9,9 @@@ typedef struct __bcf_callaux_t { int capQ, min_baseQ; -- int openQ, extQ, tandemQ; ++ int openQ, extQ, tandemQ; // for indels ++ int min_support; // for collecting indel candidates ++ double min_frac; // for collecting indel candidates // for internal uses int max_bases; int indel_types[4]; @@@ -24,6 -24,6 +26,7 @@@ typedef struct int depth, ori_depth, qsum[4]; int anno[16]; float p[25]; ++ int mvd[3]; // mean variant distance, number of variant reads, average read length } bcf_callret1_t; typedef struct { @@@ -31,6 -31,6 +34,7 @@@ int n, n_alleles, shift, ori_ref, unseen; int anno[16], depth, ori_depth; uint8_t *PL; ++ float vdb; // variant distance bias } bcf_call_t; #ifdef __cplusplus diff --cc sam/bam2bcf_indel.c index 16241d0,16241d0..5142b3e --- a/sam/bam2bcf_indel.c +++ b/sam/bam2bcf_indel.c @@@ -3,15 -3,15 +3,16 @@@ #include #include "bam.h" #include "bam2bcf.h" --#include "ksort.h" #include "kaln.h" #include "kprobaln.h" #include "khash.h" KHASH_SET_INIT_STR(rg) ++#include "ksort.h" ++KSORT_INIT_GENERIC(uint32_t) ++ #define MINUS_CONST 0x10000000 #define INDEL_WINDOW_SIZE 50 --#define MIN_SUPPORT_COEF 500 void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list) { @@@ -65,7 -65,7 +66,7 @@@ static int tpos2qpos(const bam1_core_t for (k = 0; k < c->n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; int l = cigar[k] >> BAM_CIGAR_SHIFT; -- if (op == BAM_CMATCH) { ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { if (c->pos > tpos) return y; if (x + l > tpos) { *_tpos = tpos; @@@ -111,10 -111,10 +112,9 @@@ static inline int est_indelreg(int pos int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, const void *rghash) { -- extern void ks_introsort_uint32_t(int, uint32_t*); int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, 
*score1, *score2, max_ref2; int N, K, l_run, ref_type, n_alt; -- char *inscns = 0, *ref2, *query; ++ char *inscns = 0, *ref2, *query, **ref_sample; khash_t(rg) *hash = (khash_t(rg)*)rghash; if (ref == 0 || bca == 0) return -1; // mark filtered reads @@@ -165,9 -165,9 +165,15 @@@ // squeeze out identical types for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; -- if (n_types == 1 || n_alt * MIN_SUPPORT_COEF < n_tot) { // no indels or too few supporting reads ++ if (n_types == 1 || (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support) { // then skip free(aux); return -1; } ++ if (n_types >= 64) { ++ free(aux); ++ if (bam_verbose >= 2) ++ fprintf(stderr, "[%s] excessive INDEL alleles at position %d. Skip the position.\n", __func__, pos + 1); ++ return -1; ++ } types = (int*)calloc(n_types, sizeof(int)); t = 0; types[t++] = aux[0] - MINUS_CONST; @@@ -178,7 -178,7 +184,6 @@@ for (t = 0; t < n_types; ++t) if (types[t] == 0) break; ref_type = t; // the index of the reference type (0) -- assert(n_types < 64); } { // calculate left and right boundary left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0; @@@ -189,6 -189,6 +194,58 @@@ if (ref[i] == 0) break; right = i; } ++ /* The following block fixes a long-existing flaw in the INDEL ++ * calling model: the interference of nearby SNPs. However, it also ++ * reduces the power because sometimes, substitutions caused by ++ * indels are not distinguishable from true mutations. Multiple ++ * sequence realignment helps to increase the power. 
++ */ ++ { // construct per-sample consensus ++ int L = right - left + 1, max_i, max2_i; ++ uint32_t *cns, max, max2; ++ char *ref0, *r; ++ ref_sample = calloc(n, sizeof(void*)); ++ cns = calloc(L, 4); ++ ref0 = calloc(L, 1); ++ for (i = 0; i < right - left; ++i) ++ ref0[i] = bam_nt16_table[(int)ref[i+left]]; ++ for (s = 0; s < n; ++s) { ++ r = ref_sample[s] = calloc(L, 1); ++ memset(cns, 0, sizeof(int) * L); ++ // collect ref and non-ref counts ++ for (i = 0; i < n_plp[s]; ++i) { ++ bam_pileup1_t *p = plp[s] + i; ++ bam1_t *b = p->b; ++ uint32_t *cigar = bam1_cigar(b); ++ uint8_t *seq = bam1_seq(b); ++ int x = b->core.pos, y = 0; ++ for (k = 0; k < b->core.n_cigar; ++k) { ++ int op = cigar[k]&0xf; ++ int j, l = cigar[k]>>4; ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { ++ for (j = 0; j < l; ++j) ++ if (x + j >= left && x + j < right) ++ cns[x+j-left] += (bam1_seqi(seq, y+j) == ref0[x+j-left])? 1 : 0x10000; ++ x += l; y += l; ++ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l; ++ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; ++ } ++ } ++ // determine the consensus ++ for (i = 0; i < right - left; ++i) r[i] = ref0[i]; ++ max = max2 = 0; max_i = max2_i = -1; ++ for (i = 0; i < right - left; ++i) { ++ if (cns[i]>>16 >= max>>16) max2 = max, max2_i = max_i, max = cns[i], max_i = i; ++ else if (cns[i]>>16 >= max2>>16) max2 = cns[i], max2_i = i; ++ } ++ if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) max_i = -1; ++ if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; ++ if (max_i >= 0) r[max_i] = 15; ++ if (max2_i >= 0) r[max2_i] = 15; ++// for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); ++ } ++ free(ref0); free(cns); ++ } { // the length of the homopolymer run around the current position int c = bam_nt16_table[(int)ref[pos + 1]]; if (c == 15) l_run = 1; @@@ -252,27 -252,27 +309,29 @@@ else ir = est_indelreg(pos, ref, -types[t], 0); if (ir > 
bca->indelreg) bca->indelreg = ir; // fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir); -- // write ref2 -- for (k = 0, j = left; j <= pos; ++j) -- ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; -- if (types[t] <= 0) j += -types[t]; -- else for (l = 0; l < types[t]; ++l) -- ref2[k++] = inscns[t*max_ins + l]; -- if (types[0] < 0) { // mask deleted sequences to avoid a particular error in the model. -- int jj, tmp = types[t] >= 0? -types[0] : -types[0] + types[t]; -- for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j) -- ref2[k++] = 4; -- } -- for (; j < right && ref[j]; ++j) -- ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; -- for (; k < max_ref2; ++k) ref2[k] = 4; -- if (j < right) right = j; -- // align each read to ref2 ++ // realignment for (s = K = 0; s < n; ++s) { ++ // write ref2 ++ for (k = 0, j = left; j <= pos; ++j) ++ ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]]; ++ if (types[t] <= 0) j += -types[t]; ++ else for (l = 0; l < types[t]; ++l) ++ ref2[k++] = inscns[t*max_ins + l]; ++ for (; j < right && ref[j]; ++j) ++ ref2[k++] = bam_nt16_nt4_table[(int)ref_sample[s][j-left]]; ++ for (; k < max_ref2; ++k) ref2[k] = 4; ++ if (j < right) right = j; ++ // align each read to ref2 for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; -- int qbeg, qend, tbeg, tend, sc; ++ int qbeg, qend, tbeg, tend, sc, kk; uint8_t *seq = bam1_seq(p->b); ++ uint32_t *cigar = bam1_cigar(p->b); ++ if (p->b->core.flag&4) continue; // unmapped reads ++ // FIXME: the following loop should be better moved outside; nonetheless, realignment should be much slower anyway. ++ for (kk = 0; kk < p->b->core.n_cigar; ++kk) ++ if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP) break; ++ if (kk < p->b->core.n_cigar) continue; // FIXME: the following skips soft clips, but using them may be more sensitive. 
// determine the start and end of sequences for alignment qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); @@@ -367,9 -367,9 +426,11 @@@ indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); // pick the smaller between indelQ1 and indelQ2 indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; -- p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; ++ if (indelQ > 255) indelQ = 255; ++ if (seqQ > 255) seqQ = 255; ++ p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; --// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ); ++// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@@ -407,6 -407,6 +468,8 @@@ } free(score1); free(score2); // free ++ for (i = 0; i < n; ++i) free(ref_sample[i]); ++ free(ref_sample); free(types); free(inscns); return n_alt > 0? 
0 : -1; } diff --cc sam/bam_aux.c index fbcd982,fbcd982..28b22e3 --- a/sam/bam_aux.c +++ b/sam/bam_aux.c @@@ -26,14 -26,14 +26,12 @@@ uint8_t *bam_aux_get_core(bam1_t *b, co } #define __skip_tag(s) do { \ -- int type = toupper(*(s)); \ -- ++(s); \ -- if (type == 'C' || type == 'A') ++(s); \ -- else if (type == 'S') (s) += 2; \ -- else if (type == 'I' || type == 'F') (s) += 4; \ -- else if (type == 'D') (s) += 8; \ -- else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ -- } while (0) ++ int type = toupper(*(s)); \ ++ ++(s); \ ++ if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ ++ else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ ++ else (s) += bam_aux_type2size(type); \ ++ } while(0) uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { @@@ -61,6 -61,6 +59,23 @@@ int bam_aux_del(bam1_t *b, uint8_t *s return 0; } ++int bam_aux_drop_other(bam1_t *b, uint8_t *s) ++{ ++ if (s) { ++ uint8_t *p, *aux; ++ aux = bam1_aux(b); ++ p = s - 2; ++ __skip_tag(s); ++ memmove(aux, p, s - p); ++ b->data_len -= b->l_aux - (s - p); ++ b->l_aux = s - p; ++ } else { ++ b->data_len -= b->l_aux; ++ b->l_aux = 0; ++ } ++ return 0; ++} ++ void bam_init_header_hash(bam_header_t *header) { if (header->hash == 0) { @@@ -89,47 -89,47 +104,56 @@@ int32_t bam_get_tid(const bam_header_t return k == kh_end(h)? 
-1 : kh_value(h, k); } --int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end) ++int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) { -- char *s, *p; -- int i, l, k; ++ char *s; ++ int i, l, k, name_end; khiter_t iter; khash_t(s) *h; bam_init_header_hash(header); h = (khash_t(s)*)header->hash; -- l = strlen(str); -- p = s = (char*)malloc(l+1); -- /* squeeze out "," */ -- for (i = k = 0; i != l; ++i) -- if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; -- s[k] = 0; -- for (i = 0; i != k; ++i) if (s[i] == ':') break; -- s[i] = 0; -- iter = kh_get(s, h, s); /* get the ref_id */ -- if (iter == kh_end(h)) { // name not found -- *ref_id = -1; free(s); -- return -1; -- } -- *ref_id = kh_value(h, iter); -- if (i == k) { /* dump the whole sequence */ -- *begin = 0; *end = 1<<29; free(s); -- return 0; -- } -- for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; -- *begin = atoi(p); -- if (i < k) { -- p = s + i + 1; -- *end = atoi(p); -- } else *end = 1<<29; -- if (*begin > 0) --*begin; ++ *ref_id = *beg = *end = -1; ++ name_end = l = strlen(str); ++ s = (char*)malloc(l+1); ++ // remove space ++ for (i = k = 0; i < l; ++i) ++ if (!isspace(str[i])) s[k++] = str[i]; ++ s[k] = 0; l = k; ++ // determine the sequence name ++ for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end ++ if (i >= 0) name_end = i; ++ if (name_end < l) { // check if this is really the end ++ int n_hyphen = 0; ++ for (i = name_end + 1; i < l; ++i) { ++ if (s[i] == '-') ++n_hyphen; ++ else if (!isdigit(s[i]) && s[i] != ',') break; ++ } ++ if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name ++ s[name_end] = 0; ++ iter = kh_get(s, h, s); ++ if (iter == kh_end(h)) { // cannot find the sequence name ++ iter = kh_get(s, h, str); // try str as the name ++ if (iter == kh_end(h)) { ++ if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the 
sequence name.\n", __func__); ++ free(s); return -1; ++ } else s[name_end] = ':', name_end = l; ++ } ++ } else iter = kh_get(s, h, str); ++ *ref_id = kh_val(h, iter); ++ // parse the interval ++ if (name_end < l) { ++ for (i = k = name_end + 1; i < l; ++i) ++ if (s[i] != ',') s[k++] = s[i]; ++ s[k] = 0; ++ *beg = atoi(s + name_end + 1); ++ for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; ++ *end = i < k? atoi(s + i + 1) : 1<<29; ++ if (*beg > 0) --*beg; ++ } else *beg = 0, *end = 1<<29; free(s); -- if (*begin > *end) { -- fprintf(stderr, "[bam_parse_region] invalid region.\n"); -- return -1; -- } -- return 0; ++ return *beg <= *end? 0 : -1; } int32_t bam_aux2i(const uint8_t *s) @@@ -180,3 -180,3 +204,10 @@@ char *bam_aux2Z(const uint8_t *s if (type == 'Z' || type == 'H') return (char*)s; else return 0; } ++ ++#ifdef _WIN32 ++double drand48() ++{ ++ return (double)rand() / RAND_MAX; ++} ++#endif diff --cc sam/bam_import.c index 9d84328,9d84328..5518a9c --- a/sam/bam_import.c +++ b/sam/bam_import.c @@@ -14,7 -14,7 +14,7 @@@ #include "kseq.h" #include "khash.h" --KSTREAM_INIT(gzFile, gzread, 8192) ++KSTREAM_INIT(gzFile, gzread, 16384) KHASH_MAP_INIT_STR(ref, uint64_t) void bam_init_header_hash(bam_header_t *header); @@@ -292,20 -292,20 +292,22 @@@ int sam_read1(tamFile fp, bam_header_t z += str->l + 1; if (str->s[0] != '*') { for (s = str->s; *s; ++s) { -- if (isalpha(*s)) ++c->n_cigar; ++ if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); } b->data = alloc_data(b, doff + c->n_cigar * 4); for (i = 0, s = str->s; i != c->n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); -- if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; ++ if (op == 'M') op = BAM_CMATCH; else if (op == 'I') op = BAM_CINS; else if (op == 'D') op = BAM_CDEL; else if (op == 'N') op = BAM_CREF_SKIP; else if (op == 'S') op = BAM_CSOFT_CLIP; else if (op == 'H') op = BAM_CHARD_CLIP; else if (op == 'P') op = 
BAM_CPAD; ++ else if (op == '=') op = BAM_CEQUAL; ++ else if (op == 'X') op = BAM_CDIFF; else parse_error(fp->n_lines, "invalid CIGAR operation"); s = t + 1; bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; @@@ -337,8 -337,8 +339,11 @@@ z += str->l + 1; if (strcmp(str->s, "*")) { c->l_qseq = strlen(str->s); -- if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) -- parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); ++ if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { ++ fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", ++ (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); ++ parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); ++ } p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; memset(p, 0, (c->l_qseq+1)/2); for (i = 0; i < c->l_qseq; ++i) @@@ -427,6 -427,6 +432,27 @@@ memcpy(s, str->s + 5, str->l - 5); s[str->l - 5] = 0; doff += size; ++ } else if (type == 'B') { ++ int32_t n = 0, Bsize, k = 0, size; ++ char *p; ++ if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B"); ++ Bsize = bam_aux_type2size(str->s[5]); // the size of each element ++ for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array ++ if (*p == ',') ++n; ++ p = str->s + 7; // now p points to the first number in the array ++ size = 6 + Bsize * n; // total number of bytes allocated to this tag ++ s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory ++ *s++ = 'B'; *s++ = str->s[5]; ++ memcpy(s, &n, 4); s += 4; // write the number of elements ++ if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p; ++ else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p; ++ else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory ++ 
else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p; ++ else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p; ++ else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p; ++ else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtod(p, &p), ++p; ++ else parse_error(fp->n_lines, "unrecognized array type"); ++ s += Bsize * n; doff += size; } else parse_error(fp->n_lines, "unrecognized type"); if (dret == '\n' || dret == '\r') break; } diff --cc sam/bam_index.c index 328f011,328f011..9610a26 --- a/sam/bam_index.c +++ b/sam/bam_index.c @@@ -172,19 -172,19 +172,23 @@@ bam_index_t *bam_index_core(bamFile fp save_bin = save_tid = last_tid = last_bin = 0xffffffffu; save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu; -- n_mapped = n_unmapped = n_no_coor = off_end = 0; ++ n_mapped = n_unmapped = n_no_coor = off_end = 0; off_beg = off_end = bam_tell(fp); while ((ret = bam_read1(fp, b)) >= 0) { if (c->tid < 0) ++n_no_coor; -- if (last_tid != c->tid) { // change of chromosomes ++ if (last_tid < c->tid || (last_tid >= 0 && c->tid < 0)) { // change of chromosomes last_tid = c->tid; last_bin = 0xffffffffu; -- } else if (last_coor > c->pos) { ++ } else if ((uint32_t)last_tid > (uint32_t)c->tid) { ++ fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n", ++ bam1_qname(b), last_tid+1, c->tid+1); ++ return NULL; ++ } else if ((int32_t)c->tid >= 0 && last_coor > c->pos) { fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n", bam1_qname(b), last_coor, c->pos, c->tid+1); -- exit(1); ++ return NULL; } -- if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off); ++ if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then 
possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); @@@ -203,7 -203,7 +207,7 @@@ if (bam_tell(fp) <= last_off) { fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", (unsigned long long)bam_tell(fp), (unsigned long long)last_off); -- exit(1); ++ return NULL; } if (c->flag & BAM_FUNMAP) ++n_unmapped; else ++n_mapped; @@@ -217,8 -217,8 +221,15 @@@ } merge_chunks(idx); fill_missing(idx); -- if (ret >= 0) -- while ((ret = bam_read1(fp, b)) >= 0) ++n_no_coor; ++ if (ret >= 0) { ++ while ((ret = bam_read1(fp, b)) >= 0) { ++ ++n_no_coor; ++ if (c->tid >= 0 && n_no_coor) { ++ fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n"); ++ return NULL; ++ } ++ } ++ } if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); free(b->data); free(b); idx->n_no_coor = n_no_coor; @@@ -466,6 -466,6 +477,10 @@@ int bam_index_build2(const char *fn, co } idx = bam_index_core(fp); bam_close(fp); ++ if(idx == 0) { ++ fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n"); ++ return -1; ++ } if (_fnidx == 0) { fnidx = (char*)calloc(strlen(fn) + 5, 1); strcpy(fnidx, fn); strcat(fnidx, ".bai"); diff --cc sam/bam_maqcns.c index 4fbc6c6,4fbc6c6..0000000 deleted file mode 100644,100644 --- a/sam/bam_maqcns.c +++ /dev/null @@@ -1,628 -1,628 +1,0 @@@ --#include --#include --#include "bam.h" --#include "bam_maqcns.h" --#include "ksort.h" --#include "errmod.h" --#include "kaln.h" --KSORT_INIT_GENERIC(uint32_t) -- --#define INDEL_WINDOW_SIZE 50 --#define INDEL_EXT_DEP 0.9 -- --typedef struct __bmc_aux_t { -- int max; -- uint32_t *info; -- uint16_t *info16; -- errmod_t *em; --} bmc_aux_t; -- --typedef struct { -- float esum[4], fsum[4]; -- uint32_t c[4]; --} glf_call_aux_t; -- --char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 
4, 4, 4, 4, 4, 4, 4 }; -- --/* -- P() = \theta \sum_{i=1}^{N-1} 1/i -- P(D|) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n1(1-k/N)^n_2] -- p_k = 1/k / \sum_{i=1}^{N-1} 1/i -- */ --static void cal_het(bam_maqcns_t *aa) --{ -- int k, n1, n2; -- double sum_harmo; // harmonic sum -- double poly_rate; -- -- free(aa->lhet); -- aa->lhet = (double*)calloc(256 * 256, sizeof(double)); -- sum_harmo = 0.0; -- for (k = 1; k <= aa->n_hap - 1; ++k) -- sum_harmo += 1.0 / k; -- for (n1 = 0; n1 < 256; ++n1) { -- for (n2 = 0; n2 < 256; ++n2) { -- long double sum = 0.0; -- double lC = aa->errmod == BAM_ERRMOD_SOAP? 0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); -- for (k = 1; k <= aa->n_hap - 1; ++k) { -- double pk = 1.0 / k / sum_harmo; -- double log1 = log((double)k/aa->n_hap); -- double log2 = log(1.0 - (double)k/aa->n_hap); -- sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2)); -- } -- aa->lhet[n1<<8|n2] = lC + logl(sum); -- } -- } -- poly_rate = aa->het_rate * sum_harmo; -- aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate)); --} -- --/** initialize the helper structure */ --static void cal_coef(bam_maqcns_t *aa) --{ -- int k, n, q; -- long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256]; -- double *lC; -- -- if (aa->errmod == BAM_ERRMOD_MAQ2) return; // no need to do the following -- // aa->lhet will be allocated and initialized -- free(aa->fk); free(aa->coef); -- aa->coef = 0; -- aa->fk = (double*)calloc(256, sizeof(double)); -- aa->fk[0] = fk2[0] = 1.0; -- for (n = 1; n != 256; ++n) { -- aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta; -- fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands -- } -- if (aa->errmod == BAM_ERRMOD_SOAP) return; -- aa->coef = (double*)calloc(256*256*64, sizeof(double)); -- lC = (double*)calloc(256 * 256, sizeof(double)); -- for (n = 1; n != 256; ++n) -- for (k = 1; k <= n; ++k) -- lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - 
lgamma(n-k+1); -- for (q = 1; q != 64; ++q) { -- double e = pow(10.0, -q/10.0); -- double le = log(e); -- double le1 = log(1.0-e); -- for (n = 1; n != 256; ++n) { -- double *coef = aa->coef + (q<<16|n<<8); -- sum_a[n+1] = 0.0; -- for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_k \epsilon^k (1-\epsilon)^{n-k} -- sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1); -- b[k] = sum_a[k+1] / sum_a[k]; -- if (b[k] > 0.99) b[k] = 0.99; -- } -- for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k}) -- q_c[k] = -4.343 * fk2[k] * logl(b[k] / e); -- for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i -- for (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9 -- tmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k]))); -- coef[k] = (k? q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk} -- } -- } -- } -- free(lC); --} -- --bam_maqcns_t *bam_maqcns_init() --{ -- bam_maqcns_t *bm; -- bm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t)); -- bm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t)); -- bm->het_rate = 0.001; -- bm->theta = 0.83f; -- bm->n_hap = 2; -- bm->eta = 0.03; -- bm->cap_mapQ = 60; -- bm->min_baseQ = 13; -- return bm; --} -- --void bam_maqcns_prepare(bam_maqcns_t *bm) --{ -- if (bm->errmod == BAM_ERRMOD_MAQ2) bm->aux->em = errmod_init(1. 
- bm->theta); -- cal_coef(bm); cal_het(bm); --} -- --void bam_maqcns_destroy(bam_maqcns_t *bm) --{ -- if (bm == 0) return; -- free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info); free(bm->aux->info16); -- if (bm->aux->em) errmod_destroy(bm->aux->em); -- free(bm->aux); free(bm); --} -- --glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm) --{ -- glf_call_aux_t *b = 0; -- int i, j, k, w[8], c, n; -- glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t)); -- float p[16], min_p = 1e30; -- uint64_t rms; -- -- g->ref_base = ref_base; -- if (_n == 0) return g; -- -- // construct aux array -- if (bm->aux->max < _n) { -- bm->aux->max = _n; -- kroundup32(bm->aux->max); -- bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max); -- bm->aux->info16 = (uint16_t*)realloc(bm->aux->info16, 2 * bm->aux->max); -- } -- for (i = n = 0, rms = 0; i < _n; ++i) { -- const bam_pileup1_t *p = pl + i; -- uint32_t q, x = 0, qq; -- uint16_t y = 0; -- if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; -- q = (uint32_t)bam1_qual(p->b)[p->qpos]; -- if (q < bm->min_baseQ) continue; -- x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; -- y |= bam1_strand(p->b)<<4; -- if (p->b->core.qual < q) q = p->b->core.qual; -- c = p->b->core.qual < bm->cap_mapQ? p->b->core.qual : bm->cap_mapQ; -- rms += c * c; -- x |= q << 24; -- y |= q << 5; -- qq = bam1_seqi(bam1_seq(p->b), p->qpos); -- q = bam_nt16_nt4_table[qq? 
qq : ref_base]; -- if (!p->is_del && !p->is_refskip && q < 4) x |= 1 << 21 | q << 16, y |= q; -- bm->aux->info16[n] = y; -- bm->aux->info[n++] = x; -- } -- rms = (uint8_t)(sqrt((double)rms / n) + .499); -- if (bm->errmod == BAM_ERRMOD_MAQ2) { -- errmod_cal(bm->aux->em, n, 4, bm->aux->info16, p); -- goto goto_glf; -- } -- ks_introsort(uint32_t, n, bm->aux->info); -- // generate esum and fsum -- b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t)); -- for (k = 0; k != 8; ++k) w[k] = 0; -- for (j = n - 1; j >= 0; --j) { // calculate esum and fsum -- uint32_t info = bm->aux->info[j]; -- if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); -- k = info>>16&7; -- if (info>>24 > 0) { -- b->esum[k&3] += bm->fk[w[k]] * (info>>24); -- b->fsum[k&3] += bm->fk[w[k]]; -- if (w[k] < 0xff) ++w[k]; -- ++b->c[k&3]; -- } -- } -- // rescale ->c[] -- for (j = c = 0; j != 4; ++j) c += b->c[j]; -- if (c > 255) { -- for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5); -- for (j = c = 0; j != 4; ++j) c += b->c[j]; -- } -- if (bm->errmod == BAM_ERRMOD_MAQ) { -- // generate likelihood -- for (j = 0; j != 4; ++j) { -- // homozygous -- float tmp1, tmp3; -- int tmp2, bar_e; -- for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) { -- if (j == k) continue; -- tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k]; -- } -- if (tmp2) { -- bar_e = (int)(tmp1 / tmp3 + 0.5); -- if (bar_e < 4) bar_e = 4; // should not happen -- if (bar_e > 63) bar_e = 63; -- p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2]; -- } else p[j<<2|j] = 0.0; // all the bases are j -- // heterozygous -- for (k = j + 1; k < 4; ++k) { -- for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) { -- if (i == j || i == k) continue; -- tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i]; -- } -- if (tmp2) { -- bar_e = (int)(tmp1 / tmp3 + 0.5); -- if (bar_e < 4) bar_e = 4; -- if (bar_e > 63) bar_e = 63; -- p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + 
bm->coef[bar_e<<16|c<<8|tmp2]; -- } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either j or k -- } -- // -- for (k = 0; k != 4; ++k) -- if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0; -- } -- -- { // fix p[k<<2|k] -- float max1, max2, min1, min2; -- int max_k, min_k; -- max_k = min_k = -1; -- max1 = max2 = -1.0; min1 = min2 = 1e30; -- for (k = 0; k < 4; ++k) { -- if (b->esum[k] > max1) { -- max2 = max1; max1 = b->esum[k]; max_k = k; -- } else if (b->esum[k] > max2) max2 = b->esum[k]; -- } -- for (k = 0; k < 4; ++k) { -- if (p[k<<2|k] < min1) { -- min2 = min1; min1 = p[k<<2|k]; min_k = k; -- } else if (p[k<<2|k] < min2) min2 = p[k<<2|k]; -- } -- if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2)) -- p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0; -- } -- } else if (bm->errmod == BAM_ERRMOD_SOAP) { // apply the SOAP model -- // generate likelihood -- for (j = 0; j != 4; ++j) { -- float tmp; -- // homozygous -- for (k = 0, tmp = 0.0; k != 4; ++k) -- if (j != k) tmp += b->esum[k]; -- p[j<<2|j] = tmp; -- // heterozygous -- for (k = j + 1; k < 4; ++k) { -- for (i = 0, tmp = 0.0; i != 4; ++i) -- if (i != j && i != k) tmp += b->esum[i]; -- p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp; -- } -- } -- } -- --goto_glf: -- // convert necessary information to glf1_t -- g->ref_base = ref_base; g->max_mapQ = rms; -- g->depth = n > 16777215? 16777215 : n; -- for (j = 0; j != 4; ++j) -- for (k = j; k < 4; ++k) -- if (p[j<<2|k] < min_p) min_p = p[j<<2|k]; -- g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5); -- for (j = c = 0; j != 4; ++j) -- for (k = j; k < 4; ++k) -- g->lk[c++] = p[j<<2|k]-min_p > 255.0? 255 : (int)(p[j<<2|k]-min_p + 0.5); -- -- free(b); -- return g; --} -- --uint32_t glf2cns(const glf1_t *g, int q_r) --{ -- int i, j, k, p[10], ref4; -- uint32_t x = 0; -- ref4 = bam_nt16_nt4_table[g->ref_base]; -- for (i = k = 0; i < 4; ++i) -- for (j = i; j < 4; ++j) { -- int prior = (i == ref4 && j == ref4? 
0 : i == ref4 || j == ref4? q_r : q_r + 3); -- p[k] = (g->lk[k] + prior)<<4 | i<<2 | j; -- ++k; -- } -- for (i = 1; i < 10; ++i) // insertion sort -- for (j = i; j > 0 && p[j] < p[j-1]; --j) -- k = p[j], p[j] = p[j-1], p[j-1] = k; -- x = (1u<<(p[0]&3) | 1u<<(p[0]>>2&3)) << 28; // the best genotype -- x |= (uint32_t)g->max_mapQ << 16; // rms mapQ -- x |= ((p[1]>>4) - (p[0]>>4) < 256? (p[1]>>4) - (p[0]>>4) : 255) << 8; // consensus Q -- for (k = 0; k < 10; ++k) -- if ((p[k]&0xf) == (ref4<<2|ref4)) break; -- if (k == 10) k = 9; -- x |= (p[k]>>4) - (p[0]>>4) < 256? (p[k]>>4) - (p[0]>>4) : 255; // snp Q -- return x; --} -- --uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) --{ -- glf1_t *g; -- uint32_t x; -- if (n) { -- g = bam_maqcns_glfgen(n, pl, 0xf, bm); -- x = g->depth == 0? (0xfU<<28 | 0xfU<<24) : glf2cns(g, (int)(bm->q_r + 0.5)); -- free(g); -- } else x = 0xfU<<28 | 0xfU<<24; -- return x; --} -- --/************** *****************/ -- --bam_maqindel_opt_t *bam_maqindel_opt_init() --{ -- bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); -- mi->q_indel = 40; -- mi->r_indel = 0.00015; -- mi->r_snp = 0.001; -- // -- mi->mm_penalty = 3; -- mi->indel_err = 4; -- mi->ambi_thres = 10; -- return mi; --} -- --void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) --{ -- if (mir == 0) return; -- free(mir->s[0]); free(mir->s[1]); free(mir); --} -- --int bam_tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos) --{ -- int k, x = c->pos, y = 0, last_y = 0; -- *_tpos = c->pos; -- for (k = 0; k < c->n_cigar; ++k) { -- int op = cigar[k] & BAM_CIGAR_MASK; -- int l = cigar[k] >> BAM_CIGAR_SHIFT; -- if (op == BAM_CMATCH) { -- if (c->pos > tpos) return y; -- if (x + l > tpos) { -- *_tpos = tpos; -- return y + (tpos - x); -- } -- x += l; y += l; -- last_y = y; -- } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; -- else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { -- if (x 
+ l > tpos) { -- *_tpos = is_left? x : x + l; -- return y; -- } -- x += l; -- } -- } -- *_tpos = x; -- return last_y; --} -- --#define MINUS_CONST 0x10000000 -- --bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, -- int _n_types, int *_types) --{ -- int i, j, n_types, *types, left, right, max_rd_len = 0; -- bam_maqindel_ret_t *ret = 0; -- // if there is no proposed indel, check if there is an indel from the alignment -- if (_n_types == 0) { -- for (i = 0; i < n; ++i) { -- const bam_pileup1_t *p = pl + i; -- if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break; -- } -- if (i == n) return 0; // no indel -- } -- { // calculate how many types of indels are available (set n_types and types) -- int m; -- uint32_t *aux; -- aux = (uint32_t*)calloc(n + _n_types + 1, 4); -- m = 0; -- aux[m++] = MINUS_CONST; // zero indel is always a type -- for (i = 0; i < n; ++i) { -- const bam_pileup1_t *p = pl + i; -- if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) -- aux[m++] = MINUS_CONST + p->indel; -- j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); -- if (j > max_rd_len) max_rd_len = j; -- } -- if (_n_types) // then also add this to aux[] -- for (i = 0; i < _n_types; ++i) -- if (_types[i]) aux[m++] = MINUS_CONST + _types[i]; -- ks_introsort(uint32_t, m, aux); -- // squeeze out identical types -- for (i = 1, n_types = 1; i < m; ++i) -- if (aux[i] != aux[i-1]) ++n_types; -- types = (int*)calloc(n_types, sizeof(int)); -- j = 0; -- types[j++] = aux[0] - MINUS_CONST; -- for (i = 1; i < m; ++i) { -- if (aux[i] != aux[i-1]) -- types[j++] = aux[i] - MINUS_CONST; -- } -- free(aux); -- } -- { // calculate left and right boundary -- left = pos > INDEL_WINDOW_SIZE? 
pos - INDEL_WINDOW_SIZE : 0; -- right = pos + INDEL_WINDOW_SIZE; -- if (types[0] < 0) right -= types[0]; -- // in case the alignments stand out the reference -- for (i = pos; i < right; ++i) -- if (ref[i] == 0) break; -- right = i; -- } -- { // the core part -- char *ref2, *rs, *inscns = 0; -- int qr_snp, k, l, *score, *pscore, max_ins = types[n_types-1]; -- qr_snp = (int)(-4.343 * log(mi->r_snp) + .499); -- if (max_ins > 0) { // get the consensus of inserted sequences -- int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); -- // count occurrences -- for (i = 0; i < n_types; ++i) { -- if (types[i] <= 0) continue; // not insertion -- for (j = 0; j < n; ++j) { -- const bam_pileup1_t *p = pl + j; -- if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) { -- for (k = 1; k <= p->indel; ++k) { -- int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)]; -- if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c]; -- } -- } -- } -- } -- // construct the consensus of inserted sequence -- inscns = (char*)calloc(n_types * max_ins, sizeof(char)); -- for (i = 0; i < n_types; ++i) { -- for (j = 0; j < types[i]; ++j) { -- int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4; -- for (k = 0; k < 4; ++k) { -- if (ia[k] > max) { -- max = ia[k]; -- max_k = k; -- } -- } -- inscns[i*max_ins + j] = max? 1<= 0? 
-types[0] : -types[0] + types[i]; -- for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j) -- ref2[k++] = 4; -- } -- for (; j < right && ref[j]; ++j) -- ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; -- if (j < right) right = j; -- // calculate score for each read -- for (j = 0; j < n; ++j) { -- const bam_pileup1_t *p = pl + j; -- int qbeg, qend, tbeg, tend; -- if (p->b->core.flag & BAM_FUNMAP) continue; -- qbeg = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg); -- qend = bam_tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend); -- assert(tbeg >= left); -- for (l = qbeg; l < qend; ++l) -- rs[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), l)]; -- { -- int x, y, n_acigar, ps; -- uint32_t *acigar; -- ps = 0; -- if (tend - tbeg + types[i] <= 0) { -- score[i*n+j] = -(1<<20); -- pscore[i*n+j] = 1<<20; -- continue; -- } -- acigar = ka_global_core((uint8_t*)ref2 + tbeg - left, tend - tbeg + types[i], (uint8_t*)rs, qend - qbeg, &ap, &score[i*n+j], &n_acigar); -- x = tbeg - left; y = 0; -- for (l = 0; l < n_acigar; ++l) { -- int op = acigar[l]&0xf; -- int len = acigar[l]>>4; -- if (op == BAM_CMATCH) { -- int k; -- for (k = 0; k < len; ++k) -- if (ref2[x+k] != rs[y+k] && ref2[x+k] < 4) -- ps += bam1_qual(p->b)[y+k] < qr_snp? 
bam1_qual(p->b)[y+k] : qr_snp; -- x += len; y += len; -- } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { -- if (op == BAM_CINS && l > 0 && l < n_acigar - 1) ps += mi->q_indel * len; -- y += len; -- } else if (op == BAM_CDEL) { -- if (l > 0 && l < n_acigar - 1) ps += mi->q_indel * len; -- x += len; -- } -- } -- pscore[i*n+j] = ps; -- /*if (1) { // for debugging only -- fprintf(stderr, "id=%d, pos=%d, type=%d, j=%d, score=%d, psore=%d, %d, %d, %d, %d, %d, ", -- j, pos+1, types[i], j, score[i*n+j], pscore[i*n+j], tbeg, tend, qbeg, qend, mi->q_indel); -- for (l = 0; l < n_acigar; ++l) fprintf(stderr, "%d%c", acigar[l]>>4, "MIDS"[acigar[l]&0xf]); -- fprintf(stderr, "\n"); -- for (l = 0; l < tend - tbeg + types[i]; ++l) fputc("ACGTN"[ref2[l+tbeg-left]], stderr); -- fputc('\n', stderr); -- for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[rs[l]], stderr); -- fputc('\n', stderr); -- }*/ -- free(acigar); -- } -- } -- } -- { // get final result -- int *sum, max1, max2, max1_i, max2_i; -- // pick up the best two score -- sum = (int*)calloc(n_types, sizeof(int)); -- for (i = 0; i < n_types; ++i) -- for (j = 0; j < n; ++j) -- sum[i] += -pscore[i*n+j]; -- max1 = max2 = -0x7fffffff; max1_i = max2_i = -1; -- for (i = 0; i < n_types; ++i) { -- if (sum[i] > max1) { -- max2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i; -- } else if (sum[i] > max2) { -- max2 = sum[i]; max2_i = i; -- } -- } -- free(sum); -- // write ret -- ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t)); -- ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; -- ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1); -- ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1); -- // write indel sequence -- if (ret->indel1 > 0) { -- ret->s[0][0] = '+'; -- for (k = 0; k < ret->indel1; ++k) -- ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; -- } else if (ret->indel1 < 0) { -- ret->s[0][0] = '-'; -- for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) -- ret->s[0][k+1] = ref[pos 
+ k + 1]; -- } else ret->s[0][0] = '*'; -- if (ret->indel2 > 0) { -- ret->s[1][0] = '+'; -- for (k = 0; k < ret->indel2; ++k) -- ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; -- } else if (ret->indel2 < 0) { -- ret->s[1][0] = '-'; -- for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k) -- ret->s[1][k+1] = ref[pos + k + 1]; -- } else ret->s[1][0] = '*'; -- // write count -- for (i = 0; i < n; ++i) { -- const bam_pileup1_t *p = pl + i; -- if (p->indel == ret->indel1) ++ret->cnt1; -- else if (p->indel == ret->indel2) ++ret->cnt2; -- else ++ret->cnt_anti; -- } -- { // write gl[] -- int tmp, seq_err = 0; -- double x = 1.0; -- tmp = max1_i - max2_i; -- if (tmp < 0) tmp = -tmp; -- for (j = 0; j < tmp + 1; ++j) x *= INDEL_EXT_DEP; -- seq_err = mi->q_indel * (1.0 - x) / (1.0 - INDEL_EXT_DEP); -- ret->gl[0] = ret->gl[1] = 0; -- for (j = 0; j < n; ++j) { -- int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; -- //fprintf(stderr, "id=%d, %d, %d, %d, %d, %d\n", j, pl[j].b->core.pos+1, types[max1_i], types[max2_i], s1, s2); -- if (s1 > s2) ret->gl[0] += s1 - s2 < seq_err? s1 - s2 : seq_err; -- else ret->gl[1] += s2 - s1 < seq_err? s2 - s1 : seq_err; -- } -- } -- // write cnt_ref and cnt_ambi -- if (max1_i != 0 && max2_i != 0) { -- for (j = 0; j < n; ++j) { -- int diff1 = score[j] - score[max1_i * n + j]; -- int diff2 = score[j] - score[max2_i * n + j]; -- if (diff1 > 0 && diff2 > 0) ++ret->cnt_ref; -- else if (diff1 == 0 || diff2 == 0) ++ret->cnt_ambi; -- } -- } -- } -- free(score); free(pscore); free(ref2); free(rs); free(inscns); -- } -- { // call genotype -- int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5); -- int min1, min2, min1_i; -- q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; -- q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel; -- q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 
1 : 1) * qr_indel; -- min1 = min2 = 0x7fffffff; min1_i = -1; -- for (i = 0; i < 3; ++i) { -- if (q[i] < min1) { -- min2 = min1; min1 = q[i]; min1_i = i; -- } else if (q[i] < min2) min2 = q[i]; -- } -- ret->gt = min1_i; -- ret->q_cns = min2 - min1; -- // set q_ref -- if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3; -- else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2]; -- if (ret->q_ref < 0) ret->q_ref = 0; -- } -- free(types); -- return ret; --} diff --cc sam/bam_maqcns.h index 291ae53,291ae53..0000000 deleted file mode 100644,100644 --- a/sam/bam_maqcns.h +++ /dev/null @@@ -1,61 -1,61 +1,0 @@@ --#ifndef BAM_MAQCNS_H --#define BAM_MAQCNS_H -- --#include "glf.h" -- --#define BAM_ERRMOD_MAQ2 0 --#define BAM_ERRMOD_MAQ 1 --#define BAM_ERRMOD_SOAP 2 -- --struct __bmc_aux_t; -- --typedef struct { -- float het_rate, theta; -- int n_hap, cap_mapQ, errmod, min_baseQ; -- -- float eta, q_r; -- double *fk, *coef; -- double *lhet; -- struct __bmc_aux_t *aux; --} bam_maqcns_t; -- --typedef struct { -- int q_indel; // indel sequencing error, phred scaled -- float r_indel; // indel prior -- float r_snp; // snp prior -- // hidden parameters, unchangeable from command line -- int mm_penalty, indel_err, ambi_thres; --} bam_maqindel_opt_t; -- --typedef struct { -- int indel1, indel2; -- int cnt1, cnt2, cnt_anti; -- int cnt_ref, cnt_ambi; -- char *s[2]; -- // -- int gt, gl[2]; -- int q_cns, q_ref; --} bam_maqindel_ret_t; -- --#ifdef __cplusplus --extern "C" { --#endif -- -- bam_maqcns_t *bam_maqcns_init(); -- void bam_maqcns_prepare(bam_maqcns_t *bm); -- void bam_maqcns_destroy(bam_maqcns_t *bm); -- glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); -- uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); -- // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2 -- uint32_t glf2cns(const glf1_t *g, int q_r); -- -- bam_maqindel_opt_t 
*bam_maqindel_opt_init(); -- bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref, -- int _n_types, int *_types); -- void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); -- --#ifdef __cplusplus --} --#endif -- --#endif diff --cc sam/bam_md.c index 44d46a4,44d46a4..d42aa8f --- a/sam/bam_md.c +++ b/sam/bam_md.c @@@ -9,38 -9,38 +9,46 @@@ #include "kaln.h" #include "kprobaln.h" --void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) ++#define USE_EQUAL 1 ++#define DROP_TAG 2 ++#define BIN_QUAL 4 ++#define UPDATE_NM 8 ++#define UPDATE_MD 16 ++#define HASH_QNM 32 ++ ++char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; ++ ++int bam_aux_drop_other(bam1_t *b, uint8_t *s); ++ ++void bam_fillmd1_core(bam1_t *b, char *ref, int flag, int max_nm) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; int i, x, y, u = 0; kstring_t *str; -- uint8_t *old_md, *old_nm; int32_t old_nm_i = -1, nm = 0; str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; -- if (op == BAM_CMATCH) { ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; if (ref[x+j] == 0) break; // out of boundary if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match -- if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f; ++ if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 
0xf0 : 0x0f; ++u; } else { -- ksprintf(str, "%d", u); -- kputc(ref[x+j], str); ++ kputw(u, str); kputc(ref[x+j], str); u = 0; ++nm; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL) { -- ksprintf(str, "%d", u); -- kputc('^', str); ++ kputw(u, str); kputc('^', str); for (j = 0; j < l; ++j) { if (ref[x+j] == 0) break; kputc(ref[x+j], str); @@@ -55,12 -55,12 +63,12 @@@ x += l; } } -- ksprintf(str, "%d", u); ++ kputw(u, str); // apply max_nm if (max_nm > 0 && nm >= max_nm) { for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; -- if (op == BAM_CMATCH) { ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; @@@ -77,38 -77,38 +85,54 @@@ } } // update NM -- old_nm = bam_aux_get(b, "NM"); -- if (c->flag & BAM_FUNMAP) return; -- if (old_nm) old_nm_i = bam_aux2i(old_nm); -- if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); -- else if (nm != old_nm_i) { -- fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); -- bam_aux_del(b, old_nm); -- bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); ++ if (flag & UPDATE_NM) { ++ uint8_t *old_nm = bam_aux_get(b, "NM"); ++ if (c->flag & BAM_FUNMAP) return; ++ if (old_nm) old_nm_i = bam_aux2i(old_nm); ++ if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); ++ else if (nm != old_nm_i) { ++ fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); ++ bam_aux_del(b, old_nm); ++ bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); ++ } } // update MD -- old_md = bam_aux_get(b, "MD"); -- if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); -- else { -- int is_diff = 0; -- if (strlen((char*)old_md+1) == str->l) { -- for (i = 0; i < str->l; ++i) -- if (toupper(old_md[i+1]) != toupper(str->s[i])) -- break; -- if (i < str->l) is_diff = 1; -- } else is_diff = 
1; -- if (is_diff) { -- fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); -- bam_aux_del(b, old_md); -- bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); ++ if (flag & UPDATE_MD) { ++ uint8_t *old_md = bam_aux_get(b, "MD"); ++ if (c->flag & BAM_FUNMAP) return; ++ if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); ++ else { ++ int is_diff = 0; ++ if (strlen((char*)old_md+1) == str->l) { ++ for (i = 0; i < str->l; ++i) ++ if (toupper(old_md[i+1]) != toupper(str->s[i])) ++ break; ++ if (i < str->l) is_diff = 1; ++ } else is_diff = 1; ++ if (is_diff) { ++ fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam1_qname(b), old_md+1, str->s); ++ bam_aux_del(b, old_md); ++ bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); ++ } } } ++ // drop all tags but RG ++ if (flag&DROP_TAG) { ++ uint8_t *q = bam_aux_get(b, "RG"); ++ bam_aux_drop_other(b, q); ++ } ++ // reduce the resolution of base quality ++ if (flag&BIN_QUAL) { ++ uint8_t *qual = bam1_qual(b); ++ for (i = 0; i < b->core.l_qseq; ++i) ++ if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7; ++ } free(str->s); free(str); } --void bam_fillmd1(bam1_t *b, char *ref, int is_equal) ++void bam_fillmd1(bam1_t *b, char *ref, int flag) { -- bam_fillmd1_core(b, ref, is_equal, 0); ++ bam_fillmd1_core(b, ref, flag, 0); } int bam_cap_mapQ(bam1_t *b, char *ref, int thres) @@@ -122,7 -122,7 +146,7 @@@ mm = q = len = clip_l = clip_q = 0; for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; -- if (op == BAM_CMATCH) { ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; @@@ -162,14 -162,14 +186,14 @@@ return (int)(t + .499); } --int bam_prob_realn_core(bam1_t *b, const char *ref, int apply_baq) ++int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) { -- 
int k, i, bw, x, y, yb, ye, xb, xe; ++ int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1; uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; kpa_par_t conf = kpa_par_def; uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b); -- if (c->flag & BAM_FUNMAP) return -1; // do nothing ++ if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0) return -1; // do nothing // test if BQ or ZQ is present if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; @@@ -195,7 -195,7 +219,7 @@@ for (k = 0; k < c->n_cigar; ++k) { int op, l; op = cigar[k]&0xf; l = cigar[k]>>4; -- if (op == BAM_CMATCH) { ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { if (yb < 0) yb = y; if (xb < 0) xb = x; ye = y + l; xe = x + l; @@@ -221,23 -221,23 +245,47 @@@ s = calloc(c->l_qseq, 1); for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)]; r = calloc(xe - xb, 1); -- for (i = xb; i < xe; ++i) ++ for (i = xb; i < xe; ++i) { ++ if (ref[i] == 0) { xe = i; break; } r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]]; ++ } state = calloc(c->l_qseq, sizeof(int)); q = calloc(c->l_qseq, 1); kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); -- for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { -- int op = cigar[k]&0xf, l = cigar[k]>>4; -- if (op == BAM_CMATCH) { -- for (i = y; i < y + l; ++i) { -- if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; -- else bq[i] = bq[i] < q[i]? 
bq[i] : q[i]; -- } -- x += l; y += l; -- } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; -- else if (op == BAM_CDEL) x += l; ++ if (!extend_baq) { // in this block, bq[] is capped by base quality qual[] ++ for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { ++ int op = cigar[k]&0xf, l = cigar[k]>>4; ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { ++ for (i = y; i < y + l; ++i) { ++ if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; ++ else bq[i] = bq[i] < q[i]? bq[i] : q[i]; ++ } ++ x += l; y += l; ++ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; ++ else if (op == BAM_CDEL) x += l; ++ } ++ for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ ++ } else { // in this block, bq[] is BAQ that can be larger than qual[] (different from the above!) ++ uint8_t *left, *rght; ++ left = calloc(c->l_qseq, 1); rght = calloc(c->l_qseq, 1); ++ for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { ++ int op = cigar[k]&0xf, l = cigar[k]>>4; ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { ++ for (i = y; i < y + l; ++i) ++ bq[i] = ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y))? 0 : q[i]; ++ for (left[y] = bq[y], i = y + 1; i < y + l; ++i) ++ left[i] = bq[i] > left[i-1]? bq[i] : left[i-1]; ++ for (rght[y+l-1] = bq[y+l-1], i = y + l - 2; i >= y; --i) ++ rght[i] = bq[i] > rght[i+1]? bq[i] : rght[i+1]; ++ for (i = y; i < y + l; ++i) ++ bq[i] = left[i] < rght[i]? left[i] : rght[i]; ++ x += l; y += l; ++ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; ++ else if (op == BAM_CDEL) x += l; ++ } ++ for (i = 0; i < c->l_qseq; ++i) bq[i] = 64 + (qual[i] <= bq[i]? 
0 : qual[i] - bq[i]); // finalize BQ ++ free(left); free(rght); } -- for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ if (apply_baq) { for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq); @@@ -254,25 -254,25 +302,31 @@@ int bam_prob_realn(bam1_t *b, const cha int bam_fillmd(int argc, char *argv[]) { -- int c, is_equal, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, apply_baq; ++ int c, flt_flag, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, baq_flag; samfile_t *fp, *fpout = 0; faidx_t *fai; char *ref = 0, mode_w[8], mode_r[8]; bam1_t *b; -- is_equal = is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = apply_baq = capQ = 0; ++ flt_flag = UPDATE_NM | UPDATE_MD; ++ is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; mode_w[0] = mode_r[0] = 0; strcpy(mode_r, "r"); strcpy(mode_w, "w"); -- while ((c = getopt(argc, argv, "reubSC:n:A")) >= 0) { ++ while ((c = getopt(argc, argv, "EqreuNhbSC:n:Ad")) >= 0) { switch (c) { case 'r': is_realn = 1; break; -- case 'e': is_equal = 1; break; ++ case 'e': flt_flag |= USE_EQUAL; break; ++ case 'd': flt_flag |= DROP_TAG; break; ++ case 'q': flt_flag |= BIN_QUAL; break; ++ case 'h': flt_flag |= HASH_QNM; break; ++ case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': is_sam_in = 1; break; case 'n': max_nm = atoi(optarg); break; case 'C': capQ = atoi(optarg); break; -- case 'A': apply_baq = 1; break; ++ case 'A': baq_flag |= 1; break; ++ case 'E': baq_flag |= 2; break; default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; } } @@@ -288,7 -288,7 +342,8 @@@ fprintf(stderr, " -b compressed BAM output\n"); fprintf(stderr, " -S the input is SAM with header\n"); fprintf(stderr, " -A modify the quality string\n"); -- 
fprintf(stderr, " -r read-independent local realignment\n\n"); ++ fprintf(stderr, " -r compute the BQ tag (without -A) or cap baseQ by BAQ (with -A)\n"); ++ fprintf(stderr, " -E extended BAQ for better sensitivity but lower specificity\n\n"); return 1; } fp = samopen(argv[optind], mode_r, 0); @@@ -311,12 -311,12 +366,12 @@@ fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", fp->header->target_name[tid]); } -- if (is_realn) bam_prob_realn_core(b, ref, apply_baq); ++ if (is_realn) bam_prob_realn_core(b, ref, baq_flag); if (capQ > 10) { int q = bam_cap_mapQ(b, ref, capQ); if (b->core.qual > q) b->core.qual = q; } -- if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); ++ if (ref) bam_fillmd1_core(b, ref, flt_flag, max_nm); } samwrite(fpout, b); } diff --cc sam/bam_pileup.c index 3e26f74,3e26f74..57434e0 --- a/sam/bam_pileup.c +++ b/sam/bam_pileup.c @@@ -78,12 -78,12 +78,12 @@@ static inline int resolve_cigar2(bam_pi if (s->k == -1) { // never processed is_head = 1; if (c->n_cigar == 1) { // just one operation, save a loop -- if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0; ++ if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; } else { // find the first match or deletion for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { int op = _cop(cigar[k]); int l = _cln(cigar[k]); -- if (op == BAM_CMATCH || op == BAM_CDEL) break; ++ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CREF_SKIP) s->x += l; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } @@@ -95,16 -95,16 +95,16 @@@ if (pos - s->x >= l) { // jump to the next operation assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case op = _cop(cigar[s->k+1]); -- if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) { // jump to the next without a loop -- if (_cop(cigar[s->k]) == 
BAM_CMATCH) s->y += l; ++ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop ++ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; s->x += l; ++s->k; -- } else { // find the next M/D/N -- if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; ++ } else { // find the next M/D/N/=/X ++ if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; s->x += l; for (k = s->k + 1; k < c->n_cigar; ++k) { op = _cop(cigar[k]), l = _cln(cigar[k]); -- if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) break; ++ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } s->k = k; @@@ -126,12 -126,12 +126,12 @@@ for (k = s->k + 2; k < c->n_cigar; ++k) { op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); if (op2 == BAM_CINS) l3 += l2; -- else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP) break; ++ else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; } if (l3 > 0) p->indel = l3; } } -- if (op == BAM_CMATCH) { ++ if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { p->qpos = s->y + (pos - s->x); } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! 
diff --cc sam/bam_plcmd.c index 002297a,002297a..cbf6ae8 --- a/sam/bam_plcmd.c +++ b/sam/bam_plcmd.c @@@ -6,93 -6,93 +6,8 @@@ #include #include "sam.h" #include "faidx.h" --#include "bam_maqcns.h" --#include "khash.h" --#include "glf.h" #include "kstring.h" --typedef int *indel_list_t; --KHASH_MAP_INIT_INT64(64, indel_list_t) -- --#define BAM_PLF_SIMPLE 0x01 --#define BAM_PLF_CNS 0x02 --#define BAM_PLF_INDEL_ONLY 0x04 --#define BAM_PLF_GLF 0x08 --#define BAM_PLF_VAR_ONLY 0x10 --#define BAM_PLF_2ND 0x20 --#define BAM_PLF_RANBASE 0x40 --#define BAM_PLF_1STBASE 0x80 --#define BAM_PLF_ALLBASE 0x100 --#define BAM_PLF_READPOS 0x200 --#define BAM_PLF_NOBAQ 0x400 -- --typedef struct { -- bam_header_t *h; -- bam_maqcns_t *c; -- bam_maqindel_opt_t *ido; -- faidx_t *fai; -- khash_t(64) *hash; -- uint32_t format; -- int tid, len, last_pos; -- int mask; -- int capQ_thres, min_baseQ; -- int max_depth; // for indel calling, ignore reads with the depth too high. 0 for unlimited -- char *ref; -- glfFile fp_glf; // for glf output only --} pu_data_t; -- --char **__bam_get_lines(const char *fn, int *_n); --void bam_init_header_hash(bam_header_t *header); --int32_t bam_get_tid(const bam_header_t *header, const char *seq_name); -- --static khash_t(64) *load_pos(const char *fn, bam_header_t *h) --{ -- char **list; -- int i, j, n, *fields, max_fields; -- khash_t(64) *hash; -- bam_init_header_hash(h); -- list = __bam_get_lines(fn, &n); -- hash = kh_init(64); -- max_fields = 0; fields = 0; -- for (i = 0; i < n; ++i) { -- char *str = list[i]; -- int chr, n_fields, ret; -- khint_t k; -- uint64_t x; -- n_fields = ksplit_core(str, 0, &max_fields, &fields); -- if (n_fields < 2) continue; -- chr = bam_get_tid(h, str + fields[0]); -- if (chr < 0) { -- fprintf(stderr, "[load_pos] unknown reference sequence name: %s\n", str + fields[0]); -- continue; -- } -- x = (uint64_t)chr << 32 | (atoi(str + fields[1]) - 1); -- k = kh_put(64, hash, x, &ret); -- if (ret == 0) { -- fprintf(stderr, "[load_pos] 
position %s:%s has been loaded.\n", str+fields[0], str+fields[1]); -- continue; -- } -- kh_val(hash, k) = 0; -- if (n_fields > 2) { -- // count -- for (j = 2; j < n_fields; ++j) { -- char *s = str + fields[j]; -- if ((*s != '+' && *s != '-') || !isdigit(s[1])) break; -- } -- if (j > 2) { // update kh_val() -- int *q, y, z; -- q = kh_val(hash, k) = (int*)calloc(j - 1, sizeof(int)); -- q[0] = j - 2; z = j; y = 1; -- for (j = 2; j < z; ++j) -- q[y++] = atoi(str + fields[j]); -- } -- } -- free(str); -- } -- free(list); free(fields); -- return hash; --} -- static inline int printw(int c, FILE *fp) { char buf[16]; @@@ -108,75 -108,75 +23,6 @@@ return 0; } --// an analogy to pileup_func() below --static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) --{ -- pu_data_t *d = (pu_data_t*)data; -- bam_maqindel_ret_t *r = 0; -- int rb, *proposed_indels = 0; -- glf1_t *g; -- glf3_t *g3; -- -- if (d->fai == 0) { -- fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n"); -- exit(1); -- } -- if (d->hash) { // only output a list of sites -- khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); -- if (k == kh_end(d->hash)) return 0; -- proposed_indels = kh_val(d->hash, k); -- } -- g3 = glf3_init1(); -- if (d->fai && (int)tid != d->tid) { -- if (d->ref) { // then write the end mark -- g3->rtype = GLF3_RTYPE_END; -- glf3_write1(d->fp_glf, g3); -- } -- glf3_ref_write(d->fp_glf, d->h->target_name[tid], d->h->target_len[tid]); // write reference -- free(d->ref); -- d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); -- d->tid = tid; -- d->last_pos = 0; -- } -- rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; -- g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); -- memcpy(g3, g, sizeof(glf1_t)); -- g3->rtype = GLF3_RTYPE_SUB; -- g3->offset = pos - d->last_pos; -- d->last_pos = pos; -- glf3_write1(d->fp_glf, g3); -- if (pos < d->len) { -- int m = (!d->max_depth || d->max_depth>n) ? 
n : d->max_depth; -- if (proposed_indels) -- r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); -- else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0); -- } -- if (r) { // then write indel line -- int het = 3 * n, min; -- min = het; -- if (min > r->gl[0]) min = r->gl[0]; -- if (min > r->gl[1]) min = r->gl[1]; -- g3->ref_base = 0; -- g3->rtype = GLF3_RTYPE_INDEL; -- memset(g3->lk, 0, 10); -- g3->lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255; -- g3->lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255; -- g3->lk[2] = het - min < 255? het - min : 255; -- g3->offset = 0; -- g3->indel_len[0] = r->indel1; -- g3->indel_len[1] = r->indel2; -- g3->min_lk = min < 255? min : 255; -- g3->max_len = (abs(r->indel1) > abs(r->indel2)? abs(r->indel1) : abs(r->indel2)) + 1; -- g3->indel_seq[0] = strdup(r->s[0]+1); -- g3->indel_seq[1] = strdup(r->s[1]+1); -- glf3_write1(d->fp_glf, g3); -- bam_maqindel_ret_destroy(r); -- } -- free(g); -- glf3_destroy1(g3); -- return 0; --} -- static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) { int j; @@@ -212,316 -212,316 +58,6 @@@ if (p->is_tail) putchar('$'); } --static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) --{ -- pu_data_t *d = (pu_data_t*)data; -- bam_maqindel_ret_t *r = 0; -- int i, rb, rms_mapq = -1, *proposed_indels = 0; -- uint64_t rms_aux; -- uint32_t cns = 0; -- -- // if GLF is required, suppress -c completely -- if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data); -- // if d->hash is initialized, only output the sites in the hash table -- if (d->hash) { -- khint_t k = kh_get(64, d->hash, (uint64_t)tid<<32|pos); -- if (k == kh_end(d->hash)) return 0; -- proposed_indels = kh_val(d->hash, k); -- } -- // update d->ref if necessary -- if (d->fai && (int)tid != d->tid) { -- free(d->ref); -- d->ref = faidx_fetch_seq(d->fai, d->h->target_name[tid], 0, 0x7fffffff, &d->len); -- d->tid = tid; -- } -- rb 
= (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; -- // when the indel-only mode is asked for, return if no reads mapped with indels -- if (d->format & BAM_PLF_INDEL_ONLY) { -- for (i = 0; i < n; ++i) -- if (pu[i].indel != 0) break; -- if (i == n) return 0; -- } -- // call the consensus and indel -- if (d->format & BAM_PLF_CNS) { // call consensus -- if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE)) { // use a random base or the 1st base as the consensus call -- const bam_pileup1_t *p = (d->format & BAM_PLF_1STBASE)? pu : pu + (int)(drand48() * n); -- int q = bam1_qual(p->b)[p->qpos]; -- int mapQ = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; -- uint32_t b = bam1_seqi(bam1_seq(p->b), p->qpos); -- cns = b<<28 | 0xf<<24 | mapQ<<16 | q<<8; -- } else if (d->format & BAM_PLF_ALLBASE) { // collapse all bases -- uint64_t rmsQ = 0; -- uint32_t b = 0; -- for (i = 0; i < n; ++i) { -- const bam_pileup1_t *p = pu + i; -- int q = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; -- b |= bam1_seqi(bam1_seq(p->b), p->qpos); -- rmsQ += q * q; -- } -- rmsQ = (uint64_t)(sqrt((double)rmsQ / n) + .499); -- cns = b<<28 | 0xf<<24 | rmsQ<<16 | 60<<8; -- } else { -- glf1_t *g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); -- cns = g->depth == 0? (0xfu<<28 | 0xf<<24) : glf2cns(g, (int)(d->c->q_r + .499)); -- free(g); -- } -- } -- if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels -- int m = (!d->max_depth || d->max_depth>n) ? 
n : d->max_depth; -- if (proposed_indels) // the first element gives the size of the array -- r = bam_maqindel(m, pos, d->ido, pu, d->ref, proposed_indels[0], proposed_indels+1); -- else r = bam_maqindel(m, pos, d->ido, pu, d->ref, 0, 0); -- } -- // when only variant sites are asked for, test if the site is a variant -- if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) { -- if (!(bam_nt16_table[rb] != 15 && cns>>28 != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP -- if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel -- if (r) bam_maqindel_ret_destroy(r); -- return 0; -- } -- } -- } -- // print the first 3 columns -- fputs(d->h->target_name[tid], stdout); putchar('\t'); -- printw(pos+1, stdout); putchar('\t'); putchar(rb); putchar('\t'); -- // print consensus information if required -- if (d->format & BAM_PLF_CNS) { -- putchar(bam_nt16_rev_table[cns>>28]); putchar('\t'); -- printw(cns>>8&0xff, stdout); putchar('\t'); -- printw(cns&0xff, stdout); putchar('\t'); -- printw(cns>>16&0xff, stdout); putchar('\t'); -- } -- // print pileup sequences -- printw(n, stdout); putchar('\t'); -- for (i = 0; i < n; ++i) -- pileup_seq(pu + i, pos, d->len, d->ref); -- // finalize rms_mapq -- if (d->format & BAM_PLF_CNS) { -- for (i = rms_aux = 0; i < n; ++i) { -- const bam_pileup1_t *p = pu + i; -- int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; -- rms_aux += tmp * tmp; -- } -- rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499); -- if (rms_mapq < 0) rms_mapq = rms_aux; -- } -- putchar('\t'); -- // print quality -- for (i = 0; i < n; ++i) { -- const bam_pileup1_t *p = pu + i; -- int c = bam1_qual(p->b)[p->qpos] + 33; -- if (c > 126) c = 126; -- putchar(c); -- } -- if (d->format & BAM_PLF_2ND) { // print 2nd calls and qualities -- const unsigned char *q; -- putchar('\t'); -- for (i = 0; i < n; ++i) { -- const bam_pileup1_t *p = pu + i; -- q = bam_aux_get(p->b, "E2"); -- putchar(q? 
q[p->qpos + 1] : 'N'); -- } -- putchar('\t'); -- for (i = 0; i < n; ++i) { -- const bam_pileup1_t *p = pu + i; -- q = bam_aux_get(p->b, "U2"); -- putchar(q? q[p->qpos + 1] : '!'); -- } -- } -- // print mapping quality if -s is flagged on the command line -- if (d->format & BAM_PLF_SIMPLE) { -- putchar('\t'); -- for (i = 0; i < n; ++i) { -- int c = pu[i].b->core.qual + 33; -- if (c > 126) c = 126; -- putchar(c); -- } -- } -- // print read position -- if (d->format & BAM_PLF_READPOS) { -- putchar('\t'); -- for (i = 0; i < n; ++i) { -- int x = pu[i].qpos; -- int l = pu[i].b->core.l_qseq; -- printw(x < l/2? x+1 : -((l-1)-x+1), stdout); putchar(','); -- } -- } -- putchar('\n'); -- // print the indel line if r has been calculated. This only happens if: -- // a) -c or -i are flagged, AND b) the reference sequence is available -- if (r) { -- printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1); -- if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]); -- else printf("%s/%s\t", r->s[0], r->s[1]); -- printf("%d\t%d\t", r->q_cns, r->q_ref); -- printf("%d\t%d\t", rms_mapq, n); -- printf("%s\t%s\t", r->s[0], r->s[1]); -- //printf("%d\t%d\t", r->gl[0], r->gl[1]); -- printf("%d\t%d\t%d\t", r->cnt1, r->cnt2, r->cnt_anti); -- printf("%d\t%d\n", r->cnt_ref, r->cnt_ambi); -- bam_maqindel_ret_destroy(r); -- } -- return 0; --} -- --int bam_pileup(int argc, char *argv[]) --{ -- int c, is_SAM = 0; -- char *fn_list = 0, *fn_fa = 0, *fn_pos = 0; -- pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t)); -- d->max_depth = 1024; d->tid = -1; d->mask = BAM_DEF_MASK; d->min_baseQ = 13; -- d->c = bam_maqcns_init(); -- d->c->errmod = BAM_ERRMOD_MAQ2; // change the default model -- d->ido = bam_maqindel_opt_init(); -- while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PAQ:C:B")) >= 0) { -- switch (c) { -- case 'Q': d->c->min_baseQ = atoi(optarg); break; -- case 'C': d->capQ_thres = atoi(optarg); break; -- case 'B': d->format |= BAM_PLF_NOBAQ; break; -- case 'a': d->c->errmod 
= BAM_ERRMOD_SOAP; break; -- case 'A': d->c->errmod = BAM_ERRMOD_MAQ; break; -- case 's': d->format |= BAM_PLF_SIMPLE; break; -- case 't': fn_list = strdup(optarg); break; -- case 'l': fn_pos = strdup(optarg); break; -- case 'f': fn_fa = strdup(optarg); break; -- case 'T': d->c->theta = atof(optarg); break; -- case 'N': d->c->n_hap = atoi(optarg); break; -- case 'r': d->c->het_rate = atof(optarg); d->ido->r_snp = d->c->het_rate; break; -- case 'M': d->c->cap_mapQ = atoi(optarg); break; -- case 'd': d->max_depth = atoi(optarg); break; -- case 'c': d->format |= BAM_PLF_CNS; break; -- case 'i': d->format |= BAM_PLF_INDEL_ONLY; break; -- case 'v': d->format |= BAM_PLF_VAR_ONLY; break; -- case 'm': d->mask = strtol(optarg, 0, 0); break; -- case 'g': d->format |= BAM_PLF_GLF; break; -- case '2': d->format |= BAM_PLF_2ND; break; -- case 'P': d->format |= BAM_PLF_READPOS; break; -- case 'I': d->ido->q_indel = atoi(optarg); break; -- case 'G': d->ido->r_indel = atof(optarg); break; -- case 'S': is_SAM = 1; break; -- case 'R': -- if (strcmp(optarg, "random") == 0) d->format |= BAM_PLF_RANBASE; -- else if (strcmp(optarg, "first") == 0) d->format |= BAM_PLF_1STBASE; -- else if (strcmp(optarg, "all") == 0) d->format |= BAM_PLF_ALLBASE; -- else fprintf(stderr, "[bam_pileup] unrecognized -R\n"); -- break; -- default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; -- } -- } -- if (d->c->errmod != BAM_ERRMOD_MAQ2) d->c->theta += 0.02; -- if (d->c->theta > 1.0) d->c->theta = 1.0; -- if (fn_list) is_SAM = 1; -- if (optind == argc) { -- fprintf(stderr, "\n"); -- fprintf(stderr, "Usage: samtools pileup [options] |\n\n"); -- fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n"); -- fprintf(stderr, " -S the input is in SAM\n"); -- fprintf(stderr, " -B disable BAQ computation\n"); -- fprintf(stderr, " -A use the original MAQ model for SNP calling (DEPRECATED)\n"); -- fprintf(stderr, " -2 output the 2nd best call and quality\n"); -- fprintf(stderr, " -i only 
show lines/consensus with indels\n"); -- fprintf(stderr, " -Q INT min base quality (possibly capped by BAQ) [%d]\n", d->c->min_baseQ); -- fprintf(stderr, " -C INT coefficient for adjusting mapQ of poor mappings [%d]\n", d->capQ_thres); -- fprintf(stderr, " -m INT filtering reads with bits in INT [0x%x]\n", d->mask); -- fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ); -- fprintf(stderr, " -d INT limit maximum depth for indels [%d]\n", d->max_depth); -- fprintf(stderr, " -t FILE list of reference sequences (force -S)\n"); -- fprintf(stderr, " -l FILE list of sites at which pileup is output\n"); -- fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n"); -- fprintf(stderr, " -c compute the consensus sequence\n"); -- fprintf(stderr, " -v print variants only (for -c)\n"); -- fprintf(stderr, " -g output in the GLFv3 format (DEPRECATED)\n"); -- fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c) [%.4g]\n", d->c->theta); -- fprintf(stderr, " -N INT number of haplotypes in the sample (for -c) [%d]\n", d->c->n_hap); -- fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c) [%.4g]\n", d->c->het_rate); -- fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c) [%.4g]\n", d->ido->r_indel); -- fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. 
(for -c) [%d]\n", d->ido->q_indel); -- fprintf(stderr, "\n"); -- free(fn_list); free(fn_fa); free(d); -- return 1; -- } -- if (d->format & (BAM_PLF_RANBASE|BAM_PLF_1STBASE|BAM_PLF_ALLBASE)) d->format |= BAM_PLF_CNS; -- if (fn_fa) d->fai = fai_load(fn_fa); -- if (d->format & (BAM_PLF_CNS|BAM_PLF_GLF)) bam_maqcns_prepare(d->c); // consensus calling -- if (d->format & BAM_PLF_GLF) { // for glf output -- glf3_header_t *h; -- h = glf3_header_init(); -- d->fp_glf = bgzf_fdopen(fileno(stdout), "w"); -- glf3_header_write(d->fp_glf, h); -- glf3_header_destroy(h); -- } -- if (d->fai == 0 && (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY))) -- fprintf(stderr, "[bam_pileup] indels will not be called when -f is absent.\n"); -- if (fn_fa && is_SAM && fn_list == 0) fn_list = samfaipath(fn_fa); -- -- { -- samfile_t *fp; -- fp = is_SAM? samopen(argv[optind], "r", fn_list) : samopen(argv[optind], "rb", 0); -- if (fp == 0 || fp->header == 0) { -- fprintf(stderr, "[bam_pileup] fail to read the header: non-exisiting file or wrong format.\n"); -- return 1; -- } -- d->h = fp->header; -- if (fn_pos) d->hash = load_pos(fn_pos, d->h); -- { // run pileup -- extern int bam_prob_realn(bam1_t *b, const char *ref); -- extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres); -- bam1_t *b; -- int ret, tid, pos, n_plp; -- bam_plp_t iter; -- const bam_pileup1_t *plp; -- b = bam_init1(); -- iter = bam_plp_init(0, 0); -- bam_plp_set_mask(iter, d->mask); -- while ((ret = samread(fp, b)) >= 0) { -- int skip = 0; -- if ((int)b->core.tid < 0) break; -- // update d->ref if necessary -- if (d->fai && (int)b->core.tid != d->tid) { -- free(d->ref); -- d->ref = faidx_fetch_seq(d->fai, d->h->target_name[b->core.tid], 0, 0x7fffffff, &d->len); -- d->tid = b->core.tid; -- } -- if (d->ref && (d->format&BAM_PLF_CNS) && !(d->format&BAM_PLF_NOBAQ)) bam_prob_realn(b, d->ref); -- if (d->ref && (d->format&BAM_PLF_CNS) && d->capQ_thres > 10) { -- int q = bam_cap_mapQ(b, d->ref, d->capQ_thres); -- if (q < 0) skip = 1; -- 
else if (b->core.qual > q) b->core.qual = q; -- } else if (b->core.flag&BAM_FUNMAP) skip = 1; -- else if ((d->format&BAM_PLF_CNS) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; -- if (skip) continue; -- bam_plp_push(iter, b); -- while ((plp = bam_plp_next(iter, &tid, &pos, &n_plp)) != 0) -- pileup_func(tid, pos, n_plp, plp, d); -- } -- bam_plp_push(iter, 0); -- while ((plp = bam_plp_next(iter, &tid, &pos, &n_plp)) != 0) -- pileup_func(tid, pos, n_plp, plp, d); -- bam_plp_destroy(iter); -- bam_destroy1(b); -- } -- samclose(fp); // d->h will be destroyed here -- } -- -- // free -- if (d->format & BAM_PLF_GLF) bgzf_close(d->fp_glf); -- if (fn_pos) { // free the hash table -- khint_t k; -- for (k = kh_begin(d->hash); k < kh_end(d->hash); ++k) -- if (kh_exist(d->hash, k)) free(kh_val(d->hash, k)); -- kh_destroy(64, d->hash); -- } -- free(fn_pos); free(fn_list); free(fn_fa); -- if (d->fai) fai_destroy(d->fai); -- bam_maqcns_destroy(d->c); -- free(d->ido); free(d->ref); free(d); -- return 0; --} -- --/*********** -- * mpileup * -- ***********/ -- #include #include "bam2bcf.h" #include "sample.h" @@@ -533,20 -533,20 +69,32 @@@ #define MPLP_FMT_DP 0x100 #define MPLP_FMT_SP 0x200 #define MPLP_NO_INDEL 0x400 ++#define MPLP_EXT_BAQ 0x800 ++#define MPLP_ILLUMINA13 0x1000 ++#define MPLP_IGNORE_RG 0x2000 ++#define MPLP_PRINT_POS 0x4000 ++#define MPLP_PRINT_MAPQ 0x8000 ++ ++void *bed_read(const char *fn); ++void bed_destroy(void *_h); ++int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { -- int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth; -- int openQ, extQ, tandemQ; -- char *reg, *fn_pos, *pl_list; ++ int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth; ++ int openQ, extQ, tandemQ, min_support; // for indels ++ double min_frac; // for indels ++ char *reg, *pl_list; faidx_t *fai; -- kh_64_t *hash; ++ void *bed, *rghash; } mplp_conf_t; typedef struct { bamFile fp; bam_iter_t iter; -- int min_mq, flag, ref_id, 
capQ_thres; ++ bam_header_t *h; ++ int ref_id; char *ref; ++ const mplp_conf_t *conf; } mplp_aux_t; typedef struct { @@@ -566,22 -566,22 +114,41 @@@ static int mplp_func(void *data, bam1_ int has_ref; ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b); if (ret < 0) break; ++ if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads ++ skip = 1; ++ continue; ++ } ++ if (ma->conf->bed) { // test overlap ++ skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); ++ if (skip) continue; ++ } ++ if (ma->conf->rghash) { // exclude read groups ++ uint8_t *rg = bam_aux_get(b, "RG"); ++ skip = (rg && bcf_str2id(ma->conf->rghash, (const char*)(rg+1)) >= 0); ++ if (skip) continue; ++ } ++ if (ma->conf->flag & MPLP_ILLUMINA13) { ++ int i; ++ uint8_t *qual = bam1_qual(b); ++ for (i = 0; i < b->core.l_qseq; ++i) ++ qual[i] = qual[i] > 31? qual[i] - 31 : 0; ++ } has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0; skip = 0; -- if (has_ref && (ma->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, 1); -- if (has_ref && ma->capQ_thres > 10) { -- int q = bam_cap_mapQ(b, ma->ref, ma->capQ_thres); ++ if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 
3 : 1); ++ if (has_ref && ma->conf->capQ_thres > 10) { ++ int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres); if (q < 0) skip = 1; else if (b->core.qual > q) b->core.qual = q; -- } else if (b->core.flag&BAM_FUNMAP) skip = 1; -- else if (b->core.qual < ma->min_mq) skip = 1; -- else if ((ma->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; ++ } ++ else if (b->core.qual < ma->conf->min_mq) skip = 1; ++ else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; } while (skip); return ret; } static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, -- int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp) ++ int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp, int ignore_rg) { int i, j; memset(m->n_plp, 0, m->n * sizeof(int)); @@@ -590,10 -590,10 +157,14 @@@ const bam_pileup1_t *p = plp[i] + j; uint8_t *q; int id = -1; -- q = bam_aux_get(p->b, "RG"); ++ q = ignore_rg? 0 : bam_aux_get(p->b, "RG"); if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); -- assert(id >= 0 && id < m->n); ++ if (id < 0 || id >= m->n) { ++ assert(q); // otherwise a bug ++ fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]); ++ exit(1); ++ } if (m->n_plp[id] == m->m_plp[id]) { m->m_plp[id] = m->m_plp[id]? 
m->m_plp[id]<<1 : 8; m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); @@@ -608,12 -608,12 +179,11 @@@ static int mpileup(mplp_conf_t *conf, i extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; -- int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth; ++ int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; -- khash_t(64) *hash = 0; void *rghash = 0; bcf_callaux_t *bca = 0; @@@ -638,12 -638,12 +208,11 @@@ for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); -- data[i]->min_mq = conf->min_mq; -- data[i]->flag = conf->flag; -- data[i]->capQ_thres = conf->capQ_thres; data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); ++ data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); -- bam_smpl_add(sm, fn[i], h_tmp->text); ++ data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet ++ bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; @@@ -657,7 -657,7 +226,7 @@@ fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); exit(1); } -- if (i == 0) beg0 = beg, end0 = end; ++ if (i == 0) tid0 = tid, beg0 = beg, end0 = end; data[i]->iter = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } @@@ -673,7 -673,7 +242,6 @@@ gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); -- if (conf->fn_pos) hash = load_pos(conf->fn_pos, h); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; @@@ -694,7 -694,7 +262,8 @@@ bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); -- bh->l_txt = 0; ++ bh->txt = malloc(strlen(BAM_VERSION) + 64); ++ bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); free(s.s); bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); @@@ -702,34 -702,34 +271,38 @@@ bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; -- } -- ref_tid = -1; ref = 0; ++ bca->min_frac = conf->min_frac; ++ bca->min_support = conf->min_support; ++ } ++ if (tid0 >= 0 && conf->fai) { // region is set ++ ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ++ ref_tid = tid0; ++ for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; ++ } else ref_tid = -1, ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. 
Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; -- fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth); ++ fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } ++ max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested -- if (hash) { -- khint_t k; -- k = kh_get(64, hash, (uint64_t)tid<<32 | pos); -- if (k == kh_end(hash)) continue; -- } ++ if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; -- if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len); ++ if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_GLF) { -- int _ref0, ref16; ++ int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); -- group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp); ++ for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; ++ group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? 
ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) @@@ -740,7 -740,7 +313,7 @@@ bcf_write(bp, bh, b); bcf_destroy(b); // call indels -- if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { ++ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { @@@ -756,8 -756,8 +329,10 @@@ for (i = 0; i < n; ++i) { int j; printf("\t%d\t", n_plp[i]); -- if (n_plp[i] == 0) printf("*\t*"); -- else { ++ if (n_plp[i] == 0) { ++ printf("*\t*"); // FIXME: printf() is very slow... ++ if (conf->flag & MPLP_PRINT_POS) printf("\t*"); ++ } else { for (j = 0; j < n_plp[i]; ++j) pileup_seq(plp[i] + j, pos, ref_len, ref); putchar('\t'); @@@ -767,6 -767,6 +342,21 @@@ if (c > 126) c = 126; putchar(c); } ++ if (conf->flag & MPLP_PRINT_MAPQ) { ++ putchar('\t'); ++ for (j = 0; j < n_plp[i]; ++j) { ++ int c = plp[i][j].b->core.qual + 33; ++ if (c > 126) c = 126; ++ putchar(c); ++ } ++ } ++ if (conf->flag & MPLP_PRINT_POS) { ++ putchar('\t'); ++ for (j = 0; j < n_plp[i]; ++j) { ++ if (j > 0) putchar(','); ++ printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... 
++ } ++ } } } putchar('\n'); @@@ -778,12 -778,12 +368,6 @@@ for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); -- if (hash) { // free the hash table -- khint_t k; -- for (k = kh_begin(hash); k < kh_end(hash); ++k) -- if (kh_exist(hash, k)) free(kh_val(hash, k)); -- kh_destroy(64, hash); -- } bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); @@@ -797,7 -797,7 +381,7 @@@ } #define MAX_PATH_LEN 1024 --int read_file_list(const char *file_list,int *n,char **argv[]) ++static int read_file_list(const char *file_list,int *n,char **argv[]) { char buf[MAX_PATH_LEN]; int len, nfiles; @@@ -850,16 -850,16 +434,18 @@@ int bam_mpileup(int argc, char *argv[] int c; const char *file_list = NULL; char **fn = NULL; -- int nfiles = 0; ++ int nfiles = 0, use_orphan = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); ++ #define MPLP_PRINT_POS 0x4000 mplp.max_mq = 60; mplp.min_baseQ = 13; mplp.capQ_thres = 0; -- mplp.max_depth = 250; ++ mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; ++ mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; -- while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:b:P:o:e:h:I")) >= 0) { ++ while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6Os")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); @@@ -867,17 -867,17 +453,20 @@@ break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; -- case 'l': mplp.fn_pos = strdup(optarg); break; ++ case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; -- case 'B': mplp.flag &= ~MPLP_REALN & ~MPLP_NO_ORPHAN; break; -- 
case 'O': mplp.flag |= MPLP_NO_ORPHAN; break; -- case 'R': mplp.flag |= MPLP_REALN; break; ++ case 'B': mplp.flag &= ~MPLP_REALN; break; case 'D': mplp.flag |= MPLP_FMT_DP; break; case 'S': mplp.flag |= MPLP_FMT_SP; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; ++ case 'E': mplp.flag |= MPLP_EXT_BAQ; break; ++ case '6': mplp.flag |= MPLP_ILLUMINA13; break; ++ case 'R': mplp.flag |= MPLP_IGNORE_RG; break; ++ case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; ++ case 'O': mplp.flag |= MPLP_PRINT_POS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'M': mplp.max_mq = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; @@@ -886,43 -886,43 +475,72 @@@ case 'o': mplp.openQ = atoi(optarg); break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; ++ case 'A': use_orphan = 1; break; ++ case 'F': mplp.min_frac = atof(optarg); break; ++ case 'm': mplp.min_support = atoi(optarg); break; ++ case 'L': mplp.max_indel_depth = atoi(optarg); break; ++ case 'G': { ++ FILE *fp_rg; ++ char buf[1024]; ++ mplp.rghash = bcf_str2id_init(); ++ if ((fp_rg = fopen(optarg, "r")) == 0) ++ fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); ++ while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... 
++ bcf_str2id_add(mplp.rghash, strdup(buf)); ++ fclose(fp_rg); ++ } ++ break; } } ++ if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { fprintf(stderr, "\n"); -- fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); -- fprintf(stderr, "Options: -f FILE reference sequence file [null]\n"); -- fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); -- fprintf(stderr, " -l FILE list of positions (format: chr pos) [null]\n"); -- fprintf(stderr, " -b FILE list of input BAM files [null]\n"); -- fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); -- fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ); -- fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq); -- fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth); -- fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); -- fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); -- fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); -- fprintf(stderr, " -h INT coefficient for homopolyer errors [%d]\n", mplp.tandemQ); -- fprintf(stderr, " -g generate BCF output\n"); -- fprintf(stderr, " -u do not compress BCF output\n"); -- fprintf(stderr, " -B disable BAQ computation\n"); -- fprintf(stderr, " -D output per-sample DP\n"); -- fprintf(stderr, " -S output per-sample SP (strand bias P-value, slow)\n"); -- fprintf(stderr, " -I do not perform indel calling\n"); ++ fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); ++ fprintf(stderr, "Input options:\n\n"); ++ fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); ++ fprintf(stderr, " -A count anomalous read pairs\n"); ++ fprintf(stderr, " -B disable BAQ computation\n"); ++ fprintf(stderr, " -b FILE list of input BAM files [null]\n"); ++ fprintf(stderr, " -C INT 
parameter for adjusting mapQ; 0 to disable [0]\n"); ++ fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); ++ fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); ++ fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); ++ fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); ++ fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); ++ fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); ++ fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); ++ fprintf(stderr, " -R ignore RG tags\n"); ++ fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); ++ fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); ++ fprintf(stderr, "\nOutput options:\n\n"); ++ fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); ++ fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); ++ fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n"); ++ fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n"); ++ fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); ++ fprintf(stderr, " -u generate uncompress BCF output\n"); ++ fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); ++ fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); ++ fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); ++ fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); ++ fprintf(stderr, " -I do not perform indel calling\n"); ++ fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); ++ fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", 
mplp.min_support); ++ fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); ++ fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; } -- if ( file_list ) -- { ++ if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mpileup(&mplp,nfiles,fn); for (c=0; cx.fpr, buf, BUF_SIZE)) > 0) ++ fwrite(buf, 1, len, fp->x.fpw); #else while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) ++ fwrite(buf, 1, len, fp->file); #endif -- fwrite(buf, 1, len, fp->x.fpw); free(buf); fp->block_offset = in->block_offset = 0; bgzf_close(fp); diff --cc sam/bam_sort.c index 01f7016,01f7016..abf8d4f --- a/sam/bam_sort.c +++ b/sam/bam_sort.c @@@ -70,6 -70,6 +70,8 @@@ static void swap_header_text(bam_header #define MERGE_RG 1 #define MERGE_UNCOMP 2 ++#define MERGE_LEVEL1 4 ++#define MERGE_FORCE 8 /*! @abstract Merge multiple sorted BAM. @@@ -202,16 -202,16 +204,14 @@@ int bam_merge_core(int by_qname, const h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { -- h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b); ++ h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b); h->idx = idx++; } else h->pos = HEAP_EMPTY; } -- if (flag & MERGE_UNCOMP) { -- fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); -- } else { -- fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); -- } ++ if (flag & MERGE_UNCOMP) fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); ++ else if (flag & MERGE_LEVEL1) fpout = strcmp(out, "-")? bam_open(out, "w1") : bam_dopen(fileno(stdout), "w1"); ++ else fpout = strcmp(out, "-")? 
bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); if (fpout == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; @@@ -222,11 -222,11 +222,14 @@@ ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; -- if ((flag & MERGE_RG) && bam_aux_get(b, "RG") == 0) ++ if (flag & MERGE_RG) { ++ uint8_t *rg = bam_aux_get(b, "RG"); ++ if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); ++ } bam_write1_core(fpout, &b->core, b->data_len, b->data); if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { -- heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); ++ heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; @@@ -254,11 -254,11 +257,13 @@@ int bam_merge(int argc, char *argv[] int c, is_by_qname = 0, flag = 0, ret = 0; char *fn_headers = NULL, *reg = 0; -- while ((c = getopt(argc, argv, "h:nruR:")) >= 0) { ++ while ((c = getopt(argc, argv, "h:nru1R:f")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; ++ case 'f': flag |= MERGE_FORCE; break; case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; ++ case '1': flag |= MERGE_LEVEL1; break; case 'u': flag |= MERGE_UNCOMP; break; case 'R': reg = strdup(optarg); break; } @@@ -269,6 -269,6 +274,8 @@@ fprintf(stderr, "Options: -n sort by read names\n"); fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); fprintf(stderr, " -u uncompressed BAM output\n"); ++ fprintf(stderr, " -f overwrite the output BAM if exist\n"); ++ fprintf(stderr, " -1 compress level 1\n"); fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. 
Users\n"); @@@ -276,6 -276,6 +283,14 @@@ fprintf(stderr, " the header dictionary in merging.\n\n"); return 1; } ++ if (!(flag & MERGE_FORCE) && strcmp(argv[optind], "-")) { ++ FILE *fp = fopen(argv[optind], "rb"); ++ if (fp != NULL) { ++ fclose(fp); ++ fprintf(stderr, "[%s] File '%s' exists. Please apply '-f' to overwrite. Abort.\n", __func__, argv[optind]); ++ return 1; ++ } ++ } if (bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg) < 0) ret = 1; free(reg); free(fn_headers); @@@ -288,21 -288,21 +303,26 @@@ static inline int bam1_lt(const bam1_p { if (g_is_by_qname) { int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); -- return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)))); -- } else return (((uint64_t)a->core.tid<<32|a->core.pos) < ((uint64_t)b->core.tid<<32|b->core.pos)); ++ return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))))); ++ } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))); } KSORT_INIT(sort, bam1_p, bam1_lt) static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) { -- char *name; ++ char *name, mode[3]; int i; bamFile fp; ks_mergesort(sort, k, buf, 0); name = (char*)calloc(strlen(prefix) + 20, 1); -- if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n); -- else sprintf(name, "%s.bam", prefix); -- fp = is_stdout? bam_dopen(fileno(stdout), "w") : bam_open(name, "w"); ++ if (n >= 0) { ++ sprintf(name, "%s.%.4d.bam", prefix, n); ++ strcpy(mode, "w1"); ++ } else { ++ sprintf(name, "%s.bam", prefix); ++ strcpy(mode, "w"); ++ } ++ fp = is_stdout? 
bam_dopen(fileno(stdout), mode) : bam_open(name, mode); if (fp == 0) { fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); free(name); diff --cc sam/bam_stat.c index ea9deee,ea9deee..f2de0f1 --- a/sam/bam_stat.c +++ b/sam/bam_stat.c @@@ -3,31 -3,31 +3,31 @@@ #include "bam.h" typedef struct { -- long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; -- long long n_sgltn, n_read1, n_read2; -- long long n_qcfail, n_dup; -- long long n_diffchr, n_diffhigh; ++ long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; ++ long long n_sgltn[2], n_read1[2], n_read2[2]; ++ long long n_dup[2]; ++ long long n_diffchr[2], n_diffhigh[2]; } bam_flagstat_t; #define flagstat_loop(s, c) do { \ -- ++(s)->n_reads; \ ++ int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \ ++ ++(s)->n_reads[w]; \ if ((c)->flag & BAM_FPAIRED) { \ -- ++(s)->n_pair_all; \ -- if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ -- if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ -- if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ -- if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \ ++ ++(s)->n_pair_all[w]; \ ++ if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \ ++ if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ ++ if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ ++ if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ -- ++(s)->n_pair_map; \ ++ ++(s)->n_pair_map[w]; \ if ((c)->mtid != (c)->tid) { \ -- ++(s)->n_diffchr; \ -- if ((c)->qual >= 5) ++(s)->n_diffhigh; \ ++ ++(s)->n_diffchr[w]; \ ++ if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ } \ } \ } \ -- if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ -- if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ -- if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ ++ if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ ++ if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ } while (0) bam_flagstat_t 
*bam_flagstat_core(bamFile fp) @@@ -59,18 -59,18 +59,17 @@@ int bam_flagstat(int argc, char *argv[] assert(fp); header = bam_header_read(fp); s = bam_flagstat_core(fp); -- printf("%lld in total\n", s->n_reads); -- printf("%lld QC failure\n", s->n_qcfail); -- printf("%lld duplicates\n", s->n_dup); -- printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); -- printf("%lld paired in sequencing\n", s->n_pair_all); -- printf("%lld read1\n", s->n_read1); -- printf("%lld read2\n", s->n_read2); -- printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); -- printf("%lld with itself and mate mapped\n", s->n_pair_map); -- printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); -- printf("%lld with mate mapped to a different chr\n", s->n_diffchr); -- printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); ++ printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); ++ printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); ++ printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); ++ printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); ++ printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); ++ printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); ++ printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0); ++ printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); ++ printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0); ++ printf("%lld 
+ %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); ++ printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); bam_header_destroy(header); bam_close(fp); diff --cc sam/bam_tview.c index e48afa7,e48afa7..4eea955 --- a/sam/bam_tview.c +++ b/sam/bam_tview.c @@@ -19,9 -19,9 +19,10 @@@ #include #include #include ++#include #include "bam.h" #include "faidx.h" --#include "bam_maqcns.h" ++#include "bam2bcf.h" char bam_aux_getCEi(bam1_t *b, int i); char bam_aux_getCSi(bam1_t *b, int i); @@@ -50,7 -50,7 +51,7 @@@ typedef struct bamFile fp; int curr_tid, left_pos; faidx_t *fai; -- bam_maqcns_t *bmc; ++ bcf_callaux_t *bca; int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; char *ref; @@@ -58,6 -58,6 +59,7 @@@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) { ++ extern unsigned char bam_nt16_table[256]; tview_t *tv = (tview_t*)data; int i, j, c, rb, attr, max_ins = 0; uint32_t call = 0; @@@ -70,11 -70,11 +72,26 @@@ mvaddch(1, tv->ccol++, c); } if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1); -- // print consensus -- call = bam_maqcns_call(n, pl, tv->bmc); ++ { // call consensus ++ bcf_callret1_t bcr; ++ int qsum[4], a1, a2, tmp; ++ double p[3], prior = 30; ++ bcf_call_glfgen(n, pl, bam_nt16_table[rb], tv->bca, &bcr); ++ for (i = 0; i < 4; ++i) qsum[i] = bcr.qsum[i]<<2 | i; ++ for (i = 1; i < 4; ++i) // insertion sort ++ for (j = i; j > 0 && qsum[j] > qsum[j-1]; --j) ++ tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp; ++ a1 = qsum[0]&3; a2 = qsum[1]&3; ++ p[0] = bcr.p[a1*5+a1]; p[1] = bcr.p[a1*5+a2] + prior; p[2] = bcr.p[a2*5+a2]; ++ if ("ACGT"[a1] != toupper(rb)) p[0] += prior + 3; ++ if ("ACGT"[a2] != toupper(rb)) p[2] += prior + 3; ++ if (p[0] < p[1] && p[0] < p[2]) call = (1<>28&0xf]; -- i = (call>>8&0xff)/10+1; ++ c = ",ACMGRSVTWYHKDBN"[call>>16&0xf]; ++ i = 
(call&0xffff)/10+1; if (i > 4) i = 4; attr |= COLOR_PAIR(i); if (c == toupper(rb)) c = '.'; @@@ -183,17 -183,17 +200,16 @@@ tview_t *tv_init(const char *fn, const { tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t)); tv->is_dot = 1; -- tv->idx = bam_index_load(fn); -- if (tv->idx == 0) exit(1); tv->fp = bam_open(fn, "r"); bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); assert(tv->fp); tv->header = bam_header_read(tv->fp); ++ tv->idx = bam_index_load(fn); ++ if (tv->idx == 0) exit(1); tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); if (fn_fa) tv->fai = fai_load(fn_fa); -- tv->bmc = bam_maqcns_init(); ++ tv->bca = bcf_call_init(0.83, 13); tv->ins = 1; -- bam_maqcns_prepare(tv->bmc); initscr(); keypad(stdscr, TRUE); @@@ -224,7 -224,7 +240,7 @@@ void tv_destroy(tview_t *tv endwin(); bam_lplbuf_destroy(tv->lplbuf); -- bam_maqcns_destroy(tv->bmc); ++ bcf_call_destroy(tv->bca); bam_index_destroy(tv->idx); if (tv->fai) fai_destroy(tv->fai); free(tv->ref); diff --cc sam/bamtk.c index 79635d6,79635d6..8ba2581 --- a/sam/bamtk.c +++ b/sam/bamtk.c @@@ -8,12 -8,12 +8,7 @@@ #include "knetfile.h" #endif --#ifndef PACKAGE_VERSION --#define PACKAGE_VERSION "0.1.12a (r862)" --#endif -- int bam_taf2baf(int argc, char *argv[]); --int bam_pileup(int argc, char *argv[]); int bam_mpileup(int argc, char *argv[]); int bam_merge(int argc, char *argv[]); int bam_index(int argc, char *argv[]); @@@ -27,60 -27,60 +22,24 @@@ int bam_idxstats(int argc, char *argv[] int main_samview(int argc, char *argv[]); int main_import(int argc, char *argv[]); int main_reheader(int argc, char *argv[]); ++int main_cut_target(int argc, char *argv[]); ++int main_phase(int argc, char *argv[]); ++int main_cat(int argc, char *argv[]); ++int main_depth(int argc, char *argv[]); ++int main_bam2fq(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); --int glf3_view_main(int argc, char *argv[]); -- --int bam_tagview(int argc, char *argv[]) --{ -- bamFile fp; -- bam_header_t *header; -- bam1_t *b; -- char tag[2]; -- 
int ret; -- if (argc < 3) { -- fprintf(stderr, "Usage: samtools tagview \n"); -- return 1; -- } -- fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); -- assert(fp); -- header = bam_header_read(fp); -- if (header == 0) { -- fprintf(stderr, "[bam_view] fail to read the BAM header. Abort!\n"); -- return 1; -- } -- tag[0] = argv[2][0]; tag[1] = argv[2][1]; -- b = (bam1_t*)calloc(1, sizeof(bam1_t)); -- while ((ret = bam_read1(fp, b)) >= 0) { -- uint8_t *d = bam_aux_get(b, tag); -- if (d) { -- printf("%s\t%d\t", bam1_qname(b), b->core.flag); -- if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d)); -- else if (d[0] == 'f') printf("%f\n", bam_aux2f(d)); -- else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d)); -- else if (d[0] == 'A') printf("%c\n", bam_aux2A(d)); -- else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d)); -- else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", bam_aux2i(d)); -- else printf("\n"); -- } -- } -- if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. 
(%d)\n", ret); -- free(b->data); free(b); -- bam_header_destroy(header); -- bam_close(fp); -- return 0; --} static int usage() { fprintf(stderr, "\n"); fprintf(stderr, "Program: samtools (Tools for alignments in the SAM format)\n"); -- fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); ++ fprintf(stderr, "Version: %s\n\n", BAM_VERSION); fprintf(stderr, "Usage: samtools [options]\n\n"); fprintf(stderr, "Command: view SAM<->BAM conversion\n"); fprintf(stderr, " sort sort alignment file\n"); -- fprintf(stderr, " pileup generate pileup output\n"); fprintf(stderr, " mpileup multi-way pileup\n"); ++ fprintf(stderr, " depth compute the depth\n"); fprintf(stderr, " faidx index/extract FASTA\n"); #if _CURSES_LIB != 0 fprintf(stderr, " tview text alignment viewer\n"); @@@ -88,13 -88,13 +47,21 @@@ fprintf(stderr, " index index alignment\n"); fprintf(stderr, " idxstats BAM index stats (r595 or later)\n"); fprintf(stderr, " fixmate fix mate information\n"); -- fprintf(stderr, " glfview print GLFv3 file\n"); fprintf(stderr, " flagstat simple stats\n"); fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); fprintf(stderr, " merge merge sorted alignments\n"); fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, " reheader replace BAM header\n"); ++ fprintf(stderr, " cat concatenate BAMs\n"); ++ fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); ++ fprintf(stderr, " phase phase heterozygotes\n"); fprintf(stderr, "\n"); ++#ifdef _WIN32 ++ fprintf(stderr, "\ ++Note: The Windows version of SAMtools is mainly designed for read-only\n\ ++ operations, such as viewing the alignments and generating the pileup.\n\ ++ Binary files generated by the Windows version may be buggy.\n\n"); ++#endif return 1; } @@@ -110,7 -110,7 +77,6 @@@ int main(int argc, char *argv[] if (argc < 2) return usage(); if (strcmp(argv[1], "view") == 0) return main_samview(argc-1, argv+1); else if (strcmp(argv[1], "import") == 0) return main_import(argc-1, argv+1); -- 
else if (strcmp(argv[1], "pileup") == 0) return bam_pileup(argc-1, argv+1); else if (strcmp(argv[1], "mpileup") == 0) return bam_mpileup(argc-1, argv+1); else if (strcmp(argv[1], "merge") == 0) return bam_merge(argc-1, argv+1); else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); @@@ -119,12 -119,12 +85,19 @@@ else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); -- else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); -- else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); ++ else if (strcmp(argv[1], "cat") == 0) return main_cat(argc-1, argv+1); ++ else if (strcmp(argv[1], "targetcut") == 0) return main_cut_target(argc-1, argv+1); ++ else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); ++ else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); ++ else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); ++ else if (strcmp(argv[1], "pileup") == 0) { ++ fprintf(stderr, "[main] The `pileup' command has been removed. 
Please use `mpileup' instead.\n"); ++ return 1; ++ } #if _CURSES_LIB != 0 else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); #endif diff --cc sam/bcftools/Makefile index 8b890ba,8b890ba..9b6f863 --- a/sam/bcftools/Makefile +++ b/sam/bcftools/Makefile @@@ -1,9 -1,9 +1,9 @@@ CC= gcc CFLAGS= -g -Wall -O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE --LOBJS= bcf.o vcf.o bcfutils.o prob1.o ld.o kfunc.o index.o fet.o bcf2qcall.o ++LOBJS= bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o mut.o bcf2qcall.o OMISC= .. --AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o ++AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o PROG= bcftools INCLUDES= SUBDIRS= . @@@ -28,10 -28,10 +28,10 @@@ all:$(PROG lib:libbcf.a libbcf.a:$(LOBJS) -- $(AR) -cru $@ $(LOBJS) ++ $(AR) -csru $@ $(LOBJS) bcftools:lib $(AOBJS) -- $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -lbcf ++ $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz bcf.o:bcf.h vcf.o:bcf.h diff --cc sam/bcftools/bcf-fix.pl index 61c6136,61c6136..0000000 deleted file mode 100755,100755 --- a/sam/bcftools/bcf-fix.pl +++ /dev/null @@@ -1,101 -1,101 +1,0 @@@ --#!/usr/bin/perl -w -- --use strict; --use warnings; --use Carp; -- --my $opts = parse_params(); --bcf_fix(); -- --exit; -- --#-------------------------------- -- --sub error --{ -- my (@msg) = @_; -- if ( scalar @msg ) { confess @msg; } -- die -- "Usage: bcftools view test.bcf | bcf-fix.pl > test.vcf\n", -- "Options:\n", -- " -h, -?, --help This help message.\n", -- "\n"; --} -- -- --sub parse_params --{ -- my $opts = {}; -- while (my $arg=shift(@ARGV)) -- { -- if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } -- error("Unknown parameter \"$arg\". 
Run -h for help.\n"); -- } -- return $opts; --} -- --sub bcf_fix --{ -- while (my $line=) -- { -- if ( $line=~/^#CHROM/ ) -- { -- print --qq[##INFO= --##INFO= --##INFO= --##INFO= --##INFO= --##INFO= --##FORMAT= --##FORMAT= --##FORMAT= --]; -- print $line; -- } -- elsif ( $line=~/^#/ ) -- { -- print $line; -- } -- else -- { -- my @items = split(/\t/,$line); -- my @tags = split(/:/,$items[8]); # FORMAT tags -- -- my $nidx=2; -- my @idxs; # Mapping which defines new ordering: $idxs[$inew]=$iold; GT comes first, PL second -- for (my $i=0; $i<@tags; $i++) -- { -- if ( $tags[$i] eq 'GT' ) { $idxs[0]=$i; } -- elsif ( $tags[$i] eq 'PL' ) { $idxs[1]=$i; } -- else { $idxs[$nidx++]=$i; } -- } -- if ( !exists($tags[0]) or !exists($tags[1]) ) { error("FIXME: expected GT and PL in the format field.\n"); } -- -- # First fix the FORMAT column -- $items[8] = 'GT:GL'; -- for (my $i=2; $i<@tags; $i++) -- { -- $items[8] .= ':'.$tags[$idxs[$i]]; -- } -- -- # Now all the genotype columns -- for (my $iitem=9; $iitem<@items; $iitem++) -- { -- @tags = split(/:/,$items[$iitem]); -- $items[$iitem] = $tags[$idxs[0]] .':'; -- -- # GL=-PL/10 -- my ($a,$b,$c) = split(/,/,$tags[$idxs[1]]); -- $items[$iitem] .= sprintf "%.2f,%.2f,%.2f",-$a/10.,-$b/10.,-$c/10.; -- -- for (my $itag=2; $itag<@tags; $itag++) -- { -- $items[$iitem] .= ':'.$tags[$idxs[$itag]]; -- } -- } -- print join("\t",@items); -- } -- } --} -- diff --cc sam/bcftools/bcf.c index 6e45695,6e45695..84a8e76 --- a/sam/bcftools/bcf.c +++ b/sam/bcftools/bcf.c @@@ -103,10 -103,10 +103,16 @@@ int bcf_sync(bcf1_t *b ks_tokaux_t aux; // set ref, alt, flt, info, fmt b->ref = b->alt = b->flt = b->info = b->fmt = 0; -- for (p = b->str, n = 0; p < b->str + b->l_str; ++p) -- if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1; ++ for (p = b->str, n = 0; p < b->str + b->l_str; ++p) { ++ if (*p == 0 && p+1 != b->str + b->l_str) { ++ if (n == 5) { ++ ++n; ++ break; ++ } else tmp[n++] = p + 1; ++ } ++ } if (n != 5) { -- fprintf(stderr, "[%s] 
incorrect number of fields (%d != 5). Corrupted file?\n", __func__, n); ++ fprintf(stderr, "[%s] incorrect number of fields (%d != 5) at %d:%d\n", __func__, n, b->tid, b->pos); return -1; } b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; @@@ -136,10 -136,10 +142,10 @@@ b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2)) { b->gi[i].len = 2; -- } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2) -- || b->gi[i].fmt == bcf_str2int("SP", 2)) -- { ++ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) { b->gi[i].len = 1; ++ } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { ++ b->gi[i].len = 4; } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4; } @@@ -240,8 -240,8 +246,10 @@@ void bcf_fmt_core(const bcf_hdr_t *h, b } } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { kputw(((uint16_t*)b->gi[i].data)[j], s); -- } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) { ++ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { kputw(((uint8_t*)b->gi[i].data)[j], s); ++ } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { ++ kputw(((int32_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) { int y = ((uint8_t*)b->gi[i].data)[j]; if (y>>7&1) { @@@ -259,7 -259,7 +267,7 @@@ if (k > 0) kputc(',', s); ksprintf(s, "%.2f", d[k]); } -- } ++ } else kputc('.', s); // custom fields } } } diff --cc sam/bcftools/bcf.h index f87ac1e,f87ac1e..822ae5c --- a/sam/bcftools/bcf.h +++ b/sam/bcftools/bcf.h @@@ -28,6 -28,6 +28,8 @@@ #ifndef BCF_H #define BCF_H ++#define BCF_VERSION "0.1.17-dev (r973:277)" ++ #include #include @@@ -129,6 -129,6 +131,8 @@@ extern "C" int vcf_close(bcf_t *bp); // read the VCF/BCF header bcf_hdr_t *vcf_hdr_read(bcf_t *bp); ++ // read the sequence dictionary from a 
separate file; required for VCF->BCF conversion ++ int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn); // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); // write the VCF header @@@ -142,10 -142,10 +146,21 @@@ int bcf_gl2pl(bcf1_t *b); // if the site is an indel int bcf_is_indel(const bcf1_t *b); ++ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list); ++ int bcf_subsam(int n_smpl, int *list, bcf1_t *b); ++ // move GT to the first FORMAT field ++ int bcf_fix_gt(bcf1_t *b); ++ // update PL generated by old samtools ++ int bcf_fix_pl(bcf1_t *b); ++ // convert PL to GLF-like 10-likelihood GL ++ int bcf_gl10(const bcf1_t *b, uint8_t *gl); ++ // convert up to 4 INDEL alleles to GLF-like 10-likelihood GL ++ int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl); // string hash table void *bcf_build_refhash(bcf_hdr_t *h); void bcf_str2id_destroy(void *_hash); ++ void bcf_str2id_thorough_destroy(void *_hash); int bcf_str2id_add(void *_hash, const char *str); int bcf_str2id(void *_hash, const char *str); void *bcf_str2id_init(); diff --cc sam/bcftools/bcf.tex index 5ca1e28,5ca1e28..442fc2a --- a/sam/bcftools/bcf.tex +++ b/sam/bcftools/bcf.tex @@@ -14,50 -14,50 +14,64 @@@ \begin{tabular}{|l|l|l|l|l|} \hline \multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline --\multicolumn{2}{|l|}{\tt magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline --\multicolumn{2}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline --\multicolumn{2}{|l|}{\tt name} & Concatenated names, {\tt NULL} padded & {\tt char[l\_nm]} & \\\hline --\multicolumn{2}{|l|}{\tt l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline --\multicolumn{2}{|l|}{\tt sname} & Concatenated sample names & {\tt char[l\_smpl]} & \\\hline --\multicolumn{2}{|l|}{\tt 
l\_txt} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline --\multicolumn{2}{|l|}{\tt text} & Meta text, {\tt NULL} terminated & {\tt char[l\_txt]} & \\\hline ++\multicolumn{2}{|l|}{\sf magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline ++\multicolumn{2}{|l|}{\sf l\_seqnm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline ++\multicolumn{2}{|l|}{\sf seqnm} & Concatenated names, {\tt NULL} padded & {\tt char[{\sf l\_seqnm}]} & \\\hline ++\multicolumn{2}{|l|}{\sf l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline ++\multicolumn{2}{|l|}{\sf smpl} & Concatenated sample names & {\tt char[{\sf l\_smpl}]} & \\\hline ++\multicolumn{2}{|l|}{\sf l\_meta} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline ++\multicolumn{2}{|l|}{\sf meta} & Meta text, {\tt NULL} terminated & {\tt char[{\sf l\_meta}]} & \\\hline \multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5} --& {\tt seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} --& {\tt pos} & Position & {\tt int32\_t} & \\\cline{2-5} --& {\tt qual} & Variant quality & {\tt float} & \\\cline{2-5} --& {\tt l\_str} & Length of str & {\tt int32\_t} & \\\cline{2-5} --& {\tt str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[slen]} &\\\cline{2-5} ++& {\sf seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5} ++& {\sf pos} & Position & {\tt int32\_t} & \\\cline{2-5} ++& {\sf qual} & Variant quality & {\tt float} & \\\cline{2-5} ++& {\sf l\_str} & Length of {\sf str} & {\tt int32\_t} & \\\cline{2-5} ++& {\sf str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[{\sf l\_str}]} &\\\cline{2-5} & \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\ \hline \end{tabular} \end{center} \begin{center} --\begin{tabular}{cll} ++\begin{tabular}{clp{9cm}} \hline \multicolumn{1}{l}{\bf Field} & 
\multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline {\tt DP} & {\tt uint16\_t[n]} & Read depth \\ --{\tt GL} & {\tt float[n*x]} & Log10 likelihood of data; $x=\frac{m(m+1)}{2}$, $m=\#\{alleles\}$\\ --{\tt GT} & {\tt uint8\_t[n]} & {\tt phase\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ ++{\tt GL} & {\tt float[n*G]} & Log10 likelihood of data; $G=\frac{A(A+1)}{2}$, $A=\#\{alleles\}$\\ ++{\tt GT} & {\tt uint8\_t[n]} & {\tt missing\char60\char60 7 | phased\char60\char60 6 | allele1\char60\char60 3 | allele2} \\ ++{\tt \_GT} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic GT; the first int equals the max ploidy $P$. If the highest bit is set, ++ the allele is not present (e.g. due to different ploidy between samples).} \\ {\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\ {\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\ --{\tt PL} & {\tt uint8\_t[n*x]} & {Phred-scaled likelihood of data}\\ --\emph{misc} & {\tt int32\_t+char*} & {\tt NULL} padded concatenated strings (integer equal to the length) \\ ++{\tt \_HQ} & {\tt uint8\_t+uint8\_t[n*P]} & {Generic HQ}\\ ++{\tt IBD} & {\tt uint32\_t[n*2]} & {IBD}\\ ++{\tt \_IBD} & {\tt uint8\_t+uint32\_t[n*P]} & {Generic IBD}\\ ++{\tt PL} & {\tt uint8\_t[n*G]} & {Phred-scaled likelihood of data}\\ ++{\tt PS} & {\tt uint32\_t[n]} & {Phase set}\\ ++%{\tt SP} & {\tt uint8\_t[n]} & {Strand bias P-value (bcftools only)}\\ ++\emph{Integer} & {\tt int32\_t[n*X]} & {Fix-sized custom Integer; $X$ defined in the header}\\ ++\emph{Numeric} & {\tt double[n*X]} & {Fix-sized custom Numeric}\\ ++\emph{String} & {\tt uint32\_t+char*} & {\tt NULL} padded concat. strings (int equals to the length) \\ \hline \end{tabular} \end{center} \begin{itemize} --\item The file is {\tt BGZF} compressed. --\item All integers are little-endian. ++\item A BCF file is in the {\tt BGZF} format. ++\item All multi-byte numbers are little-endian. \item In a string, a missing value `.' 
is an empty C string ``{\tt \char92 0}'' (not ``{\tt .\char92 0}'') \item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt -- CC,CT,CA,TT,TA,AA}. --\item {\tt GL} is an extension to and is backward compatible with the -- {\tt GL} genotype field in {\tt VCFv4.0}. ++ CC,CT,TT,CA,TA,AA} (NB: the ordering is different from the one in the original ++ BCF proposal). ++\item Predefined {\tt FORMAT} fields can be missing from VCF headers, but custom {\tt FORMAT} fields ++ are required to be explicitly defined in the headers. ++\item A {\tt FORMAT} field with its name starting with `{\tt \_}' is specific to BCF only. ++ It gives an alternative binary representation of the corresponding VCF field, in case ++ the default representation is unable to keep the genotype information, ++ for example, when the ploidy is not 2 or there are more than 8 alleles. \end{itemize} --\end{document} ++\end{document} diff --cc sam/bcftools/bcf2qcall.c index 8634c9e,8634c9e..a86bac2 --- a/sam/bcftools/bcf2qcall.c +++ b/sam/bcftools/bcf2qcall.c @@@ -77,8 -77,8 +77,8 @@@ int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b for (k = j = 0; k < 4; ++k) { for (l = k; l < 4; ++l) { int t, x = map[k], y = map[l]; -- if (x > y) t = x, x = y, y = t; -- g[j++] = p[x * b->n_alleles - x * (x-1) / 2 + (y - x)]; ++ if (x > y) t = x, x = y, y = t; // swap ++ g[j++] = p[y * (y+1) / 2 + x]; } } printf("%s\t%d\t%c", h->ns[b->tid], b->pos+1, *b->ref); diff --cc sam/bcftools/bcftools.1 index 6c7403b,6c7403b..0000000 deleted file mode 100644,100644 --- a/sam/bcftools/bcftools.1 +++ /dev/null @@@ -1,120 -1,120 +1,0 @@@ --.TH bcftools 1 "2 October 2010" "bcftools" "Bioinformatics tools" --.SH NAME --.PP --bcftools - Utilities for the Binary Call Format (BCF) and VCF. 
--.SH SYNOPSIS --.PP --bcftools index in.bcf --.PP --bcftools view in.bcf chr2:100-200 > out.vcf --.PP --bcftools view -vc in.bcf > out.vcf 2> out.afs -- --.SH DESCRIPTION --.PP --Bcftools is a toolkit for processing VCF/BCF files, calling variants and --estimating site allele frequencies and allele frequency spectrums. -- --.SH COMMANDS AND OPTIONS -- --.TP 10 --.B view --.B bcftools view --.RB [ \-cbuSAGgHvNQ ] --.RB [ \-1 --.IR nGroup1 ] --.RB [ \-l --.IR listFile ] --.RB [ \-t --.IR mutRate ] --.RB [ \-p --.IR varThres ] --.RB [ \-P --.IR prior ] --.I in.bcf --.RI [ region ] -- --Convert between BCF and VCF, call variant candidates and estimate allele --frequencies. -- --.B OPTIONS: --.RS --.TP 10 --.B -b --Output in the BCF format. The default is VCF. --.TP --.B -c --Call variants. --.TP --.B -v --Output variant sites only (force -c) --.TP --.B -g --Call per-sample genotypes at variant sites (force -c) --.TP --.B -u --Uncompressed BCF output (force -b). --.TP --.B -S --The input is VCF instead of BCF. --.TP --.B -A --Retain all possible alternate alleles at variant sites. By default, this --command discards unlikely alleles. --.TP --.B -G --Suppress all individual genotype information. --.TP --.B -H --Perform Hardy-Weiberg Equilibrium test. This will add computation time, sometimes considerably. --.TP --.B -N --Skip sites where the REF field is not A/C/G/T --.TP --.B -Q --Output the QCALL likelihood format --.TP --.B -f --Reference-free variant calling mode. In this mode, the prior will be --folded; a variant is called iff the sample(s) contains at least two --alleles; the QUAL field in the VCF/BCF output is changed accordingly. --.TP --.BI "-1 " INT --Number of group-1 samples. This option is used for dividing input into --two groups for comparing. A zero value disables this functionality. 
[0] --.TP --.BI "-l " FILE --List of sites at which information are outputted [all sites] --.TP --.BI "-t " FLOAT --Scaled muttion rate for variant calling [0.001] --.TP --.BI "-p " FLOAT --A site is considered to be a variant if P(ref|D) ++#include #include "bcf.h" #include "kstring.h" #include "khash.h" KHASH_MAP_INIT_STR(str2id, int) ++#ifdef _WIN32 ++#define srand48(x) srand(x) ++#define drand48() ((double)rand() / RAND_MAX) ++#endif ++ ++// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated... void *bcf_build_refhash(bcf_hdr_t *h) { khash_t(str2id) *hash; @@@ -27,6 -27,6 +35,16 @@@ void bcf_str2id_destroy(void *_hash if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. } ++void bcf_str2id_thorough_destroy(void *_hash) ++{ ++ khash_t(str2id) *hash = (khash_t(str2id)*)_hash; ++ khint_t k; ++ if (hash == 0) return; ++ for (k = 0; k < kh_end(hash); ++k) ++ if (kh_exist(hash, k)) free((char*)kh_key(hash, k)); ++ kh_destroy(str2id, hash); ++} ++ int bcf_str2id(void *_hash, const char *str) { khash_t(str2id) *hash = (khash_t(str2id)*)_hash; @@@ -51,8 -51,8 +69,9 @@@ int bcf_str2id_add(void *_hash, const c int bcf_shrink_alt(bcf1_t *b, int n) { char *p; -- int i, j, k, *z, n_smpl = b->n_smpl; ++ int i, j, k, n_smpl = b->n_smpl; if (b->n_alleles <= n) return -1; ++ // update ALT if (n > 1) { for (p = b->alt, k = 1; *p; ++p) if (*p == ',' && ++k == n) break; @@@ -61,10 -61,10 +80,7 @@@ ++p; memmove(p, b->flt, b->str + b->l_str - b->flt); b->l_str -= b->flt - p; -- z = alloca(sizeof(int) / 2 * n * (n+1)); -- for (i = k = 0; i < n; ++i) -- for (j = 0; j < n - i; ++j) -- z[k++] = i * b->n_alleles + j; ++ // update PL for (i = 0; i < b->n_gi; ++i) { bcf_ginfo_t *g = b->gi + i; if (g->fmt == bcf_str2int("PL", 2)) { @@@ -73,7 -73,7 +89,7 @@@ g->len = n * (n + 1) / 2; for (l = k = 0; l < n_smpl; ++l) { uint8_t *dl = d + l * x; -- for (j = 0; j < g->len; ++j) d[k++] = dl[z[j]]; ++ for (j = 0; j < g->len; ++j) d[k++] = 
dl[j]; } } // FIXME: to add GL } @@@ -107,3 -107,3 +123,268 @@@ int bcf_gl2pl(bcf1_t *b } return 0; } ++/* FIXME: this function will fail given AB:GTX:GT. BCFtools never ++ * produces such FMT, but others may do. */ ++int bcf_fix_gt(bcf1_t *b) ++{ ++ char *s; ++ int i; ++ uint32_t tmp; ++ bcf_ginfo_t gt; ++ // check the presence of the GT FMT ++ if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first ++ if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact ++ tmp = bcf_str2int("GT", 2); ++ for (i = 0; i < b->n_gi; ++i) ++ if (b->gi[i].fmt == tmp) break; ++ if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug... ++ gt = b->gi[i]; ++ // move GT to the first ++ for (; i > 0; --i) b->gi[i] = b->gi[i-1]; ++ b->gi[0] = gt; ++ memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt); ++ b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; ++ return 0; ++} ++ ++int bcf_fix_pl(bcf1_t *b) ++{ ++ int i; ++ uint32_t tmp; ++ uint8_t *PL, *swap; ++ bcf_ginfo_t *gi; ++ // pinpoint PL ++ tmp = bcf_str2int("PL", 2); ++ for (i = 0; i < b->n_gi; ++i) ++ if (b->gi[i].fmt == tmp) break; ++ if (i == b->n_gi) return 0; ++ // prepare ++ gi = b->gi + i; ++ PL = (uint8_t*)gi->data; ++ swap = alloca(gi->len); ++ // loop through individuals ++ for (i = 0; i < b->n_smpl; ++i) { ++ int k, l, x; ++ uint8_t *PLi = PL + i * gi->len; ++ memcpy(swap, PLi, gi->len); ++ for (k = x = 0; k < b->n_alleles; ++k) ++ for (l = k; l < b->n_alleles; ++l) ++ PLi[l*(l+1)/2 + k] = swap[x++]; ++ } ++ return 0; ++} ++ ++int bcf_smpl_covered(const bcf1_t *b) ++{ ++ int i, j, n = 0; ++ uint32_t tmp; ++ bcf_ginfo_t *gi; ++ // pinpoint PL ++ tmp = bcf_str2int("PL", 2); ++ for (i = 0; i < b->n_gi; ++i) ++ if (b->gi[i].fmt == tmp) break; ++ if (i == b->n_gi) return 0; ++ // count how many samples having PL!=[0..0] ++ gi = b->gi + i; ++ for (i = 0; i < b->n_smpl; ++i) { ++ uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len; ++ for (j = 0; j < gi->len; ++j) ++ if (PLi[j]) break; ++ if (j < 
gi->len) ++n; ++ } ++ return n; ++} ++ ++static void *locate_field(const bcf1_t *b, const char *fmt, int l) ++{ ++ int i; ++ uint32_t tmp; ++ tmp = bcf_str2int(fmt, l); ++ for (i = 0; i < b->n_gi; ++i) ++ if (b->gi[i].fmt == tmp) break; ++ return i == b->n_gi? 0 : b->gi[i].data; ++} ++ ++int bcf_anno_max(bcf1_t *b) ++{ ++ int k, max_gq, max_sp, n_het; ++ kstring_t str; ++ uint8_t *gt, *gq; ++ int32_t *sp; ++ max_gq = max_sp = n_het = 0; ++ gt = locate_field(b, "GT", 2); ++ if (gt == 0) return -1; ++ gq = locate_field(b, "GQ", 2); ++ sp = locate_field(b, "SP", 2); ++ if (sp) ++ for (k = 0; k < b->n_smpl; ++k) ++ if (gt[k]&0x3f) ++ max_sp = max_sp > (int)sp[k]? max_sp : sp[k]; ++ if (gq) ++ for (k = 0; k < b->n_smpl; ++k) ++ if (gt[k]&0x3f) ++ max_gq = max_gq > (int)gq[k]? max_gq : gq[k]; ++ for (k = 0; k < b->n_smpl; ++k) { ++ int a1, a2; ++ a1 = gt[k]&7; a2 = gt[k]>>3&7; ++ if ((!a1 && a2) || (!a2 && a1)) { // a het ++ if (gq == 0) ++n_het; ++ else if (gq[k] >= 20) ++n_het; ++ } ++ } ++ if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499); ++ if (max_sp < 0) max_sp = 0; ++ memset(&str, 0, sizeof(kstring_t)); ++ if (*b->info) kputc(';', &str); ++ ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq); ++ bcf_append_info(b, str.s, str.l); ++ free(str.s); ++ return 0; ++} ++ ++// FIXME: only data are shuffled; the header is NOT ++int bcf_shuffle(bcf1_t *b, int seed) ++{ ++ int i, j, *a; ++ if (seed > 0) srand48(seed); ++ a = malloc(b->n_smpl * sizeof(int)); ++ for (i = 0; i < b->n_smpl; ++i) a[i] = i; ++ for (i = b->n_smpl; i > 1; --i) { ++ int tmp; ++ j = (int)(drand48() * i); ++ tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; ++ } ++ for (j = 0; j < b->n_gi; ++j) { ++ bcf_ginfo_t *gi = b->gi + j; ++ uint8_t *swap, *data = (uint8_t*)gi->data; ++ swap = malloc(gi->len * b->n_smpl); ++ for (i = 0; i < b->n_smpl; ++i) ++ memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len); ++ free(gi->data); ++ gi->data = swap; ++ } ++ free(a); ++ return 0; ++} ++ ++bcf_hdr_t 
*bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list) ++{ ++ int i, ret, j; ++ khint_t k; ++ bcf_hdr_t *h; ++ khash_t(str2id) *hash; ++ kstring_t s; ++ s.l = s.m = 0; s.s = 0; ++ hash = kh_init(str2id); ++ for (i = 0; i < h0->n_smpl; ++i) { ++ k = kh_put(str2id, hash, h0->sns[i], &ret); ++ kh_val(hash, k) = i; ++ } ++ for (i = j = 0; i < n; ++i) { ++ k = kh_get(str2id, hash, samples[i]); ++ if (k != kh_end(hash)) { ++ list[j++] = kh_val(hash, k); ++ kputs(samples[i], &s); kputc('\0', &s); ++ } ++ } ++ if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); ++ kh_destroy(str2id, hash); ++ h = calloc(1, sizeof(bcf_hdr_t)); ++ *h = *h0; ++ h->ns = 0; h->sns = 0; ++ h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm); ++ h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt); ++ h->l_smpl = s.l; h->sname = s.s; ++ bcf_hdr_sync(h); ++ return h; ++} ++ ++int bcf_subsam(int n_smpl, int *list, bcf1_t *b) ++{ ++ int i, j; ++ for (j = 0; j < b->n_gi; ++j) { ++ bcf_ginfo_t *gi = b->gi + j; ++ uint8_t *swap; ++ swap = malloc(gi->len * b->n_smpl); ++ for (i = 0; i < n_smpl; ++i) ++ memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); ++ free(gi->data); ++ gi->data = swap; ++ } ++ b->n_smpl = n_smpl; ++ return 0; ++} ++ ++static int8_t nt4_table[128] = { ++ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, ++ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, ++ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, ++ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4 ++}; ++ ++int bcf_gl10(const bcf1_t *b, uint8_t *gl) ++{ ++ int a[4], k, l, map[4], k1, j, i; ++ const bcf_ginfo_t *PL; ++ char *s; ++ if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base 
or >4 alleles ++ for (i = 0; i < b->n_gi; ++i) ++ if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; ++ if (i == b->n_gi) return -1; // no PL ++ PL = b->gi + i; ++ a[0] = nt4_table[(int)b->ref[0]]; ++ if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T ++ a[1] = a[2] = a[3] = -2; // -1 has a special meaning ++ if (b->alt[0] == 0) return -1; // no alternate allele ++ map[0] = map[1] = map[2] = map[3] = -2; ++ map[a[0]] = 0; ++ for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) { ++ if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base ++ a[k+1] = nt4_table[(int)*s]; ++ if (a[k+1] >= 0) map[a[k+1]] = k+1; ++ else k1 = k + 1; ++ if (s[1] == 0) break; // the end of the ALT string ++ } ++ for (k = 0; k < 4; ++k) ++ if (map[k] < 0) map[k] = k1; ++ for (i = 0; i < b->n_smpl; ++i) { ++ const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual ++ uint8_t *g = gl + 10 * i; ++ for (k = j = 0; k < 4; ++k) { ++ for (l = k; l < 4; ++l) { ++ int t, x = map[k], y = map[l]; ++ if (x > y) t = x, x = y, y = t; // make sure x is the smaller ++ g[j++] = p[y * (y+1) / 2 + x]; ++ } ++ } ++ } ++ return 0; ++} ++ ++int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl) ++{ ++ int k, l, j, i; ++ const bcf_ginfo_t *PL; ++ if (b->alt[0] == 0) return -1; // no alternate allele ++ for (i = 0; i < b->n_gi; ++i) ++ if (b->gi[i].fmt == bcf_str2int("PL", 2)) break; ++ if (i == b->n_gi) return -1; // no PL ++ PL = b->gi + i; ++ for (i = 0; i < b->n_smpl; ++i) { ++ const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual ++ uint8_t *g = gl + 10 * i; ++ for (k = j = 0; k < 4; ++k) { ++ for (l = k; l < 4; ++l) { ++ int t, x = k, y = l; ++ if (x > y) t = x, x = y, y = t; // make sure x is the smaller ++ x = y * (y+1) / 2 + x; ++ g[j++] = x < PL->len? 
p[x] : 255; ++ } ++ } ++ } ++ return 0; ++} diff --cc sam/bcftools/call1.c index f293a6c,f293a6c..3cc4649 --- a/sam/bcftools/call1.c +++ b/sam/bcftools/call1.c @@@ -6,9 -6,9 +6,12 @@@ #include "bcf.h" #include "prob1.h" #include "kstring.h" ++#include "time.h" --#include "khash.h" --KHASH_SET_INIT_INT64(set64) ++#ifdef _WIN32 ++#define srand48(x) srand(x) ++#define lrand48() rand() ++#endif #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@@ -19,70 -19,70 +22,30 @@@ #define VC_VARONLY 16 #define VC_VCFIN 32 #define VC_UNCOMP 64 --#define VC_HWE 128 #define VC_KEEPALT 256 #define VC_ACGT_ONLY 512 #define VC_QCALL 1024 #define VC_CALL_GT 2048 #define VC_ADJLD 4096 #define VC_NO_INDEL 8192 --#define VC_FOLDED 16384 ++#define VC_ANNO_MAX 16384 ++#define VC_FIX_PL 32768 ++#define VC_EM 0x10000 ++#define VC_PAIRCALL 0x20000 ++#define VC_QCNT 0x40000 typedef struct { -- int flag, prior_type, n1; -- char *fn_list, *prior_file; -- double theta, pref, indel_frac; ++ int flag, prior_type, n1, n_sub, *sublist, n_perm; ++ uint32_t *trio_aux; ++ char *prior_file, **subsam, *fn_dict; ++ uint8_t *ploidy; ++ double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt; ++ void *bed; } viewconf_t; --khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) --{ -- void *str2id; -- gzFile fp; -- kstream_t *ks; -- int ret, dret, lineno = 1; -- kstring_t *str; -- khash_t(set64) *hash = 0; -- -- hash = kh_init(set64); -- str2id = bcf_build_refhash(_h); -- str = calloc(1, sizeof(kstring_t)); -- fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); -- ks = ks_init(fp); -- while (ks_getuntil(ks, 0, str, &dret) >= 0) { -- int tid = bcf_str2id(str2id, str->s); -- if (tid >= 0 && dret != '\n') { -- if (ks_getuntil(ks, 0, str, &dret) >= 0) { -- uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); -- kh_put(set64, hash, x, &ret); -- } else break; -- } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno); -- if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); -- if (dret < 0) break; -- ++lineno; -- } -- bcf_str2id_destroy(str2id); -- ks_destroy(ks); -- gzclose(fp); -- free(str->s); free(str); -- return hash; --} -- --static double test_hwe(const double g[3]) --{ -- extern double kf_gammaq(double p, double x); -- double fexp, chi2, f[3], n; -- int i; -- n = g[0] + g[1] + g[2]; -- fexp = (2. * g[2] + g[1]) / (2. * n); -- if (fexp > 1. - 1e-10) fexp = 1. - 1e-10; -- if (fexp < 1e-10) fexp = 1e-10; -- f[0] = n * (1. - fexp) * (1. - fexp); -- f[1] = n * 2. * fexp * (1. - fexp); -- f[2] = n * fexp * fexp; -- for (i = 0, chi2 = 0.; i < 3; ++i) -- chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i]; -- return kf_gammaq(.5, chi2 / 2.); --} ++void *bed_read(const char *fn); ++void bed_destroy(void *_h); ++int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { double p[4]; @@@ -147,37 -147,37 +110,70 @@@ static void rm_info(bcf1_t *b, const ch bcf_sync(b); } --static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) ++static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt) { kstring_t s; -- int is_var = (pr->p_ref < pref); -- double p_hwe, r = is_var? pr->p_ref : 1. - pr->p_ref; ++ int has_I16, is_var; ++ double fq, r; anno16_t a; -- p_hwe = pr->g[0] >= 0.? 
test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated -- test16(b, &a); -- rm_info(b, "I16="); ++ has_I16 = test16(b, &a) >= 0? 1 : 0; ++ rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); kputs(b->info, &s); if (b->info[0]) kputc(';', &s); --// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); -- ksprintf(&s, "AF1=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, pr->cil, pr->cih); -- ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); -- if (a.is_tested) { -- if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%lg,%lg,%lg,%lg", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]); -- ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]); ++ { // print EM ++ if (em[0] >= 0) ksprintf(&s, "AF1=%.4g", 1 - em[0]); ++ if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]); ++ if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]); ++ if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]); ++ if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]); ++ } ++ if (cons_llr > 0) { ++ ksprintf(&s, ";CLR=%d", cons_llr); ++ if (cons_gt > 0) ++ ksprintf(&s, ";UGT=%c%c%c;CGT=%c%c%c", cons_gt&0xff, cons_gt>>8&0xff, cons_gt>>16&0xff, ++ cons_gt>>32&0xff, cons_gt>>40&0xff, cons_gt>>48&0xff); } -- if (pr->g[0] >= 0. && p_hwe <= .2) -- ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe); ++ if (pr == 0) { // if pr is unset, return ++ kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); ++ free(b->str); ++ b->m_str = s.m; b->l_str = s.l; b->str = s.s; ++ bcf_sync(b); ++ return 1; ++ } ++ ++ is_var = (pr->p_ref < pref); ++ r = is_var? 
pr->p_ref : pr->p_var; ++ ++// ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted! ++ ksprintf(&s, ";AC1=%d", pr->ac); ++ if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); ++ fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); ++ if (fq < -999) fq = -999; ++ if (fq > 999) fq = 999; ++ ksprintf(&s, ";FQ=%.3g", fq); ++ if (pr->cmp[0] >= 0.) { // two sample groups ++ int i, q[3]; ++ for (i = 1; i < 3; ++i) { ++ double x = pr->cmp[i] + pr->cmp[0]/2.; ++ q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499); ++ if (q[i] > 255) q[i] = 255; ++ } ++ if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank); ++ // ksprintf(&s, ";LRT3=%.3g", pr->lrt); ++ ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", q[1], q[2], pr->p_chi2); ++ } ++ if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; -- b->qual = r < 1e-100? 99 : -4.343 * log(r); -- if (b->qual > 99) b->qual = 99; ++ b->qual = r < 1e-100? 999 : -4.343 * log(r); ++ if (b->qual > 999) b->qual = 999; bcf_sync(b); if (!is_var) bcf_shrink_alt(b, 1); else if (!(flag&VC_KEEPALT)) @@@ -189,7 -189,7 +185,7 @@@ b->m_str = s.m; b->l_str = s.l; b->str = s.s; bcf_sync(b); for (i = 0; i < b->n_smpl; ++i) { -- x = bcf_p1_call_gt(pa, pr->f_em, i); ++ x = bcf_p1_call_gt(pa, pr->f_exp, i); ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 
1 : 0; ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2; } @@@ -197,47 -197,47 +193,174 @@@ return is_var; } --double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); ++static char **read_samples(const char *fn, int *_n) ++{ ++ gzFile fp; ++ kstream_t *ks; ++ kstring_t s; ++ int dret, n = 0, max = 0; ++ char **sam = 0; ++ *_n = 0; ++ s.l = s.m = 0; s.s = 0; ++ fp = gzopen(fn, "r"); ++ if (fp == 0) return 0; // fail to open file ++ ks = ks_init(fp); ++ while (ks_getuntil(ks, 0, &s, &dret) >= 0) { ++ int l; ++ if (max == n) { ++ max = max? max<<1 : 4; ++ sam = realloc(sam, sizeof(void*)*max); ++ } ++ l = s.l; ++ sam[n] = malloc(s.l + 2); ++ strcpy(sam[n], s.s); ++ sam[n][l+1] = 2; // by default, diploid ++ if (dret != '\n') { ++ if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2 ++ int x = (int)s.s[0] - '0'; ++ if (x == 1 || x == 2) sam[n][l+1] = x; ++ else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__); ++ } ++ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); ++ } ++ ++n; ++ } ++ ks_destroy(ks); ++ gzclose(fp); ++ free(s.s); ++ *_n = n; ++ return sam; ++} ++ ++static void write_header(bcf_hdr_t *h) ++{ ++ kstring_t str; ++ str.l = h->l_txt? 
h->l_txt - 1 : 0; ++ str.m = str.l + 1; str.s = h->txt; ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++// if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##INFO=\n", &str); ++ if (!strstr(str.s, "##FORMAT=\n", &str); ++ if (!strstr(str.s, "##FORMAT=\n", &str); ++ if (!strstr(str.s, "##FORMAT=\n", &str); ++ if (!strstr(str.s, "##FORMAT=\n", &str); ++ if (!strstr(str.s, "##FORMAT=\n", &str); ++ if (!strstr(str.s, "##FORMAT=\n", &str); ++ h->l_txt = str.l + 1; h->txt = str.s; ++} ++ ++double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); int bcfview(int argc, char *argv[]) { extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b); extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x); ++ extern int bcf_fix_gt(bcf1_t *b); ++ extern int bcf_anno_max(bcf1_t *b); ++ extern int bcf_shuffle(bcf1_t *b, int seed); ++ extern uint32_t *bcf_trio_prep(int is_x, int is_son); ++ extern int bcf_trio_call(uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt); ++ extern int bcf_pair_call(const bcf1_t *b); ++ extern int bcf_min_diff(const bcf1_t *b); ++ bcf_t *bp, *bout = 0; bcf1_t *b, *blast; -- int c; -- uint64_t n_processed = 0; ++ int c, *seeds = 0; ++ uint64_t n_processed = 0, qcnt[256]; viewconf_t vc; bcf_p1aux_t *p1 = 0; -- bcf_hdr_t *h; ++ bcf_hdr_t *hin, *hout; int tid, begin, end; char 
moder[4], modew[4]; -- khash_t(set64) *hash = 0; tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); -- vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; -- while ((c = getopt(argc, argv, "fN1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) { ++ vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; ++ memset(qcnt, 0, 8 * 256); ++ while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Y")) >= 0) { switch (c) { -- case 'f': vc.flag |= VC_FOLDED; break; case '1': vc.n1 = atoi(optarg); break; -- case 'l': vc.fn_list = strdup(optarg); break; ++ case 'l': vc.bed = bed_read(optarg); break; ++ case 'D': vc.fn_dict = strdup(optarg); break; ++ case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; case 'G': vc.flag |= VC_NO_GENO; break; case 'A': vc.flag |= VC_KEEPALT; break; case 'b': vc.flag |= VC_BCFOUT; break; case 'S': vc.flag |= VC_VCFIN; break; case 'c': vc.flag |= VC_CALL; break; ++ case 'e': vc.flag |= VC_EM; break; case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; -- case 'H': vc.flag |= VC_HWE; break; case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 'I': vc.flag |= VC_NO_INDEL; break; ++ case 'M': vc.flag |= VC_ANNO_MAX; break; ++ case 'Y': vc.flag |= VC_QCNT; break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; case 'i': vc.indel_frac = atof(optarg); break; case 'Q': vc.flag |= VC_QCALL; break; case 'L': vc.flag |= VC_ADJLD; break; ++ case 'U': vc.n_perm = atoi(optarg); break; ++ case 'C': vc.min_lrt = atof(optarg); break; ++ case 'X': vc.min_perm_p = atof(optarg); break; ++ case 'd': vc.min_smpl_frac = atof(optarg); break; ++ case 's': vc.subsam = read_samples(optarg, &vc.n_sub); ++ vc.ploidy = calloc(vc.n_sub + 1, 1); ++ for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = 
vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; ++ tid = -1; ++ break; ++ case 'T': ++ if (strcmp(optarg, "trioauto") == 0) vc.trio_aux = bcf_trio_prep(0, 0); ++ else if (strcmp(optarg, "trioxd") == 0) vc.trio_aux = bcf_trio_prep(1, 0); ++ else if (strcmp(optarg, "trioxs") == 0) vc.trio_aux = bcf_trio_prep(1, 1); ++ else if (strcmp(optarg, "pair") == 0) vc.flag |= VC_PAIRCALL; ++ else { ++ fprintf(stderr, "[%s] Option '-T' can only take value trioauto, trioxd or trioxs.\n", __func__); ++ return 1; ++ } ++ break; case 'P': if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; @@@ -248,31 -248,31 +371,52 @@@ } if (argc == optind) { fprintf(stderr, "\n"); -- fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); -- fprintf(stderr, "Options: -c SNP calling\n"); -- fprintf(stderr, " -v output potential variant sites only (force -c)\n"); -- fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); -- fprintf(stderr, " -b output BCF instead of VCF\n"); -- fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); -- fprintf(stderr, " -S input is VCF\n"); -- fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); -- fprintf(stderr, " -G suppress all individual genotype information\n"); -- fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n"); -- fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); -- fprintf(stderr, " -Q output the QCALL likelihood format\n"); -- fprintf(stderr, " -L calculate LD for adjacent sites\n"); -- fprintf(stderr, " -I skip indels\n"); -- fprintf(stderr, " -f reference-free variant calling\n"); -- fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); -- fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); -- fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4lg]\n", vc.theta); -- fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4lg]\n", vc.indel_frac); -- fprintf(stderr, " 
-p FLOAT variant if P(ref|D) [reg]\n\n"); ++ fprintf(stderr, "Input/output options:\n\n"); ++ fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); ++ fprintf(stderr, " -b output BCF instead of VCF\n"); ++ fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); ++ fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n"); ++ fprintf(stderr, " -G suppress all individual genotype information\n"); ++ fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n"); ++ fprintf(stderr, " -L calculate LD for adjacent sites\n"); ++ fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); ++ fprintf(stderr, " -Q output the QCALL likelihood format\n"); ++ fprintf(stderr, " -s FILE list of samples to use [all samples]\n"); ++ fprintf(stderr, " -S input is VCF\n"); ++ fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); ++ fprintf(stderr, "\nConsensus/variant calling options:\n\n"); ++ fprintf(stderr, " -c SNP calling (force -e)\n"); ++ fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); ++ fprintf(stderr, " -e likelihood based analyses\n"); ++ fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); ++ fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); ++ fprintf(stderr, " -I skip indels\n"); ++ fprintf(stderr, " -p FLOAT variant if P(ref|D)BCF conversion please specify the sequence dictionary with -D\n", __func__); ++ return 1; ++ } ++ if (vc.n1 <= 0) vc.n_perm = 0; // TODO: give a warning here! 
++ if (vc.n_perm > 0) { ++ seeds = malloc(vc.n_perm * sizeof(int)); ++ srand48(time(0)); ++ for (c = 0; c < vc.n_perm; ++c) seeds[c] = lrand48(); ++ } b = calloc(1, sizeof(bcf1_t)); blast = calloc(1, sizeof(bcf1_t)); strcpy(moder, "r"); @@@ -281,27 -281,27 +425,34 @@@ if (vc.flag & VC_BCFOUT) strcat(modew, "b"); if (vc.flag & VC_UNCOMP) strcat(modew, "u"); bp = vcf_open(argv[optind], moder); -- h = vcf_hdr_read(bp); ++ hin = hout = vcf_hdr_read(bp); ++ if (vc.fn_dict && (vc.flag & VC_VCFIN)) ++ vcf_dictread(bp, hin, vc.fn_dict); bout = vcf_open("-", modew); -- if (!(vc.flag & VC_QCALL)) vcf_hdr_write(bout, h); ++ if (!(vc.flag & VC_QCALL)) { ++ if (vc.n_sub) { ++ vc.sublist = calloc(vc.n_sub, sizeof(int)); ++ hout = bcf_hdr_subsam(hin, vc.n_sub, vc.subsam, vc.sublist); ++ } ++ if (vc.flag & VC_CALL) write_header(hout); ++ vcf_hdr_write(bout, hout); ++ } if (vc.flag & VC_CALL) { -- p1 = bcf_p1_init(h->n_smpl); ++ p1 = bcf_p1_init(hout->n_smpl, vc.ploidy); if (vc.prior_file) { if (bcf_p1_read_prior(p1, vc.prior_file) < 0) { fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__); return 1; } } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta); -- if (vc.n1 > 0) { ++ if (vc.n1 > 0 && vc.min_lrt > 0.) { // set n1 bcf_p1_set_n1(p1, vc.n1); bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); } if (vc.indel_frac > 0.) 
bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac -- if (vc.flag & VC_FOLDED) bcf_p1_set_folded(p1); } -- if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { -- void *str2id = bcf_build_refhash(h); ++ void *str2id = bcf_build_refhash(hout); if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { bcf_idx_t *idx; idx = bcf_idx_load(argv[optind]); @@@ -317,8 -317,8 +468,19 @@@ } } } -- while (vcf_read(bp, h, b) > 0) { -- int is_indel = bcf_is_indel(b); ++ while (vcf_read(bp, hin, b) > 0) { ++ int is_indel, cons_llr = -1; ++ int64_t cons_gt = -1; ++ double em[10]; ++ if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue; ++ if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) { ++ extern int bcf_smpl_covered(const bcf1_t *b); ++ int n = bcf_smpl_covered(b); ++ if ((double)n / b->n_smpl < vc.min_smpl_frac) continue; ++ } ++ if (vc.n_sub) bcf_subsam(vc.n_sub, vc.sublist, b); ++ if (vc.flag & VC_FIX_PL) bcf_fix_pl(b); ++ is_indel = bcf_is_indel(b); if ((vc.flag & VC_NO_INDEL) && is_indel) continue; if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { int x; @@@ -326,13 -326,13 +488,7 @@@ x = toupper(b->ref[0]); if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue; } -- if (hash) { -- uint64_t x = (uint64_t)b->tid<<32 | b->pos; -- khint_t k = kh_get(set64, hash, x); -- if (kh_size(hash) == 0) break; -- if (k == kh_end(hash)) continue; -- kh_del(set64, hash, k); -- } ++ if (vc.bed && !bed_overlap(vc.bed, hin->ns[b->tid], b->pos, b->pos + strlen(b->ref))) continue; if (tid >= 0) { int l = strlen(b->ref); l = b->pos + (l > 0? 
l : 1); @@@ -340,47 -340,47 +496,91 @@@ if (!(l > begin && end > b->pos)) continue; } ++n_processed; ++ if ((vc.flag & VC_QCNT) && !is_indel) { // summarize the difference ++ int x = bcf_min_diff(b); ++ if (x > 255) x = 255; ++ if (x >= 0) ++qcnt[x]; ++ } if (vc.flag & VC_QCALL) { // output QCALL format; STOP here -- bcf_2qcall(h, b); ++ bcf_2qcall(hout, b); continue; } -- if (vc.flag & (VC_CALL|VC_ADJLD)) bcf_gl2pl(b); ++ if (vc.trio_aux) // do trio calling ++ bcf_trio_call(vc.trio_aux, b, &cons_llr, &cons_gt); ++ else if (vc.flag & VC_PAIRCALL) ++ cons_llr = bcf_pair_call(b); ++ if (vc.flag & (VC_CALL|VC_ADJLD|VC_EM)) bcf_gl2pl(b); ++ if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0x1ff, em); ++ else { ++ int i; ++ for (i = 0; i < 9; ++i) em[i] = -1.; ++ } if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; -- bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here -- if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g); ++ int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); if (n_processed % 100000 == 0) { fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); bcf_p1_dump_afs(p1); } if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; -- update_bcf1(h->n_smpl, b, p1, &pr, vc.pref, vc.flag); -- } ++ if (vc.n_perm && vc.n1 > 0 && pr.p_chi2 < vc.min_perm_p) { // permutation test ++ bcf_p1rst_t r; ++ int i, n = 0; ++ for (i = 0; i < vc.n_perm; ++i) { ++#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts ++ double x[10]; ++ bcf_shuffle(b, seeds[i]); ++ bcf_em1(b, vc.n1, 1<<7, x); ++ if (x[7] < em[7]) ++n; ++#else ++ bcf_shuffle(b, seeds[i]); ++ bcf_p1_cal(b, 1, p1, &r); ++ if (pr.p_chi2 >= r.p_chi2) ++n; ++#endif ++ } ++ pr.perm_rank = n; ++ } ++ if (calret >= 0) update_bcf1(b, p1, &pr, vc.pref, vc.flag, em, cons_llr, cons_gt); ++ } else if (vc.flag & VC_EM) update_bcf1(b, 0, 0, 0, vc.flag, em, cons_llr, cons_gt); if (vc.flag & VC_ADJLD) { // compute LD double f[4], r2; -- if ((r2 = 
bcf_ld_freq(blast, b, f)) >= 0) { ++ if ((r2 = bcf_pair_freq(blast, b, f)) >= 0) { kstring_t s; s.m = s.l = 0; s.s = 0; if (*b->info) kputc(';', &s); -- ksprintf(&s, "NEIR=%.3lf;NEIF=%.3lf,%.3lf", r2, f[0]+f[2], f[0]+f[1]); ++ ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]); bcf_append_info(b, s.s, s.l); free(s.s); } bcf_cpy(blast, b); } ++ if (vc.flag & VC_ANNO_MAX) bcf_anno_max(b); if (vc.flag & VC_NO_GENO) { // do not output GENO fields b->n_gi = 0; b->fmt[0] = '\0'; -- } -- vcf_write(bout, h, b); ++ b->l_str = b->fmt - b->str + 1; ++ } else bcf_fix_gt(b); ++ vcf_write(bout, hout, b); } if (vc.prior_file) free(vc.prior_file); if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); -- bcf_hdr_destroy(h); ++ if (hin != hout) bcf_hdr_destroy(hout); ++ bcf_hdr_destroy(hin); bcf_destroy(b); bcf_destroy(blast); vcf_close(bp); vcf_close(bout); -- if (hash) kh_destroy(set64, hash); -- if (vc.fn_list) free(vc.fn_list); ++ if (vc.fn_dict) free(vc.fn_dict); ++ if (vc.ploidy) free(vc.ploidy); ++ if (vc.trio_aux) free(vc.trio_aux); ++ if (vc.n_sub) { ++ int i; ++ for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); ++ free(vc.subsam); free(vc.sublist); ++ } ++ if (vc.bed) bed_destroy(vc.bed); ++ if (vc.flag & VC_QCNT) ++ for (c = 0; c < 256; ++c) ++ fprintf(stderr, "QT\t%d\t%lld\n", c, (long long)qcnt[c]); ++ if (seeds) free(seeds); if (p1) bcf_p1_destroy(p1); return 0; } diff --cc sam/bcftools/fet.c index 845f8c2,845f8c2..5812517 --- a/sam/bcftools/fet.c +++ b/sam/bcftools/fet.c @@@ -64,7 -64,7 +64,8 @@@ double kt_fisher_exact(int n11, int n12 n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail -- min = (n1_ + n_1 - n < 0) ? 
0 : (n1_ + n_1 - n < 0); // min n11, for left tail ++ min = n1_ + n_1 - n; ++ if (min < 0) min = 0; // min n11, for left tail *two = *_left = *_right = 1.; if (min == max) return 1.; // no need to do test q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table @@@ -79,6 -79,6 +80,7 @@@ p = hypergeo_acc(max, 0, 0, 0, &aux); for (right = 0., j = max - 1; p < 0.99999999 * q; --j) // loop until underflow right += p, p = hypergeo_acc(j, 0, 0, 0, &aux); ++ ++j; if (p < 1.00000001 * q) right += p; else ++j; // two-tail diff --cc sam/bcftools/ld.c index dc84d4b,dc84d4b..0000000 deleted file mode 100644,100644 --- a/sam/bcftools/ld.c +++ /dev/null @@@ -1,100 -1,100 +1,0 @@@ --#include --#include --#include --#include "bcf.h" -- --static double g_q2p[256]; -- --#define LD_ITER_MAX 50 --#define LD_ITER_EPS 1e-4 -- --#define _G1(h, k) ((h>>1&1) + (k>>1&1)) --#define _G2(h, k) ((h&1) + (k&1)) -- --// 0: the previous site; 1: the current site --static int freq_iter(int n, double *pdg[2], double f[4]) --{ -- double ff[4]; -- int i, k, h; -- memset(ff, 0, 4 * sizeof(double)); -- for (i = 0; i < n; ++i) { -- double *p[2], sum, tmp; -- p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3; -- for (k = 0, sum = 0.; k < 4; ++k) -- for (h = 0; h < 4; ++h) -- sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)]; -- for (k = 0; k < 4; ++k) { -- tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)]) -- + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)]) -- + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)]) -- + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]); -- ff[k] += f[k] * tmp / sum; -- } -- } -- for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n); -- return 0; --} -- --double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]) --{ -- const bcf1_t *b[2]; -- uint8_t *PL[2]; -- int i, j, PL_len[2], n_smpl; -- double *pdg[2], flast[4], r; -- // initialize g_q2p if 
necessary -- if (g_q2p[0] == 0.) -- for (i = 0; i < 256; ++i) -- g_q2p[i] = pow(10., -i / 10.); -- // initialize others -- if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples -- n_smpl = b0->n_smpl; -- b[0] = b0; b[1] = b1; -- f[0] = f[1] = f[2] = f[3] = -1.; -- if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only -- // set PL and PL_len -- for (j = 0; j < 2; ++j) { -- const bcf1_t *bj = b[j]; -- for (i = 0; i < bj->n_gi; ++i) { -- if (bj->gi[i].fmt == bcf_str2int("PL", 2)) { -- PL[j] = (uint8_t*)bj->gi[i].data; -- PL_len[j] = bj->gi[i].len; -- break; -- } -- } -- if (i == bj->n_gi) return -1; // no PL -- } -- // fill pdg[2] -- pdg[0] = malloc(3 * n_smpl * sizeof(double)); -- pdg[1] = malloc(3 * n_smpl * sizeof(double)); -- for (j = 0; j < 2; ++j) { -- for (i = 0; i < n_smpl; ++i) { -- const uint8_t *pi = PL[j] + i * PL_len[j]; -- double *p = pdg[j] + i * 3; -- p[0] = g_q2p[pi[b[j]->n_alleles]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]]; -- } -- } -- // iteration -- f[0] = f[1] = f[2] = f[3] = 0.25; // this is a really bad guess... 
-- for (j = 0; j < LD_ITER_MAX; ++j) { -- double eps = 0; -- memcpy(flast, f, 4 * sizeof(double)); -- freq_iter(n_smpl, pdg, f); -- for (i = 0; i < 4; ++i) { -- double x = fabs(f[i] - flast[i]); -- if (x > eps) eps = x; -- } -- if (eps < LD_ITER_EPS) break; -- } -- // free -- free(pdg[0]); free(pdg[1]); -- { // calculate r^2 -- double p[2], q[2], D; -- p[0] = f[0] + f[1]; q[0] = 1 - p[0]; -- p[1] = f[0] + f[2]; q[1] = 1 - p[1]; -- D = f[0] * f[3] - f[1] * f[2]; -- r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1])); -- // fprintf(stderr, "R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r2); -- if (isnan(r)) r = -1.; -- } -- return r; --} diff --cc sam/bcftools/main.c index 7ffc2a0,7ffc2a0..fcd94b8 --- a/sam/bcftools/main.c +++ b/sam/bcftools/main.c @@@ -1,8 -1,8 +1,12 @@@ #include #include #include ++#include #include "bcf.h" ++#include "kseq.h" ++KSTREAM_INIT(gzFile, gzread, 0x10000) ++ int bcfview(int argc, char *argv[]); int bcf_main_index(int argc, char *argv[]); @@@ -42,20 -42,20 +46,142 @@@ int bcf_cat(int n, char * const *fn return 0; } ++extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); ++ ++int bcf_main_ldpair(int argc, char *argv[]) ++{ ++ bcf_t *fp; ++ bcf_hdr_t *h; ++ bcf1_t *b0, *b1; ++ bcf_idx_t *idx; ++ kstring_t str; ++ void *str2id; ++ gzFile fplist; ++ kstream_t *ks; ++ int dret, lineno = 0; ++ if (argc < 3) { ++ fprintf(stderr, "Usage: bcftools ldpair \n"); ++ return 1; ++ } ++ fplist = gzopen(argv[2], "rb"); ++ ks = ks_init(fplist); ++ memset(&str, 0, sizeof(kstring_t)); ++ fp = bcf_open(argv[1], "rb"); ++ h = bcf_hdr_read(fp); ++ str2id = bcf_build_refhash(h); ++ idx = bcf_idx_load(argv[1]); ++ if (idx == 0) { ++ fprintf(stderr, "[%s] No bcf index is found. 
Abort!\n", __func__); ++ return 1; ++ } ++ b0 = calloc(1, sizeof(bcf1_t)); ++ b1 = calloc(1, sizeof(bcf1_t)); ++ while (ks_getuntil(ks, '\n', &str, &dret) >= 0) { ++ char *p, *q; ++ int k; ++ int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1; ++ ++lineno; ++ for (p = q = str.s, k = 0; *p; ++p) { ++ if (*p == ' ' || *p == '\t') { ++ *p = '\0'; ++ if (k == 0) tid0 = bcf_str2id(str2id, q); ++ else if (k == 1) pos0 = atoi(q) - 1; ++ else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0; ++ else if (k == 3) pos1 = atoi(q) - 1; ++ q = p + 1; ++ ++k; ++ } ++ } ++ if (k == 3) pos1 = atoi(q) - 1; ++ if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) { ++ uint64_t off; ++ double r, f[4]; ++ off = bcf_idx_query(idx, tid0, pos0); ++ bgzf_seek(fp->fp, off, SEEK_SET); ++ while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0); ++ off = bcf_idx_query(idx, tid1, pos1); ++ bgzf_seek(fp->fp, off, SEEK_SET); ++ while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1); ++ r = bcf_pair_freq(b0, b1, f); ++ r *= r; ++ printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1, ++ r, f[0], f[1], f[2], f[3]); ++ } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno); ++ } ++ bcf_destroy(b0); bcf_destroy(b1); ++ bcf_idx_destroy(idx); ++ bcf_str2id_destroy(str2id); ++ bcf_hdr_destroy(h); ++ bcf_close(fp); ++ free(str.s); ++ ks_destroy(ks); ++ gzclose(fplist); ++ return 0; ++} ++ ++int bcf_main_ld(int argc, char *argv[]) ++{ ++ bcf_t *fp; ++ bcf_hdr_t *h; ++ bcf1_t **b, *b0; ++ int i, j, m, n; ++ double f[4]; ++ if (argc == 1) { ++ fprintf(stderr, "Usage: bcftools ld \n"); ++ return 1; ++ } ++ fp = bcf_open(argv[1], "rb"); ++ h = bcf_hdr_read(fp); ++ // read the entire BCF ++ m = n = 0; b = 0; ++ b0 = calloc(1, sizeof(bcf1_t)); ++ while (bcf_read(fp, h, b0) >= 0) { ++ if (m == n) { ++ m = m? 
m<<1 : 16; ++ b = realloc(b, sizeof(void*) * m); ++ } ++ b[n] = calloc(1, sizeof(bcf1_t)); ++ bcf_cpy(b[n++], b0); ++ } ++ bcf_destroy(b0); ++ // compute pair-wise r^2 ++ printf("%d\n", n); // the number of loci ++ for (i = 0; i < n; ++i) { ++ printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1); ++ for (j = 0; j < i; ++j) { ++ double r = bcf_pair_freq(b[i], b[j], f); ++ printf("\t%.3f", r*r); ++ } ++ printf("\t1.000\n"); ++ } ++ // free ++ for (i = 0; i < n; ++i) bcf_destroy(b[i]); ++ free(b); ++ bcf_hdr_destroy(h); ++ bcf_close(fp); ++ return 0; ++} ++ int main(int argc, char *argv[]) { if (argc == 1) { fprintf(stderr, "\n"); ++ fprintf(stderr, "Program: bcftools (Tools for data in the VCF/BCF formats)\n"); ++ fprintf(stderr, "Version: %s\n\n", BCF_VERSION); fprintf(stderr, "Usage: bcftools \n\n"); fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); fprintf(stderr, " index index BCF\n"); fprintf(stderr, " cat concatenate BCFs\n"); ++ fprintf(stderr, " ld compute all-pair r^2\n"); ++ fprintf(stderr, " ldpair compute r^2 between requested pairs\n"); fprintf(stderr, "\n"); return 1; } if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); -- else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); ++ else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1); ++ else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1); ++ else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... 
else { fprintf(stderr, "[main] Unrecognized command.\n"); return 1; diff --cc sam/bcftools/prob1.c index 8bf968f,8bf968f..a380484 --- a/sam/bcftools/prob1.c +++ b/sam/bcftools/prob1.c @@@ -3,13 -3,13 +3,14 @@@ #include #include #include ++#include #include "prob1.h" #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) #define MC_MAX_EM_ITER 16 --#define MC_EM_EPS 1e-4 ++#define MC_EM_EPS 1e-5 #define MC_DEF_INDEL 0.15 unsigned char seq_nt4_table[256] = { @@@ -32,24 -32,24 +33,20 @@@ }; struct __bcf_p1aux_t { -- int n, M, n1, is_indel, is_folded; ++ int n, M, n1, is_indel; ++ uint8_t *ploidy; // haploid or diploid ONLY double *q2p, *pdg; // pdg -> P(D|g) double *phi, *phi_indel; double *z, *zswap; // aux for afs double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set ++ double **hg; // hypergeometric distribution ++ double *lf; // log factorial double t, t1, t2; double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution const uint8_t *PL; // point to PL int PL_len; }; --static void fold_array(int M, double *x) --{ -- int k; -- for (k = 0; k < M/2; ++k) -- x[k] = x[M-k] = (x[k] + x[M-k]) / 2.; --} -- void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) { int i; @@@ -130,27 -130,27 +127,38 @@@ int bcf_p1_read_prior(bcf_p1aux_t *ma, return 0; } --bcf_p1aux_t *bcf_p1_init(int n) ++bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy) { bcf_p1aux_t *ma; int i; ma = calloc(1, sizeof(bcf_p1aux_t)); ma->n1 = -1; ma->n = n; ma->M = 2 * n; ++ if (ploidy) { ++ ma->ploidy = malloc(n); ++ memcpy(ma->ploidy, ploidy, n); ++ for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i]; ++ if (ma->M == 2 * n) { ++ free(ma->ploidy); ++ ma->ploidy = 0; ++ } ++ } ma->q2p = calloc(256, sizeof(double)); ma->pdg = calloc(3 * ma->n, sizeof(double)); ma->phi = calloc(ma->M + 1, sizeof(double)); ma->phi_indel = calloc(ma->M + 1, sizeof(double)); ma->phi1 = calloc(ma->M + 1, sizeof(double)); ma->phi2 = calloc(ma->M + 1, sizeof(double)); -- ma->z = calloc(2 * ma->n + 1, 
sizeof(double)); -- ma->zswap = calloc(2 * ma->n + 1, sizeof(double)); ++ ma->z = calloc(ma->M + 1, sizeof(double)); ++ ma->zswap = calloc(ma->M + 1, sizeof(double)); ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large ma->z2 = calloc(ma->M + 1, sizeof(double)); -- ma->afs = calloc(2 * ma->n + 1, sizeof(double)); -- ma->afs1 = calloc(2 * ma->n + 1, sizeof(double)); ++ ma->afs = calloc(ma->M + 1, sizeof(double)); ++ ma->afs1 = calloc(ma->M + 1, sizeof(double)); ++ ma->lf = calloc(ma->M + 1, sizeof(double)); for (i = 0; i < 256; ++i) ma->q2p[i] = pow(10., -i / 10.); ++ for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1); bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior return ma; } @@@ -158,23 -158,23 +166,24 @@@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) { if (n1 == 0 || n1 >= b->n) return -1; ++ if (b->M != b->n * 2) { ++ fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); ++ return -1; ++ } b->n1 = n1; return 0; } --void bcf_p1_set_folded(bcf_p1aux_t *p1a) --{ -- if (p1a->n1 < 0) { -- p1a->is_folded = 1; -- fold_array(p1a->M, p1a->phi); -- fold_array(p1a->M, p1a->phi_indel); -- } --} -- void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { -- free(ma->q2p); free(ma->pdg); ++ int k; ++ free(ma->lf); ++ if (ma->hg && ma->n1 > 0) { ++ for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]); ++ free(ma->hg); ++ } ++ free(ma->ploidy); free(ma->q2p); free(ma->pdg); free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2); free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2); free(ma->afs); free(ma->afs1); @@@ -184,18 -184,18 +193,16 @@@ static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) { -- int i, j, k; ++ int i, j; long *p, tmp; p = alloca(b->n_alleles * sizeof(long)); memset(p, 0, sizeof(long) * b->n_alleles); for (j = 0; j < ma->n; ++j) { const uint8_t *pi = ma->PL + j * ma->PL_len; double *pdg = ma->pdg + j * 3; -- pdg[0] = ma->q2p[pi[b->n_alleles]]; pdg[1] = ma->q2p[pi[1]]; 
pdg[2] = ma->q2p[pi[0]]; -- for (i = k = 0; i < b->n_alleles; ++i) { -- p[i] += (int)pi[k]; -- k += b->n_alleles - i; -- } ++ pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; ++ for (i = 0; i < b->n_alleles; ++i) ++ p[i] += (int)pi[(i+1)*(i+2)/2-1]; } for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; for (i = 1; i < b->n_alleles; ++i) // insertion sort @@@ -205,28 -205,28 +212,18 @@@ if ((p[i]&0xf) == 0) break; return i; } --// f0 is the reference allele frequency --static double mc_freq_iter(double f0, const bcf_p1aux_t *ma) --{ -- double f, f3[3]; -- int i; -- f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; -- for (i = 0, f = 0.; i < ma->n; ++i) { -- double *pdg; -- pdg = ma->pdg + i * 3; -- f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) -- / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); -- } -- f /= ma->n * 2.; -- return f; --} int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; double max, f3[3], *pdg = ma->pdg + k * 3; -- int q, i, max_i; -- f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; ++ int q, i, max_i, ploidy; ++ ploidy = ma->ploidy? ma->ploidy[k] : 2; ++ if (ploidy == 2) { ++ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; ++ } else { ++ f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0; ++ } for (i = 0, sum = 0.; i < 3; ++i) sum += (g[i] = pdg[i] * f3[i]); for (i = 0, max = -1., max_i = 0; i < 3; ++i) { @@@ -246,6 -246,6 +243,7 @@@ static void mc_cal_y_core(bcf_p1aux_t * { double *z[2], *tmp, *pdg; int _j, last_min, last_max; ++ assert(beg == 0 || ma->M == ma->n*2); z[0] = ma->z; z[1] = ma->zswap; pdg = ma->pdg; @@@ -254,41 -254,41 +252,81 @@@ z[0][0] = 1.; last_min = last_max = 0; ma->t = 0.; -- for (_j = beg; _j < ma->n; ++_j) { -- int k, j = _j - beg, _min = last_min, _max = last_max; -- double p[3], sum; -- pdg = ma->pdg + _j * 3; -- p[0] = pdg[0]; p[1] = 2. 
* pdg[1]; p[2] = pdg[2]; -- for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; -- for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; -- _max += 2; -- if (_min == 0) -- k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]; -- if (_min <= 1) -- k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1]; -- for (k = _min < 2? 2 : _min; k <= _max; ++k) -- z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] -- + k*(2*j+2-k) * p[1] * z[0][k-1] -- + k*(k-1)* p[2] * z[0][k-2]; -- for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; -- ma->t += log(sum / ((2. * j + 2) * (2. * j + 1))); -- for (k = _min; k <= _max; ++k) z[1][k] /= sum; -- if (_min >= 1) z[1][_min-1] = 0.; -- if (_min >= 2) z[1][_min-2] = 0.; -- if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; -- if (_j == ma->n1 - 1) { // set pop1 -- ma->t1 = ma->t; -- memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); ++ if (ma->M == ma->n * 2) { ++ int M = 0; ++ for (_j = beg; _j < ma->n; ++_j) { ++ int k, j = _j - beg, _min = last_min, _max = last_max, M0; ++ double p[3], sum; ++ M0 = M; M += 2; ++ pdg = ma->pdg + _j * 3; ++ p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; ++ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; ++ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; ++ _max += 2; ++ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; ++ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; ++ for (k = _min < 2? 
2 : _min; k <= _max; ++k) ++ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; ++ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; ++ ma->t += log(sum / (M * (M - 1.))); ++ for (k = _min; k <= _max; ++k) z[1][k] /= sum; ++ if (_min >= 1) z[1][_min-1] = 0.; ++ if (_min >= 2) z[1][_min-2] = 0.; ++ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; ++ if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset ++ ma->t1 = ma->t; ++ memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); ++ } ++ tmp = z[0]; z[0] = z[1]; z[1] = tmp; ++ last_min = _min; last_max = _max; ++ } ++ //for (_j = 0; _j < last_min; ++_j) z[0][_j] = 0.; // TODO: are these necessary? ++ //for (_j = last_max + 1; _j < ma->M; ++_j) z[0][_j] = 0.; ++ } else { // this block is very similar to the block above; these two might be merged in future ++ int j, M = 0; ++ for (j = 0; j < ma->n; ++j) { ++ int k, M0, _min = last_min, _max = last_max; ++ double p[3], sum; ++ pdg = ma->pdg + j * 3; ++ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; ++ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; ++ M0 = M; ++ M += ma->ploidy[j]; ++ if (ma->ploidy[j] == 1) { ++ p[0] = pdg[0]; p[1] = pdg[2]; ++ _max++; ++ if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k]; ++ for (k = _min < 1? 1 : _min; k <= _max; ++k) ++ z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1]; ++ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; ++ ma->t += log(sum / M); ++ for (k = _min; k <= _max; ++k) z[1][k] /= sum; ++ if (_min >= 1) z[1][_min-1] = 0.; ++ if (j < ma->n - 1) z[1][_max+1] = 0.; ++ } else if (ma->ploidy[j] == 2) { ++ p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2]; ++ _max += 2; ++ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; ++ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; ++ for (k = _min < 2? 
2 : _min; k <= _max; ++k) ++ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; ++ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; ++ ma->t += log(sum / (M * (M - 1.))); ++ for (k = _min; k <= _max; ++k) z[1][k] /= sum; ++ if (_min >= 1) z[1][_min-1] = 0.; ++ if (_min >= 2) z[1][_min-2] = 0.; ++ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; ++ } ++ tmp = z[0]; z[0] = z[1]; z[1] = tmp; ++ last_min = _min; last_max = _max; } -- tmp = z[0]; z[0] = z[1]; z[1] = tmp; -- last_min = _min; last_max = _max; } if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); } static void mc_cal_y(bcf_p1aux_t *ma) { -- if (ma->n1 > 0 && ma->n1 < ma->n) { ++ if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples int k; long double x; memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1)); @@@ -304,41 -304,41 +342,131 @@@ } else mc_cal_y_core(ma, 0); } --static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called before hand ++#define CONTRAST_TINY 1e-30 ++ ++extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test ++ ++static inline double chi2_test(int a, int b, int c, int d) ++{ ++ double x, z; ++ x = (double)(a+b) * (c+d) * (b+d) * (a+c); ++ if (x == 0.) 
return 1; ++ z = a * d - b * c; ++ return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x); ++} ++ ++// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)] ++static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3]) { -- int k, n1 = ma->n1, n2 = ma->n - ma->n1; -- long double sum1, sum2; -- pc[0] = pc[1] = pc[2] = pc[3] = -1.; -- if (n1 <= 0 || n2 <= 0) return; -- for (k = 0, sum1 = 0.; k <= 2*n1; ++k) sum1 += ma->phi1[k] * ma->z1[k]; -- for (k = 0, sum2 = 0.; k <= 2*n2; ++k) sum2 += ma->phi2[k] * ma->z2[k]; -- pc[2] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1; -- pc[3] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2; -- for (k = 2; k < 4; ++k) { -- pc[k] = pc[k] > .5? -(-4.343 * log(1. - pc[k] + TINY) + .499) : -4.343 * log(pc[k] + TINY) + .499; -- pc[k] = (int)pc[k]; -- if (pc[k] > 99) pc[k] = 99; -- if (pc[k] < -99) pc[k] = -99; ++ double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2]; ++ int n1 = p1->n1, n2 = p1->n - p1->n1; ++ if (p < CONTRAST_TINY) return -1; ++ if (.5*k1/n1 < .5*k2/n2) x[1] += p; ++ else if (.5*k1/n1 > .5*k2/n2) x[2] += p; ++ else x[0] += p; ++ return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2); ++} ++ ++static double contrast2(bcf_p1aux_t *p1, double ret[3]) ++{ ++ int k, k1, k2, k10, k20, n1, n2; ++ double sum; ++ // get n1 and n2 ++ n1 = p1->n1; n2 = p1->n - p1->n1; ++ if (n1 <= 0 || n2 <= 0) return 0.; ++ if (p1->hg == 0) { // initialize the hypergeometric distribution ++ /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way ++ to avoid precomputing this matrix, but it is slower and quite intricate. The following ++ computation in this block can be accelerated with a similar strategy, but perhaps this ++ is not a serious concern for now. 
*/ ++ double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1)); ++ p1->hg = calloc(2*n1+1, sizeof(void*)); ++ for (k1 = 0; k1 <= 2*n1; ++k1) { ++ p1->hg[k1] = calloc(2*n2+1, sizeof(double)); ++ for (k2 = 0; k2 <= 2*n2; ++k2) ++ p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp)); ++ } ++ } ++ { // compute ++ long double suml = 0; ++ for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k]; ++ sum = suml; ++ } ++ { // get the max k1 and k2 ++ double max; ++ int max_k; ++ for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) { ++ double x = p1->phi1[k] * p1->z1[k]; ++ if (x > max) max = x, max_k = k; ++ } ++ k10 = max_k; ++ for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) { ++ double x = p1->phi2[k] * p1->z2[k]; ++ if (x > max) max = x, max_k = k; ++ } ++ k20 = max_k; ++ } ++ { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N. 
++ double x[3], y; ++ long double z = 0., L[2]; ++ x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0; ++ for (k1 = k10; k1 >= 0; --k1) { ++ for (k2 = k20; k2 >= 0; --k2) { ++ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; ++ else z += y; ++ } ++ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { ++ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; ++ else z += y; ++ } ++ } ++ ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2]; ++ x[0] = x[1] = x[2] = 0; ++ for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { ++ for (k2 = k20; k2 >= 0; --k2) { ++ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; ++ else z += y; ++ } ++ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { ++ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break; ++ else z += y; ++ } ++ } ++ ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2]; ++ if (ret[0] + ret[1] + ret[2] < 0.95) { // in case of bad things happened ++ ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0; ++ for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1) ++ for (k2 = 0; k2 <= 2*n2; ++k2) ++ if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y; ++ if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why... ++ z = 1.0, ret[0] = ret[1] = ret[2] = 1./3; ++ } ++ return (double)z; } -- pc[0] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2 * (1. - ma->phi1[2*n1] * ma->z1[2*n1] / sum1); -- pc[1] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1 * (1. - ma->phi2[2*n2] * ma->z2[2*n2] / sum2); -- pc[0] = pc[0] == 1.? 99 : (int)(-4.343 * log(1. - pc[0]) + .499); -- pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499); } --static double mc_cal_afs(bcf_p1aux_t *ma) ++static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) { int k; -- long double sum = 0.; ++ long double sum = 0., sum2; double *phi = ma->is_indel? 
ma->phi_indel : ma->phi; memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); mc_cal_y(ma); ++ // compute AFS for (k = 0, sum = 0.; k <= ma->M; ++k) sum += (long double)phi[k] * ma->z[k]; for (k = 0; k <= ma->M; ++k) { ma->afs1[k] = phi[k] * ma->z[k] / sum; if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; } ++ // compute folded variant probability ++ for (k = 0, sum = 0.; k <= ma->M; ++k) ++ sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; ++ for (k = 1, sum2 = 0.; k < ma->M; ++k) ++ sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; ++ *p_var_folded = sum2 / sum; ++ *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum; ++ // the expected frequency for (k = 0, sum = 0.; k <= ma->M; ++k) { ma->afs[k] += ma->afs1[k]; sum += k * ma->afs1[k]; @@@ -346,37 -346,37 +474,12 @@@ return sum / ma->M; } --long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]) --{ -- long double pd = 0., g2[3]; -- int i, k; -- memset(g2, 0, sizeof(long double) * 3); -- for (k = 0; k < p1a->M; ++k) { -- double f = (double)k / p1a->M, f3[3], g1[3]; -- long double z = 1.; -- g1[0] = g1[1] = g1[2] = 0.; -- f3[0] = (1. - f) * (1. - f); f3[1] = 2. * f * (1. 
- f); f3[2] = f * f; -- for (i = 0; i < p1a->n; ++i) { -- double *pdg = p1a->pdg + i * 3; -- double x = pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; -- z *= x; -- g1[0] += pdg[0] * f3[0] / x; -- g1[1] += pdg[1] * f3[1] / x; -- g1[2] += pdg[2] * f3[2] / x; -- } -- pd += p1a->phi[k] * z; -- for (i = 0; i < 3; ++i) -- g2[i] += p1a->phi[k] * z * g1[i]; -- } -- for (i = 0; i < 3; ++i) g[i] = g2[i] / pd; -- return pd; --} -- --int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) ++int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { int i, k; long double sum = 0.; ma->is_indel = bcf_is_indel(b); ++ rst->perm_rank = -1; // set PL and PL_len for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) { @@@ -385,11 -385,11 +488,22 @@@ break; } } ++ if (i == b->n_gi) return -1; // no PL if (b->n_alleles < 2) return -1; // FIXME: find a better solution // rst->rank0 = cal_pdg(b, ma); -- rst->f_exp = mc_cal_afs(ma); -- rst->p_ref = ma->is_folded? 
ma->afs1[ma->M] + ma->afs1[0] : ma->afs1[ma->M]; ++ rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded); ++ rst->p_ref = ma->afs1[ma->M]; ++ for (k = 0, sum = 0.; k < ma->M; ++k) ++ sum += ma->afs1[k]; ++ rst->p_var = (double)sum; ++ { // compute the allele count ++ double max = -1; ++ rst->ac = -1; ++ for (k = 0; k <= ma->M; ++k) ++ if (max < ma->z[k]) max = ma->z[k], rst->ac = k; ++ rst->ac = ma->M - rst->ac; ++ } // calculate f_flat and f_em for (k = 0, sum = 0.; k <= ma->M; ++k) sum += (long double)ma->z[k]; @@@ -399,36 -399,36 +513,39 @@@ rst->f_flat += k * p; } rst->f_flat /= ma->M; -- { // calculate f_em -- double flast = rst->f_flat; -- for (i = 0; i < MC_MAX_EM_ITER; ++i) { -- rst->f_em = mc_freq_iter(flast, ma); -- if (fabs(rst->f_em - flast) < MC_EM_EPS) break; -- flast = rst->f_em; -- } -- } { // estimate equal-tail credible interval (95% level) int l, h; double p; -- for (i = 0, p = 0.; i < ma->M; ++i) ++ for (i = 0, p = 0.; i <= ma->M; ++i) if (p + ma->afs1[i] > 0.025) break; else p += ma->afs1[i]; l = i; -- for (i = ma->M-1, p = 0.; i >= 0; --i) ++ for (i = ma->M, p = 0.; i >= 0; --i) if (p + ma->afs1[i] > 0.025) break; else p += ma->afs1[i]; h = i; rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M; } -- rst->g[0] = rst->g[1] = rst->g[2] = -1.; -- contrast(ma, rst->pc); ++ if (ma->n1 > 0) { // compute LRT ++ double max0, max1, max2; ++ for (k = 0, max0 = -1; k <= ma->M; ++k) ++ if (max0 < ma->z[k]) max0 = ma->z[k]; ++ for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k) ++ if (max1 < ma->z1[k]) max1 = ma->z1[k]; ++ for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k) ++ if (max2 < ma->z2[k]) max2 = ma->z2[k]; ++ rst->lrt = log(max1 * max2 / max0); ++ rst->lrt = rst->lrt < 0? 
1 : kf_gammaq(.5, rst->lrt); ++ } else rst->lrt = -1.0; ++ rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0; ++ if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant ++ rst->p_chi2 = contrast2(ma, rst->cmp); return 0; } void bcf_p1_dump_afs(bcf_p1aux_t *ma) { int k; -- if (ma->is_folded) fold_array(ma->M, ma->afs); fprintf(stderr, "[afs]"); for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); diff --cc sam/bcftools/prob1.h index 3827534,3827534..0a51a0a --- a/sam/bcftools/prob1.h +++ b/sam/bcftools/prob1.h @@@ -7,11 -7,11 +7,11 @@@ struct __bcf_p1aux_t typedef struct __bcf_p1aux_t bcf_p1aux_t; typedef struct { -- int rank0; -- double f_em, f_exp, f_flat, p_ref; ++ int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal() ++ int ac; // ML alternative allele count ++ double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; double cil, cih; -- double pc[4]; -- double g[3]; ++ double cmp[3], p_chi2, lrt; // used by contrast2() } bcf_p1rst_t; #define MC_PTYPE_FULL 1 @@@ -22,18 -22,18 +22,19 @@@ extern "C" { #endif -- bcf_p1aux_t *bcf_p1_init(int n); ++ bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy); void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_destroy(bcf_p1aux_t *ma); -- int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst); ++ int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); -- long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]); int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called ++ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]); ++ #ifdef __cplusplus } #endif diff --cc 
sam/bcftools/vcf.c index 9b661ff,9b661ff..9daa845 --- a/sam/bcftools/vcf.c +++ b/sam/bcftools/vcf.c @@@ -72,6 -72,6 +72,33 @@@ bcf_t *vcf_open(const char *fn, const c return bp; } ++int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn) ++{ ++ vcf_t *v; ++ gzFile fp; ++ kstream_t *ks; ++ kstring_t s, rn; ++ int dret; ++ if (bp == 0) return -1; ++ if (!bp->is_vcf) return 0; ++ s.l = s.m = 0; s.s = 0; ++ rn.m = rn.l = h->l_nm; rn.s = h->name; ++ v = (vcf_t*)bp->v; ++ fp = gzopen(fn, "r"); ++ ks = ks_init(fp); ++ while (ks_getuntil(ks, 0, &s, &dret) >= 0) { ++ bcf_str2id_add(v->refhash, strdup(s.s)); ++ kputs(s.s, &rn); kputc('\0', &rn); ++ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); ++ } ++ ks_destroy(ks); ++ gzclose(fp); ++ h->l_nm = rn.l; h->name = rn.s; ++ bcf_hdr_sync(h); ++ free(s.s); ++ return 0; ++} ++ int vcf_close(bcf_t *bp) { vcf_t *v; @@@ -84,7 -84,7 +111,7 @@@ } if (v->fpout) fclose(v->fpout); free(v->line.s); -- bcf_str2id_destroy(v->refhash); ++ bcf_str2id_thorough_destroy(v->refhash); free(v); free(bp); return 0; @@@ -93,15 -93,15 +120,14 @@@ int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) { vcf_t *v = (vcf_t*)bp->v; -- int i, has_ref = 0, has_ver = 0; ++ int i, has_ver = 0; if (!bp->is_vcf) return bcf_hdr_write(bp, h); if (h->l_txt > 0) { if (strstr(h->txt, "##fileformat=")) has_ver = 1; -- if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); ++ if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); fwrite(h->txt, 1, h->l_txt - 1, v->fpout); -- if (strstr(h->txt, "##SQ=")) has_ref = 1; } -- if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); ++ if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); for (i = 0; i < h->n_smpl; ++i) fprintf(v->fpout, "\t%s", h->sns[i]); @@@ -138,7 -138,7 +164,7 @@@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, b if (k == 0) { // ref int tid = bcf_str2id(v->refhash, p); if (tid < 0) { -- tid = 
bcf_str2id_add(v->refhash, p); ++ tid = bcf_str2id_add(v->refhash, strdup(p)); kputs(p, &rn); kputc('\0', &rn); sync = 1; } @@@ -156,8 -156,8 +182,10 @@@ for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = 1<<7; -- } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) { ++ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = 0; ++ } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { ++ ((int32_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { ((uint16_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { @@@ -173,11 -173,11 +201,15 @@@ for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6; -- } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) { ++ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { double _x = strtod(q, &q); int x = (int)(_x + .499); if (x > 255) x = 255; ((uint8_t*)b->gi[i].data)[k-9] = x; ++ } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { ++ int x = strtol(q, &q, 10); ++ if (x > 0xffff) x = 0xffff; ++ ((uint32_t*)b->gi[i].data)[k-9] = x; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; @@@ -198,7 -198,7 +230,7 @@@ y = b->n_alleles * (b->n_alleles + 1) / 2; for (j = 0; j < y; ++j) { x = strtod(q, &q); -- data[(k-9) * y + j] = x; ++ data[(k-9) * y + j] = x > 0? -x/10. 
: x; ++q; } } diff --cc sam/bcftools/vcfutils.pl index cd86b0f,cd86b0f..2b7ba0b --- a/sam/bcftools/vcfutils.pl +++ b/sam/bcftools/vcfutils.pl @@@ -14,7 -14,7 +14,7 @@@ sub main my $command = shift(@ARGV); my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats, -- gapstats=>\&gapstats, splitchr=>\&splitchr); ++ gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } @@@ -86,7 -86,7 +86,7 @@@ sub fillac print; } else { my @t = split; -- my @c = (0); ++ my @c = (0, 0); my $n = 0; my $s = -1; @_ = split(":", $t[8]); @@@ -215,8 -215,8 +215,8 @@@ Note: This command discards indels. Out } sub varFilter { -- my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4); -- getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts); ++ my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4); ++ getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); die(qq/ Usage: vcfutils.pl varFilter [options] @@@ -230,6 -230,6 +230,7 @@@ Options: -Q INT minimum RMS mapping -2 FLOAT min P-value for baseQ bias [$opts{2}] -3 FLOAT min P-value for mapQ bias [$opts{3}] -4 FLOAT min P-value for end distance bias [$opts{4}] ++ -e FLOAT min P-value for HWE (plus F<0) [$opts{e}] -p print filtered variants Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. 
@@@ -246,6 -246,6 +247,7 @@@ print; next; } next if ($t[4] eq '.'); # skip non-var sites ++ next if ($t[3] eq 'N'); # skip sites with unknown ref ('N') # check if the site is a SNP my $type = 1; # SNP if (length($t[3]) > 1) { @@@ -289,6 -289,6 +291,13 @@@ $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); ++ $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S}))); ++ # HWE filter ++ if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) { ++ my $p = 2*$1 + $2; ++ my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0; ++ $flt = 9 if ($f < 0); ++ } my $score = $t[5] * 100 + $dp_alt; my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs @@@ -311,7 -311,7 +320,10 @@@ } else { # SNP or MNP for my $x (@staging) { next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); -- $flt = 5; ++ if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1) ++ && length($x->[7]) - length($x->[6]) == 1) { ++ $x->[1] = 5; ++ } else { $flt = 5; } last; } # check MNP @@@ -338,7 -338,7 +350,7 @@@ sub varFilter_aux if ($first->[1] == 0) { print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { -- print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; ++ print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. 
@$first-1]), "\n"; } } @@@ -454,6 -454,6 +466,87 @@@ sub hapmap2vcf } } ++sub vcf2fq { ++ my %opts = (d=>3, D=>100000, Q=>10, l=>5); ++ getopts('d:D:Q:l:', \%opts); ++ die(qq/ ++Usage: vcfutils.pl vcf2fq [options] ++ ++Options: -d INT minimum depth [$opts{d}] ++ -D INT maximum depth [$opts{D}] ++ -Q INT min RMS mapQ [$opts{Q}] ++ -l INT INDEL filtering window [$opts{l}] ++\n/) if (@ARGV == 0 && -t STDIN); ++ ++ my ($last_chr, $seq, $qual, $last_pos, @gaps); ++ my $_Q = $opts{Q}; ++ my $_d = $opts{d}; ++ my $_D = $opts{D}; ++ ++ my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y', ++ GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); ++ ++ $last_chr = ''; ++ while (<>) { ++ next if (/^#/); ++ my @t = split; ++ if ($last_chr ne $t[0]) { ++ &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); ++ ($last_chr, $last_pos) = ($t[0], 0); ++ $seq = $qual = ''; ++ @gaps = (); ++ } ++ die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0); ++ if ($t[1] - $last_pos > 1) { ++ $seq .= 'n' x ($t[1] - $last_pos - 1); ++ $qual .= '!' x ($t[1] - $last_pos - 1); ++ } ++ if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference ++ my ($ref, $alt) = ($t[3], $1); ++ my ($b, $q); ++ $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); ++ if ($q < 0) { ++ $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0; ++ $b = ($_ < .5 || $alt eq '.')? $ref : $alt; ++ $q = -$q; ++ } else { ++ $b = $het{"$ref$alt"}; ++ $b ||= 'N'; ++ } ++ $b = lc($b); ++ $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); ++ $q = int($q + 33 + .499); ++ $q = chr($q <= 126? 
$q : 126); ++ $seq .= $b; ++ $qual .= $q; ++ } elsif ($t[4] ne '.') { # an INDEL ++ push(@gaps, [$t[1], length($t[3])]); ++ } ++ $last_pos = $t[1]; ++ } ++ &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); ++} ++ ++sub v2q_post_process { ++ my ($chr, $seq, $qual, $gaps, $l) = @_; ++ for my $g (@$gaps) { ++ my $beg = $g->[0] > $l? $g->[0] - $l : 0; ++ my $end = $g->[0] + $g->[1] + $l; ++ $end = length($$seq) if ($end > length($$seq)); ++ substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg)); ++ } ++ print "\@$chr\n"; &v2q_print_str($seq); ++ print "+\n"; &v2q_print_str($qual); ++} ++ ++sub v2q_print_str { ++ my ($s) = @_; ++ my $l = length($$s); ++ for (my $i = 0; $i < $l; $i += 60) { ++ print substr($$s, $i, 60), "\n"; ++ } ++} ++ sub usage { die(qq/ Usage: vcfutils.pl []\n @@@ -461,8 -461,8 +554,14 @@@ Command: subsam get a subset of s listsam list the samples fillac fill the allele count field qstats SNP stats stratified by QUAL -- varFilter filtering short variants ++ hapmap2vcf convert the hapmap format to VCF ucscsnp2vcf convert UCSC SNP SQL dump to VCF ++ ++ varFilter filtering short variants (*) ++ vcf2fq VCF->fastq (**) ++ ++Notes: Commands with description endting with (*) may need bcftools ++ specific annotations. 
\n/); } diff --cc sam/bgzf.c index 66d6b02,66d6b02..216cd04 --- a/sam/bgzf.c +++ b/sam/bgzf.c @@@ -111,6 -111,6 +111,32 @@@ report_error(BGZF* fp, const char* mess fp->error = message; } ++int bgzf_check_bgzf(const char *fn) ++{ ++ BGZF *fp; ++ uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; ++ int n; ++ ++ if ((fp = bgzf_open(fn, "r")) == 0) ++ { ++ fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn); ++ return -1; ++ } ++ ++#ifdef _USE_KNETFILE ++ n = knet_read(fp->x.fpr, buf, 10); ++#else ++ n = fread(buf, 1, 10, fp->file); ++#endif ++ bgzf_close(fp); ++ ++ if ( n!=10 ) ++ return -1; ++ ++ if ( !memcmp(magic, buf, 10) ) return 1; ++ return 0; ++} ++ static BGZF *bgzf_read_init() { BGZF *fp; @@@ -148,7 -148,7 +174,7 @@@ open_read(int fd static BGZF* --open_write(int fd, bool is_uncompressed) ++open_write(int fd, int compress_level) // compress_level==-1 for the default level { FILE* file = fdopen(fd, "w"); BGZF* fp; @@@ -156,7 -156,7 +182,9 @@@ fp = malloc(sizeof(BGZF)); fp->file_descriptor = fd; fp->open_mode = 'w'; -- fp->owned_file = 0; fp->is_uncompressed = is_uncompressed; ++ fp->owned_file = 0; ++ fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 ++ if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; #ifdef _USE_KNETFILE fp->x.fpw = file; #else @@@ -195,13 -195,13 +223,20 @@@ bgzf_open(const char* __restrict path, fp = open_read(fd); #endif } else if (strchr(mode, 'w') || strchr(mode, 'W')) { -- int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; ++ int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; #ifdef _WIN32 oflag |= O_BINARY; #endif fd = open(path, oflag, 0666); if (fd == -1) return 0; -- fp = open_write(fd, strchr(mode, 'u')? 
1 : 0); ++ { // set compress_level ++ int i; ++ for (i = 0; mode[i]; ++i) ++ if (mode[i] >= '0' && mode[i] <= '9') break; ++ if (mode[i]) compress_level = (int)mode[i] - '0'; ++ if (strchr(mode, 'u')) compress_level = 0; ++ } ++ fp = open_write(fd, compress_level); } if (fp != NULL) fp->owned_file = 1; return fp; @@@ -214,7 -214,7 +249,12 @@@ bgzf_fdopen(int fd, const char * __rest if (mode[0] == 'r' || mode[0] == 'R') { return open_read(fd); } else if (mode[0] == 'w' || mode[0] == 'W') { -- return open_write(fd, strstr(mode, "u")? 1 : 0); ++ int i, compress_level = -1; ++ for (i = 0; mode[i]; ++i) ++ if (mode[i] >= '0' && mode[i] <= '9') break; ++ if (mode[i]) compress_level = (int)mode[i] - '0'; ++ if (strchr(mode, 'u')) compress_level = 0; ++ return open_write(fd, compress_level); } else { return NULL; } @@@ -254,7 -254,7 +294,6 @@@ deflate_block(BGZF* fp, int block_lengt int input_length = block_length; int compressed_length = 0; while (1) { -- int compress_level = fp->is_uncompressed? 
0 : Z_DEFAULT_COMPRESSION; z_stream zs; zs.zalloc = NULL; zs.zfree = NULL; @@@ -263,7 -263,7 +302,7 @@@ zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; -- int status = deflateInit2(&zs, compress_level, Z_DEFLATED, ++ int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); if (status != Z_OK) { report_error(fp, "deflate init failed"); @@@ -330,6 -330,6 +369,7 @@@ inflate_block(BGZF* fp, int block_lengt // Inflate the block in fp->compressed_block into fp->uncompressed_block z_stream zs; ++ int status; zs.zalloc = NULL; zs.zfree = NULL; zs.next_in = fp->compressed_block + 18; @@@ -337,7 -337,7 +377,7 @@@ zs.next_out = fp->uncompressed_block; zs.avail_out = fp->uncompressed_block_size; -- int status = inflateInit2(&zs, GZIP_WINDOW_BITS); ++ status = inflateInit2(&zs, GZIP_WINDOW_BITS); if (status != Z_OK) { report_error(fp, "inflate init failed"); return -1; @@@ -431,7 -431,7 +471,7 @@@ in bgzf_read_block(BGZF* fp) { bgzf_byte_t header[BLOCK_HEADER_LENGTH]; -- int count, size = 0; ++ int count, size = 0, block_length, remaining; #ifdef _USE_KNETFILE int64_t block_address = knet_tell(fp->x.fpr); if (load_block_from_cache(fp, block_address)) return 0; @@@ -454,10 -454,10 +494,10 @@@ report_error(fp, "invalid block header"); return -1; } -- int block_length = unpackInt16((uint8_t*)&header[16]) + 1; ++ block_length = unpackInt16((uint8_t*)&header[16]) + 1; bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); -- int remaining = block_length - BLOCK_HEADER_LENGTH; ++ remaining = block_length - BLOCK_HEADER_LENGTH; #ifdef _USE_KNETFILE count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); #else @@@ -494,7 -494,7 +534,8 @@@ bgzf_read(BGZF* fp, void* data, int len int bytes_read = 0; bgzf_byte_t* output = data; while (bytes_read < length) { 
-- int available = fp->block_length - fp->block_offset; ++ int copy_length, available = fp->block_length - fp->block_offset; ++ bgzf_byte_t *buffer; if (available <= 0) { if (bgzf_read_block(fp) != 0) { return -1; @@@ -504,8 -504,8 +545,8 @@@ break; } } -- int copy_length = bgzf_min(length-bytes_read, available); -- bgzf_byte_t* buffer = fp->uncompressed_block; ++ copy_length = bgzf_min(length-bytes_read, available); ++ buffer = fp->uncompressed_block; memcpy(output, buffer + fp->block_offset, copy_length); fp->block_offset += copy_length; output += copy_length; @@@ -552,6 -552,6 +593,8 @@@ int bgzf_flush_try(BGZF *fp, int size int bgzf_write(BGZF* fp, const void* data, int length) { ++ const bgzf_byte_t *input = data; ++ int block_length, bytes_written; if (fp->open_mode != 'w') { report_error(fp, "file not open for writing"); return -1; @@@ -560,9 -560,9 +603,9 @@@ if (fp->uncompressed_block == NULL) fp->uncompressed_block = malloc(fp->uncompressed_block_size); -- const bgzf_byte_t* input = data; -- int block_length = fp->uncompressed_block_size; -- int bytes_written = 0; ++ input = data; ++ block_length = fp->uncompressed_block_size; ++ bytes_written = 0; while (bytes_written < length) { int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); bgzf_byte_t* buffer = fp->uncompressed_block; diff --cc sam/bgzf.h index 099ae9a,099ae9a..7295f37 --- a/sam/bgzf.h +++ b/sam/bgzf.h @@@ -26,7 -26,7 +26,6 @@@ #include #include --#include #include #ifdef _USE_KNETFILE #include "knetfile.h" @@@ -37,7 -37,7 +36,7 @@@ typedef struct { int file_descriptor; char open_mode; // 'r' or 'w' -- bool owned_file, is_uncompressed; ++ int16_t owned_file, compress_level; #ifdef _USE_KNETFILE union { knetFile *fpr; @@@ -129,6 -129,6 +128,7 @@@ int bgzf_check_EOF(BGZF *fp) int bgzf_read_block(BGZF* fp); int bgzf_flush(BGZF* fp); int bgzf_flush_try(BGZF *fp, int size); ++int bgzf_check_bgzf(const char *fn); #ifdef __cplusplus } diff --cc sam/errmod.h index 
e3e9a90,e3e9a90..32c07b6 --- a/sam/errmod.h +++ b/sam/errmod.h @@@ -12,6 -12,6 +12,13 @@@ typedef struct errmod_t *errmod_init(float depcorr); void errmod_destroy(errmod_t *em); ++ ++/* ++ n: number of bases ++ m: maximum base ++ bases[i]: qual:6, strand:1, base:4 ++ q[i*m+j]: phred-scaled likelihood of (i,j) ++ */ int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q); #endif diff --cc sam/examples/Makefile index ec976ae,ec976ae..309399f --- a/sam/examples/Makefile +++ b/sam/examples/Makefile @@@ -40,11 -40,11 +40,11 @@@ ex1.bcf:ex1.bam ex1.fa.fa (cd ..; make libbam.a) calDepth:../libbam.a calDepth.c -- gcc -g -Wall -O2 -I.. calDepth.c -o $@ -lm -lz -L.. -lbam ++ gcc -g -Wall -O2 -I.. calDepth.c -o $@ -L.. -lbam -lm -lz clean: rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg ex1.bcf # ../samtools pileup ex1.bam|perl -ape '$_=$F[4];s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Z//,tr/a-z//);$_=join("\t",@F[0,1],@_)."\n"' --# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"' ++# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"' diff --cc sam/examples/toy.sam index 1aff220,1aff220..33449b1 --- a/sam/examples/toy.sam +++ b/sam/examples/toy.sam @@@ -1,6 -1,6 +1,6 @@@ @SQ SN:ref LN:45 @SQ SN:ref2 LN:40 --r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * ++r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * r003 0 ref 9 30 5H6M * 0 0 AGCTAA * r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * diff --cc sam/faidx.c index dbd8b3e,dbd8b3e..f0798fc --- a/sam/faidx.c +++ b/sam/faidx.c @@@ -2,11 -2,11 +2,13 @@@ #include #include #include ++#include #include "faidx.h" #include "khash.h" typedef struct { -- uint64_t len:32, line_len:16, line_blen:16; ++ 
int32_t line_len, line_blen; ++ int64_t len; uint64_t offset; } faidx1_t; KHASH_MAP_INIT_STR(s, faidx1_t) @@@ -63,10 -63,10 +65,11 @@@ faidx_t *fai_build_core(RAZF *rz { char c, *name; int l_name, m_name, ret; -- int len, line_len, line_blen, state; ++ int line_len, line_blen, state; int l1, l2; faidx_t *idx; uint64_t offset; ++ int64_t len; idx = (faidx_t*)calloc(1, sizeof(faidx_t)); idx->hash = kh_init(s); @@@ -118,11 -118,11 +121,6 @@@ return 0; } ++l1; len += l2; -- if (l2 >= 0x10000) { -- fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'.\n", name); -- free(name); fai_destroy(idx); -- return 0; -- } if (state == 1) line_len = l1, line_blen = l2, state = 0; else if (state == 0) { if (l1 != line_len || l2 != line_blen) state = 2; @@@ -304,8 -304,8 +302,8 @@@ faidx_t *fai_load(const char *fn char *fai_fetch(const faidx_t *fai, const char *str, int *len) { -- char *s, *p, c; -- int i, l, k; ++ char *s, c; ++ int i, l, k, name_end; khiter_t iter; faidx1_t val; khash_t(s) *h; @@@ -313,31 -313,31 +311,43 @@@ beg = end = -1; h = fai->hash; -- l = strlen(str); -- p = s = (char*)malloc(l+1); -- /* squeeze out "," */ -- for (i = k = 0; i != l; ++i) -- if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; -- s[k] = 0; -- for (i = 0; i != k; ++i) if (s[i] == ':') break; -- s[i] = 0; -- iter = kh_get(s, h, s); /* get the ref_id */ -- if (iter == kh_end(h)) { -- *len = 0; -- free(s); return 0; -- } ++ name_end = l = strlen(str); ++ s = (char*)malloc(l+1); ++ // remove space ++ for (i = k = 0; i < l; ++i) ++ if (!isspace(str[i])) s[k++] = str[i]; ++ s[k] = 0; l = k; ++ // determine the sequence name ++ for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end ++ if (i >= 0) name_end = i; ++ if (name_end < l) { // check if this is really the end ++ int n_hyphen = 0; ++ for (i = name_end + 1; i < l; ++i) { ++ if (s[i] == '-') ++n_hyphen; ++ else if (!isdigit(s[i]) && s[i] != ',') break; ++ } ++ if (i < l || n_hyphen > 1) 
name_end = l; // malformated region string; then take str as the name ++ s[name_end] = 0; ++ iter = kh_get(s, h, s); ++ if (iter == kh_end(h)) { // cannot find the sequence name ++ iter = kh_get(s, h, str); // try str as the name ++ if (iter == kh_end(h)) { ++ *len = 0; ++ free(s); return 0; ++ } else s[name_end] = ':', name_end = l; ++ } ++ } else iter = kh_get(s, h, str); val = kh_value(h, iter); -- if (i == k) { /* dump the whole sequence */ -- beg = 0; end = val.len; -- } else { -- for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; -- beg = atoi(p); -- if (i < k) { -- p = s + i + 1; -- end = atoi(p); -- } else end = val.len; -- } -- if (beg > 0) --beg; ++ // parse the interval ++ if (name_end < l) { ++ for (i = k = name_end + 1; i < l; ++i) ++ if (s[i] != ',') s[k++] = s[i]; ++ s[k] = 0; ++ beg = atoi(s + name_end + 1); ++ for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; ++ end = i < k? atoi(s + i + 1) : val.len; ++ if (beg > 0) --beg; ++ } else beg = 0, end = val.len; if (beg >= val.len) beg = val.len; if (end >= val.len) end = val.len; if (beg > end) beg = end; diff --cc sam/glf.c index 8d5346a,8d5346a..0000000 deleted file mode 100644,100644 --- a/sam/glf.c +++ /dev/null @@@ -1,236 -1,236 +1,0 @@@ --#include --#include --#include "glf.h" -- --#ifdef _NO_BGZF --// then alias bgzf_*() functions --#endif -- --static int glf3_is_BE = 0; -- --static inline uint32_t bam_swap_endian_4(uint32_t v) --{ -- v = ((v & 0x0000FFFFU) << 16) | (v >> 16); -- return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); --} -- --static inline uint16_t bam_swap_endian_2(uint16_t v) --{ -- return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); --} -- --static inline int bam_is_big_endian() --{ -- long one= 1; -- return !(*((char *)(&one))); --} -- --glf3_header_t *glf3_header_init() --{ -- glf3_is_BE = bam_is_big_endian(); -- return (glf3_header_t*)calloc(1, sizeof(glf3_header_t)); --} -- --glf3_header_t *glf3_header_read(glfFile fp) --{ -- 
glf3_header_t *h; -- char magic[4]; -- h = glf3_header_init(); -- bgzf_read(fp, magic, 4); -- if (strncmp(magic, "GLF\3", 4)) { -- fprintf(stderr, "[glf3_header_read] invalid magic.\n"); -- glf3_header_destroy(h); -- return 0; -- } -- bgzf_read(fp, &h->l_text, 4); -- if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text); -- if (h->l_text) { -- h->text = (uint8_t*)calloc(h->l_text + 1, 1); -- bgzf_read(fp, h->text, h->l_text); -- } -- return h; --} -- --void glf3_header_write(glfFile fp, const glf3_header_t *h) --{ -- int32_t x; -- bgzf_write(fp, "GLF\3", 4); -- x = glf3_is_BE? bam_swap_endian_4(h->l_text) : h->l_text; -- bgzf_write(fp, &x, 4); -- if (h->l_text) bgzf_write(fp, h->text, h->l_text); --} -- --void glf3_header_destroy(glf3_header_t *h) --{ -- free(h->text); -- free(h); --} -- --char *glf3_ref_read(glfFile fp, int *len) --{ -- int32_t n, x; -- char *str; -- *len = 0; -- if (bgzf_read(fp, &n, 4) != 4) return 0; -- if (glf3_is_BE) n = bam_swap_endian_4(n); -- if (n < 0) { -- fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n); -- return 0; -- } -- str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact -- x = bgzf_read(fp, str, n); -- x += bgzf_read(fp, len, 4); -- if (x != n + 4) { -- free(str); *len = -1; return 0; // truncated -- } -- if (glf3_is_BE) *len = bam_swap_endian_4(*len); -- return str; --} -- --void glf3_ref_write(glfFile fp, const char *str, int len) --{ -- int32_t m, n = strlen(str) + 1; -- m = glf3_is_BE? bam_swap_endian_4(n) : n; -- bgzf_write(fp, &m, 4); -- bgzf_write(fp, str, n); -- if (glf3_is_BE) len = bam_swap_endian_4(len); -- bgzf_write(fp, &len, 4); --} -- --void glf3_view1(const char *ref_name, const glf3_t *g3, int pos) --{ -- int j; -- if (g3->rtype == GLF3_RTYPE_END) return; -- printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, -- g3->rtype == GLF3_RTYPE_INDEL? 
'*' : "XACMGRSVTWYHKDBN"[g3->ref_base], -- g3->depth, g3->rms_mapQ, g3->min_lk); -- if (g3->rtype == GLF3_RTYPE_SUB) -- for (j = 0; j != 10; ++j) printf("\t%d", g3->lk[j]); -- else { -- printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1], -- g3->indel_len[0]? g3->indel_seq[0] : "*", g3->indel_len[1]? g3->indel_seq[1] : "*"); -- } -- printf("\n"); --} -- --int glf3_write1(glfFile fp, const glf3_t *g3) --{ -- int r; -- uint8_t c; -- uint32_t y[2]; -- c = g3->rtype<<4 | g3->ref_base; -- r = bgzf_write(fp, &c, 1); -- if (g3->rtype == GLF3_RTYPE_END) return r; -- y[0] = g3->offset; -- y[1] = g3->min_lk<<24 | g3->depth; -- if (glf3_is_BE) { -- y[0] = bam_swap_endian_4(y[0]); -- y[1] = bam_swap_endian_4(y[1]); -- } -- r += bgzf_write(fp, y, 8); -- r += bgzf_write(fp, &g3->rms_mapQ, 1); -- if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10); -- else { -- int16_t x[2]; -- r += bgzf_write(fp, g3->lk, 3); -- x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0]; -- x[1] = glf3_is_BE? 
bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1]; -- r += bgzf_write(fp, x, 4); -- if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0])); -- if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1])); -- } -- return r; --} -- --#ifndef kv_roundup32 --#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) --#endif -- --int glf3_read1(glfFile fp, glf3_t *g3) --{ -- int r; -- uint8_t c; -- uint32_t y[2]; -- r = bgzf_read(fp, &c, 1); -- if (r == 0) return 0; -- g3->ref_base = c & 0xf; -- g3->rtype = c>>4; -- if (g3->rtype == GLF3_RTYPE_END) return r; -- r += bgzf_read(fp, y, 8); -- if (glf3_is_BE) { -- y[0] = bam_swap_endian_4(y[0]); -- y[1] = bam_swap_endian_4(y[1]); -- } -- g3->offset = y[0]; -- g3->min_lk = y[1]>>24; -- g3->depth = y[1]<<8>>8; -- r += bgzf_read(fp, &g3->rms_mapQ, 1); -- if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10); -- else { -- int16_t x[2], max; -- r += bgzf_read(fp, g3->lk, 3); -- r += bgzf_read(fp, x, 4); -- if (glf3_is_BE) { -- x[0] = bam_swap_endian_2(x[0]); -- x[1] = bam_swap_endian_2(x[1]); -- } -- g3->indel_len[0] = x[0]; -- g3->indel_len[1] = x[1]; -- x[0] = abs(x[0]); x[1] = abs(x[1]); -- max = (x[0] > x[1]? 
x[0] : x[1]) + 1; -- if (g3->max_len < max) { -- g3->max_len = max; -- kv_roundup32(g3->max_len); -- g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len); -- g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len); -- } -- r += bgzf_read(fp, g3->indel_seq[0], x[0]); -- r += bgzf_read(fp, g3->indel_seq[1], x[1]); -- g3->indel_seq[0][x[0]] = g3->indel_seq[1][x[1]] = 0; -- } -- return r; --} -- --void glf3_view(glfFile fp) --{ -- glf3_header_t *h; -- char *name; -- glf3_t *g3; -- int len; -- h = glf3_header_read(fp); -- g3 = glf3_init1(); -- while ((name = glf3_ref_read(fp, &len)) != 0) { -- int pos = 0; -- while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) { -- pos += g3->offset; -- glf3_view1(name, g3, pos); -- } -- free(name); -- } -- glf3_header_destroy(h); -- glf3_destroy1(g3); --} -- --int glf3_view_main(int argc, char *argv[]) --{ -- glfFile fp; -- if (argc == 1) { -- fprintf(stderr, "Usage: glfview \n"); -- return 1; -- } -- fp = (strcmp(argv[1], "-") == 0)? 
bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r"); -- if (fp == 0) { -- fprintf(stderr, "Fail to open file '%s'\n", argv[1]); -- return 1; -- } -- glf3_view(fp); -- bgzf_close(fp); -- return 0; --} -- --#ifdef GLFVIEW_MAIN --int main(int argc, char *argv[]) --{ -- return glf3_view_main(argc, argv); --} --#endif diff --cc sam/glf.h index 12e5400,12e5400..0000000 deleted file mode 100644,100644 --- a/sam/glf.h +++ /dev/null @@@ -1,56 -1,56 +1,0 @@@ --#ifndef GLF_H_ --#define GLF_H_ -- --typedef struct { -- unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ -- unsigned char max_mapQ; /** maximum mapping quality */ -- unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ -- unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ --} glf1_t; -- --#include --#include "bgzf.h" --typedef BGZF *glfFile; -- --#define GLF3_RTYPE_END 0 --#define GLF3_RTYPE_SUB 1 --#define GLF3_RTYPE_INDEL 2 -- --typedef struct { -- uint8_t ref_base:4, rtype:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ -- uint8_t rms_mapQ; /** RMS mapping quality */ -- uint8_t lk[10]; /** log likelihood ratio, capped at 255 */ -- uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ -- int32_t offset; /** the first base in a chromosome has offset zero. 
*/ -- // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10]) -- int16_t indel_len[2]; -- int32_t max_len; // maximum indel len; will be modified by glf3_read1() -- char *indel_seq[2]; --} glf3_t; -- --typedef struct { -- int32_t l_text; -- uint8_t *text; --} glf3_header_t; -- --#ifdef __cplusplus --extern "C" { --#endif -- --#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t))) --#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0) -- -- glf3_header_t *glf3_header_init(); -- glf3_header_t *glf3_header_read(glfFile fp); -- void glf3_header_write(glfFile fp, const glf3_header_t *h); -- void glf3_header_destroy(glf3_header_t *h); -- char *glf3_ref_read(glfFile fp, int *len); -- void glf3_ref_write(glfFile fp, const char *name, int len); -- int glf3_write1(glfFile fp, const glf3_t *g3); -- int glf3_read1(glfFile fp, glf3_t *g3); -- --#ifdef __cplusplus --} --#endif -- --#endif diff --cc sam/khash.h index 1d583ef,1d583ef..a7e8056 --- a/sam/khash.h +++ b/sam/khash.h @@@ -1,6 -1,6 +1,6 @@@ /* The MIT License -- Copyright (c) 2008 Genome Research Ltd (GRL). ++ Copyright (c) 2008, 2009, 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@@ -23,8 -23,8 +23,6 @@@ SOFTWARE. */ --/* Contact: Heng Li */ -- /* An example: @@@ -49,6 -49,6 +47,14 @@@ int main() */ /* ++ 2011-02-14 (0.2.5): ++ ++ * Allow to declare global functions. 
++ ++ 2009-09-26 (0.2.4): ++ ++ * Improve portability ++ 2008-09-19 (0.2.3): * Corrected the example @@@ -88,17 -88,17 +94,35 @@@ @copyright Heng Li */ --#define AC_VERSION_KHASH_H "0.2.2" ++#define AC_VERSION_KHASH_H "0.2.5" --#include #include #include ++#include ++ ++/* compipler specific configuration */ ++ ++#if UINT_MAX == 0xffffffffu ++typedef unsigned int khint32_t; ++#elif ULONG_MAX == 0xffffffffu ++typedef unsigned long khint32_t; ++#endif ++ ++#if ULONG_MAX == ULLONG_MAX ++typedef unsigned long khint64_t; ++#else ++typedef unsigned long long khint64_t; ++#endif ++ ++#ifdef _MSC_VER ++#define inline __inline ++#endif --typedef uint32_t khint_t; ++typedef khint32_t khint_t; typedef khint_t khiter_t; #define __ac_HASH_PRIME_SIZE 32 --static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = ++static const khint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] = { 0ul, 3ul, 11ul, 23ul, 53ul, 97ul, 193ul, 389ul, 769ul, 1543ul, @@@ -119,17 -119,17 +143,32 @@@ static const double __ac_HASH_UPPER = 0.77; --#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ ++#define KHASH_DECLARE(name, khkey_t, khval_t) \ ++ typedef struct { \ ++ khint_t n_buckets, size, n_occupied, upper_bound; \ ++ khint32_t *flags; \ ++ khkey_t *keys; \ ++ khval_t *vals; \ ++ } kh_##name##_t; \ ++ extern kh_##name##_t *kh_init_##name(); \ ++ extern void kh_destroy_##name(kh_##name##_t *h); \ ++ extern void kh_clear_##name(kh_##name##_t *h); \ ++ extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ ++ extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ ++ extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ ++ extern void kh_del_##name(kh_##name##_t *h, khint_t x); ++ ++#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ typedef struct { \ khint_t n_buckets, size, n_occupied, upper_bound; \ -- uint32_t *flags; \ ++ khint32_t *flags; \ khkey_t *keys; \ khval_t *vals; \ } 
kh_##name##_t; \ -- static inline kh_##name##_t *kh_init_##name() { \ ++ SCOPE kh_##name##_t *kh_init_##name() { \ return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ } \ -- static inline void kh_destroy_##name(kh_##name##_t *h) \ ++ SCOPE void kh_destroy_##name(kh_##name##_t *h) \ { \ if (h) { \ free(h->keys); free(h->flags); \ @@@ -137,14 -137,14 +176,14 @@@ free(h); \ } \ } \ -- static inline void kh_clear_##name(kh_##name##_t *h) \ ++ SCOPE void kh_clear_##name(kh_##name##_t *h) \ { \ if (h && h->flags) { \ -- memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \ ++ memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(khint32_t)); \ h->size = h->n_occupied = 0; \ } \ } \ -- static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ ++ SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ { \ if (h->n_buckets) { \ khint_t inc, k, i, last; \ @@@ -158,9 -158,9 +197,9 @@@ return __ac_iseither(h->flags, i)? h->n_buckets : i; \ } else return 0; \ } \ -- static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ ++ SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ { \ -- uint32_t *new_flags = 0; \ ++ khint32_t *new_flags = 0; \ khint_t j = 1; \ { \ khint_t t = __ac_HASH_PRIME_SIZE - 1; \ @@@ -168,8 -168,8 +207,8 @@@ new_n_buckets = __ac_prime_list[t+1]; \ if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \ else { \ -- new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ -- memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \ ++ new_flags = (khint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ ++ memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(khint32_t)); \ if (h->n_buckets < new_n_buckets) { \ h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ if (kh_is_map) \ @@@ -218,7 -218,7 +257,7 @@@ h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ } \ } \ 
-- static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ ++ SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ { \ khint_t x; \ if (h->n_occupied >= h->upper_bound) { \ @@@ -256,7 -256,7 +295,7 @@@ } else *ret = 0; \ return x; \ } \ -- static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \ ++ SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ { \ if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ __ac_set_isdel_true(h->flags, x); \ @@@ -264,24 -264,24 +303,27 @@@ } \ } ++#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ ++ KHASH_INIT2(name, static inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) ++ /* --- BEGIN OF HASH FUNCTIONS --- */ /*! @function @abstract Integer hash function -- @param key The integer [uint32_t] ++ @param key The integer [khint32_t] @return The hash value [khint_t] */ --#define kh_int_hash_func(key) (uint32_t)(key) ++#define kh_int_hash_func(key) (khint32_t)(key) /*! @function @abstract Integer comparison function */ #define kh_int_hash_equal(a, b) ((a) == (b)) /*! @function @abstract 64-bit integer hash function -- @param key The integer [uint64_t] ++ @param key The integer [khint64_t] @return The hash value [khint_t] */ --#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11) ++#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) /*! @function @abstract 64-bit integer comparison function */ @@@ -442,7 -442,7 +484,7 @@@ static inline khint_t __ac_X31_hash_str @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT(name) \ -- KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) ++ KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) /*! 
@function @abstract Instantiate a hash map containing integer keys @@@ -450,14 -450,14 +492,14 @@@ @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT(name, khval_t) \ -- KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) ++ KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @param name Name of the hash table [symbol] */ #define KHASH_SET_INIT_INT64(name) \ -- KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) ++ KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) /*! @function @abstract Instantiate a hash map containing 64-bit integer keys @@@ -465,7 -465,7 +507,7 @@@ @param khval_t Type of values [type] */ #define KHASH_MAP_INIT_INT64(name, khval_t) \ -- KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) ++ KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) typedef const char *kh_cstr_t; /*! @function diff --cc sam/knetfile.c index 1e2c042,1e2c042..af09146 --- a/sam/knetfile.c +++ b/sam/knetfile.c @@@ -1,6 -1,6 +1,7 @@@ /* The MIT License -- Copyright (c) 2008 Genome Research Ltd (GRL). ++ Copyright (c) 2008 by Genome Research Ltd (GRL). ++ 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@@ -23,11 -23,11 +24,9 @@@ SOFTWARE. */ --/* Contact: Heng Li */ -- /* Probably I will not do socket programming in the next few years and therefore I decide to heavily annotate this file, for Linux and -- Windows as well. -lh3 */ ++ Windows as well. 
-ac */ #include #include @@@ -90,7 -90,7 +89,7 @@@ static int socket_connect(const char *h int on = 1, fd; struct linger lng = { 0, 0 }; -- struct addrinfo hints, *res; ++ struct addrinfo hints, *res = 0; memset(&hints, 0, sizeof(struct addrinfo)); hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; diff --cc sam/kprobaln.c index 5201c1a,5201c1a..894a2ae --- a/sam/kprobaln.c +++ b/sam/kprobaln.c @@@ -161,7 -161,7 +161,7 @@@ int kpa_glocal(const uint8_t *_ref, in double p = 1., Pr1 = 0.; for (i = 0; i <= l_query + 1; ++i) { p *= s[i]; -- if (p < 1e-100) Pr += -4.343 * log(p), p = 1.; ++ if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; } Pr1 += -4.343 * log(p * l_ref * l_query); Pr = (int)(Pr1 + .499); diff --cc sam/kseq.h index 82face0,82face0..0bbc7dc --- a/sam/kseq.h +++ b/sam/kseq.h @@@ -1,6 -1,6 +1,6 @@@ /* The MIT License -- Copyright (c) 2008 Genome Research Ltd (GRL). ++ Copyright (c) 2008, 2009, 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@@ -23,13 -23,13 +23,7 @@@ SOFTWARE. */ --/* Contact: Heng Li */ -- --/* -- 2009-07-16 (lh3): in kstream_t, change "char*" to "unsigned char*" -- */ -- --/* Last Modified: 12APR2009 */ ++/* Last Modified: 18AUG2011 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@@ -94,10 -94,10 +88,10 @@@ typedef struct __kstring_t #endif #define __KS_GETUNTIL(__read, __bufsize) \ -- static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ ++ static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ { \ if (dret) *dret = 0; \ -- str->l = 0; \ ++ str->l = append? 
str->l : 0; \ if (ks->begin >= ks->end && ks->is_eof) return -1; \ for (;;) { \ int i; \ @@@ -132,13 -132,13 +126,15 @@@ break; \ } \ } \ -- if (str->l == 0) { \ ++ if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ } \ str->s[str->l] = '\0'; \ return str->l; \ -- } ++ } \ ++ static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ ++ { return ks_getuntil2(ks, delimiter, str, dret, 0); } #define KSTREAM_INIT(type_t, __read, __bufsize) \ __KS_TYPE(type_t) \ @@@ -171,44 -171,44 +167,45 @@@ -1 end-of-file -2 truncated quality string */ --#define __KSEQ_READ \ -- static int kseq_read(kseq_t *seq) \ -- { \ -- int c; \ -- kstream_t *ks = seq->f; \ ++#define __KSEQ_READ \ ++ static int kseq_read(kseq_t *seq) \ ++ { \ ++ int c; \ ++ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ -- while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ -- if (c == -1) return -1; /* end of file */ \ -- seq->last_char = c; \ -- } /* the first header char has been read */ \ -- seq->comment.l = seq->seq.l = seq->qual.l = 0; \ -- if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \ -- if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \ ++ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ ++ if (c == -1) return -1; /* end of file */ \ ++ seq->last_char = c; \ ++ } /* else: the first header char has been read in the previous call */ \ ++ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ ++ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ ++ if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \ ++ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ ++ seq->seq.m = 256; \ ++ seq->seq.s = (char*)malloc(seq->seq.m); \ ++ } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ -- if (isgraph(c)) { /* printable non-space character */ \ -- if 
(seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \ -- seq->seq.m = seq->seq.l + 2; \ -- kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \ -- seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ -- } \ -- seq->seq.s[seq->seq.l++] = (char)c; \ -- } \ -- } \ ++ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ++ ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \ ++ } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ -- seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ -- if (c != '+') return seq->seq.l; /* FASTA */ \ -- if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \ -- seq->qual.m = seq->seq.m; \ -- seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ -- } \ ++ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ ++ seq->seq.m = seq->seq.l + 2; \ ++ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ ++ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ ++ } \ ++ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ ++ if (c != '+') return seq->seq.l; /* FASTA */ \ ++ if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ ++ seq->qual.m = seq->seq.m; \ ++ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ ++ } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ -- if (c == -1) return -2; /* we should not stop here */ \ -- while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \ -- if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \ -- seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \ ++ if (c == -1) return -2; /* error: no quality string */ \ ++ while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ -- if (seq->seq.l != seq->qual.l) return -2; /* 
qual string is shorter than seq string */ \ -- return seq->seq.l; \ ++ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ ++ return seq->seq.l; \ } #define __KSEQ_TYPE(type_t) \ @@@ -219,7 -219,7 +216,7 @@@ } kseq_t; #define KSEQ_INIT(type_t, __read) \ -- KSTREAM_INIT(type_t, __read, 4096) \ ++ KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ __KSEQ_BASIC(type_t) \ __KSEQ_READ diff --cc sam/kstring.c index 43d524c,43d524c..b2a0dab --- a/sam/kstring.c +++ b/sam/kstring.c @@@ -29,16 -29,16 +29,24 @@@ char *kstrtok(const char *str, const ch const char *p, *start; if (sep) { // set up the table if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished -- aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; -- for (p = sep; *p; ++p) -- aux->tab[*p/64] |= 1ull<<(*p%64); ++ aux->finished = 0; ++ if (sep[1]) { ++ aux->sep = -1; ++ aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; ++ for (p = sep; *p; ++p) aux->tab[*p>>6] |= 1ull<<(*p&0x3f); ++ } else aux->sep = sep[0]; ++ } ++ if (aux->finished) return 0; ++ else if (str) aux->p = str - 1, aux->finished = 0; ++ if (aux->sep < 0) { ++ for (p = start = aux->p + 1; *p; ++p) ++ if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; ++ } else { ++ for (p = start = aux->p + 1; *p; ++p) ++ if (*p == aux->sep) break; } -- if (str) aux->p = str - 1, aux->tab[0] &= ~1ull; -- else if (aux->tab[0]&1) return 0; -- for (p = start = aux->p + 1; *p; ++p) -- if (aux->tab[*p/64]>>(*p%64)&1) break; aux->p = p; // end of token -- if (*p == 0) aux->tab[0] |= 1; // no more tokens ++ if (*p == 0) aux->finished = 1; // no more tokens return (char*)start; } diff --cc sam/kstring.h index c46a62b,c46a62b..ec5775b --- a/sam/kstring.h +++ b/sam/kstring.h @@@ -19,6 -19,6 +19,7 @@@ typedef struct __kstring_t typedef struct { uint64_t tab[4]; ++ int sep, finished; const char *p; // end of the current token } ks_tokaux_t; diff --cc sam/misc/Makefile index 
6c25c78,6c25c78..d2f8bd8 --- a/sam/misc/Makefile +++ b/sam/misc/Makefile @@@ -4,7 -4,7 +4,7 @@@ CFLAGS= -g -Wall -O2 #-m64 #-arch pp CXXFLAGS= $(CFLAGS) DFLAGS= -D_FILE_OFFSET_BITS=64 OBJS= --PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim ++PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim seqtk INCLUDES= -I.. SUBDIRS= . @@@ -27,11 -27,11 +27,11 @@@ lib-recur all-recur clean-recur cleanlo lib: --afs2:afs2.o -- $(CC) $(CFLAGS) -o $@ afs2.o -lm -lz -L.. -lbam ++seqtk:seqtk.o ++ $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz wgsim:wgsim.o -- $(CC) $(CFLAGS) -o $@ wgsim.o -lm ++ $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz md5fa:md5.o md5fa.o md5.h ../kseq.h $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz @@@ -51,8 -51,8 +51,11 @@@ maq2sam-long:maq2sam. md5fa.o:md5.h md5fa.c $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c --afs2.o:afs2.c ../bam.h -- $(CC) $(CFLAGS) -c -I.. -o $@ afs2.c ++seqtk.o:seqtk.c ../khash.h ../kseq.h ++ $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c ++ ++wgsim.o:wgsim.c ../kseq.h ++ $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c cleanlocal: rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a diff --cc sam/misc/export2sam.pl index a2a436c,a2a436c..ec6dacf --- a/sam/misc/export2sam.pl +++ b/sam/misc/export2sam.pl @@@ -1,461 -1,461 +1,545 @@@ --#!/usr/bin/env perl --# --# --# Script to convert GERALD export files to SAM format. --# --# --# --########## License: --# --# The MIT License --# --# Original SAMtools version 0.1.2 copyright (c) 2008-2009 Genome Research Ltd. --# Modifications from version 0.1.2 to 2.0.0 copyright (c) 2010 Illumina, Inc. 
--# --# Permission is hereby granted, free of charge, to any person obtaining a copy --# of this software and associated documentation files (the "Software"), to deal --# in the Software without restriction, including without limitation the rights --# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell --# copies of the Software, and to permit persons to whom the Software is --# furnished to do so, subject to the following conditions: --# --# The above copyright notice and this permission notice shall be included in --# all copies or substantial portions of the Software. --# --# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR --# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, --# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE --# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER --# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, --# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN --# THE SOFTWARE. --# --# --# --########## ChangeLog: --# --# Version: 2.0.0 (15FEB2010) --# Script updated by Illumina in conjunction with CASAVA 1.7.0 release. --# Major changes are as follows: --# - The CIGAR string has been updated to include all gaps from ELANDv2 alignments. --# - The ELAND single read alignment score is always stored in the optional "SM" field --# and the ELAND paired read alignment score is stored in the optional "AS" field --# when it exists. --# - The MAPQ value is set to the higher of the two alignment scores, but no greater --# than 254, i.e. min(254,max(SM,AS)) --# - The SAM "proper pair" bit (0x0002) is now set for read pairs meeting ELAND's --# expected orientation and insert size criteria. --# - The default quality score translation is set for export files which contain --# Phread+64 quality values. 
An option, "--qlogodds", has been added to --# translate quality values from the Solexa+64 format used in export files prior --# to Pipeline 1.3 --# - The export match descriptor is now reverse-complemented when necessary such that --# it always corresponds to the forward strand of the reference, to be consistent --# with other information in the SAM record. It is now written to the optional --# 'XD' field (rather than 'MD') to acknowledge its minor differences from the --# samtools match descriptor (see additional detail below). --# - An option, "--nofilter", has been added to include reads which have failed --# primary analysis quality filtration. Such reads will have the corresponding --# SAM flag bit (0x0200) set. --# - Labels in the export 'contig' field are preserved by setting RNAME to --# "$export_chromosome/$export_contig" when then contig label exists. --# --# --# Contact: lh3 --# Version: 0.1.2 (03JAN2009) --# --# --# --########## Known Conversion Limitations: --# --# - Export records for reads that map to a position < 1 (allowed in export format), are converted --# to unmapped reads in the SAM record. --# - Export records contain the reserved chromosome names: "NM" and "QC". "NM" indicates that the --# aligner could not map the read to the reference sequence set, and "QC" means that the --# aligner did not attempt to map the read due to some technical limitation. Both of these --# alignment types are collapsed to the single unmapped alignment state in the SAM record. --# - The export match descriptor is slightly different than the samtools match descriptor. For --# this reason it is stored in the optional SAM field 'XD' (and not 'MD'). Note that the --# export match descriptor differs from the samtools version in two respects: (1) indels --# are explicitly closed with the '$' character and (2) insertions must be enumerated in --# the match descriptor. 
For example a 35-base read with a two-base insertion is described --# as: 20^2$14 --# --# --# -- --my $version = "2.0.0"; -- --use strict; --use warnings; -- --use File::Spec qw(splitpath); --use Getopt::Long; --use List::Util qw(min max); -- -- --use constant { -- EXPORT_INDEX => 6, -- EXPORT_READNO => 7, -- EXPORT_READ => 8, -- EXPORT_QUAL => 9, -- EXPORT_CHROM => 10, -- EXPORT_CONTIG => 11, -- EXPORT_POS => 12, -- EXPORT_STRAND => 13, -- EXPORT_MD => 14, -- EXPORT_SEMAP => 15, -- EXPORT_PEMAP => 16, -- EXPORT_PASSFILT => 21, --}; -- -- --use constant { -- SAM_QNAME => 0, -- SAM_FLAG => 1, -- SAM_RNAME => 2, -- SAM_POS => 3, -- SAM_MAPQ => 4, -- SAM_CIGAR => 5, -- SAM_MRNM => 6, -- SAM_MPOS => 7, -- SAM_ISIZE => 8, -- SAM_SEQ => 9, -- SAM_QUAL => 10, --}; -- -- --# function prototypes for Richard's code --sub match_desc_to_cigar($); --sub match_desc_frag_length($); --sub reverse_compl_match_descriptor($); --sub write_header($;$;$); -- -- --&export2sam; --exit; -- -- -- -- --sub export2sam { -- -- my $cmdline = $0 . " " . join(" ",@ARGV); -- my $arg_count = scalar @ARGV; -- my @spval = File::Spec->splitpath($0); -- my $progname = $spval[2]; -- -- my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. 
"solexa") quality values -- my $is_nofilter = 0; -- my $read1file; -- my $read2file; -- my $print_version = 0; -- my $help = 0; -- -- my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, -- "nofilter" => \$is_nofilter, -- "read1=s" => \$read1file, -- "read2=s" => \$read2file, -- "version" => \$print_version, -- "help" => \$help ); -- -- my $usage = <) { -- $export_line_count++; -- my (@s1, @s2); -- &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); -- if ($is_paired) { -- my $read2line = <$fh2>; -- if(not $read2line){ -- die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); -- } -- &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); -- -- if (@s1 && @s2) { # then set mate coordinate -- if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ -- die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); -- } -- -- my $isize = 0; -- if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize -- my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; -- my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; -- $isize = $x2 - $x1; -- } -- -- foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ -- my ($sa,$sb,$is) = @{$_}; -- if ($sb->[SAM_RNAME] ne '*') { -- $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? 
"=" : $sb->[SAM_RNAME]; -- $sa->[SAM_MPOS] = $sb->[SAM_POS]; -- $sa->[SAM_ISIZE] = $is; -- $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); -- } else { -- $sa->[SAM_FLAG] |= 0x8; -- } -- } -- } -- } -- print join("\t", @s1), "\n" if (@s1); -- print join("\t", @s2), "\n" if (@s2 && $is_paired); -- } -- close($fh1); -- if($is_paired) { -- while(my $read2line = <$fh2>){ -- $export_line_count++; -- die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); -- } -- close($fh2); -- } --} -- --sub export2sam_aux { -- my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; -- chomp($line); -- my @t = split("\t", $line); -- @$s = (); -- my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); -- return if(not ($isPassFilt or $is_nofilter)); -- # read name -- $s->[SAM_QNAME] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]"; -- # initial flag (will be updated later) -- $s->[SAM_FLAG] = 0; -- if($is_paired) { -- if($t[EXPORT_READNO] != $read_no){ -- die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n"); -- } -- $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); -- } -- $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); -- -- # read & quality -- my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); -- if ($is_export_rev) { # then reverse the sequence and quality -- $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); -- $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; -- $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); -- } else { -- $s->[SAM_SEQ] = $t[EXPORT_READ]; -- $s->[SAM_QUAL] = $t[EXPORT_QUAL]; -- } -- my @convqual = (); -- foreach (unpack('C*', $s->[SAM_QUAL])){ -- my $val=$ct->[$_]; -- if(not defined $val){ -- my $msg="\nERROR: can't interpret export quality value: " . $_ . 
" in read$read_no export file, line: $line_no\n"; -- if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } -- die($msg . "\n"); -- } -- push @convqual,$val; -- } -- -- $s->[SAM_QUAL] = pack('C*',@convqual); # change coding -- -- -- # coor -- my $has_coor = 0; -- $s->[SAM_RNAME] = "*"; -- if ($t[EXPORT_CHROM] eq 'NM' or $t[EXPORT_CHROM] eq 'QC') { -- $s->[SAM_FLAG] |= 0x4; # unmapped -- } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { -- $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? -- push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") -- } elsif ($t[EXPORT_POS] < 1) { -- $s->[SAM_FLAG] |= 0x4; # unmapped -- } else { -- $s->[SAM_RNAME] = $t[EXPORT_CHROM]; -- $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); -- $has_coor = 1; -- } -- $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; -- --# print STDERR "t[14] = " . $t[14] . "\n"; -- my $matchDesc = ''; -- $s->[SAM_CIGAR] = "*"; -- if($has_coor){ -- $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; -- -- if($matchDesc =~ /\^/){ -- # construct CIGAR string using Richard's function -- $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing -- } else { -- $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; -- } -- } -- --# print STDERR "cigar_string = $cigar_string\n"; -- -- $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); -- if($has_coor){ -- my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; -- my $pemap = 0; -- if($is_paired) { -- $pemap = ($t[EXPORT_PEMAP] ne '') ? 
$t[EXPORT_PEMAP] : 0; -- -- # set `proper pair' bit if non-blank, non-zero PE alignment score: -- $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); -- } -- $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); -- } else { -- $s->[SAM_MAPQ] = 0; -- } -- # mate coordinate -- $s->[SAM_MRNM] = '*'; -- $s->[SAM_MPOS] = 0; -- $s->[SAM_ISIZE] = 0; -- # aux -- push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); -- if($has_coor){ -- # The export match descriptor differs slightly from the samtools match descriptor. -- # In order for the converted SAM files to be as compliant as possible, -- # we put the export match descriptor in optional field 'XD' rather than 'MD': -- push(@$s, "XD:Z:$matchDesc"); -- push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); -- push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); -- } --} -- -- -- --# --# the following code is taken from Richard Shaw's sorted2sam.pl file --# --sub reverse_compl_match_descriptor($) --{ --# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; -- my ($match_desc) = @_; -- my $rev_compl_match_desc = reverse($match_desc); -- $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; -- -- # Unreverse the digits of numbers. -- $rev_compl_match_desc = join('', -- map {($_ =~ /\d+/) -- ? join('', reverse(split('', $_))) -- : $_} split(/(\d+)/, -- $rev_compl_match_desc)); -- -- return $rev_compl_match_desc; --} -- -- -- --sub match_desc_to_cigar($) --{ -- my ($match_desc) = @_; -- -- my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); -- my $cigar_str = ''; -- my $cigar_del_ch = 'D'; -- my $cigar_ins_ch = 'I'; -- my $cigar_match_ch = 'M'; -- -- foreach my $match_desc_part (@match_desc_parts) { -- next if (!$match_desc_part); -- -- if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { -- # Deletion -- $cigar_str .= (length($1) . $cigar_del_ch); -- } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { -- # Insertion -- $cigar_str .= ($1 . 
$cigar_ins_ch); -- } else { -- $cigar_str .= (match_desc_frag_length($match_desc_part) -- . $cigar_match_ch); -- } -- } -- -- return $cigar_str; --} -- -- --#------------------------------------------------------------------------------ -- --sub match_desc_frag_length($) -- { -- my ($match_desc_str) = @_; -- my $len = 0; -- -- my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); -- -- foreach my $match_desc_field (@match_desc_fields) { -- next if ($match_desc_field eq ''); -- -- $len += (($match_desc_field =~ /(\d+)/) -- ? $1 : length($match_desc_field)); -- } -- -- return $len; --} -- -- --# argument holds the command line --sub write_header($;$;$) --{ -- my ($progname,$version,$cl) = @_; -- my $complete_header = ""; -- $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; -- -- return $complete_header; --} ++#!/usr/bin/env perl ++# ++# ++# export2sam.pl converts GERALD export files to SAM format. ++# ++# ++# ++########## License: ++# ++# The MIT License ++# ++# Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd. ++# Modified SAMtools work copyright (c) 2010 Illumina, Inc. ++# ++# Permission is hereby granted, free of charge, to any person obtaining a copy ++# of this software and associated documentation files (the "Software"), to deal ++# in the Software without restriction, including without limitation the rights ++# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ++# copies of the Software, and to permit persons to whom the Software is ++# furnished to do so, subject to the following conditions: ++# ++# The above copyright notice and this permission notice shall be included in ++# all copies or substantial portions of the Software. ++# ++# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE ++# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ++# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ++# THE SOFTWARE. ++# ++# ++# ++# ++########## ChangeLog: ++# ++# Version: 2.3.1 (18MAR2011) ++# ++# - Restore file '-' as stdin input. ++# ++# Version: 2.3.0 (24JAN2011) ++# ++# - Add support for export reserved chromosome name "CONTROL", ++# which is translated to optional field "XC:Z:CONTROL". ++# - Check for ".gz" file extension on export files and open ++# these as gzip pipes when the extension is found. ++# ++# Version: 2.2.0 (16NOV2010) ++# ++# - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y ++# - For export records with reserved chromosome name identifiers ++# "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM" ++# to the SAM record, so that these cases can be distinguished ++# from other unmatched reads. ++# ++# Version: 2.1.0 (21SEP2010) ++# ++# - Additional export record error checking. ++# - Convert export records with chromomsome value of "RM" to unmapped ++# SAM records. ++# ++# Version: 2.0.0 (15FEB2010) ++# ++# Script updated by Illumina in conjunction with CASAVA 1.7.0 ++# release. ++# ++# Major changes are as follows: ++# - The CIGAR string has been updated to include all gaps from ++# ELANDv2 alignments. ++# - The ELAND single read alignment score is always stored in the ++# optional "SM" field and the ELAND paired read alignment score ++# is stored in the optional "AS" field when it exists. ++# - The MAPQ value is set to the higher of the two alignment scores, ++# but no greater than 254, i.e. min(254,max(SM,AS)) ++# - The SAM "proper pair" bit (0x0002) is now set for read pairs ++# meeting ELAND's expected orientation and insert size criteria. ++# - The default quality score translation is set for export files ++# which contain Phread+64 quality values. 
An option, ++# "--qlogodds", has been added to translate quality values from ++# the Solexa+64 format used in export files prior to Pipeline ++# 1.3 ++# - The export match descriptor is now reverse-complemented when ++# necessary such that it always corresponds to the forward ++# strand of the reference, to be consistent with other ++# information in the SAM record. It is now written to the ++# optional 'XD' field (rather than 'MD') to acknowledge its ++# minor differences from the samtools match descriptor (see ++# additional detail below). ++# - An option, "--nofilter", has been added to include reads which ++# have failed primary analysis quality filtration. Such reads ++# will have the corresponding SAM flag bit (0x0200) set. ++# - Labels in the export 'contig' field are preserved by setting ++# RNAME to "$export_chromosome/$export_contig" when the contig ++# label exists. ++# ++# ++# Contact: lh3 ++# Version: 0.1.2 (03JAN2009) ++# ++# ++# ++########## Known Conversion Limitations: ++# ++# - Export records for reads that map to a position < 1 (allowed ++# in export format), are converted to unmapped reads in the SAM ++# record. ++# - Export records contain the reserved chromosome names: "NM", ++# "QC","RM" and "CONTROL". "NM" indicates that the aligner could ++# not map the read to the reference sequence set. "QC" means that ++# the aligner did not attempt to map the read due to some ++# technical limitation. "RM" means that the read mapped to a set ++# of 'contaminant' sequences specified in GERALD's RNA-seq ++# workflow. "CONTROL" means that the read is a control. All of ++# these alignment types are collapsed to the single unmapped ++# alignment state in the SAM record, but the optional SAM "XC" ++# field is used to record the original reserved chromosome name of ++# the read for all but the "NM" case. ++# - The export match descriptor is slightly different than the ++# samtools match descriptor. 
For this reason it is stored in the ++# optional SAM field 'XD' (and not 'MD'). Note that the export ++# match descriptor differs from the samtools version in two ++# respects: (1) indels are explicitly closed with the '$' ++# character and (2) insertions must be enumerated in the match ++# descriptor. For example a 35-base read with a two-base insertion ++# is described as: 20^2$14 ++# ++# ++# ++ ++my $version = "2.3.1"; ++ ++use strict; ++use warnings; ++ ++use Getopt::Long; ++use File::Spec; ++use List::Util qw(min max); ++ ++ ++use constant { ++ EXPORT_MACHINE => 0, ++ EXPORT_RUNNO => 1, ++ EXPORT_LANE => 2, ++ EXPORT_TILE => 3, ++ EXPORT_X => 4, ++ EXPORT_Y => 5, ++ EXPORT_INDEX => 6, ++ EXPORT_READNO => 7, ++ EXPORT_READ => 8, ++ EXPORT_QUAL => 9, ++ EXPORT_CHROM => 10, ++ EXPORT_CONTIG => 11, ++ EXPORT_POS => 12, ++ EXPORT_STRAND => 13, ++ EXPORT_MD => 14, ++ EXPORT_SEMAP => 15, ++ EXPORT_PEMAP => 16, ++ EXPORT_PASSFILT => 21, ++ EXPORT_SIZE => 22, ++}; ++ ++ ++use constant { ++ SAM_QNAME => 0, ++ SAM_FLAG => 1, ++ SAM_RNAME => 2, ++ SAM_POS => 3, ++ SAM_MAPQ => 4, ++ SAM_CIGAR => 5, ++ SAM_MRNM => 6, ++ SAM_MPOS => 7, ++ SAM_ISIZE => 8, ++ SAM_SEQ => 9, ++ SAM_QUAL => 10, ++}; ++ ++ ++# function prototypes for Richard's code ++sub match_desc_to_cigar($); ++sub match_desc_frag_length($); ++sub reverse_compl_match_descriptor($); ++sub write_header($;$;$); ++ ++ ++&export2sam; ++exit; ++ ++ ++ ++ ++sub export2sam { ++ ++ my $cmdline = $0 . " " . join(" ",@ARGV); ++ my $arg_count = scalar @ARGV; ++ my $progname = (File::Spec->splitpath($0))[2]; ++ ++ my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. 
"solexa") quality values ++ my $is_nofilter = 0; ++ my $read1file; ++ my $read2file; ++ my $print_version = 0; ++ my $help = 0; ++ ++ my $result = GetOptions( "qlogodds" => \$is_logodds_qvals, ++ "nofilter" => \$is_nofilter, ++ "read1=s" => \$read1file, ++ "read2=s" => \$read2file, ++ "version" => \$print_version, ++ "help" => \$help ); ++ ++ my $usage = <) { ++ $export_line_count++; ++ my (@s1, @s2); ++ &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter); ++ if ($is_paired) { ++ my $read2line = <$fh2>; ++ if(not $read2line){ ++ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n"); ++ } ++ &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter); ++ ++ if (@s1 && @s2) { # then set mate coordinate ++ if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){ ++ die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n"); ++ } ++ ++ my $isize = 0; ++ if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize ++ my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS]; ++ my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS]; ++ $isize = $x2 - $x1; ++ } ++ ++ foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){ ++ my ($sa,$sb,$is) = @{$_}; ++ if ($sb->[SAM_RNAME] ne '*') { ++ $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? 
"=" : $sb->[SAM_RNAME]; ++ $sa->[SAM_MPOS] = $sb->[SAM_POS]; ++ $sa->[SAM_ISIZE] = $is; ++ $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10); ++ } else { ++ $sa->[SAM_FLAG] |= 0x8; ++ } ++ } ++ } ++ } ++ print join("\t", @s1), "\n" if (@s1); ++ print join("\t", @s2), "\n" if (@s2 && $is_paired); ++ } ++ close($fh1); ++ if($is_paired) { ++ while(my $read2line = <$fh2>){ ++ $export_line_count++; ++ die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n"); ++ } ++ close($fh2); ++ } ++} ++ ++sub export2sam_aux { ++ my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_; ++ chomp($line); ++ my @t = split("\t", $line); ++ if(scalar(@t) < EXPORT_SIZE) { ++ my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n"; ++ $msg.="\t...erroneous export record:\n" . $line . "\n\n"; ++ die($msg); ++ } ++ @$s = (); ++ my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y'); ++ return if(not ($isPassFilt or $is_nofilter)); ++ # read name ++ my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : ""); ++ $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]), ++ int($t[EXPORT_X]), int($t[EXPORT_Y])); ++ # initial flag (will be updated later) ++ $s->[SAM_FLAG] = 0; ++ if($is_paired) { ++ if($t[EXPORT_READNO] != $read_no){ ++ die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . 
" on line: $line_no\n\n"); ++ } ++ $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no); ++ } ++ $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt); ++ ++ # read & quality ++ my $is_export_rev = ($t[EXPORT_STRAND] eq 'R'); ++ if ($is_export_rev) { # then reverse the sequence and quality ++ $s->[SAM_SEQ] = reverse($t[EXPORT_READ]); ++ $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/; ++ $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]); ++ } else { ++ $s->[SAM_SEQ] = $t[EXPORT_READ]; ++ $s->[SAM_QUAL] = $t[EXPORT_QUAL]; ++ } ++ my @convqual = (); ++ foreach (unpack('C*', $s->[SAM_QUAL])){ ++ my $val=$ct->[$_]; ++ if(not defined $val){ ++ my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n"; ++ if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; } ++ die($msg . "\n"); ++ } ++ push @convqual,$val; ++ } ++ ++ $s->[SAM_QUAL] = pack('C*',@convqual); # change coding ++ ++ ++ # coor ++ my $has_coor = 0; ++ $s->[SAM_RNAME] = "*"; ++ if (($t[EXPORT_CHROM] eq 'NM') or ++ ($t[EXPORT_CHROM] eq 'QC') or ++ ($t[EXPORT_CHROM] eq 'RM') or ++ ($t[EXPORT_CHROM] eq 'CONTROL')) { ++ $s->[SAM_FLAG] |= 0x4; # unmapped ++ push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM'); ++ } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) { ++ $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case? ++ push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3") ++ } elsif ($t[EXPORT_POS] < 1) { ++ $s->[SAM_FLAG] |= 0x4; # unmapped ++ } else { ++ $s->[SAM_RNAME] = $t[EXPORT_CHROM]; ++ $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne ''); ++ $has_coor = 1; ++ } ++ $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0; ++ ++# print STDERR "t[14] = " . $t[14] . "\n"; ++ my $matchDesc = ''; ++ $s->[SAM_CIGAR] = "*"; ++ if($has_coor){ ++ $matchDesc = ($is_export_rev) ? 
reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD]; ++ ++ if($matchDesc =~ /\^/){ ++ # construct CIGAR string using Richard's function ++ $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing ++ } else { ++ $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M"; ++ } ++ } ++ ++# print STDERR "cigar_string = $cigar_string\n"; ++ ++ $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev); ++ if($has_coor){ ++ my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0; ++ my $pemap = 0; ++ if($is_paired) { ++ $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0; ++ ++ # set `proper pair' bit if non-blank, non-zero PE alignment score: ++ $s->[SAM_FLAG] |= 0x02 if ($pemap > 0); ++ } ++ $s->[SAM_MAPQ] = min(254,max($semap,$pemap)); ++ } else { ++ $s->[SAM_MAPQ] = 0; ++ } ++ # mate coordinate ++ $s->[SAM_MRNM] = '*'; ++ $s->[SAM_MPOS] = 0; ++ $s->[SAM_ISIZE] = 0; ++ # aux ++ push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]); ++ if($has_coor){ ++ # The export match descriptor differs slightly from the samtools match descriptor. ++ # In order for the converted SAM files to be as compliant as possible, ++ # we put the export match descriptor in optional field 'XD' rather than 'MD': ++ push(@$s, "XD:Z:$matchDesc"); ++ push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne ''); ++ push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne '')); ++ } ++} ++ ++ ++ ++# ++# the following code is taken from Richard Shaw's sorted2sam.pl file ++# ++sub reverse_compl_match_descriptor($) ++{ ++# print "\nREVERSING THE MATCH DESCRIPTOR!\n"; ++ my ($match_desc) = @_; ++ my $rev_compl_match_desc = reverse($match_desc); ++ $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/; ++ ++ # Unreverse the digits of numbers. ++ $rev_compl_match_desc = join('', ++ map {($_ =~ /\d+/) ++ ? 
join('', reverse(split('', $_))) ++ : $_} split(/(\d+)/, ++ $rev_compl_match_desc)); ++ ++ return $rev_compl_match_desc; ++} ++ ++ ++ ++sub match_desc_to_cigar($) ++{ ++ my ($match_desc) = @_; ++ ++ my @match_desc_parts = split(/(\^.*?\$)/, $match_desc); ++ my $cigar_str = ''; ++ my $cigar_del_ch = 'D'; ++ my $cigar_ins_ch = 'I'; ++ my $cigar_match_ch = 'M'; ++ ++ foreach my $match_desc_part (@match_desc_parts) { ++ next if (!$match_desc_part); ++ ++ if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) { ++ # Deletion ++ $cigar_str .= (length($1) . $cigar_del_ch); ++ } elsif ($match_desc_part =~ /^\^(\d+)\$$/) { ++ # Insertion ++ $cigar_str .= ($1 . $cigar_ins_ch); ++ } else { ++ $cigar_str .= (match_desc_frag_length($match_desc_part) ++ . $cigar_match_ch); ++ } ++ } ++ ++ return $cigar_str; ++} ++ ++ ++#------------------------------------------------------------------------------ ++ ++sub match_desc_frag_length($) ++ { ++ my ($match_desc_str) = @_; ++ my $len = 0; ++ ++ my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str); ++ ++ foreach my $match_desc_field (@match_desc_fields) { ++ next if ($match_desc_field eq ''); ++ ++ $len += (($match_desc_field =~ /(\d+)/) ++ ? $1 : length($match_desc_field)); ++ } ++ ++ return $len; ++} ++ ++ ++# argument holds the command line ++sub write_header($;$;$) ++{ ++ my ($progname,$version,$cl) = @_; ++ my $complete_header = ""; ++ $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n"; ++ ++ return $complete_header; ++} diff --cc sam/misc/wgsim.c index 7b5f095,7b5f095..b9c513c --- a/sam/misc/wgsim.c +++ b/sam/misc/wgsim.c @@@ -1,6 -1,6 +1,7 @@@ /* The MIT License Copyright (c) 2008 Genome Research Ltd (GRL). ++ 2011 Heng Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@@ -23,11 -23,11 +24,8 @@@ SOFTWARE. 
*/ --/* Contact: Heng Li */ -- /* This program is separated from maq's read simulator with Colin -- * Hercus' modification to allow longer indels. Colin is the chief -- * developer of novoalign. */ ++ * Hercus' modification to allow longer indels. */ #include #include @@@ -38,8 -38,8 +36,11 @@@ #include #include #include ++#include ++#include "kseq.h" ++KSEQ_INIT(gzFile, gzread) --#define PACKAGE_VERSION "0.2.3" ++#define PACKAGE_VERSION "0.3.0" const uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@@ -60,8 -60,8 +61,6 @@@ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; --const int nst_color_space_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4}; -- /* Simple normal random number generator, copied from genran.c */ double ran_normal() @@@ -85,78 -85,78 +84,6 @@@ } } --/* FASTA parser, copied from seq.c */ -- --typedef struct { -- int l, m; /* length and maximum buffer size */ -- unsigned char *s; /* sequence */ --} seq_t; -- --#define INIT_SEQ(seq) (seq).s = 0; (seq).l = (seq).m = 0 -- --static int SEQ_BLOCK_SIZE = 512; -- --void seq_set_block_size(int size) --{ -- SEQ_BLOCK_SIZE = size; --} -- --int seq_read_fasta(FILE *fp, seq_t *seq, char *locus, char *comment) --{ -- int c, l, max; -- char *p; -- -- c = 0; -- while (!feof(fp) && fgetc(fp) != '>'); -- if (feof(fp)) return -1; -- p = locus; -- while (!feof(fp) && (c = fgetc(fp)) != ' ' && c != '\t' && c != '\n') -- if (c != '\r') *p++ = c; -- *p = '\0'; -- if (comment) { -- p = comment; -- if (c != '\n') { -- while (!feof(fp) && ((c = fgetc(fp)) == ' ' || c == '\t')); -- if (c != '\n') { -- *p++ = c; -- while (!feof(fp) && (c = fgetc(fp)) != '\n') -- if (c != '\r') *p++ = c; -- } -- } -- *p = '\0'; -- } else if (c != '\n') while (!feof(fp) && fgetc(fp) != '\n'); -- l = 0; max = seq->m; -- while (!feof(fp) && (c = fgetc(fp)) != '>') { -- if (isalpha(c) || c == '-' || c == '.') { -- if (l + 1 >= max) { -- max += SEQ_BLOCK_SIZE; -- seq->s = (unsigned char*)realloc(seq->s, 
sizeof(char) * max); -- } -- seq->s[l++] = (unsigned char)c; -- } -- } -- if (c == '>') ungetc(c,fp); -- seq->s[l] = 0; -- seq->m = max; seq->l = l; -- return l; --} -- --/* Error-checking open, copied from utils.c */ -- --#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) -- --FILE *err_xopen_core(const char *func, const char *fn, const char *mode) --{ -- FILE *fp = 0; -- if (strcmp(fn, "-") == 0) -- return (strstr(mode, "r"))? stdin : stdout; -- if ((fp = fopen(fn, mode)) == 0) { -- fprintf(stderr, "[%s] fail to open file '%s'. Abort!\n", func, fn); -- abort(); -- } -- return fp; --} -- /* wgsim */ enum muttype_t {NOCHANGE = 0, INSERT = 0x1000, SUBSTITUTE = 0xe000, DELETE = 0xf000}; @@@ -170,24 -170,24 +97,23 @@@ typedef struct static double ERR_RATE = 0.02; static double MUT_RATE = 0.001; --static double INDEL_FRAC = 0.1; ++static double INDEL_FRAC = 0.15; static double INDEL_EXTEND = 0.3; --static int IS_SOLID = 0; --static int SHOW_MM_INFO = 1; ++static double MAX_N_RATIO = 0.1; --void maq_mut_diref(const seq_t *seq, int is_hap, mutseq_t *hap1, mutseq_t *hap2) ++void wgsim_mut_diref(const kseq_t *ks, int is_hap, mutseq_t *hap1, mutseq_t *hap2) { int i, deleting = 0; mutseq_t *ret[2]; ret[0] = hap1; ret[1] = hap2; -- ret[0]->l = seq->l; ret[1]->l = seq->l; -- ret[0]->m = seq->m; ret[1]->m = seq->m; -- ret[0]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); -- ret[1]->s = (mut_t *)calloc(seq->m, sizeof(mut_t)); -- for (i = 0; i != seq->l; ++i) { ++ ret[0]->l = ks->seq.l; ret[1]->l = ks->seq.l; ++ ret[0]->m = ks->seq.m; ret[1]->m = ks->seq.m; ++ ret[0]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); ++ ret[1]->s = (mut_t *)calloc(ks->seq.m, sizeof(mut_t)); ++ for (i = 0; i != ks->seq.l; ++i) { int c; -- c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)seq->s[i]]; ++ c = ret[0]->s[i] = ret[1]->s[i] = (mut_t)nst_nt4_table[(int)ks->seq.s[i]]; if (deleting) { if (drand48() < INDEL_EXTEND) { if (deleting & 1) ret[0]->s[i] |= DELETE; @@@ -230,12 -230,12 
+156,12 @@@ } } } --void maq_print_mutref(const char *name, const seq_t *seq, mutseq_t *hap1, mutseq_t *hap2) ++void wgsim_print_mutref(const char *name, const kseq_t *ks, mutseq_t *hap1, mutseq_t *hap2) { int i; -- for (i = 0; i != seq->l; ++i) { ++ for (i = 0; i != ks->seq.l; ++i) { int c[3]; -- c[0] = nst_nt4_table[(int)seq->s[i]]; ++ c[0] = nst_nt4_table[(int)ks->seq.s[i]]; c[1] = hap1->s[i]; c[2] = hap2->s[i]; if (c[0] >= 4) continue; if ((c[1] & mutmsk) != NOCHANGE || (c[2] & mutmsk) != NOCHANGE) { @@@ -248,8 -248,8 +174,9 @@@ } else if (((c[1] & mutmsk) >> 12) <= 5) { // ins printf("-\t"); int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; -- while(n > 0) { ++ while (n > 0) { putchar("ACGTN"[ins & 0x3]); ++ ins >>= 2; n--; } printf("\t-\n"); @@@ -266,6 -266,6 +193,7 @@@ int n = (c[1]&mutmsk) >> 12, ins = c[1] >> 4; while (n > 0) { putchar("ACGTN"[ins & 0x3]); ++ ins >>= 2; n--; } printf("\t+\n"); @@@ -284,46 -284,46 +212,51 @@@ } } --void wgsim_core(FILE *fpout1, FILE *fpout2, FILE *fp_fa, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) ++void wgsim_core(FILE *fpout1, FILE *fpout2, const char *fn, int is_hap, uint64_t N, int dist, int std_dev, int size_l, int size_r) { -- seq_t seq; ++ kseq_t *ks; mutseq_t rseq[2]; ++ gzFile fp_fa; uint64_t tot_len, ii; int i, l, n_ref; -- char name[256], *qstr; -- int size[2], Q; ++ char *qstr; ++ int size[2], Q, max_size; uint8_t *tmp_seq[2]; mut_t *target; -- INIT_SEQ(seq); -- srand48(time(0)); -- seq_set_block_size(0x1000000); l = size_l > size_r? size_l : size_r; qstr = (char*)calloc(l+1, 1); tmp_seq[0] = (uint8_t*)calloc(l+2, 1); tmp_seq[1] = (uint8_t*)calloc(l+2, 1); size[0] = size_l; size[1] = size_r; ++ max_size = size_l > size_r? size_l : size_r; Q = (ERR_RATE == 0.0)? 
'I' : (int)(-10.0 * log(ERR_RATE) / log(10.0) + 0.499) + 33; ++ fp_fa = gzopen(fn, "r"); ++ ks = kseq_init(fp_fa); tot_len = n_ref = 0; -- while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { ++ fprintf(stderr, "[%s] calculating the total length of the reference sequence...\n", __func__); ++ while ((l = kseq_read(ks)) >= 0) { tot_len += l; ++n_ref; } -- fprintf(stderr, "[wgsim_core] %d sequences, total length: %llu\n", n_ref, (long long)tot_len); -- rewind(fp_fa); ++ fprintf(stderr, "[%s] %d sequences, total length: %llu\n", __func__, n_ref, (long long)tot_len); ++ kseq_destroy(ks); ++ gzclose(fp_fa); -- while ((l = seq_read_fasta(fp_fa, &seq, name, 0)) >= 0) { ++ fp_fa = gzopen(fn, "r"); ++ ks = kseq_init(fp_fa); ++ while ((l = kseq_read(ks)) >= 0) { uint64_t n_pairs = (uint64_t)((long double)l / tot_len * N + 0.5); if (l < dist + 3 * std_dev) { -- fprintf(stderr, "[wgsim_core] kkip sequence '%s' as it is shorter than %d!\n", name, dist + 3 * std_dev); ++ fprintf(stderr, "[%s] skip sequence '%s' as it is shorter than %d!\n", __func__, ks->name.s, dist + 3 * std_dev); continue; } // generate mutations and print them out -- maq_mut_diref(&seq, is_hap, rseq, rseq+1); -- maq_print_mutref(name, &seq, rseq, rseq+1); ++ wgsim_mut_diref(ks, is_hap, rseq, rseq+1); ++ wgsim_print_mutref(ks->name.s, ks, rseq, rseq+1); for (ii = 0; ii != n_pairs; ++ii) { // the core loop double ran; @@@ -335,8 -335,8 +268,9 @@@ ran = ran_normal(); ran = ran * std_dev + dist; d = (int)(ran + 0.5); ++ d = d > max_size? 
d : max_size; pos = (int)((l - d + 1) * drand48()); -- } while (pos < 0 || pos >= seq.l || pos + d - 1 >= seq.l); ++ } while (pos < 0 || pos >= ks->seq.l || pos + d - 1 >= ks->seq.l); // flip or not if (drand48() < 0.5) { @@@ -353,7 -353,7 +287,7 @@@ n_sub[0] = n_sub[1] = n_indel[0] = n_indel[1] = n_err[0] = n_err[1] = 0; #define __gen_read(x, start, iter) do { \ -- for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < seq.l && k < s[x]; iter) { \ ++ for (i = (start), k = 0, ext_coor[x] = -10; i >= 0 && i < ks->seq.l && k < s[x]; iter) { \ int c = target[i], mut_type = c & mutmsk; \ if (ext_coor[x] < 0) { \ if (mut_type != NOCHANGE && mut_type != SUBSTITUTE) continue; \ @@@ -374,33 -374,33 +308,9 @@@ if (k != s[x]) ext_coor[x] = -10; \ } while (0) -- if (!IS_SOLID) { -- __gen_read(0, pos, ++i); -- __gen_read(1, pos + d - 1, --i); -- for (k = 0; k < s[1]; ++k) tmp_seq[1][k] = tmp_seq[1][k] < 4? 3 - tmp_seq[1][k] : 4; // complement -- } else { -- int c1, c2, c; -- ++s[0]; ++s[1]; // temporarily increase read length by 1 -- if (is_flip) { // RR pair -- __gen_read(0, pos + s[0], --i); -- __gen_read(1, pos + d - 1, --i); -- } else { // FF pair -- __gen_read(0, pos, ++i); -- __gen_read(1, pos + d - 1 - s[1], ++i); -- ++ext_coor[0]; ++ext_coor[1]; -- } -- // change to color sequence: (0,1,2,3) -> (A,C,G,T) -- for (j = 0; j < 2; ++j) { -- c1 = tmp_seq[j][0]; -- for (i = 1; i < s[j]; ++i) { -- c2 = tmp_seq[j][i]; -- c = (c1 >= 4 || c2 >= 4)? 
4 : nst_color_space_table[(1<= 4) c = 4; // actually c should be never larger than 4 if everything is correct -- else if (drand48() < ERR_RATE) { -- c = (c + (int)(drand48() * 3.0 + 1)) & 3; ++ if (c >= 4) { // actually c should be never larger than 4 if everything is correct ++ c = 4; ++ ++n_n; ++ } else if (drand48() < ERR_RATE) { ++ // c = (c + (int)(drand48() * 3.0 + 1)) & 3; // random sequencing errors ++ c = (c + 1) & 3; // recurrent sequencing errors ++n_err[j]; } tmp_seq[j][i] = c; } ++ if ((double)n_n / s[j] > MAX_N_RATIO) break; ++ } ++ if (j < 2) { // too many ambiguous bases on one of the reads ++ --ii; ++ continue; } // print for (j = 0; j < 2; ++j) { for (i = 0; i < s[j]; ++i) qstr[i] = Q; qstr[i] = 0; -- if (SHOW_MM_INFO) { -- fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", name, ext_coor[0]+1, ext_coor[1]+1, -- n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], -- (long long)ii, j==0? is_flip+1 : 2-is_flip); -- } else { -- fprintf(fpo[j], "@%s_%u_%u_%llx/%d %d:%d:%d_%d:%d:%d\n", name, ext_coor[0]+1, ext_coor[1]+1, -- (long long)ii, j==0? is_flip+1 : 2-is_flip, -- n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1]); -- } ++ fprintf(fpo[j], "@%s_%u_%u_%d:%d:%d_%d:%d:%d_%llx/%d\n", ks->name.s, ext_coor[0]+1, ext_coor[1]+1, ++ n_err[0], n_sub[0], n_indel[0], n_err[1], n_sub[1], n_indel[1], ++ (long long)ii, j==0? 
is_flip+1 : 2-is_flip); for (i = 0; i < s[j]; ++i) fputc("ACGTN"[(int)tmp_seq[j][i]], fpo[j]); fprintf(fpo[j], "\n+\n%s\n", qstr); @@@ -439,7 -439,7 +352,9 @@@ } free(rseq[0].s); free(rseq[1].s); } -- free(seq.s); free(qstr); ++ kseq_destroy(ks); ++ gzclose(fp_fa); ++ free(qstr); free(tmp_seq[0]); free(tmp_seq[1]); } @@@ -459,11 -459,11 +374,9 @@@ static int simu_usage( fprintf(stderr, " -r FLOAT rate of mutations [%.4f]\n", MUT_RATE); fprintf(stderr, " -R FLOAT fraction of indels [%.2f]\n", INDEL_FRAC); fprintf(stderr, " -X FLOAT probability an indel is extended [%.2f]\n", INDEL_EXTEND); -- fprintf(stderr, " -c generate reads in color space (SOLiD reads)\n"); -- fprintf(stderr, " -C show mismatch info in comment rather than read name\n"); ++ fprintf(stderr, " -S INT seed for random generator [-1]\n"); fprintf(stderr, " -h haplotype mode\n"); fprintf(stderr, "\n"); -- fprintf(stderr, "Note: For SOLiD reads, the first read is F3 and the second is R3.\n\n"); return 1; } @@@ -471,11 -471,11 +384,12 @@@ int main(int argc, char *argv[] { int64_t N; int dist, std_dev, c, size_l, size_r, is_hap = 0; -- FILE *fpout1, *fpout2, *fp_fa; ++ FILE *fpout1, *fpout2; ++ int seed = -1; N = 1000000; dist = 500; std_dev = 50; size_l = size_r = 70; -- while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:cC")) >= 0) { ++ while ((c = getopt(argc, argv, "e:d:s:N:1:2:r:R:hX:S:")) >= 0) { switch (c) { case 'd': dist = atoi(optarg); break; case 's': std_dev = atoi(optarg); break; @@@ -486,17 -486,17 +400,20 @@@ case 'r': MUT_RATE = atof(optarg); break; case 'R': INDEL_FRAC = atof(optarg); break; case 'X': INDEL_EXTEND = atof(optarg); break; -- case 'c': IS_SOLID = 1; break; -- case 'C': SHOW_MM_INFO = 0; break; ++ case 'S': seed = atoi(optarg); break; case 'h': is_hap = 1; break; } } if (argc - optind < 3) return simu_usage(); -- fp_fa = (strcmp(argv[optind+0], "-") == 0)? 
stdin : xopen(argv[optind+0], "r"); -- fpout1 = xopen(argv[optind+1], "w"); -- fpout2 = xopen(argv[optind+2], "w"); -- wgsim_core(fpout1, fpout2, fp_fa, is_hap, N, dist, std_dev, size_l, size_r); ++ fpout1 = fopen(argv[optind+1], "w"); ++ fpout2 = fopen(argv[optind+2], "w"); ++ if (!fpout1 || !fpout2) { ++ fprintf(stderr, "[wgsim] file open error\n"); ++ return 1; ++ } ++ srand48(seed > 0? seed : time(0)); ++ wgsim_core(fpout1, fpout2, argv[optind], is_hap, N, dist, std_dev, size_l, size_r); -- fclose(fpout1); fclose(fpout2); fclose(fp_fa); ++ fclose(fpout1); fclose(fpout2); return 0; } diff --cc sam/sam.c index ecdee02,ecdee02..f026bc8 --- a/sam/sam.c +++ b/sam/sam.c @@@ -40,9 -40,9 +40,9 @@@ samfile_t *samopen(const char *fn, cons { samfile_t *fp; fp = (samfile_t*)calloc(1, sizeof(samfile_t)); -- if (mode[0] == 'r') { // read ++ if (strchr(mode, 'r')) { // read fp->type |= TYPE_READ; -- if (mode[1] == 'b') { // binary ++ if (strchr(mode, 'b')) { // binary fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp->x.bam == 0) goto open_err_ret; @@@ -59,15 -59,15 +59,19 @@@ append_header_text(fp->header, textheader->text, textheader->l_text); bam_header_destroy(textheader); } -- if (fp->header->n_targets == 0) ++ if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); -- } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); ++ } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } -- } else if (mode[0] == 'w') { // write ++ } else if (strchr(mode, 'w')) { // write fp->header = bam_header_dup((const bam_header_t*)aux); -- if (mode[1] == 'b') { // binary ++ if (strchr(mode, 'b')) { // binary char bmode[3]; -- bmode[0] = 'w'; bmode[1] = strstr(mode, "u")? 
'u' : 0; bmode[2] = 0; ++ int i, compress_level = -1; ++ for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break; ++ if (mode[i]) compress_level = mode[i] - '0'; ++ if (strchr(mode, 'u')) compress_level = 0; ++ bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0; fp->type |= TYPE_BAM; fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode); if (fp->x.bam == 0) goto open_err_ret; @@@ -76,11 -76,11 +80,11 @@@ // open file fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout; if (fp->x.tamr == 0) goto open_err_ret; -- if (strstr(mode, "X")) fp->type |= BAM_OFSTR<<2; -- else if (strstr(mode, "x")) fp->type |= BAM_OFHEX<<2; ++ if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; ++ else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; else fp->type |= BAM_OFDEC<<2; // write header -- if (strstr(mode, "h")) { ++ if (strchr(mode, 'h')) { int i; bam_header_t *alt; // parse the header text @@@ -89,10 -89,10 +93,10 @@@ sam_header_parse(alt); alt->l_text = 0; alt->text = 0; // check if there are @SQ lines in the header -- fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); ++ fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL if (alt->n_targets) { // then write the header text without dumping ->target_{name,len} -- if (alt->n_targets != fp->header->n_targets) -- fprintf(stderr, "[samopen] inconsistent number of target sequences.\n"); ++ if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1) ++ fprintf(stderr, "[samopen] inconsistent number of target sequences. 
Output the text header.\n"); } else { // then dump ->target_{name,len} for (i = 0; i < fp->header->n_targets; ++i) fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]); @@@ -164,7 -164,7 +168,7 @@@ char *samfaipath(const char *fn_ref if (access(fn_ref, R_OK) == -1) { fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref); } else { -- fprintf(stderr, "[samfaipath] build FASTA index...\n"); ++ if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n"); if (fai_build(fn_ref) == -1) { fprintf(stderr, "[samfaipath] fail to build FASTA index.\n"); free(fn_list); fn_list = 0; diff --cc sam/sam_header.c index 05d75de,05d75de..f4c8a3b --- a/sam/sam_header.c +++ b/sam/sam_header.c @@@ -38,7 -38,7 +38,7 @@@ const char *o_sq_tags[] = {"AS","M5","U const char *r_sq_tags[] = {"SN","LN",NULL}; const char *u_sq_tags[] = {"SN",NULL}; --const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL}; ++const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; const char *r_rg_tags[] = {"ID",NULL}; const char *u_rg_tags[] = {"ID",NULL}; @@@ -563,6 -563,6 +563,7 @@@ void *sam_header_parse2(const char *hea const char *text; char *buf=NULL; size_t nbuf = 0; ++ int tovalidate = 0; if ( !headerText ) return 0; @@@ -571,7 -571,7 +572,7 @@@ while ( (text=nextline(&buf, &nbuf, text)) ) { hline = sam_header_line_parse(buf); -- if ( hline && sam_header_line_validate(hline) ) ++ if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) // With too many (~250,000) reference sequences the header parsing was too slow with list_append. 
hlines = list_append_to_end(hlines, hline); else diff --cc sam/sam_view.c index eb69449,eb69449..efda4e8 --- a/sam/sam_view.c +++ b/sam/sam_view.c @@@ -6,6 -6,6 +6,7 @@@ #include "sam_header.h" #include "sam.h" #include "faidx.h" ++#include "kstring.h" #include "khash.h" KHASH_SET_INIT_STR(rg) @@@ -18,32 -18,32 +19,28 @@@ typedef struct typedef khash_t(rg) *rghash_t; --rghash_t g_rghash = 0; ++// FIXME: we'd better use no global variables... ++static rghash_t g_rghash = 0; static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; ++static float g_subsam = -1; static char *g_library, *g_rg; --static int g_sol2sanger_tbl[128]; ++static void *g_bed; --static void sol2sanger(bam1_t *b) --{ -- int l; -- uint8_t *qual = bam1_qual(b); -- if (g_sol2sanger_tbl[30] == 0) { -- for (l = 0; l != 128; ++l) { -- g_sol2sanger_tbl[l] = (int)(10.0 * log(1.0 + pow(10.0, (l - 64 + 33) / 10.0)) / log(10.0) + .499); -- if (g_sol2sanger_tbl[l] >= 93) g_sol2sanger_tbl[l] = 93; -- } -- } -- for (l = 0; l < b->core.l_qseq; ++l) { -- int q = qual[l]; -- if (q > 127) q = 127; -- qual[l] = g_sol2sanger_tbl[q]; -- } --} ++void *bed_read(const char *fn); ++void bed_destroy(void *_h); ++int bed_overlap(const void *_h, const char *chr, int beg, int end); static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) { if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; ++ if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) ++ return 1; ++ if (g_subsam > 0.) 
{ ++ int x = (int)(g_subsam + .499); ++ uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + x; ++ if (k%1024 / 1024.0 >= g_subsam - x) return 1; ++ } if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); if (s) { @@@ -61,6 -61,6 +58,37 @@@ return 0; } ++static char *drop_rg(char *hdtxt, rghash_t h, int *len) ++{ ++ char *p = hdtxt, *q, *r, *s; ++ kstring_t str; ++ memset(&str, 0, sizeof(kstring_t)); ++ while (1) { ++ int toprint = 0; ++ q = strchr(p, '\n'); ++ if (q == 0) q = p + strlen(p); ++ if (q - p < 3) break; // the line is too short; then stop ++ if (strncmp(p, "@RG\t", 4) == 0) { ++ int c; ++ khint_t k; ++ if ((r = strstr(p, "\tID:")) != 0) { ++ r += 4; ++ for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s); ++ c = *s; *s = '\0'; ++ k = kh_get(rg, h, r); ++ *s = c; ++ if (k != kh_end(h)) toprint = 1; ++ } ++ } else toprint = 1; ++ if (toprint) { ++ kputsn(p, q - p, &str); kputc('\n', &str); ++ } ++ p = q + 1; ++ } ++ *len = str.l; ++ return str.s; ++} ++ // callback function for bam_fetch() that prints nonskipped records static int view_func(const bam1_t *b, void *data) { @@@ -82,7 -82,7 +110,7 @@@ static int usage(int is_long_help) int main_samview(int argc, char *argv[]) { -- int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0; ++ int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0; int of_type = BAM_OFDEC, is_long_help = 0; int count = 0; samfile_t *in = 0, *out = 0; @@@ -90,10 -90,10 +118,10 @@@ /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); -- while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { ++ while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:")) >= 0) { switch (c) { ++ case 's': g_subsam = atof(optarg); break; case 'c': is_count = 1; break; -- case 'C': slx2sngr = 1; break; case 'S': is_bamin = 0; break; case 'b': is_bamout = 1; break; 
case 't': fn_list = strdup(optarg); is_bamin = 0; break; @@@ -103,8 -103,8 +131,10 @@@ case 'f': g_flag_on = strtol(optarg, 0, 0); break; case 'F': g_flag_off = strtol(optarg, 0, 0); break; case 'q': g_min_mapQ = atoi(optarg); break; -- case 'u': is_uncompressed = 1; break; ++ case 'u': compress_level = 0; break; ++ case '1': compress_level = 1; break; case 'l': g_library = strdup(optarg); break; ++ case 'L': g_bed = bed_read(optarg); break; case 'r': g_rg = strdup(optarg); break; case 'R': fn_rg = strdup(optarg); break; case 'x': of_type = BAM_OFHEX; break; @@@ -114,7 -114,7 +144,7 @@@ default: return usage(is_long_help); } } -- if (is_uncompressed) is_bamout = 1; ++ if (compress_level >= 0) is_bamout = 1; if (is_header_only) is_header = 1; if (is_bamout) strcat(out_mode, "b"); else { @@@ -123,7 -123,7 +153,11 @@@ } if (is_bamin) strcat(in_mode, "b"); if (is_header) strcat(out_mode, "h"); -- if (is_uncompressed) strcat(out_mode, "u"); ++ if (compress_level >= 0) { ++ char tmp[2]; ++ tmp[0] = compress_level + '0'; tmp[1] = '\0'; ++ strcat(out_mode, tmp); ++ } if (argc == optind) return usage(is_long_help); // potential memory leak... // read the list of read groups @@@ -151,6 -151,6 +185,14 @@@ ret = 1; goto view_end; } ++ if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... ++ char *tmp; ++ int l; ++ tmp = drop_rg(in->header->text, g_rghash, &l); ++ free(in->header->text); ++ in->header->text = tmp; ++ in->header->l_text = l; ++ } if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? 
fn_out : "standard output"); ret = 1; @@@ -163,7 -163,7 +205,6 @@@ int r; while ((r = samread(in, b)) >= 0) { // read one alignment from `in' if (!__g_skip_aln(in->header, b)) { -- if (slx2sngr) sol2sanger(b); if (!is_count) samwrite(out, b); // write the alignment to `out' count++; } @@@ -210,6 -210,6 +251,7 @@@ view_end } // close files, free and return free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); ++ if (g_bed) bed_destroy(g_bed); if (g_rghash) { khint_t k; for (k = 0; k < kh_end(g_rghash); ++k) @@@ -231,9 -231,9 +273,11 @@@ static int usage(int is_long_help fprintf(stderr, " -H print header only (no alignments)\n"); fprintf(stderr, " -S input is SAM\n"); fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); ++ fprintf(stderr, " -1 fast compression (force -b)\n"); fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); ++ fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); @@@ -243,6 -243,6 +287,7 @@@ fprintf(stderr, " -q INT minimum mapping quality [0]\n"); fprintf(stderr, " -l STR only output reads in library STR [null]\n"); fprintf(stderr, " -r STR only output reads in read group STR [null]\n"); ++ fprintf(stderr, " -s FLOAT fraction of templates to subsample; integer part as seed [-1]\n"); fprintf(stderr, " -? 
longer help\n"); fprintf(stderr, "\n"); if (is_long_help) @@@ -293,3 -293,3 +338,69 @@@ int main_import(int argc, char *argv[] free(argv2); return ret; } ++ ++int8_t seq_comp_table[16] = { 0, 8, 4, 12, 2, 10, 9, 14, 1, 6, 5, 13, 3, 11, 7, 15 }; ++ ++int main_bam2fq(int argc, char *argv[]) ++{ ++ bamFile fp; ++ bam_header_t *h; ++ bam1_t *b; ++ int8_t *buf; ++ int max_buf; ++ if (argc == 1) { ++ fprintf(stderr, "Usage: samtools bam2fq \n"); ++ return 1; ++ } ++ fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); ++ if (fp == 0) return 1; ++ h = bam_header_read(fp); ++ b = bam_init1(); ++ buf = 0; ++ max_buf = 0; ++ while (bam_read1(fp, b) >= 0) { ++ int i, qlen = b->core.l_qseq; ++ uint8_t *seq; ++ putchar('@'); fputs(bam1_qname(b), stdout); ++ if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1"); ++ else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2"); ++ else putchar('\n'); ++ if (max_buf < qlen + 1) { ++ max_buf = qlen + 1; ++ kroundup32(max_buf); ++ buf = realloc(buf, max_buf); ++ } ++ buf[qlen] = 0; ++ seq = bam1_seq(b); ++ for (i = 0; i < qlen; ++i) ++ buf[i] = bam1_seqi(seq, i); ++ if (b->core.flag & 16) { // reverse complement ++ for (i = 0; i < qlen>>1; ++i) { ++ int8_t t = seq_comp_table[buf[qlen - 1 - i]]; ++ buf[qlen - 1 - i] = seq_comp_table[buf[i]]; ++ buf[i] = t; ++ } ++ if (qlen&1) buf[i] = seq_comp_table[buf[i]]; ++ } ++ for (i = 0; i < qlen; ++i) ++ buf[i] = bam_nt16_rev_table[buf[i]]; ++ puts((char*)buf); ++ puts("+"); ++ seq = bam1_qual(b); ++ for (i = 0; i < qlen; ++i) ++ buf[i] = 33 + seq[i]; ++ if (b->core.flag & 16) { // reverse ++ for (i = 0; i < qlen>>1; ++i) { ++ int8_t t = buf[qlen - 1 - i]; ++ buf[qlen - 1 - i] = buf[i]; ++ buf[i] = t; ++ } ++ } ++ puts((char*)buf); ++ } ++ free(buf); ++ bam_destroy1(b); ++ bam_header_destroy(h); ++ bam_close(fp); ++ return 0; ++} diff --cc sam/sample.c index b3d2642,b3d2642..830b9d1 --- a/sam/sample.c +++ b/sam/sample.c @@@ -52,10 -52,10 +52,15 
@@@ static void add_pair(bam_sample_t *sm, int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) { const char *p = txt, *q, *r; -- kstring_t buf; ++ kstring_t buf, first_sm; int n = 0; khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id; ++ if (txt == 0) { ++ add_pair(sm, sm2id, fn, fn); ++ return 0; ++ } memset(&buf, 0, sizeof(kstring_t)); ++ memset(&first_sm, 0, sizeof(kstring_t)); while ((q = strstr(p, "@RG")) != 0) { p = q + 3; r = q = 0; @@@ -69,12 -69,12 +74,22 @@@ oq = *u; or = *v; *u = *v = '\0'; buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf); add_pair(sm, sm2id, buf.s, r); ++ if ( !first_sm.s ) ++ kputs(r,&first_sm); *u = oq; *v = or; } else break; p = q > r? q : r; ++n; } if (n == 0) add_pair(sm, sm2id, fn, fn); ++ // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but ++ // use the tag instead. ++ else if ( n==1 && first_sm.s ) ++ add_pair(sm,sm2id,fn,first_sm.s); ++ if ( first_sm.s ) ++ free(first_sm.s); ++ ++// add_pair(sm, sm2id, fn, fn); free(buf.s); return 0; } diff --cc sam/samtools.1 index 57f1aff,57f1aff..98ce9d0 --- a/sam/samtools.1 +++ b/sam/samtools.1 @@@ -1,7 -1,7 +1,9 @@@ --.TH samtools 1 "2 December 2010" "samtools-0.1.12" "Bioinformatics tools" ++.TH samtools 1 "05 July 2011" "samtools-0.1.17" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format ++ ++bcftools - Utilities for the Binary Call Format (BCF) and VCF .SH SYNOPSIS .PP samtools view -bt ref_list.txt -o aln.bam aln.sam.gz @@@ -23,6 -23,6 +25,12 @@@ samtools pileup -vcf ref.fasta aln.sort samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam .PP samtools tview aln.sorted.bam ref.fasta ++.PP ++bcftools index in.bcf ++.PP ++bcftools view in.bcf chr2:100-200 > out.vcf ++.PP ++bcftools view -vc in.bcf > out.vcf 2> out.afs .SH DESCRIPTION .PP @@@ -43,7 -43,7 +51,7 @@@ Samtools checks the current working dir will download the index upon absence. 
Samtools does not retrieve the entire alignment file unless it is asked to do so. --.SH COMMANDS AND OPTIONS ++.SH SAMTOOLS COMMANDS AND OPTIONS .TP 10 .B view @@@ -137,21 -137,21 +145,68 @@@ viewing the same reference sequence .TP .B mpileup --samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]] ++.B samtools mpileup ++.RB [ \-EBug ] ++.RB [ \-C ++.IR capQcoef ] ++.RB [ \-r ++.IR reg ] ++.RB [ \-f ++.IR in.fa ] ++.RB [ \-l ++.IR list ] ++.RB [ \-M ++.IR capMapQ ] ++.RB [ \-Q ++.IR minBaseQ ] ++.RB [ \-q ++.IR minMapQ ] ++.I in.bam ++.RI [ in2.bam ++.RI [ ... ]] Generate BCF or pileup for one or multiple BAM files. Alignment records are grouped by sample identifiers in @RG header lines. If sample identifiers are absent, each input file is regarded as one sample. --.B OPTIONS: ++In the pileup format (without ++.BR -u or -g ), ++each ++line represents a genomic position, consisting of chromosome name, ++coordinate, reference base, read bases, read qualities and alignment ++mapping qualities. Information on match, mismatch, indel, strand, ++mapping quality and start and end of a read are all encoded at the read ++base column. At this column, a dot stands for a match to the reference ++base on the forward strand, a comma for a match on the reverse strand, ++a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward ++strand and `acgtn' for a mismatch on the reverse strand. A pattern ++`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this ++reference position and the next reference position. The length of the ++insertion is given by the integer in the pattern, followed by the ++inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' ++represents a deletion from the reference. The deleted bases will be ++presented as `*' in the following lines. Also at the read base column, a ++symbol `^' marks the start of a read. 
The ASCII of the character ++following `^' minus 33 gives the mapping quality. A symbol `$' marks the ++end of a read segment. ++ ++.B Input Options: .RS --.TP 8 ++.TP 10 ++.B -6 ++Assume the quality is in the Illumina 1.3+ encoding. ++.B -A ++Do not skip anomalous read pairs in variant calling. ++.TP .B -B Disable probabilistic realignment for the computation of base alignment quality (BAQ). BAQ is the Phred-scaled probability of a read base being misaligned. Applying this option greatly helps to reduce false SNPs caused by misalignments. .TP ++.BI -b \ FILE ++List of input BAM files, one file per line [null] ++.TP .BI -C \ INT Coefficient for downgrading mapping quality for reads containing excessive mismatches. Given a read with a phred-scaled probability q of @@@ -159,17 -159,17 +214,62 @@@ being generated from the mapped positio about sqrt((INT-q)/INT)*INT. A zero value disables this functionality; if enabled, the recommended value for BWA is 50. [0] .TP --.BI -e \ INT --Phred-scaled gap extension sequencing error probability. Reducing ++.BI -d \ INT ++At a position, read maximally .I INT --leads to longer indels. [20] ++reads per input BAM. [250] ++.TP ++.B -E ++Extended BAQ computation. This option helps sensitivity especially for MNPs, but may hurt ++specificity a little bit. .TP .BI -f \ FILE --The reference file [null] ++The ++.BR faidx -indexed ++reference file in the FASTA format. The file can be optionally compressed by ++.BR razip . 
++[null] ++.TP ++.BI -l \ FILE ++BED or position list file containing a list of regions or sites where pileup or BCF should be generated [null] ++.TP ++.BI -q \ INT ++Minimum mapping quality for an alignment to be used [0] ++.TP ++.BI -Q \ INT ++Minimum base quality for a base to be considered [13] ++.TP ++.BI -r \ STR ++Only generate pileup in region ++.I STR ++[all sites] ++.TP ++.B Output Options: ++ ++.TP ++.B -D ++Output per-sample read depth .TP .B -g Compute genotype likelihoods and output them in the binary call format (BCF). .TP ++.B -S ++Output per-sample Phred-scaled strand bias P-value ++.TP ++.B -u ++Similar to ++.B -g ++except that the output is uncompressed BCF, which is preferred for piping. ++ ++.TP ++.B Options for Genotype Likelihood Computation (for -g or -u): ++ ++.TP ++.BI -e \ INT ++Phred-scaled gap extension sequencing error probability. Reducing ++.I INT ++leads to longer indels. [20] ++.TP .BI -h \ INT Coefficient for modeling homopolymer errors. Given an .IR l -long @@@ -180,8 -180,8 +280,13 @@@ is modeled a .IR INT * s / l . [100] .TP --.BI -l \ FILE --File containing a list of sites where pileup or BCF is outputted [null] ++.B -I ++Do not perform INDEL calling ++.TP ++.BI -L \ INT ++Skip INDEL calling if the average per-sample depth is above ++.IR INT . ++[250] .TP .BI -o \ INT Phred-scaled gap open sequencing error probability. Reducing @@@ -194,22 -194,22 +299,6 @@@ Comma dilimited list of platforms (dete from which indel candidates are obtained. It is recommended to collect indel candidates from sequencing technologies that have low indel error rate such as ILLUMINA. [all] --.TP --.BI -q \ INT --Minimum mapping quality for an alignment to be used [0] --.TP --.BI -Q \ INT --Minimum base quality for a base to be considered [13] --.TP --.BI -r \ STR --Only generate pileup in region --.I STR --[all sites] --.TP --.B -u --Similar to --.B -g --except that the output is uncompressed BCF, which is preferred for piping. 
.RE .TP @@@ -223,6 -223,6 +312,16 @@@ with the header i This command is much faster than replacing the header with a BAM->SAM->BAM conversion. ++.TP ++.B cat ++samtools cat [-h header.sam] [-o out.bam] [ ... ] ++ ++Concatenate BAMs. The sequence dictionary of each input BAM must be identical, ++although this command does not check this. This command uses a similar trick ++to ++.B reheader ++which enables fast BAM concatenation. ++ .TP .B sort samtools sort [-no] [-m maxMem] @@@ -249,7 -249,7 +348,7 @@@ Approximately the maximum required memo .TP .B merge --samtools merge [-nur] [-h inh.sam] [-R reg] [...] ++samtools merge [-nur1f] [-h inh.sam] [-R reg] [...] Merge multiple sorted alignments. The header reference lists of all the input BAM files, and the @SQ headers of @@@ -266,6 -266,6 +365,12 @@@ and the headers of other files will be .B OPTIONS: .RS .TP 8 ++.B -1 ++Use zlib compression level 1 to comrpess the output ++.TP ++.B -f ++Force to overwrite the output file if present. ++.TP 8 .BI -h \ FILE Use the lines of .I FILE @@@ -277,17 -277,17 +382,18 @@@ replacing any header lines that would o is actually in SAM format, though any alignment records it may contain are ignored.) .TP ++.B -n ++The input alignments are sorted by read names rather than by chromosomal ++coordinates ++.TP .BI -R \ STR Merge files in the specified region indicated by .I STR ++[null] .TP .B -r Attach an RG tag to each alignment. The tag value is inferred from file names. .TP --.B -n --The input alignments are sorted by read names rather than by chromosomal --coordinates --.TP .B -u Uncompressed BAM output .RE @@@ -355,7 -355,7 +461,7 @@@ Treat paired-end reads and single-end r .TP .B calmd --samtools calmd [-eubSr] [-C capQcoef] ++samtools calmd [-EeubSr] [-C capQcoef] Generate the MD tag. 
If the MD tag is already present, this command will give a warning if the MD tag generated is different from the existing @@@ -388,142 -388,142 +494,228 @@@ Coefficient to cap mapping quality of p command for details. [0] .TP .B -r --Compute the BQ tag without changing the base quality. ++Compute the BQ tag (without -A) or cap base quality by BAQ (with -A). ++.TP ++.B -E ++Extended BAQ calculation. This option trades specificity for sensitivity, though the ++effect is minor. .RE .TP --.B pileup --samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] [-l --in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N nHap] [-r --pairDiffRate] [-m mask] [-d maxIndelDepth] [-G indelPrior] --| ++.B targetcut ++samtools targetcut [-Q minBaseQ] [-i inPenalty] [-0 em0] [-1 em1] [-2 em2] [-f ref] --Print the alignment in the pileup format. In the pileup format, each --line represents a genomic position, consisting of chromosome name, --coordinate, reference base, read bases, read qualities and alignment --mapping qualities. Information on match, mismatch, indel, strand, --mapping quality and start and end of a read are all encoded at the read --base column. At this column, a dot stands for a match to the reference --base on the forward strand, a comma for a match on the reverse strand, --a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward --strand and `acgtn' for a mismatch on the reverse strand. A pattern --`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this --reference position and the next reference position. The length of the --insertion is given by the integer in the pattern, followed by the --inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' --represents a deletion from the reference. The deleted bases will be --presented as `*' in the following lines. Also at the read base column, a --symbol `^' marks the start of a read. The ASCII of the character --following `^' minus 33 gives the mapping quality. 
A symbol `$' marks the --end of a read segment. ++This command identifies target regions by examining the continuity of read depth, computes ++haploid consensus sequences of targets and outputs a SAM with each sequence corresponding ++to a target. When option ++.B -f ++is in use, BAQ will be applied. This command is ++.B only ++designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et al. (2010)]. ++.RE --If option --.B -c --is applied, the consensus base, Phred-scaled consensus quality, SNP --quality (i.e. the Phred-scaled probability of the consensus being --identical to the reference) and root mean square (RMS) mapping quality --of the reads covering the site will be inserted between the `reference --base' and the `read bases' columns. An indel occupies an additional --line. Each indel line consists of chromosome name, coordinate, a star, --the genotype, consensus quality, SNP quality, RMS mapping quality, # --covering reads, the first alllele, the second allele, # reads supporting --the first allele, # reads supporting the second allele and # reads --containing indels different from the top two alleles. -- --.B NOTE: --Since 0.1.10, the `pileup' command is deprecated by `mpileup'. ++.TP ++.B phase ++samtools phase [-AF] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] ++Call and phase heterozygous SNPs. .B OPTIONS: .RS --.TP 10 --.B -B --Disable the BAQ computation. See the --.B mpileup --command for details. ++.TP 8 ++.B -A ++Drop reads with ambiguous phase. ++.TP 8 ++.BI -b \ STR ++Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file ++.BR STR .0.bam ++and phase-1 reads in ++.BR STR .1.bam. ++Phase unknown reads will be randomly allocated to one of the two files. Chimeric reads ++with switch errors will be saved in ++.BR STR .chimeric.bam. ++[null] .TP --.B -c --Call the consensus sequence. Options --.BR -T ", " -N ", " -I " and " -r --are only effective when --.BR -c " or " -g --is in use. 
++.B -F ++Do not attempt to fix chimeric reads. .TP --.BI -C \ INT --Coefficient for downgrading the mapping quality of poorly mapped --reads. See the --.B mpileup --command for details. [0] ++.BI -k \ INT ++Maximum length for local phasing. [13] .TP --.BI -d \ INT --Use the first --.I NUM --reads in the pileup for indel calling for speed up. Zero for unlimited. [1024] ++.BI -q \ INT ++Minimum Phred-scaled LOD to call a heterozygote. [40] .TP --.BI -f \ FILE --The reference sequence in the FASTA format. Index file --.I FILE.fai --will be created if --absent. ++.BI -Q \ INT ++Minimum base quality to be used in het calling. [13] ++.RE ++ ++.SH BCFTOOLS COMMANDS AND OPTIONS ++ ++.TP 10 ++.B view ++.B bcftools view ++.RB [ \-AbFGNQSucgv ] ++.RB [ \-D ++.IR seqDict ] ++.RB [ \-l ++.IR listLoci ] ++.RB [ \-s ++.IR listSample ] ++.RB [ \-i ++.IR gapSNPratio ] ++.RB [ \-t ++.IR mutRate ] ++.RB [ \-p ++.IR varThres ] ++.RB [ \-P ++.IR prior ] ++.RB [ \-1 ++.IR nGroup1 ] ++.RB [ \-d ++.IR minFrac ] ++.RB [ \-U ++.IR nPerm ] ++.RB [ \-X ++.IR permThres ] ++.RB [ \-T ++.IR trioType ] ++.I in.bcf ++.RI [ region ] ++ ++Convert between BCF and VCF, call variant candidates and estimate allele ++frequencies. ++ ++.RS .TP --.B -g --Generate genotype likelihood in the binary GLFv3 format. This option --suppresses -c, -i and -s. This option is deprecated by the --.B mpileup --command. ++.B Input/Output Options: ++.TP 10 ++.B -A ++Retain all possible alternate alleles at variant sites. By default, the view ++command discards unlikely alleles. ++.TP 10 ++.B -b ++Output in the BCF format. The default is VCF. .TP --.B -i --Only output pileup lines containing indels. ++.BI -D \ FILE ++Sequence dictionary (list of chromosome names) for VCF->BCF conversion [null] .TP --.BI -I \ INT --Phred probability of an indel in sequencing/prep. [40] ++.B -F ++Indicate PL is generated by r921 or before (ordering is different). ++.TP ++.B -G ++Suppress all individual genotype information. 
.TP .BI -l \ FILE --List of sites at which pileup is output. This file is space --delimited. The first two columns are required to be chromosome and --1-based coordinate. Additional columns are ignored. It is --recommended to use option ++List of sites at which information are outputted [all sites] .TP --.BI -m \ INT --Filter reads with flag containing bits in --.I INT --[1796] ++.B -N ++Skip sites where the REF field is not A/C/G/T .TP --.BI -M \ INT --Cap mapping quality at INT [60] ++.B -Q ++Output the QCALL likelihood format .TP --.BI -N \ INT --Number of haplotypes in the sample (>=2) [2] ++.BI -s \ FILE ++List of samples to use. The first column in the input gives the sample names ++and the second gives the ploidy, which can only be 1 or 2. When the 2nd column ++is absent, the sample ploidy is assumed to be 2. In the output, the ordering of ++samples will be identical to the one in ++.IR FILE . ++[null] .TP --.BI -r \ FLOAT --Expected fraction of differences between a pair of haplotypes [0.001] ++.B -S ++The input is VCF instead of BCF. .TP --.B -s --Print the mapping quality as the last column. This option makes the --output easier to parse, although this format is not space efficient. ++.B -u ++Uncompressed BCF output (force -b). .TP --.B -S --The input file is in SAM. ++.B Consensus/Variant Calling Options: ++.TP 10 ++.B -c ++Call variants using Bayesian inference. This option automatically invokes option ++.BR -e . .TP --.BI -t \ FILE --List of reference names ane sequence lengths, in the format described --for the --.B import --command. If this option is present, samtools assumes the input --.I --is in SAM format; otherwise it assumes in BAM format. ++.BI -d \ FLOAT ++When ++.B -v ++is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0] ++.TP ++.B -e ++Perform max-likelihood inference only, including estimating the site allele frequency, ++testing Hardy-Weinberg equlibrium and testing associations with LRT. 
++.TP ++.B -g ++Call per-sample genotypes at variant sites (force -c) ++.TP ++.BI -i \ FLOAT ++Ratio of INDEL-to-SNP mutation rate [0.15] ++.TP ++.BI -p \ FLOAT ++A site is considered to be a variant if P(ref|D)| [region1 [...]] -- -- Extract/print all or sub alignments in SAM or BAM format. If -- no region is specified, all the alignments will be printed; -- otherwise only alignments overlapping the specified regions -- will be output. An alignment may be given multiple times if -- it is overlapping several regions. A region can be presented, -- for example, in the following format: `chr2' (the whole -- chr2), `chr2:1000000' (region starting from 1,000,000bp) or -- `chr2:1,000,000-2,000,000' (region between 1,000,000 and -- 2,000,000bp including the end points). The coordinate is -- 1-based. -- -- OPTIONS: -- -- -b Output in the BAM format. -- -- -f INT Only output alignments with all bits in INT present -- in the FLAG field. INT can be in hex in the format of -- /^0x[0-9A-F]+/ [0] -- -- -F INT Skip alignments with bits present in INT [0] -- -- -h Include the header in the output. -- -- -H Output the header only. -- -- -l STR Only output reads in library STR [null] -- -- -o FILE Output file [stdout] -- -- -q INT Skip alignments with MAPQ smaller than INT [0] -- -- -r STR Only output reads in read group STR [null] -- -- -R FILE Output reads in read groups listed in FILE [null] -- -- -S Input is in SAM. If @SQ header lines are absent, the -- `-t' option is required. -- -- -c Instead of printing the alignments, only count them -- and print the total number. All filter options, such -- as `-f', `-F' and `-q' , are taken into account. -- -- -t FILE This file is TAB-delimited. Each line must contain -- the reference name and the length of the reference, -- one line for each distinct reference; additional -- fields are ignored. This file also defines the order -- of the reference sequences in sorting. 
If you run -- `samtools faidx ', the resultant index file -- .fai can be used as this file. -- -- -u Output uncompressed BAM. This option saves time spent -- on compression/decomprssion and is thus preferred -- when the output is piped to another samtools command. -- -- -- tview samtools tview [ref.fasta] -- -- Text alignment viewer (based on the ncurses library). In the -- viewer, press `?' for help and press `g' to check the align- -- ment start from a region in the format like -- `chr10:10,000,000' or `=10,000,000' when viewing the same -- reference sequence. -- -- -- mpileup samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l -- list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam -- [...]] -- -- Generate BCF or pileup for one or multiple BAM files. Align- -- ment records are grouped by sample identifiers in @RG header -- lines. If sample identifiers are absent, each input file is -- regarded as one sample. -- -- OPTIONS: -- -- -B Disable probabilistic realignment for the computation -- of base alignment quality (BAQ). BAQ is the Phred- -- scaled probability of a read base being misaligned. -- Applying this option greatly helps to reduce false -- SNPs caused by misalignments. -- -- -C INT Coefficient for downgrading mapping quality for reads -- containing excessive mismatches. Given a read with a -- phred-scaled probability q of being generated from -- the mapped position, the new mapping quality is about -- sqrt((INT-q)/INT)*INT. A zero value disables this -- functionality; if enabled, the recommended value for -- BWA is 50. [0] -- -- -e INT Phred-scaled gap extension sequencing error probabil- -- ity. Reducing INT leads to longer indels. [20] -- -- -f FILE The reference file [null] -- -- -g Compute genotype likelihoods and output them in the -- binary call format (BCF). -- -- -h INT Coefficient for modeling homopolymer errors. Given an -- l-long homopolymer run, the sequencing error of an -- indel of size s is modeled as INT*s/l. 
[100] -- -- -l FILE File containing a list of sites where pileup or BCF -- is outputted [null] -- -- -o INT Phred-scaled gap open sequencing error probability. -- Reducing INT leads to more indel calls. [40] -- -- -P STR Comma dilimited list of platforms (determined by @RG- -- PL) from which indel candidates are obtained. It is -- recommended to collect indel candidates from sequenc- -- ing technologies that have low indel error rate such -- as ILLUMINA. [all] -- -- -q INT Minimum mapping quality for an alignment to be used -- [0] -- -- -Q INT Minimum base quality for a base to be considered [13] -- -- -r STR Only generate pileup in region STR [all sites] -- -- -u Similar to -g except that the output is uncompressed -- BCF, which is preferred for piping. -- -- -- reheader samtools reheader -- -- Replace the header in in.bam with the header in -- in.header.sam. This command is much faster than replacing -- the header with a BAM->SAM->BAM conversion. -- -- -- sort samtools sort [-no] [-m maxMem] -- -- Sort alignments by leftmost coordinates. File .bam will be created. This command may also create tempo- -- rary files .%d.bam when the whole alignment can- -- not be fitted into memory (controlled by option -m). -- -- OPTIONS: -- -- -o Output the final alignment to the standard output. -- -- -n Sort by read names rather than by chromosomal coordi- -- nates -- -- -m INT Approximately the maximum required memory. -- [500000000] -- -- -- merge samtools merge [-nur] [-h inh.sam] [-R reg] -- [...] -- -- Merge multiple sorted alignments. The header reference lists -- of all the input BAM files, and the @SQ headers of inh.sam, -- if any, must all refer to the same set of reference -- sequences. The header reference list and (unless overridden -- by -h) `@' headers of in1.bam will be copied to out.bam, and -- the headers of other files will be ignored. 
-- -- OPTIONS: -- -- -h FILE Use the lines of FILE as `@' headers to be copied to -- out.bam, replacing any header lines that would other- -- wise be copied from in1.bam. (FILE is actually in -- SAM format, though any alignment records it may con- -- tain are ignored.) -- -- -R STR Merge files in the specified region indicated by STR -- -- -r Attach an RG tag to each alignment. The tag value is -- inferred from file names. -- -- -n The input alignments are sorted by read names rather -- than by chromosomal coordinates -- -- -u Uncompressed BAM output -- -- -- index samtools index -- -- Index sorted alignment for fast random access. Index file -- .bai will be created. -- -- -- idxstats samtools idxstats -- -- Retrieve and print stats in the index file. The output is TAB -- delimited with each line consisting of reference sequence -- name, sequence length, # mapped reads and # unmapped reads. -- -- -- faidx samtools faidx [region1 [...]] -- -- Index reference sequence in the FASTA format or extract sub- -- sequence from indexed reference sequence. If no region is -- specified, faidx will index the file and create -- .fai on the disk. If regions are speficified, the -- subsequences will be retrieved and printed to stdout in the -- FASTA format. The input file can be compressed in the RAZF -- format. -- -- -- fixmate samtools fixmate -- -- Fill in mate coordinates, ISIZE and mate related flags from a -- name-sorted alignment. -- -- -- rmdup samtools rmdup [-sS] -- -- Remove potential PCR duplicates: if multiple read pairs have -- identical external coordinates, only retain the pair with -- highest mapping quality. In the paired-end mode, this com- -- mand ONLY works with FR orientation and requires ISIZE is -- correctly set. It does not work for unpaired reads (e.g. two -- ends mapped to different chromosomes or orphan reads). -- -- OPTIONS: -- -- -s Remove duplicate for single-end reads. By default, -- the command works for paired-end reads only. 
-- -- -S Treat paired-end reads and single-end reads. -- -- -- calmd samtools calmd [-eubSr] [-C capQcoef] -- -- Generate the MD tag. If the MD tag is already present, this -- command will give a warning if the MD tag generated is dif- -- ferent from the existing tag. Output SAM by default. -- -- OPTIONS: -- -- -A When used jointly with -r this option overwrites the -- original base quality. -- -- -e Convert a the read base to = if it is identical to -- the aligned reference base. Indel caller does not -- support the = bases at the moment. -- -- -u Output uncompressed BAM -- -- -b Output compressed BAM -- -- -S The input is SAM with header lines -- -- -C INT Coefficient to cap mapping quality of poorly mapped -- reads. See the pileup command for details. [0] -- -- -r Compute the BQ tag without changing the base quality. -- -- -- pileup samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] -- [-l in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N -- nHap] [-r pairDiffRate] [-m mask] [-d maxIndelDepth] [-G -- indelPrior] | -- -- Print the alignment in the pileup format. In the pileup for- -- mat, each line represents a genomic position, consisting of -- chromosome name, coordinate, reference base, read bases, read -- qualities and alignment mapping qualities. Information on -- match, mismatch, indel, strand, mapping quality and start and -- end of a read are all encoded at the read base column. At -- this column, a dot stands for a match to the reference base -- on the forward strand, a comma for a match on the reverse -- strand, a '>' or '<' for a reference skip, `ACGTN' for a mis- -- match on the forward strand and `acgtn' for a mismatch on the -- reverse strand. A pattern `\+[0-9]+[ACGTNacgtn]+' indicates -- there is an insertion between this reference position and the -- next reference position. The length of the insertion is given -- by the integer in the pattern, followed by the inserted -- sequence. 
Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' repre- -- sents a deletion from the reference. The deleted bases will -- be presented as `*' in the following lines. Also at the read -- base column, a symbol `^' marks the start of a read. The -- ASCII of the character following `^' minus 33 gives the map- -- ping quality. A symbol `$' marks the end of a read segment. -- -- If option -c is applied, the consensus base, Phred-scaled -- consensus quality, SNP quality (i.e. the Phred-scaled proba- -- bility of the consensus being identical to the reference) and -- root mean square (RMS) mapping quality of the reads covering -- the site will be inserted between the `reference base' and -- the `read bases' columns. An indel occupies an additional -- line. Each indel line consists of chromosome name, coordi- -- nate, a star, the genotype, consensus quality, SNP quality, -- RMS mapping quality, # covering reads, the first alllele, the -- second allele, # reads supporting the first allele, # reads -- supporting the second allele and # reads containing indels -- different from the top two alleles. -- -- NOTE: Since 0.1.10, the `pileup' command is deprecated by -- `mpileup'. -- -- OPTIONS: -- -- -B Disable the BAQ computation. See the mpileup com- -- mand for details. -- -- -c Call the consensus sequence. Options -T, -N, -I and -- -r are only effective when -c or -g is in use. -- -- -C INT Coefficient for downgrading the mapping quality of -- poorly mapped reads. See the mpileup command for -- details. [0] -- -- -d INT Use the first NUM reads in the pileup for indel -- calling for speed up. Zero for unlimited. [1024] -- -- -f FILE The reference sequence in the FASTA format. Index -- file FILE.fai will be created if absent. -- -- -g Generate genotype likelihood in the binary GLFv3 -- format. This option suppresses -c, -i and -s. This -- option is deprecated by the mpileup command. -- -- -i Only output pileup lines containing indels. 
-- -- -I INT Phred probability of an indel in sequencing/prep. -- [40] -- -- -l FILE List of sites at which pileup is output. This file -- is space delimited. The first two columns are -- required to be chromosome and 1-based coordinate. -- Additional columns are ignored. It is recommended -- to use option -- -- -m INT Filter reads with flag containing bits in INT -- [1796] -- -- -M INT Cap mapping quality at INT [60] -- -- -N INT Number of haplotypes in the sample (>=2) [2] -- -- -r FLOAT Expected fraction of differences between a pair of -- haplotypes [0.001] -- -- -s Print the mapping quality as the last column. This -- option makes the output easier to parse, although -- this format is not space efficient. -- -- -S The input file is in SAM. -- -- -t FILE List of reference names ane sequence lengths, in -- the format described for the import command. If -- this option is present, samtools assumes the input -- is in SAM format; otherwise it -- assumes in BAM format. -s together with -l as in -- the default format we may not know the mapping -- quality. -- -- -T FLOAT The theta parameter (error dependency coefficient) -- in the maq consensus calling model [0.85] -- -- --SAM FORMAT -- SAM is TAB-delimited. 
Apart from the header lines, which are started -- with the `@' symbol, each alignment line consists of: -- -- -- +----+-------+----------------------------------------------------------+ -- |Col | Field | Description | -- +----+-------+----------------------------------------------------------+ -- | 1 | QNAME | Query (pair) NAME | -- | 2 | FLAG | bitwise FLAG | -- | 3 | RNAME | Reference sequence NAME | -- | 4 | POS | 1-based leftmost POSition/coordinate of clipped sequence | -- | 5 | MAPQ | MAPping Quality (Phred-scaled) | -- | 6 | CIAGR | extended CIGAR string | -- | 7 | MRNM | Mate Reference sequence NaMe (`=' if same as RNAME) | -- | 8 | MPOS | 1-based Mate POSistion | -- | 9 | ISIZE | Inferred insert SIZE | -- |10 | SEQ | query SEQuence on the same strand as the reference | -- |11 | QUAL | query QUALity (ASCII-33 gives the Phred base quality) | -- |12 | OPT | variable OPTional fields in the format TAG:VTYPE:VALUE | -- +----+-------+----------------------------------------------------------+ -- -- Each bit in the FLAG field is defined as: -- -- -- +-------+-----+--------------------------------------------------+ -- | Flag | Chr | Description | -- +-------+-----+--------------------------------------------------+ -- |0x0001 | p | the read is paired in sequencing | -- |0x0002 | P | the read is mapped in a proper pair | -- |0x0004 | u | the query sequence itself is unmapped | -- |0x0008 | U | the mate is unmapped | -- |0x0010 | r | strand of the query (1 for reverse) | -- |0x0020 | R | strand of the mate | -- |0x0040 | 1 | the read is the first read in a pair | -- |0x0080 | 2 | the read is the second read in a pair | -- |0x0100 | s | the alignment is not primary | -- |0x0200 | f | the read fails platform/vendor quality checks | -- |0x0400 | d | the read is either a PCR or an optical duplicate | -- +-------+-----+--------------------------------------------------+ -- --EXAMPLES -- o Import SAM to BAM when @SQ lines are present in the header: -- -- samtools view 
-bS aln.sam > aln.bam -- -- If @SQ lines are absent: -- -- samtools faidx ref.fa -- samtools view -bt ref.fa.fai aln.sam > aln.bam -- -- where ref.fa.fai is generated automatically by the faidx command. -- -- -- o Attach the RG tag while merging sorted alignments: -- -- perl -e 'print "@RG\tID:ga\tSM:hs\tLB:ga\tPL:Illu- -- mina\n@RG\tID:454\tSM:hs\tLB:454\tPL:454\n"' > rg.txt -- samtools merge -rh rg.txt merged.bam ga.bam 454.bam -- -- The value in a RG tag is determined by the file name the read is com- -- ing from. In this example, in the merged.bam, reads from ga.bam will -- be attached RG:Z:ga, while reads from 454.bam will be attached -- RG:Z:454. -- -- -- o Call SNPs and short indels for one diploid individual: -- -- samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > -- var.raw.bcf -- bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > -- var.flt.vcf -- -- The -D option of varFilter controls the maximum read depth, which -- should be adjusted to about twice the average read depth. One may -- consider to add -C50 to mpileup if mapping quality is overestimated -- for reads containing excessive mismatches. Applying this option usu- -- ally helps BWA-short but may not other mappers. -- -- -- o Call SNPs and short indels for multiple diploid individuals: -- -- samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view -- -bcvg - > var.raw.bcf -- bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > -- var.flt.vcf -- -- Individuals are identified from the SM tags in the @RG header lines. -- Individuals can be pooled in one alignment file; one individual can -- also be separated into multiple files. The -P option specifies that -- indel candidates should be collected only from read groups with the -- @RG-PL tag set to ILLUMINA. Collecting indel candidates from reads -- sequenced by an indel-prone technology may affect the performance of -- indel calling. 
-- -- -- o Derive the allele frequency spectrum (AFS) on a list of sites from -- multiple individuals: -- -- samtools mpileup -Igf ref.fa *.bam > all.bcf -- bcftools view -bl sites.list all.bcf > sites.bcf -- bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs -- bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs -- bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs -- ...... -- -- where sites.list contains the list of sites with each line consisting -- of the reference sequence name and position. The following bcftools -- commands estimate AFS by EM. -- -- -- o Dump BAQ applied alignment for other SNP callers: -- -- samtools calmd -bAr aln.bam > aln.baq.bam -- -- It adds and corrects the NM and MD tags at the same time. The calmd -- command also comes with the -C option, the same as the one in pileup -- and mpileup. Apply if it helps. -- -- --LIMITATIONS -- o Unaligned words used in bam_import.c, bam_endian.h, bam.c and -- bam_aux.c. -- -- o In merging, the input files are required to have the same number of -- reference sequences. The requirement can be relaxed. In addition, -- merging does not reconstruct the header dictionaries automatically. -- Endusers have to provide the correct header. Picard is better at -- merging. -- -- o Samtools paired-end rmdup does not work for unpaired reads (e.g. -- orphan reads or ends mapped to different chromosomes). If this is a -- concern, please use Picard's MarkDuplicate which correctly handles -- these cases, although a little slower. -- -- --AUTHOR -- Heng Li from the Sanger Institute wrote the C version of samtools. Bob -- Handsaker from the Broad Institute implemented the BGZF library and Jue -- Ruan from Beijing Genomics Institute wrote the RAZF library. John Mar- -- shall and Petr Danecek contribute to the source code and various people -- from the 1000 Genomes Project have contributed to the SAM format speci- -- fication. 
-- -- --SEE ALSO -- Samtools website: -- -- -- --samtools-0.1.12 2 December 2010 samtools(1) diff --cc synthesisRef.cpp index 0c6695e,3bb2808..8ce268c --- a/synthesisRef.cpp +++ b/synthesisRef.cpp @@@ -17,7 -17,7 +17,7 @@@ int M map name2seq; map::iterator iter; --Transcripts transcripts; ++Transcripts transcripts(1); // no genome, just transcript set char groupF[STRLEN], tiF[STRLEN], refFastaF[STRLEN], chromListF[STRLEN]; bool hasMappingFile; diff --cc utils.h index 278e95e,278e95e..0991fb9 --- a/utils.h +++ b/utils.h @@@ -156,4 -156,4 +156,9 @@@ void genReadFileNames(const char* readF } } ++void exitWithError(const char* errmsg) { ++ fprintf(stderr, "%s\n", errmsg); ++ exit(-1); ++} ++ #endif diff --cc wiggle.cpp index 0000000,90b6f8b..19f52b4 mode 000000,100644..100644 --- a/wiggle.cpp +++ b/wiggle.cpp @@@ -1,0 -1,108 +1,112 @@@ + #include + #include + #include + -#include "wiggle.h" - ++#include + #include "sam/bam.h" + #include "sam/sam.h" + ++#include "wiggle.h" ++ + void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) { - float w = bam_aux2f(bam_aux_get(b, "ZW")); ++ uint8_t *p_tag = bam_aux_get(b, "ZW"); ++ float w = (p_tag != NULL ? 
bam_aux2f(p_tag) : 1.0); + int pos = b->core.pos; + uint32_t *p = bam1_cigar(b); + + for (int i = 0; i < (int)b->core.n_cigar; i++, ++p) { + int op = *p & BAM_CIGAR_MASK; + int op_len = *p >> BAM_CIGAR_SHIFT; + + switch (op) { + //case BAM_CSOFT_CLIP : pos += op_len; break; + case BAM_CINS : pos += op_len; break; + case BAM_CMATCH : + for (int j = 0; j < op_len; j++, ++pos) { + wiggle.read_depth[pos] += w; + } + break; + case BAM_CREF_SKIP : pos += op_len; break; + default : assert(false); + } + } + } + + void build_wiggles(const std::string& bam_filename, + WiggleProcessor& processor) { + samfile_t *bam_in = samopen(bam_filename.c_str(), "rb", NULL); + if (bam_in == 0) { fprintf(stderr, "Cannot open %s!\n", bam_filename.c_str()); exit(-1); } + //assert(bam_in != 0); + + int cur_tid = -1; //current tid; + int cnt = 0; + bam1_t *b = bam_init1(); + Wiggle wiggle; + while (samread(bam_in, b) >= 0) { ++ if (b->core.flag & 0x0004) continue; ++ + if (b->core.tid != cur_tid) { + if (cur_tid >= 0) processor.process(wiggle); + cur_tid = b->core.tid; + wiggle.name = bam_in->header->target_name[cur_tid]; + wiggle.read_depth.assign(bam_in->header->target_len[cur_tid], 0.0); + } + add_bam_record_to_wiggle(b, wiggle); + ++cnt; + if (cnt % 1000000 == 0) fprintf(stderr, "%d FIN\n", cnt); + } + if (cur_tid >= 0) processor.process(wiggle); + + samclose(bam_in); + bam_destroy1(b); + } + + UCSCWiggleTrackWriter::UCSCWiggleTrackWriter(const std::string& output_filename, + const std::string& track_name) { + fo = fopen(output_filename.c_str(), "w"); + fprintf(fo, "track type=wiggle_0 name=\"%s\" description=\"%s\" visibility=full\n", + track_name.c_str(), + track_name.c_str()); + } + + UCSCWiggleTrackWriter::~UCSCWiggleTrackWriter() { + fclose(fo); + } + + void UCSCWiggleTrackWriter::process(const Wiggle& wiggle) { + int sp, ep; + + sp = ep = -1; + for (size_t i = 0; i < wiggle.read_depth.size(); i++) { + if (wiggle.read_depth[i] > 0) { + ep = i; + } + else { + if (sp < ep) { + ++sp; + 
fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", wiggle.name.c_str(), sp + 1); + for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wiggle.read_depth[j]); + } + sp = i; + } + } + if (sp < ep) { + ++sp; + fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", wiggle.name.c_str(), sp + 1); + for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wiggle.read_depth[j]); + } + } + + ReadDepthWriter::ReadDepthWriter(std::ostream& stream) + : stream_(stream) { + } + + void ReadDepthWriter::process(const Wiggle& wiggle) { + stream_ << wiggle.name << '\t' + << wiggle.read_depth.size() << '\t'; + for (size_t i = 0; i < wiggle.read_depth.size(); ++i) { + if (i > 0) stream_ << ' '; + stream_ << wiggle.read_depth[i]; + } + stream_ << '\n'; + }