From: Heng Li <lh3@live.co.uk>
Date: Fri, 17 Sep 2010 14:11:37 +0000 (+0000)
Subject:  * added comments
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=f418206d4563377a981d66804873599e4e050842;p=samtools.git

 * added comments
 * VCF->BCF is not possible without knowing the sequence dictionary before hand...
---

diff --git a/bcftools/Makefile b/bcftools/Makefile
index 9592025..2dbcf59 100644
--- a/bcftools/Makefile
+++ b/bcftools/Makefile
@@ -34,8 +34,12 @@ bcftools:lib $(AOBJS)
 		$(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -lbcf
 
 bcf.o:bcf.h
+vcf.o:bcf.h
+index.o:bcf.h
+bcfutils.o:bcf.h
 prob1.o:prob1.h bcf.h
-vcfout.o:bcf.h prob1.h
+call1.o:prob1.h bcf.h
+bcf2qcall.o:bcf.h
 main.o:bcf.h
 
 bcf.pdf:bcf.tex
diff --git a/bcftools/bcf.c b/bcftools/bcf.c
index 2fd1696..feeda57 100644
--- a/bcftools/bcf.c
+++ b/bcftools/bcf.c
@@ -4,12 +4,6 @@
 #include "kstring.h"
 #include "bcf.h"
 
-void bcf_hdr_clear(bcf_hdr_t *b)
-{
-	free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns);
-	memset(b, 0, sizeof(bcf_hdr_t));
-}
-
 bcf_t *bcf_open(const char *fn, const char *mode)
 {
 	bcf_t *b;
@@ -110,7 +104,7 @@ int bcf_sync(bcf1_t *b)
 	for (p = b->str, n = 0; p < b->str + b->l_str; ++p)
 		if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1;
 	if (n != 5) {
-		fprintf(stderr, "[bcf_sync] incorrect number of fields (%d != 5)\n", n);
+		fprintf(stderr, "[%s] incorrect number of fields (%d != 5). Corrupted file?\n", __func__, n);
 		return -1;
 	}
 	b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4];
diff --git a/bcftools/bcf.h b/bcftools/bcf.h
index 92b2c03..7286111 100644
--- a/bcftools/bcf.h
+++ b/bcftools/bcf.h
@@ -1,3 +1,30 @@
+/* The MIT License
+
+   Copyright (c) 2010 Broad Institute
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@live.co.uk> */
+
 #ifndef BCF_H
 #define BCF_H
 
@@ -5,38 +32,49 @@
 #include <zlib.h>
 #include "bgzf.h"
 
+/*
+  A member in the structs below is said to "primary" if its content
+  cannot be inferred from other members in any of structs below; a
+  member is said to be "derived" if its content can be derived from
+  other members. For example, bcf1_t::str is primary as this comes from
+  the input data, while bcf1_t::info is derived as it can always be
+  correctly set if we know bcf1_t::str. Derived members are for quick
+  access to the content and must be synchronized with the primary data.
+ */
+
 typedef struct {
-	uint32_t fmt;
-	int len; // len is the unit length
-	void *data;
-	// derived info: fmt, len
+	uint32_t fmt; // format of the block, set by bcf_str2int(). 
+	int len; // length of data for each individual
+	void *data; // concatenated data
+	// derived info: fmt, len (<-bcf1_t::fmt)
 } bcf_ginfo_t;
 
 typedef struct {
-	int32_t tid, pos;
-	int32_t l_str, m_str;
-	float qual;
-	char *str, *ref, *alt, *flt, *info, *fmt; // fmt, ref, alt and info point to str
-	int n_gi, m_gi;
-	bcf_ginfo_t *gi;
-	int n_alleles, n_smpl;
-	// derived info: ref, alt, flt, info, fmt, n_gi, n_alleles
+	int32_t tid, pos; // refID and 0-based position
+	int32_t l_str, m_str; // length and the allocated size of ->str
+	float qual; // SNP quality
+	char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7)
+	char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation
+	int n_gi, m_gi; // number and the allocated size of geno fields
+	bcf_ginfo_t *gi; // array of geno fields
+	int n_alleles, n_smpl; // number of alleles and samples
+	// derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl)
 } bcf1_t;
 
 typedef struct {
-	int32_t n_ref, n_smpl;
-	int32_t l_nm;
-	int32_t l_smpl;
-	int32_t l_txt;
-	char *name, *sname, *txt;
-	char **ns, **sns;
-	// derived info: n_ref, n_smpl, ns, sns
+	int32_t n_ref, n_smpl; // number of reference sequences and samples
+	int32_t l_nm; // length of concatenated sequence names; 0 padded
+	int32_t l_smpl; // length of concatenated sample names; 0 padded
+	int32_t l_txt; // length of header text (lines started with ##)
+	char *name, *sname, *txt; // concatenated sequence names, sample names and header text
+	char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively
+	// derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname)
 } bcf_hdr_t;
 
 typedef struct {
-	int is_vcf;
-	void *v;
-	BGZF *fp;
+	int is_vcf; // if the file in operation is a VCF
+	void *v; // auxillary data structure for VCF
+	BGZF *fp; // file handler for BCF
 } bcf_t;
 
 struct __bcf_idx_t;
@@ -46,34 +84,55 @@ typedef struct __bcf_idx_t bcf_idx_t;
 extern "C" {
 #endif
 
+	// open a BCF file; for BCF file only
 	bcf_t *bcf_open(const char *fn, const char *mode);
+	// close file
 	int bcf_close(bcf_t *b);
+	// read one record from BCF; return -1 on end-of-file, and <-1 for errors
 	int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b);
+	// call this function if b->str is changed
 	int bcf_sync(bcf1_t *b);
+	// write a BCF record
 	int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b);
+	// read the BCF header; BCF only
 	bcf_hdr_t *bcf_hdr_read(bcf_t *b);
+	// write the BCF header
 	int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h);
+	// set bcf_hdr_t::ns and bcf_hdr_t::sns
 	int bcf_hdr_sync(bcf_hdr_t *b);
+	// destroy the header
 	void bcf_hdr_destroy(bcf_hdr_t *h);
+	// destroy a record
 	int bcf_destroy(bcf1_t *b);
+	// BCF->VCF conversion
 	char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
 
+	// open a VCF or BCF file if "b" is set in "mode"
 	bcf_t *vcf_open(const char *fn, const char *mode);
+	// close a VCF/BCF file
 	int vcf_close(bcf_t *bp);
+	// read the VCF/BCF header
 	bcf_hdr_t *vcf_hdr_read(bcf_t *bp);
+	// read a VCF/BCF record; return -1 on end-of-file and <-1 for errors
+	int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
+	// write the VCF header
 	int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h);
+	// write a VCF record
 	int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
-	int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
 
+	// keep the first n alleles and discard the rest
 	int bcf_shrink_alt(bcf1_t *b, int n);
+	// convert GL to PL
 	int bcf_gl2pl(bcf1_t *b);
 
+	// string hash table
 	void *bcf_build_refhash(bcf_hdr_t *h);
 	void bcf_str2id_destroy(void *_hash);
 	int bcf_str2id_add(void *_hash, const char *str);
 	int bcf_str2id(void *_hash, const char *str);
 	void *bcf_str2id_init();
 
+	// indexing related functions
 	int bcf_idx_build(const char *fn);
 	uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg);
 	int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end);
diff --git a/bcftools/vcf.c b/bcftools/vcf.c
index 51ccb74..ebca869 100644
--- a/bcftools/vcf.c
+++ b/bcftools/vcf.c
@@ -102,13 +102,6 @@ int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
 		if (strstr(h->txt, "##SQ=")) has_ref = 1;
 	}
 	if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n");
-	if (!has_ref) {
-		fprintf(v->fpout, "##SQ=");
-		for (i = 0; i < h->n_ref; ++i) {
-			fprintf(v->fpout, "%s", h->ns[i]);
-			fputc(i == h->n_ref - 1? '\n' : ',', v->fpout);
-		}
-	}
 	fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
 	for (i = 0; i < h->n_smpl; ++i)
 		fprintf(v->fpout, "\t%s", h->sns[i]);