]> git.donarmstrong.com Git - samtools.git/commitdiff
* Merge from branches/dev/
authorHeng Li <lh3@live.co.uk>
Thu, 22 Jan 2009 15:14:27 +0000 (15:14 +0000)
committerHeng Li <lh3@live.co.uk>
Thu, 22 Jan 2009 15:14:27 +0000 (15:14 +0000)
 * all future development will happen here at trunk/

30 files changed:
COPYING
ChangeLog
INSTALL [new file with mode: 0644]
Makefile
Makefile.lite [new file with mode: 0644]
bam.c
bam.h
bam_import.c
bam_index.c
bam_lpileup.c
bam_maqcns.c
bam_maqcns.h
bam_mate.c [new file with mode: 0644]
bam_pileup.c
bam_plcmd.c
bam_rmdup.c [new file with mode: 0644]
bam_sort.c
bamtk.c
bgzf.h
examples/00README.txt
examples/Makefile [new file with mode: 0644]
examples/ex1.fa.fai [deleted file]
examples/ex1.sam.gz
faidx.c
glf.c [new file with mode: 0644]
glf.h
misc/Makefile
misc/maq2sam.c
razf.c
source.dot

diff --git a/COPYING b/COPYING
index 2f596e5cba971f5e39e9fb237c3daa668f320efb..82fa2f4e50197fcaae7416092134181e3980b5c2 100644 (file)
--- a/COPYING
+++ b/COPYING
@@ -1,6 +1,6 @@
 The MIT License
 
-Copyright (c) 2008 Genome Research Ltd.
+Copyright (c) 2008-2009 Genome Research Ltd.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
index 4c52aadcbc4c9f7382adb57f4e0594ea4764980b..45359784b8682812c04c79d8b7d56e17a01a3922 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,366 @@
+------------------------------------------------------------------------
+r104 | lh3lh3 | 2009-01-18 17:31:21 +0000 (Sun, 18 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_lpileup.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-18
+ * fixed a bug in bam_lpileup.c: segment start and end are not correctly recognized
+
+------------------------------------------------------------------------
+r103 | lh3lh3 | 2009-01-18 16:34:03 +0000 (Sun, 18 Jan 2009) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-17
+ * fixed a bug when there are reads without coordinates
+ * also recognize type 'c' as 'A'      
+ * found a bug in bam_lpileup.c; NOT fixed yet
+
+------------------------------------------------------------------------
+r102 | lh3lh3 | 2009-01-17 19:46:49 +0000 (Sat, 17 Jan 2009) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/INSTALL
+
+Instruction for compilation
+
+------------------------------------------------------------------------
+r101 | lh3lh3 | 2009-01-17 19:31:36 +0000 (Sat, 17 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   A /branches/dev/samtools/Makefile.lite
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/faidx.c
+   M /branches/dev/samtools/misc/Makefile
+   M /branches/dev/samtools/razf.c
+
+ * replaced HAVE_RAZF with _NO_RAZF
+ * added Makefile.lite for people who have trouble with razf.c
+
+------------------------------------------------------------------------
+r100 | lh3lh3 | 2009-01-16 10:03:37 +0000 (Fri, 16 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_mate.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/misc/wgsim.c
+
+ * samtools-0.1.1-15
+ * fixed another bug in fixmate: unmapped pair has non-zero isize
+
+------------------------------------------------------------------------
+r99 | lh3lh3 | 2009-01-16 09:13:36 +0000 (Fri, 16 Jan 2009) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/ChangeLog
+   M /branches/dev/samtools/bam_mate.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-14
+ * fixed a bug in fixmate: isize not equal to zero if two ends mapped to
+   different chr
+
+------------------------------------------------------------------------
+r98 | lh3lh3 | 2009-01-15 16:47:41 +0000 (Thu, 15 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-13
+ * fixed the prior for hom indels (Richard pointed this out)
+
+------------------------------------------------------------------------
+r97 | lh3lh3 | 2009-01-15 16:38:47 +0000 (Thu, 15 Jan 2009) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/COPYING
+   M /branches/dev/samtools/bam_sort.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/source.dot
+
+ * samtools-0.1.1-12
+ * fixed a bug in sort
+ * update source file graph and copyright information
+
+------------------------------------------------------------------------
+r96 | lh3lh3 | 2009-01-14 21:46:14 +0000 (Wed, 14 Jan 2009) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/glf.c
+
+fixed a typo
+
+------------------------------------------------------------------------
+r95 | lh3lh3 | 2009-01-14 21:44:53 +0000 (Wed, 14 Jan 2009) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/glf.c
+
+added a main function for glf.c
+
+------------------------------------------------------------------------
+r94 | lh3lh3 | 2009-01-14 17:14:59 +0000 (Wed, 14 Jan 2009) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/bgzf.h
+   A /branches/dev/samtools/glf.c
+   M /branches/dev/samtools/glf.h
+
+ * samtools-0.1.1-11
+ * generate binary GLFv2
+ * added glfview command to dump GLFv2 binary file
+
+------------------------------------------------------------------------
+r93 | lh3lh3 | 2009-01-14 15:07:44 +0000 (Wed, 14 Jan 2009) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_rmdup.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/glf.h
+
+ * samtools-0.1.1-10
+ * fixed several bugs in rmdup
+ * prepare to generate GLF2
+
+------------------------------------------------------------------------
+r92 | lh3lh3 | 2009-01-14 13:27:44 +0000 (Wed, 14 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_import.c
+   A /branches/dev/samtools/bam_rmdup.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-9
+ * implemented rmdup; NOT tested yet
+
+------------------------------------------------------------------------
+r91 | lh3lh3 | 2009-01-13 20:15:43 +0000 (Tue, 13 Jan 2009) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/examples/00README.txt
+
+update README for typos
+
+------------------------------------------------------------------------
+r90 | lh3lh3 | 2009-01-13 19:57:50 +0000 (Tue, 13 Jan 2009) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/examples/ex1.sam.gz
+
+update example
+
+------------------------------------------------------------------------
+r89 | lh3lh3 | 2009-01-13 17:21:38 +0000 (Tue, 13 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.c
+   A /branches/dev/samtools/bam_mate.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-8
+ * added fixmate command
+
+------------------------------------------------------------------------
+r88 | lh3lh3 | 2009-01-13 10:48:23 +0000 (Tue, 13 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-7
+ * change the reported indel position to the previous way
+
+------------------------------------------------------------------------
+r87 | lh3lh3 | 2009-01-12 22:12:12 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-6
+ * addd glt output
+ * allow to change indel calling parameters at the command line
+
+------------------------------------------------------------------------
+r86 | lh3lh3 | 2009-01-12 21:16:48 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-5
+ * added two more flags
+ * allowed to select reads shown in pileup with a mask
+
+------------------------------------------------------------------------
+r85 | lh3lh3 | 2009-01-12 20:47:51 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-4
+ * fixed a bug in indexing (linear index)
+ * prepare to add glt output from pileup
+
+------------------------------------------------------------------------
+r84 | lh3lh3 | 2009-01-12 09:22:35 +0000 (Mon, 12 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-3
+ * fixed a bug in outputing the coordinate of an indel
+
+------------------------------------------------------------------------
+r83 | lh3lh3 | 2009-01-11 15:18:01 +0000 (Sun, 11 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-2
+ * pileup: allows to output indel sites only
+
+------------------------------------------------------------------------
+r82 | lh3lh3 | 2009-01-10 23:34:31 +0000 (Sat, 10 Jan 2009) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_maqcns.h
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-1
+ * implemented a Bayesian indel caller
+
+------------------------------------------------------------------------
+r81 | lh3lh3 | 2009-01-09 09:54:28 +0000 (Fri, 09 Jan 2009) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/examples/00README.txt
+   D /branches/dev/samtools/examples/ex1.fa.fai
+
+Let users generate ex1.fa.fai.
+
+------------------------------------------------------------------------
+r80 | lh3lh3 | 2009-01-08 16:10:08 +0000 (Thu, 08 Jan 2009) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/bowtie2sam.pl
+
+make the bowtie converter works for "-k 2"
+
+------------------------------------------------------------------------
+r78 | lh3lh3 | 2009-01-03 17:25:24 +0000 (Sat, 03 Jan 2009) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/export2sam.pl
+
+fixed a bug for "QC" reads
+
+------------------------------------------------------------------------
+r77 | lh3lh3 | 2009-01-01 18:32:06 +0000 (Thu, 01 Jan 2009) | 3 lines
+Changed paths:
+   A /branches/dev/samtools/misc/bowtie2sam.pl
+   M /branches/dev/samtools/misc/soap2sam.pl
+
+ * soap2sam.pl: added NM tag
+ * bowtie2sam.pl: converter for bowtie
+
+------------------------------------------------------------------------
+r76 | lh3lh3 | 2008-12-31 23:24:24 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/misc/soap2sam.pl
+
+soap2sam.pl: convert soap output to SAM
+
+------------------------------------------------------------------------
+r75 | lh3lh3 | 2008-12-31 17:54:32 +0000 (Wed, 31 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/misc/wgsim_eval.pl
+
+ * wgsim_eval.pl-0.1.1
+ * fixed a bug for a contig name like "NT_012345"
+
+------------------------------------------------------------------------
+r74 | lh3lh3 | 2008-12-31 16:38:21 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/misc/wgsim_eval.pl
+
+ * evaluate alignment for reads generated by wgsim
+
+------------------------------------------------------------------------
+r73 | lh3lh3 | 2008-12-31 15:11:22 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/Makefile
+   M /branches/dev/samtools/misc/wgsim.c
+
+fixed compiling warnings for wgsim
+
+------------------------------------------------------------------------
+r72 | lh3lh3 | 2008-12-31 13:40:51 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bam_tview.c
+
+remove an unused variable (a compiler warning only)
+
+------------------------------------------------------------------------
+r71 | lh3lh3 | 2008-12-31 13:37:16 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/Makefile
+   A /branches/dev/samtools/misc/wgsim.c
+
+wgsim: Paired-end reads simulator
+
+------------------------------------------------------------------------
+r70 | bhandsaker | 2008-12-29 20:27:16 +0000 (Mon, 29 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_tview.c
+
+Move definition of bam_nt16_nt4_table so we can build without curses.
+
+------------------------------------------------------------------------
+r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/NEWS
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/samtools.1
+
+Release samtools-0.1.1
+
+------------------------------------------------------------------------
+r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines
+Changed paths:
+   M /branches/dev/samtools/bam_aux.c
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/razf.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-66
+ * fixed a bug in razf.c: reset z_eof when razf_seek() is called
+ * fixed a memory leak in parsing a region
+ * changed pileup a little bit when -s is in use: output ^ and $
+ * when a bam is not indexed, output more meaningful error message
+ * fixed a bug in indexing for small alignment
+ * fixed a bug in the viewer when we come to the end of a reference file
+ * updated documentation
+ * prepare to release 0.1.1
+
+------------------------------------------------------------------------
+r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/examples
+   A /branches/dev/samtools/examples/00README.txt
+   A /branches/dev/samtools/examples/ex1.fa
+   A /branches/dev/samtools/examples/ex1.fa.fai
+   A /branches/dev/samtools/examples/ex1.sam.gz
+
+example
+
+------------------------------------------------------------------------
+r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/ChangeLog
+
+update ChangeLog
+
 ------------------------------------------------------------------------
 r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines
 Changed paths:
diff --git a/INSTALL b/INSTALL
new file mode 100644 (file)
index 0000000..9efcd0c
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,7 @@
+Compiling samtools requires the zlib library <http://www.zlib.net/>;
+compiling the alignment viewer further requires the GNU ncurses library
+<http://www.gnu.org/software/ncurses/>. The default Makefile builds the
+alignment viewer. If compilation fails due to errors in RAZF or the lack
+of ncurses, you can compile with `make -f Makefile.lite'. However, the
+alignment viewer will not be available; the FASTA indexer will not
+support indexing compressed FASTA, either.
\ No newline at end of file
index 32e4c41aa15cb717424cf85f5459b9510db712b1..aff7eaa0b2991157d336ce748638060cf26f5882 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,9 +2,10 @@ CC=                    gcc
 CXX=           g++
 CFLAGS=                -g -Wall -O2 -m64 #-arch ppc
 CXXFLAGS=      $(CFLAGS)
-DFLAGS=                -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -DHAVE_RAZF #-D_NO_CURSES
+DFLAGS=                -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES
 OBJS=          bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
-                       razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o
+                       razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
+                       bam_mate.o bam_rmdup.o glf.o
 PROG=          razip bgzip samtools
 INCLUDES=      
 LIBS=          -lm -lz
diff --git a/Makefile.lite b/Makefile.lite
new file mode 100644 (file)
index 0000000..2abb0ab
--- /dev/null
@@ -0,0 +1,63 @@
+CC=                    gcc
+CXX=           g++
+CFLAGS=                -g -Wall -O2 -m64 #-arch ppc
+CXXFLAGS=      $(CFLAGS)
+DFLAGS=                -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF
+OBJS=          bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
+                       bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
+                       bam_mate.o bam_rmdup.o glf.o
+PROG=          samtools
+INCLUDES=      
+LIBS=          -lm -lz
+SUBDIRS=       .
+
+.SUFFIXES:.c .o
+
+.c.o:
+               $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+               @target=`echo $@ | sed s/-recur//`; \
+               wdir=`pwd`; \
+               list='$(SUBDIRS)'; for subdir in $$list; do \
+                       cd $$subdir; \
+                       $(MAKE) -f Makefile.lite CC="$(CC)" CXX="$(CXX)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+                               INCLUDES="$(INCLUDES)" $$target || exit 1; \
+                       cd $$wdir; \
+               done;
+
+all:$(PROG)
+
+lib:libbam.a
+
+libbam.a:$(OBJS)
+               $(AR) -cru $@ $(OBJS)
+
+samtools:lib bamtk.o
+               $(CC) $(CFLAGS) -o $@ bamtk.o $(LIBS) -L. -lbam
+
+razip:razip.o razf.o
+               $(CC) $(CFLAGS) -o $@ razf.o razip.o $(LIBS)
+
+bgzip:bgzip.o bgzf.o
+               $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(LIBS)
+
+razip.o:razf.h
+bam.o:bam.h razf.h bam_endian.h
+bam_import.o:bam.h kseq.h khash.h razf.h
+bam_pileup.o:bam.h razf.h ksort.h
+bam_plcmd.o:bam.h faidx.h bam_maqcns.h
+bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
+bam_lpileup.o:bam.h ksort.h
+bam_tview.o:bam.h faidx.h bam_maqcns.h
+bam_maqcns.o:bam.h ksort.h bam_maqcns.h
+bam_sort.o:bam.h ksort.h razf.h
+razf.o:razf.h
+
+faidx.o:faidx.h razf.h khash.h
+faidx_main.o:faidx.h razf.h
+
+cleanlocal:
+               rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+
+clean:cleanlocal-recur
diff --git a/bam.c b/bam.c
index 6ccca7c0a1153c336a8e914742f5519abacb6edd..e470e629430cc23e0d9426dcdd1b3db764f6a1d0 100644 (file)
--- a/bam.c
+++ b/bam.c
@@ -249,6 +249,7 @@ void bam_view1(const bam_header_t *header, const bam1_t *b)
        }
        putchar('\t');
        if (c->mtid < 0) printf("*\t");
+       else if (c->mtid == c->tid) printf("=\t");
        else printf("%s\t", header->target_name[c->mtid]);
        printf("%d\t%d\t", c->mpos + 1, c->isize);
        for (i = 0; i < c->l_qseq; ++i) putchar(bam_nt16_rev_table[bam1_seqi(s, i)]);
diff --git a/bam.h b/bam.h
index 4b3a68856e7245d4f1595a194cc724f2f95af2eb..76f3e0f99f783a1779df83fc4b1be7c88ec6aaf8 100644 (file)
--- a/bam.h
+++ b/bam.h
@@ -46,7 +46,7 @@
 #include <string.h>
 #include <stdio.h>
 
-#if _IOLIB == 1
+#if _IOLIB == 1 && !defined(_NO_RAZF)
 #define BAM_TRUE_OFFSET
 #include "razf.h"
 /*! @abstract BAM file handler */
@@ -118,6 +118,10 @@ typedef struct {
 #define BAM_FREAD1        64
 #define BAM_FREAD2       128
 #define BAM_FSECONDARY   256
+#define BAM_FQCFAIL      512
+#define BAM_FDUP        1024
+
+#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
 
 #define BAM_CORE_SIZE   sizeof(bam1_core_t)
 
@@ -461,6 +465,8 @@ extern "C" {
        /*! @abstract pileup buffer */
        typedef struct __bam_plbuf_t bam_plbuf_t;
 
+       void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+
        /*! @typedef
          @abstract    Type of function to be called by bam_plbuf_push().
          @param  tid  chromosome ID as is defined in the header
@@ -516,7 +522,7 @@ extern "C" {
          @discussion The file position indicator must be placed right
          before the start of an alignment. See also bam_plbuf_push().
         */
-       int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data);
+       int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
 
        struct __bam_lplbuf_t;
        typedef struct __bam_lplbuf_t bam_lplbuf_t;
@@ -533,7 +539,7 @@ extern "C" {
        int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
 
        /*! @abstract  bam_plbuf_file() equivalent with level calculated. */
-       int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data);
+       int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
 
        struct __bam_index_t;
        typedef struct __bam_index_t bam_index_t;
@@ -641,7 +647,7 @@ static inline int bam_reg2bin(uint32_t beg, uint32_t end)
        return 0;
 }
 
-static inline void bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
 {
        uint8_t *data = bdst->data;
        int m_data = bdst->m_data;   // backup data and m_data
@@ -654,6 +660,18 @@ static inline void bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
        // restore the backup
        bdst->m_data = m_data;
        bdst->data = data;
+       return bdst;
+}
+
+static inline bam1_t *bam_dup1(const bam1_t *src)
+{
+       bam1_t *b;
+       b = bam_init1();
+       *b = *src;
+       b->m_data = b->data_len;
+       b->data = (uint8_t*)calloc(b->data_len, 1);
+       memcpy(b->data, src->data, b->data_len);
+       return b;
 }
 
 #endif
index 6b3b4bc69352a92d0c9e74379fe8ddd2e4564c43..4b7bb2122e0e046bf65874bdda27e03828479b06 100644 (file)
@@ -245,9 +245,9 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
                        type = str->s[3];
                        s = alloc_data(b, doff + 3) + doff;
                        s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
-                       if (type == 'A' || type == 'a') {
+                       if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
                                s = alloc_data(b, doff + 2) + doff;
-                               *s++ = type; *s = str->s[5];
+                               *s++ = 'A'; *s = str->s[5];
                                doff += 2;
                        } else if (type == 'I' || type == 'i') {
                                long long x;
@@ -341,7 +341,7 @@ static void taf2baf_core(const char *fntaf, const char *fnbaf, bam_header_t *hea
        int ret;
 
        b = (bam1_t*)calloc(1, sizeof(bam1_t));
-       fpbaf = bam_open(fnbaf, "w");
+       fpbaf = (strcmp(fnbaf, "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(fnbaf, "w");
        fp = sam_open(fntaf);
        ret = sam_read1(fp, header, b);
        bam_header_write(fpbaf, header);
index 2b01815f94ac6225b0448afe2a76aded6907fec3..6bc735eeaac4fbbb4318eabd6eb8c07c443465e5 100644 (file)
@@ -35,6 +35,7 @@
  */
 
 #define BAM_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
 #define BAM_LIDX_SHIFT    14
 
 typedef struct {
@@ -81,17 +82,21 @@ static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t
        l->list[l->n].u = beg; l->list[l->n++].v = end;
 }
 
-static inline void insert_offset2(bam_lidx_t *index2, int last, int curr, uint64_t offset)
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
 {
-       int i;
-       if (index2->m < curr + 1) {
-               index2->m = curr + 1;
+       int i, beg, end;
+       beg = b->core.pos >> BAM_LIDX_SHIFT;
+       end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
+       if (index2->m < end + 1) {
+               int old_m = index2->m;
+               index2->m = end + 1;
                kroundup32(index2->m);
                index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+               memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
        }
-       if (last > curr) last = -1;
-       for (i = last + 1; i <= curr; ++i) index2->offset[i] = offset;
-       index2->n = curr + 1;
+       for (i = beg + 1; i <= end; ++i)
+               if (index2->offset[i] == 0) index2->offset[i] = offset;
+       index2->n = end + 1;
 }
 
 static void merge_chunks(bam_index_t *idx)
@@ -127,7 +132,8 @@ bam_index_t *bam_index_core(bamFile fp)
        bam_header_t *h;
        int i, ret;
        bam_index_t *idx;
-       uint32_t last_coor, last_tid, last_bin, save_bin, save_tid;
+       uint32_t last_bin, save_bin;
+       int32_t last_coor, last_tid, save_tid;
        bam1_core_t *c;
        uint64_t save_off, last_off;
 
@@ -152,14 +158,14 @@ bam_index_t *bam_index_core(bamFile fp)
                        fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n");
                        exit(1);
                }
-               if (last_coor>>BAM_LIDX_SHIFT != b->core.pos>>BAM_LIDX_SHIFT) // then write the linear index
-                       insert_offset2(&idx->index2[b->core.tid], last_coor>>BAM_LIDX_SHIFT, b->core.pos>>BAM_LIDX_SHIFT, last_off);
+               if (b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
                if (c->bin != last_bin) { // then possibly write the binning index
                        if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
                                insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
                        save_off = last_off;
                        save_bin = last_bin = c->bin;
                        save_tid = c->tid;
+                       if (save_tid < 0) break;
                }
                if (bam_tell(fp) <= last_off) {
                        fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
@@ -169,7 +175,7 @@ bam_index_t *bam_index_core(bamFile fp)
                last_off = bam_tell(fp);
                last_coor = b->core.pos;
        }
-       insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+       if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
        merge_chunks(idx);
        if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
        free(b->data); free(b);
index 83f91c2d3edb39ffce3212ee1751e0fcec36d43f..368028766842db8b9b71c82c4fc85a2eb3a87092 100644 (file)
@@ -92,7 +92,7 @@ static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl
        max_level = 0;
        for (i = l = 0; i < n; ++i) {
                const bam_pileup1_t *p = pl + i;
-               if (p->qpos == 0) {
+               if (p->is_head) {
                        if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
                                freenode_t *p = tv->head->next;
                                tv->cur_level[i] = tv->head->level;
@@ -146,7 +146,7 @@ static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl
        // squeeze out terminated levels
        for (i = l = 0; i < n; ++i) {
                const bam_pileup1_t *p = pl + i;
-               if (p->qpos != p->b->core.l_qseq - 1)
+               if (!p->is_tail)
                        tv->pre_level[l++] = tv->pre_level[i];
        }
        tv->n_pre = l;
@@ -180,13 +180,14 @@ int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
        return bam_plbuf_push(b, tv->plbuf);
 }
 
-int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data)
+int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
 {
        bam_lplbuf_t *buf;
        int ret;
        bam1_t *b;
        b = (bam1_t*)calloc(1, sizeof(bam1_t));
        buf = bam_lplbuf_init(func, func_data);
+       bam_plbuf_set_mask(buf->plbuf, mask);
        while ((ret = bam_read1(fp, b)) >= 0)
                bam_lplbuf_push(b, buf);
        bam_lplbuf_push(0, buf);
index 65ed64c7031b030f84d9ac8ffc317ba8e5d942c1..3c3d81892ed95d06e298ee57539b36e3c5a26bf4 100644 (file)
@@ -262,6 +262,9 @@ uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm)
 bam_maqindel_opt_t *bam_maqindel_opt_init()
 {
        bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t));
+       mi->q_indel = 40;
+       mi->r_indel = 0.00015;
+       //
        mi->mm_penalty = 3;
        mi->indel_err = 4;
        mi->ambi_thres = 10;
@@ -271,7 +274,7 @@ bam_maqindel_opt_t *bam_maqindel_opt_init()
 void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir)
 {
        if (mir == 0) return;
-       free(mir->s1); free(mir->s2); free(mir);
+       free(mir->s[0]); free(mir->s[1]); free(mir);
 }
 
 #define MINUS_CONST 0x10000000
@@ -323,7 +326,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
        }
        { // the core part
                char *ref2, *inscns = 0;
-               int k, l, *score, max_ins = types[n_types-1];
+               int k, l, *score, *pscore, max_ins = types[n_types-1];
                ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
                if (max_ins > 0) { // get the consensus of inserted sequences
                        int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int));
@@ -340,7 +343,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                        }
                                }
                        }
-                       // construct the consensus
+                       // construct the consensus of inserted sequence
                        inscns = (char*)calloc(n_types * max_ins, sizeof(char));
                        for (i = 0; i < n_types; ++i) {
                                for (j = 0; j < types[i]; ++j) {
@@ -358,6 +361,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                }
                // calculate score
                score = (int*)calloc(n_types * n, sizeof(int));
+               pscore = (int*)calloc(n_types * n, sizeof(int));
                for (i = 0; i < n_types; ++i) {
                        // write ref2
                        for (k = 0, j = left; j <= pos; ++j)
@@ -372,27 +376,32 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                const bam_pileup1_t *p = pl + j;
                                uint32_t *cigar;
                                bam1_core_t *c = &p->b->core;
-                               int s;
+                               int s, ps;
                                bam_segreg_t seg;
                                if (c->flag&BAM_FUNMAP) continue;
                                cigar = bam1_cigar(p->b);
                                bam_segreg(pos, c, cigar, &seg);
-                               for (s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
+                               for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
                                        int cq = bam1_seqi(bam1_seq(p->b), l), ct;
                                        ct = c->pos + l >= left? ref2[c->pos + l - left] : 15; // "<" should not happen if there is no bug
-                                       if (cq < 15 && ct < 15)
+                                       if (cq < 15 && ct < 15) {
                                                s += cq == ct? 1 : -mi->mm_penalty;
+                                               if (cq != ct) ps += bam1_qual(p->b)[l];
+                                       }
                                }
-                               score[i*n + j] = s;
+                               score[i*n + j] = s; pscore[i*n + j] = ps;
                                if (types[i] != 0) { // then try the other way to calculate the score
-                                       for (s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
+                                       for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
                                                int cq = bam1_seqi(bam1_seq(p->b), l), ct;
                                                ct = c->pos + l + types[i] >= left? ref2[c->pos + l + types[i] - left] : 15;
-                                               if (cq < 15 && ct < 15)
+                                               if (cq < 15 && ct < 15) {
                                                        s += cq == ct? 1 : -mi->mm_penalty;
+                                                       if (cq != ct) ps += bam1_qual(p->b)[l];
+                                               }
                                        }
                                }
                                if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
+                               if (pscore[i*n+j] > ps) pscore[i*n+j] = ps;
                                if (types[i] != 0) score[i*n+j] -= mi->indel_err;
                                //printf("%d, %d, %d, %d\n", i, types[i], j, score[i*n+j]);
                        }
@@ -403,7 +412,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                        sum = (int*)calloc(n_types, sizeof(int));
                        for (i = 0; i < n_types; ++i)
                                for (j = 0; j < n; ++j)
-                                       sum[i] += score[i*n+j];
+                                       sum[i] += -pscore[i*n+j];
                        max1 = max2 = -0x7fffffff; max1_i = max2_i = -1;
                        for (i = 0; i < n_types; ++i) {
                                if (sum[i] > max1) {
@@ -416,26 +425,28 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                        // write ret
                        ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t));
                        ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i];
-                       ret->s1 = (char*)calloc(abs(ret->indel1) + 2, 1);
-                       ret->s2 = (char*)calloc(abs(ret->indel2) + 2, 1);
+                       ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1);
+                       ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1);
+                       // write indel sequence
                        if (ret->indel1 > 0) {
-                               ret->s1[0] = '+';
+                               ret->s[0][0] = '+';
                                for (k = 0; k < ret->indel1; ++k)
-                                       ret->s1[k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
+                                       ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
                        } else if (ret->indel1 < 0) {
-                               ret->s1[0] = '-';
+                               ret->s[0][0] = '-';
                                for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k)
-                                       ret->s1[k+1] = ref[pos + k + 1];
-                       } else ret->s1[0] = '*';
+                                       ret->s[0][k+1] = ref[pos + k + 1];
+                       } else ret->s[0][0] = '*';
                        if (ret->indel2 > 0) {
-                               ret->s2[0] = '+';
+                               ret->s[1][0] = '+';
                                for (k = 0; k < ret->indel2; ++k)
-                                       ret->s2[k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
+                                       ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
                        } else if (ret->indel2 < 0) {
-                               ret->s2[0] = '-';
+                               ret->s[1][0] = '-';
                                for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k)
-                                       ret->s2[k+1] = ref[pos + k + 1];
-                       } else ret->s2[0] = '*';
+                                       ret->s[1][k+1] = ref[pos + k + 1];
+                       } else ret->s[1][0] = '*';
+                       // write count
                        for (j = 0; j < n; ++j) {
                                if (score[max1_i*n+j] < 0 && score[max2_i*n+j] < 0) ++ret->cnt_anti;
                                else {
@@ -445,8 +456,34 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c
                                        else ++ret->cnt_ambi;
                                }
                        }
+                       // write gl[]
+                       ret->gl[0] = ret->gl[1] = 0;
+                       for (j = 0; j < n; ++j) {
+                               int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j];
+                               ret->gl[0] += s1 < s2? 0 : s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel;
+                               ret->gl[1] += s2 < s1? 0 : s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel;
+                       }
+               }
+               free(score); free(pscore); free(ref2); free(inscns);
+       }
+       { // call genotype
+               int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);
+               int min1, min2, min1_i;
+               q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel;
+               q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel;
+               q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel;
+               min1 = min2 = 0x7fffffff; min1_i = -1;
+               for (i = 0; i < 3; ++i) {
+                       if (q[i] < min1) {
+                               min2 = min1; min1 = q[i]; min1_i = i;
+                       } else if (q[i] < min2) min2 = q[i];
                }
-               free(score); free(ref2); free(inscns);
+               ret->gt = min1_i;
+               ret->q_cns = min2 - min1;
+               // set q_ref
+               if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3;
+               else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2];
+               if (ret->q_ref < 0) ret->q_ref = 0;
        }
        free(types);
        return ret;
index 5d410ef9fda607ba3f02a925522c4aa59864cf25..2c94fec066b25552b6008f28b8c129f5672cccc5 100644 (file)
@@ -16,13 +16,19 @@ typedef struct {
 } bam_maqcns_t;
 
 typedef struct {
+       int q_indel;
+       float r_indel;
+       // hidden parameters, unchangeable from command line
        int mm_penalty, indel_err, ambi_thres;
 } bam_maqindel_opt_t;
 
 typedef struct {
        int indel1, indel2;
        int cnt1, cnt2, cnt_ambi, cnt_anti;
-       char *s1, *s2;
+       char *s[2];
+       //
+       int gt, gl[2];
+       int q_cns, q_ref;
 } bam_maqindel_ret_t;
 
 #ifdef __cplusplus
diff --git a/bam_mate.c b/bam_mate.c
new file mode 100644 (file)
index 0000000..bb53605
--- /dev/null
@@ -0,0 +1,68 @@
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+
+// currently, this function ONLY works if each read has one hit
+void bam_mating_core(bamFile in, bamFile out)
+{
+       bam_header_t *header;
+       bam1_t *b[2];
+       int curr, has_prev;
+
+       header = bam_header_read(in);
+       bam_header_write(out, header);
+
+       b[0] = bam_init1();
+       b[1] = bam_init1();
+       curr = 0; has_prev = 0;
+       while (bam_read1(in, b[curr]) >= 0) {
+               bam1_t *cur = b[curr], *pre = b[1-curr];
+               if (has_prev) {
+                       if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
+                               cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
+                               pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
+                               if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
+                                       && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)))
+                               {
+                                       uint32_t cur5, pre5;
+                                       cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos;
+                                       pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos;
+                                       cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
+                               } else cur->core.isize = pre->core.isize = 0;
+                               if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FREVERSE;
+                               else cur->core.flag &= ~BAM_FMREVERSE;
+                               if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FREVERSE;
+                               else pre->core.flag &= ~BAM_FMREVERSE;
+                               if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
+                               if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
+                               bam_write1(out, pre);
+                               bam_write1(out, cur);
+                               has_prev = 0;
+                       } else { // unpaired or singleton
+                               pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
+                               pre->core.flag |= BAM_FMUNMAP;
+                               pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
+                               bam_write1(out, pre);
+                       }
+               } else has_prev = 1;
+               curr = 1 - curr;
+       }
+       if (has_prev) bam_write1(out, b[1-curr]);
+       bam_header_destroy(header);
+       bam_destroy1(b[0]);
+       bam_destroy1(b[1]);
+}
+
+int bam_mating(int argc, char *argv[])
+{
+       bamFile in, out;
+       if (argc < 3) {
+               fprintf(stderr, "samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n");
+               return 1;
+       }
+       in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
+    out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+       bam_mating_core(in, out);
+       bam_close(in); bam_close(out);
+       return 0;
+}
index d01f9a3d56c1753d48149aef6f9325c753da6808..6a578319c24b739ea7bcfbaae7305b0f75607f52 100644 (file)
@@ -111,6 +111,7 @@ struct __bam_plbuf_t {
        int32_t tid, pos, max_tid, max_pos;
        int max_pu, is_eof;
        bam_pileup1_t *pu;
+       int flag_mask;
 };
 
 void bam_plbuf_reset(bam_plbuf_t *buf)
@@ -127,6 +128,12 @@ void bam_plbuf_reset(bam_plbuf_t *buf)
        buf->head = buf->tail;
 }
 
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{
+       if (mask < 0) buf->flag_mask = BAM_DEF_MASK;
+       else buf->flag_mask = BAM_FUNMAP | mask;
+}
+
 bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
 {
        bam_plbuf_t *buf;
@@ -136,6 +143,7 @@ bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
        buf->head = buf->tail = mp_alloc(buf->mp);
        buf->dummy = mp_alloc(buf->mp);
        buf->max_tid = buf->max_pos = -1;
+       buf->flag_mask = BAM_DEF_MASK;
        return buf;
 }
 
@@ -153,6 +161,7 @@ void bam_plbuf_destroy(bam_plbuf_t *buf)
 int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
 {
        if (b) { // fill buffer
+               if (b->core.flag & buf->flag_mask) return 0;
                bam_copy1(&buf->tail->b, b);
                buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
                if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) {
@@ -197,13 +206,14 @@ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
        return 0;
 }
 
-int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data)
+int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
 {
        bam_plbuf_t *buf;
        int ret;
        bam1_t *b;
        b = (bam1_t*)calloc(1, sizeof(bam1_t));
        buf = bam_plbuf_init(func, func_data);
+       bam_plbuf_set_mask(buf, mask);
        while ((ret = bam_read1(fp, b)) >= 0)
                bam_plbuf_push(b, buf);
        bam_plbuf_push(0, buf);
index 0140c6670c1facfbd6b3084bff5cf812ab4f08ab..dbfc29576b57d9a7dbdee8c4f928c7334f468ce5 100644 (file)
@@ -5,10 +5,13 @@
 #include "faidx.h"
 #include "bam_maqcns.h"
 #include "khash.h"
+#include "glf.h"
 KHASH_SET_INIT_INT64(64)
 
-#define BAM_PLF_SIMPLE 0x01
-#define BAM_PLF_CNS 0x02
+#define BAM_PLF_SIMPLE     0x01
+#define BAM_PLF_CNS        0x02
+#define BAM_PLF_INDEL_ONLY 0x04
+#define BAM_PLF_GLF        0x08
 
 typedef struct {
        bam_header_t *h;
@@ -18,7 +21,9 @@ typedef struct {
        khash_t(64) *hash;
        uint32_t format;
        int tid, len;
+       int mask;
        char *ref;
+       glfFile fp; // for glf output only
 } pu_data_t;
 
 char **bam_load_pos(const char *fn, int *_n);
@@ -46,6 +51,61 @@ static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
        return hash;
 }
 
+// an analogy to pileup_func() below
+static int glt_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{
+       pu_data_t *d = (pu_data_t*)data;
+       bam_maqindel_ret_t *r = 0;
+       int rb;
+       glf1_t *g;
+       glf2_t g2;
+       if (d->fai == 0) {
+               fprintf(stderr, "[glt_func] reference sequence is required for generating GLT. Abort!\n");
+               exit(1);
+       }
+       if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+       if (d->fai && (int)tid != d->tid) {
+               if (d->ref) { // then write the end mark
+                       memset(&g2, 0, sizeof(glf2_t));
+                       g2.type = GLF_TYPE_END;
+                       bgzf_write(d->fp, &g2, sizeof(glf2_t));
+               }
+               glf_ref_write(d->fp, d->h->target_name[tid]); // write reference
+               free(d->ref);
+               d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+               d->tid = tid;
+       }
+       rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+       g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c);
+       memcpy(&g2, g, sizeof(glf1_t));
+       g2.type = GLF_TYPE_NORMAL;
+       g2.pos = pos;
+       bgzf_write(d->fp, &g2, sizeof(glf2_t));
+       r = bam_maqindel(n, pos, d->ido, pu, d->ref);
+       if (r) { // then write indel line
+               int g3 = 3 * n, min;
+               min = g3;
+               if (min > r->gl[0]) min = r->gl[0];
+               if (min > r->gl[1]) min = r->gl[1];
+               g3 -= min;
+               g2.ref_base = 0;
+               g2.type = GLF_TYPE_INDEL;
+               memset(g2.lk, 0, 10);
+               g2.lk[0] = r->gl[0] < 255? r->gl[0] : 255;
+               g2.lk[1] = r->gl[1] < 255? r->gl[1] : 255;
+               g2.lk[2] = g3 < 255? g3 : 255;
+               *(int16_t*)(g2.lk + 3) = (int16_t)r->indel1;
+               *(int16_t*)(g2.lk + 5) = (int16_t)r->indel2;
+               g2.min_lk = min < 255? min : 255;
+               bgzf_write(d->fp, &g2, sizeof(glf2_t));
+               if (r->indel1) bgzf_write(d->fp, r->s[0]+1, r->indel1>0? r->indel1 : -r->indel1);
+               if (r->indel2) bgzf_write(d->fp, r->s[1]+1, r->indel2>0? r->indel2 : -r->indel2);
+               bam_maqindel_ret_destroy(r);
+       }
+       free(g);
+       return 0;
+}
+
 static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
 {
        pu_data_t *d = (pu_data_t*)data;
@@ -53,12 +113,18 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
        int i, j, rb;
        uint32_t x;
        if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+       if (d->format & BAM_PLF_GLF) return glt_func(tid, pos, n, pu, data);
        if (d->fai && (int)tid != d->tid) {
                free(d->ref);
                d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
                d->tid = tid;
        }
        rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+       if (d->format & BAM_PLF_INDEL_ONLY) {
+               for (i = 0; i < n; ++i)
+                       if (pu[i].indel != 0) break;
+               if (i == n) return 0;
+       }
        printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
        if (d->format & BAM_PLF_CNS) { // consensus
                int ref_q, rb4 = bam_nt16_table[rb];
@@ -69,6 +135,8 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
                        if (ref_q > 255) ref_q = 255;
                }
                printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[x>>28], x>>8&0xff, ref_q, x>>16&0xff);
+       }
+       if (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) {
                if (d->ref) // indel calling
                        r = bam_maqindel(n, pos, d->ido, pu, d->ref);
        }
@@ -115,7 +183,12 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p
        }
        putchar('\n');
        if (r) { // then print indel line
-               printf("%s\t%d\t*\t%s/%s\t", d->h->target_name[tid], pos + 1, r->s1, r->s2);
+               printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1);
+               if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]);
+               else printf("%s/%s\t", r->s[0], r->s[1]);
+               printf("%d\t%d\t", r->q_cns, r->q_ref);
+               printf("%s\t%s\t", r->s[0], r->s[1]);
+               //printf("%d\t%d\t", r->gl[0], r->gl[1]);
                printf("%d\t%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_ambi, r->cnt_anti);
                bam_maqindel_ret_destroy(r);
        }
@@ -127,9 +200,10 @@ int bam_pileup(int argc, char *argv[])
        int c;
        char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
        pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
-       d->tid = -1;
+       d->tid = -1; d->mask = BAM_DEF_MASK;
        d->c = bam_maqcns_init();
-       while ((c = getopt(argc, argv, "st:f:cT:N:r:l:")) >= 0) {
+       d->ido = bam_maqindel_opt_init();
+       while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:")) >= 0) {
                switch (c) {
                case 's': d->format |= BAM_PLF_SIMPLE; break;
                case 't': fn_list = strdup(optarg); break;
@@ -139,6 +213,11 @@ int bam_pileup(int argc, char *argv[])
                case 'N': d->c->n_hap = atoi(optarg); break;
                case 'r': d->c->het_rate = atoi(optarg); break;
                case 'c': d->format |= BAM_PLF_CNS; break;
+               case 'i': d->format |= BAM_PLF_INDEL_ONLY; break;
+               case 'm': d->mask = atoi(optarg); break;
+               case 'g': d->format |= BAM_PLF_GLF; break;
+               case 'I': d->ido->q_indel = atoi(optarg); break;
+               case 'G': d->ido->r_indel = atof(optarg); break;
                default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1;
                }
        }
@@ -146,26 +225,38 @@ int bam_pileup(int argc, char *argv[])
                fprintf(stderr, "\n");
                fprintf(stderr, "Usage:  bamtk pileup [options] <in.bam>|<in.sam>\n\n");
                fprintf(stderr, "Option: -s        simple (yet incomplete) pileup format\n");
+               fprintf(stderr, "        -i        only show lines/consensus with indels\n");
+               fprintf(stderr, "        -m INT    filtering reads with bits in INT [%d]\n", d->mask);
                fprintf(stderr, "        -t FILE   list of reference sequences (assume the input is in SAM)\n");
                fprintf(stderr, "        -l FILE   list of sites at which pileup is output\n");
                fprintf(stderr, "        -f FILE   reference sequence in the FASTA format\n\n");
                fprintf(stderr, "        -c        output the maq consensus sequence\n");
-               fprintf(stderr, "        -T FLOAT  theta in maq consensus calling model (for -c only) [%f]\n", d->c->theta);
-               fprintf(stderr, "        -N INT    number of haplotypes in the sample (for -c only) [%d]\n", d->c->n_hap);
-               fprintf(stderr, "        -r FLOAT  prior of a difference between any two haplotypes (for -c only) [%f]\n\n",
-                               d->c->het_rate);
+               fprintf(stderr, "        -g        output in the extended GLT format (suppressing -c/-i/-s)\n");
+               fprintf(stderr, "        -T FLOAT  theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta);
+               fprintf(stderr, "        -N INT    number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap);
+               fprintf(stderr, "        -r FLOAT  prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate);
+               fprintf(stderr, "        -G FLOAT  prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel);
+               fprintf(stderr, "        -I INT    phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel);
+               fprintf(stderr, "\n");
                free(fn_list); free(fn_fa); free(d);
                return 1;
        }
        if (fn_fa) d->fai = fai_load(fn_fa);
        free(fn_fa);
        bam_maqcns_prepare(d->c);
-       d->ido = bam_maqindel_opt_init();
+       if (d->format & BAM_PLF_GLF) {
+               glf_header_t *h;
+               h = glf_header_init();
+               d->fp = bgzf_fdopen(fileno(stdout), "w");
+               glf_header_write(d->fp, h);
+               glf_header_destroy(h);
+       }
        if (fn_list) {
                tamFile fp;
                bam1_t *b;
                int ret;
                bam_plbuf_t *buf = bam_plbuf_init(pileup_func, d);
+               bam_plbuf_set_mask(buf, d->mask);
                d->h = sam_header_read2(fn_list);
                if (fn_pos) d->hash = load_pos(fn_pos, d->h);
                fp = sam_open(argv[optind]);
@@ -181,9 +272,10 @@ int bam_pileup(int argc, char *argv[])
                fp = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r");
                d->h = bam_header_read(fp);
                if (fn_pos) d->hash = load_pos(fn_pos, d->h);
-               bam_pileup_file(fp, pileup_func, d);
+               bam_pileup_file(fp, d->mask, pileup_func, d);
                bam_close(fp);
        }
+       if (d->format & BAM_PLF_GLF) bgzf_close(d->fp);
        free(fn_pos); free(fn_list);
        kh_destroy(64, d->hash);
        bam_header_destroy(d->h);
diff --git a/bam_rmdup.c b/bam_rmdup.c
new file mode 100644 (file)
index 0000000..321938f
--- /dev/null
@@ -0,0 +1,140 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <zlib.h>
+#include "bam.h"
+
+typedef bam1_t *bam1_p;
+#include "khash.h"
+KHASH_SET_INIT_STR(name)
+KHASH_MAP_INIT_INT64(pos, bam1_p)
+
+#define BUFFER_SIZE 0x40000
+
+typedef struct {
+       int n, max;
+       bam1_t **a;
+} tmp_stack_t;
+
+static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
+{
+       if (stack->n == stack->max) {
+               stack->max = stack->max? stack->max<<1 : 0x10000;
+               stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
+       }
+       stack->a[stack->n++] = b;
+}
+
+static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
+{
+       int i;
+       for (i = 0; i != stack->n; ++i) {
+               bam_write1(out, stack->a[i]);
+               bam_destroy1(stack->a[i]);
+       }
+       stack->n = 0;
+       if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash);
+}
+
+static void clear_del_set(khash_t(name) *del_set)
+{
+       khint_t k;
+       for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
+               if (kh_exist(del_set, k))
+                       free((char*)kh_key(del_set, k));
+       kh_clear(name, del_set);
+}
+
+void bam_rmdup_core(bamFile in, bamFile out)
+{
+       bam_header_t *header;
+       bam1_t *b;
+       int last_tid = -1, last_pos = -1;
+       uint64_t n_checked = 0, n_removed = 0;
+       tmp_stack_t stack;
+       khint_t k;
+       khash_t(pos) *best_hash;
+       khash_t(name) *del_set;
+
+       best_hash = kh_init(pos);
+       del_set = kh_init(name);
+       b = bam_init1();
+       memset(&stack, 0, sizeof(tmp_stack_t));
+       header = bam_header_read(in);
+       bam_header_write(out, header);
+
+       kh_resize(name, del_set, 4 * BUFFER_SIZE);
+       kh_resize(pos, best_hash, 3 * BUFFER_SIZE);
+       while (bam_read1(in, b) >= 0) {
+               bam1_core_t *c = &b->core;
+               if (c->tid != last_tid || last_pos != c->pos) {
+                       dump_best(&stack, best_hash, out); // write the result
+                       if (c->tid != last_tid) {
+                               kh_clear(pos, best_hash);
+                               if (kh_size(del_set)) { // check
+                                       fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
+                                       clear_del_set(del_set);
+                               }
+                               last_tid = c->tid;
+                               fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
+                       }
+               }
+               if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
+                       bam_write1(out, b);
+               } else if (c->isize > 0) { // paired, head
+                       uint64_t key = (uint64_t)c->pos<<32 | c->isize;
+                       int ret;
+                       ++n_checked;
+                       k = kh_put(pos, best_hash, key, &ret);
+                       if (ret == 0) { // found in best_hash
+                               bam1_t *p = kh_val(best_hash, k);
+                               ++n_removed;
+                               if (p->core.qual < c->qual) { // the current alignment is better
+                                       kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
+                                       bam_copy1(p, b); // replaced as b
+                               } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
+                               if (ret == 0)
+                                       fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
+                       } else { // not found in best_hash
+                               kh_val(best_hash, k) = bam_dup1(b);
+                               stack_insert(&stack, kh_val(best_hash, k));
+                       }
+               } else { // paired, tail
+                       k = kh_get(name, del_set, bam1_qname(b));
+                       if (k != kh_end(del_set)) {
+                               free((char*)kh_key(del_set, k));
+                               kh_del(name, del_set, k);
+                       } else bam_write1(out, b);
+               }
+               last_pos = c->pos;
+       }
+       dump_best(&stack, best_hash, out);
+
+       bam_header_destroy(header);
+       clear_del_set(del_set);
+       kh_destroy(name, del_set);
+       kh_destroy(pos, best_hash);
+       free(stack.a);
+       bam_destroy1(b);
+       fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked,
+                       (double)n_removed/n_checked);
+}
+int bam_rmdup(int argc, char *argv[])
+{
+       bamFile in, out;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
+               return 1;
+       }
+       in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
+       out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+       if (in == 0 || out == 0) {
+               fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+               return 1;
+       }
+       bam_rmdup_core(in, out);
+       bam_close(in);
+       bam_close(out);
+       return 0;
+}
index c5ed5835b0c26cb4e3ed0a756af26ee8f6d76ef6..68d15dc58f7cde597615a5d49aa988f7a65cd366 100644 (file)
@@ -191,7 +191,7 @@ void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t m
                        fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
                        sprintf(fns[i], "%s.%.4d.bam", prefix, i);
                }
-               bam_merge_core(0, fnout, n, fns);
+               bam_merge_core(is_by_qname, fnout, n, fns);
                free(fnout);
                for (i = 0; i < n; ++i) {
                        unlink(fns[i]);
diff --git a/bamtk.c b/bamtk.c
index 54ef455bdc1adbd01438554bed13e6db359a2096..ffd2e48804a2c3ff929c1cfe19b3b22f49275ad6 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -3,7 +3,7 @@
 #include "bam.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.1"
+#define PACKAGE_VERSION "0.1.1-18"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
@@ -12,7 +12,11 @@ int bam_merge(int argc, char *argv[]);
 int bam_index(int argc, char *argv[]);
 int bam_sort(int argc, char *argv[]);
 int bam_tview_main(int argc, char *argv[]);
+int bam_mating(int argc, char *argv[]);
+int bam_rmdup(int argc, char *argv[]);
+
 int faidx_main(int argc, char *argv[]);
+int glf_view_main(int argc, char *argv[]);
 
 static int view_aux(const bam1_t *b, void *data)
 {
@@ -87,6 +91,9 @@ static int usage()
        fprintf(stderr, "         tview       text alignment viewer\n");
 #endif
        fprintf(stderr, "         index       index alignment\n");
+       fprintf(stderr, "         fixmate     fix mate information\n");
+       fprintf(stderr, "         rmdup       remove PCR duplicates\n");
+       fprintf(stderr, "         glfview     print GLFv2 file\n");
        fprintf(stderr, "\n");
        return 1;
 }
@@ -101,6 +108,9 @@ int main(int argc, char *argv[])
        else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1);
        else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1);
        else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
+       else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
+       else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
+       else if (strcmp(argv[1], "glfview") == 0) return glf_view_main(argc-1, argv+1);
 #ifndef _NO_CURSES
        else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
 #endif
diff --git a/bgzf.h b/bgzf.h
index 4ed5c2996001ab74a501e84ded93abf2c426bfa7..7e76bcb9d7eeb128ba3b7047cb1211bdca582a91 100644 (file)
--- a/bgzf.h
+++ b/bgzf.h
@@ -9,7 +9,7 @@
  * or functionality.
  */
 
-#ifndef __BCGZ_H
+#ifndef __BGZF_H
 #define __BGZF_H
 
 #include <stdint.h>
index 5dd123cb1eaa65b4c88815f1fe0366ed75bbbaa7..dbb276f467124d3eab407a22657949be27000c4d 100644 (file)
@@ -1,28 +1,23 @@
-NA18507_part.fa contains two sequences cut from the human genome
+File ex1.fa contains two sequences cut from the human genome
 build36. They were exatracted with command:
 
   samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550
 
-Sequence names were changed manually for simplicity. ex1.fa.fai is the
-index for the sequence file, generated by:
-
-  samtools faidx ex1.fa
-
-This index file also works as the reference list file used by `import'
-and `pileup' commands of samtools. ex1.sam.gz contains MAQ alignments
-exatracted with:
+Sequence names were changed manually for simplicity. File ex1.sam.gz
+contains MAQ alignments exatracted with:
 
   (samtools view NA18507_maq.bam 2:2044001-2045500;
    samtools view NA18507_maq.bam 20:68001-69500)
 
-and processed with an awk command to make everything consistent as a
+and processed with `samtools fixmate' to make it self-consistent as a
 standalone alignment.
 
 To try samtools, you may run the following commands:
 
-  samtools import ex1.fa.fai ex1.sam.gz ex1.bam
-  samtools index ex1.bam
-  samtools tview ex1.bam ex1.fa
-  samtools pileup -cf ex1.fa ex1.bam
+  samtools faidx ex1.fa                 # index the reference FASTA
+  samtools import ex1.fa.fai ex1.sam.gz ex1.bam   # SAM->BAM
+  samtools index ex1.bam                # index BAM
+  samtools tview ex1.bam ex1.fa         # view alignment
+  samtools pileup -cf ex1.fa ex1.bam    # pileup and consensus
   samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz
 
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644 (file)
index 0000000..b6908fa
--- /dev/null
@@ -0,0 +1,21 @@
+all:../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz
+               @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo;
+
+ex1.fa.fai:ex1.fa
+               ../samtools faidx ex1.fa
+ex1.bam:ex1.sam.gz ex1.fa.fai
+               ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam
+ex1.bam.bai:ex1.bam
+               ../samtools index ex1.bam
+ex1.pileup.gz:ex1.bam ex1.fa
+               ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz
+ex1.glf:ex1.bam ex1.fa
+               ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf
+ex1.glfview.gz:ex1.glf
+               ../samtools glfview ex1.glf | gzip > ex1.glfview.gz
+
+../samtools:
+               (cd ..; make samtools)
+
+clean:
+               rm -f *.bam *.bai *.glf* *.fai *.pileup* *~
\ No newline at end of file
diff --git a/examples/ex1.fa.fai b/examples/ex1.fa.fai
deleted file mode 100644 (file)
index bac151a..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-seq1   1575    6       60      61
-seq2   1584    1614    60      61
index 1a213d15ff9b6c79cec4b497021a9cbdf83ae88c..26c06a6135996e082ed71c4c25367817181db705 100644 (file)
Binary files a/examples/ex1.sam.gz and b/examples/ex1.sam.gz differ
diff --git a/faidx.c b/faidx.c
index 44e7f5710ce06fdd350d265ae4887eb2326eea51..9302dfb4d6480bb94295d65e7fc7fcf4c4644332 100644 (file)
--- a/faidx.c
+++ b/faidx.c
@@ -12,7 +12,7 @@ typedef struct {
 } faidx1_t;
 KHASH_MAP_INIT_STR(s, faidx1_t)
 
-#ifdef HAVE_RAZF
+#ifndef _NO_RAZF
 #include "razf.h"
 #else
 extern off_t ftello(FILE *stream);
diff --git a/glf.c b/glf.c
new file mode 100644 (file)
index 0000000..302f9ef
--- /dev/null
+++ b/glf.c
@@ -0,0 +1,156 @@
+#include <string.h>
+#include <stdlib.h>
+#include "glf.h"
+
+#ifdef _NO_BGZF
+// then alias bgzf_*() functions
+#endif
+
+static inline int bam_is_big_endian()
+{
+       long one= 1;
+       return !(*((char *)(&one)));
+}
+
+glf_header_t *glf_header_init()
+{
+       return (glf_header_t*)calloc(1, sizeof(glf_header_t));
+}
+
+glf_header_t *glf_header_read(glfFile fp)
+{
+       glf_header_t *h;
+       char magic[4];
+       if (bam_is_big_endian()) {
+               fprintf(stderr, "[glf_header_read] Big endian is detected. Abort.\n");
+               exit(1);
+       }
+       h = (glf_header_t*)calloc(1, sizeof(glf_header_t));
+       bgzf_read(fp, magic, 4);
+       if (strncmp(magic, "GLF\2", 4)) {
+               fprintf(stderr, "[glf_header_read] invalid magic. Abort.\n");
+               exit(1);
+       }
+       bgzf_read(fp, &h->l_text, 4);
+       if (h->l_text) {
+               h->text = (uint8_t*)calloc(h->l_text + 1, 1);
+               bgzf_read(fp, h->text, h->l_text);
+       }
+       return h;
+}
+
+void glf_header_write(glfFile fp, const glf_header_t *h)
+{
+       bgzf_write(fp, "GLF\2", 4);
+       bgzf_write(fp, &h->l_text, 4);
+       if (h->l_text) bgzf_write(fp, h->text, h->l_text);
+}
+
+void glf_header_destroy(glf_header_t *h)
+{
+       free(h->text);
+       free(h);
+}
+
+char *glf_ref_read(glfFile fp)
+{
+       int32_t n;
+       char *str;
+       if (bgzf_read(fp, &n, 4) != 4) return 0;
+       if (n < 0) {
+               fprintf(stderr, "[glf_ref_read] invalid reference name length: %d.\n", n);
+               return 0;
+       }
+       str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact
+       bgzf_read(fp, str, n);
+       return str;
+}
+
+void glf_ref_write(glfFile fp, const char *str)
+{
+       int32_t n = strlen(str);
+       ++n;
+       bgzf_write(fp, &n, 4);
+       bgzf_write(fp, str, n);
+}
+
+void glf_view_normal(const char *ref_name, glf2_t *g1)
+{
+       int j;
+       printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, g1->pos + 1, "XACMGRSVTWYHKDBN"[g1->ref_base],
+                  g1->depth, g1->max_mapQ, g1->min_lk);
+       for (j = 0; j != 10; ++j) printf("\t%d", g1->lk[j]);
+       printf("\n");
+}
+
+static char *glf_read_indel(glfFile fp, char *str, int *max, int16_t indel)
+{
+       int l = indel > 0? indel : -indel;
+       if (l + 1 > *max) {
+               *max = l + 1;
+               str = (char*)realloc(str, *max);
+       }
+       bgzf_read(fp, str, l);
+       str[l] = 0;
+       return str;
+}
+
+void glf_view(glfFile fp)
+{
+       glf_header_t *h;
+       char *name, *str;
+       glf2_t g2;
+       int max;
+       
+       h = glf_header_read(fp);
+       str = 0; max = 0;
+       while ((name = glf_ref_read(fp)) != 0) {
+               while (bgzf_read(fp, &g2, sizeof(glf2_t))) {
+                       if (g2.type == GLF_TYPE_END) break;
+                       else if (g2.type == GLF_TYPE_NORMAL) glf_view_normal(name, &g2);
+                       else if (g2.type == GLF_TYPE_INDEL) {
+                               int16_t indel1, indel2;
+                               printf("%s\t%d\t*\t%d\t%d\t%d\t", name, g2.pos + 1, g2.depth, g2.max_mapQ, g2.min_lk);
+                               printf("%d\t%d\t%d\t", g2.lk[0], g2.lk[1], g2.lk[2]);
+                               indel1 = *(int16_t*)(g2.lk + 3);
+                               indel2 = *(int16_t*)(g2.lk + 5);
+                               printf("%d\t%d\t", indel1, indel2);
+                               if (indel1) {
+                                       str = glf_read_indel(fp, str, &max, indel1);
+                                       printf("%c%d%s\t", indel1>0? '+':'-', indel1>0?indel1:-indel1, str);
+                               } else printf("*\t");
+                               if (indel2) {
+                                       str = glf_read_indel(fp, str, &max, indel2);
+                                       printf("%c%d%s\n", indel2>0? '+':'-', indel2>0?indel2:-indel2, str);
+                               } else printf("*\n");
+                       }
+               }
+               free(name);
+       }
+       glf_header_destroy(h);
+       free(str);
+}
+
+int glf_view_main(int argc, char *argv[])
+{
+       glfFile fp;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: glfview <in.glf>\n");
+               return 1;
+       }
+       fp = (strcmp(argv[1], "-") == 0)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r");
+       if (fp == 0) {
+               fprintf(stderr, "Fail to open file '%s'\n", argv[1]);
+               return 1;
+       }
+       glf_view(fp);
+       bgzf_close(fp);
+       return 0;
+}
+
+#ifdef GLFVIEW_MAIN
+int main(int argc, char *argv[])
+{
+       return glf_view_main(argc, argv);
+}
+#endif
diff --git a/glf.h b/glf.h
index d9d23c624bb0aded1b4309b68311fdc94106f115..af72a48f95385635a09ee692138969acb1c8cfde 100644 (file)
--- a/glf.h
+++ b/glf.h
@@ -8,4 +8,40 @@ typedef struct {
        unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
 } glf1_t;
 
+#include <stdint.h>
+#include "bgzf.h"
+typedef BGZF *glfFile;
+
+#define GLF_TYPE_NORMAL 0
+#define GLF_TYPE_INDEL  1
+#define GLF_TYPE_END   15
+
+typedef struct {
+       unsigned char ref_base:4, type:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */
+       unsigned char max_mapQ; /** maximum mapping quality */
+       unsigned char lk[10];   /** log likelihood ratio, capped at 255 */
+       unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
+       unsigned pos; /** this is ***ZERO-BASED*** coordinate */
+} glf2_t;
+
+typedef struct {
+       int32_t l_text;
+       uint8_t *text;
+} glf_header_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       glf_header_t *glf_header_init();
+       glf_header_t *glf_header_read(glfFile fp);
+       void glf_header_write(glfFile fp, const glf_header_t *h);
+       void glf_header_destroy(glf_header_t *h);
+       char *glf_ref_read(glfFile fp);
+       void glf_ref_write(glfFile fp, const char *str);
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif
index 8a38f54a3cdf2fd01aab1ac8b0723f49ee228ee6..f4e50ff6ab39f11884b882c9877562b2e552ab82 100644 (file)
@@ -29,7 +29,7 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur:
 lib:
 
 faidx:../faidx.c ../faidx.h
-               $(CC) $(CFLAGS) -DFAIDX_MAIN -o $@ ../faidx.c
+               $(CC) $(CFLAGS) -D_NO_RAZF -DFAIDX_MAIN -o $@ ../faidx.c
 
 md5fa:md5.o md5fa.o md5.h ../kseq.h
                $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz
index e30aa925784198aab70132184a2dddb6f2e7705d..70ecf4145a73db6265b5df61cf18270b87d3f1c5 100644 (file)
@@ -5,6 +5,8 @@
 #include <stdlib.h>
 #include <assert.h>
 
+#define PACKAGE_VERSION "0.1.1 (20090120)"
+
 //#define MAQ_LONGREADS
 
 #ifdef MAQ_LONGREADS
@@ -84,7 +86,7 @@ maqmap_t *maqmap_read_header(gzFile fp)
        return mm;
 }
 
-void maq2tam_core(gzFile fp)
+void maq2tam_core(gzFile fp, const char *rg)
 {
        maqmap_t *mm;
        maqmap1_t mm1, *m1;
@@ -141,6 +143,7 @@ void maq2tam_core(gzFile fp)
                for (j = 0; j != m1->size; ++j)
                        putchar((m1->seq[j]&0x3f) + 33);
                putchar('\t');
+               if (rg) printf("RG:Z:%s\t", rg);
                if (flag&4) {
                        printf("MF:i:%d\n", m1->flag);
                } else {
@@ -158,11 +161,12 @@ int main(int argc, char *argv[])
 {
        gzFile fp;
        if (argc == 1) {
-               fprintf(stderr, "Usage: maq2tam <in.map>\n");
+               fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+               fprintf(stderr, "Usage: maq2sam <in.map> [<readGroup>]\n");
                return 1;
        }
        fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
-       maq2tam_core(fp);
+       maq2tam_core(fp, argc > 2? argv[2] : 0);
        gzclose(fp);
        return 0;
 }
diff --git a/razf.c b/razf.c
index 6611f0b9e5e95721d8b29510ec423213d86ae8f4..13d6589b0c2907858d5617786e52fbc177df432c 100644 (file)
--- a/razf.c
+++ b/razf.c
@@ -31,6 +31,8 @@
  * To compile razf.c, zlib-1.2.3(or greater) is required.
  */
 
+#ifndef _NO_RAZF
+
 #include <fcntl.h>
 #include <stdio.h>
 #include "razf.h"
@@ -645,3 +647,5 @@ void razf_close(RAZF *rz){
        close(rz->filedes);
        free(rz);
 }
+
+#endif
index cfa2222045a4c0993108be83a2706d137f5b0423..17357740ba0e84a716960e024d03322b68af28a0 100644 (file)
@@ -5,11 +5,15 @@ digraph {
   sort[label="bam_sort.c\n(sort, merge)"]
   index[label="bam_index.c\n(index)"]
   tview[label="bam_tview.c\n(tview)"]
+  glf[label="glf.c\n(glfview)"]
+  rmdup[label="bam_rmdup.c\n(rmdup)"]
+  fixmate[label="bam_mate.c\n(fixmate)"]
   "bam_aux.c" -> {"bam.c", import}
-  "bgzf.c" -> "bam.c"
-  "bam.c" -> {index, "bam_pileup.c", sort, import}
+  glf -> {"bam_maqcns.c", plcmd}
+  "bgzf.c" -> {"bam.c", glf}
+  "bam.c" -> {index, "bam_pileup.c", sort, import, rmdup, fixmate}
   "bam_pileup.c" -> {"bam_lpileup.c", plcmd}
   {"bam_lpileup.c", index, faidx, "bam_maqcns.c"} -> tview
   {import, faidx, "bam_maqcns.c"} -> plcmd
-  {tview, plcmd, faidx, sort, import, index} -> "bamtk.c\n(view)"
+  {tview, plcmd, faidx, sort, import, index, glf, rmdup, fixmate} -> "bamtk.c\n(view)"
 }
\ No newline at end of file