The MIT License
-Copyright (c) 2008 Genome Research Ltd.
+Copyright (c) 2008-2009 Genome Research Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
+------------------------------------------------------------------------
+r104 | lh3lh3 | 2009-01-18 17:31:21 +0000 (Sun, 18 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_lpileup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-18
+ * fixed a bug in bam_lpileup.c: segment start and end are not correctly recognized
+
+------------------------------------------------------------------------
+r103 | lh3lh3 | 2009-01-18 16:34:03 +0000 (Sun, 18 Jan 2009) | 5 lines
+Changed paths:
+ M /branches/dev/samtools/bam_import.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-17
+ * fixed a bug when there are reads without coordinates
+ * also recognize type 'c' as 'A'
+ * found a bug in bam_lpileup.c; NOT fixed yet
+
+------------------------------------------------------------------------
+r102 | lh3lh3 | 2009-01-17 19:46:49 +0000 (Sat, 17 Jan 2009) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/INSTALL
+
+Instruction for compilation
+
+------------------------------------------------------------------------
+r101 | lh3lh3 | 2009-01-17 19:31:36 +0000 (Sat, 17 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ A /branches/dev/samtools/Makefile.lite
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/faidx.c
+ M /branches/dev/samtools/misc/Makefile
+ M /branches/dev/samtools/razf.c
+
+ * replaced HAVE_RAZF with _NO_RAZF
+ * added Makefile.lite for people who have trouble with razf.c
+
+------------------------------------------------------------------------
+r100 | lh3lh3 | 2009-01-16 10:03:37 +0000 (Fri, 16 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_mate.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/misc/wgsim.c
+
+ * samtools-0.1.1-15
+ * fixed another bug in fixmate: unmapped pair has non-zero isize
+
+------------------------------------------------------------------------
+r99 | lh3lh3 | 2009-01-16 09:13:36 +0000 (Fri, 16 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+ M /branches/dev/samtools/bam_mate.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-14
+ * fixed a bug in fixmate: isize not equal to zero if two ends mapped to
+ different chr
+
+------------------------------------------------------------------------
+r98 | lh3lh3 | 2009-01-15 16:47:41 +0000 (Thu, 15 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-13
+ * fixed the prior for hom indels (Richard pointed this out)
+
+------------------------------------------------------------------------
+r97 | lh3lh3 | 2009-01-15 16:38:47 +0000 (Thu, 15 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/COPYING
+ M /branches/dev/samtools/bam_sort.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/source.dot
+
+ * samtools-0.1.1-12
+ * fixed a bug in sort
+ * update source file graph and copyright information
+
+------------------------------------------------------------------------
+r96 | lh3lh3 | 2009-01-14 21:46:14 +0000 (Wed, 14 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/glf.c
+
+fixed a typo
+
+------------------------------------------------------------------------
+r95 | lh3lh3 | 2009-01-14 21:44:53 +0000 (Wed, 14 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/glf.c
+
+added a main function for glf.c
+
+------------------------------------------------------------------------
+r94 | lh3lh3 | 2009-01-14 17:14:59 +0000 (Wed, 14 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/bgzf.h
+ A /branches/dev/samtools/glf.c
+ M /branches/dev/samtools/glf.h
+
+ * samtools-0.1.1-11
+ * generate binary GLFv2
+ * added glfview command to dump GLFv2 binary file
+
+------------------------------------------------------------------------
+r93 | lh3lh3 | 2009-01-14 15:07:44 +0000 (Wed, 14 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_rmdup.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/glf.h
+
+ * samtools-0.1.1-10
+ * fixed several bugs in rmdup
+ * prepare to generate GLF2
+
+------------------------------------------------------------------------
+r92 | lh3lh3 | 2009-01-14 13:27:44 +0000 (Wed, 14 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_import.c
+ A /branches/dev/samtools/bam_rmdup.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-9
+ * implemented rmdup; NOT tested yet
+
+------------------------------------------------------------------------
+r91 | lh3lh3 | 2009-01-13 20:15:43 +0000 (Tue, 13 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/examples/00README.txt
+
+update README for typos
+
+------------------------------------------------------------------------
+r90 | lh3lh3 | 2009-01-13 19:57:50 +0000 (Tue, 13 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/examples/ex1.sam.gz
+
+update example
+
+------------------------------------------------------------------------
+r89 | lh3lh3 | 2009-01-13 17:21:38 +0000 (Tue, 13 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/Makefile
+ M /branches/dev/samtools/bam.c
+ A /branches/dev/samtools/bam_mate.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-8
+ * added fixmate command
+
+------------------------------------------------------------------------
+r88 | lh3lh3 | 2009-01-13 10:48:23 +0000 (Tue, 13 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-7
+ * change the reported indel position to the previous way
+
+------------------------------------------------------------------------
+r87 | lh3lh3 | 2009-01-12 22:12:12 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-6
+ * addd glt output
+ * allow to change indel calling parameters at the command line
+
+------------------------------------------------------------------------
+r86 | lh3lh3 | 2009-01-12 21:16:48 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam.h
+ M /branches/dev/samtools/bam_pileup.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-5
+ * added two more flags
+ * allowed to select reads shown in pileup with a mask
+
+------------------------------------------------------------------------
+r85 | lh3lh3 | 2009-01-12 20:47:51 +0000 (Mon, 12 Jan 2009) | 4 lines
+Changed paths:
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-4
+ * fixed a bug in indexing (linear index)
+ * prepare to add glt output from pileup
+
+------------------------------------------------------------------------
+r84 | lh3lh3 | 2009-01-12 09:22:35 +0000 (Mon, 12 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-3
+ * fixed a bug in outputing the coordinate of an indel
+
+------------------------------------------------------------------------
+r83 | lh3lh3 | 2009-01-11 15:18:01 +0000 (Sun, 11 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-2
+ * pileup: allows to output indel sites only
+
+------------------------------------------------------------------------
+r82 | lh3lh3 | 2009-01-10 23:34:31 +0000 (Sat, 10 Jan 2009) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_maqcns.h
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.1-1
+ * implemented a Bayesian indel caller
+
+------------------------------------------------------------------------
+r81 | lh3lh3 | 2009-01-09 09:54:28 +0000 (Fri, 09 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/examples/00README.txt
+ D /branches/dev/samtools/examples/ex1.fa.fai
+
+Let users generate ex1.fa.fai.
+
+------------------------------------------------------------------------
+r80 | lh3lh3 | 2009-01-08 16:10:08 +0000 (Thu, 08 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/bowtie2sam.pl
+
+make the bowtie converter works for "-k 2"
+
+------------------------------------------------------------------------
+r78 | lh3lh3 | 2009-01-03 17:25:24 +0000 (Sat, 03 Jan 2009) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/export2sam.pl
+
+fixed a bug for "QC" reads
+
+------------------------------------------------------------------------
+r77 | lh3lh3 | 2009-01-01 18:32:06 +0000 (Thu, 01 Jan 2009) | 3 lines
+Changed paths:
+ A /branches/dev/samtools/misc/bowtie2sam.pl
+ M /branches/dev/samtools/misc/soap2sam.pl
+
+ * soap2sam.pl: added NM tag
+ * bowtie2sam.pl: converter for bowtie
+
+------------------------------------------------------------------------
+r76 | lh3lh3 | 2008-12-31 23:24:24 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/misc/soap2sam.pl
+
+soap2sam.pl: convert soap output to SAM
+
+------------------------------------------------------------------------
+r75 | lh3lh3 | 2008-12-31 17:54:32 +0000 (Wed, 31 Dec 2008) | 3 lines
+Changed paths:
+ M /branches/dev/samtools/misc/wgsim_eval.pl
+
+ * wgsim_eval.pl-0.1.1
+ * fixed a bug for a contig name like "NT_012345"
+
+------------------------------------------------------------------------
+r74 | lh3lh3 | 2008-12-31 16:38:21 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/misc/wgsim_eval.pl
+
+ * evaluate alignment for reads generated by wgsim
+
+------------------------------------------------------------------------
+r73 | lh3lh3 | 2008-12-31 15:11:22 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/Makefile
+ M /branches/dev/samtools/misc/wgsim.c
+
+fixed compiling warnings for wgsim
+
+------------------------------------------------------------------------
+r72 | lh3lh3 | 2008-12-31 13:40:51 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bam_tview.c
+
+remove an unused variable (a compiler warning only)
+
+------------------------------------------------------------------------
+r71 | lh3lh3 | 2008-12-31 13:37:16 +0000 (Wed, 31 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/misc/Makefile
+ A /branches/dev/samtools/misc/wgsim.c
+
+wgsim: Paired-end reads simulator
+
+------------------------------------------------------------------------
+r70 | bhandsaker | 2008-12-29 20:27:16 +0000 (Mon, 29 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/bam_maqcns.c
+ M /branches/dev/samtools/bam_tview.c
+
+Move definition of bam_nt16_nt4_table so we can build without curses.
+
+------------------------------------------------------------------------
+r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/NEWS
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/samtools.1
+
+Release samtools-0.1.1
+
+------------------------------------------------------------------------
+r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines
+Changed paths:
+ M /branches/dev/samtools/bam_aux.c
+ M /branches/dev/samtools/bam_index.c
+ M /branches/dev/samtools/bam_plcmd.c
+ M /branches/dev/samtools/bam_tview.c
+ M /branches/dev/samtools/bamtk.c
+ M /branches/dev/samtools/razf.c
+ M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-66
+ * fixed a bug in razf.c: reset z_eof when razf_seek() is called
+ * fixed a memory leak in parsing a region
+ * changed pileup a little bit when -s is in use: output ^ and $
+ * when a bam is not indexed, output more meaningful error message
+ * fixed a bug in indexing for small alignment
+ * fixed a bug in the viewer when we come to the end of a reference file
+ * updated documentation
+ * prepare to release 0.1.1
+
+------------------------------------------------------------------------
+r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ A /branches/dev/samtools/examples
+ A /branches/dev/samtools/examples/00README.txt
+ A /branches/dev/samtools/examples/ex1.fa
+ A /branches/dev/samtools/examples/ex1.fa.fai
+ A /branches/dev/samtools/examples/ex1.sam.gz
+
+example
+
+------------------------------------------------------------------------
+r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines
+Changed paths:
+ M /branches/dev/samtools/ChangeLog
+
+update ChangeLog
+
------------------------------------------------------------------------
r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines
Changed paths:
--- /dev/null
+Compiling samtools requires the zlib library <http://www.zlib.net/>;
+compiling the alignment viewer further requires the GNU ncurses library
+<http://www.gnu.org/software/ncurses/>. The default Makefile builds the
+alignment viewer. If compilation fails due to errors in RAZF or the lack
+of ncurses, you can compile with `make -f Makefile.lite'. However, the
+alignment viewer will not be available; the FASTA indexer will not
+support indexing compressed FASTA, either.
\ No newline at end of file
CXX= g++
CFLAGS= -g -Wall -O2 -m64 #-arch ppc
CXXFLAGS= $(CFLAGS)
-DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -DHAVE_RAZF #-D_NO_CURSES
+DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES
OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
- razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o
+ razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
+ bam_mate.o bam_rmdup.o glf.o
PROG= razip bgzip samtools
INCLUDES=
LIBS= -lm -lz
--- /dev/null
+CC= gcc
+CXX= g++
+CFLAGS= -g -Wall -O2 -m64 #-arch ppc
+CXXFLAGS= $(CFLAGS)
+DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF
+OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
+ bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
+ bam_mate.o bam_rmdup.o glf.o
+PROG= samtools
+INCLUDES=
+LIBS= -lm -lz
+SUBDIRS= .
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) -f Makefile.lite CC="$(CC)" CXX="$(CXX)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+all:$(PROG)
+
+lib:libbam.a
+
+libbam.a:$(OBJS)
+ $(AR) -cru $@ $(OBJS)
+
+samtools:lib bamtk.o
+ $(CC) $(CFLAGS) -o $@ bamtk.o $(LIBS) -L. -lbam
+
+razip:razip.o razf.o
+ $(CC) $(CFLAGS) -o $@ razf.o razip.o $(LIBS)
+
+bgzip:bgzip.o bgzf.o
+ $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(LIBS)
+
+razip.o:razf.h
+bam.o:bam.h razf.h bam_endian.h
+bam_import.o:bam.h kseq.h khash.h razf.h
+bam_pileup.o:bam.h razf.h ksort.h
+bam_plcmd.o:bam.h faidx.h bam_maqcns.h
+bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
+bam_lpileup.o:bam.h ksort.h
+bam_tview.o:bam.h faidx.h bam_maqcns.h
+bam_maqcns.o:bam.h ksort.h bam_maqcns.h
+bam_sort.o:bam.h ksort.h razf.h
+razf.o:razf.h
+
+faidx.o:faidx.h razf.h khash.h
+faidx_main.o:faidx.h razf.h
+
+cleanlocal:
+ rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+
+clean:cleanlocal-recur
}
putchar('\t');
if (c->mtid < 0) printf("*\t");
+ else if (c->mtid == c->tid) printf("=\t");
else printf("%s\t", header->target_name[c->mtid]);
printf("%d\t%d\t", c->mpos + 1, c->isize);
for (i = 0; i < c->l_qseq; ++i) putchar(bam_nt16_rev_table[bam1_seqi(s, i)]);
#include <string.h>
#include <stdio.h>
-#if _IOLIB == 1
+#if _IOLIB == 1 && !defined(_NO_RAZF)
#define BAM_TRUE_OFFSET
#include "razf.h"
/*! @abstract BAM file handler */
#define BAM_FREAD1 64
#define BAM_FREAD2 128
#define BAM_FSECONDARY 256
+#define BAM_FQCFAIL 512
+#define BAM_FDUP 1024
+
+#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)
#define BAM_CORE_SIZE sizeof(bam1_core_t)
/*! @abstract pileup buffer */
typedef struct __bam_plbuf_t bam_plbuf_t;
+ void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask);
+
/*! @typedef
@abstract Type of function to be called by bam_plbuf_push().
@param tid chromosome ID as is defined in the header
@discussion The file position indicator must be placed right
before the start of an alignment. See also bam_plbuf_push().
*/
- int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data);
+ int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
struct __bam_lplbuf_t;
typedef struct __bam_lplbuf_t bam_lplbuf_t;
int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
/*! @abstract bam_plbuf_file() equivalent with level calculated. */
- int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data);
+ int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data);
struct __bam_index_t;
typedef struct __bam_index_t bam_index_t;
return 0;
}
-static inline void bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
{
uint8_t *data = bdst->data;
int m_data = bdst->m_data; // backup data and m_data
// restore the backup
bdst->m_data = m_data;
bdst->data = data;
+ return bdst;
+}
+
+static inline bam1_t *bam_dup1(const bam1_t *src)
+{
+ bam1_t *b;
+ b = bam_init1();
+ *b = *src;
+ b->m_data = b->data_len;
+ b->data = (uint8_t*)calloc(b->data_len, 1);
+ memcpy(b->data, src->data, b->data_len);
+ return b;
}
#endif
type = str->s[3];
s = alloc_data(b, doff + 3) + doff;
s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
- if (type == 'A' || type == 'a') {
+ if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility
s = alloc_data(b, doff + 2) + doff;
- *s++ = type; *s = str->s[5];
+ *s++ = 'A'; *s = str->s[5];
doff += 2;
} else if (type == 'I' || type == 'i') {
long long x;
int ret;
b = (bam1_t*)calloc(1, sizeof(bam1_t));
- fpbaf = bam_open(fnbaf, "w");
+ fpbaf = (strcmp(fnbaf, "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(fnbaf, "w");
fp = sam_open(fntaf);
ret = sam_read1(fp, header, b);
bam_header_write(fpbaf, header);
*/
#define BAM_MIN_CHUNK_GAP 32768
+// 1<<14 is the size of minimum bin.
#define BAM_LIDX_SHIFT 14
typedef struct {
l->list[l->n].u = beg; l->list[l->n++].v = end;
}
-static inline void insert_offset2(bam_lidx_t *index2, int last, int curr, uint64_t offset)
+static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset)
{
- int i;
- if (index2->m < curr + 1) {
- index2->m = curr + 1;
+ int i, beg, end;
+ beg = b->core.pos >> BAM_LIDX_SHIFT;
+ end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT;
+ if (index2->m < end + 1) {
+ int old_m = index2->m;
+ index2->m = end + 1;
kroundup32(index2->m);
index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+ memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
}
- if (last > curr) last = -1;
- for (i = last + 1; i <= curr; ++i) index2->offset[i] = offset;
- index2->n = curr + 1;
+ for (i = beg + 1; i <= end; ++i)
+ if (index2->offset[i] == 0) index2->offset[i] = offset;
+ index2->n = end + 1;
}
static void merge_chunks(bam_index_t *idx)
bam_header_t *h;
int i, ret;
bam_index_t *idx;
- uint32_t last_coor, last_tid, last_bin, save_bin, save_tid;
+ uint32_t last_bin, save_bin;
+ int32_t last_coor, last_tid, save_tid;
bam1_core_t *c;
uint64_t save_off, last_off;
fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n");
exit(1);
}
- if (last_coor>>BAM_LIDX_SHIFT != b->core.pos>>BAM_LIDX_SHIFT) // then write the linear index
- insert_offset2(&idx->index2[b->core.tid], last_coor>>BAM_LIDX_SHIFT, b->core.pos>>BAM_LIDX_SHIFT, last_off);
+ if (b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off);
if (c->bin != last_bin) { // then possibly write the binning index
if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
save_off = last_off;
save_bin = last_bin = c->bin;
save_tid = c->tid;
+ if (save_tid < 0) break;
}
if (bam_tell(fp) <= last_off) {
fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
last_off = bam_tell(fp);
last_coor = b->core.pos;
}
- insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+ if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
merge_chunks(idx);
if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
free(b->data); free(b);
max_level = 0;
for (i = l = 0; i < n; ++i) {
const bam_pileup1_t *p = pl + i;
- if (p->qpos == 0) {
+ if (p->is_head) {
if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
freenode_t *p = tv->head->next;
tv->cur_level[i] = tv->head->level;
// squeeze out terminated levels
for (i = l = 0; i < n; ++i) {
const bam_pileup1_t *p = pl + i;
- if (p->qpos != p->b->core.l_qseq - 1)
+ if (!p->is_tail)
tv->pre_level[l++] = tv->pre_level[i];
}
tv->n_pre = l;
return bam_plbuf_push(b, tv->plbuf);
}
-int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data)
+int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
{
bam_lplbuf_t *buf;
int ret;
bam1_t *b;
b = (bam1_t*)calloc(1, sizeof(bam1_t));
buf = bam_lplbuf_init(func, func_data);
+ bam_plbuf_set_mask(buf->plbuf, mask);
while ((ret = bam_read1(fp, b)) >= 0)
bam_lplbuf_push(b, buf);
bam_lplbuf_push(0, buf);
bam_maqindel_opt_t *bam_maqindel_opt_init()
{
bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t));
+ mi->q_indel = 40;
+ mi->r_indel = 0.00015;
+ //
mi->mm_penalty = 3;
mi->indel_err = 4;
mi->ambi_thres = 10;
void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir)
{
if (mir == 0) return;
- free(mir->s1); free(mir->s2); free(mir);
+ free(mir->s[0]); free(mir->s[1]); free(mir);
}
#define MINUS_CONST 0x10000000
}
{ // the core part
char *ref2, *inscns = 0;
- int k, l, *score, max_ins = types[n_types-1];
+ int k, l, *score, *pscore, max_ins = types[n_types-1];
ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
if (max_ins > 0) { // get the consensus of inserted sequences
int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int));
}
}
}
- // construct the consensus
+ // construct the consensus of inserted sequence
inscns = (char*)calloc(n_types * max_ins, sizeof(char));
for (i = 0; i < n_types; ++i) {
for (j = 0; j < types[i]; ++j) {
}
// calculate score
score = (int*)calloc(n_types * n, sizeof(int));
+ pscore = (int*)calloc(n_types * n, sizeof(int));
for (i = 0; i < n_types; ++i) {
// write ref2
for (k = 0, j = left; j <= pos; ++j)
const bam_pileup1_t *p = pl + j;
uint32_t *cigar;
bam1_core_t *c = &p->b->core;
- int s;
+ int s, ps;
bam_segreg_t seg;
if (c->flag&BAM_FUNMAP) continue;
cigar = bam1_cigar(p->b);
bam_segreg(pos, c, cigar, &seg);
- for (s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
+ for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) {
int cq = bam1_seqi(bam1_seq(p->b), l), ct;
ct = c->pos + l >= left? ref2[c->pos + l - left] : 15; // "<" should not happen if there is no bug
- if (cq < 15 && ct < 15)
+ if (cq < 15 && ct < 15) {
s += cq == ct? 1 : -mi->mm_penalty;
+ if (cq != ct) ps += bam1_qual(p->b)[l];
+ }
}
- score[i*n + j] = s;
+ score[i*n + j] = s; pscore[i*n + j] = ps;
if (types[i] != 0) { // then try the other way to calculate the score
- for (s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
+ for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) {
int cq = bam1_seqi(bam1_seq(p->b), l), ct;
ct = c->pos + l + types[i] >= left? ref2[c->pos + l + types[i] - left] : 15;
- if (cq < 15 && ct < 15)
+ if (cq < 15 && ct < 15) {
s += cq == ct? 1 : -mi->mm_penalty;
+ if (cq != ct) ps += bam1_qual(p->b)[l];
+ }
}
}
if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
+ if (pscore[i*n+j] > ps) pscore[i*n+j] = ps;
if (types[i] != 0) score[i*n+j] -= mi->indel_err;
//printf("%d, %d, %d, %d\n", i, types[i], j, score[i*n+j]);
}
sum = (int*)calloc(n_types, sizeof(int));
for (i = 0; i < n_types; ++i)
for (j = 0; j < n; ++j)
- sum[i] += score[i*n+j];
+ sum[i] += -pscore[i*n+j];
max1 = max2 = -0x7fffffff; max1_i = max2_i = -1;
for (i = 0; i < n_types; ++i) {
if (sum[i] > max1) {
// write ret
ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t));
ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i];
- ret->s1 = (char*)calloc(abs(ret->indel1) + 2, 1);
- ret->s2 = (char*)calloc(abs(ret->indel2) + 2, 1);
+ ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1);
+ ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1);
+ // write indel sequence
if (ret->indel1 > 0) {
- ret->s1[0] = '+';
+ ret->s[0][0] = '+';
for (k = 0; k < ret->indel1; ++k)
- ret->s1[k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
+ ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
} else if (ret->indel1 < 0) {
- ret->s1[0] = '-';
+ ret->s[0][0] = '-';
for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k)
- ret->s1[k+1] = ref[pos + k + 1];
- } else ret->s1[0] = '*';
+ ret->s[0][k+1] = ref[pos + k + 1];
+ } else ret->s[0][0] = '*';
if (ret->indel2 > 0) {
- ret->s2[0] = '+';
+ ret->s[1][0] = '+';
for (k = 0; k < ret->indel2; ++k)
- ret->s2[k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
+ ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
} else if (ret->indel2 < 0) {
- ret->s2[0] = '-';
+ ret->s[1][0] = '-';
for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k)
- ret->s2[k+1] = ref[pos + k + 1];
- } else ret->s2[0] = '*';
+ ret->s[1][k+1] = ref[pos + k + 1];
+ } else ret->s[1][0] = '*';
+ // write count
for (j = 0; j < n; ++j) {
if (score[max1_i*n+j] < 0 && score[max2_i*n+j] < 0) ++ret->cnt_anti;
else {
else ++ret->cnt_ambi;
}
}
+ // write gl[]
+ ret->gl[0] = ret->gl[1] = 0;
+ for (j = 0; j < n; ++j) {
+ int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j];
+ ret->gl[0] += s1 < s2? 0 : s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel;
+ ret->gl[1] += s2 < s1? 0 : s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel;
+ }
+ }
+ free(score); free(pscore); free(ref2); free(inscns);
+ }
+ { // call genotype
+ int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5);
+ int min1, min2, min1_i;
+ q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel;
+ q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel;
+ q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel;
+ min1 = min2 = 0x7fffffff; min1_i = -1;
+ for (i = 0; i < 3; ++i) {
+ if (q[i] < min1) {
+ min2 = min1; min1 = q[i]; min1_i = i;
+ } else if (q[i] < min2) min2 = q[i];
}
- free(score); free(ref2); free(inscns);
+ ret->gt = min1_i;
+ ret->q_cns = min2 - min1;
+ // set q_ref
+ if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3;
+ else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2];
+ if (ret->q_ref < 0) ret->q_ref = 0;
}
free(types);
return ret;
} bam_maqcns_t;
typedef struct {
+ int q_indel;
+ float r_indel;
+ // hidden parameters, unchangeable from command line
int mm_penalty, indel_err, ambi_thres;
} bam_maqindel_opt_t;
typedef struct {
int indel1, indel2;
int cnt1, cnt2, cnt_ambi, cnt_anti;
- char *s1, *s2;
+ char *s[2];
+ //
+ int gt, gl[2];
+ int q_cns, q_ref;
} bam_maqindel_ret_t;
#ifdef __cplusplus
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+
+// currently, this function ONLY works if each read has one hit
+void bam_mating_core(bamFile in, bamFile out)
+{
+ bam_header_t *header;
+ bam1_t *b[2];
+ int curr, has_prev;
+
+ header = bam_header_read(in);
+ bam_header_write(out, header);
+
+ b[0] = bam_init1();
+ b[1] = bam_init1();
+ curr = 0; has_prev = 0;
+ while (bam_read1(in, b[curr]) >= 0) {
+ bam1_t *cur = b[curr], *pre = b[1-curr];
+ if (has_prev) {
+ if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
+ cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
+ pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
+ if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
+ && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)))
+ {
+ uint32_t cur5, pre5;
+ cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos;
+ pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos;
+ cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
+ } else cur->core.isize = pre->core.isize = 0;
+ if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FREVERSE;
+ else cur->core.flag &= ~BAM_FMREVERSE;
+ if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FREVERSE;
+ else pre->core.flag &= ~BAM_FMREVERSE;
+ if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
+ if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
+ bam_write1(out, pre);
+ bam_write1(out, cur);
+ has_prev = 0;
+ } else { // unpaired or singleton
+ pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
+ pre->core.flag |= BAM_FMUNMAP;
+ pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
+ bam_write1(out, pre);
+ }
+ } else has_prev = 1;
+ curr = 1 - curr;
+ }
+ if (has_prev) bam_write1(out, b[1-curr]);
+ bam_header_destroy(header);
+ bam_destroy1(b[0]);
+ bam_destroy1(b[1]);
+}
+
+int bam_mating(int argc, char *argv[])
+{
+ bamFile in, out;
+ if (argc < 3) {
+ fprintf(stderr, "samtools fixmate <in.nameSrt.bam> <out.nameSrt.bam>\n");
+ return 1;
+ }
+ in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
+ out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+ bam_mating_core(in, out);
+ bam_close(in); bam_close(out);
+ return 0;
+}
int32_t tid, pos, max_tid, max_pos;
int max_pu, is_eof;
bam_pileup1_t *pu;
+ int flag_mask;
};
void bam_plbuf_reset(bam_plbuf_t *buf)
buf->head = buf->tail;
}
+void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask)
+{
+ if (mask < 0) buf->flag_mask = BAM_DEF_MASK;
+ else buf->flag_mask = BAM_FUNMAP | mask;
+}
+
bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
{
bam_plbuf_t *buf;
buf->head = buf->tail = mp_alloc(buf->mp);
buf->dummy = mp_alloc(buf->mp);
buf->max_tid = buf->max_pos = -1;
+ buf->flag_mask = BAM_DEF_MASK;
return buf;
}
int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
{
if (b) { // fill buffer
+ if (b->core.flag & buf->flag_mask) return 0;
bam_copy1(&buf->tail->b, b);
buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) {
return 0;
}
-int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data)
+int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data)
{
bam_plbuf_t *buf;
int ret;
bam1_t *b;
b = (bam1_t*)calloc(1, sizeof(bam1_t));
buf = bam_plbuf_init(func, func_data);
+ bam_plbuf_set_mask(buf, mask);
while ((ret = bam_read1(fp, b)) >= 0)
bam_plbuf_push(b, buf);
bam_plbuf_push(0, buf);
#include "faidx.h"
#include "bam_maqcns.h"
#include "khash.h"
+#include "glf.h"
KHASH_SET_INIT_INT64(64)
-#define BAM_PLF_SIMPLE 0x01
-#define BAM_PLF_CNS 0x02
+#define BAM_PLF_SIMPLE 0x01
+#define BAM_PLF_CNS 0x02
+#define BAM_PLF_INDEL_ONLY 0x04
+#define BAM_PLF_GLF 0x08
typedef struct {
bam_header_t *h;
khash_t(64) *hash;
uint32_t format;
int tid, len;
+ int mask;
char *ref;
+ glfFile fp; // for glf output only
} pu_data_t;
char **bam_load_pos(const char *fn, int *_n);
return hash;
}
+// an analogy to pileup_func() below
+static int glt_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{
+ pu_data_t *d = (pu_data_t*)data;
+ bam_maqindel_ret_t *r = 0;
+ int rb;
+ glf1_t *g;
+ glf2_t g2;
+ if (d->fai == 0) {
+ fprintf(stderr, "[glt_func] reference sequence is required for generating GLT. Abort!\n");
+ exit(1);
+ }
+ if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+ if (d->fai && (int)tid != d->tid) {
+ if (d->ref) { // then write the end mark
+ memset(&g2, 0, sizeof(glf2_t));
+ g2.type = GLF_TYPE_END;
+ bgzf_write(d->fp, &g2, sizeof(glf2_t));
+ }
+ glf_ref_write(d->fp, d->h->target_name[tid]); // write reference
+ free(d->ref);
+ d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+ d->tid = tid;
+ }
+ rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+ g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c);
+ memcpy(&g2, g, sizeof(glf1_t));
+ g2.type = GLF_TYPE_NORMAL;
+ g2.pos = pos;
+ bgzf_write(d->fp, &g2, sizeof(glf2_t));
+ r = bam_maqindel(n, pos, d->ido, pu, d->ref);
+ if (r) { // then write indel line
+ int g3 = 3 * n, min;
+ min = g3;
+ if (min > r->gl[0]) min = r->gl[0];
+ if (min > r->gl[1]) min = r->gl[1];
+ g3 -= min;
+ g2.ref_base = 0;
+ g2.type = GLF_TYPE_INDEL;
+ memset(g2.lk, 0, 10);
+ g2.lk[0] = r->gl[0] < 255? r->gl[0] : 255;
+ g2.lk[1] = r->gl[1] < 255? r->gl[1] : 255;
+ g2.lk[2] = g3 < 255? g3 : 255;
+ *(int16_t*)(g2.lk + 3) = (int16_t)r->indel1;
+ *(int16_t*)(g2.lk + 5) = (int16_t)r->indel2;
+ g2.min_lk = min < 255? min : 255;
+ bgzf_write(d->fp, &g2, sizeof(glf2_t));
+ if (r->indel1) bgzf_write(d->fp, r->s[0]+1, r->indel1>0? r->indel1 : -r->indel1);
+ if (r->indel2) bgzf_write(d->fp, r->s[1]+1, r->indel2>0? r->indel2 : -r->indel2);
+ bam_maqindel_ret_destroy(r);
+ }
+ free(g);
+ return 0;
+}
+
static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
{
pu_data_t *d = (pu_data_t*)data;
int i, j, rb;
uint32_t x;
if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+ if (d->format & BAM_PLF_GLF) return glt_func(tid, pos, n, pu, data);
if (d->fai && (int)tid != d->tid) {
free(d->ref);
d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
d->tid = tid;
}
rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+ if (d->format & BAM_PLF_INDEL_ONLY) {
+ for (i = 0; i < n; ++i)
+ if (pu[i].indel != 0) break;
+ if (i == n) return 0;
+ }
printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
if (d->format & BAM_PLF_CNS) { // consensus
int ref_q, rb4 = bam_nt16_table[rb];
if (ref_q > 255) ref_q = 255;
}
printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[x>>28], x>>8&0xff, ref_q, x>>16&0xff);
+ }
+ if (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) {
if (d->ref) // indel calling
r = bam_maqindel(n, pos, d->ido, pu, d->ref);
}
}
putchar('\n');
if (r) { // then print indel line
- printf("%s\t%d\t*\t%s/%s\t", d->h->target_name[tid], pos + 1, r->s1, r->s2);
+ printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1);
+ if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]);
+ else printf("%s/%s\t", r->s[0], r->s[1]);
+ printf("%d\t%d\t", r->q_cns, r->q_ref);
+ printf("%s\t%s\t", r->s[0], r->s[1]);
+ //printf("%d\t%d\t", r->gl[0], r->gl[1]);
printf("%d\t%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_ambi, r->cnt_anti);
bam_maqindel_ret_destroy(r);
}
int c;
char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
- d->tid = -1;
+ d->tid = -1; d->mask = BAM_DEF_MASK;
d->c = bam_maqcns_init();
- while ((c = getopt(argc, argv, "st:f:cT:N:r:l:")) >= 0) {
+ d->ido = bam_maqindel_opt_init();
+ while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:")) >= 0) {
switch (c) {
case 's': d->format |= BAM_PLF_SIMPLE; break;
case 't': fn_list = strdup(optarg); break;
case 'N': d->c->n_hap = atoi(optarg); break;
case 'r': d->c->het_rate = atoi(optarg); break;
case 'c': d->format |= BAM_PLF_CNS; break;
+ case 'i': d->format |= BAM_PLF_INDEL_ONLY; break;
+ case 'm': d->mask = atoi(optarg); break;
+ case 'g': d->format |= BAM_PLF_GLF; break;
+ case 'I': d->ido->q_indel = atoi(optarg); break;
+ case 'G': d->ido->r_indel = atof(optarg); break;
default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1;
}
}
fprintf(stderr, "\n");
fprintf(stderr, "Usage: bamtk pileup [options] <in.bam>|<in.sam>\n\n");
fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n");
+ fprintf(stderr, " -i only show lines/consensus with indels\n");
+ fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask);
fprintf(stderr, " -t FILE list of reference sequences (assume the input is in SAM)\n");
fprintf(stderr, " -l FILE list of sites at which pileup is output\n");
fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n");
fprintf(stderr, " -c output the maq consensus sequence\n");
- fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c only) [%f]\n", d->c->theta);
- fprintf(stderr, " -N INT number of haplotypes in the sample (for -c only) [%d]\n", d->c->n_hap);
- fprintf(stderr, " -r FLOAT prior of a difference between any two haplotypes (for -c only) [%f]\n\n",
- d->c->het_rate);
+ fprintf(stderr, " -g output in the extended GLT format (suppressing -c/-i/-s)\n");
+ fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta);
+ fprintf(stderr, " -N INT number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap);
+ fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate);
+ fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel);
+ fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel);
+ fprintf(stderr, "\n");
free(fn_list); free(fn_fa); free(d);
return 1;
}
if (fn_fa) d->fai = fai_load(fn_fa);
free(fn_fa);
bam_maqcns_prepare(d->c);
- d->ido = bam_maqindel_opt_init();
+ if (d->format & BAM_PLF_GLF) {
+ glf_header_t *h;
+ h = glf_header_init();
+ d->fp = bgzf_fdopen(fileno(stdout), "w");
+ glf_header_write(d->fp, h);
+ glf_header_destroy(h);
+ }
if (fn_list) {
tamFile fp;
bam1_t *b;
int ret;
bam_plbuf_t *buf = bam_plbuf_init(pileup_func, d);
+ bam_plbuf_set_mask(buf, d->mask);
d->h = sam_header_read2(fn_list);
if (fn_pos) d->hash = load_pos(fn_pos, d->h);
fp = sam_open(argv[optind]);
fp = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r");
d->h = bam_header_read(fp);
if (fn_pos) d->hash = load_pos(fn_pos, d->h);
- bam_pileup_file(fp, pileup_func, d);
+ bam_pileup_file(fp, d->mask, pileup_func, d);
bam_close(fp);
}
+ if (d->format & BAM_PLF_GLF) bgzf_close(d->fp);
free(fn_pos); free(fn_list);
kh_destroy(64, d->hash);
bam_header_destroy(d->h);
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <zlib.h>
+#include "bam.h"
+
+typedef bam1_t *bam1_p;
+#include "khash.h"
+KHASH_SET_INIT_STR(name)
+KHASH_MAP_INIT_INT64(pos, bam1_p)
+
+#define BUFFER_SIZE 0x40000
+
+typedef struct {
+ int n, max;
+ bam1_t **a;
+} tmp_stack_t;
+
+static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
+{
+ if (stack->n == stack->max) {
+ stack->max = stack->max? stack->max<<1 : 0x10000;
+ stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
+ }
+ stack->a[stack->n++] = b;
+}
+
+static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
+{
+ int i;
+ for (i = 0; i != stack->n; ++i) {
+ bam_write1(out, stack->a[i]);
+ bam_destroy1(stack->a[i]);
+ }
+ stack->n = 0;
+ if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash);
+}
+
+static void clear_del_set(khash_t(name) *del_set)
+{
+ khint_t k;
+ for (k = kh_begin(del_set); k < kh_end(del_set); ++k)
+ if (kh_exist(del_set, k))
+ free((char*)kh_key(del_set, k));
+ kh_clear(name, del_set);
+}
+
+void bam_rmdup_core(bamFile in, bamFile out)
+{
+ bam_header_t *header;
+ bam1_t *b;
+ int last_tid = -1, last_pos = -1;
+ uint64_t n_checked = 0, n_removed = 0;
+ tmp_stack_t stack;
+ khint_t k;
+ khash_t(pos) *best_hash;
+ khash_t(name) *del_set;
+
+ best_hash = kh_init(pos);
+ del_set = kh_init(name);
+ b = bam_init1();
+ memset(&stack, 0, sizeof(tmp_stack_t));
+ header = bam_header_read(in);
+ bam_header_write(out, header);
+
+ kh_resize(name, del_set, 4 * BUFFER_SIZE);
+ kh_resize(pos, best_hash, 3 * BUFFER_SIZE);
+ while (bam_read1(in, b) >= 0) {
+ bam1_core_t *c = &b->core;
+ if (c->tid != last_tid || last_pos != c->pos) {
+ dump_best(&stack, best_hash, out); // write the result
+ if (c->tid != last_tid) {
+ kh_clear(pos, best_hash);
+ if (kh_size(del_set)) { // check
+ fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
+ clear_del_set(del_set);
+ }
+ last_tid = c->tid;
+ fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
+ }
+ }
+ if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
+ bam_write1(out, b);
+ } else if (c->isize > 0) { // paired, head
+ uint64_t key = (uint64_t)c->pos<<32 | c->isize;
+ int ret;
+ ++n_checked;
+ k = kh_put(pos, best_hash, key, &ret);
+ if (ret == 0) { // found in best_hash
+ bam1_t *p = kh_val(best_hash, k);
+ ++n_removed;
+ if (p->core.qual < c->qual) { // the current alignment is better
+ kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
+ bam_copy1(p, b); // replaced as b
+ } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
+ if (ret == 0)
+ fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
+ } else { // not found in best_hash
+ kh_val(best_hash, k) = bam_dup1(b);
+ stack_insert(&stack, kh_val(best_hash, k));
+ }
+ } else { // paired, tail
+ k = kh_get(name, del_set, bam1_qname(b));
+ if (k != kh_end(del_set)) {
+ free((char*)kh_key(del_set, k));
+ kh_del(name, del_set, k);
+ } else bam_write1(out, b);
+ }
+ last_pos = c->pos;
+ }
+ dump_best(&stack, best_hash, out);
+
+ bam_header_destroy(header);
+ clear_del_set(del_set);
+ kh_destroy(name, del_set);
+ kh_destroy(pos, best_hash);
+ free(stack.a);
+ bam_destroy1(b);
+ fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked,
+ (double)n_removed/n_checked);
+}
+int bam_rmdup(int argc, char *argv[])
+{
+ bamFile in, out;
+ if (argc < 3) {
+ fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
+ return 1;
+ }
+ in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
+ out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+ if (in == 0 || out == 0) {
+ fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
+ return 1;
+ }
+ bam_rmdup_core(in, out);
+ bam_close(in);
+ bam_close(out);
+ return 0;
+}
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
- bam_merge_core(0, fnout, n, fns);
+ bam_merge_core(is_by_qname, fnout, n, fns);
free(fnout);
for (i = 0; i < n; ++i) {
unlink(fns[i]);
#include "bam.h"
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.1"
+#define PACKAGE_VERSION "0.1.1-18"
#endif
int bam_taf2baf(int argc, char *argv[]);
int bam_index(int argc, char *argv[]);
int bam_sort(int argc, char *argv[]);
int bam_tview_main(int argc, char *argv[]);
+int bam_mating(int argc, char *argv[]);
+int bam_rmdup(int argc, char *argv[]);
+
int faidx_main(int argc, char *argv[]);
+int glf_view_main(int argc, char *argv[]);
static int view_aux(const bam1_t *b, void *data)
{
fprintf(stderr, " tview text alignment viewer\n");
#endif
fprintf(stderr, " index index alignment\n");
+ fprintf(stderr, " fixmate fix mate information\n");
+ fprintf(stderr, " rmdup remove PCR duplicates\n");
+ fprintf(stderr, " glfview print GLFv2 file\n");
fprintf(stderr, "\n");
return 1;
}
else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1);
else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1);
else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
+ else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
+ else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
+ else if (strcmp(argv[1], "glfview") == 0) return glf_view_main(argc-1, argv+1);
#ifndef _NO_CURSES
else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1);
#endif
* or functionality.
*/
-#ifndef __BCGZ_H
+#ifndef __BGZF_H
#define __BGZF_H
#include <stdint.h>
-NA18507_part.fa contains two sequences cut from the human genome
+File ex1.fa contains two sequences cut from the human genome
build36. They were exatracted with command:
samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550
-Sequence names were changed manually for simplicity. ex1.fa.fai is the
-index for the sequence file, generated by:
-
- samtools faidx ex1.fa
-
-This index file also works as the reference list file used by `import'
-and `pileup' commands of samtools. ex1.sam.gz contains MAQ alignments
-exatracted with:
+Sequence names were changed manually for simplicity. File ex1.sam.gz
+contains MAQ alignments exatracted with:
(samtools view NA18507_maq.bam 2:2044001-2045500;
samtools view NA18507_maq.bam 20:68001-69500)
-and processed with an awk command to make everything consistent as a
+and processed with `samtools fixmate' to make it self-consistent as a
standalone alignment.
To try samtools, you may run the following commands:
- samtools import ex1.fa.fai ex1.sam.gz ex1.bam
- samtools index ex1.bam
- samtools tview ex1.bam ex1.fa
- samtools pileup -cf ex1.fa ex1.bam
+ samtools faidx ex1.fa # index the reference FASTA
+ samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM
+ samtools index ex1.bam # index BAM
+ samtools tview ex1.bam ex1.fa # view alignment
+ samtools pileup -cf ex1.fa ex1.bam # pileup and consensus
samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz
--- /dev/null
+all:../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz
+ @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo;
+
+ex1.fa.fai:ex1.fa
+ ../samtools faidx ex1.fa
+ex1.bam:ex1.sam.gz ex1.fa.fai
+ ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam
+ex1.bam.bai:ex1.bam
+ ../samtools index ex1.bam
+ex1.pileup.gz:ex1.bam ex1.fa
+ ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz
+ex1.glf:ex1.bam ex1.fa
+ ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf
+ex1.glfview.gz:ex1.glf
+ ../samtools glfview ex1.glf | gzip > ex1.glfview.gz
+
+../samtools:
+ (cd ..; make samtools)
+
+clean:
+ rm -f *.bam *.bai *.glf* *.fai *.pileup* *~
\ No newline at end of file
+++ /dev/null
-seq1 1575 6 60 61
-seq2 1584 1614 60 61
} faidx1_t;
KHASH_MAP_INIT_STR(s, faidx1_t)
-#ifdef HAVE_RAZF
+#ifndef _NO_RAZF
#include "razf.h"
#else
extern off_t ftello(FILE *stream);
--- /dev/null
+#include <string.h>
+#include <stdlib.h>
+#include "glf.h"
+
+#ifdef _NO_BGZF
+// then alias bgzf_*() functions
+#endif
+
+static inline int bam_is_big_endian()
+{
+ long one= 1;
+ return !(*((char *)(&one)));
+}
+
+glf_header_t *glf_header_init()
+{
+ return (glf_header_t*)calloc(1, sizeof(glf_header_t));
+}
+
+glf_header_t *glf_header_read(glfFile fp)
+{
+ glf_header_t *h;
+ char magic[4];
+ if (bam_is_big_endian()) {
+ fprintf(stderr, "[glf_header_read] Big endian is detected. Abort.\n");
+ exit(1);
+ }
+ h = (glf_header_t*)calloc(1, sizeof(glf_header_t));
+ bgzf_read(fp, magic, 4);
+ if (strncmp(magic, "GLF\2", 4)) {
+ fprintf(stderr, "[glf_header_read] invalid magic. Abort.\n");
+ exit(1);
+ }
+ bgzf_read(fp, &h->l_text, 4);
+ if (h->l_text) {
+ h->text = (uint8_t*)calloc(h->l_text + 1, 1);
+ bgzf_read(fp, h->text, h->l_text);
+ }
+ return h;
+}
+
+void glf_header_write(glfFile fp, const glf_header_t *h)
+{
+ bgzf_write(fp, "GLF\2", 4);
+ bgzf_write(fp, &h->l_text, 4);
+ if (h->l_text) bgzf_write(fp, h->text, h->l_text);
+}
+
+void glf_header_destroy(glf_header_t *h)
+{
+ free(h->text);
+ free(h);
+}
+
+char *glf_ref_read(glfFile fp)
+{
+ int32_t n;
+ char *str;
+ if (bgzf_read(fp, &n, 4) != 4) return 0;
+ if (n < 0) {
+ fprintf(stderr, "[glf_ref_read] invalid reference name length: %d.\n", n);
+ return 0;
+ }
+ str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact
+ bgzf_read(fp, str, n);
+ return str;
+}
+
+void glf_ref_write(glfFile fp, const char *str)
+{
+ int32_t n = strlen(str);
+ ++n;
+ bgzf_write(fp, &n, 4);
+ bgzf_write(fp, str, n);
+}
+
+void glf_view_normal(const char *ref_name, glf2_t *g1)
+{
+ int j;
+ printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, g1->pos + 1, "XACMGRSVTWYHKDBN"[g1->ref_base],
+ g1->depth, g1->max_mapQ, g1->min_lk);
+ for (j = 0; j != 10; ++j) printf("\t%d", g1->lk[j]);
+ printf("\n");
+}
+
+static char *glf_read_indel(glfFile fp, char *str, int *max, int16_t indel)
+{
+ int l = indel > 0? indel : -indel;
+ if (l + 1 > *max) {
+ *max = l + 1;
+ str = (char*)realloc(str, *max);
+ }
+ bgzf_read(fp, str, l);
+ str[l] = 0;
+ return str;
+}
+
+void glf_view(glfFile fp)
+{
+ glf_header_t *h;
+ char *name, *str;
+ glf2_t g2;
+ int max;
+
+ h = glf_header_read(fp);
+ str = 0; max = 0;
+ while ((name = glf_ref_read(fp)) != 0) {
+ while (bgzf_read(fp, &g2, sizeof(glf2_t))) {
+ if (g2.type == GLF_TYPE_END) break;
+ else if (g2.type == GLF_TYPE_NORMAL) glf_view_normal(name, &g2);
+ else if (g2.type == GLF_TYPE_INDEL) {
+ int16_t indel1, indel2;
+ printf("%s\t%d\t*\t%d\t%d\t%d\t", name, g2.pos + 1, g2.depth, g2.max_mapQ, g2.min_lk);
+ printf("%d\t%d\t%d\t", g2.lk[0], g2.lk[1], g2.lk[2]);
+ indel1 = *(int16_t*)(g2.lk + 3);
+ indel2 = *(int16_t*)(g2.lk + 5);
+ printf("%d\t%d\t", indel1, indel2);
+ if (indel1) {
+ str = glf_read_indel(fp, str, &max, indel1);
+ printf("%c%d%s\t", indel1>0? '+':'-', indel1>0?indel1:-indel1, str);
+ } else printf("*\t");
+ if (indel2) {
+ str = glf_read_indel(fp, str, &max, indel2);
+ printf("%c%d%s\n", indel2>0? '+':'-', indel2>0?indel2:-indel2, str);
+ } else printf("*\n");
+ }
+ }
+ free(name);
+ }
+ glf_header_destroy(h);
+ free(str);
+}
+
+int glf_view_main(int argc, char *argv[])
+{
+ glfFile fp;
+ if (argc == 1) {
+ fprintf(stderr, "Usage: glfview <in.glf>\n");
+ return 1;
+ }
+ fp = (strcmp(argv[1], "-") == 0)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r");
+ if (fp == 0) {
+ fprintf(stderr, "Fail to open file '%s'\n", argv[1]);
+ return 1;
+ }
+ glf_view(fp);
+ bgzf_close(fp);
+ return 0;
+}
+
+#ifdef GLFVIEW_MAIN
+int main(int argc, char *argv[])
+{
+ return glf_view_main(argc, argv);
+}
+#endif
unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
} glf1_t;
+#include <stdint.h>
+#include "bgzf.h"
+typedef BGZF *glfFile;
+
+#define GLF_TYPE_NORMAL 0
+#define GLF_TYPE_INDEL 1
+#define GLF_TYPE_END 15
+
+typedef struct {
+ unsigned char ref_base:4, type:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */
+ unsigned char max_mapQ; /** maximum mapping quality */
+ unsigned char lk[10]; /** log likelihood ratio, capped at 255 */
+ unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
+ unsigned pos; /** this is ***ZERO-BASED*** coordinate */
+} glf2_t;
+
+typedef struct {
+ int32_t l_text;
+ uint8_t *text;
+} glf_header_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ glf_header_t *glf_header_init();
+ glf_header_t *glf_header_read(glfFile fp);
+ void glf_header_write(glfFile fp, const glf_header_t *h);
+ void glf_header_destroy(glf_header_t *h);
+ char *glf_ref_read(glfFile fp);
+ void glf_ref_write(glfFile fp, const char *str);
+
+#ifdef __cplusplus
+}
+#endif
+
#endif
lib:
faidx:../faidx.c ../faidx.h
- $(CC) $(CFLAGS) -DFAIDX_MAIN -o $@ ../faidx.c
+ $(CC) $(CFLAGS) -D_NO_RAZF -DFAIDX_MAIN -o $@ ../faidx.c
md5fa:md5.o md5fa.o md5.h ../kseq.h
$(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz
#include <stdlib.h>
#include <assert.h>
+#define PACKAGE_VERSION "0.1.1 (20090120)"
+
//#define MAQ_LONGREADS
#ifdef MAQ_LONGREADS
return mm;
}
-void maq2tam_core(gzFile fp)
+void maq2tam_core(gzFile fp, const char *rg)
{
maqmap_t *mm;
maqmap1_t mm1, *m1;
for (j = 0; j != m1->size; ++j)
putchar((m1->seq[j]&0x3f) + 33);
putchar('\t');
+ if (rg) printf("RG:Z:%s\t", rg);
if (flag&4) {
printf("MF:i:%d\n", m1->flag);
} else {
{
gzFile fp;
if (argc == 1) {
- fprintf(stderr, "Usage: maq2tam <in.map>\n");
+ fprintf(stderr, "Version: %s\n", PACKAGE_VERSION);
+ fprintf(stderr, "Usage: maq2sam <in.map> [<readGroup>]\n");
return 1;
}
fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
- maq2tam_core(fp);
+ maq2tam_core(fp, argc > 2? argv[2] : 0);
gzclose(fp);
return 0;
}
* To compile razf.c, zlib-1.2.3(or greater) is required.
*/
+#ifndef _NO_RAZF
+
#include <fcntl.h>
#include <stdio.h>
#include "razf.h"
close(rz->filedes);
free(rz);
}
+
+#endif
sort[label="bam_sort.c\n(sort, merge)"]
index[label="bam_index.c\n(index)"]
tview[label="bam_tview.c\n(tview)"]
+ glf[label="glf.c\n(glfview)"]
+ rmdup[label="bam_rmdup.c\n(rmdup)"]
+ fixmate[label="bam_mate.c\n(fixmate)"]
"bam_aux.c" -> {"bam.c", import}
- "bgzf.c" -> "bam.c"
- "bam.c" -> {index, "bam_pileup.c", sort, import}
+ glf -> {"bam_maqcns.c", plcmd}
+ "bgzf.c" -> {"bam.c", glf}
+ "bam.c" -> {index, "bam_pileup.c", sort, import, rmdup, fixmate}
"bam_pileup.c" -> {"bam_lpileup.c", plcmd}
{"bam_lpileup.c", index, faidx, "bam_maqcns.c"} -> tview
{import, faidx, "bam_maqcns.c"} -> plcmd
- {tview, plcmd, faidx, sort, import, index} -> "bamtk.c\n(view)"
+ {tview, plcmd, faidx, sort, import, index, glf, rmdup, fixmate} -> "bamtk.c\n(view)"
}
\ No newline at end of file