From: Heng Li Date: Thu, 22 Jan 2009 15:14:27 +0000 (+0000) Subject: * Merge from branches/dev/ X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=635998cfe030da5f3dbec42a6daa3ca82fa5c871 * Merge from branches/dev/ * all future development will happen here at trunk/ --- diff --git a/COPYING b/COPYING index 2f596e5..82fa2f4 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ The MIT License -Copyright (c) 2008 Genome Research Ltd. +Copyright (c) 2008-2009 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/ChangeLog b/ChangeLog index 4c52aad..4535978 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,366 @@ +------------------------------------------------------------------------ +r104 | lh3lh3 | 2009-01-18 17:31:21 +0000 (Sun, 18 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_lpileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-18 + * fixed a bug in bam_lpileup.c: segment start and end are not correctly recognized + +------------------------------------------------------------------------ +r103 | lh3lh3 | 2009-01-18 16:34:03 +0000 (Sun, 18 Jan 2009) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-17 + * fixed a bug when there are reads without coordinates + * also recognize type 'c' as 'A' + * found a bug in bam_lpileup.c; NOT fixed yet + +------------------------------------------------------------------------ +r102 | lh3lh3 | 2009-01-17 19:46:49 +0000 (Sat, 17 Jan 2009) | 2 lines +Changed paths: + A /branches/dev/samtools/INSTALL + +Instruction for compilation + +------------------------------------------------------------------------ +r101 | lh3lh3 | 2009-01-17 19:31:36 +0000 (Sat, 17 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + A /branches/dev/samtools/Makefile.lite + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/faidx.c + M /branches/dev/samtools/misc/Makefile + M /branches/dev/samtools/razf.c + + * replaced HAVE_RAZF with _NO_RAZF + * added Makefile.lite for people who have trouble with razf.c + +------------------------------------------------------------------------ +r100 | lh3lh3 | 2009-01-16 10:03:37 +0000 (Fri, 16 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/misc/wgsim.c + + * samtools-0.1.1-15 + * fixed another bug in fixmate: unmapped pair has non-zero isize + +------------------------------------------------------------------------ +r99 | lh3lh3 | 2009-01-16 09:13:36 +0000 (Fri, 16 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-14 + * fixed a bug in fixmate: isize not equal to zero if two ends mapped to + different chr + +------------------------------------------------------------------------ +r98 | lh3lh3 | 2009-01-15 16:47:41 +0000 (Thu, 15 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-13 + * fixed the prior for hom indels (Richard pointed this out) + +------------------------------------------------------------------------ +r97 | lh3lh3 | 2009-01-15 16:38:47 +0000 (Thu, 15 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/COPYING + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/source.dot + + * samtools-0.1.1-12 + * fixed a bug in sort + * update source file graph and copyright information + +------------------------------------------------------------------------ +r96 | lh3lh3 | 2009-01-14 21:46:14 +0000 (Wed, 14 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/glf.c + +fixed a typo + +------------------------------------------------------------------------ +r95 | lh3lh3 | 2009-01-14 21:44:53 +0000 (Wed, 14 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/glf.c + +added a main function for glf.c + +------------------------------------------------------------------------ +r94 | lh3lh3 | 2009-01-14 17:14:59 +0000 (Wed, 14 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.h + A /branches/dev/samtools/glf.c + M /branches/dev/samtools/glf.h + + * samtools-0.1.1-11 + * generate binary GLFv2 + * added glfview command to dump GLFv2 binary file + +------------------------------------------------------------------------ +r93 | lh3lh3 | 2009-01-14 15:07:44 +0000 (Wed, 14 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_rmdup.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/glf.h + + * samtools-0.1.1-10 + * fixed several bugs in rmdup + * prepare to generate GLF2 + +------------------------------------------------------------------------ +r92 | lh3lh3 | 2009-01-14 13:27:44 +0000 (Wed, 14 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_import.c + A /branches/dev/samtools/bam_rmdup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-9 + * implemented rmdup; NOT tested yet + +------------------------------------------------------------------------ +r91 | lh3lh3 | 2009-01-13 20:15:43 +0000 (Tue, 13 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/00README.txt + +update README for typos + +------------------------------------------------------------------------ +r90 | lh3lh3 | 2009-01-13 19:57:50 +0000 (Tue, 13 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/ex1.sam.gz + +update example + +------------------------------------------------------------------------ +r89 | lh3lh3 | 2009-01-13 17:21:38 +0000 (Tue, 13 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.c + A /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-8 + * added fixmate command + +------------------------------------------------------------------------ +r88 | lh3lh3 | 2009-01-13 10:48:23 +0000 (Tue, 13 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-7 + * change the reported indel position to the previous way + +------------------------------------------------------------------------ +r87 | lh3lh3 | 2009-01-12 22:12:12 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-6 + * addd glt output + * allow to change indel calling parameters at the command line + +------------------------------------------------------------------------ +r86 | lh3lh3 | 2009-01-12 21:16:48 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-5 + * added two more flags + * allowed to select reads shown in pileup with a mask + +------------------------------------------------------------------------ +r85 | lh3lh3 | 2009-01-12 20:47:51 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-4 + * fixed a bug in indexing (linear index) + * prepare to add glt output from pileup + +------------------------------------------------------------------------ +r84 | lh3lh3 | 2009-01-12 09:22:35 +0000 (Mon, 12 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-3 + * fixed a bug in outputing the coordinate of an indel + +------------------------------------------------------------------------ +r83 | lh3lh3 | 2009-01-11 15:18:01 +0000 (Sun, 11 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-2 + * pileup: allows to output indel sites only + +------------------------------------------------------------------------ +r82 | lh3lh3 | 2009-01-10 23:34:31 +0000 (Sat, 10 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-1 + * implemented a Bayesian indel caller + +------------------------------------------------------------------------ +r81 | lh3lh3 | 2009-01-09 09:54:28 +0000 (Fri, 09 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/00README.txt + D /branches/dev/samtools/examples/ex1.fa.fai + +Let users generate ex1.fa.fai. + +------------------------------------------------------------------------ +r80 | lh3lh3 | 2009-01-08 16:10:08 +0000 (Thu, 08 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/bowtie2sam.pl + +make the bowtie converter works for "-k 2" + +------------------------------------------------------------------------ +r78 | lh3lh3 | 2009-01-03 17:25:24 +0000 (Sat, 03 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/export2sam.pl + +fixed a bug for "QC" reads + +------------------------------------------------------------------------ +r77 | lh3lh3 | 2009-01-01 18:32:06 +0000 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + A /branches/dev/samtools/misc/bowtie2sam.pl + M /branches/dev/samtools/misc/soap2sam.pl + + * soap2sam.pl: added NM tag + * bowtie2sam.pl: converter for bowtie + +------------------------------------------------------------------------ +r76 | lh3lh3 | 2008-12-31 23:24:24 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/soap2sam.pl + +soap2sam.pl: convert soap output to SAM + +------------------------------------------------------------------------ +r75 | lh3lh3 | 2008-12-31 17:54:32 +0000 (Wed, 31 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/misc/wgsim_eval.pl + + * wgsim_eval.pl-0.1.1 + * fixed a bug for a contig name like "NT_012345" + +------------------------------------------------------------------------ +r74 | lh3lh3 | 2008-12-31 16:38:21 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/wgsim_eval.pl + + * evaluate alignment for reads generated by wgsim + +------------------------------------------------------------------------ +r73 | lh3lh3 | 2008-12-31 15:11:22 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/Makefile + M /branches/dev/samtools/misc/wgsim.c + +fixed compiling warnings for wgsim + +------------------------------------------------------------------------ +r72 | lh3lh3 | 2008-12-31 13:40:51 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_tview.c + +remove an unused variable (a compiler warning only) + +------------------------------------------------------------------------ +r71 | lh3lh3 | 2008-12-31 13:37:16 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/wgsim.c + +wgsim: Paired-end reads simulator + +------------------------------------------------------------------------ +r70 | bhandsaker | 2008-12-29 20:27:16 +0000 (Mon, 29 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_tview.c + +Move definition of bam_nt16_nt4_table so we can build without curses. + +------------------------------------------------------------------------ +r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/NEWS + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + +Release samtools-0.1.1 + +------------------------------------------------------------------------ +r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines +Changed paths: + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-66 + * fixed a bug in razf.c: reset z_eof when razf_seek() is called + * fixed a memory leak in parsing a region + * changed pileup a little bit when -s is in use: output ^ and $ + * when a bam is not indexed, output more meaningful error message + * fixed a bug in indexing for small alignment + * fixed a bug in the viewer when we come to the end of a reference file + * updated documentation + * prepare to release 0.1.1 + +------------------------------------------------------------------------ +r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/examples + A /branches/dev/samtools/examples/00README.txt + A /branches/dev/samtools/examples/ex1.fa + A /branches/dev/samtools/examples/ex1.fa.fai + A /branches/dev/samtools/examples/ex1.sam.gz + +example + +------------------------------------------------------------------------ +r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + +update ChangeLog + ------------------------------------------------------------------------ r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines Changed paths: diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..9efcd0c --- /dev/null +++ b/INSTALL @@ -0,0 +1,7 @@ +Compiling samtools requires the zlib library ; +compiling the alignment viewer further requires the GNU ncurses library +. The default Makefile builds the +alignment viewer. If compilation fails due to errors in RAZF or the lack +of ncurses, you can compile with `make -f Makefile.lite'. However, the +alignment viewer will not be available; the FASTA indexer will not +support indexing compressed FASTA, either. \ No newline at end of file diff --git a/Makefile b/Makefile index 32e4c41..aff7eaa 100644 --- a/Makefile +++ b/Makefile @@ -2,9 +2,10 @@ CC= gcc CXX= g++ CFLAGS= -g -Wall -O2 -m64 #-arch ppc CXXFLAGS= $(CFLAGS) -DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -DHAVE_RAZF #-D_NO_CURSES +DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ - razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o + razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ + bam_mate.o bam_rmdup.o glf.o PROG= razip bgzip samtools INCLUDES= LIBS= -lm -lz diff --git a/Makefile.lite b/Makefile.lite new file mode 100644 index 0000000..2abb0ab --- /dev/null +++ b/Makefile.lite @@ -0,0 +1,63 @@ +CC= gcc +CXX= g++ +CFLAGS= -g -Wall -O2 -m64 #-arch ppc +CXXFLAGS= $(CFLAGS) +DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF +OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ + bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ + bam_mate.o bam_rmdup.o glf.o +PROG= samtools +INCLUDES= +LIBS= -lm -lz +SUBDIRS= . + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all-recur lib-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) -f Makefile.lite CC="$(CC)" CXX="$(CXX)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" $$target || exit 1; \ + cd $$wdir; \ + done; + +all:$(PROG) + +lib:libbam.a + +libbam.a:$(OBJS) + $(AR) -cru $@ $(OBJS) + +samtools:lib bamtk.o + $(CC) $(CFLAGS) -o $@ bamtk.o $(LIBS) -L. -lbam + +razip:razip.o razf.o + $(CC) $(CFLAGS) -o $@ razf.o razip.o $(LIBS) + +bgzip:bgzip.o bgzf.o + $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(LIBS) + +razip.o:razf.h +bam.o:bam.h razf.h bam_endian.h +bam_import.o:bam.h kseq.h khash.h razf.h +bam_pileup.o:bam.h razf.h ksort.h +bam_plcmd.o:bam.h faidx.h bam_maqcns.h +bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h +bam_lpileup.o:bam.h ksort.h +bam_tview.o:bam.h faidx.h bam_maqcns.h +bam_maqcns.o:bam.h ksort.h bam_maqcns.h +bam_sort.o:bam.h ksort.h razf.h +razf.o:razf.h + +faidx.o:faidx.h razf.h khash.h +faidx_main.o:faidx.h razf.h + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a + +clean:cleanlocal-recur diff --git a/bam.c b/bam.c index 6ccca7c..e470e62 100644 --- a/bam.c +++ b/bam.c @@ -249,6 +249,7 @@ void bam_view1(const bam_header_t *header, const bam1_t *b) } putchar('\t'); if (c->mtid < 0) printf("*\t"); + else if (c->mtid == c->tid) printf("=\t"); else printf("%s\t", header->target_name[c->mtid]); printf("%d\t%d\t", c->mpos + 1, c->isize); for (i = 0; i < c->l_qseq; ++i) putchar(bam_nt16_rev_table[bam1_seqi(s, i)]); diff --git a/bam.h b/bam.h index 4b3a688..76f3e0f 100644 --- a/bam.h +++ b/bam.h @@ -46,7 +46,7 @@ #include #include -#if _IOLIB == 1 +#if _IOLIB == 1 && !defined(_NO_RAZF) #define BAM_TRUE_OFFSET #include "razf.h" /*! @abstract BAM file handler */ @@ -118,6 +118,10 @@ typedef struct { #define BAM_FREAD1 64 #define BAM_FREAD2 128 #define BAM_FSECONDARY 256 +#define BAM_FQCFAIL 512 +#define BAM_FDUP 1024 + +#define BAM_DEF_MASK (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) #define BAM_CORE_SIZE sizeof(bam1_core_t) @@ -461,6 +465,8 @@ extern "C" { /*! @abstract pileup buffer */ typedef struct __bam_plbuf_t bam_plbuf_t; + void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask); + /*! @typedef @abstract Type of function to be called by bam_plbuf_push(). @param tid chromosome ID as is defined in the header @@ -516,7 +522,7 @@ extern "C" { @discussion The file position indicator must be placed right before the start of an alignment. See also bam_plbuf_push(). */ - int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data); + int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); struct __bam_lplbuf_t; typedef struct __bam_lplbuf_t bam_lplbuf_t; @@ -533,7 +539,7 @@ extern "C" { int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf); /*! @abstract bam_plbuf_file() equivalent with level calculated. */ - int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data); + int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data); struct __bam_index_t; typedef struct __bam_index_t bam_index_t; @@ -641,7 +647,7 @@ static inline int bam_reg2bin(uint32_t beg, uint32_t end) return 0; } -static inline void bam_copy1(bam1_t *bdst, const bam1_t *bsrc) +static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) { uint8_t *data = bdst->data; int m_data = bdst->m_data; // backup data and m_data @@ -654,6 +660,18 @@ static inline void bam_copy1(bam1_t *bdst, const bam1_t *bsrc) // restore the backup bdst->m_data = m_data; bdst->data = data; + return bdst; +} + +static inline bam1_t *bam_dup1(const bam1_t *src) +{ + bam1_t *b; + b = bam_init1(); + *b = *src; + b->m_data = b->data_len; + b->data = (uint8_t*)calloc(b->data_len, 1); + memcpy(b->data, src->data, b->data_len); + return b; } #endif diff --git a/bam_import.c b/bam_import.c index 6b3b4bc..4b7bb21 100644 --- a/bam_import.c +++ b/bam_import.c @@ -245,9 +245,9 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) type = str->s[3]; s = alloc_data(b, doff + 3) + doff; s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2; - if (type == 'A' || type == 'a') { + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { // c and C for backward compatibility s = alloc_data(b, doff + 2) + doff; - *s++ = type; *s = str->s[5]; + *s++ = 'A'; *s = str->s[5]; doff += 2; } else if (type == 'I' || type == 'i') { long long x; @@ -341,7 +341,7 @@ static void taf2baf_core(const char *fntaf, const char *fnbaf, bam_header_t *hea int ret; b = (bam1_t*)calloc(1, sizeof(bam1_t)); - fpbaf = bam_open(fnbaf, "w"); + fpbaf = (strcmp(fnbaf, "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(fnbaf, "w"); fp = sam_open(fntaf); ret = sam_read1(fp, header, b); bam_header_write(fpbaf, header); diff --git a/bam_index.c b/bam_index.c index 2b01815..6bc735e 100644 --- a/bam_index.c +++ b/bam_index.c @@ -35,6 +35,7 @@ */ #define BAM_MIN_CHUNK_GAP 32768 +// 1<<14 is the size of minimum bin. #define BAM_LIDX_SHIFT 14 typedef struct { @@ -81,17 +82,21 @@ static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t l->list[l->n].u = beg; l->list[l->n++].v = end; } -static inline void insert_offset2(bam_lidx_t *index2, int last, int curr, uint64_t offset) +static inline void insert_offset2(bam_lidx_t *index2, bam1_t *b, uint64_t offset) { - int i; - if (index2->m < curr + 1) { - index2->m = curr + 1; + int i, beg, end; + beg = b->core.pos >> BAM_LIDX_SHIFT; + end = (bam_calend(&b->core, bam1_cigar(b)) - 1) >> BAM_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; kroundup32(index2->m); index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); } - if (last > curr) last = -1; - for (i = last + 1; i <= curr; ++i) index2->offset[i] = offset; - index2->n = curr + 1; + for (i = beg + 1; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + index2->n = end + 1; } static void merge_chunks(bam_index_t *idx) @@ -127,7 +132,8 @@ bam_index_t *bam_index_core(bamFile fp) bam_header_t *h; int i, ret; bam_index_t *idx; - uint32_t last_coor, last_tid, last_bin, save_bin, save_tid; + uint32_t last_bin, save_bin; + int32_t last_coor, last_tid, save_tid; bam1_core_t *c; uint64_t save_off, last_off; @@ -152,14 +158,14 @@ bam_index_t *bam_index_core(bamFile fp) fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n"); exit(1); } - if (last_coor>>BAM_LIDX_SHIFT != b->core.pos>>BAM_LIDX_SHIFT) // then write the linear index - insert_offset2(&idx->index2[b->core.tid], last_coor>>BAM_LIDX_SHIFT, b->core.pos>>BAM_LIDX_SHIFT, last_off); + if (b->core.bin < 4681) insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); save_off = last_off; save_bin = last_bin = c->bin; save_tid = c->tid; + if (save_tid < 0) break; } if (bam_tell(fp) <= last_off) { fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n", @@ -169,7 +175,7 @@ bam_index_t *bam_index_core(bamFile fp) last_off = bam_tell(fp); last_coor = b->core.pos; } - insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); + if (save_tid >= 0) insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); merge_chunks(idx); if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret); free(b->data); free(b); diff --git a/bam_lpileup.c b/bam_lpileup.c index 83f91c2..3680287 100644 --- a/bam_lpileup.c +++ b/bam_lpileup.c @@ -92,7 +92,7 @@ static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl max_level = 0; for (i = l = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; - if (p->qpos == 0) { + if (p->is_head) { if (tv->head->next && tv->head->cnt == 0) { // then take a free slot freenode_t *p = tv->head->next; tv->cur_level[i] = tv->head->level; @@ -146,7 +146,7 @@ static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl // squeeze out terminated levels for (i = l = 0; i < n; ++i) { const bam_pileup1_t *p = pl + i; - if (p->qpos != p->b->core.l_qseq - 1) + if (!p->is_tail) tv->pre_level[l++] = tv->pre_level[i]; } tv->n_pre = l; @@ -180,13 +180,14 @@ int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv) return bam_plbuf_push(b, tv->plbuf); } -int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data) +int bam_lpileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) { bam_lplbuf_t *buf; int ret; bam1_t *b; b = (bam1_t*)calloc(1, sizeof(bam1_t)); buf = bam_lplbuf_init(func, func_data); + bam_plbuf_set_mask(buf->plbuf, mask); while ((ret = bam_read1(fp, b)) >= 0) bam_lplbuf_push(b, buf); bam_lplbuf_push(0, buf); diff --git a/bam_maqcns.c b/bam_maqcns.c index 65ed64c..3c3d818 100644 --- a/bam_maqcns.c +++ b/bam_maqcns.c @@ -262,6 +262,9 @@ uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) bam_maqindel_opt_t *bam_maqindel_opt_init() { bam_maqindel_opt_t *mi = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t)); + mi->q_indel = 40; + mi->r_indel = 0.00015; + // mi->mm_penalty = 3; mi->indel_err = 4; mi->ambi_thres = 10; @@ -271,7 +274,7 @@ bam_maqindel_opt_t *bam_maqindel_opt_init() void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir) { if (mir == 0) return; - free(mir->s1); free(mir->s2); free(mir); + free(mir->s[0]); free(mir->s[1]); free(mir); } #define MINUS_CONST 0x10000000 @@ -323,7 +326,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c } { // the core part char *ref2, *inscns = 0; - int k, l, *score, max_ins = types[n_types-1]; + int k, l, *score, *pscore, max_ins = types[n_types-1]; ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1); if (max_ins > 0) { // get the consensus of inserted sequences int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); @@ -340,7 +343,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c } } } - // construct the consensus + // construct the consensus of inserted sequence inscns = (char*)calloc(n_types * max_ins, sizeof(char)); for (i = 0; i < n_types; ++i) { for (j = 0; j < types[i]; ++j) { @@ -358,6 +361,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c } // calculate score score = (int*)calloc(n_types * n, sizeof(int)); + pscore = (int*)calloc(n_types * n, sizeof(int)); for (i = 0; i < n_types; ++i) { // write ref2 for (k = 0, j = left; j <= pos; ++j) @@ -372,27 +376,32 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c const bam_pileup1_t *p = pl + j; uint32_t *cigar; bam1_core_t *c = &p->b->core; - int s; + int s, ps; bam_segreg_t seg; if (c->flag&BAM_FUNMAP) continue; cigar = bam1_cigar(p->b); bam_segreg(pos, c, cigar, &seg); - for (s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) { + for (ps = s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) { int cq = bam1_seqi(bam1_seq(p->b), l), ct; ct = c->pos + l >= left? ref2[c->pos + l - left] : 15; // "<" should not happen if there is no bug - if (cq < 15 && ct < 15) + if (cq < 15 && ct < 15) { s += cq == ct? 1 : -mi->mm_penalty; + if (cq != ct) ps += bam1_qual(p->b)[l]; + } } - score[i*n + j] = s; + score[i*n + j] = s; pscore[i*n + j] = ps; if (types[i] != 0) { // then try the other way to calculate the score - for (s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) { + for (ps = s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) { int cq = bam1_seqi(bam1_seq(p->b), l), ct; ct = c->pos + l + types[i] >= left? ref2[c->pos + l + types[i] - left] : 15; - if (cq < 15 && ct < 15) + if (cq < 15 && ct < 15) { s += cq == ct? 1 : -mi->mm_penalty; + if (cq != ct) ps += bam1_qual(p->b)[l]; + } } } if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores + if (pscore[i*n+j] > ps) pscore[i*n+j] = ps; if (types[i] != 0) score[i*n+j] -= mi->indel_err; //printf("%d, %d, %d, %d\n", i, types[i], j, score[i*n+j]); } @@ -403,7 +412,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c sum = (int*)calloc(n_types, sizeof(int)); for (i = 0; i < n_types; ++i) for (j = 0; j < n; ++j) - sum[i] += score[i*n+j]; + sum[i] += -pscore[i*n+j]; max1 = max2 = -0x7fffffff; max1_i = max2_i = -1; for (i = 0; i < n_types; ++i) { if (sum[i] > max1) { @@ -416,26 +425,28 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c // write ret ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t)); ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i]; - ret->s1 = (char*)calloc(abs(ret->indel1) + 2, 1); - ret->s2 = (char*)calloc(abs(ret->indel2) + 2, 1); + ret->s[0] = (char*)calloc(abs(ret->indel1) + 2, 1); + ret->s[1] = (char*)calloc(abs(ret->indel2) + 2, 1); + // write indel sequence if (ret->indel1 > 0) { - ret->s1[0] = '+'; + ret->s[0][0] = '+'; for (k = 0; k < ret->indel1; ++k) - ret->s1[k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; + ret->s[0][k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]]; } else if (ret->indel1 < 0) { - ret->s1[0] = '-'; + ret->s[0][0] = '-'; for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) - ret->s1[k+1] = ref[pos + k + 1]; - } else ret->s1[0] = '*'; + ret->s[0][k+1] = ref[pos + k + 1]; + } else ret->s[0][0] = '*'; if (ret->indel2 > 0) { - ret->s2[0] = '+'; + ret->s[1][0] = '+'; for (k = 0; k < ret->indel2; ++k) - ret->s2[k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; + ret->s[1][k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]]; } else if (ret->indel2 < 0) { - ret->s2[0] = '-'; + ret->s[1][0] = '-'; for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k) - ret->s2[k+1] = ref[pos + k + 1]; - } else ret->s2[0] = '*'; + ret->s[1][k+1] = ref[pos + k + 1]; + } else ret->s[1][0] = '*'; + // write count for (j = 0; j < n; ++j) { if (score[max1_i*n+j] < 0 && score[max2_i*n+j] < 0) ++ret->cnt_anti; else { @@ -445,8 +456,34 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c else ++ret->cnt_ambi; } } + // write gl[] + ret->gl[0] = ret->gl[1] = 0; + for (j = 0; j < n; ++j) { + int s1 = pscore[max1_i*n + j], s2 = pscore[max2_i*n + j]; + ret->gl[0] += s1 < s2? 0 : s1 - s2 < mi->q_indel? s1 - s2 : mi->q_indel; + ret->gl[1] += s2 < s1? 0 : s2 - s1 < mi->q_indel? s2 - s1 : mi->q_indel; + } + } + free(score); free(pscore); free(ref2); free(inscns); + } + { // call genotype + int q[3], qr_indel = (int)(-4.343 * log(mi->r_indel) + 0.5); + int min1, min2, min1_i; + q[0] = ret->gl[0] + (ret->s[0][0] != '*'? 0 : 0) * qr_indel; + q[1] = ret->gl[1] + (ret->s[1][0] != '*'? 0 : 0) * qr_indel; + q[2] = n * 3 + (ret->s[0][0] == '*' || ret->s[1][0] == '*'? 1 : 1) * qr_indel; + min1 = min2 = 0x7fffffff; min1_i = -1; + for (i = 0; i < 3; ++i) { + if (q[i] < min1) { + min2 = min1; min1 = q[i]; min1_i = i; + } else if (q[i] < min2) min2 = q[i]; } - free(score); free(ref2); free(inscns); + ret->gt = min1_i; + ret->q_cns = min2 - min1; + // set q_ref + if (ret->gt < 2) ret->q_ref = (ret->s[ret->gt][0] == '*')? 0 : q[1-ret->gt] - q[ret->gt] - qr_indel - 3; + else ret->q_ref = (ret->s[0][0] == '*')? q[0] - q[2] : q[1] - q[2]; + if (ret->q_ref < 0) ret->q_ref = 0; } free(types); return ret; diff --git a/bam_maqcns.h b/bam_maqcns.h index 5d410ef..2c94fec 100644 --- a/bam_maqcns.h +++ b/bam_maqcns.h @@ -16,13 +16,19 @@ typedef struct { } bam_maqcns_t; typedef struct { + int q_indel; + float r_indel; + // hidden parameters, unchangeable from command line int mm_penalty, indel_err, ambi_thres; } bam_maqindel_opt_t; typedef struct { int indel1, indel2; int cnt1, cnt2, cnt_ambi, cnt_anti; - char *s1, *s2; + char *s[2]; + // + int gt, gl[2]; + int q_cns, q_ref; } bam_maqindel_ret_t; #ifdef __cplusplus diff --git a/bam_mate.c b/bam_mate.c new file mode 100644 index 0000000..bb53605 --- /dev/null +++ b/bam_mate.c @@ -0,0 +1,68 @@ +#include +#include +#include "bam.h" + +// currently, this function ONLY works if each read has one hit +void bam_mating_core(bamFile in, bamFile out) +{ + bam_header_t *header; + bam1_t *b[2]; + int curr, has_prev; + + header = bam_header_read(in); + bam_header_write(out, header); + + b[0] = bam_init1(); + b[1] = bam_init1(); + curr = 0; has_prev = 0; + while (bam_read1(in, b[curr]) >= 0) { + bam1_t *cur = b[curr], *pre = b[1-curr]; + if (has_prev) { + if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name + cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; + pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; + if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) + && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) + { + uint32_t cur5, pre5; + cur5 = (cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; + pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; + cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; + } else cur->core.isize = pre->core.isize = 0; + if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FREVERSE; + else cur->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FREVERSE; + else pre->core.flag &= ~BAM_FMREVERSE; + if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } + if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } + bam_write1(out, pre); + bam_write1(out, cur); + has_prev = 0; + } else { // unpaired or singleton + pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; + pre->core.flag |= BAM_FMUNMAP; + pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; + bam_write1(out, pre); + } + } else has_prev = 1; + curr = 1 - curr; + } + if (has_prev) bam_write1(out, b[1-curr]); + bam_header_destroy(header); + bam_destroy1(b[0]); + bam_destroy1(b[1]); +} + +int bam_mating(int argc, char *argv[]) +{ + bamFile in, out; + if (argc < 3) { + fprintf(stderr, "samtools fixmate \n"); + return 1; + } + in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); + out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + bam_mating_core(in, out); + bam_close(in); bam_close(out); + return 0; +} diff --git a/bam_pileup.c b/bam_pileup.c index d01f9a3..6a57831 100644 --- a/bam_pileup.c +++ b/bam_pileup.c @@ -111,6 +111,7 @@ struct __bam_plbuf_t { int32_t tid, pos, max_tid, max_pos; int max_pu, is_eof; bam_pileup1_t *pu; + int flag_mask; }; void bam_plbuf_reset(bam_plbuf_t *buf) @@ -127,6 +128,12 @@ void bam_plbuf_reset(bam_plbuf_t *buf) buf->head = buf->tail; } +void bam_plbuf_set_mask(bam_plbuf_t *buf, int mask) +{ + if (mask < 0) buf->flag_mask = BAM_DEF_MASK; + else buf->flag_mask = BAM_FUNMAP | mask; +} + bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) { bam_plbuf_t *buf; @@ -136,6 +143,7 @@ bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data) buf->head = buf->tail = mp_alloc(buf->mp); buf->dummy = mp_alloc(buf->mp); buf->max_tid = buf->max_pos = -1; + buf->flag_mask = BAM_DEF_MASK; return buf; } @@ -153,6 +161,7 @@ void bam_plbuf_destroy(bam_plbuf_t *buf) int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) { if (b) { // fill buffer + if (b->core.flag & buf->flag_mask) return 0; bam_copy1(&buf->tail->b, b); buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b)); if (!(b->core.tid >= buf->max_tid || (b->core.tid == buf->max_tid && buf->tail->beg >= buf->max_pos))) { @@ -197,13 +206,14 @@ int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf) return 0; } -int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data) +int bam_pileup_file(bamFile fp, int mask, bam_pileup_f func, void *func_data) { bam_plbuf_t *buf; int ret; bam1_t *b; b = (bam1_t*)calloc(1, sizeof(bam1_t)); buf = bam_plbuf_init(func, func_data); + bam_plbuf_set_mask(buf, mask); while ((ret = bam_read1(fp, b)) >= 0) bam_plbuf_push(b, buf); bam_plbuf_push(0, buf); diff --git a/bam_plcmd.c b/bam_plcmd.c index 0140c66..dbfc295 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -5,10 +5,13 @@ #include "faidx.h" #include "bam_maqcns.h" #include "khash.h" +#include "glf.h" KHASH_SET_INIT_INT64(64) -#define BAM_PLF_SIMPLE 0x01 -#define BAM_PLF_CNS 0x02 +#define BAM_PLF_SIMPLE 0x01 +#define BAM_PLF_CNS 0x02 +#define BAM_PLF_INDEL_ONLY 0x04 +#define BAM_PLF_GLF 0x08 typedef struct { bam_header_t *h; @@ -18,7 +21,9 @@ typedef struct { khash_t(64) *hash; uint32_t format; int tid, len; + int mask; char *ref; + glfFile fp; // for glf output only } pu_data_t; char **bam_load_pos(const char *fn, int *_n); @@ -46,6 +51,61 @@ static khash_t(64) *load_pos(const char *fn, bam_header_t *h) return hash; } +// an analogy to pileup_func() below +static int glt_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) +{ + pu_data_t *d = (pu_data_t*)data; + bam_maqindel_ret_t *r = 0; + int rb; + glf1_t *g; + glf2_t g2; + if (d->fai == 0) { + fprintf(stderr, "[glt_func] reference sequence is required for generating GLT. Abort!\n"); + exit(1); + } + if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0; + if (d->fai && (int)tid != d->tid) { + if (d->ref) { // then write the end mark + memset(&g2, 0, sizeof(glf2_t)); + g2.type = GLF_TYPE_END; + bgzf_write(d->fp, &g2, sizeof(glf2_t)); + } + glf_ref_write(d->fp, d->h->target_name[tid]); // write reference + free(d->ref); + d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); + d->tid = tid; + } + rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; + g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); + memcpy(&g2, g, sizeof(glf1_t)); + g2.type = GLF_TYPE_NORMAL; + g2.pos = pos; + bgzf_write(d->fp, &g2, sizeof(glf2_t)); + r = bam_maqindel(n, pos, d->ido, pu, d->ref); + if (r) { // then write indel line + int g3 = 3 * n, min; + min = g3; + if (min > r->gl[0]) min = r->gl[0]; + if (min > r->gl[1]) min = r->gl[1]; + g3 -= min; + g2.ref_base = 0; + g2.type = GLF_TYPE_INDEL; + memset(g2.lk, 0, 10); + g2.lk[0] = r->gl[0] < 255? r->gl[0] : 255; + g2.lk[1] = r->gl[1] < 255? r->gl[1] : 255; + g2.lk[2] = g3 < 255? g3 : 255; + *(int16_t*)(g2.lk + 3) = (int16_t)r->indel1; + *(int16_t*)(g2.lk + 5) = (int16_t)r->indel2; + g2.min_lk = min < 255? min : 255; + bgzf_write(d->fp, &g2, sizeof(glf2_t)); + if (r->indel1) bgzf_write(d->fp, r->s[0]+1, r->indel1>0? r->indel1 : -r->indel1); + if (r->indel2) bgzf_write(d->fp, r->s[1]+1, r->indel2>0? r->indel2 : -r->indel2); + bam_maqindel_ret_destroy(r); + } + free(g); + return 0; +} + static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) { pu_data_t *d = (pu_data_t*)data; @@ -53,12 +113,18 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p int i, j, rb; uint32_t x; if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0; + if (d->format & BAM_PLF_GLF) return glt_func(tid, pos, n, pu, data); if (d->fai && (int)tid != d->tid) { free(d->ref); d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); d->tid = tid; } rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; + if (d->format & BAM_PLF_INDEL_ONLY) { + for (i = 0; i < n; ++i) + if (pu[i].indel != 0) break; + if (i == n) return 0; + } printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb); if (d->format & BAM_PLF_CNS) { // consensus int ref_q, rb4 = bam_nt16_table[rb]; @@ -69,6 +135,8 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p if (ref_q > 255) ref_q = 255; } printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[x>>28], x>>8&0xff, ref_q, x>>16&0xff); + } + if (d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) { if (d->ref) // indel calling r = bam_maqindel(n, pos, d->ido, pu, d->ref); } @@ -115,7 +183,12 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p } putchar('\n'); if (r) { // then print indel line - printf("%s\t%d\t*\t%s/%s\t", d->h->target_name[tid], pos + 1, r->s1, r->s2); + printf("%s\t%d\t*\t", d->h->target_name[tid], pos + 1); + if (r->gt < 2) printf("%s/%s\t", r->s[r->gt], r->s[r->gt]); + else printf("%s/%s\t", r->s[0], r->s[1]); + printf("%d\t%d\t", r->q_cns, r->q_ref); + printf("%s\t%s\t", r->s[0], r->s[1]); + //printf("%d\t%d\t", r->gl[0], r->gl[1]); printf("%d\t%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_ambi, r->cnt_anti); bam_maqindel_ret_destroy(r); } @@ -127,9 +200,10 @@ int bam_pileup(int argc, char *argv[]) int c; char *fn_list = 0, *fn_fa = 0, *fn_pos = 0; pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t)); - d->tid = -1; + d->tid = -1; d->mask = BAM_DEF_MASK; d->c = bam_maqcns_init(); - while ((c = getopt(argc, argv, "st:f:cT:N:r:l:")) >= 0) { + d->ido = bam_maqindel_opt_init(); + while ((c = getopt(argc, argv, "st:f:cT:N:r:l:im:gI:G:")) >= 0) { switch (c) { case 's': d->format |= BAM_PLF_SIMPLE; break; case 't': fn_list = strdup(optarg); break; @@ -139,6 +213,11 @@ int bam_pileup(int argc, char *argv[]) case 'N': d->c->n_hap = atoi(optarg); break; case 'r': d->c->het_rate = atoi(optarg); break; case 'c': d->format |= BAM_PLF_CNS; break; + case 'i': d->format |= BAM_PLF_INDEL_ONLY; break; + case 'm': d->mask = atoi(optarg); break; + case 'g': d->format |= BAM_PLF_GLF; break; + case 'I': d->ido->q_indel = atoi(optarg); break; + case 'G': d->ido->r_indel = atof(optarg); break; default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; } } @@ -146,26 +225,38 @@ int bam_pileup(int argc, char *argv[]) fprintf(stderr, "\n"); fprintf(stderr, "Usage: bamtk pileup [options] |\n\n"); fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n"); + fprintf(stderr, " -i only show lines/consensus with indels\n"); + fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask); fprintf(stderr, " -t FILE list of reference sequences (assume the input is in SAM)\n"); fprintf(stderr, " -l FILE list of sites at which pileup is output\n"); fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n"); fprintf(stderr, " -c output the maq consensus sequence\n"); - fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c only) [%f]\n", d->c->theta); - fprintf(stderr, " -N INT number of haplotypes in the sample (for -c only) [%d]\n", d->c->n_hap); - fprintf(stderr, " -r FLOAT prior of a difference between any two haplotypes (for -c only) [%f]\n\n", - d->c->het_rate); + fprintf(stderr, " -g output in the extended GLT format (suppressing -c/-i/-s)\n"); + fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta); + fprintf(stderr, " -N INT number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap); + fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate); + fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel); + fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel); + fprintf(stderr, "\n"); free(fn_list); free(fn_fa); free(d); return 1; } if (fn_fa) d->fai = fai_load(fn_fa); free(fn_fa); bam_maqcns_prepare(d->c); - d->ido = bam_maqindel_opt_init(); + if (d->format & BAM_PLF_GLF) { + glf_header_t *h; + h = glf_header_init(); + d->fp = bgzf_fdopen(fileno(stdout), "w"); + glf_header_write(d->fp, h); + glf_header_destroy(h); + } if (fn_list) { tamFile fp; bam1_t *b; int ret; bam_plbuf_t *buf = bam_plbuf_init(pileup_func, d); + bam_plbuf_set_mask(buf, d->mask); d->h = sam_header_read2(fn_list); if (fn_pos) d->hash = load_pos(fn_pos, d->h); fp = sam_open(argv[optind]); @@ -181,9 +272,10 @@ int bam_pileup(int argc, char *argv[]) fp = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r"); d->h = bam_header_read(fp); if (fn_pos) d->hash = load_pos(fn_pos, d->h); - bam_pileup_file(fp, pileup_func, d); + bam_pileup_file(fp, d->mask, pileup_func, d); bam_close(fp); } + if (d->format & BAM_PLF_GLF) bgzf_close(d->fp); free(fn_pos); free(fn_list); kh_destroy(64, d->hash); bam_header_destroy(d->h); diff --git a/bam_rmdup.c b/bam_rmdup.c new file mode 100644 index 0000000..321938f --- /dev/null +++ b/bam_rmdup.c @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include "bam.h" + +typedef bam1_t *bam1_p; +#include "khash.h" +KHASH_SET_INIT_STR(name) +KHASH_MAP_INIT_INT64(pos, bam1_p) + +#define BUFFER_SIZE 0x40000 + +typedef struct { + int n, max; + bam1_t **a; +} tmp_stack_t; + +static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) +{ + if (stack->n == stack->max) { + stack->max = stack->max? stack->max<<1 : 0x10000; + stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max); + } + stack->a[stack->n++] = b; +} + +static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out) +{ + int i; + for (i = 0; i != stack->n; ++i) { + bam_write1(out, stack->a[i]); + bam_destroy1(stack->a[i]); + } + stack->n = 0; + if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash); +} + +static void clear_del_set(khash_t(name) *del_set) +{ + khint_t k; + for (k = kh_begin(del_set); k < kh_end(del_set); ++k) + if (kh_exist(del_set, k)) + free((char*)kh_key(del_set, k)); + kh_clear(name, del_set); +} + +void bam_rmdup_core(bamFile in, bamFile out) +{ + bam_header_t *header; + bam1_t *b; + int last_tid = -1, last_pos = -1; + uint64_t n_checked = 0, n_removed = 0; + tmp_stack_t stack; + khint_t k; + khash_t(pos) *best_hash; + khash_t(name) *del_set; + + best_hash = kh_init(pos); + del_set = kh_init(name); + b = bam_init1(); + memset(&stack, 0, sizeof(tmp_stack_t)); + header = bam_header_read(in); + bam_header_write(out, header); + + kh_resize(name, del_set, 4 * BUFFER_SIZE); + kh_resize(pos, best_hash, 3 * BUFFER_SIZE); + while (bam_read1(in, b) >= 0) { + bam1_core_t *c = &b->core; + if (c->tid != last_tid || last_pos != c->pos) { + dump_best(&stack, best_hash, out); // write the result + if (c->tid != last_tid) { + kh_clear(pos, best_hash); + if (kh_size(del_set)) { // check + fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); + clear_del_set(del_set); + } + last_tid = c->tid; + fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]); + } + } + if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { + bam_write1(out, b); + } else if (c->isize > 0) { // paired, head + uint64_t key = (uint64_t)c->pos<<32 | c->isize; + int ret; + ++n_checked; + k = kh_put(pos, best_hash, key, &ret); + if (ret == 0) { // found in best_hash + bam1_t *p = kh_val(best_hash, k); + ++n_removed; + if (p->core.qual < c->qual) { // the current alignment is better + kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed + bam_copy1(p, b); // replaced as b + } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed + if (ret == 0) + fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b)); + } else { // not found in best_hash + kh_val(best_hash, k) = bam_dup1(b); + stack_insert(&stack, kh_val(best_hash, k)); + } + } else { // paired, tail + k = kh_get(name, del_set, bam1_qname(b)); + if (k != kh_end(del_set)) { + free((char*)kh_key(del_set, k)); + kh_del(name, del_set, k); + } else bam_write1(out, b); + } + last_pos = c->pos; + } + dump_best(&stack, best_hash, out); + + bam_header_destroy(header); + clear_del_set(del_set); + kh_destroy(name, del_set); + kh_destroy(pos, best_hash); + free(stack.a); + bam_destroy1(b); + fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked, + (double)n_removed/n_checked); +} +int bam_rmdup(int argc, char *argv[]) +{ + bamFile in, out; + if (argc < 3) { + fprintf(stderr, "Usage: samtools rmdup \n"); + return 1; + } + in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); + out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + if (in == 0 || out == 0) { + fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); + return 1; + } + bam_rmdup_core(in, out); + bam_close(in); + bam_close(out); + return 0; +} diff --git a/bam_sort.c b/bam_sort.c index c5ed583..68d15dc 100644 --- a/bam_sort.c +++ b/bam_sort.c @@ -191,7 +191,7 @@ void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t m fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } - bam_merge_core(0, fnout, n, fns); + bam_merge_core(is_by_qname, fnout, n, fns); free(fnout); for (i = 0; i < n; ++i) { unlink(fns[i]); diff --git a/bamtk.c b/bamtk.c index 54ef455..ffd2e48 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.1" +#define PACKAGE_VERSION "0.1.1-18" #endif int bam_taf2baf(int argc, char *argv[]); @@ -12,7 +12,11 @@ int bam_merge(int argc, char *argv[]); int bam_index(int argc, char *argv[]); int bam_sort(int argc, char *argv[]); int bam_tview_main(int argc, char *argv[]); +int bam_mating(int argc, char *argv[]); +int bam_rmdup(int argc, char *argv[]); + int faidx_main(int argc, char *argv[]); +int glf_view_main(int argc, char *argv[]); static int view_aux(const bam1_t *b, void *data) { @@ -87,6 +91,9 @@ static int usage() fprintf(stderr, " tview text alignment viewer\n"); #endif fprintf(stderr, " index index alignment\n"); + fprintf(stderr, " fixmate fix mate information\n"); + fprintf(stderr, " rmdup remove PCR duplicates\n"); + fprintf(stderr, " glfview print GLFv2 file\n"); fprintf(stderr, "\n"); return 1; } @@ -101,6 +108,9 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "sort") == 0) return bam_sort(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bam_index(argc-1, argv+1); else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); + else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); + else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); + else if (strcmp(argv[1], "glfview") == 0) return glf_view_main(argc-1, argv+1); #ifndef _NO_CURSES else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); #endif diff --git a/bgzf.h b/bgzf.h index 4ed5c29..7e76bcb 100644 --- a/bgzf.h +++ b/bgzf.h @@ -9,7 +9,7 @@ * or functionality. */ -#ifndef __BCGZ_H +#ifndef __BGZF_H #define __BGZF_H #include diff --git a/examples/00README.txt b/examples/00README.txt index 5dd123c..dbb276f 100644 --- a/examples/00README.txt +++ b/examples/00README.txt @@ -1,28 +1,23 @@ -NA18507_part.fa contains two sequences cut from the human genome +File ex1.fa contains two sequences cut from the human genome build36. They were exatracted with command: samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550 -Sequence names were changed manually for simplicity. ex1.fa.fai is the -index for the sequence file, generated by: - - samtools faidx ex1.fa - -This index file also works as the reference list file used by `import' -and `pileup' commands of samtools. ex1.sam.gz contains MAQ alignments -exatracted with: +Sequence names were changed manually for simplicity. File ex1.sam.gz +contains MAQ alignments exatracted with: (samtools view NA18507_maq.bam 2:2044001-2045500; samtools view NA18507_maq.bam 20:68001-69500) -and processed with an awk command to make everything consistent as a +and processed with `samtools fixmate' to make it self-consistent as a standalone alignment. To try samtools, you may run the following commands: - samtools import ex1.fa.fai ex1.sam.gz ex1.bam - samtools index ex1.bam - samtools tview ex1.bam ex1.fa - samtools pileup -cf ex1.fa ex1.bam + samtools faidx ex1.fa # index the reference FASTA + samtools import ex1.fa.fai ex1.sam.gz ex1.bam # SAM->BAM + samtools index ex1.bam # index BAM + samtools tview ex1.bam ex1.fa # view alignment + samtools pileup -cf ex1.fa ex1.bam # pileup and consensus samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz diff --git a/examples/Makefile b/examples/Makefile new file mode 100644 index 0000000..b6908fa --- /dev/null +++ b/examples/Makefile @@ -0,0 +1,21 @@ +all:../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1.glfview.gz + @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo; + +ex1.fa.fai:ex1.fa + ../samtools faidx ex1.fa +ex1.bam:ex1.sam.gz ex1.fa.fai + ../samtools import ex1.fa.fai ex1.sam.gz ex1.bam +ex1.bam.bai:ex1.bam + ../samtools index ex1.bam +ex1.pileup.gz:ex1.bam ex1.fa + ../samtools pileup -cf ex1.fa ex1.bam | gzip > ex1.pileup.gz +ex1.glf:ex1.bam ex1.fa + ../samtools pileup -gf ex1.fa ex1.bam > ex1.glf +ex1.glfview.gz:ex1.glf + ../samtools glfview ex1.glf | gzip > ex1.glfview.gz + +../samtools: + (cd ..; make samtools) + +clean: + rm -f *.bam *.bai *.glf* *.fai *.pileup* *~ \ No newline at end of file diff --git a/examples/ex1.fa.fai b/examples/ex1.fa.fai deleted file mode 100644 index bac151a..0000000 --- a/examples/ex1.fa.fai +++ /dev/null @@ -1,2 +0,0 @@ -seq1 1575 6 60 61 -seq2 1584 1614 60 61 diff --git a/examples/ex1.sam.gz b/examples/ex1.sam.gz index 1a213d1..26c06a6 100644 Binary files a/examples/ex1.sam.gz and b/examples/ex1.sam.gz differ diff --git a/faidx.c b/faidx.c index 44e7f57..9302dfb 100644 --- a/faidx.c +++ b/faidx.c @@ -12,7 +12,7 @@ typedef struct { } faidx1_t; KHASH_MAP_INIT_STR(s, faidx1_t) -#ifdef HAVE_RAZF +#ifndef _NO_RAZF #include "razf.h" #else extern off_t ftello(FILE *stream); diff --git a/glf.c b/glf.c new file mode 100644 index 0000000..302f9ef --- /dev/null +++ b/glf.c @@ -0,0 +1,156 @@ +#include +#include +#include "glf.h" + +#ifdef _NO_BGZF +// then alias bgzf_*() functions +#endif + +static inline int bam_is_big_endian() +{ + long one= 1; + return !(*((char *)(&one))); +} + +glf_header_t *glf_header_init() +{ + return (glf_header_t*)calloc(1, sizeof(glf_header_t)); +} + +glf_header_t *glf_header_read(glfFile fp) +{ + glf_header_t *h; + char magic[4]; + if (bam_is_big_endian()) { + fprintf(stderr, "[glf_header_read] Big endian is detected. Abort.\n"); + exit(1); + } + h = (glf_header_t*)calloc(1, sizeof(glf_header_t)); + bgzf_read(fp, magic, 4); + if (strncmp(magic, "GLF\2", 4)) { + fprintf(stderr, "[glf_header_read] invalid magic. Abort.\n"); + exit(1); + } + bgzf_read(fp, &h->l_text, 4); + if (h->l_text) { + h->text = (uint8_t*)calloc(h->l_text + 1, 1); + bgzf_read(fp, h->text, h->l_text); + } + return h; +} + +void glf_header_write(glfFile fp, const glf_header_t *h) +{ + bgzf_write(fp, "GLF\2", 4); + bgzf_write(fp, &h->l_text, 4); + if (h->l_text) bgzf_write(fp, h->text, h->l_text); +} + +void glf_header_destroy(glf_header_t *h) +{ + free(h->text); + free(h); +} + +char *glf_ref_read(glfFile fp) +{ + int32_t n; + char *str; + if (bgzf_read(fp, &n, 4) != 4) return 0; + if (n < 0) { + fprintf(stderr, "[glf_ref_read] invalid reference name length: %d.\n", n); + return 0; + } + str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact + bgzf_read(fp, str, n); + return str; +} + +void glf_ref_write(glfFile fp, const char *str) +{ + int32_t n = strlen(str); + ++n; + bgzf_write(fp, &n, 4); + bgzf_write(fp, str, n); +} + +void glf_view_normal(const char *ref_name, glf2_t *g1) +{ + int j; + printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, g1->pos + 1, "XACMGRSVTWYHKDBN"[g1->ref_base], + g1->depth, g1->max_mapQ, g1->min_lk); + for (j = 0; j != 10; ++j) printf("\t%d", g1->lk[j]); + printf("\n"); +} + +static char *glf_read_indel(glfFile fp, char *str, int *max, int16_t indel) +{ + int l = indel > 0? indel : -indel; + if (l + 1 > *max) { + *max = l + 1; + str = (char*)realloc(str, *max); + } + bgzf_read(fp, str, l); + str[l] = 0; + return str; +} + +void glf_view(glfFile fp) +{ + glf_header_t *h; + char *name, *str; + glf2_t g2; + int max; + + h = glf_header_read(fp); + str = 0; max = 0; + while ((name = glf_ref_read(fp)) != 0) { + while (bgzf_read(fp, &g2, sizeof(glf2_t))) { + if (g2.type == GLF_TYPE_END) break; + else if (g2.type == GLF_TYPE_NORMAL) glf_view_normal(name, &g2); + else if (g2.type == GLF_TYPE_INDEL) { + int16_t indel1, indel2; + printf("%s\t%d\t*\t%d\t%d\t%d\t", name, g2.pos + 1, g2.depth, g2.max_mapQ, g2.min_lk); + printf("%d\t%d\t%d\t", g2.lk[0], g2.lk[1], g2.lk[2]); + indel1 = *(int16_t*)(g2.lk + 3); + indel2 = *(int16_t*)(g2.lk + 5); + printf("%d\t%d\t", indel1, indel2); + if (indel1) { + str = glf_read_indel(fp, str, &max, indel1); + printf("%c%d%s\t", indel1>0? '+':'-', indel1>0?indel1:-indel1, str); + } else printf("*\t"); + if (indel2) { + str = glf_read_indel(fp, str, &max, indel2); + printf("%c%d%s\n", indel2>0? '+':'-', indel2>0?indel2:-indel2, str); + } else printf("*\n"); + } + } + free(name); + } + glf_header_destroy(h); + free(str); +} + +int glf_view_main(int argc, char *argv[]) +{ + glfFile fp; + if (argc == 1) { + fprintf(stderr, "Usage: glfview \n"); + return 1; + } + fp = (strcmp(argv[1], "-") == 0)? bgzf_fdopen(fileno(stdin), "r") : bgzf_open(argv[1], "r"); + if (fp == 0) { + fprintf(stderr, "Fail to open file '%s'\n", argv[1]); + return 1; + } + glf_view(fp); + bgzf_close(fp); + return 0; +} + +#ifdef GLFVIEW_MAIN +int main(int argc, char *argv[]) +{ + return glf_view_main(argc, argv); +} +#endif diff --git a/glf.h b/glf.h index d9d23c6..af72a48 100644 --- a/glf.h +++ b/glf.h @@ -8,4 +8,40 @@ typedef struct { unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ } glf1_t; +#include +#include "bgzf.h" +typedef BGZF *glfFile; + +#define GLF_TYPE_NORMAL 0 +#define GLF_TYPE_INDEL 1 +#define GLF_TYPE_END 15 + +typedef struct { + unsigned char ref_base:4, type:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ + unsigned char max_mapQ; /** maximum mapping quality */ + unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ + unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ + unsigned pos; /** this is ***ZERO-BASED*** coordinate */ +} glf2_t; + +typedef struct { + int32_t l_text; + uint8_t *text; +} glf_header_t; + +#ifdef __cplusplus +extern "C" { +#endif + + glf_header_t *glf_header_init(); + glf_header_t *glf_header_read(glfFile fp); + void glf_header_write(glfFile fp, const glf_header_t *h); + void glf_header_destroy(glf_header_t *h); + char *glf_ref_read(glfFile fp); + void glf_ref_write(glfFile fp, const char *str); + +#ifdef __cplusplus +} +#endif + #endif diff --git a/misc/Makefile b/misc/Makefile index 8a38f54..f4e50ff 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -29,7 +29,7 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: faidx:../faidx.c ../faidx.h - $(CC) $(CFLAGS) -DFAIDX_MAIN -o $@ ../faidx.c + $(CC) $(CFLAGS) -D_NO_RAZF -DFAIDX_MAIN -o $@ ../faidx.c md5fa:md5.o md5fa.o md5.h ../kseq.h $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz diff --git a/misc/maq2sam.c b/misc/maq2sam.c index e30aa92..70ecf41 100644 --- a/misc/maq2sam.c +++ b/misc/maq2sam.c @@ -5,6 +5,8 @@ #include #include +#define PACKAGE_VERSION "0.1.1 (20090120)" + //#define MAQ_LONGREADS #ifdef MAQ_LONGREADS @@ -84,7 +86,7 @@ maqmap_t *maqmap_read_header(gzFile fp) return mm; } -void maq2tam_core(gzFile fp) +void maq2tam_core(gzFile fp, const char *rg) { maqmap_t *mm; maqmap1_t mm1, *m1; @@ -141,6 +143,7 @@ void maq2tam_core(gzFile fp) for (j = 0; j != m1->size; ++j) putchar((m1->seq[j]&0x3f) + 33); putchar('\t'); + if (rg) printf("RG:Z:%s\t", rg); if (flag&4) { printf("MF:i:%d\n", m1->flag); } else { @@ -158,11 +161,12 @@ int main(int argc, char *argv[]) { gzFile fp; if (argc == 1) { - fprintf(stderr, "Usage: maq2tam \n"); + fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); + fprintf(stderr, "Usage: maq2sam []\n"); return 1; } fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - maq2tam_core(fp); + maq2tam_core(fp, argc > 2? argv[2] : 0); gzclose(fp); return 0; } diff --git a/razf.c b/razf.c index 6611f0b..13d6589 100644 --- a/razf.c +++ b/razf.c @@ -31,6 +31,8 @@ * To compile razf.c, zlib-1.2.3(or greater) is required. */ +#ifndef _NO_RAZF + #include #include #include "razf.h" @@ -645,3 +647,5 @@ void razf_close(RAZF *rz){ close(rz->filedes); free(rz); } + +#endif diff --git a/source.dot b/source.dot index cfa2222..1735774 100644 --- a/source.dot +++ b/source.dot @@ -5,11 +5,15 @@ digraph { sort[label="bam_sort.c\n(sort, merge)"] index[label="bam_index.c\n(index)"] tview[label="bam_tview.c\n(tview)"] + glf[label="glf.c\n(glfview)"] + rmdup[label="bam_rmdup.c\n(rmdup)"] + fixmate[label="bam_mate.c\n(fixmate)"] "bam_aux.c" -> {"bam.c", import} - "bgzf.c" -> "bam.c" - "bam.c" -> {index, "bam_pileup.c", sort, import} + glf -> {"bam_maqcns.c", plcmd} + "bgzf.c" -> {"bam.c", glf} + "bam.c" -> {index, "bam_pileup.c", sort, import, rmdup, fixmate} "bam_pileup.c" -> {"bam_lpileup.c", plcmd} {"bam_lpileup.c", index, faidx, "bam_maqcns.c"} -> tview {import, faidx, "bam_maqcns.c"} -> plcmd - {tview, plcmd, faidx, sort, import, index} -> "bamtk.c\n(view)" + {tview, plcmd, faidx, sort, import, index, glf, rmdup, fixmate} -> "bamtk.c\n(view)" } \ No newline at end of file