git.donarmstrong.com Git - samtools.git/commitdiff
Create trunk copy
author Heng Li <lh3@live.co.uk>
Mon, 22 Dec 2008 15:58:02 +0000 (15:58 +0000)
committer Heng Li <lh3@live.co.uk>
Mon, 22 Dec 2008 15:58:02 +0000 (15:58 +0000)
44 files changed:
COPYING [new file with mode: 0644]
ChangeLog [new file with mode: 0644]
ChangeLog.old [new file with mode: 0644]
Makefile [new file with mode: 0644]
NEWS [new file with mode: 0644]
bam.c [new file with mode: 0644]
bam.h [new file with mode: 0644]
bam_aux.c [new file with mode: 0644]
bam_endian.h [new file with mode: 0644]
bam_import.c [new file with mode: 0644]
bam_index.c [new file with mode: 0644]
bam_lpileup.c [new file with mode: 0644]
bam_maqcns.c [new file with mode: 0644]
bam_maqcns.h [new file with mode: 0644]
bam_pileup.c [new file with mode: 0644]
bam_plcmd.c [new file with mode: 0644]
bam_sort.c [new file with mode: 0644]
bam_tview.c [new file with mode: 0644]
bamtk.c [new file with mode: 0644]
bgzf.c [new file with mode: 0644]
bgzf.h [new file with mode: 0644]
bgzip.c [new file with mode: 0644]
examples/00README.txt [new file with mode: 0644]
examples/ex1.fa [new file with mode: 0644]
examples/ex1.fa.fai [new file with mode: 0644]
examples/ex1.sam.gz [new file with mode: 0644]
faidx.c [new file with mode: 0644]
faidx.h [new file with mode: 0644]
glf.h [new file with mode: 0644]
khash.h [new file with mode: 0644]
kseq.h [new file with mode: 0644]
ksort.h [new file with mode: 0644]
misc/Makefile [new file with mode: 0644]
misc/export2sam.pl [new file with mode: 0755]
misc/maq2sam.c [new file with mode: 0644]
misc/md5.c [new file with mode: 0644]
misc/md5.h [new file with mode: 0644]
misc/md5fa.c [new file with mode: 0644]
razf.c [new file with mode: 0644]
razf.h [new file with mode: 0644]
razip.c [new file with mode: 0644]
samtools.1 [new file with mode: 0644]
source.dot [new file with mode: 0644]
zutil.h [new file with mode: 0644]

diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..2f596e5
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) 2008 Genome Research Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/ChangeLog b/ChangeLog
new file mode 100644 (file)
index 0000000..4c52aad
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,540 @@
+------------------------------------------------------------------------
+r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/misc/export2sam.pl
+
+ * added comments
+ * fixed several bugs 
+
+------------------------------------------------------------------------
+r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/misc/export2sam.pl
+
+convert Export format to SAM; not thoroughly tested
+
+------------------------------------------------------------------------
+r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+   A /branches/dev/samtools/source.dot
+
+ * samtools-0.1.0-65
+ * pileup: generate maq-like simple output
+ * pileup: allow to output pileup at required sites
+ * source.dot: source file relationship graph
+ * tview: fixed a minor bug
+
+------------------------------------------------------------------------
+r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines
+Changed paths:
+   D /branches/dev/samtools/misc/all2sam.pl
+
+remove all2sam.pl
+
+------------------------------------------------------------------------
+r54 | lh3lh3 | 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/COPYING
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/faidx.h
+   M /branches/dev/samtools/khash.h
+   M /branches/dev/samtools/kseq.h
+   M /branches/dev/samtools/ksort.h
+   M /branches/dev/samtools/samtools.1
+
+Added copyright information and a bit more documentation. No code change.
+
+------------------------------------------------------------------------
+r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.c
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-64
+ * improved efficiency of the indel caller for spliced alignments
+
+------------------------------------------------------------------------
+r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.c
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_aux.c
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-63
+ * a bit of code cleanup: reduce the dependency between source files
+
+------------------------------------------------------------------------
+r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-62
+ * fixed a memory leak
+
+------------------------------------------------------------------------
+r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/ChangeLog
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/samtools.1
+
+update documentation, ChangeLog and a comment
+
+------------------------------------------------------------------------
+r49 | lh3lh3 | 2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_maqcns.h
+   M /branches/dev/samtools/bam_pileup.c
+   A /branches/dev/samtools/bam_plcmd.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-61
+ * moved pileup command to a separate source file
+ * added indel caller
+ * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!)
+ * updated documentation
+
+------------------------------------------------------------------------
+r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-60
+ * fixed another bug in maqcns when there is a nearby deletion
+
+------------------------------------------------------------------------
+r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-59
+ * pileup: outputting consensus is now optional
+ * fixed a bug in glfgen. This bug also exists in maq's glfgen. However,
+   I am not quite sure why the previous version may have a problem.
+
+------------------------------------------------------------------------
+r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines
+Changed paths:
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-58
+ * add maq consensus to pileup. However, I will move this part to a new
+   command as strictly speaking, consensus calling is not part of pileup,
+   and imposing it would make it harder to generate for other language
+   bindings.
+
+------------------------------------------------------------------------
+r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bgzf.c
+
+Fix bug in tell() after reads that consume to the exact end of a block.
+
+------------------------------------------------------------------------
+r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/samtools.1
+
+update manual
+
+------------------------------------------------------------------------
+r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-57
+ * fixed a bug in parser when there are auxiliary fields
+ * made the parser a bit more robust
+
+------------------------------------------------------------------------
+r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/bgzf.c
+
+ * samtools-0.1.0-56
+ * fixed a bug in bgzf (only reading is affected)
+ * fixed a typo in bam_index.c
+ * in bam_index.c, check potential bugs in the underlying I/O library
+
+------------------------------------------------------------------------
+r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/samtools.1
+
+update manual
+
+------------------------------------------------------------------------
+r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-55
+ * tried to make pileup work with clipping (previously not), though NOT tested
+ * removed -v from pileup
+ * made pileup take the reference sequence
+
+------------------------------------------------------------------------
+r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-54
+ * in parser, recognize "=", rather than ",", as a match
+ * in parser, correctly parse "=" at the MRNM field.
+
+------------------------------------------------------------------------
+r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/maq2sam.c
+
+fixed a bug in handling maq flag 64 and 192
+
+------------------------------------------------------------------------
+r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/md5fa.c
+
+also calculate unordered md5sum check
+
+------------------------------------------------------------------------
+r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/md5fa.c
+
+fixed a minor bug when there are spaces in the sequence
+
+------------------------------------------------------------------------
+r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/md5fa.c
+
+fixed a potential memory leak
+
+------------------------------------------------------------------------
+r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+
+ * fixed a bug in import: bin is wrongly calculated
+
+------------------------------------------------------------------------
+r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/misc/all2sam.pl
+
+nothing, really
+
+------------------------------------------------------------------------
+r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/kseq.h
+   M /branches/dev/samtools/misc/Makefile
+   A /branches/dev/samtools/misc/md5.c
+   A /branches/dev/samtools/misc/md5.h
+   A /branches/dev/samtools/misc/md5fa.c
+
+ * fixed two warnings in kseq.h
+ * added md5sum utilities
+
+------------------------------------------------------------------------
+r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bamtk.c
+   A /branches/dev/samtools/kseq.h
+   D /branches/dev/samtools/kstream.h
+
+ * samtools-0.1.0-52
+ * replace kstream with kseq. kseq is a superset of kstream. I need the
+   extra functions in kseq.h.
+ * also compile stand-alone faidx
+
+------------------------------------------------------------------------
+r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_sort.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-51
+ * sorting by read names is available
+
+------------------------------------------------------------------------
+r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam.c
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bam_sort.c
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/misc/maq2sam.c
+
+ * samtools-0.1.0-50
+ * format change to meet the latest specification
+
+------------------------------------------------------------------------
+r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/misc/maq2sam.c
+
+ * minor change in maqcns: special care when n==0
+ * change maq2sam to meet the latest specification
+
+------------------------------------------------------------------------
+r27 | lh3lh3 | 2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/razf.c
+   M /branches/dev/samtools/razf.h
+
+considerable code clean up in razf
+
+------------------------------------------------------------------------
+r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/ChangeLog
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/faidx.c
+
+make RAZF optional in faidx.c
+
+------------------------------------------------------------------------
+r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/bam_aux.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-49
+ * added routines for retrieving aux data, NOT TESTED YET!
+
+------------------------------------------------------------------------
+r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines
+Changed paths:
+   M /branches/dev/samtools/bam.c
+   M /branches/dev/samtools/bam_import.c
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/bgzf.c
+   M /branches/dev/samtools/samtools.1
+
+ * samtools-0.1.0-48
+ * bgzf: fixed a potential integer overflow on 32-bit machines
+ * maqcns: set the minimum combined quality as 0
+ * supporting hex strings
+
+------------------------------------------------------------------------
+r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/bam_maqcns.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-47
+ * fixed the bug in maqcns
+
+------------------------------------------------------------------------
+r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   A /branches/dev/samtools/bam_maqcns.c
+   A /branches/dev/samtools/bam_maqcns.h
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+   A /branches/dev/samtools/glf.h
+
+ * samtools-0.1.0-46
+ * add MAQ consensus caller, currently BUGGY!
+
+------------------------------------------------------------------------
+r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_pileup.c
+   M /branches/dev/samtools/bam_tview.c
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-45
+ * tview: display padded alignment (but not P operation)
+ * better coordinates and reference sequence
+
+------------------------------------------------------------------------
+r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/ChangeLog
+
+new ChangeLog
+
+------------------------------------------------------------------------
+r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines
+Changed paths:
+   D /branches/dev/samtools/ChangeLog
+   A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6)
+
+Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from
+the log of my personal SVN repository.
+
+------------------------------------------------------------------------
+r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/bgzf.c
+
+ * samtools-0.1.0-44
+ * declare fseeko and ftello as some Linux may not do this by default and
+   missing these declarations will make bgzf buggy
+ * get rid of some harmless warnings
+ * use BGZF by default, now
+
+------------------------------------------------------------------------
+r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bam_index.c
+   M /branches/dev/samtools/bamtk.c
+   M /branches/dev/samtools/razf.c
+
+ * samtools-0.1.0-43
+ * fixed a bug in razf_read()
+ * give more warnings when the file is truncated (or due to bugs in I/O library)
+
+------------------------------------------------------------------------
+r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bgzf.c
+
+fixed a bug in bgzf.c at the end of the file
+
+------------------------------------------------------------------------
+r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/dev/samtools/bamtk.c
+
+ * samtools-0.1.0-42
+ * a lot happened to RAZF, although samtools itself is untouched. Better
+   also update the version number anyway to avoid confusion
+
+------------------------------------------------------------------------
+r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/razf.c
+
+a change from Jue, but I think it should not matter
+
+------------------------------------------------------------------------
+r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/dev/samtools/razf.c
+
+fixed a potential bug in razf. However, it seems still buggy, just
+rarely happens, very rarely.
+
+------------------------------------------------------------------------
+r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/razf.c
+
+fixed a bug in razf, with the help of Jue
+
+------------------------------------------------------------------------
+r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/bam_index.c
+
+remove a comment
+
+------------------------------------------------------------------------
+r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/dev/samtools/Makefile
+   M /branches/dev/samtools/bam.h
+   M /branches/dev/samtools/razf.c
+   M /branches/dev/samtools/razf.h
+
+ * Jue has updated razf to realize Bob's scheme
+
+------------------------------------------------------------------------
+r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools/samtools.1
+
+the manual page
+
+------------------------------------------------------------------------
+r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines
+Changed paths:
+   A /branches/dev/samtools/ChangeLog
+   A /branches/dev/samtools/Makefile
+   A /branches/dev/samtools/bam.c
+   A /branches/dev/samtools/bam.h
+   A /branches/dev/samtools/bam_aux.c
+   A /branches/dev/samtools/bam_endian.h
+   A /branches/dev/samtools/bam_import.c
+   A /branches/dev/samtools/bam_index.c
+   A /branches/dev/samtools/bam_lpileup.c
+   A /branches/dev/samtools/bam_pileup.c
+   A /branches/dev/samtools/bam_sort.c
+   A /branches/dev/samtools/bam_tview.c
+   A /branches/dev/samtools/bamtk.c
+   A /branches/dev/samtools/bgzf.c
+   A /branches/dev/samtools/bgzf.h
+   A /branches/dev/samtools/bgzip.c
+   A /branches/dev/samtools/faidx.c
+   A /branches/dev/samtools/faidx.h
+   A /branches/dev/samtools/khash.h
+   A /branches/dev/samtools/ksort.h
+   A /branches/dev/samtools/kstream.h
+   A /branches/dev/samtools/misc
+   A /branches/dev/samtools/misc/Makefile
+   A /branches/dev/samtools/misc/all2sam.pl
+   A /branches/dev/samtools/misc/maq2sam.c
+   A /branches/dev/samtools/razf.c
+   A /branches/dev/samtools/razf.h
+   A /branches/dev/samtools/razip.c
+   A /branches/dev/samtools/zutil.h
+
+The initial version of samtools, replicated from my local SVN repository.
+The current version is: 0.1.0-42. All future development will happen here.
+
+------------------------------------------------------------------------
+r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/dev/samtools
+
+samtools (C version)
+
+------------------------------------------------------------------------
diff --git a/ChangeLog.old b/ChangeLog.old
new file mode 100644 (file)
index 0000000..2e1214e
--- /dev/null
+++ b/ChangeLog.old
@@ -0,0 +1,806 @@
+------------------------------------------------------------------------
+r703 | lh3 | 2008-11-25 20:20:02 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/samtools.1
+
+rename bamtk to samtools
+
+------------------------------------------------------------------------
+r702 | lh3 | 2008-11-25 20:15:09 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+   D /branches/prog/bam/bamtk.1
+   A /branches/prog/bam/samtools.1 (from /branches/prog/bam/bamtk.1:679)
+
+rename bamtk.1 to samtools.1
+
+------------------------------------------------------------------------
+r701 | lh3 | 2008-11-25 13:29:10 +0000 (Tue, 25 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/misc/Makefile
+
+ * samtools-0.1.0-41
+ * small (but a bit dangerous) changes to meet the latest specification
+
+------------------------------------------------------------------------
+r700 | lh3 | 2008-11-25 13:15:11 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/prog/bam/misc/all2sam.pl (from /branches/prog/bam/misc/all2tam.pl:649)
+   D /branches/prog/bam/misc/all2tam.pl
+   A /branches/prog/bam/misc/maq2sam.c (from /branches/prog/bam/misc/maq2tam.c:699)
+   D /branches/prog/bam/misc/maq2tam.c
+
+rename tam to sam
+
+------------------------------------------------------------------------
+r699 | lh3 | 2008-11-25 13:14:49 +0000 (Tue, 25 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/misc/maq2tam.c
+
+change for the new specification
+
+------------------------------------------------------------------------
+r698 | lh3 | 2008-11-24 13:15:20 +0000 (Mon, 24 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/razf.c
+   M /branches/prog/bam/razf.h
+
+ * add a fake BGZF mode to razf. It is fake in that it loads razf index into
+   memory but gives BGZF like virtual offset
+
+------------------------------------------------------------------------
+r697 | lh3 | 2008-11-24 09:53:44 +0000 (Mon, 24 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/prog/bam/ChangeLog
+
+change log
+
+------------------------------------------------------------------------
+r696 | lh3 | 2008-11-24 09:53:23 +0000 (Mon, 24 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bgzf.c
+
+updated bgzf, on behalf of Bob
+
+------------------------------------------------------------------------
+r695 | lh3 | 2008-11-23 11:40:31 +0000 (Sun, 23 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/razf.c
+
+fixed a bug in razf
+
+------------------------------------------------------------------------
+r694 | lh3 | 2008-11-22 16:23:52 +0000 (Sat, 22 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_lpileup.c
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bam-0.1.0-40
+ * fixed two small memory leaks
+ * fixed a memory problem when seek outside the length of the sequence
+
+------------------------------------------------------------------------
+r693 | lh3 | 2008-11-22 16:10:04 +0000 (Sat, 22 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+
+ * bam-0.1.0-39
+ * fixed an uninitialized warning. This does not matter in fact
+
+------------------------------------------------------------------------
+r692 | lh3 | 2008-11-22 15:44:05 +0000 (Sat, 22 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/razf.c
+   M /branches/prog/bam/razf.h
+
+Jue's new razf
+
+------------------------------------------------------------------------
+r691 | lh3 | 2008-11-21 21:30:39 +0000 (Fri, 21 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/bgzip.c
+
+ * bam-0.1.0-38
+ * get rid of some warnings in bgzip.c
+ * potentially improve performance in indexing for BGZF
+
+------------------------------------------------------------------------
+r690 | lh3 | 2008-11-21 21:15:51 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bgzf.c
+
+I think I have fixed the bug in bgzf
+
+------------------------------------------------------------------------
+r689 | lh3 | 2008-11-21 20:48:56 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bgzf.c
+
+bug fix by Bob
+
+------------------------------------------------------------------------
+r688 | lh3 | 2008-11-21 20:37:27 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+
+fixed a bug due to the name change in _IOLIB
+
+------------------------------------------------------------------------
+r687 | lh3 | 2008-11-21 14:42:56 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bgzf.c
+
+fix small things
+
+------------------------------------------------------------------------
+r686 | lh3 | 2008-11-21 14:37:59 +0000 (Fri, 21 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/prog/bam/bgzf.c
+   A /branches/prog/bam/bgzf.h
+   A /branches/prog/bam/bgzip.c
+
+Bob's BGZF format, although currently buggy
+
+------------------------------------------------------------------------
+r685 | lh3 | 2008-11-21 09:48:20 +0000 (Fri, 21 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bam-0.1.0-37
+ * improve interface a little bit
+
+------------------------------------------------------------------------
+r684 | lh3 | 2008-11-21 09:30:18 +0000 (Fri, 21 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bam-0.1.0-36
+ * improve the interface of tview, a little bit
+
+------------------------------------------------------------------------
+r683 | lh3 | 2008-11-20 22:33:54 +0000 (Thu, 20 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam_tview.c
+
+a little better viewer
+
+------------------------------------------------------------------------
+r682 | lh3 | 2008-11-20 22:27:01 +0000 (Thu, 20 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-35
+ * better viewer
+
+------------------------------------------------------------------------
+r681 | lh3 | 2008-11-20 20:51:16 +0000 (Thu, 20 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-34
+ * tview is now a component of bamtk
+
+------------------------------------------------------------------------
+r680 | lh3 | 2008-11-20 19:17:30 +0000 (Thu, 20 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/prog/bam/bam_tview.c
+
+text alignment viewer
+
+------------------------------------------------------------------------
+r679 | lh3 | 2008-11-20 19:17:15 +0000 (Thu, 20 Nov 2008) | 5 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_lpileup.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.1
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/faidx.c
+
+ * bamtk-0.1.0-33
+ * added routines to reset pileup buffers
+ * fixed a bug in faidx
+ * add text alignment viewer
+
+------------------------------------------------------------------------
+r678 | lh3 | 2008-11-20 11:05:02 +0000 (Thu, 20 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   A /branches/prog/bam/bam_lpileup.c (from /branches/prog/bam/bam_tview.c:668)
+   D /branches/prog/bam/bam_tview.c
+
+rename tview as lpileup
+
+------------------------------------------------------------------------
+r677 | lh3 | 2008-11-20 10:08:52 +0000 (Thu, 20 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/razf.c
+
+fixed a bug in razf
+
+------------------------------------------------------------------------
+r676 | lh3 | 2008-11-19 22:52:20 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/faidx.h
+
+add documentation
+
+------------------------------------------------------------------------
+r674 | lh3 | 2008-11-19 21:39:17 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bamtk.1
+   M /branches/prog/bam/faidx.h
+
+update documentation
+
+------------------------------------------------------------------------
+r673 | lh3 | 2008-11-19 21:19:03 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/prog/bam/bamtk.1
+
+add manual page
+
+------------------------------------------------------------------------
+r672 | lh3 | 2008-11-19 16:40:49 +0000 (Wed, 19 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/faidx.c
+
+ * bamtk-0.1.0-32
+ * make faidx more error resistant
+
+------------------------------------------------------------------------
+r671 | lh3 | 2008-11-19 16:09:55 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/faidx.h
+
+add index
+
+------------------------------------------------------------------------
+r670 | lh3 | 2008-11-19 16:02:39 +0000 (Wed, 19 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/faidx.c
+
+ * bamtk-0.1.0-31
+ * show reference sequence in pileup -v (not in the default pileup)
+
+------------------------------------------------------------------------
+r669 | lh3 | 2008-11-19 14:51:17 +0000 (Wed, 19 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/faidx.c
+
+ * bamtk-0.1.0-30
+ * put faidx in bamtk and remove faidx_main.c
+
+------------------------------------------------------------------------
+r668 | lh3 | 2008-11-19 14:15:05 +0000 (Wed, 19 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+   A /branches/prog/bam/faidx.c
+   A /branches/prog/bam/faidx.h
+   M /branches/prog/bam/razf.c
+
+ * bamtk-0.1.0-29
+ * fixed a bug in tview.c
+ * prepare to add faidx
+
+------------------------------------------------------------------------
+r667 | lh3 | 2008-11-19 10:20:45 +0000 (Wed, 19 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/razf.c
+   M /branches/prog/bam/razf.h
+
+gzip-compatible razf
+
+------------------------------------------------------------------------
+r664 | lh3 | 2008-11-18 12:50:23 +0000 (Tue, 18 Nov 2008) | 5 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-28
+ * fetch: fixed a bug at an array boundary
+ * fetch: fixed a bug when the whole chromosome is retrieved
+ * add linear index
+
+------------------------------------------------------------------------
+r663 | lh3 | 2008-11-17 21:29:22 +0000 (Mon, 17 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-27
+ * put l_qseq into core and move l_aux to bam1_t
+
+------------------------------------------------------------------------
+r662 | lh3 | 2008-11-17 20:55:16 +0000 (Mon, 17 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-26
+ * save seq and qual separately
+
+------------------------------------------------------------------------
+r661 | lh3 | 2008-11-17 13:09:37 +0000 (Mon, 17 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+
+little
+
+------------------------------------------------------------------------
+r660 | lh3 | 2008-11-17 13:06:14 +0000 (Mon, 17 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+
+more documentations
+
+------------------------------------------------------------------------
+r659 | lh3 | 2008-11-17 12:55:08 +0000 (Mon, 17 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-25
+ * make tview work for TAM
+
+------------------------------------------------------------------------
+r658 | lh3 | 2008-11-17 12:50:21 +0000 (Mon, 17 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-24
+ * make tview as an independent module
+
+------------------------------------------------------------------------
+r657 | lh3 | 2008-11-17 11:26:06 +0000 (Mon, 17 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_pileup.c
+
+change little
+
+------------------------------------------------------------------------
+r656 | lh3 | 2008-11-16 21:33:19 +0000 (Sun, 16 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-23
+ * also add tview for TAM
+
+------------------------------------------------------------------------
+r655 | lh3 | 2008-11-16 21:29:46 +0000 (Sun, 16 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-22
+ * make tview more efficient for deep depth
+
+------------------------------------------------------------------------
+r654 | lh3 | 2008-11-16 20:52:19 +0000 (Sun, 16 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_pileup.c
+   A /branches/prog/bam/bam_tview.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-21
+ * fixed bug in the TAM parser: lowercase not recognized
+ * unfinished function to leveled pileup (tview)
+
+------------------------------------------------------------------------
+r653 | lh3 | 2008-11-15 12:58:36 +0000 (Sat, 15 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-20
+ * pileup now display deleted bases as '*'
+
+------------------------------------------------------------------------
+r652 | lh3 | 2008-11-15 09:58:39 +0000 (Sat, 15 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-19
+ * fixed a bug in fetch()
+ * reduce memory in indexing
+
+------------------------------------------------------------------------
+r651 | lh3 | 2008-11-14 21:56:05 +0000 (Fri, 14 Nov 2008) | 5 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-18
+ * important changes are made to index: the index size is increased, but
+   now we have no limit on file sizes and the new method potentially
+   works with BGZF, Bob's new compression format.
+
+------------------------------------------------------------------------
+r650 | lh3 | 2008-11-14 16:03:22 +0000 (Fri, 14 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-17
+ * more comments in bam.h
+ * fixed a bug in bam_index.c
+
+------------------------------------------------------------------------
+r649 | lh3 | 2008-11-13 16:04:18 +0000 (Thu, 13 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bam_sort.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-16
+ * use macros to retrieve pointers from bam1_t and thus reduce the size
+   of bam1_t struct.
+
+------------------------------------------------------------------------
+r648 | lh3 | 2008-11-13 13:21:39 +0000 (Thu, 13 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_sort.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-15
+ * make more things work over pipe
+
+------------------------------------------------------------------------
+r647 | lh3 | 2008-11-13 12:49:28 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/misc/maq2tam.c
+
+fixed a bug in maq2tam
+
+------------------------------------------------------------------------
+r646 | lh3 | 2008-11-13 11:46:59 +0000 (Thu, 13 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/Makefile
+   M /branches/prog/bam/misc/Makefile
+   M /branches/prog/bam/misc/maq2tam.c
+
+ * bug fix in maq2tam.c
+ * improve Makefile
+
+------------------------------------------------------------------------
+r645 | lh3 | 2008-11-13 11:39:46 +0000 (Thu, 13 Nov 2008) | 3 lines
+Changed paths:
+   A /branches/prog/bam/misc/Makefile
+   M /branches/prog/bam/misc/maq2tam.c
+
+ * corrected maq2tam
+ * add Makefile
+
+------------------------------------------------------------------------
+r644 | lh3 | 2008-11-13 11:25:45 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/razf.c
+
+fixed the bug in buffered write (on behalf of Jue)
+
+------------------------------------------------------------------------
+r643 | lh3 | 2008-11-13 10:53:42 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+   D /branches/prog/bam/all2tam.pl
+   A /branches/prog/bam/misc/all2tam.pl (from /branches/prog/bam/all2tam.pl:642)
+
+move to misc
+
+------------------------------------------------------------------------
+r642 | lh3 | 2008-11-13 10:53:23 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/all2tam.pl
+
+change tag
+
+------------------------------------------------------------------------
+r641 | lh3 | 2008-11-13 10:53:12 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+   D /branches/prog/bam/utils
+
+has been renamed
+
+------------------------------------------------------------------------
+r640 | lh3 | 2008-11-13 10:52:50 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/prog/bam/misc (from /branches/prog/bam/utils:639)
+
+rename
+
+------------------------------------------------------------------------
+r639 | lh3 | 2008-11-13 10:52:35 +0000 (Thu, 13 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/prog/bam/utils
+   A /branches/prog/bam/utils/maq2tam.c
+
+utilities (converters and so on)
+
+------------------------------------------------------------------------
+r638 | lh3 | 2008-11-12 22:24:22 +0000 (Wed, 12 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-14
+ * copy the text header to BAM
+ * add BAM1 header flag
+
+------------------------------------------------------------------------
+r637 | lh3 | 2008-11-12 14:56:08 +0000 (Wed, 12 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/razf.c
+
+ * bamtk-0.1.0-13
+ * fixed a bug in razf
+ * improved and fixed potential bugs in index
+
+------------------------------------------------------------------------
+r636 | lh3 | 2008-11-12 11:57:13 +0000 (Wed, 12 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+update documentation in the HeaderDOC format
+
+------------------------------------------------------------------------
+r635 | lh3 | 2008-11-12 10:08:38 +0000 (Wed, 12 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-12
+ * more documentations
+ * rename baf1_core_t as bam1_core_t
+
+------------------------------------------------------------------------
+r634 | lh3 | 2008-11-11 23:00:35 +0000 (Tue, 11 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_pileup.c
+
+documentation
+
+------------------------------------------------------------------------
+r633 | lh3 | 2008-11-11 21:23:49 +0000 (Tue, 11 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-11
+ * give up regional pileup. We can now use pipe to mimic that.
+ * for index file, change suffix .idx to .bmi
+
+------------------------------------------------------------------------
+r632 | lh3 | 2008-11-11 21:00:11 +0000 (Tue, 11 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/razf.c
+
+ * bamtk-0.1.0-10
+ * make pileup work on TAM
+
+------------------------------------------------------------------------
+r631 | lh3 | 2008-11-11 09:20:29 +0000 (Tue, 11 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/razf.c
+   M /branches/prog/bam/razf.h
+   M /branches/prog/bam/razip.c
+
+ * bamtk-0.1.0-9
+ * razf now supports streaming
+ * prepare to improve pileup (have not yet)
+
+------------------------------------------------------------------------
+r630 | lh3 | 2008-11-10 18:34:40 +0000 (Mon, 10 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-8
+ * improve the interface of TAM parser
+
+------------------------------------------------------------------------
+r629 | lh3 | 2008-11-10 13:06:13 +0000 (Mon, 10 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-7
+ * almost nothing
+
+------------------------------------------------------------------------
+r628 | lh3 | 2008-11-10 12:56:36 +0000 (Mon, 10 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-6
+ * fixed a bug in bam_pileup.c
+
+------------------------------------------------------------------------
+r627 | lh3 | 2008-11-10 11:32:46 +0000 (Mon, 10 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bamtk.c
+   M /branches/prog/bam/razf.c
+
+ * bamtk-0.1.0-5
+ * fixed a bug in razf.c, caused by my modifications
+ * improve the interface of pileup. Now it will be slower but more flexible
+
+------------------------------------------------------------------------
+r626 | lh3 | 2008-11-09 20:51:04 +0000 (Sun, 09 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-4
+ * view: dumping binary output
+
+------------------------------------------------------------------------
+r625 | lh3 | 2008-11-09 20:31:54 +0000 (Sun, 09 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bam_pileup.c
+   M /branches/prog/bam/bam_sort.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-3
+ * rename functions
+
+------------------------------------------------------------------------
+r624 | lh3 | 2008-11-09 15:07:32 +0000 (Sun, 09 Nov 2008) | 2 lines
+Changed paths:
+   M /branches/prog/bam/bam.h
+
+add comments
+
+------------------------------------------------------------------------
+r623 | lh3 | 2008-11-08 22:32:49 +0000 (Sat, 08 Nov 2008) | 4 lines
+Changed paths:
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-2
+ * improve indexing for a mixture of long and short reads, although currently
+   I do not know whether it really works...
+
+------------------------------------------------------------------------
+r622 | lh3 | 2008-11-08 22:13:58 +0000 (Sat, 08 Nov 2008) | 3 lines
+Changed paths:
+   M /branches/prog/bam/bam_index.c
+   M /branches/prog/bam/bamtk.c
+
+ * bamtk-0.1.0-1
+ * prepare for improving indexing algorithm
+
+------------------------------------------------------------------------
+r621 | lh3 | 2008-11-08 20:28:09 +0000 (Sat, 08 Nov 2008) | 4 lines
+Changed paths:
+   A /branches/prog/bam/all2tam.pl
+   M /branches/prog/bam/bam.c
+   M /branches/prog/bam/bam.h
+   M /branches/prog/bam/bam_import.c
+   M /branches/prog/bam/bamtk.c
+   D /branches/prog/bam/tam_utils.pl
+
+ * bamtk-0.1.0
+ * smarter integers
+ * rename tam_utils.pl to all2tam.pl
+
+------------------------------------------------------------------------
+r620 | lh3 | 2008-11-08 17:17:22 +0000 (Sat, 08 Nov 2008) | 2 lines
+Changed paths:
+   A /branches/prog/bam
+   A /branches/prog/bam/Makefile
+   A /branches/prog/bam/bam.c
+   A /branches/prog/bam/bam.h
+   A /branches/prog/bam/bam_endian.h
+   A /branches/prog/bam/bam_import.c
+   A /branches/prog/bam/bam_index.c
+   A /branches/prog/bam/bam_pileup.c
+   A /branches/prog/bam/bam_sort.c
+   A /branches/prog/bam/bamtk.c
+   A /branches/prog/bam/khash.h
+   A /branches/prog/bam/ksort.h
+   A /branches/prog/bam/kstream.h
+   A /branches/prog/bam/razf.c
+   A /branches/prog/bam/razf.h
+   A /branches/prog/bam/razip.c
+   A /branches/prog/bam/tam_utils.pl
+   A /branches/prog/bam/zutil.h
+
+The Binary Alignment/Mapping format.
+
+------------------------------------------------------------------------
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..32e4c41
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,62 @@
+# Toolchain and flags. DFLAGS carries the preprocessor switches passed to
+# every compile (the _IOLIB value is tested in bam.h).
+CC=                    gcc
+CXX=           g++
+CFLAGS=                -g -Wall -O2 -m64 #-arch ppc
+CXXFLAGS=      $(CFLAGS)
+DFLAGS=                -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -DHAVE_RAZF #-D_NO_CURSES
+OBJS=          bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
+                       razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o
+PROG=          razip bgzip samtools
+INCLUDES=
+LIBS=          -lm -lz
+SUBDIRS=       . misc
+
+.SUFFIXES:.c .o
+
+# None of these targets name a real file; declaring them phony keeps a stray
+# file called e.g. `lib' or `clean' from masking the rule.
+.PHONY: all lib clean cleanlocal \
+               all-recur lib-recur clean-recur cleanlocal-recur install-recur
+
+.c.o:
+               $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+# Run the target obtained by stripping "-recur" in every directory of SUBDIRS.
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+               @target=`echo $@ | sed s/-recur//`; \
+               wdir=`pwd`; \
+               list='$(SUBDIRS)'; for subdir in $$list; do \
+                       cd $$subdir; \
+                       $(MAKE) CC="$(CC)" CXX="$(CXX)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+                               INCLUDES="$(INCLUDES)" $$target || exit 1; \
+                       cd $$wdir; \
+               done;
+
+all:$(PROG)
+
+lib:libbam.a
+
+libbam.a:$(OBJS)
+               $(AR) -cru $@ $(OBJS)
+
+# Depend on the archive itself rather than the phony `lib' target so that
+# samtools is relinked only when libbam.a or bamtk.o actually changed.
+# libbam.a is listed before the libraries it uses (-lcurses, -lm, -lz) so the
+# linker can resolve its references when scanning left to right.
+samtools:libbam.a bamtk.o
+               $(CC) $(CFLAGS) -o $@ bamtk.o -L. -lbam -lcurses $(LIBS)
+
+razip:razip.o razf.o
+               $(CC) $(CFLAGS) -o $@ razf.o razip.o $(LIBS)
+
+bgzip:bgzip.o bgzf.o
+               $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(LIBS)
+
+# Header dependencies of the individual objects.
+razip.o:razf.h
+bam.o:bam.h razf.h bam_endian.h
+bam_import.o:bam.h kseq.h khash.h razf.h
+bam_pileup.o:bam.h razf.h ksort.h
+bam_plcmd.o:bam.h faidx.h bam_maqcns.h
+bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
+bam_lpileup.o:bam.h ksort.h
+bam_tview.o:bam.h faidx.h bam_maqcns.h
+bam_maqcns.o:bam.h ksort.h bam_maqcns.h
+bam_sort.o:bam.h ksort.h razf.h
+razf.o:razf.h
+
+faidx.o:faidx.h razf.h khash.h
+
+cleanlocal:
+               rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+
+clean:cleanlocal-recur
diff --git a/NEWS b/NEWS
new file mode 100644 (file)
index 0000000..1ed90ab
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,6 @@
+Beta Release 0.1.1 (22 December, 2008)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is the first public release of samtools. For more information,
+please check the manual page `samtools.1' and the samtools website
+http://samtools.sourceforge.net
\ No newline at end of file
diff --git a/bam.c b/bam.c
new file mode 100644 (file)
index 0000000..6ccca7c
--- /dev/null
+++ b/bam.c
@@ -0,0 +1,274 @@
+#include <stdio.h>
+#include <ctype.h>
+#include "bam.h"
+#include "bam_endian.h"
+
+int bam_is_be = 0;
+
+/**************************
+ * CIGAR related routines *
+ **************************/
+
+/*
+ * Walk the CIGAR of an alignment and describe, in `reg', the segment that
+ * reaches past reference position `pos'.
+ *
+ * The begin fields (tbeg/qbeg/cbeg) are set once, at the first M/D/I
+ * operation whose end (current reference coordinate + length) exceeds `pos'.
+ * After that, the end fields (tend/qend/cend) are rewritten at every S/H/N
+ * operation and at the final CIGAR operation.
+ *
+ * Returns 0 if a begin point was found, -1 otherwise (`reg' is then untouched).
+ */
+int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg)
+{
+       int32_t tpos = c->pos, qpos = 0; // running reference / query coordinates
+       int found = 0;
+       unsigned i;
+       for (i = 0; i < c->n_cigar; ++i) {
+               const int op = cigar[i] & BAM_CIGAR_MASK;
+               const int len = cigar[i] >> BAM_CIGAR_SHIFT;
+               const int is_aligned = (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CINS);
+               const int is_boundary = (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CREF_SKIP);
+               // the segment starts at the coordinates *before* this operation is consumed
+               if (!found && is_aligned && tpos + len > pos) {
+                       found = 1;
+                       reg->tbeg = tpos;
+                       reg->qbeg = qpos;
+                       reg->cbeg = i;
+               }
+               // advance the coordinates according to what the operation consumes
+               if (op == BAM_CMATCH) {
+                       tpos += len;
+                       qpos += len;
+               } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+                       tpos += len;
+               } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+                       qpos += len;
+               }
+               // the segment end is taken *after* the operation is consumed
+               if (found && (is_boundary || i == c->n_cigar - 1)) {
+                       reg->tend = tpos;
+                       reg->qend = qpos;
+                       reg->cend = i;
+               }
+       }
+       return found ? 0 : -1;
+}
+
+/*
+ * Compute the reference coordinate at which an alignment ends: c->pos plus
+ * the lengths of all M, D and N CIGAR operations.
+ */
+uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar)
+{
+       uint32_t end_pos = c->pos;
+       uint32_t i;
+       for (i = 0; i < c->n_cigar; ++i) {
+               const int op = cigar[i] & BAM_CIGAR_MASK;
+               // only operations that consume the reference move the end point
+               if (op != BAM_CMATCH && op != BAM_CDEL && op != BAM_CREF_SKIP) continue;
+               end_pos += cigar[i] >> BAM_CIGAR_SHIFT;
+       }
+       return end_pos;
+}
+
+/*
+ * Compute the query length implied by a CIGAR: the summed lengths of all
+ * M, I and S operations.
+ */
+int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar)
+{
+       int32_t qlen = 0;
+       uint32_t i;
+       for (i = 0; i < c->n_cigar; ++i) {
+               const int op = cigar[i] & BAM_CIGAR_MASK;
+               // only operations that consume the query contribute
+               if (op != BAM_CMATCH && op != BAM_CINS && op != BAM_CSOFT_CLIP) continue;
+               qlen += cigar[i] >> BAM_CIGAR_SHIFT;
+       }
+       return qlen;
+}
+
+/********************
+ * BAM I/O routines *
+ ********************/
+
+/*
+ * Allocate a zero-filled header. As a side effect the global `bam_is_be' is
+ * refreshed from bam_is_big_endian(). Returns the calloc() result as is.
+ */
+bam_header_t *bam_header_init()
+{
+       bam_header_t *header;
+       bam_is_be = bam_is_big_endian();
+       header = (bam_header_t*)calloc(1, sizeof(bam_header_t));
+       return header;
+}
+
+void bam_header_destroy(bam_header_t *header)
+{
+       int32_t i;
+       extern void bam_destroy_header_hash(bam_header_t *header);
+       if (header == 0) return;
+       if (header->target_name) {
+               for (i = 0; i < header->n_targets; ++i)
+                       free(header->target_name[i]);
+               free(header->target_name);
+               free(header->target_len);
+       }
+       free(header->text);
+#ifndef BAM_NO_HASH
+       bam_destroy_header_hash(header);
+#endif
+       free(header);
+}
+
+/*
+ * Read a BAM header from `fp'.
+ *
+ * The stream must start with the magic "BAM\001", followed by the header
+ * text (length + bytes), the number of reference sequences and, for each of
+ * them, its name (length + bytes) and its length. Integers are byte-swapped
+ * when `bam_is_be' is set.
+ *
+ * Returns a newly allocated header, to be released with
+ * bam_header_destroy(), or 0 if the magic is wrong, the stream is truncated,
+ * a length field is invalid, or memory cannot be allocated. Nothing is
+ * leaked on failure.
+ */
+bam_header_t *bam_header_read(bamFile fp)
+{
+       bam_header_t *header;
+       char buf[4];
+       int32_t i, name_len, l_text;
+       // read "BAM1"
+       if (bam_read(fp, buf, 4) != 4) return 0;
+       if (strncmp(buf, "BAM\001", 4)) {
+               fprintf(stderr, "[bam_header_read] wrong header\n");
+               return 0;
+       }
+       header = bam_header_init();
+       if (header == 0) return 0;
+       // read plain text and the number of reference sequences
+       if (bam_read(fp, &header->l_text, 4) != 4) goto fail;
+       if (bam_is_be) bam_swap_endian_4p(&header->l_text);
+       l_text = (int32_t)header->l_text;
+       if (l_text < 0) goto fail; // a length coming from the file cannot be trusted
+       header->text = (char*)calloc((size_t)l_text + 1, 1); // +1 keeps the text NUL-terminated
+       if (header->text == 0) goto fail;
+       if (l_text > 0 && bam_read(fp, header->text, l_text) != l_text) goto fail;
+       if (bam_read(fp, &header->n_targets, 4) != 4) goto fail;
+       if (bam_is_be) bam_swap_endian_4p(&header->n_targets);
+       // report a bad count as an error instead of aborting the whole program
+       if (header->n_targets <= 0) goto fail;
+       // read reference sequence names and lengths
+       header->target_name = (char**)calloc(header->n_targets, sizeof(char*));
+       if (header->target_name == 0) goto fail;
+       header->target_len = (uint32_t*)calloc(header->n_targets, 4);
+       if (header->target_len == 0) goto fail;
+       for (i = 0; i != header->n_targets; ++i) {
+               if (bam_read(fp, &name_len, 4) != 4) goto fail;
+               if (bam_is_be) bam_swap_endian_4p(&name_len);
+               if (name_len <= 0) goto fail;
+               header->target_name[i] = (char*)calloc(name_len, 1);
+               if (header->target_name[i] == 0) goto fail;
+               if (bam_read(fp, header->target_name[i], name_len) != name_len) goto fail;
+               // names are later used as C strings; never trust the file to terminate them
+               header->target_name[i][name_len - 1] = '\0';
+               if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail;
+               if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]);
+       }
+       return header;
+
+fail:
+       fprintf(stderr, "[bam_header_read] truncated or invalid header\n");
+       // entries not yet read are still null from calloc(), so a partial header is safe to destroy
+       bam_header_destroy(header);
+       return 0;
+}
+
+/*
+ * Write `header' to `fp': the magic "BAM\001", the header text (length +
+ * bytes, the bytes being skipped when the length is zero), the number of
+ * reference sequences and, for each of them, its NUL-terminated name
+ * (length + bytes) and its length. Every integer is byte-swapped first when
+ * `bam_is_be' is set. Write errors are not detected; always returns 0.
+ */
+int bam_header_write(bamFile fp, const bam_header_t *header)
+{
+       char magic[4];
+       uint32_t word; // one 4-byte field, already in on-disk byte order
+       int32_t i, name_len;
+       // write "BAM1"
+       memcpy(magic, "BAM\001", 4);
+       bam_write(fp, magic, 4);
+       // write plain text and the number of reference sequences
+       word = header->l_text;
+       if (bam_is_be) word = bam_swap_endian_4(word);
+       bam_write(fp, &word, 4);
+       if (header->l_text) bam_write(fp, header->text, header->l_text);
+       word = header->n_targets;
+       if (bam_is_be) word = bam_swap_endian_4(word);
+       bam_write(fp, &word, 4);
+       // write sequence names and lengths
+       for (i = 0; i != header->n_targets; ++i) {
+               char *name = header->target_name[i];
+               name_len = strlen(name) + 1; // the terminating NUL is stored too
+               word = name_len;
+               if (bam_is_be) word = bam_swap_endian_4(word);
+               bam_write(fp, &word, 4);
+               bam_write(fp, name, name_len);
+               word = header->target_len[i];
+               if (bam_is_be) word = bam_swap_endian_4(word);
+               bam_write(fp, &word, 4);
+       }
+       return 0;
+}
+
+/*
+ * Byte-swap, in place, the multi-byte values stored in the variable-length
+ * part of an alignment: every CIGAR word, then every 2-byte ('S'/'s') and
+ * 4-byte ('I'/'i'/'F'/'f') auxiliary value. One-byte values ('A', 'C'/'c')
+ * and NUL-terminated strings ('Z', 'H') are only skipped. The type letter
+ * is upper-cased before it is examined; an unrecognised type skips nothing
+ * beyond the 2-byte key and the type byte.
+ */
+static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data)
+{
+       uint32_t *cigar = (uint32_t*)(data + c->l_qname);
+       uint8_t *aux = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2;
+       uint8_t *end = data + data_len;
+       uint32_t k;
+       for (k = 0; k < c->n_cigar; ++k)
+               bam_swap_endian_4p(&cigar[k]);
+       while (aux < end) {
+               uint8_t type;
+               aux += 2; // skip key
+               type = toupper(*aux);
+               ++aux; // skip type
+               switch (type) {
+               case 'C': case 'A':
+                       ++aux;
+                       break;
+               case 'S':
+                       bam_swap_endian_2p(aux);
+                       aux += 2;
+                       break;
+               case 'I': case 'F':
+                       bam_swap_endian_4p(aux);
+                       aux += 4;
+                       break;
+               case 'Z': case 'H':
+                       while (*aux) ++aux;
+                       ++aux; // step over the terminating NUL
+                       break;
+               default:
+                       break;
+               }
+       }
+}
+
+/*
+ * Read one alignment record from `fp' into `b', growing b->data as needed.
+ *
+ * Returns the number of bytes consumed (4 + block length) on success, or a
+ * negative value on failure:
+ *   -1  normal end-of-file (nothing could be read)
+ *   -2  truncated block length
+ *   -3  truncated core record
+ *   -4  truncated or inconsistent variable-length data, or out of memory
+ * On failure b->data remains a valid buffer owned by `b'.
+ */
+int bam_read1(bamFile fp, bam1_t *b)
+{
+       bam1_core_t *c = &b->core;
+       int32_t block_len, ret, i;
+       uint32_t x[8];
+
+       assert(BAM_CORE_SIZE == 32);
+       if ((ret = bam_read(fp, &block_len, 4)) != 4) {
+               if (ret == 0) return -1; // normal end-of-file
+               else return -2; // truncated
+       }
+       if (bam_read(fp, x, BAM_CORE_SIZE) != BAM_CORE_SIZE) return -3;
+       if (bam_is_be) {
+               bam_swap_endian_4p(&block_len);
+               for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i);
+       }
+       // a block shorter than the fixed core would yield a negative data length
+       if (block_len < BAM_CORE_SIZE) return -4;
+       c->tid = x[0]; c->pos = x[1];
+       c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
+       c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
+       c->l_qseq = x[4];
+       c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7];
+       b->data_len = block_len - BAM_CORE_SIZE;
+       if (b->m_data < b->data_len) {
+               int32_t new_size = b->data_len;
+               uint8_t *new_data;
+               kroundup32(new_size);
+               // keep the old buffer (and its recorded capacity) if realloc() fails
+               new_data = (uint8_t*)realloc(b->data, new_size);
+               if (new_data == 0) return -4;
+               b->data = new_data;
+               b->m_data = new_size;
+       }
+       if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4;
+       b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2;
+       // the core fields claim more data than the block holds: corrupted record
+       if (b->l_aux < 0) return -4;
+       if (bam_is_be) swap_endian_data(c, b->data_len, b->data);
+       return 4 + block_len;
+}
+
+/*
+ * Write one alignment record to `fp': the block length, the 32-byte core
+ * packed into eight 32-bit words, then `data_len' bytes of `data'.
+ *
+ * When `bam_is_be' is set, the words are byte-swapped and `data' is
+ * temporarily swapped in place for the write, then swapped back before
+ * returning. Write errors are not detected; returns 4 + block length.
+ */
+inline int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data)
+{
+       uint32_t core[8], block_len, swapped_len;
+       int i;
+       assert(BAM_CORE_SIZE == 32);
+       block_len = data_len + BAM_CORE_SIZE;
+       // pack the core fields into their on-disk words
+       core[0] = c->tid;
+       core[1] = c->pos;
+       core[2] = (uint32_t)c->bin<<16 | c->qual<<8 | c->l_qname;
+       core[3] = (uint32_t)c->flag<<16 | c->n_cigar;
+       core[4] = c->l_qseq;
+       core[5] = c->mtid;
+       core[6] = c->mpos;
+       core[7] = c->isize;
+       if (!bam_is_be) {
+               bam_write(fp, &block_len, 4);
+       } else {
+               for (i = 0; i < 8; ++i) bam_swap_endian_4p(core + i);
+               swapped_len = block_len; // swap a copy: block_len is still needed for the return value
+               bam_write(fp, bam_swap_endian_4p(&swapped_len), 4);
+               swap_endian_data(c, data_len, data);
+       }
+       bam_write(fp, core, BAM_CORE_SIZE);
+       bam_write(fp, data, data_len);
+       // undo the in-place swap so the caller's record is left unchanged
+       if (bam_is_be) swap_endian_data(c, data_len, data);
+       return 4 + block_len;
+}
+
+/*
+ * Write the alignment `b' to `fp' by handing its core and data to
+ * bam_write1_core(); returns that function's result.
+ */
+int bam_write1(bamFile fp, const bam1_t *b)
+{
+       const bam1_core_t *core = &b->core;
+       return bam_write1_core(fp, core, b->data_len, b->data);
+}
+
+/*
+ * Print the alignment `b' to stdout as one tab-separated text line: query
+ * name, flag, target name, 1-based position, mapping quality, CIGAR, mate
+ * target name, 1-based mate position, insert size, sequence, qualities
+ * (offset by 33) and the auxiliary fields as key:type:value.
+ *
+ * A negative tid/mtid or an empty CIGAR is printed as '*'. Target names are
+ * looked up in `header'.
+ */
+void bam_view1(const bam_header_t *header, const bam1_t *b)
+{
+       uint8_t *s = bam1_seq(b), *t = bam1_qual(b);
+       int i;
+       const bam1_core_t *c = &b->core;
+       printf("%s\t%d\t", bam1_qname(b), c->flag);
+       if (c->tid < 0) printf("*\t");
+       else printf("%s\t", header->target_name[c->tid]);
+       printf("%d\t%d\t", c->pos + 1, c->qual);
+       if (c->n_cigar == 0) putchar('*');
+       else {
+               for (i = 0; i < c->n_cigar; ++i)
+                       printf("%u%c", (unsigned)(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT), "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]);
+       }
+       putchar('\t');
+       if (c->mtid < 0) printf("*\t");
+       else printf("%s\t", header->target_name[c->mtid]);
+       printf("%d\t%d\t", c->mpos + 1, c->isize);
+       for (i = 0; i < c->l_qseq; ++i) putchar(bam_nt16_rev_table[bam1_seqi(s, i)]);
+       putchar('\t');
+       for (i = 0; i < c->l_qseq; ++i) putchar(t[i] + 33);
+       s = bam1_aux(b);
+       while (s < b->data + b->data_len) {
+               uint8_t type, key[2];
+               key[0] = s[0]; key[1] = s[1];
+               s += 2; type = *s; ++s;
+               printf("\t%c%c:", key[0], key[1]);
+               // Values sit at arbitrary byte offsets, so multi-byte ones are copied
+               // out with memcpy() rather than read through a misaligned pointer cast.
+               if (type == 'A') { printf("A:%c", *s); ++s; }
+               else if (type == 'C') { printf("i:%u", *s); ++s; }
+               else if (type == 'c') { printf("i:%d", (int8_t)*s); ++s; } // signed byte: keep the sign
+               else if (type == 'S') { uint16_t v; memcpy(&v, s, 2); printf("i:%u", v); s += 2; }
+               else if (type == 's') { int16_t v; memcpy(&v, s, 2); printf("i:%d", v); s += 2; }
+               else if (type == 'I') { uint32_t v; memcpy(&v, s, 4); printf("i:%u", v); s += 4; }
+               else if (type == 'i') { int32_t v; memcpy(&v, s, 4); printf("i:%d", v); s += 4; }
+               else if (type == 'f') { float v; memcpy(&v, s, 4); printf("f:%g", v); s += 4; }
+               else if (type == 'Z' || type == 'H') { printf("%c:", type); while (*s) putchar(*s++); ++s; }
+       }
+       putchar('\n');
+}
diff --git a/bam.h b/bam.h
new file mode 100644 (file)
index 0000000..4b3a688
--- /dev/null
+++ b/bam.h
@@ -0,0 +1,659 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef BAM_BAM_H
+#define BAM_BAM_H
+
+/*!
+  @header
+
+  BAM library provides I/O and various operations on manipulating files
+  in the BAM (Binary Alignment/Mapping) or TAM (Text Alignment/Mapping)
+  format. It now supports importing from or exporting to TAM, sorting,
+  merging, generating pileup, and quick retrieval of reads overlapped
+  with a specified region.
+
+  @copyright Genome Research Ltd.
+ */
+
+#include <stdint.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+#if _IOLIB == 1
+#define BAM_TRUE_OFFSET
+#include "razf.h"
+/*! @abstract BAM file handler */
+typedef RAZF *bamFile;
+#define bam_open(fn, mode) razf_open(fn, mode)
+#define bam_dopen(fd, mode) razf_dopen(fd, mode)
+#define bam_close(fp) razf_close(fp)
+#define bam_read(fp, buf, size) razf_read(fp, buf, size)
+#define bam_write(fp, buf, size) razf_write(fp, buf, size)
+#define bam_tell(fp) razf_tell(fp)
+#define bam_seek(fp, pos, dir) razf_seek(fp, pos, dir)
+#elif _IOLIB == 2
+#define BAM_VIRTUAL_OFFSET16
+#include "bgzf.h"
+/*! @abstract BAM file handler */
+typedef BGZF *bamFile;
+#define bam_open(fn, mode) bgzf_open(fn, mode)
+#define bam_dopen(fd, mode) bgzf_fdopen(fd, mode)
+#define bam_close(fp) bgzf_close(fp)
+#define bam_read(fp, buf, size) bgzf_read(fp, buf, size)
+#define bam_write(fp, buf, size) bgzf_write(fp, buf, size)
+#define bam_tell(fp) bgzf_tell(fp)
+#define bam_seek(fp, pos, dir) bgzf_seek(fp, pos, dir)
+#elif _IOLIB == 3
+#define BAM_VIRTUAL_OFFSET16
+#include "razf.h"
+/*! @abstract BAM file handler */
+typedef RAZF *bamFile;
+#define bam_open(fn, mode) razf_open2(fn, mode)
+#define bam_dopen(fd, mode) razf_dopen2(fd, mode)
+#define bam_close(fp) razf_close(fp)
+#define bam_read(fp, buf, size) razf_read(fp, buf, size)
+#define bam_write(fp, buf, size) razf_write(fp, buf, size)
+#define bam_tell(fp) razf_tell2(fp)
+#define bam_seek(fp, pos, dir) razf_seek2(fp, pos, dir)
+#endif
+
+/*! @typedef
+  @abstract Structure for the alignment header.
+  @field n_targets   number of reference sequences
+  @field target_name names of the reference sequences
+  @field target_len  lengths of the reference sequences
+  @field hash        hash table for fast name lookup
+  @field l_text      length of the plain text in the header
+  @field text        plain text
+
+  @discussion Field hash points to null by default. It is a private
+  member.
+ */
+typedef struct {
+       int32_t n_targets;
+       char **target_name;
+       uint32_t *target_len;
+       void *hash;
+       int l_text;
+       char *text;
+} bam_header_t;
+
+/*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */
+#define BAM_FPAIRED        1
+/*! @abstract the read is mapped in a proper pair */
+#define BAM_FPROPER_PAIR   2
+/*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */
+#define BAM_FUNMAP         4
+/*! @abstract the mate is unmapped */
+#define BAM_FMUNMAP        8
+#define BAM_FREVERSE      16
+#define BAM_FMREVERSE     32
+#define BAM_FREAD1        64
+#define BAM_FREAD2       128
+#define BAM_FSECONDARY   256
+
+#define BAM_CORE_SIZE   sizeof(bam1_core_t)
+
+/**
+ * Describing how CIGAR operation/length is packed in a 32-bit integer.
+ */
+#define BAM_CIGAR_SHIFT 4
+#define BAM_CIGAR_MASK  ((1 << BAM_CIGAR_SHIFT) - 1)
+
+/*
+  CIGAR operations.
+ */
+/*! @abstract CIGAR: match */
+#define BAM_CMATCH      0
+/*! @abstract CIGAR: insertion to the reference */
+#define BAM_CINS        1
+/*! @abstract CIGAR: deletion from the reference */
+#define BAM_CDEL        2
+/*! @abstract CIGAR: skip on the reference (e.g. spliced alignment) */
+#define BAM_CREF_SKIP   3
+/*! @abstract CIGAR: clip on the read with clipped sequence present in qseq */
+#define BAM_CSOFT_CLIP  4
+/*! @abstract CIGAR: clip on the read with clipped sequence trimmed off */
+#define BAM_CHARD_CLIP  5
+/*! @abstract CIGAR: padding */
+#define BAM_CPAD        6
+
+/*! @typedef
+  @abstract Structure for core alignment information.
+  @field  tid     chromosome ID, defined by bam_header_t
+  @field  pos     0-based leftmost coordinate
+  @field  bin     bin calculated by bam_reg2bin()
+  @field  qual    mapping quality
+  @field  l_qname length of the query name
+  @field  flag    bitwise flag
+  @field  n_cigar number of CIGAR operations
+  @field  l_qseq  length of the query sequence (read)
+  @field  mtid    chromosome ID used for the mate column of the text output;
+                  negative when not available (presumably the mate's tid)
+  @field  mpos    0-based coordinate printed in the mate position column
+                  (presumably the mate's leftmost coordinate)
+  @field  isize   value printed in the insert size column (presumably the
+                  inferred insert size; confirm against the format spec)
+
+  @discussion There is no strand member: the strand is derived from the
+  flag with the bam1_strand() and bam1_mstrand() macros.
+ */
+typedef struct {
+       int32_t tid;
+       int32_t pos;
+       uint32_t bin:16, qual:8, l_qname:8;
+       uint32_t flag:16, n_cigar:16;
+       int32_t l_qseq;
+       int32_t mtid;
+       int32_t mpos;
+       int32_t isize;
+} bam1_core_t;
+
+/*! @typedef
+  @abstract Structure for one alignment.
+  @field  core       core information about the alignment
+  @field  l_aux      length of auxiliary data
+  @field  data_len   current length of bam1_t::data
+  @field  m_data     maximum length of bam1_t::data
+  @field  data       all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux
+  @field  hash       hash table for fast retrieval of tag-value pairs; private
+
+  @discussion Notes:
+   1. qname is NUL-terminated and core.l_qname includes the trailing '\0'.
+   2. l_qseq is calculated from the total length of an alignment block
+      on reading or from CIGAR.
+ */
+typedef struct {
+       bam1_core_t core;
+       int l_aux, data_len, m_data;
+       uint8_t *data;
+       void *hash;
+} bam1_t;
+
+#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0)
+#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0)
+
+/*! @function
+  @abstract  Get the CIGAR array
+  @param  b  pointer to an alignment
+  @return    pointer to the CIGAR array
+
+  @discussion In the CIGAR array, each element is a 32-bit integer. The
+  lower 4 bits gives a CIGAR operation and the higher 28 bits keep the
+  length of a CIGAR.
+ */
+#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname))
+
+/*! @function
+  @abstract  Get the name of the query
+  @param  b  pointer to an alignment
+  @return    pointer to the name string, null terminated
+ */
+#define bam1_qname(b) ((char*)((b)->data))
+
+/*! @function
+  @abstract  Get query sequence
+  @param  b  pointer to an alignment
+  @return    pointer to sequence
+
+  @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G,
+  8 for T and 15 for N. Two bases are packed in one byte with the base
+  at the higher 4 bits having smaller coordinate on the read. It is
+  recommended to use bam1_seqi() macro to get the base.
+ */
+#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname)
+
+/*! @function
+  @abstract  Get query quality
+  @param  b  pointer to an alignment
+  @return    pointer to quality string
+ */
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2)
+
+/*! @function
+  @abstract  Get a base on read
+  @param  s  Query sequence returned by bam1_seq()
+  @param  i  The i-th position, 0-based
+  @return    4-bit integer representing the base.
+ */
+#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf)
+
+/*! @function
+  @abstract  Get auxiliary data
+  @param  b  pointer to an alignment
+  @return    pointer to the concatenated auxiliary data
+ */
+#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2)
+
+/* Three begin/end coordinate pairs; this is the type of the reg argument
+   of bam_segreg().
+   NOTE(review): presumably q = query (read), t = target (reference) and
+   c = CIGAR index; confirm against the bam_segreg() implementation. */
+typedef struct {
+       int32_t qbeg, qend;
+       int32_t tbeg, tend;
+       int32_t cbeg, cend;
+} bam_segreg_t;
+
+#ifndef kroundup32
+/*! @function
+  @abstract  Round an integer to the next closest power-2 integer.
+  @param  x  integer to be rounded (in place)
+  @discussion x will be modified.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*!
+  @abstract Whether the machine is big-endian; modified only in
+  bam_header_init().
+ */
+extern int bam_is_be;
+
+/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
+extern unsigned char bam_nt16_table[256];
+
+/*! @abstract Table for converting a 4-bit encoded nucleotide to a letter. */
+extern char *bam_nt16_rev_table;
+
+extern char bam_nt16_nt4_table[];
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       /*! @abstract TAM file handler */
+       typedef struct __tamFile_t *tamFile;
+
+       /*!
+         @abstract   Open a TAM file, either uncompressed or compressed by gzip/zlib.
+         @param  fn  TAM file name
+         @return     TAM file handler
+        */
+       tamFile sam_open(const char *fn);
+
+       /*!
+         @abstract   Close a TAM file handler
+         @param  fp  TAM file handler
+        */
+       void sam_close(tamFile fp);
+
+       /*!
+         @abstract      Read one alignment from a TAM file handler
+         @param  fp     TAM file handler
+         @param  header header information (ordered names of chromosomes)
+         @param  b      read alignment; all members in b will be updated
+         @return        0 if successful; otherwise negative
+        */
+       int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b);
+
+       /*!
+         @abstract       Read header information from a TAB-delimited list file.
+         @param  fn_list file name for the list
+         @return         a pointer to the header structure
+
+         @discussion Each line in this file consists of chromosome name and
+         the length of chromosome.
+        */
+       bam_header_t *sam_header_read2(const char *fn_list);
+
+#define sam_write1(header, b) bam_view1(header, b)
+
+       /*!
+         @abstract Initialize a header structure.
+         @return   the pointer to the header structure
+
+         @discussion This function also modifies the global variable
+         bam_is_be.
+        */
+       bam_header_t *bam_header_init();
+
+       /*!
+         @abstract        Destroy a header structure.
+         @param  header  pointer to the header
+        */
+       void bam_header_destroy(bam_header_t *header);
+
+       /*!
+         @abstract   Read a header structure from BAM.
+         @param  fp  BAM file handler, opened by bam_open()
+         @return     pointer to the header structure
+
+         @discussion The file position indicator must be placed at the
+         beginning of the file. Upon success, the position indicator will
+         be set at the start of the first alignment.
+        */
+       bam_header_t *bam_header_read(bamFile fp);
+
+       /*!
+         @abstract      Write a header structure to BAM.
+         @param  fp     BAM file handler
+         @param  header pointer to the header structure
+         @return        always 0 currently
+        */
+       int bam_header_write(bamFile fp, const bam_header_t *header);
+
+       /*!
+         @abstract   Read an alignment from BAM.
+         @param  fp  BAM file handler
+         @param  b   read alignment; all members are updated.
+         @return     number of bytes read from the file
+
+         @discussion The file position indicator must be
+         placed right before an alignment. Upon success, this function
+         will set the position indicator to the start of the next
+         alignment. This function is not affected by the machine
+         endianness.
+        */
+       int bam_read1(bamFile fp, bam1_t *b);
+
+       /*!
+         @abstract Write an alignment to BAM.
+         @param  fp       BAM file handler
+         @param  c        pointer to the bam1_core_t structure
+         @param  data_len total length of variable size data related to
+                          the alignment
+         @param  data     pointer to the concatenated data
+         @return          number of bytes written to the file
+
+         @discussion This function is not affected by the machine
+         endianness.
+        */
+       int bam_write1_core(bamFile fp, const bam1_core_t *c, int data_len, uint8_t *data);
+
+       /*!
+         @abstract   Write an alignment to BAM.
+         @param  fp  BAM file handler
+         @param  b   alignment to write
+         @return     number of bytes written to the file
+
+         @discussion It is equivalent to:
+           bam_write1_core(fp, &b->core, b->data_len, b->data)
+        */
+       int bam_write1(bamFile fp, const bam1_t *b);
+
+       /*! @function
+         @abstract  Initiate a pointer to bam1_t struct
+        */
+#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t)))
+
+       /*! @function
+         @abstract  Free the memory allocated for an alignment.
+         @param  b  pointer to an alignment
+        */
+#define bam_destroy1(b) do {                                                                                   \
+               if ((b)->hash) bam_aux_destroy(b); free((b)->data); free(b);    \
+       } while (0)
+
+       /*!
+         @abstract       Print an alignment to the standard output in TAM format.
+         @param  header  pointer to the header structure
+         @param  b       alignment to print
+        */
+       void bam_view1(const bam_header_t *header, const bam1_t *b);
+
+       /*!
+         @abstract    Merge multiple sorted BAM.
+         @param  is_by_qname whether to sort by query name
+         @param  out  output BAM file name
+         @param  n    number of files to be merged
+         @param  fn   names of files to be merged
+
+         @discussion Padding information may NOT be correctly maintained. This
+         function is NOT thread safe.
+        */
+       void bam_merge_core(int is_by_qname, const char *out, int n, char * const *fn);
+
+       /*!
+         @abstract Sort an unsorted BAM file based on the chromosome order
+         and the leftmost position of an alignment
+
+         @param  is_by_qname whether to sort by query name
+         @param  fn       name of the file to be sorted
+         @param  prefix   prefix of the output and the temporary files; upon
+                          success, prefix.bam will be written.
+         @param  max_mem  approximate maximum memory (very inaccurate)
+
+         @discussion It may create multiple temporary subalignment files
+         and then merge them by calling bam_merge_core(). This function is
+         NOT thread safe.
+        */
+       void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem);
+
+       /*! @typedef
+         @abstract Structure for one alignment covering the pileup position.
+         @field  b      pointer to the alignment
+         @field  qpos   position of the read base at the pileup site, 0-based
+         @field  indel  indel length; 0 for no indel, positive for ins and negative for del
+         @field  is_del 1 iff the base on the padded read is a deletion
+         @field  level  the level of the read in the "viewer" mode
+
+         @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The
+         difference between the two functions is that the former does not
+         set bam_pileup1_t::level, while the latter does. Level helps the
+         implementation of alignment viewers, but calculating this has some
+         overhead.
+        */
+       typedef struct {
+               bam1_t *b;
+               int32_t qpos;
+               int indel, level;
+               uint32_t is_del:1, is_head:1, is_tail:1;
+       } bam_pileup1_t;
+
+       struct __bam_plbuf_t;
+       /*! @abstract pileup buffer */
+       typedef struct __bam_plbuf_t bam_plbuf_t;
+
+       /*! @typedef
+         @abstract    Type of function to be called by bam_plbuf_push().
+         @param  tid  chromosome ID as is defined in the header
+         @param  pos  start coordinate of the alignment, 0-based
+         @param  n    number of elements in pl array
+         @param  pl   array of alignments
+         @param  data user provided data
+         @discussion  See also bam_plbuf_push(), bam_plbuf_init() and bam_pileup1_t.
+        */
+       typedef int (*bam_pileup_f)(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data);
+
+       void bam_plbuf_reset(bam_plbuf_t *buf);
+
+       /*!
+         @abstract     Initialize a buffer for pileup.
+         @param  func  function to be called by bam_pileup_core()
+         @param  data  user provided data
+         @return       pointer to the pileup buffer
+        */
+       bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data);
+
+       /*!
+         @abstract    Destroy a pileup buffer.
+         @param  buf  pointer to the pileup buffer
+        */
+       void bam_plbuf_destroy(bam_plbuf_t *buf);
+
+       /*!
+         @abstract    Push an alignment to the pileup buffer.
+         @param  b    alignment to be pushed
+         @param  buf  pileup buffer
+         @see         bam_plbuf_init()
+         @return      always 0 currently
+
+         @discussion If all the alignments covering a particular site have
+         been collected, this function will call the user defined function
+         as is provided to bam_plbuf_init(). The coordinate of the site and
+         all the alignments will be transferred to the user defined
+         function as function parameters.
+        
+         When all the alignments are pushed to the buffer, this function
+         needs to be called with b equal to NULL. This will flush the
+         buffer. A pileup buffer cannot be reused.
+        */
+       int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf);
+
+       /*!
+         @abstract         A more convenient interface to bam_plbuf_push()
+         @param  fp        BAM file handler
+         @param  func      user defined function
+         @param  func_data user provided data
+
+         @discussion The file position indicator must be placed right
+         before the start of an alignment. See also bam_plbuf_push().
+        */
+       int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data);
+
+       struct __bam_lplbuf_t;
+       typedef struct __bam_lplbuf_t bam_lplbuf_t;
+
+       void bam_lplbuf_reset(bam_lplbuf_t *buf);
+
+       /*! @abstract  bam_plbuf_init() equivalent with level calculated. */
+       bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data);
+
+       /*! @abstract  bam_plbuf_destroy() equivalent with level calculated. */
+       void bam_lplbuf_destroy(bam_lplbuf_t *tv);
+
+       /*! @abstract  bam_plbuf_push() equivalent with level calculated. */
+       int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *buf);
+
+       /*! @abstract  bam_pileup_file() equivalent with level calculated. */
+       int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data);
+
+       struct __bam_index_t;
+       typedef struct __bam_index_t bam_index_t;
+
+       /*!
+         @abstract   Build index for a BAM file.
+         @discussion Index file "fn.bai" will be created.
+         @param  fn  name of the BAM file
+         @return     always 0 currently
+        */
+       int bam_index_build(const char *fn);
+
+       /*!
+         @abstract   Load index from file "fn.bai".
+         @param  fn  name of the BAM file (NOT the index file)
+         @return     pointer to the index structure
+        */
+       bam_index_t *bam_index_load(const char *fn);
+
+       /*!
+         @abstract    Destroy an index structure.
+         @param  idx  pointer to the index structure
+        */
+       void bam_index_destroy(bam_index_t *idx);
+
+       /*! @typedef
+         @abstract      Type of function to be called by bam_fetch().
+         @param  b     the alignment
+         @param  data  user provided data
+        */
+       typedef int (*bam_fetch_f)(const bam1_t *b, void *data);
+
+       /*!
+         @abstract Retrieve the alignments that are overlapped with the
+         specified region.
+
+         @discussion A user defined function will be called for each
+         retrieved alignment ordered by its start position.
+
+         @param  fp    BAM file handler
+         @param  idx   pointer to the alignment index
+         @param  tid   chromosome ID as is defined in the header
+         @param  beg   start coordinate, 0-based
+         @param  end   end coordinate, 0-based
+         @param  data  user provided data (will be transferred to func)
+         @param  func  user defined function
+        */
+       int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func);
+
+       /*!
+         @abstract       Parse a region in the format: "chr2:100,000-200,000".
+         @discussion     bam_header_t::hash will be initialized if empty.
+         @param  header  pointer to the header structure
+         @param  str     string to be parsed
+         @param  ref_id  the returned chromosome ID
+         @param  begin   the returned start coordinate
+         @param  end     the returned end coordinate
+        */
+       void bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end);
+
+       int32_t bam_aux_geti(bam1_t *b, const char tag[2], int *err);
+       float bam_aux_getf(bam1_t *b, const char tag[2], int *err);
+       char bam_aux_getc(bam1_t *b, const char tag[2], int *err);
+       char *bam_aux_getZH(bam1_t *b, const char tag[2], int *err);
+       void bam_aux_destroy(bam1_t *b);
+
+       /*!  
+         @abstract Calculate the rightmost coordinate of an alignment on the
+         reference genome.
+
+         @param  c      pointer to the bam1_core_t structure
+         @param  cigar  the corresponding CIGAR array (from bam1_t::cigar)
+         @return        the rightmost coordinate, 0-based
+       */
+       uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar);
+
+       /*!
+         @abstract      Calculate the length of the query sequence from CIGAR.
+         @param  c      pointer to the bam1_core_t structure
+         @param  cigar  the corresponding CIGAR array (from bam1_t::cigar)
+         @return        length of the query sequence
+       */
+       int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar);
+
+       int bam_segreg(int32_t pos, const bam1_core_t *c, const uint32_t *cigar, bam_segreg_t *reg);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*!
+  @abstract    Find the finest bin that contains the whole region [beg,end).
+  @param  beg  start of the region, 0-based
+  @param  end  end of the region, 0-based; the last position covered is end-1
+  @return      bin number, or 0 when no finer level holds the whole region
+ */
+static inline int bam_reg2bin(uint32_t beg, uint32_t end)
+{
+       /* Levels are tried from the finest (shift 14) to the coarsest
+          (shift 26). The first bin numbers of the levels are 4681, 585, 73,
+          9 and 1; each one is the previous one divided by 8, rounded down,
+          which is what the offset >>= 3 step computes. */
+       const uint32_t last = end - 1;
+       int shift, offset;
+       for (shift = 14, offset = 4681; shift <= 26; shift += 3, offset >>= 3) {
+               if ((beg >> shift) == (last >> shift))
+                       return offset + (beg >> shift);
+       }
+       return 0;
+}
+
+/*!
+  @abstract     Copy an alignment into an existing bam1_t.
+  @param  bdst  destination; its data buffer is kept and grown when its
+                capacity is smaller than the capacity of the source
+  @param  bsrc  alignment to copy
+
+  @discussion The private tag hash is never shared between the two
+  records: a hash owned by the destination is destroyed and the
+  destination hash is left null, so that it is rebuilt on demand. The
+  program is aborted if the buffer cannot be grown.
+ */
+static inline void bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
+{
+       uint8_t *data = bdst->data;
+       int m_data = bdst->m_data;   // backup data and m_data
+       if (bdst == bsrc) return;    // nothing to do; also avoids an overlapping memcpy
+       if (m_data < bsrc->m_data) { // grow to at least the capacity of the source
+               uint8_t *grown;
+               m_data = bsrc->m_data; kroundup32(m_data);
+               grown = (uint8_t*)realloc(data, m_data);
+               if (grown == 0) { // do not hand a null pointer to memcpy
+                       fprintf(stderr, "[bam_copy1] fail to allocate %d bytes\n", m_data);
+                       abort();
+               }
+               data = grown;
+       }
+       if (bsrc->data_len > 0) memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
+       // the old hash indexes the data that has just been overwritten
+       if (bdst->hash) bam_aux_destroy(bdst);
+       *bdst = *bsrc; // copy the rest
+       // restore the backup
+       bdst->m_data = m_data;
+       bdst->data = data;
+       // the hash of the source points into the buffer of the source; do not alias it
+       bdst->hash = 0;
+}
+
+#endif
diff --git a/bam_aux.c b/bam_aux.c
new file mode 100644 (file)
index 0000000..081f07b
--- /dev/null
+++ b/bam_aux.c
@@ -0,0 +1,160 @@
+#include <ctype.h>
+#include "bam.h"
+#include "khash.h"
+KHASH_MAP_INIT_INT(aux, uint8_t*)
+KHASH_MAP_INIT_STR(s, int)
+
+/* Build the target name -> target index lookup table of a header.
+   Does nothing when header->hash is already set. */
+void bam_init_header_hash(bam_header_t *header)
+{
+       khash_t(s) *table;
+       int idx;
+       if (header->hash != 0) return;
+       table = kh_init(s);
+       header->hash = table;
+       for (idx = 0; idx < header->n_targets; ++idx) {
+               int absent;
+               khiter_t slot = kh_put(s, table, header->target_name[idx], &absent);
+               kh_value(table, slot) = idx;
+       }
+}
+
+/* Free the name lookup table of a header, if there is one, and reset
+   header->hash so that the freed table can neither be used nor freed again. */
+void bam_destroy_header_hash(bam_header_t *header)
+{
+       if (header->hash) {
+               kh_destroy(s, (khash_t(s)*)header->hash);
+               header->hash = 0;
+       }
+}
+
+/* Look up the index of a target by name; returns -1 when the name is not
+   in the table. The table is read from header->hash as is: this function
+   does not call bam_init_header_hash(). */
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name)
+{
+       khash_t(s) *table = (khash_t(s)*)header->hash;
+       khint_t slot = kh_get(s, table, seq_name);
+       if (slot == kh_end(table)) return -1;
+       return kh_value(table, slot);
+}
+
+/* Parse a region such as "chr2:100,000-200,000".
+
+   header  header structure; its name hash is built here if it is empty
+   str     region string; commas and white space are ignored
+   ref_id  receives the target index, or -1 if the name is unknown or the
+           working copy cannot be allocated (begin and end are then left
+           untouched)
+   begin   receives the start coordinate; a positive start is decremented
+           by one
+   end     receives the end coordinate; 1<<29 when no end is given
+
+   A name without ":" selects the whole sequence (begin 0, end 1<<29). */
+void bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end)
+{
+       char *s, *p;
+       int i, l, k;
+       khiter_t iter;
+       khash_t(s) *h;
+
+       bam_init_header_hash(header);
+       h = (khash_t(s)*)header->hash;
+
+       l = strlen(str);
+       s = (char*)malloc(l+1);
+       if (s == 0) { // out of memory: report it like an unknown name
+               *ref_id = -1;
+               return;
+       }
+       /* squeeze out "," and white space; isspace() is only defined for
+          unsigned char values, hence the cast */
+       for (i = k = 0; i != l; ++i)
+               if (str[i] != ',' && !isspace((unsigned char)str[i])) s[k++] = str[i];
+       s[k] = 0;
+       for (i = 0; i != k; ++i) if (s[i] == ':') break;
+       s[i] = 0;
+       iter = kh_get(s, h, s); /* get the ref_id */
+       if (iter == kh_end(h)) { // name not found
+               *ref_id = -1; free(s);
+               return;
+       }
+       *ref_id = kh_value(h, iter);
+       if (i == k) { /* dump the whole sequence */
+               *begin = 0; *end = 1<<29; free(s);
+               return;
+       }
+       for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+       *begin = atoi(p);
+       if (i < k) {
+               p = s + i + 1;
+               *end = atoi(p);
+       } else *end = 1<<29;
+       if (*begin > 0) --*begin;
+       assert(*begin <= *end);
+       free(s);
+}
+
+/* (Re)build the tag lookup table of an alignment.
+
+   The table is created on first use and cleared otherwise. Each auxiliary
+   field is stored under the key tag[0]<<8 | tag[1]; the stored pointer
+   addresses the type byte of the field, which is what the bam_aux_get*()
+   functions read first. Nothing is printed. Parsing stops at a type whose
+   size is unknown or when fewer than three bytes remain. */
+void bam_aux_init(bam1_t *b)
+{
+       khash_t(aux) *h;
+       uint8_t *s, *end;
+       if (b->hash == 0) {
+               h = kh_init(aux);
+               b->hash = h;
+       } else {
+               h = (khash_t(aux)*)b->hash;
+               kh_clear(aux, h);
+       }
+       s = bam1_aux(b);
+       end = b->data + b->data_len;
+       while (end - s >= 3) { /* room for the 2-byte tag and the type byte */
+               uint32_t x = (uint32_t)s[0]<<8 | s[1];
+               int ret, type;
+               khint_t k;
+               s += 2;
+               k = kh_put(aux, h, x, &ret);
+               kh_value(h, k) = s; /* type byte, followed by the value */
+               type = toupper(*s); ++s;
+               if (type == 'A' || type == 'C') ++s;        /* A, c, C: one byte */
+               else if (type == 'S') s += 2;               /* s, S: two bytes */
+               else if (type == 'I' || type == 'F') s += 4; /* i, I, f: four bytes */
+               else if (type == 'Z' || type == 'H') {      /* NUL-terminated string */
+                       while (s < end && *s) ++s;
+                       ++s;
+               }
+               else break; /* unknown size: the following bytes cannot be located */
+       }
+}
+/* Release the tag lookup table attached to an alignment and mark it as
+   absent by resetting b->hash. */
+void bam_aux_destroy(bam1_t *b)
+{
+       kh_destroy(aux, (khash_t(aux)*)b->hash);
+       b->hash = 0;
+}
+/* Return the table entry stored for a two-character tag, or 0 when the
+   alignment has no such tag. The lookup table is built on first use. */
+static uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
+{
+       /* Build the key from unsigned bytes, exactly as bam_aux_init() does;
+          a plain char may be signed and would otherwise sign-extend. */
+       uint32_t x = (uint32_t)(uint8_t)tag[0]<<8 | (uint8_t)tag[1];
+       khint_t k;
+       khash_t(aux) *h;
+       if (b->hash == 0) bam_aux_init(b);
+       h = (khash_t(aux)*)b->hash;
+       k = kh_get(aux, h, x);
+       if (k == kh_end(h)) return 0;
+       return kh_value(h, k);
+}
+/* Fetch an integer auxiliary field (types c, C, s, S, i, I).
+   *err is set to 0 on success, -1 when the tag is absent and -2 when the
+   field has another type; 0 is returned in both error cases. */
+int32_t bam_aux_geti(bam1_t *b, const char tag[2], int *err)
+{
+       uint8_t *field = bam_aux_get_core(b, tag);
+       uint8_t *val;
+       *err = 0;
+       if (field == 0) {
+               *err = -1;
+               return 0;
+       }
+       val = field + 1; /* the value follows the type byte */
+       switch (*field) {
+       case 'c': return (int32_t)*(int8_t*)val;
+       case 'C': return (int32_t)*(uint8_t*)val;
+       case 's': return (int32_t)*(int16_t*)val;
+       case 'S': return (int32_t)*(uint16_t*)val;
+       case 'i':
+       case 'I': return *(int32_t*)val;
+       default:
+               *err = -2;
+               return 0;
+       }
+}
+/* Fetch a float ('f') auxiliary field.
+   *err is set to 0 on success, -1 when the tag is absent and -2 when the
+   field has another type; 0 is returned in both error cases. */
+float bam_aux_getf(bam1_t *b, const char tag[2], int *err)
+{
+       int type;
+       uint8_t *s = bam_aux_get_core(b, tag);
+       *err = 0;
+       /* a missing tag must be detected before the field is read */
+       if (s == 0) { *err = -1; return 0; }
+       type = *s++;
+       if (type == 'f') return *(float*)s;
+       else { *err = -2; return 0; }
+}
+/* Fetch a one-byte auxiliary field as a char. Both the character type 'A'
+   and the one-byte type 'c' are accepted.
+   *err is set to 0 on success, -1 when the tag is absent and -2 when the
+   field has another type; 0 is returned in both error cases. */
+char bam_aux_getc(bam1_t *b, const char tag[2], int *err)
+{
+       int type;
+       uint8_t *s = bam_aux_get_core(b, tag);
+       *err = 0;
+       /* a missing tag must be detected before the field is read */
+       if (s == 0) { *err = -1; return 0; }
+       type = *s++;
+       if (type == 'A' || type == 'c') return *(char*)s;
+       else { *err = -2; return 0; }
+}
+/* Fetch a string auxiliary field (type 'Z' or 'H'). The returned pointer
+   addresses the bytes held in b->data; it is not a copy.
+   *err is set to 0 on success, -1 when the tag is absent and -2 when the
+   field has another type; a null pointer is returned in both error cases. */
+char *bam_aux_getZH(bam1_t *b, const char tag[2], int *err)
+{
+       int type;
+       uint8_t *s = bam_aux_get_core(b, tag);
+       *err = 0;
+       /* a missing tag must be detected before the field is read */
+       if (s == 0) { *err = -1; return 0; }
+       type = *s++;
+       if (type == 'Z' || type == 'H') return (char*)s;
+       else { *err = -2; return 0; }
+}
diff --git a/bam_endian.h b/bam_endian.h
new file mode 100644 (file)
index 0000000..0fc74a8
--- /dev/null
@@ -0,0 +1,42 @@
+#ifndef BAM_ENDIAN_H
+#define BAM_ENDIAN_H
+
+#include <stdint.h>
+
+/* Return 1 on a big-endian machine and 0 otherwise: the first byte in
+   memory of the value 1 is zero only when the low-order byte is not stored
+   first. */
+static inline int bam_is_big_endian()
+{
+       const long probe = 1;
+       const char *first = (const char *)&probe;
+       return *first == 0;
+}
+/* Return v with its two bytes exchanged. */
+static inline uint16_t bam_swap_endian_2(uint16_t v)
+{
+       const uint32_t up = (v & 0x00FF00FFU) << 8;   /* low byte moved up */
+       const uint32_t down = (v & 0xFF00FF00U) >> 8; /* high byte moved down */
+       return (uint16_t)(up | down);
+}
+/* Byte-swap, in place, the 16-bit value x points to; returns x. */
+static inline void *bam_swap_endian_2p(void *x)
+{
+       uint16_t *word = (uint16_t*)x;
+       *word = bam_swap_endian_2(*word);
+       return x;
+}
+/* Return v with its four bytes in reverse order. */
+static inline uint32_t bam_swap_endian_4(uint32_t v)
+{
+       /* exchange the 16-bit halves, then the bytes inside each half */
+       const uint32_t halves = (v << 16) | (v >> 16);
+       const uint32_t low_bytes = halves & 0x00FF00FFU;
+       const uint32_t high_bytes = halves & 0xFF00FF00U;
+       return (low_bytes << 8) | (high_bytes >> 8);
+}
+/* Byte-swap, in place, the 32-bit value x points to; returns x. */
+static inline void *bam_swap_endian_4p(void *x)
+{
+       uint32_t *word = (uint32_t*)x;
+       *word = bam_swap_endian_4(*word);
+       return x;
+}
+/* Return v with its eight bytes in reverse order. */
+static inline uint64_t bam_swap_endian_8(uint64_t v)
+{
+       /* exchange the 32-bit halves, then the 16-bit quarters, then the bytes */
+       const uint64_t halves = (v << 32) | (v >> 32);
+       const uint64_t quarters = ((halves & 0x0000FFFF0000FFFFULL) << 16)
+               | ((halves & 0xFFFF0000FFFF0000ULL) >> 16);
+       const uint64_t low_bytes = quarters & 0x00FF00FF00FF00FFULL;
+       const uint64_t high_bytes = quarters & 0xFF00FF00FF00FF00ULL;
+       return (low_bytes << 8) | (high_bytes >> 8);
+}
+/* Byte-swap, in place, the 64-bit value x points to; returns x. */
+static inline void *bam_swap_endian_8p(void *x)
+{
+       uint64_t *word = (uint64_t*)x;
+       *word = bam_swap_endian_8(*word);
+       return x;
+}
+
+#endif
diff --git a/bam_import.c b/bam_import.c
new file mode 100644 (file)
index 0000000..6b3b4bc
--- /dev/null
@@ -0,0 +1,372 @@
+#include <zlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <assert.h>
+#include "bam.h"
+#include "kseq.h"
+#include "khash.h"
+
+KSTREAM_INIT(gzFile, gzread, 8192)
+KHASH_MAP_INIT_STR(ref, uint64_t)
+
+void bam_init_header_hash(bam_header_t *header);
+void bam_destroy_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+/*
+ * Lookup table from a character code (0-255) to a 4-bit nucleotide code.
+ * 'A'/'a' -> 1, 'C'/'c' -> 2, 'G'/'g' -> 4, 'T'/'t' -> 8, '=' -> 0; the other
+ * letters with entries below are combinations of those bits, and the digits
+ * '0'..'3' map to 1, 2, 4, 8. Every other character maps to 15.
+ * sam_read1() uses it to pack two bases into each byte of the record.
+ */
+unsigned char bam_nt16_table[256] = {
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+        1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+/* Inverse of bam_nt16_table: the character at index i is the letter for 4-bit nucleotide code i (0 is '=', 15 is 'N'). */
+char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
+
+/* State of a SAM text stream opened by sam_open() and released by sam_close(). */
+struct __tamFile_t {
+       gzFile fp; // zlib handle on the input file (or on stdin)
+       kstream_t *ks; // buffered tokenizer created over fp with ks_init()
+       kstring_t *str; // scratch buffer receiving each token read by sam_read1()
+       uint64_t n_lines; // lines consumed so far; reported in parse-error messages
+};
+
+/*
+ * Load a list of "name position" records, one per line, from a file.
+ *
+ * The first two whitespace-delimited tokens of each record are used; anything
+ * else up to the end of the line is skipped. Each returned entry is a single
+ * calloc()ed buffer holding the NUL-terminated name immediately followed by
+ * the position (atoi() of the second token) as a 4-byte integer in host byte
+ * order. That integer is not necessarily aligned, so read it with memcpy().
+ *
+ * fn  file to read through zlib, or "-" for standard input
+ * _n  receives the number of entries loaded
+ *
+ * Returns the array of *_n entries, or a null pointer when nothing was loaded
+ * (including when the file cannot be opened). The caller owns the array and
+ * every entry in it.
+ */
+char **bam_load_pos(const char *fn, int *_n)
+{
+       char **list = 0, *s;
+       int n = 0, dret, m = 0, c;
+       uint32_t pos;
+       gzFile fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+       kstream_t *ks;
+       kstring_t *str;
+       *_n = 0;
+       if (fp == 0) return 0; // could not open the input: report an empty list
+       str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) > 0) {
+               if (n == m) {
+                       m = m? m << 1 : 16;
+                       list = (char**)realloc(list, m * sizeof(char*));
+               }
+               s = list[n++] = (char*)calloc(str->l + 5, 1); // name + NUL + 4-byte position
+               strcpy(s, str->s);
+               s += str->l + 1;
+               ks_getuntil(ks, 0, str, &dret);
+               pos = atoi(str->s);
+               memcpy(s, &pos, 4); // the slot after the name is not necessarily 4-byte aligned
+               if (dret != '\n') // discard the rest of the line; ks_getc() takes the stream, not the gzFile
+                       while ((c = ks_getc(ks)) >= 0 && c != '\n');
+       }
+       ks_destroy(ks);
+       gzclose(fp); // ks_destroy() does not close the underlying file
+       free(str->s); free(str);
+       *_n = n;
+       return list;
+}
+
+/*
+ * Convert a name -> (length<<32 | index) hash into a BAM header.
+ * The key pointers are moved into header->target_name without copying, so the
+ * header shares the key strings with the hash.
+ */
+static bam_header_t *hash2header(const kh_ref_t *hash)
+{
+       khiter_t it;
+       bam_header_t *hdr = bam_header_init();
+       hdr->n_targets = kh_size(hash);
+       hdr->target_name = (char**)calloc(kh_size(hash), sizeof(char*));
+       hdr->target_len = (uint32_t*)calloc(kh_size(hash), 4);
+       for (it = kh_begin(hash); it != kh_end(hash); ++it) {
+               uint64_t packed;
+               int rank;
+               if (!kh_exist(hash, it)) continue;
+               packed = kh_value(hash, it);
+               rank = (int)packed; // low 32 bits: position of the sequence in the list
+               hdr->target_name[rank] = (char*)kh_key(hash, it);
+               hdr->target_len[rank] = packed>>32; // high 32 bits: sequence length
+       }
+       bam_init_header_hash(hdr);
+       return hdr;
+}
+/*
+ * Build a BAM header from a reference list file.
+ *
+ * Each line supplies a sequence name and its length as the first two
+ * whitespace-delimited tokens; the rest of the line is ignored. Sequences are
+ * numbered in order of first appearance. A name that occurs again is reported
+ * on stderr and skipped, so the first entry wins.
+ *
+ * fn  file to read through zlib, or "-" for standard input
+ *
+ * Returns a newly allocated header. The process aborts via assert() when the
+ * file cannot be opened.
+ */
+bam_header_t *sam_header_read2(const char *fn)
+{
+       bam_header_t *header;
+       int c, dret, ret;
+       gzFile fp;
+       kstream_t *ks;
+       kstring_t *str;
+       kh_ref_t *hash;
+       khiter_t k;
+       hash = kh_init(ref);
+       fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
+       assert(fp);
+       ks = ks_init(fp);
+       str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+               char *s = strdup(str->s);
+               int len, i;
+               i = kh_size(hash); // index of this sequence if it turns out to be new
+               ks_getuntil(ks, 0, str, &dret);
+               len = atoi(str->s);
+               k = kh_put(ref, hash, s, &ret);
+               if (ret == 0) {
+                       // The key already exists. Overwriting its value would give two names the same
+                       // index, or an index equal to the final size, and hash2header() would then
+                       // write past the end of its arrays. The hash did not take the new key, so free it.
+                       fprintf(stderr, "[sam_header_read2] duplicated sequence name '%s'; keeping the first entry.\n", s);
+                       free(s);
+               } else kh_value(hash, k) = (uint64_t)len<<32 | i;
+               if (dret != '\n')
+                       while ((c = ks_getc(ks)) != '\n' && c != -1);
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
+       header = hash2header(hash);
+       kh_destroy(ref, hash); // the key strings are now referenced by header->target_name
+       return header;
+}
+/*
+ * Make sure b->data can hold at least `size' bytes and return it.
+ * When growing, the capacity b->m_data becomes `size' rounded up by
+ * kroundup32(). The result of realloc() is not checked.
+ */
+static inline uint8_t *alloc_data(bam1_t *b, int size)
+{
+       if (!(b->m_data < size)) return b->data; // already large enough
+       b->m_data = size;
+       kroundup32(b->m_data);
+       b->data = (uint8_t*)realloc(b->data, b->m_data);
+       return b->data;
+}
+/* Report a fatal parse error for input line `n_lines' on stderr and abort the process. Does not return. */
+static inline void parse_error(int64_t n_lines, const char * __restrict msg)
+{
+       const long long line_no = (long long)n_lines;
+       fprintf(stderr, "Parse error at line %lld: %s\n", line_no, msg);
+       abort();
+}
+/*
+ * Append the first str->l + 1 bytes of str->s to header->text and keep the
+ * text NUL-terminated. The extra byte is str->s[str->l], where the caller has
+ * stored the delimiter in place of the terminator. The buffer is treated as
+ * having a capacity of l_text rounded up by kroundup32() and is grown to the
+ * rounded-up new size when that is larger.
+ */
+static inline void append_text(bam_header_t *header, kstring_t *str)
+{
+       const int chunk = str->l + 1; // token plus the delimiter byte
+       int cap_now = header->l_text;
+       int cap_need = header->l_text + chunk + 1; // one more byte for the terminating NUL
+       kroundup32(cap_now); kroundup32(cap_need);
+       if (cap_now < cap_need) header->text = (char*)realloc(header->text, cap_need);
+       // strncpy() rather than strcpy(): str->s carries no NUL terminator at this point
+       strncpy(header->text + header->l_text, str->s, chunk);
+       header->l_text += chunk;
+       header->text[header->l_text] = 0;
+}
+/*
+ * Parse the next alignment line of a SAM text stream into a BAM record.
+ *
+ * Lines starting with '@' that precede the record are appended verbatim to
+ * header->text. The record's variable-length data (query name, CIGAR, packed
+ * sequence, qualities, auxiliary fields) is laid out in b->data in that order.
+ *
+ * fp      stream opened by sam_open()
+ * header  header used to resolve reference names; its text may be extended
+ * b       record to fill; b->data is grown as needed and may be reused
+ *
+ * Returns 0 on success, -1 when the input ends before a record starts, and
+ * -2 .. -6 when it ends inside the fixed fields, CIGAR, mate fields, sequence
+ * or quality respectively. Malformed content aborts through parse_error().
+ *
+ * NOTE(review): when the CIGAR is "*", c->bin is left untouched, so it keeps
+ * whatever value the record held before. Confirm whether callers rely on it.
+ */
+int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b)
+{
+       int ret, doff, doff0, dret;
+       bam1_core_t *c = &b->core;
+       kstring_t *str = fp->str;
+       kstream_t *ks = fp->ks;
+
+       while ((ret = ks_getuntil(fp->ks, 0, str, &dret)) >= 0 && str->s[0] == '@') { // skip header
+               str->s[str->l] = dret; // note that str->s is NOT null terminated!!
+               append_text(header, str);
+               if (dret != '\n') {
+                       ret = ks_getuntil(fp->ks, '\n', str, &dret);
+                       str->s[str->l] = '\n'; // NOT null terminated!!
+                       append_text(header, str);
+               }
+               ++fp->n_lines;
+       }
+       while (ret == 0) ret = ks_getuntil(fp->ks, 0, str, &dret); // special consideration for "\r\n"
+       if (ret < 0) return -1;
+       ++fp->n_lines;
+       doff = 0; // write offset into b->data
+
+       { // name
+               c->l_qname = strlen(str->s) + 1;
+               memcpy(alloc_data(b, doff + c->l_qname) + doff, str->s, c->l_qname);
+               doff += c->l_qname;
+       }
+       { // flag, tid, pos, qual
+               ret = ks_getuntil(ks, 0, str, &dret); c->flag = atoi(str->s);
+               ret = ks_getuntil(ks, 0, str, &dret); c->tid = bam_get_tid(header, str->s);
+               ret = ks_getuntil(ks, 0, str, &dret); c->pos = isdigit((unsigned char)str->s[0])? atoi(str->s) - 1 : -1;
+               ret = ks_getuntil(ks, 0, str, &dret); c->qual = isdigit((unsigned char)str->s[0])? atoi(str->s) : 0;
+               if (ret < 0) return -2;
+       }
+       { // cigar
+               char *s, *t;
+               int i, op;
+               long x;
+               c->n_cigar = 0;
+               if (ks_getuntil(ks, 0, str, &dret) < 0) return -3;
+               if (str->s[0] != '*') {
+                       // first pass: count operations and reject anything that is not a letter or digit
+                       for (s = str->s; *s; ++s) {
+                               if (isalpha((unsigned char)*s)) ++c->n_cigar;
+                               else if (!isdigit((unsigned char)*s)) parse_error(fp->n_lines, "invalid CIGAR character");
+                       }
+                       b->data = alloc_data(b, doff + c->n_cigar * 4);
+                       // second pass: encode each <length><op> pair
+                       for (i = 0, s = str->s; i != c->n_cigar; ++i) {
+                               x = strtol(s, &t, 10);
+                               op = toupper((unsigned char)*t);
+                               if (op == 'M') op = BAM_CMATCH;
+                               else if (op == 'I') op = BAM_CINS;
+                               else if (op == 'D') op = BAM_CDEL;
+                               else if (op == 'N') op = BAM_CREF_SKIP;
+                               else if (op == 'S') op = BAM_CSOFT_CLIP;
+                               else if (op == 'H') op = BAM_CHARD_CLIP;
+                               else if (op == 'P') op = BAM_CPAD;
+                               else parse_error(fp->n_lines, "invalid CIGAR operation");
+                               s = t + 1;
+                               bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
+                       }
+                       if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation");
+                       c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b)));
+                       doff += c->n_cigar * 4;
+               }
+       }
+       { // mtid, mpos, isize
+               ret = ks_getuntil(ks, 0, str, &dret); c->mtid = strcmp(str->s, "=")? bam_get_tid(header, str->s) : c->tid;
+               ret = ks_getuntil(ks, 0, str, &dret); c->mpos = isdigit((unsigned char)str->s[0])? atoi(str->s) - 1 : -1;
+               ret = ks_getuntil(ks, 0, str, &dret); c->isize = (str->s[0] == '-' || isdigit((unsigned char)str->s[0]))? atoi(str->s) : 0;
+               if (ret < 0) return -4;
+       }
+       { // seq and qual
+               int i;
+               uint8_t *p;
+               if (ks_getuntil(ks, 0, str, &dret) < 0) return -5; // seq
+               c->l_qseq = strlen(str->s);
+               if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b)))
+                       parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent");
+               p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff;
+               memset(p, 0, (c->l_qseq+1)/2); // memset() comes from <string.h>, which this file includes
+               // two bases per byte, first base in the high nibble; index the table with an
+               // unsigned byte so characters >= 0x80 cannot produce a negative index
+               for (i = 0; i < c->l_qseq; ++i)
+                       p[i/2] |= bam_nt16_table[(unsigned char)str->s[i]] << 4*(1-i%2);
+               if (ks_getuntil(ks, 0, str, &dret) < 0) return -6; // qual
+               if (c->l_qseq != strlen(str->s))
+                       parse_error(fp->n_lines, "sequence and quality are inconsistent");
+               p += (c->l_qseq+1)/2;
+               for (i = 0; i < c->l_qseq; ++i) p[i] = str->s[i] - 33; // stored as character code minus 33
+               doff += c->l_qseq + (c->l_qseq+1)/2;
+       }
+       doff0 = doff; // start of the auxiliary data
+       if (dret != '\n' && dret != '\r') { // aux
+               while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+                       uint8_t *s, type, key[2];
+                       if (str->l < 6 || str->s[2] != ':' || str->s[4] != ':')
+                               parse_error(fp->n_lines, "missing colon in auxiliary data");
+                       key[0] = str->s[0]; key[1] = str->s[1];
+                       type = str->s[3];
+                       s = alloc_data(b, doff + 3) + doff;
+                       s[0] = key[0]; s[1] = key[1]; s += 2; doff += 2;
+                       if (type == 'A' || type == 'a') {
+                               s = alloc_data(b, doff + 2) + doff;
+                               *s++ = type; *s = str->s[5];
+                               doff += 2;
+                       } else if (type == 'I' || type == 'i') {
+                               // integers are stored in the smallest of the 1-, 2- and 4-byte encodings that fits
+                               long long x;
+                               s = alloc_data(b, doff + 5) + doff;
+                               x = (long long)atoll(str->s + 5);
+                               if (x < 0) {
+                                       if (x >= -127) {
+                                               *s++ = 'c'; *(int8_t*)s = (int8_t)x;
+                                               s += 1; doff += 2;
+                                       } else if (x >= -32767) {
+                                               *s++ = 's'; *(int16_t*)s = (int16_t)x;
+                                               s += 2; doff += 3;
+                                       } else {
+                                               *s++ = 'i'; *(int32_t*)s = (int32_t)x;
+                                               s += 4; doff += 5;
+                                               if (x < -2147483648ll)
+                                                       fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+                                                                       (long long)fp->n_lines, x);
+                                       }
+                               } else {
+                                       if (x <= 255) {
+                                               *s++ = 'C'; *s++ = (uint8_t)x;
+                                               doff += 2;
+                                       } else if (x <= 65535) {
+                                               *s++ = 'S'; *(uint16_t*)s = (uint16_t)x;
+                                               s += 2; doff += 3;
+                                       } else {
+                                               *s++ = 'I'; *(uint32_t*)s = (uint32_t)x;
+                                               s += 4; doff += 5;
+                                               if (x > 4294967295ll)
+                                                       fprintf(stderr, "Parse warning at line %lld: integer %lld is out of range.\n",
+                                                                       (long long)fp->n_lines, x);
+                                       }
+                               }
+                       } else if (type == 'f') {
+                               s = alloc_data(b, doff + 5) + doff;
+                               *s++ = 'f';
+                               *(float*)s = (float)atof(str->s + 5);
+                               s += 4; doff += 5;
+                       } else if (type == 'Z' || type == 'H') {
+                               int size = 1 + (str->l - 5) + 1; // type byte + text + NUL
+                               if (type == 'H') { // check whether the hex string is valid
+                                       int i;
+                                       if ((str->l - 5) % 2 == 1) parse_error(fp->n_lines, "length of the hex string not even");
+                                       for (i = 0; i < str->l - 5; ++i) {
+                                               int c = toupper((unsigned char)str->s[5 + i]);
+                                               if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
+                                                       parse_error(fp->n_lines, "invalid hex character");
+                                       }
+                               }
+                               s = alloc_data(b, doff + size) + doff;
+                               *s++ = type;
+                               memcpy(s, str->s + 5, str->l - 5);
+                               s[str->l - 5] = 0;
+                               doff += size;
+                       } else parse_error(fp->n_lines, "unrecognized type");
+                       if (dret == '\n' || dret == '\r') break;
+               }
+       }
+       b->l_aux = doff - doff0;
+       b->data_len = doff;
+       return 0;
+}
+
+/*
+ * Open a SAM text file for reading through zlib; "-" selects standard input.
+ * Returns a newly allocated stream to be released with sam_close(). Neither
+ * the allocations nor the open itself are checked for failure.
+ */
+tamFile sam_open(const char *fn)
+{
+       const int from_stdin = (strcmp(fn, "-") == 0);
+       tamFile fp = (tamFile)calloc(1, sizeof(struct __tamFile_t));
+       fp->str = (kstring_t*)calloc(1, sizeof(kstring_t));
+       if (from_stdin) fp->fp = gzdopen(fileno(stdin), "r");
+       else fp->fp = gzopen(fn, "r");
+       fp->ks = ks_init(fp->fp);
+       fp->n_lines = 0;
+       return fp;
+}
+
+/* Release a stream returned by sam_open(): tokenizer, zlib handle, scratch buffer and the stream itself. A null fp is ignored. */
+void sam_close(tamFile fp)
+{
+       if (!fp) return;
+       ks_destroy(fp->ks);
+       gzclose(fp->fp);
+       free(fp->str->s);
+       free(fp->str);
+       free(fp);
+}
+
+/*
+ * Convert the SAM file `fntaf' into the BAM file `fnbaf' using `header'.
+ * The first record is read before the header is written because sam_read1()
+ * appends any leading '@' lines to the header text while looking for it.
+ */
+static void taf2baf_core(const char *fntaf, const char *fnbaf, bam_header_t *header)
+{
+       bam1_t *rec = (bam1_t*)calloc(1, sizeof(bam1_t));
+       bamFile out = bam_open(fnbaf, "w");
+       tamFile in = sam_open(fntaf);
+       int first = sam_read1(in, header, rec);
+
+       bam_header_write(out, header);
+       if (first >= 0) {
+               do bam_write1(out, rec);
+               while (sam_read1(in, header, rec) >= 0);
+       }
+       bam_close(out);
+       free(rec->data);
+       free(rec);
+       sam_close(in);
+}
+
+/*
+ * Command entry point: import <in.ref_list> <in.sam> <out.bam>.
+ * Returns 1 after printing the usage line when fewer than three operands are
+ * given, otherwise 0.
+ */
+int bam_taf2baf(int argc, char *argv[])
+{
+       bam_header_t *header;
+       char **operands;
+
+       while (getopt(argc, argv, "") >= 0) continue; // no options are defined; this only advances optind
+       if (argc - optind < 3) {
+               fprintf(stderr, "Usage: bamtk import <in.ref_list> <in.sam> <out.bam>\n");
+               return 1;
+       }
+       operands = argv + optind;
+       header = sam_header_read2(operands[0]);
+       taf2baf_core(operands[1], operands[2], header);
+       bam_header_destroy(header);
+       return 0;
+}
diff --git a/bam_index.c b/bam_index.c
new file mode 100644 (file)
index 0000000..2b01815
--- /dev/null
@@ -0,0 +1,452 @@
+#include <ctype.h>
+#include "bam.h"
+#include "khash.h"
+#include "ksort.h"
+#include "bam_endian.h"
+
+/*!
+  @header
+
+  Alignment indexing. Before indexing, BAM must be sorted based on the
+  leftmost coordinate of alignments. In indexing, BAM uses two indices:
+  a UCSC binning index and a simple linear index. The binning index is
+  efficient for alignments spanning long distance, while the auxiliary
+  linear index helps to reduce unnecessary seek calls especially for
+  short alignments.
+
+  The UCSC binning scheme was suggested by Richard Durbin and Lincoln
+  Stein and is explained by Kent et al. (2002). In this scheme, each bin
+  represents a contiguous genomic region which can be fully contained in
+  another bin; each alignment is associated with a bin which represents
+  the smallest region containing the entire alignment. The binning
+  scheme is essentially another representation of R-tree. A distinct bin
+  uniquely corresponds to a distinct internal node in a R-tree. Bin A is
+  a child of Bin B if region A is contained in B.
+
+  In BAM, each bin may span 2^29, 2^26, 2^23, 2^20, 2^17 or 2^14 bp. Bin
+  0 spans a 512Mbp region, bins 1-8 span 64Mbp, 9-72 8Mbp, 73-584 1Mbp,
+  585-4680 128Kbp and bins 4681-37449 span 16Kbp regions. If we want to
+  find the alignments overlapping a region [rbeg,rend), we need to
+  calculate the list of bins that may overlap the region and test
+  the alignments in the bins to confirm the overlaps. If the specified
+  region is short, typically only a few alignments in six bins need to
+  be retrieved. The overlapping alignments can be quickly fetched.
+
+ */
+
+#define BAM_MIN_CHUNK_GAP 32768 // in BAM_TRUE_OFFSET builds, chunks whose offsets are closer than this are merged
+#define BAM_LIDX_SHIFT    14 // a position shifted right by this gives its slot in the linear index (2^14 bp per slot)
+
+/* A chunk of the BAM file: u is the file offset where it begins and v the offset where it ends. */
+typedef struct {
+       uint64_t u, v;
+} pair64_t;
+
+// chunks are ordered by their start offset; this instantiates the `off' sort routines used by bam_fetch()
+#define pair64_lt(a,b) ((a).u < (b).u)
+KSORT_INIT(off, pair64_t, pair64_lt)
+
+/* Growable list of the chunks recorded for one bin. */
+typedef struct {
+       uint32_t m, n; // m: allocated capacity of list; n: number of entries in use
+       pair64_t *list;
+} bam_binlist_t;
+
+/* Linear index of one reference sequence: one file offset per window of 2^BAM_LIDX_SHIFT bp. */
+typedef struct {
+       int32_t n, m; // n: number of windows in use; m: allocated capacity of offset
+       uint64_t *offset;
+} bam_lidx_t;
+
+// hash type `i': bin number -> chunk list of that bin
+KHASH_MAP_INIT_INT(i, bam_binlist_t)
+
+/* Complete index of a BAM file; both arrays have one element per reference sequence. */
+struct __bam_index_t {
+       int32_t n; // number of reference sequences
+       khash_t(i) **index; // binning index: index[tid] maps a bin to its chunks
+       bam_lidx_t *index2; // linear index: index2[tid]
+};
+
+// Append the chunk (beg, end) to the chunk list of `bin' in hash `h', creating the list on first use.
+static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end)
+{
+       int absent;
+       khint_t slot = kh_put(i, h, bin, &absent);
+       bam_binlist_t *chunks = &kh_value(h, slot);
+       pair64_t *dst;
+       if (absent) { // first chunk of this bin: start with room for a single entry
+               chunks->m = 1; chunks->n = 0;
+               chunks->list = (pair64_t*)calloc(chunks->m, 16);
+       }
+       if (chunks->n == chunks->m) { // list is full: double its capacity
+               chunks->m <<= 1;
+               chunks->list = (pair64_t*)realloc(chunks->list, chunks->m * 16);
+       }
+       dst = &chunks->list[chunks->n++];
+       dst->u = beg;
+       dst->v = end;
+}
+
+/*
+ * Record `offset' in the linear index for every window after `last' up to and
+ * including `curr', growing the array as needed. When `last' is greater than
+ * `curr' the fill starts at window 0. Afterwards index2->n is curr + 1.
+ */
+static inline void insert_offset2(bam_lidx_t *index2, int last, int curr, uint64_t offset)
+{
+       int w;
+       const int needed = curr + 1;
+       if (index2->m < needed) {
+               index2->m = needed;
+               kroundup32(index2->m);
+               index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+       }
+       w = (last > curr)? 0 : last + 1;
+       while (w <= curr) index2->offset[w++] = offset;
+       index2->n = needed;
+}
+
+/*
+ * Compact the chunk list of every bin by merging each chunk into its
+ * predecessor when the two are close together. The function does nothing
+ * unless BAM_TRUE_OFFSET or BAM_VIRTUAL_OFFSET16 is defined. "Close" means
+ * less than BAM_MIN_CHUNK_GAP apart with BAM_TRUE_OFFSET, and otherwise that
+ * the offsets are equal after dropping their low 16 bits.
+ */
+static void merge_chunks(bam_index_t *idx)
+{
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+       khash_t(i) *index;
+       int i, l, m;
+       khint_t k;
+       for (i = 0; i < idx->n; ++i) {
+               index = idx->index[i];
+               for (k = kh_begin(index); k != kh_end(index); ++k) {
+                       bam_binlist_t *p;
+                       if (!kh_exist(index, k)) continue;
+                       p = &kh_value(index, k);
+                       m = 0; // index of the last chunk kept so far
+                       for (l = 1; l < p->n; ++l) {
+#ifdef BAM_TRUE_OFFSET
+                               if (p->list[m].v + BAM_MIN_CHUNK_GAP > p->list[l].u) p->list[m].v = p->list[l].v;
+#else
+                               if (p->list[m].v>>16 == p->list[l].u>>16) p->list[m].v = p->list[l].v;
+#endif
+                               else p->list[++m] = p->list[l]; // too far apart: keep as a separate chunk
+                       } // ~for(l)
+                       p->n = m + 1;
+               } // ~for(k)
+       } // ~for(i)
+#endif // defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+}
+
+/*
+ * Build the binning and linear indices of a coordinate-sorted BAM stream.
+ *
+ * fp  BAM file positioned at its start; the header is read first
+ *
+ * Returns a newly allocated index to be released with bam_index_destroy().
+ * The process exits when a record is out of order within a reference
+ * sequence or when the file offset fails to advance after a record.
+ *
+ * NOTE(review): a record whose tid is negative would index idx->index2 and
+ * idx->index out of range. Confirm whether such records can reach this
+ * function.
+ */
+bam_index_t *bam_index_core(bamFile fp)
+{
+       bam1_t *b;
+       bam_header_t *h;
+       int i, ret;
+       bam_index_t *idx;
+       uint32_t last_coor, last_tid, last_bin, save_bin, save_tid;
+       bam1_core_t *c;
+       uint64_t save_off, last_off;
+
+       idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+       b = (bam1_t*)calloc(1, sizeof(bam1_t));
+       h = bam_header_read(fp);
+       c = &b->core;
+
+       idx->n = h->n_targets;
+       bam_header_destroy(h);
+       idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+       for (i = 0; i < idx->n; ++i) idx->index[i] = kh_init(i);
+       idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+
+       // 0xffffffffu marks "nothing seen yet" for bins, tids and coordinates
+       save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
+       save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
+       while ((ret = bam_read1(fp, b)) >= 0) {
+               if (last_tid != c->tid) { // change of chromosomes
+                       last_tid = c->tid;
+                       last_bin = 0xffffffffu;
+                       // Forget the previous chromosome's coordinate. Otherwise the linear index of
+                       // the new chromosome is filled only from the old window onwards, or not at all
+                       // when both positions fall in the same window number.
+                       last_coor = 0xffffffffu;
+               } else if (last_coor > c->pos) {
+                       fprintf(stderr, "[bam_index_core] the alignment is not sorted. Abort!\n");
+                       exit(1);
+               }
+               if (last_coor>>BAM_LIDX_SHIFT != b->core.pos>>BAM_LIDX_SHIFT) // then write the linear index
+                       insert_offset2(&idx->index2[b->core.tid], last_coor>>BAM_LIDX_SHIFT, b->core.pos>>BAM_LIDX_SHIFT, last_off);
+               if (c->bin != last_bin) { // then possibly write the binning index
+                       if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
+                               insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
+                       save_off = last_off;
+                       save_bin = last_bin = c->bin;
+                       save_tid = c->tid;
+               }
+               if (bam_tell(fp) <= last_off) {
+                       fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
+                                       (unsigned long long)bam_tell(fp), (unsigned long long)last_off);
+                       exit(1);
+               }
+               last_off = bam_tell(fp);
+               last_coor = b->core.pos;
+       }
+       // flush the last open chunk; with no records at all save_tid is still the 0xffffffffu marker
+       if (save_bin != 0xffffffffu)
+               insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
+       merge_chunks(idx);
+       if (ret < -1) fprintf(stderr, "[bam_index_core] truncated file? Continue anyway. (%d)\n", ret);
+       free(b->data); free(b);
+       return idx;
+}
+
+/* Free an index together with every per-reference hash, chunk list and linear index. A null idx is ignored. */
+void bam_index_destroy(bam_index_t *idx)
+{
+       int tid;
+       if (idx == 0) return;
+       for (tid = 0; tid < idx->n; ++tid) {
+               khash_t(i) *bins = idx->index[tid];
+               khint_t it = kh_begin(bins);
+               while (it != kh_end(bins)) {
+                       if (kh_exist(bins, it)) free(kh_value(bins, it).list);
+                       ++it;
+               }
+               kh_destroy(i, bins);
+               free(idx->index2[tid].offset);
+       }
+       free(idx->index);
+       free(idx->index2);
+       free(idx);
+}
+
+/*
+ * Write an index to `fp'.
+ *
+ * Layout: the 4-byte magic "BAI\1", the number of reference sequences, then
+ * for each reference its binning index (bin count, then per bin: bin number,
+ * chunk count and the chunk list) followed by its linear index (offset count
+ * and the offsets). When bam_is_be is non-zero every integer is byte-swapped
+ * before being written. Lists are swapped in place and swapped back after
+ * the write, so the in-memory index is unchanged on return. The results of
+ * fwrite() are not checked.
+ */
+void bam_index_save(const bam_index_t *idx, FILE *fp)
+{
+       int32_t i, size;
+       khint_t k;
+       fwrite("BAI\1", 1, 4, fp);
+       if (bam_is_be) {
+               uint32_t x = idx->n;
+               fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+       } else fwrite(&idx->n, 4, 1, fp);
+       for (i = 0; i < idx->n; ++i) {
+               khash_t(i) *index = idx->index[i];
+               bam_lidx_t *index2 = idx->index2 + i;
+               // write binning index
+               size = kh_size(index);
+               if (bam_is_be) { // big endian
+                       uint32_t x = size;
+                       fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+               } else fwrite(&size, 4, 1, fp);
+               for (k = kh_begin(index); k != kh_end(index); ++k) {
+                       if (kh_exist(index, k)) {
+                               bam_binlist_t *p = &kh_value(index, k);
+                               if (bam_is_be) { // big endian
+                                       uint32_t x;
+                                       x = kh_key(index, k); fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+                                       x = p->n; fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+                                       for (x = 0; (int)x < p->n; ++x) { // swap the whole list in place for a single write
+                                               bam_swap_endian_8p(&p->list[x].u);
+                                               bam_swap_endian_8p(&p->list[x].v);
+                                       }
+                                       fwrite(p->list, 16, p->n, fp);
+                                       for (x = 0; (int)x < p->n; ++x) { // restore host byte order
+                                               bam_swap_endian_8p(&p->list[x].u);
+                                               bam_swap_endian_8p(&p->list[x].v);
+                                       }
+                               } else {
+                                       fwrite(&kh_key(index, k), 4, 1, fp);
+                                       fwrite(&p->n, 4, 1, fp);
+                                       fwrite(p->list, 16, p->n, fp);
+                               }
+                       }
+               }
+               // write linear index (index2)
+               if (bam_is_be) {
+                       int x = index2->n;
+                       fwrite(bam_swap_endian_4p(&x), 4, 1, fp);
+               } else fwrite(&index2->n, 4, 1, fp);
+               if (bam_is_be) { // big endian
+                       int x;
+                       for (x = 0; (int)x < index2->n; ++x)
+                               bam_swap_endian_8p(&index2->offset[x]);
+                       fwrite(index2->offset, 8, index2->n, fp);
+                       for (x = 0; (int)x < index2->n; ++x) // restore host byte order
+                               bam_swap_endian_8p(&index2->offset[x]);
+               } else fwrite(index2->offset, 8, index2->n, fp);
+       }
+       fflush(fp);
+}
+
+/*
+ * Load the index stored next to a BAM file.
+ *
+ * fn  name of the BAM file; the index is read from "<fn>.bai"
+ *
+ * Returns a newly allocated index to be released with bam_index_destroy(),
+ * or a null pointer when the magic number is wrong or the file ends early.
+ * The process exits when the index file cannot be opened. When bam_is_be is
+ * non-zero every integer is byte-swapped after being read.
+ */
+bam_index_t *bam_index_load(const char *fn)
+{
+       bam_index_t *idx;
+       FILE *fp;
+       int i, n_init = 0; // n_init: number of references whose hash has been created
+       char *fnidx, magic[4];
+
+       fnidx = (char*)calloc(strlen(fn) + 5, 1);
+       strcpy(fnidx, fn); strcat(fnidx, ".bai");
+       fp = fopen(fnidx, "rb"); // the index is binary data
+       free(fnidx);
+       if (fp == 0) {
+               fprintf(stderr, "[bam_index_load] the alignment is not indexed. Please run `index' command first. Abort!\n");
+               exit(1);
+       }
+
+       // a short read must not be compared: magic[] would be partly uninitialised
+       if (fread(magic, 1, 4, fp) != 4 || strncmp(magic, "BAI\1", 4)) {
+               fprintf(stderr, "[bam_index_load] wrong magic number.\n");
+               fclose(fp);
+               return 0;
+       }
+       idx = (bam_index_t*)calloc(1, sizeof(bam_index_t));
+       if (fread(&idx->n, 4, 1, fp) != 1) goto truncated;
+       if (bam_is_be) bam_swap_endian_4p(&idx->n);
+       idx->index = (khash_t(i)**)calloc(idx->n, sizeof(void*));
+       idx->index2 = (bam_lidx_t*)calloc(idx->n, sizeof(bam_lidx_t));
+       for (i = 0; i < idx->n; ++i) {
+               khash_t(i) *index;
+               bam_lidx_t *index2 = idx->index2 + i;
+               uint32_t key, size;
+               khint_t k;
+               int j, ret;
+               bam_binlist_t *p;
+               index = idx->index[i] = kh_init(i);
+               n_init = i + 1;
+               // load binning index
+               if (fread(&size, 4, 1, fp) != 1) goto truncated;
+               if (bam_is_be) bam_swap_endian_4p(&size);
+               for (j = 0; j < (int)size; ++j) {
+                       if (fread(&key, 4, 1, fp) != 1) goto truncated;
+                       if (bam_is_be) bam_swap_endian_4p(&key);
+                       k = kh_put(i, index, key, &ret);
+                       p = &kh_value(index, k);
+                       p->n = p->m = 0; p->list = 0; // keeps the entry safe to destroy if a read below fails
+                       if (fread(&p->n, 4, 1, fp) != 1) goto truncated;
+                       if (bam_is_be) bam_swap_endian_4p(&p->n);
+                       p->m = p->n;
+                       p->list = (pair64_t*)malloc(p->m * 16);
+                       if (fread(p->list, 16, p->n, fp) != p->n) goto truncated;
+                       if (bam_is_be) {
+                               int x;
+                               for (x = 0; x < p->n; ++x) {
+                                       bam_swap_endian_8p(&p->list[x].u);
+                                       bam_swap_endian_8p(&p->list[x].v);
+                               }
+                       }
+               }
+               // load linear index
+               if (fread(&index2->n, 4, 1, fp) != 1) goto truncated;
+               if (bam_is_be) bam_swap_endian_4p(&index2->n);
+               index2->m = index2->n;
+               index2->offset = (uint64_t*)calloc(index2->m, 8);
+               if (fread(index2->offset, 8, index2->n, fp) != (size_t)index2->n) goto truncated;
+               if (bam_is_be)
+                       for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+       }
+       fclose(fp);
+       return idx;
+
+truncated:
+       fprintf(stderr, "[bam_index_load] the index file is truncated.\n");
+       fclose(fp);
+       idx->n = n_init; // destroy only the references that were actually set up
+       bam_index_destroy(idx);
+       return 0;
+}
+
+/*
+ * Index the BAM file `fn' and write the result to "<fn>.bai".
+ *
+ * Returns 0 on success and -1 when the BAM file cannot be opened or the
+ * index file cannot be created.
+ */
+int bam_index_build(const char *fn)
+{
+       char *fnidx;
+       FILE *fpidx;
+       bamFile fp;
+       bam_index_t *idx;
+       // open outside assert(): with NDEBUG the assert expression, and the open inside it, would vanish
+       if ((fp = bam_open(fn, "r")) == 0) {
+               fprintf(stderr, "[bam_index_build] fail to open the BAM file.\n");
+               return -1;
+       }
+       idx = bam_index_core(fp);
+       bam_close(fp);
+       fnidx = (char*)calloc(strlen(fn) + 5, 1);
+       strcpy(fnidx, fn); strcat(fnidx, ".bai");
+       fpidx = fopen(fnidx, "wb"); // the index is binary data
+       if (fpidx == 0) {
+               fprintf(stderr, "[bam_index_build] fail to create the index file.\n");
+               bam_index_destroy(idx);
+               free(fnidx);
+               return -1;
+       }
+       bam_index_save(idx, fpidx);
+       bam_index_destroy(idx);
+       fclose(fpidx);
+       free(fnidx);
+       return 0;
+}
+
+/*
+ * Command entry point: index <in.bam>.
+ * Returns 1 after printing the usage line when no file is given, or when
+ * bam_index_build() reports a non-zero status; otherwise 0.
+ */
+int bam_index(int argc, char *argv[])
+{
+       if (argc < 2) {
+               fprintf(stderr, "Usage: samtools index <in.bam>\n");
+               return 1;
+       }
+       // pass the build status on to the caller instead of discarding it
+       return bam_index_build(argv[1]) == 0 ? 0 : 1;
+}
+
+#define MAX_BIN 37450 // =(8^6-1)/7+1
+
+/*
+ * List every bin that may hold alignments overlapping the region [beg,end).
+ *
+ * beg, end  zero-based half-open region; `end' is clamped to 2^29
+ * list      receives the bin numbers; must have room for MAX_BIN entries
+ *
+ * Returns the number of bins written, or 0 for an empty region (beg >= end).
+ */
+static inline int reg2bins(uint32_t beg, uint32_t end, uint16_t list[MAX_BIN])
+{
+       int i = 0, k;
+       if (beg >= end) return 0; // empty region; also keeps "--end" from wrapping when end is 0
+       if (end >= 1u<<29) end = 1u<<29; // a larger end would generate more than MAX_BIN bins and overrun list
+       --end; // last position inside the region
+       list[i++] = 0;
+       for (k =    1 + (beg>>26); k <=    1 + (end>>26); ++k) list[i++] = k;
+       for (k =    9 + (beg>>23); k <=    9 + (end>>23); ++k) list[i++] = k;
+       for (k =   73 + (beg>>20); k <=   73 + (end>>20); ++k) list[i++] = k;
+       for (k =  585 + (beg>>17); k <=  585 + (end>>17); ++k) list[i++] = k;
+       for (k = 4681 + (beg>>14); k <= 4681 + (end>>14); ++k) list[i++] = k;
+       return i;
+}
+
+/* Returns 1 when alignment b, spanning from its position to bam_calend(), overlaps the region [beg,end), else 0. */
+static inline int is_overlap(uint32_t beg, uint32_t end, const bam1_t *b)
+{
+       const uint32_t aln_beg = b->core.pos;
+       const uint32_t aln_end = bam_calend(&b->core, bam1_cigar(b));
+       if (aln_end <= beg) return 0; // alignment ends before the region starts
+       return aln_beg < end;
+}
+
+/*
+ * Call `func' on every alignment that overlaps a region.
+ *
+ * fp        BAM file the index was built from
+ * idx       index returned by bam_index_load() or bam_index_core()
+ * tid       reference sequence number
+ * beg, end  zero-based half-open region [beg,end)
+ * data      passed unchanged to `func' as its second argument
+ * func      callback invoked as func(b, data) for each overlapping alignment
+ *
+ * Always returns 0. Nothing is reported when `tid' is outside the index or no
+ * chunk can contain an overlapping alignment.
+ */
+int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
+{
+       uint16_t *bins;
+       int i, n_bins, n_off;
+       pair64_t *off;
+       khint_t k;
+       khash_t(i) *index;
+       uint64_t min_off;
+
+       if (tid < 0 || tid >= idx->n) return 0; // no such reference sequence in the index
+       bins = (uint16_t*)calloc(MAX_BIN, 2);
+       n_bins = reg2bins(beg, end, bins);
+       index = idx->index[tid];
+       // chunks ending at or before this offset cannot contain an alignment reaching `beg'
+       min_off = (beg>>BAM_LIDX_SHIFT >= idx->index2[tid].n)? 0 : idx->index2[tid].offset[beg>>BAM_LIDX_SHIFT];
+       for (i = n_off = 0; i < n_bins; ++i) { // upper bound on the number of chunks
+               if ((k = kh_get(i, index, bins[i])) != kh_end(index))
+                       n_off += kh_value(index, k).n;
+       }
+       if (n_off == 0) {
+               free(bins); return 0;
+       }
+       off = (pair64_t*)calloc(n_off, 16);
+       for (i = n_off = 0; i < n_bins; ++i) {
+               if ((k = kh_get(i, index, bins[i])) != kh_end(index)) {
+                       int j;
+                       bam_binlist_t *p = &kh_value(index, k);
+                       for (j = 0; j < p->n; ++j)
+                               if (p->list[j].v > min_off) off[n_off++] = p->list[j];
+               }
+       }
+       free(bins);
+       if (n_off == 0) { // every chunk was filtered out; the merge step below would otherwise report one zeroed chunk
+               free(off); return 0;
+       }
+       {
+               bam1_t *b;
+               int ret, n_seeks;
+               uint64_t curr_off;
+               b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               ks_introsort(off, n_off, off);
+               // resolve overlaps between adjacent blocks; this may happen due to the merge in indexing
+               for (i = 1; i < n_off; ++i)
+                       if (off[i-1].v >= off[i].u) off[i-1].v = off[i].u;
+               { // merge adjacent blocks
+#if defined(BAM_TRUE_OFFSET) || defined(BAM_VIRTUAL_OFFSET16)
+                       int l;
+                       for (i = 1, l = 0; i < n_off; ++i) {
+#ifdef BAM_TRUE_OFFSET
+                               if (off[l].v + BAM_MIN_CHUNK_GAP > off[i].u) off[l].v = off[i].v;
+#else
+                               if (off[l].v>>16 == off[i].u>>16) off[l].v = off[i].v;
+#endif
+                               else off[++l] = off[i];
+                       }
+                       n_off = l + 1;
+#endif
+               }
+               // retrieve alignments
+               n_seeks = 0; i = -1; curr_off = 0;
+               for (;;) {
+                       if (curr_off == 0 || curr_off >= off[i].v) { // then jump to the next chunk
+                               if (i == n_off - 1) break; // no more chunks
+                               if (i >= 0) assert(curr_off == off[i].v); // otherwise bug
+                               if (i < 0 || off[i].v != off[i+1].u) { // not adjacent chunks; then seek
+                                       bam_seek(fp, off[i+1].u, SEEK_SET);
+                                       curr_off = bam_tell(fp);
+                                       ++n_seeks;
+                               }
+                               ++i;
+                       }
+                       if ((ret = bam_read1(fp, b)) > 0) {
+                               curr_off = bam_tell(fp);
+                               if (b->core.tid != tid || b->core.pos >= end) break; // no need to proceed
+                               else if (is_overlap(beg, end, b)) func(b, data);
+                       } else break; // end of file
+               }
+//             fprintf(stderr, "[bam_fetch] # seek calls: %d\n", n_seeks);
+               bam_destroy1(b);
+       }
+       free(off);
+       return 0;
+}
diff --git a/bam_lpileup.c b/bam_lpileup.c
new file mode 100644 (file)
index 0000000..83f91c2
--- /dev/null
@@ -0,0 +1,196 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "bam.h"
+#include "ksort.h"
+
+#define TV_GAP 2
+
+typedef struct __freenode_t { // one entry of the singly linked list of released levels walked by tview_func()
+       uint32_t level:28, cnt:4; // level: the released level number; cnt: countdown, decremented once per tview_func() call, reusable at 0
+       struct __freenode_t *next; // next entry; the last node of a list is an empty sentinel whose next is 0
+} freenode_t, *freenode_p;
+
+#define freenode_lt(a,b) ((a)->cnt < (b)->cnt || ((a)->cnt == (b)->cnt && (a)->level < (b)->level)) // order by cnt first, ties broken by smaller level
+KSORT_INIT(node, freenode_p, freenode_lt) // instantiate the ksort.h sort routines for freenode_p under the name "node" (used as ks_introsort(node, ...))
+
+/* Memory pool, similar to the one in bam_pileup.c: recycles freenode_t nodes instead of freeing them */
+typedef struct {
+       int cnt, n, max; // cnt: nodes currently handed out by mp_alloc(); n: nodes parked in buf; max: capacity of buf
+       freenode_t **buf; // stack of parked nodes, grown by mp_free()
+} mempool_t;
+
+/* Create an empty, zero-filled pool; release it with mp_destroy(). */
+static mempool_t *mp_init()
+{
+       mempool_t *pool = (mempool_t*)calloc(1, sizeof(mempool_t));
+       return pool;
+}
+/* Free every node still parked in the pool, then the stack array and the pool itself. */
+static void mp_destroy(mempool_t *mp)
+{
+       int idx = 0;
+       while (idx < mp->n) {
+               free(mp->buf[idx]);
+               ++idx;
+       }
+       free(mp->buf);
+       free(mp);
+}
+/* Hand out one node: pop a parked node when the stack is non-empty, otherwise
+ * calloc a fresh zeroed one. The outstanding-node counter is bumped either way. */
+static inline freenode_t *mp_alloc(mempool_t *mp)
+{
+       freenode_t *node;
+       ++mp->cnt;
+       if (mp->n != 0) {
+               --mp->n;
+               node = mp->buf[mp->n];
+       } else {
+               node = (freenode_t*)calloc(1, sizeof(freenode_t));
+       }
+       return node;
+}
+/* Give a node back to the pool: unlink it, re-arm its countdown to TV_GAP and
+ * push it on the stack of parked nodes, growing the stack when it is full. */
+static inline void mp_free(mempool_t *mp, freenode_t *p)
+{
+       --mp->cnt;
+       p->next = 0;
+       p->cnt = TV_GAP;
+       if (mp->n == mp->max) { // stack full: 256 slots the first time, doubled afterwards
+               if (mp->max) mp->max <<= 1;
+               else mp->max = 256;
+               mp->buf = (freenode_t**)realloc(mp->buf, sizeof(freenode_t*) * mp->max);
+       }
+       mp->buf[mp->n] = p;
+       ++mp->n;
+}
+
+/* core part: state of one levelled-pileup buffer */
+struct __bam_lplbuf_t {
+       int max, n_cur, n_pre; // max: capacity of cur_level/pre_level; n_cur, n_pre: entries in use in each
+       int max_level, *cur_level, *pre_level; // max_level: highest level in use; cur_level: levels of the current column; pre_level: levels carried over from the previous column
+       mempool_t *mp; // node recycler for the free-level list
+       freenode_t **aux, *head, *tail; // aux: scratch array used to sort the list; head/tail: free-level list, tail being an empty sentinel
+       int n_nodes, m_aux; // n_nodes: released levels currently on the list; m_aux: capacity of aux
+       bam_pileup_f func; // user callback, invoked by tview_func() after levels are assigned
+       void *user_data; // passed unchanged to func
+       bam_plbuf_t *plbuf; // wrapped pileup buffer; it calls tview_func()
+};
+
+/* Reset a buffer for reuse: reset the wrapped pileup buffer, recycle every node of
+ * the free-level list except the tail sentinel, and clear the level bookkeeping. */
+void bam_lplbuf_reset(bam_lplbuf_t *buf)
+{
+       freenode_t *node;
+       bam_plbuf_reset(buf->plbuf);
+       node = buf->head;
+       while (node->next) { // stops at the sentinel, which stays allocated
+               freenode_t *succ = node->next;
+               mp_free(buf->mp, node);
+               node = succ;
+       }
+       buf->head = buf->tail;
+       buf->max_level = 0;
+       buf->n_cur = 0;
+       buf->n_pre = 0;
+       buf->n_nodes = 0;
+}
+/* Pileup callback installed by bam_lplbuf_init(). For every read in the column it
+ * picks a "level", stores it in pl[i].level, and then forwards the column to the
+ * user's callback. Levels given up by reads whose last base is in this column are
+ * queued on the free list (head..tail); a queued level is handed to a newly
+ * starting read once its node's cnt countdown has reached 0.
+ *   tid, pos, n, pl: forwarded unchanged to tv->func; n is the number of entries in pl
+ *   data: the bam_lplbuf_t registered in bam_lplbuf_init()
+ * Always returns 0; the return value of tv->func is ignored.
+ */
+static int tview_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+       bam_lplbuf_t *tv = (bam_lplbuf_t*)data;
+       freenode_t *p;
+       int i, l, max_level;
+       // allocate memory if necessary
+       if (tv->max < n) { // enlarge both level arrays to n rounded up by kroundup32()
+               tv->max = n;
+               kroundup32(tv->max);
+               tv->cur_level = (int*)realloc(tv->cur_level, sizeof(int) * tv->max);
+               tv->pre_level = (int*)realloc(tv->pre_level, sizeof(int) * tv->max);
+       }
+       tv->n_cur = n;
+       // update cnt: age every queued level by one column (the sentinel is not touched)
+       for (p = tv->head; p->next; p = p->next)
+               if (p->cnt > 0) --p->cnt;
+       // calculate cur_level[]
+       max_level = 0;
+       for (i = l = 0; i < n; ++i) { // l counts the reads continued from the previous column
+               const bam_pileup1_t *p = pl + i; // shadows the freenode_t *p declared above
+               if (p->qpos == 0) { // read starts here. NOTE(review): assumes a read's first pileup column has qpos 0 - confirm for clipped reads
+                       if (tv->head->next && tv->head->cnt == 0) { // then take a free slot
+                               freenode_t *p = tv->head->next;
+                               tv->cur_level[i] = tv->head->level;
+                               mp_free(tv->mp, tv->head);
+                               tv->head = p;
+                               --tv->n_nodes;
+                       } else tv->cur_level[i] = ++tv->max_level; // nothing reusable: open a new level
+               } else {
+                       tv->cur_level[i] = tv->pre_level[l++]; // continuing read keeps the level it had in the previous column
+                       if (p->qpos == p->b->core.l_qseq - 1) { // then return a free slot
+                               tv->tail->level = tv->cur_level[i]; // the old sentinel becomes a real entry ...
+                               tv->tail->next = mp_alloc(tv->mp); // ... and a new sentinel is appended behind it
+                               tv->tail = tv->tail->next;
+                               ++tv->n_nodes;
+                       }
+               }
+               if (tv->cur_level[i] > max_level) max_level = tv->cur_level[i];
+               ((bam_pileup1_t*)p)->level = tv->cur_level[i]; // casts away const to publish the level to the user callback
+       }
+       assert(l == tv->n_pre); // every level carried over from the previous column must have been consumed
+       tv->func(tid, pos, n, pl, tv->user_data);
+       // sort the linked list
+       if (tv->n_nodes) {
+               freenode_t *q;
+               if (tv->n_nodes + 1 > tv->m_aux) { // enlarge; the extra slot holds the sentinel
+                       tv->m_aux = tv->n_nodes + 1;
+                       kroundup32(tv->m_aux);
+                       tv->aux = (freenode_t**)realloc(tv->aux, sizeof(void*) * tv->m_aux);
+               }
+               for (p = tv->head, i = l = 0; p->next;) {
+                       if (p->level > max_level) { // then discard this entry: the level lies above every level used in this column
+                               q = p->next;
+                               mp_free(tv->mp, p);
+                               p = q;
+                       } else {
+                               tv->aux[i++] = p;
+                               p = p->next;
+                       }
+               }
+               tv->aux[i] = tv->tail; // add a proper tail for the loop below
+               tv->n_nodes = i;
+               if (tv->n_nodes) {
+                       ks_introsort(node, tv->n_nodes, tv->aux); // order by freenode_lt: smallest cnt, then smallest level, first
+                       for (i = 0; i < tv->n_nodes; ++i) tv->aux[i]->next = tv->aux[i+1]; // relink in sorted order, ending at the sentinel
+                       tv->head = tv->aux[0];
+               } else tv->head = tv->tail;
+       }
+       // clean up
+       tv->max_level = max_level;
+       memcpy(tv->pre_level, tv->cur_level, tv->n_cur * 4); // NOTE(review): the 4 hard-codes sizeof(int)
+       // squeeze out terminated levels
+       for (i = l = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pl + i;
+               if (p->qpos != p->b->core.l_qseq - 1) // keep only reads that have not reached their last base
+                       tv->pre_level[l++] = tv->pre_level[i];
+       }
+       tv->n_pre = l;
+       return 0;
+}
+
+/* Allocate a levelled-pileup buffer. `func` is called for each column once levels
+ * have been assigned, with `data` passed through untouched. The free-level list
+ * starts out as a single sentinel node. Release with bam_lplbuf_destroy(). */
+bam_lplbuf_t *bam_lplbuf_init(bam_pileup_f func, void *data)
+{
+       bam_lplbuf_t *lp = (bam_lplbuf_t*)calloc(1, sizeof(bam_lplbuf_t));
+       lp->mp = mp_init();
+       lp->tail = mp_alloc(lp->mp);
+       lp->head = lp->tail;
+       lp->user_data = data;
+       lp->func = func;
+       lp->plbuf = bam_plbuf_init(tview_func, lp);
+       return lp;
+}
+
+/* Destroy a buffer created by bam_lplbuf_init().
+ * Every node of the free-level list, from head through the tail sentinel, is given
+ * back to the pool before the pool is destroyed. Returning only the head node
+ * leaks all nodes linked behind it, because mp_destroy() frees only the nodes that
+ * are parked in the pool. */
+void bam_lplbuf_destroy(bam_lplbuf_t *tv)
+{
+       freenode_t *p, *q;
+       for (p = tv->head; p; p = q) {
+               q = p->next; // fetch the successor first: mp_free() clears p->next
+               mp_free(tv->mp, p);
+       }
+       mp_destroy(tv->mp);
+       free(tv->cur_level); free(tv->pre_level);
+       bam_plbuf_destroy(tv->plbuf);
+       free(tv->aux);
+       free(tv);
+}
+
+/* Forward one alignment to the wrapped pileup buffer and return the result of
+ * bam_plbuf_push(). bam_lpileup_file() pushes 0 after the last record. */
+int bam_lplbuf_push(const bam1_t *b, bam_lplbuf_t *tv)
+{
+       int status = bam_plbuf_push(b, tv->plbuf);
+       return status;
+}
+
+/* Read records from fp until bam_read1() returns a negative value, pushing each
+ * one through a temporary levelled-pileup buffer that calls func(..., func_data).
+ * A final push of 0 follows the last record. Always returns 0. */
+int bam_lpileup_file(bamFile fp, bam_pileup_f func, void *func_data)
+{
+       bam1_t *rec = (bam1_t*)calloc(1, sizeof(bam1_t));
+       bam_lplbuf_t *lp = bam_lplbuf_init(func, func_data);
+       for (;;) {
+               int ret = bam_read1(fp, rec);
+               if (ret < 0) break;
+               bam_lplbuf_push(rec, lp);
+       }
+       bam_lplbuf_push(0, lp);
+       bam_lplbuf_destroy(lp);
+       free(rec->data);
+       free(rec);
+       return 0;
+}
diff --git a/bam_maqcns.c b/bam_maqcns.c
new file mode 100644 (file)
index 0000000..c8009aa
--- /dev/null
@@ -0,0 +1,451 @@
+#include <math.h>
+#include "bam.h"
+#include "bam_maqcns.h"
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint32_t)
+
+typedef struct __bmc_aux_t { // scratch buffer kept on bam_maqcns_t and reused by bam_maqcns_glfgen()
+       int max; // capacity of info, in elements
+       uint32_t *info; // one packed word per usable read; the layout is built in bam_maqcns_glfgen()
+} bmc_aux_t;
+
+typedef struct { // per-site accumulators filled in bam_maqcns_glfgen(); the arrays are indexed by the 2-bit base code
+       float esum[4], fsum[4]; // esum: sum of fk-weighted quality values; fsum: sum of the fk weights themselves
+       uint32_t c[4]; // number of reads counted for each base
+       uint32_t mapQ_max; // largest value found in the low 7 bits of the packed words (the mapping-quality field)
+} glf_call_aux_t;
+
+/*
+  P(<b1,b2>) = \theta \sum_{i=1}^{N-1} 1/i
+  P(D|<b1,b2>) = \sum_{k=1}^{N-1} p_k 1/2 [(k/N)^n_2(1-k/N)^n_1 + (k/N)^n_1(1-k/N)^n_2]
+  p_k = 1/k / \sum_{i=1}^{N-1} 1/i
+
+  With N = aa->n_hap, this rebuilds aa->lhet (256*256 doubles), where
+  lhet[n1<<8|n2] = log(\binom{n1+n2}{n1}) + log(P(D|<b1,b2>)) for n1, n2 in 0..255,
+  and sets aa->q_r = -4.343 * log(2r / (1 - r)) with r = aa->het_rate * \sum_{i=1}^{N-1} 1/i.
+  Any previous aa->lhet is freed first. The allocation result is not checked.
+ */
+static void cal_het(bam_maqcns_t *aa)
+{
+       int k, n1, n2;
+       double sum_harmo; // harmonic sum
+       double poly_rate;
+
+       free(aa->lhet);
+       aa->lhet = (double*)calloc(256 * 256, sizeof(double));
+       sum_harmo = 0.0;
+       for (k = 1; k <= aa->n_hap - 1; ++k)
+               sum_harmo += 1.0 / k;
+       for (n1 = 0; n1 < 256; ++n1) {
+               for (n2 = 0; n2 < 256; ++n2) {
+                       long double sum = 0.0;
+                       double lC = lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1}
+                       for (k = 1; k <= aa->n_hap - 1; ++k) {
+                               double pk = 1.0 / k / sum_harmo;
+                               double log1 = log((double)k/aa->n_hap);
+                               double log2 = log(1.0 - (double)k/aa->n_hap);
+                               sum += pk * 0.5 * (expl(log1*n2) * expl(log2*n1) + expl(log1*n1) * expl(log2*n2));
+                       }
+                       aa->lhet[n1<<8|n2] = lC + logl(sum);
+               }
+       }
+       poly_rate = aa->het_rate * sum_harmo;
+       aa->q_r = -4.343 * log(2.0 * poly_rate / (1.0 - poly_rate));
+}
+
+/** initialize the helper structure
+ *
+ * Rebuilds two tables on aa (any previous ones are freed first):
+ *   aa->fk   (256 doubles): fk[0] = 1, fk[n] = theta^n * (1 - eta) + eta
+ *   aa->coef (256*256*64 doubles): entry [q<<16 | n<<8 | k] is written for
+ *            q in 1..63, n in 1..255 and k in 0..n; all other entries stay 0
+ * Allocation results are not checked.
+ */
+static void cal_coef(bam_maqcns_t *aa)
+{
+       int k, n, q;
+       long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256];
+       double *lC; // lC[n<<8|k] = log of the binomial coefficient C(n,k); k == 0 entries stay 0 from calloc
+
+       lC = (double*)calloc(256 * 256, sizeof(double));
+       // aa->lhet is not touched here; it is allocated and initialized in cal_het()
+       free(aa->fk); free(aa->coef);
+       aa->fk = (double*)calloc(256, sizeof(double));
+       aa->coef = (double*)calloc(256*256*64, sizeof(double));
+       aa->fk[0] = fk2[0] = 1.0;
+       for (n = 1; n != 256; ++n) {
+               aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta;
+               fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands
+       }
+       for (n = 1; n != 256; ++n)
+               for (k = 1; k <= n; ++k)
+                       lC[n<<8|k] = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
+       for (q = 1; q != 64; ++q) { // q is used as a Phred-scaled error rate: e = 10^(-q/10)
+               double e = pow(10.0, -q/10.0);
+               double le = log(e);
+               double le1 = log(1.0-e);
+               for (n = 1; n != 256; ++n) {
+                       double *coef = aa->coef + (q<<16|n<<8); // row of the table for this (q, n)
+                       sum_a[n+1] = 0.0;
+                       for (k = n; k >= 0; --k) { // a_k = \sum_{i=k}^n C^n_k \epsilon^k (1-\epsilon)^{n-k}
+                               sum_a[k] = sum_a[k+1] + expl(lC[n<<8|k] + k*le + (n-k)*le1);
+                               b[k] = sum_a[k+1] / sum_a[k];
+                               if (b[k] > 0.99) b[k] = 0.99; // cap the ratio at 0.99
+                       }
+                       for (k = 0; k != n; ++k) // log(\bar\beta_{nk}(\bar\epsilon)^{f_k})
+                               q_c[k] = -4.343 * fk2[k] * logl(b[k] / e);
+                       for (k = 1; k != n; ++k) q_c[k] += q_c[k-1]; // \prod_{i=0}^k c_i
+                       for (k = 0; k <= n; ++k) { // powl() in 64-bit mode seems broken on my Mac OS X 10.4.9
+                               tmp[k] = -4.343 * logl(1.0 - expl(fk2[k] * logl(b[k]))); // i.e. -4.343*log(1 - b[k]^fk2[k]), written without powl()
+                               coef[k] = (k? q_c[k-1] : 0) + tmp[k]; // this is the final c_{nk}
+                       }
+               }
+       }
+       free(lC);
+}
+
+/* Allocate a consensus caller with its default parameters (het_rate 0.001,
+ * theta 0.85, n_hap 2, eta 0.03) and an empty scratch buffer. The lookup tables
+ * are not built here; see bam_maqcns_prepare(). Free with bam_maqcns_destroy(). */
+bam_maqcns_t *bam_maqcns_init()
+{
+       bam_maqcns_t *cns = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t));
+       cns->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t));
+       cns->n_hap = 2;
+       cns->eta = 0.03;
+       cns->theta = 0.85;
+       cns->het_rate = 0.001;
+       return cns;
+}
+
+/* (Re)build the lookup tables from the current parameters: fk/coef first, then lhet and q_r. */
+void bam_maqcns_prepare(bam_maqcns_t *bm)
+{
+       cal_coef(bm);
+       cal_het(bm);
+}
+
+/* Free a consensus caller together with its tables and scratch buffer; a null pointer is ignored. */
+void bam_maqcns_destroy(bam_maqcns_t *bm)
+{
+       if (bm != 0) {
+               free(bm->lhet);
+               free(bm->fk);
+               free(bm->coef);
+               free(bm->aux->info);
+               free(bm->aux);
+               free(bm);
+       }
+}
+/* Build the genotype-likelihood record for one pileup column.
+ *   _n, pl:   number of pileup entries and the entries themselves
+ *   ref_base: copied into the result unchanged
+ *   bm:       caller whose tables (fk, coef, lhet) are read here, so
+ *             bam_maqcns_prepare() must have been run; bm->aux is used as scratch
+ * Returns a calloc'd glf1_t that the caller frees. With _n == 0 only ref_base is set.
+ * Allocation results are not checked.
+ */
+glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm)
+{
+       glf_call_aux_t *b;
+       int i, j, k, w[8], c, n; // w[]: reads seen so far per (strand<<2 | base); n: reads kept after filtering
+       glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t));
+       float p[16], min_p = 1e30; // p[j<<2|k]: score of genotype (j,k), symmetric in j and k
+
+       g->ref_base = ref_base;
+       if (_n == 0) return g;
+
+       // construct aux array
+       if (bm->aux->max < _n) {
+               bm->aux->max = _n;
+               kroundup32(bm->aux->max);
+               bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max);
+       }
+       for (i = n = 0; i < _n; ++i) { // pack one word per read; deletions and unmapped reads are skipped
+               const bam_pileup1_t *p = pl + i; // shadows the float p[16] above
+               uint32_t q, x = 0;
+               if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue;
+               q = (uint32_t)bam1_qual(p->b)[p->qpos];
+               x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; // bit 18: strand; bits 8+: base quality; low bits: core.qual
+               if (p->b->core.qual < q) q = p->b->core.qual;
+               x |= q << 24; // bits 24+: min(base quality, core.qual); being the top bits it dominates the sort below
+               q = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+               if (!p->is_del && q < 4) x |= 1 << 21 | q << 16; // bits 16-17: base code; bit 21 marks that the base code is < 4
+               bm->aux->info[n++] = x;
+       }
+       ks_introsort(uint32_t, n, bm->aux->info);
+       // generate esum and fsum
+       b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t));
+       for (k = 0; k != 8; ++k) w[k] = 0;
+       b->mapQ_max = 0;
+       for (j = n - 1; j >= 0; --j) { // calculate esum and fsum, walking from the largest packed word down
+               uint32_t info = bm->aux->info[j];
+               if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); // raise the top field to 4 when it is below 4 and the base-quality bits are non-zero
+               k = info>>16&7; // strand<<2 | base. NOTE(review): bit 21 is never consulted here, so a word without it is accumulated under base code 0 - confirm intended
+               if (info>>24 > 0) {
+                       b->esum[k&3] += bm->fk[w[k]] * (info>>24);
+                       b->fsum[k&3] += bm->fk[w[k]];
+                       if (w[k] < 0xff) ++w[k]; // saturate: fk has 256 entries
+                       ++b->c[k&3];
+               }
+               if (b->mapQ_max < (info&0x7f)) b->mapQ_max = info&0x7f;
+       }
+       // rescale ->c[] so that the total fits the 8-bit indices of coef and lhet
+       for (j = c = 0; j != 4; ++j) c += b->c[j];
+       if (c > 255) {
+               for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5);
+               for (j = c = 0; j != 4; ++j) c += b->c[j];
+       }
+       // generate likelihood
+       for (j = 0; j != 4; ++j) {
+               // homozygous
+               float tmp1, tmp3;
+               int tmp2, bar_e;
+               for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != 4; ++k) { // totals over the bases other than j
+                       if (j == k) continue;
+                       tmp1 += b->esum[k]; tmp2 += b->c[k]; tmp3 += b->fsum[k];
+               }
+               if (tmp2) {
+                       bar_e = (int)(tmp1 / tmp3 + 0.5); // weighted mean quality of those bases
+                       if (bar_e < 4) bar_e = 4; // should not happen
+                       if (bar_e > 63) bar_e = 63;
+                       p[j<<2|j] = tmp1 + bm->coef[bar_e<<16|c<<8|tmp2];
+               } else p[j<<2|j] = 0.0; // all the bases are j
+               // heterozygous
+               for (k = j + 1; k < 4; ++k) {
+                       for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i != 4; ++i) { // totals over the bases that are neither j nor k
+                               if (i == j || i == k) continue;
+                               tmp1 += b->esum[i]; tmp2 += b->c[i]; tmp3 += b->fsum[i];
+                       }
+                       if (tmp2) {
+                               bar_e = (int)(tmp1 / tmp3 + 0.5);
+                               if (bar_e < 4) bar_e = 4;
+                               if (bar_e > 63) bar_e = 63;
+                               p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]] + tmp1 + bm->coef[bar_e<<16|c<<8|tmp2];
+                       } else p[j<<2|k] = p[k<<2|j] = -4.343 * bm->lhet[b->c[j]<<8|b->c[k]]; // all the bases are either j or k
+               }
+               // clamp negative scores in row j to zero
+               for (k = 0; k != 4; ++k)
+                       if (p[j<<2|k] < 0.0) p[j<<2|k] = 0.0;
+       }
+
+       // convert necessary information to glf1_t
+       g->ref_base = ref_base; g->max_mapQ = b->mapQ_max;
+       g->depth = n > 16777215? 16777215 : n; // capped at 2^24 - 1
+       for (j = 0; j != 4; ++j)
+               for (k = j; k < 4; ++k)
+                       if (p[j<<2|k] < min_p) min_p = p[j<<2|k];
+       g->min_lk = min_p > 255.0? 255 : (int)(min_p + 0.5);
+       for (j = c = 0; j != 4; ++j) // the ten genotypes with j <= k, in row-major order; each stored relative to min_p and capped at 255
+               for (k = j; k < 4; ++k)
+                       g->lk[c++] = p[j<<2|k]-min_p > 255.0? 255 : (int)(p[j<<2|k]-min_p + 0.5);
+
+       free(b);
+       return g;
+}
+
+/* Reduce a GLF record to a packed consensus call.
+ *   g:   record whose lk[] holds the ten genotype scores (j <= k, row-major) and max_mapQ
+ *   q_r: penalty added to every heterozygous genotype before ranking
+ * Returns cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2, where cns and cns2 are
+ * 4-bit base masks of the best and second-best genotype (0xf when there is none),
+ * cnsQ is the score gap between best and second best and cnsQ2 the gap between
+ * second and third best, both capped at 255 (0xff when the lower rank is missing).
+ * The fallback masks use 0xfU because shifting the signed constant 0xf left by 28
+ * overflows int, which is undefined behaviour. */
+uint32_t glf2cns(const glf1_t *g, int q_r)
+{
+       int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1;
+       uint32_t x = 0;
+       for (i = k = 0; i < 4; ++i)
+               for (j = i; j < 4; ++j) {
+                       tmp[j<<2|i] = -1; // mirror entry is marked unused (the diagonal is overwritten on the next line)
+                       tmp[i<<2|j] = g->lk[k++] + (i == j? 0 : q_r);
+               }
+       for (i = 0; i < 16; ++i) { // track the three smallest scores and the genotypes of the two smallest
+               if (tmp[i] < 0) continue;
+               if (tmp[i] < min) {
+                       min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i;
+               } else if (tmp[i] < min2) {
+                       min3 = min2; min2 = tmp[i]; min_g2 = i;
+               } else if (tmp[i] < min3) min3 = tmp[i];
+       }
+       x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xfU << 28;
+       x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xfU << 24;
+       x |= (uint32_t)g->max_mapQ << 16;
+       x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8;
+       x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? min3 - min2 : 255) : 0xff;
+       return x;
+}
+
+/* Call the consensus for one pileup column and return it in the packed format of
+ * glf2cns(). With n == 0 both base masks are 0xf and all other fields are 0. */
+uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm)
+{
+       glf1_t *glf;
+       uint32_t call;
+       if (n == 0) return 0xfU<<28 | 0xfU<<24;
+       glf = bam_maqcns_glfgen(n, pl, 0xf, bm);
+       call = glf2cns(glf, (int)(bm->q_r + 0.5)); // q_r rounded to the nearest integer
+       free(glf);
+       return call;
+}
+
+/************** *****************/
+
+/* Allocate indel-calling options with the defaults mm_penalty 3, indel_err 4 and
+ * ambi_thres 10. The caller frees the result with free(). */
+bam_maqindel_opt_t *bam_maqindel_opt_init()
+{
+       bam_maqindel_opt_t *opt = (bam_maqindel_opt_t*)calloc(1, sizeof(bam_maqindel_opt_t));
+       opt->ambi_thres = 10;
+       opt->indel_err = 4;
+       opt->mm_penalty = 3;
+       return opt;
+}
+
+/* Free a result returned by bam_maqindel(), including both strings; a null pointer is ignored. */
+void bam_maqindel_ret_destroy(bam_maqindel_ret_t *mir)
+{
+       if (mir != 0) {
+               free(mir->s1);
+               free(mir->s2);
+               free(mir);
+       }
+}
+
+#define MINUS_CONST 0x10000000 // bias added to indel lengths so that negative lengths (deletions) sort below positive ones as uint32_t
+/* Evaluate the candidate indels seen in one pileup column.
+ *   n, pl: number of pileup entries and the entries themselves
+ *   pos:   reference position of the column; an indel is placed right after it
+ *   mi:    scoring options (mm_penalty, indel_err, ambi_thres)
+ *   ref:   reference sequence, indexed by reference position
+ * Returns 0 when no mapped read carries an indel here. Otherwise returns a calloc'd
+ * result (free with bam_maqindel_ret_destroy()) describing the two indel types with
+ * the highest summed score:
+ *   indel1/indel2: length; > 0 insertion, < 0 deletion, 0 no indel
+ *   s1/s2: '+' followed by the inserted consensus, '-' followed by the deleted
+ *          reference bases, or "*" for no indel
+ *   cnt1/cnt2/cnt_ambi/cnt_anti: per-read tallies, see the last loop
+ * Allocation results are not checked.
+ */
+bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref)
+{
+       int i, j, n_types, *types, left, right;
+       bam_maqindel_ret_t *ret = 0;
+       for (i = 0; i < n; ++i) { // look for the first mapped read with an indel at this column
+               const bam_pileup1_t *p = pl + i;
+               if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0) break;
+       }
+       if (i == n) return 0; // no indel
+       { // calculate how many types of indels are available (set n_types and types)
+               int m;
+               uint32_t *aux;
+               aux = (uint32_t*)calloc(n+1, 4);
+               m = 0;
+               aux[m++] = MINUS_CONST; // zero indel is always a type
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel != 0)
+                               aux[m++] = MINUS_CONST + p->indel;
+               }
+               ks_introsort(uint32_t, m, aux);
+               n_types = 1;
+               for (i = 1; i < m; ++i) // count distinct values in the sorted array
+                       if (aux[i] != aux[i-1]) ++n_types;
+               types = (int*)calloc(n_types, sizeof(int));
+               j = 0;
+               types[j++] = aux[0] - MINUS_CONST; 
+               for (i = 1; i < m; ++i) { // types[] ends up ascending: deletions, then 0, then insertions
+                       if (aux[i] != aux[i-1])
+                               types[j++] = aux[i] - MINUS_CONST;
+               }
+               free(aux);
+       }
+       { // calculate left and right boundary
+               bam_segreg_t seg;
+               left = 0x7fffffff; right = 0;
+               for (i = 0; i < n; ++i) { // union of the tbeg..tend ranges reported by bam_segreg() for the mapped reads
+                       const bam_pileup1_t *p = pl + i;
+                       if (!(p->b->core.flag&BAM_FUNMAP)) {
+                               bam_segreg(pos, &p->b->core, bam1_cigar(p->b), &seg);
+                               if (seg.tbeg < left) left = seg.tbeg;
+                               if (seg.tend > right) right = seg.tend;
+                       }
+               }
+       }
+       { // the core part
+               char *ref2, *inscns = 0; // ref2: reference window with one indel type applied; inscns: consensus of the inserted bases per type
+               int k, l, *score, max_ins = types[n_types-1]; // types[] is ascending, so the last entry is the longest insertion (0 if there is none)
+               ref2 = (char*)calloc(right - left + types[n_types-1] + 2, 1);
+               if (max_ins > 0) { // get the consensus of inserted sequences
+                       int *inscns_aux = (int*)calloc(4 * n_types * max_ins, sizeof(int)); // base counts, indexed [type][insert offset][base]
+                       // count occurrences
+                       for (i = 0; i < n_types; ++i) {
+                               if (types[i] <= 0) continue; // not insertion
+                               for (j = 0; j < n; ++j) {
+                                       const bam_pileup1_t *p = pl + j;
+                                       if (!(p->b->core.flag&BAM_FUNMAP) && p->indel == types[i]) {
+                                               for (k = 1; k <= p->indel; ++k) { // inserted bases are read from the query right after qpos
+                                                       int c = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)];
+                                                       if (c < 4) ++inscns_aux[i*max_ins*4 + (k-1)*4 + c];
+                                               }
+                                       }
+                               }
+                       }
+                       // construct the consensus
+                       inscns = (char*)calloc(n_types * max_ins, sizeof(char));
+                       for (i = 0; i < n_types; ++i) {
+                               for (j = 0; j < types[i]; ++j) { // does not run for deletions or the zero type
+                                       int max = 0, max_k = -1, *ia = inscns_aux + i*max_ins*4 + j*4;
+                                       for (k = 0; k < 4; ++k) {
+                                               if (ia[k] > max) {
+                                                       max = ia[k];
+                                                       max_k = k;
+                                               }
+                                       }
+                                       inscns[i*max_ins + j] = max? 1<<max_k : 15; // most frequent base as a one-bit mask, or 15 when no base was counted
+                               }
+                       }
+                       free(inscns_aux);
+               }
+               // calculate score
+               score = (int*)calloc(n_types * n, sizeof(int)); // score[i*n + j]: read j against indel type i
+               for (i = 0; i < n_types; ++i) {
+                       // write ref2
+                       for (k = 0, j = left; j <= pos; ++j) // NOTE(review): unlike the loop after the indel, this one does not stop at a 0 byte in ref
+                               ref2[k++] = bam_nt16_table[(int)ref[j]];
+                       if (types[i] <= 0) j += -types[i]; // deletion: skip the deleted reference bases
+                       else for (l = 0; l < types[i]; ++l) // insertion: splice in the consensus
+                                        ref2[k++] = inscns[i*max_ins + l];
+                       for (; j < right && ref[j]; ++j)
+                               ref2[k++] = bam_nt16_table[(int)ref[j]];
+                       // calculate score for each read
+                       for (j = 0; j < n; ++j) {
+                               const bam_pileup1_t *p = pl + j;
+                               uint32_t *cigar;
+                               bam1_core_t *c = &p->b->core;
+                               int s;
+                               bam_segreg_t seg;
+                               if (c->flag&BAM_FUNMAP) continue; // the score of an unmapped read stays 0
+                               cigar = bam1_cigar(p->b);
+                               bam_segreg(pos, c, cigar, &seg);
+                               for (s = 0, l = seg.qbeg; c->pos + l < right && l < seg.qend; ++l) { // first placement: query offset l compared with ref2 at c->pos + l
+                                       int cq = bam1_seqi(bam1_seq(p->b), l), ct;
+                                       ct = c->pos + l >= left? ref2[c->pos + l - left] : 15; // "<" should not happen if there is no bug
+                                       if (cq < 15 && ct < 15) // a code of 15 on either side is not scored
+                                               s += cq == ct? 1 : -mi->mm_penalty;
+                               }
+                               score[i*n + j] = s;
+                               if (types[i] != 0) { // then try the other way to calculate the score
+                                       for (s = 0, l = seg.qbeg; c->pos + l + types[i] < right && l < seg.qend; ++l) { // second placement: shifted by the indel length
+                                               int cq = bam1_seqi(bam1_seq(p->b), l), ct;
+                                               ct = c->pos + l + types[i] >= left? ref2[c->pos + l + types[i] - left] : 15;
+                                               if (cq < 15 && ct < 15)
+                                                       s += cq == ct? 1 : -mi->mm_penalty;
+                                       }
+                               }
+                               if (score[i*n+j] < s) score[i*n+j] = s; // choose the higher of the two scores
+                               if (types[i] != 0) score[i*n+j] -= mi->indel_err; // flat penalty for any non-zero indel type
+                               //printf("%d, %d, %d, %d\n", i, types[i], j, score[i*n+j]);
+                       }
+               }
+               { // get final result
+                       int *sum, max1, max2, max1_i, max2_i;
+                       // pick the two best-scoring types
+                       sum = (int*)calloc(n_types, sizeof(int));
+                       for (i = 0; i < n_types; ++i)
+                               for (j = 0; j < n; ++j)
+                                       sum[i] += score[i*n+j];
+                       max1 = max2 = -0x7fffffff; max1_i = max2_i = -1;
+                       for (i = 0; i < n_types; ++i) {
+                               if (sum[i] > max1) {
+                                       max2 = max1; max2_i = max1_i; max1 = sum[i]; max1_i = i;
+                               } else if (sum[i] > max2) {
+                                       max2 = sum[i]; max2_i = i;
+                               }
+                       }
+                       free(sum);
+                       // write ret
+                       ret = (bam_maqindel_ret_t*)calloc(1, sizeof(bam_maqindel_ret_t));
+                       ret->indel1 = types[max1_i]; ret->indel2 = types[max2_i];
+                       ret->s1 = (char*)calloc(abs(ret->indel1) + 2, 1); // sign character + bases + terminating 0
+                       ret->s2 = (char*)calloc(abs(ret->indel2) + 2, 1);
+                       if (ret->indel1 > 0) {
+                               ret->s1[0] = '+';
+                               for (k = 0; k < ret->indel1; ++k)
+                                       ret->s1[k+1] = bam_nt16_rev_table[(int)inscns[max1_i*max_ins + k]];
+                       } else if (ret->indel1 < 0) {
+                               ret->s1[0] = '-';
+                               for (k = 0; k < -ret->indel1 && ref[pos + k + 1]; ++k) // deleted bases are copied from the reference after pos
+                                       ret->s1[k+1] = ref[pos + k + 1];
+                       } else ret->s1[0] = '*';
+                       if (ret->indel2 > 0) {
+                               ret->s2[0] = '+';
+                               for (k = 0; k < ret->indel2; ++k)
+                                       ret->s2[k+1] = bam_nt16_rev_table[(int)inscns[max2_i*max_ins + k]];
+                       } else if (ret->indel2 < 0) {
+                               ret->s2[0] = '-';
+                               for (k = 0; k < -ret->indel2 && ref[pos + k + 1]; ++k)
+                                       ret->s2[k+1] = ref[pos + k + 1];
+                       } else ret->s2[0] = '*';
+                       for (j = 0; j < n; ++j) { // classify every read, including unmapped ones whose scores are still 0
+                               if (score[max1_i*n+j] < 0 && score[max2_i*n+j] < 0) ++ret->cnt_anti; // fits neither of the two types
+                               else {
+                                       int diff = score[max1_i*n+j] - score[max2_i*n+j];
+                                       if (diff > mi->ambi_thres) ++ret->cnt1; // clearly favours the first type
+                                       else if (diff < -mi->ambi_thres) ++ret->cnt2; // clearly favours the second type
+                                       else ++ret->cnt_ambi;
+                               }
+                       }
+               }
+               free(score); free(ref2); free(inscns);
+       }
+       free(types);
+       return ret;
+}
diff --git a/bam_maqcns.h b/bam_maqcns.h
new file mode 100644 (file)
index 0000000..5d410ef
--- /dev/null
@@ -0,0 +1,48 @@
+#ifndef BAM_MAQCNS_H
+#define BAM_MAQCNS_H
+/* Interface of the consensus caller (bam_maqcns_*, glf2cns) and the indel caller (bam_maqindel*).
+ * NOTE(review): bam_pileup1_t is used below but only glf.h is included; presumably callers include bam.h first - confirm. */
+#include "glf.h"
+
+struct __bmc_aux_t; // opaque scratch buffer, defined in bam_maqcns.c
+/* Parameters and lookup tables of the consensus caller. */
+typedef struct {
+       float het_rate, theta; // model parameters; bam_maqcns_init() sets them to 0.001 and 0.85
+       int n_hap; // N in the heterozygote model of cal_het(); bam_maqcns_init() sets it to 2
+
+       float eta, q_r; // eta: model parameter (default 0.03); q_r: computed by bam_maqcns_prepare()
+       double *fk, *coef; // tables built by bam_maqcns_prepare(): 256 and 256*256*64 entries
+       double *lhet; // table built by bam_maqcns_prepare(): 256*256 entries
+       struct __bmc_aux_t *aux; // scratch buffer reused across bam_maqcns_glfgen() calls
+} bam_maqcns_t;
+/* Options of the indel caller; see bam_maqindel_opt_init() for the defaults. */
+typedef struct {
+       int mm_penalty, indel_err, ambi_thres; // mismatch penalty; penalty for a non-zero indel type; score gap needed for a read to count towards one type
+} bam_maqindel_opt_t;
+/* Result of bam_maqindel(). */
+typedef struct {
+       int indel1, indel2; // lengths of the best and second-best indel type: > 0 insertion, < 0 deletion, 0 none
+       int cnt1, cnt2, cnt_ambi, cnt_anti; // reads favouring type 1, favouring type 2, ambiguous, and scoring negative against both
+       char *s1, *s2; // "+bases", "-bases" or "*" for the two types; owned by the result
+} bam_maqindel_ret_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       bam_maqcns_t *bam_maqcns_init(); // allocate with default parameters; tables are not built yet
+       void bam_maqcns_prepare(bam_maqcns_t *bm); // build fk, coef, lhet and q_r from the current parameters
+       void bam_maqcns_destroy(bam_maqcns_t *bm); // free everything; a null pointer is ignored
+       glf1_t *bam_maqcns_glfgen(int n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm); // returns a calloc'd record the caller frees
+       uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm); // packed consensus call, same format as glf2cns()
+       // return: cns<<28 | cns2<<24 | mapQ<<16 | cnsQ<<8 | cnsQ2
+       uint32_t glf2cns(const glf1_t *g, int q_r);
+
+       bam_maqindel_opt_t *bam_maqindel_opt_init(); // calloc'd options with defaults
+       bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, const bam_pileup1_t *pl, const char *ref); // returns 0 when no read has an indel at pos
+       void bam_maqindel_ret_destroy(bam_maqindel_ret_t*); // frees a bam_maqindel() result; a null pointer is ignored
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/bam_pileup.c b/bam_pileup.c
new file mode 100644 (file)
index 0000000..d01f9a3
--- /dev/null
@@ -0,0 +1,213 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include "bam.h"
+
+/* One node of the singly linked list of buffered alignments. */
+typedef struct __linkbuf_t {
+       bam1_t b; // private copy of the alignment (filled by bam_copy1)
+       uint32_t beg, end; // beg = b.core.pos; end = value returned by bam_calend() for this alignment
+       struct __linkbuf_t *next; // following node; 0 on the trailing (not yet filled) node
+} lbnode_t;
+
+/* --- BEGIN: Memory pool */
+
+/* Free-list pool that recycles lbnode_t objects (and their b.data buffers). */
+typedef struct {
+       // cnt: nodes currently handed out (incremented by mp_alloc, decremented by mp_free)
+       // n:   nodes currently parked in buf, ready for reuse
+       // max: allocated capacity of buf
+       int cnt, n, max;
+       lbnode_t **buf; // stack of recycled nodes
+} mempool_t;
+
+/* Create an empty pool; calloc leaves cnt, n, max and buf all zero. */
+static mempool_t *mp_init()
+{
+       return (mempool_t*)calloc(1, sizeof(mempool_t));
+}
+/* Release every node parked in the pool, then the pool itself. */
+static void mp_destroy(mempool_t *mp)
+{
+       int idx = 0;
+       while (idx < mp->n) {
+               lbnode_t *node = mp->buf[idx++];
+               free(node->b.data); // the alignment buffer is owned by the node
+               free(node);
+       }
+       free(mp->buf);
+       free(mp);
+}
+/* Hand out a node: reuse a parked one when available, otherwise allocate a zeroed one. */
+static inline lbnode_t *mp_alloc(mempool_t *mp)
+{
+       lbnode_t *node;
+       mp->cnt += 1;
+       if (mp->n != 0) {
+               mp->n -= 1;
+               node = mp->buf[mp->n];
+       } else {
+               node = (lbnode_t*)calloc(1, sizeof(lbnode_t));
+       }
+       return node;
+}
+/* Return a node to the pool; its b.data buffer is kept so it can be reused. */
+static inline void mp_free(mempool_t *mp, lbnode_t *p)
+{
+       p->next = 0; // a recycled node must not keep a link into the list
+       mp->cnt -= 1;
+       if (mp->n == mp->max) { // stack is full: start at 256 slots, then double
+               if (mp->max == 0) mp->max = 256;
+               else mp->max <<= 1;
+               mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
+       }
+       mp->buf[mp->n] = p;
+       mp->n += 1;
+}
+
+/* --- END: Memory pool */
+
+/* --- BEGIN: Auxiliary functions */
+
+/*
+ * Walk the CIGAR of p->b and describe how the read covers reference position pos.
+ *
+ * On return p->qpos, p->indel, p->is_del, p->is_head and p->is_tail are set
+ * (they are left untouched for an unmapped read).
+ *
+ * Returns 0 when the read must be left out of the pileup (unmapped read, or pos
+ * falls inside a BAM_CREF_SKIP operation), 1 otherwise.
+ *
+ * The two asserts require the read to start at or before pos and to extend past it.
+ */
+static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
+{
+       unsigned k;
+       bam1_t *b = p->b;
+       bam1_core_t *c = &b->core;
+       uint32_t x = c->pos, y = 0; // x: reference coordinate, y: query offset, both at the start of operation k
+       int ret = 1, is_restart = 1; // is_restart: the previous operation was a clip/skip (or this is the first one)
+
+       if (c->flag&BAM_FUNMAP) return 0; // unmapped read
+       assert(x <= pos);
+       p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
+       for (k = 0; k < c->n_cigar; ++k) {
+               int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
+               int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
+               if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip
+                       if (x + l > pos) { // overlap with pos
+                               p->indel = p->is_del = 0;
+                               p->qpos = y + (pos - x);
+                               if (x == pos && is_restart) p->is_head = 1;
+                               if (x + l - 1 == pos) { // come to the end of a match
+                                       if (k < c->n_cigar - 1) { // there are additional operation(s)
+                                               uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
+                                               int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
+                                               if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
+                                               else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
+                                               if (op_next == BAM_CSOFT_CLIP || op_next == BAM_CREF_SKIP || op_next == BAM_CHARD_CLIP)
+                                                       p->is_tail = 1; // tail
+                                       } else p->is_tail = 1; // this is the last operation; set tail
+                               }
+                       }
+                       x += l; y += l;
+               } else if (op == BAM_CDEL) { // then set ->is_del
+                       if (x + l > pos) {
+                               p->indel = 0; p->is_del = 1;
+                               p->qpos = y + (pos - x);
+                       }
+                       x += l;
+               } else if (op == BAM_CREF_SKIP) x += l;
+               else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+               is_restart = (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
+               if (x > pos) { // operation k is the one that carried x past pos: stop here
+                       if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all
+                       break;
+               }
+       }
+       assert(x > pos);
+       return ret;
+}
+
+/* --- END: Auxiliary functions */
+
+/* Streaming pileup buffer: holds the alignments that may still overlap the current position. */
+struct __bam_plbuf_t {
+       mempool_t *mp; // node allocator
+       // head..tail: list of buffered alignments; tail is always an unfilled node that receives the
+       // next pushed alignment. dummy: scratch predecessor of head used while unlinking nodes.
+       lbnode_t *head, *tail, *dummy;
+       bam_pileup_f func; // user callback invoked once per covered position
+       void *func_data; // passed through to func
+       // tid/pos: position being piled up next; max_tid/max_pos: coordinate of the last pushed alignment
+       int32_t tid, pos, max_tid, max_pos;
+       int max_pu, is_eof; // max_pu: capacity of pu; is_eof: set once a NULL alignment has been pushed
+       bam_pileup1_t *pu; // per-position pileup array handed to func
+};
+
+/* Drop all buffered alignments and rewind the buffer to its initial coordinates. */
+void bam_plbuf_reset(bam_plbuf_t *buf)
+{
+       lbnode_t *node, *succ;
+       buf->max_tid = buf->max_pos = -1;
+       buf->tid = buf->pos = 0;
+       buf->is_eof = 0;
+       // recycle every filled node; the walk stops on the trailing unfilled node (next == 0)
+       node = buf->head;
+       while (node->next) {
+               succ = node->next; // save the link first: mp_free() clears node->next
+               mp_free(buf->mp, node);
+               node = succ;
+       }
+       buf->head = buf->tail;
+}
+
+/* Allocate a pileup buffer that reports every covered position to func(..., data). */
+bam_plbuf_t *bam_plbuf_init(bam_pileup_f func, void *data)
+{
+       bam_plbuf_t *plbuf = (bam_plbuf_t*)calloc(1, sizeof(bam_plbuf_t));
+       plbuf->func = func;
+       plbuf->func_data = data;
+       plbuf->mp = mp_init();
+       // the list starts as a single unfilled node that is both head and tail
+       plbuf->tail = mp_alloc(plbuf->mp);
+       plbuf->head = plbuf->tail;
+       plbuf->dummy = mp_alloc(plbuf->mp);
+       plbuf->max_pos = -1;
+       plbuf->max_tid = -1;
+       return plbuf;
+}
+
+/* Free the buffer; warns on stderr if nodes are still outstanding in the pool. */
+void bam_plbuf_destroy(bam_plbuf_t *buf)
+{
+       mempool_t *pool = buf->mp;
+       mp_free(pool, buf->dummy);
+       mp_free(pool, buf->head);
+       if (pool->cnt != 0) {
+               fprintf(stderr, "[bam_plbuf_destroy] memory leak: %d. Continue anyway.\n", pool->cnt);
+       }
+       mp_destroy(pool);
+       free(buf->pu);
+       free(buf);
+}
+
+/*
+ * Push one alignment into the pileup buffer, or pass b == 0 to signal the end of input.
+ *
+ * After buffering, every position that can no longer receive new alignments is piled
+ * up and reported through buf->func. The input must be sorted by (tid, pos); the
+ * function aborts otherwise. Always returns 0.
+ */
+int bam_plbuf_push(const bam1_t *b, bam_plbuf_t *buf)
+{
+       if (b) { // fill buffer
+               bam_copy1(&buf->tail->b, b);
+               buf->tail->beg = b->core.pos; buf->tail->end = bam_calend(&b->core, bam1_cigar(b));
+               // Unsorted means: tid went backwards, or the start went backwards within the same tid.
+               // The position is compared as a signed value because max_pos starts at -1.
+               if (b->core.tid < buf->max_tid || (b->core.tid == buf->max_tid && (int32_t)buf->tail->beg < buf->max_pos)) {
+                       fprintf(stderr, "[bam_pileup_core] the input is not sorted. Abort!\n");
+                       abort();
+               }
+               buf->max_tid = b->core.tid; buf->max_pos = buf->tail->beg;
+               if (buf->tail->end > buf->pos) { // keep the read only if it can still contribute; otherwise the node is overwritten by the next push
+                       buf->tail->next = mp_alloc(buf->mp);
+                       buf->tail = buf->tail->next;
+               }
+       } else buf->is_eof = 1;
+       // pile up every position strictly before the newest alignment start (or everything at EOF)
+       while (buf->is_eof || buf->max_tid > buf->tid || (buf->max_tid == buf->tid && buf->max_pos > buf->pos)) {
+               int n_pu = 0;
+               lbnode_t *p, *q;
+               buf->dummy->next = buf->head;
+               for (p = buf->head, q = buf->dummy; p->next; q = p, p = p->next) {
+                       if (p->b.core.tid < buf->tid || (p->b.core.tid == buf->tid && p->end <= buf->pos)) { // then remove from the list
+                               q->next = p->next; mp_free(buf->mp, p); p = q;
+                       } else if (p->b.core.tid == buf->tid && p->beg <= buf->pos) { // here: p->end > pos; then add to pileup
+                               if (n_pu == buf->max_pu) { // then double the capacity
+                                       buf->max_pu = buf->max_pu? buf->max_pu<<1 : 256;
+                                       buf->pu = (bam_pileup1_t*)realloc(buf->pu, sizeof(bam_pileup1_t) * buf->max_pu);
+                               }
+                               buf->pu[n_pu].b = &p->b;
+                               if (resolve_cigar(buf->pu + n_pu, buf->pos)) ++n_pu; // skip the read if we are looking at BAM_CREF_SKIP
+                       }
+               }
+               buf->head = buf->dummy->next; // dummy->next may be changed
+               if (n_pu) { // then call user defined function
+                       buf->func(buf->tid, buf->pos, n_pu, buf->pu, buf->func_data);
+               }
+               // update tid and pos
+               if (buf->head->next) assert(buf->tid <= buf->head->b.core.tid); // otherwise, not sorted
+               if (buf->tid < buf->head->b.core.tid) { // come to a new reference sequence
+                       buf->tid = buf->head->b.core.tid; buf->pos = buf->head->beg; // jump to the next reference
+               } else if (buf->pos < buf->head->beg) { // here: tid == head->b.core.tid
+                       buf->pos = buf->head->beg; // jump to the next position
+               } else ++buf->pos; // scan contiguously
+               if (buf->is_eof && buf->head->next == 0) break;
+       }
+       return 0;
+}
+
+/* Read every alignment from fp and feed it through a fresh pileup buffer. Always returns 0. */
+int bam_pileup_file(bamFile fp, bam_pileup_f func, void *func_data)
+{
+       bam1_t *rec = (bam1_t*)calloc(1, sizeof(bam1_t));
+       bam_plbuf_t *plbuf = bam_plbuf_init(func, func_data);
+       for (;;) {
+               if (bam_read1(fp, rec) < 0) break; // any negative value ends the stream
+               bam_plbuf_push(rec, plbuf);
+       }
+       bam_plbuf_push(0, plbuf); // signal end of input so the remaining positions are flushed
+       bam_plbuf_destroy(plbuf);
+       free(rec->data);
+       free(rec);
+       return 0;
+}
diff --git a/bam_plcmd.c b/bam_plcmd.c
new file mode 100644 (file)
index 0000000..0140c66
--- /dev/null
@@ -0,0 +1,194 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <ctype.h>
+#include "bam.h"
+#include "faidx.h"
+#include "bam_maqcns.h"
+#include "khash.h"
+KHASH_SET_INIT_INT64(64)
+
+// Bit flags stored in pu_data_t::format.
+#define BAM_PLF_SIMPLE 0x01 // option -s: append a column of per-read mapping qualities
+#define BAM_PLF_CNS 0x02 // option -c: print consensus columns (and indel lines when a reference is loaded)
+
+/* Context handed to pileup_func() through the callback's void* argument. */
+typedef struct {
+       bam_header_t *h; // header of the input; provides target names
+       bam_maqcns_t *c; // consensus caller
+       bam_maqindel_opt_t *ido; // indel caller options
+       faidx_t *fai; // FASTA index of the reference, or 0 when -f was not given
+       khash_t(64) *hash; // set of tid<<32|pos keys to output, or 0 to output every site
+       uint32_t format; // combination of BAM_PLF_* flags
+       int tid, len; // target id and length of the sequence cached in ref (tid starts at -1)
+       char *ref; // cached reference sequence of target tid
+} pu_data_t;
+
+// Helpers defined in other translation units of the project.
+char **bam_load_pos(const char *fn, int *_n); // loads a site list from fn; *_n receives the number of entries
+void bam_init_header_hash(bam_header_t *header);
+int32_t bam_get_tid(const bam_header_t *header, const char *seq_name);
+
+/*
+ * Build the set of sites listed in file fn.
+ * Each list entry is read as a NUL-terminated sequence name immediately followed by a
+ * 32-bit position; the stored key is tid<<32 | (position - 1).
+ */
+static khash_t(64) *load_pos(const char *fn, bam_header_t *h)
+{
+       int n_sites, absent, idx;
+       char **sites;
+       khash_t(64) *set;
+       bam_init_header_hash(h);
+       sites = bam_load_pos(fn, &n_sites);
+       set = kh_init(64);
+       for (idx = 0; idx < n_sites; ++idx) {
+               char *entry = sites[idx];
+               char *after = entry;
+               uint64_t key = (uint64_t)bam_get_tid(h, entry) << 32;
+               while (*after++); // step past the name and its terminating NUL
+               key |= *((uint32_t*)after) - 1;
+               kh_put(64, set, key, &absent);
+               free(entry);
+       }
+       free(sites);
+       return set;
+}
+
+/*
+ * Pileup callback: print one text line for position (tid, pos) covered by the n
+ * reads in pu, optionally followed by an indel line.
+ *
+ * Columns: name, 1-based position, reference base, [consensus, consensus quality,
+ * SNP quality, mapping quality when BAM_PLF_CNS], depth, read bases, base
+ * qualities, [mapping qualities when BAM_PLF_SIMPLE].
+ * data is the pu_data_t context. Always returns 0.
+ */
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{
+       pu_data_t *d = (pu_data_t*)data;
+       bam_maqindel_ret_t *r = 0;
+       int i, j, rb;
+       uint32_t x;
+       // when a site list is loaded, skip every position that is not in it
+       if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0;
+       if (d->fai && (int)tid != d->tid) { // moved to another target: reload the cached reference
+               free(d->ref);
+               d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len);
+               d->tid = tid;
+       }
+       rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+       printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
+       if (d->format & BAM_PLF_CNS) { // consensus
+               int ref_q, rb4 = bam_nt16_table[rb];
+               x = bam_maqcns_call(n, pu, d->c);
+               ref_q = 0;
+               if (rb4 != 15 && x>>28 != 15 && x>>28 != rb4) { // a SNP
+                       ref_q = ((x>>24&0xf) == rb4)? x>>8&0xff : (x>>8&0xff) + (x&0xff);
+                       if (ref_q > 255) ref_q = 255;
+               }
+               printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[x>>28], x>>8&0xff, ref_q, x>>16&0xff);
+               if (d->ref) // indel calling
+                       r = bam_maqindel(n, pos, d->ido, pu, d->ref);
+       }
+       // pileup strings
+       printf("%d\t", n);
+       for (i = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pu + i;
+               if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33);
+               if (!p->is_del) {
+                       int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+                       if (toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+                       // mismatch: letter case encodes the strand (the result used to be discarded)
+                       else c = bam1_strand(p->b)? tolower(c) : toupper(c);
+                       putchar(c);
+                       if (p->indel > 0) { // insertion after this base: print the inserted read bases
+                               printf("+%d", p->indel);
+                               for (j = 1; j <= p->indel; ++j) {
+                                       c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+                                       putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+                               }
+                       } else if (p->indel < 0) { // deletion after this base: print the deleted reference bases
+                               printf("%d", p->indel);
+                               for (j = 1; j <= -p->indel; ++j) {
+                                       c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
+                                       putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+                               }
+                       }
+               } else putchar('*');
+               if (p->is_tail) putchar('$');
+       }
+       putchar('\t');
+       for (i = 0; i < n; ++i) { // base qualities, Phred+33 capped at '~'
+               const bam_pileup1_t *p = pu + i;
+               int c = bam1_qual(p->b)[p->qpos] + 33;
+               if (c > 126) c = 126;
+               putchar(c);
+       }
+       if (d->format & BAM_PLF_SIMPLE) { // mapping qualities, same encoding
+               putchar('\t');
+               for (i = 0; i < n; ++i) {
+                       int c = pu[i].b->core.qual + 33;
+                       if (c > 126) c = 126;
+                       putchar(c);
+               }
+       }
+       putchar('\n');
+       if (r) { // then print indel line
+               printf("%s\t%d\t*\t%s/%s\t", d->h->target_name[tid], pos + 1, r->s1, r->s2);
+               printf("%d\t%d\t%d\t%d\n", r->cnt1, r->cnt2, r->cnt_ambi, r->cnt_anti);
+               bam_maqindel_ret_destroy(r);
+       }
+       return 0;
+}
+
+/*
+ * Entry point of the "pileup" command.
+ *
+ * Parses the options, then piles up either a SAM file (when -t gives the list of
+ * reference sequences) or a BAM file ("-" selects stdin) and prints one line per
+ * covered site through pileup_func(). Returns 0 on success, 1 on usage errors.
+ */
+int bam_pileup(int argc, char *argv[])
+{
+       int c;
+       char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
+       pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
+       d->tid = -1; // no reference sequence cached yet
+       d->c = bam_maqcns_init();
+       while ((c = getopt(argc, argv, "st:f:cT:N:r:l:")) >= 0) {
+               switch (c) {
+               case 's': d->format |= BAM_PLF_SIMPLE; break;
+               case 't': fn_list = strdup(optarg); break;
+               case 'l': fn_pos = strdup(optarg); break;
+               case 'f': fn_fa = strdup(optarg); break;
+               case 'T': d->c->theta = atof(optarg); break;
+               case 'N': d->c->n_hap = atoi(optarg); break;
+               case 'r': d->c->het_rate = atof(optarg); break; // a FLOAT prior: atoi would truncate it to 0
+               case 'c': d->format |= BAM_PLF_CNS; break;
+               default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1;
+               }
+       }
+       if (optind == argc) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:  bamtk pileup [options] <in.bam>|<in.sam>\n\n");
+               fprintf(stderr, "Option: -s        simple (yet incomplete) pileup format\n");
+               fprintf(stderr, "        -t FILE   list of reference sequences (assume the input is in SAM)\n");
+               fprintf(stderr, "        -l FILE   list of sites at which pileup is output\n");
+               fprintf(stderr, "        -f FILE   reference sequence in the FASTA format\n\n");
+               fprintf(stderr, "        -c        output the maq consensus sequence\n");
+               fprintf(stderr, "        -T FLOAT  theta in maq consensus calling model (for -c only) [%f]\n", d->c->theta);
+               fprintf(stderr, "        -N INT    number of haplotypes in the sample (for -c only) [%d]\n", d->c->n_hap);
+               fprintf(stderr, "        -r FLOAT  prior of a difference between any two haplotypes (for -c only) [%f]\n\n",
+                               d->c->het_rate);
+               free(fn_list); free(fn_fa); free(fn_pos); free(d);
+               return 1;
+       }
+       if (fn_fa) d->fai = fai_load(fn_fa);
+       free(fn_fa);
+       bam_maqcns_prepare(d->c); // must follow option parsing: it uses the final model parameters
+       d->ido = bam_maqindel_opt_init();
+       if (fn_list) { // SAM input: the header comes from the -t list
+               tamFile fp;
+               bam1_t *b;
+               int ret;
+               bam_plbuf_t *buf = bam_plbuf_init(pileup_func, d);
+               d->h = sam_header_read2(fn_list);
+               if (fn_pos) d->hash = load_pos(fn_pos, d->h);
+               fp = sam_open(argv[optind]);
+               b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               while ((ret = sam_read1(fp, d->h, b)) >= 0)
+                       bam_plbuf_push(b, buf);
+               bam_plbuf_push(0, buf);
+               bam_plbuf_destroy(buf);
+               bam_destroy1(b);
+               sam_close(fp);
+       } else { // BAM input
+               bamFile fp;
+               fp = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r");
+               d->h = bam_header_read(fp);
+               if (fn_pos) d->hash = load_pos(fn_pos, d->h);
+               bam_pileup_file(fp, pileup_func, d);
+               bam_close(fp);
+       }
+       free(fn_pos); free(fn_list);
+       kh_destroy(64, d->hash);
+       bam_header_destroy(d->h);
+       if (d->fai) fai_destroy(d->fai);
+       bam_maqcns_destroy(d->c);
+       free(d->ido); free(d->ref); free(d);
+       return 0;
+}
diff --git a/bam_sort.c b/bam_sort.c
new file mode 100644 (file)
index 0000000..c5ed583
--- /dev/null
@@ -0,0 +1,229 @@
+#include <stdlib.h>
+#include <ctype.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include "bam.h"
+#include "ksort.h"
+
+// Sort key selector read by the comparison functions: non-zero compares by query name first.
+// It is a file-level variable because the ksort comparators take no context argument.
+static int g_is_by_qname = 0;
+
+/*
+ * Compare two strings, treating embedded runs of digits as numbers
+ * (so "read2" sorts before "read10").
+ *
+ * Returns -1, 0 or 1. When the strings are numerically equal but one consumed
+ * more characters (e.g. "a01" vs "a1"), the longer prefix sorts later.
+ */
+static inline int strnum_cmp(const char *a, const char *b)
+{
+       char *pa, *pb;
+       pa = (char*)a; pb = (char*)b;
+       while (*pa && *pb) {
+               // isdigit() requires a value representable as unsigned char
+               if (isdigit((unsigned char)*pa) && isdigit((unsigned char)*pb)) {
+                       long ai, bi;
+                       ai = strtol(pa, &pa, 10); // also advances pa past the digit run
+                       bi = strtol(pb, &pb, 10);
+                       if (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0;
+               } else {
+                       if (*pa != *pb) break;
+                       ++pa; ++pb;
+               }
+       }
+       if (*pa == *pb) // both strings are exhausted: break the tie on consumed length
+               return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0;
+       return *pa<*pb? -1 : *pa>*pb? 1 : 0;
+}
+
+// Key of a heap entry whose input file is exhausted; it compares greater than every other key.
+#define HEAP_EMPTY 0xffffffffffffffffull
+
+/* One entry of the merge heap: the pending record of input file i. */
+typedef struct {
+       int i; // index of the input file this entry reads from
+       uint64_t pos; // sort key: tid<<32 | pos<<1 | strand, or HEAP_EMPTY
+       bam1_t *b; // the pending record
+} heap1_t;
+
+/*
+ * Heap ordering predicate. It is deliberately inverted (true when a sorts AFTER b)
+ * so that the ksort heap keeps the smallest entry at the top.
+ */
+static inline int heap_lt(const heap1_t a, const heap1_t b)
+{
+       int order;
+       if (!g_is_by_qname) return (a.pos > b.pos);
+       order = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
+       if (order != 0) return (order > 0);
+       return (a.pos > b.pos); // equal names: fall back to the coordinate key
+}
+
+KSORT_INIT(heap, heap1_t, heap_lt)
+
+/*
+ * Merge n sorted BAM files fn[0..n-1] into out ("-" selects stdout).
+ *
+ * by_qname selects the sort order the inputs are in (query name vs. coordinate).
+ * All inputs must carry the same target sequences; the header of fn[0] is written
+ * to the output. Aborts when a file cannot be opened or the headers disagree.
+ */
+void bam_merge_core(int by_qname, const char *out, int n, char * const *fn)
+{
+       bamFile fpout, *fp;
+       heap1_t *heap;
+       bam_header_t *hout = 0;
+       int i, j;
+
+       g_is_by_qname = by_qname;
+       fp = (bamFile*)calloc(n, sizeof(bamFile));
+       heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+       for (i = 0; i != n; ++i) {
+               heap1_t *h;
+               bam_header_t *hin;
+               // open outside of assert(): the call must survive builds with NDEBUG
+               fp[i] = bam_open(fn[i], "r");
+               if (fp[i] == 0) {
+                       fprintf(stderr, "[bam_merge_core] fail to open file '%s'. Abort!\n", fn[i]);
+                       abort();
+               }
+               hin = bam_header_read(fp[i]);
+               if (i == 0) hout = hin;
+               else { // validate the header against the first file
+                       if (hout->n_targets != hin->n_targets) {
+                               fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]);
+                               abort();
+                       }
+                       for (j = 0; j < hout->n_targets; ++j) {
+                               if (strcmp(hout->target_name[j], hin->target_name[j]) || hout->target_len[j] != hin->target_len[j]) {
+                                       fprintf(stderr, "[bam_merge_core] file '%s' has a different target sequence. Abort!\n", fn[i]);
+                                       abort();
+                               }
+                       }
+                       bam_header_destroy(hin);
+               }
+               // prime the heap with the first record of this file
+               h = heap + i;
+               h->i = i;
+               h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               if (bam_read1(fp[i], h->b) >= 0)
+                       h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b);
+               else h->pos = HEAP_EMPTY;
+       }
+       fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w");
+       assert(fpout);
+       bam_header_write(fpout, hout);
+       bam_header_destroy(hout);
+
+       ks_heapmake(heap, n, heap);
+       while (heap->pos != HEAP_EMPTY) { // heap[0] is always the smallest pending record
+               bam1_t *b = heap->b;
+               bam_write1_core(fpout, &b->core, b->data_len, b->data);
+               if ((j = bam_read1(fp[heap->i], b)) >= 0)
+                       heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b);
+               else {
+                       if (j != -1) fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
+                       // retire this input in both cases; otherwise the stale record would be written again
+                       heap->pos = HEAP_EMPTY;
+               }
+               ks_heapadjust(heap, 0, n, heap);
+       }
+
+       for (i = 0; i != n; ++i) {
+               bam_close(fp[i]);
+               free(heap[i].b->data);
+               free(heap[i].b);
+       }
+       bam_close(fpout);
+       free(fp); free(heap);
+}
+/*
+ * Entry point of the "merge" command: merge [-n] <out.bam> <in1.bam> <in2.bam> [...].
+ * Returns 0 on success, 1 when fewer than three file arguments are given.
+ */
+int bam_merge(int argc, char *argv[])
+{
+       int c, is_by_qname = 0;
+       while ((c = getopt(argc, argv, "n")) >= 0) {
+               switch (c) {
+               case 'n': is_by_qname = 1; break;
+               }
+       }
+       // need the output plus at least two inputs; ">=" used to reject exactly that minimal case
+       if (optind + 3 > argc) {
+               fprintf(stderr, "Usage: samtools merge [-n] <out.bam> <in1.bam> <in2.bam> [...]\n");
+               return 1;
+       }
+       bam_merge_core(is_by_qname, argv[optind], argc - optind - 1, argv + optind + 1);
+       return 0;
+}
+
+typedef bam1_t *bam1_p;
+
+/* Strict "a sorts before b" predicate used by the in-memory merge sort. */
+static inline int bam1_lt(const bam1_p a, const bam1_p b)
+{
+       uint64_t key_a = ((uint64_t)a->core.tid<<32|a->core.pos);
+       uint64_t key_b = ((uint64_t)b->core.tid<<32|b->core.pos);
+       if (g_is_by_qname) {
+               int order = strnum_cmp(bam1_qname(a), bam1_qname(b));
+               if (order != 0) return (order < 0);
+       }
+       return (key_a < key_b); // coordinate order, also the tie-break for equal names
+}
+KSORT_INIT(sort, bam1_p, bam1_lt)
+
+/*
+ * Sort the k records in buf and write them, preceded by header h, to
+ * "<prefix>.<n as 4 digits>.bam", or to "<prefix>.bam" when n is negative.
+ * Aborts when the output file cannot be created.
+ */
+static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h)
+{
+       char *name;
+       int i;
+       bamFile fp;
+       ks_mergesort(sort, k, buf, 0);
+       name = (char*)calloc(strlen(prefix) + 20, 1);
+       if (n >= 0) sprintf(name, "%s.%.4d.bam", prefix, n);
+       else sprintf(name, "%s.bam", prefix);
+       // open outside of assert(): the call must survive builds with NDEBUG
+       fp = bam_open(name, "w");
+       if (fp == 0) {
+               fprintf(stderr, "[sort_blocks] fail to create file '%s'. Abort!\n", name);
+               abort();
+       }
+       free(name);
+       bam_header_write(fp, h);
+       for (i = 0; i < k; ++i)
+               bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data);
+       bam_close(fp);
+}
+
+/*
+ * Sort the BAM file fn ("-" selects stdin) into "<prefix>.bam".
+ *
+ * Records are accumulated until about max_mem bytes have been read, then each
+ * chunk is sorted and written to a temporary "<prefix>.NNNN.bam". If more than
+ * one chunk was needed the temporaries are merged and removed.
+ * is_by_qname selects sorting by query name instead of by coordinate.
+ */
+void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem)
+{
+       int n, ret, k, i;
+       size_t mem;
+       bam_header_t *header;
+       bamFile fp;
+       bam1_t *b, **buf;
+
+       g_is_by_qname = is_by_qname;
+       n = k = 0; mem = 0; // n: chunks written so far; k: records in the current chunk; mem: bytes in it
+       fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
+       assert(fp);
+       header = bam_header_read(fp);
+       buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*));
+       // write sub files
+       for (;;) {
+               if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); // records are allocated lazily and reused across chunks
+               b = buf[k];
+               if ((ret = bam_read1(fp, b)) < 0) break;
+               mem += ret;
+               ++k;
+               if (mem >= max_mem) {
+                       sort_blocks(n++, k, buf, prefix, header);
+                       mem = 0; k = 0;
+               }
+       }
+       if (ret != -1)
+               fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
+       if (n == 0) sort_blocks(-1, k, buf, prefix, header); // everything fitted in memory: write the final file directly
+       else { // then merge
+               char **fns, *fnout;
+               fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1);
+               sort_blocks(n++, k, buf, prefix, header);
+               fnout = (char*)calloc(strlen(prefix) + 20, 1);
+               sprintf(fnout, "%s.bam", prefix);
+               fns = (char**)calloc(n, sizeof(char*));
+               for (i = 0; i < n; ++i) {
+                       fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
+                       sprintf(fns[i], "%s.%.4d.bam", prefix, i);
+               }
+               // the merge must use the same ordering the chunks were sorted with
+               bam_merge_core(is_by_qname, fnout, n, fns);
+               free(fnout);
+               for (i = 0; i < n; ++i) {
+                       unlink(fns[i]);
+                       free(fns[i]);
+               }
+               free(fns);
+       }
+       for (k = 0; k < max_mem / BAM_CORE_SIZE; ++k) {
+               if (buf[k]) {
+                       free(buf[k]->data);
+                       free(buf[k]);
+               }
+       }
+       free(buf);
+       bam_header_destroy(header);
+       bam_close(fp);
+}
+
+/*
+ * Entry point of the "sort" command: sort [-n] [-m <maxMem>] <in> <out.prefix>.
+ * Returns 0 on success, 1 when an argument is missing.
+ */
+int bam_sort(int argc, char *argv[])
+{
+       int opt, by_name = 0;
+       size_t mem_limit = 500000000; // default memory budget in bytes
+       while ((opt = getopt(argc, argv, "nm:")) >= 0) {
+               if (opt == 'n') by_name = 1;
+               else if (opt == 'm') mem_limit = atol(optarg);
+       }
+       if (argc - optind < 2) {
+               fprintf(stderr, "Usage: samtools sort [-n] [-m <maxMem>] <in.baf> <out.prefix>\n");
+               return 1;
+       }
+       bam_sort_core(by_name, argv[optind], argv[optind+1], mem_limit);
+       return 0;
+}
diff --git a/bam_tview.c b/bam_tview.c
new file mode 100644 (file)
index 0000000..3dfb201
--- /dev/null
@@ -0,0 +1,315 @@
+#ifndef _NO_CURSES
+#include <curses.h>
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include "bam.h"
+#include "faidx.h"
+#include "bam_maqcns.h"
+
+#define TV_MIN_ALNROW 2 // screen row added to a read's level to place it (rows 0-2 hold coordinates, reference, consensus)
+#define TV_MAX_GOTO  40 // used to size the "Goto" window
+#define TV_LOW_MAPQ  10 // NOTE(review): not referenced in the visible code -- confirm usage
+
+// Values of tview_t::color_for: which property selects the colour of a displayed base.
+#define TV_COLOR_MAPQ  0
+#define TV_COLOR_BASEQ 1
+#define TV_COLOR_NUCL  2
+
+/* State of the text alignment viewer. */
+typedef struct {
+       int mrow, mcol; // screen size in rows and columns
+       WINDOW *wgoto, *whelp; // pop-up windows
+
+       bam_index_t *idx; // index of the BAM file
+       bam_lplbuf_t *lplbuf; // pileup buffer that drives tv_pl_func()
+       bam_header_t *header;
+       bamFile fp;
+       int curr_tid, left_pos; // target and 0-based position shown in the leftmost column
+       faidx_t *fai; // reference index, or 0 when no FASTA was given
+       bam_maqcns_t *bmc; // consensus caller
+
+       // ccol: next screen column to draw; last_pos: last reference position drawn;
+       // row_shift: subtracted from a read's level to obtain its screen row;
+       // color_for: one of TV_COLOR_*; is_nucl: when non-zero, matching bases are shown as
+       // letters instead of '.'/','; l_ref: length of ref
+       int ccol, last_pos, row_shift, color_for, is_nucl, l_ref;
+       char *ref; // reference sequence of the displayed window; index 0 corresponds to left_pos
+} tview_t;
+
+// Maps a 4-bit base code to 0..3 for the single-base codes 1, 2, 4 and 8, and to 4 for every other code.
+char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+/*
+ * Pileup callback of the viewer: draw the screen column(s) for reference position pos.
+ *
+ * Row 0 carries coordinates, row 1 the reference, row 2 the consensus, and reads are
+ * drawn below at TV_MIN_ALNROW + level - row_shift. An insertion of length L widens
+ * the position to L+1 columns; reads without the insertion are padded with '*'.
+ * data is the tview_t. Always returns 0.
+ */
+int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
+{
+       tview_t *tv = (tview_t*)data;
+       int i, j, c, rb, attr, max_ins = 0;
+       uint32_t call = 0;
+       if (pos < tv->left_pos || tv->ccol > tv->mcol) return 0; // out of screen
+       // print reference
+       rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N';
+       for (i = tv->last_pos + 1; i < pos; ++i) { // fill the uncovered positions since the last drawn one
+               if (i%10 == 0) mvprintw(0, tv->ccol, "%-d", i+1);
+               // same bound as for rb: the fetched window may be shorter than the screen
+               c = (tv->ref && i - tv->left_pos < tv->l_ref)? tv->ref[i - tv->left_pos] : 'N';
+               mvaddch(1, tv->ccol++, c);
+       }
+       if (pos%10 == 0) mvprintw(0, tv->ccol, "%-d", pos+1);
+       // print consensus
+       call = bam_maqcns_call(n, pl, tv->bmc);
+       attr = A_UNDERLINE;
+       c = ",ACMGRSVTWYHKDBN"[call>>28&0xf];
+       i = (call>>8&0xff)/10+1; // colour pair 1..4 from the consensus quality
+       if (i > 4) i = 4;
+       attr |= COLOR_PAIR(i);
+       if (c == toupper(rb)) c = '.';
+       attron(attr);
+       mvaddch(2, tv->ccol, c);
+       attroff(attr);
+       // calculate maximum insert
+       for (i = 0; i < n; ++i) {
+               const bam_pileup1_t *p = pl + i;
+               if (p->indel > 0 && max_ins < p->indel) max_ins = p->indel;
+       }
+       // core loop
+       for (j = 0; j <= max_ins; ++j) { // j == 0: the aligned base; j > 0: the j-th inserted column
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = pl + i;
+                       int row = TV_MIN_ALNROW + p->level - tv->row_shift;
+                       if (j == 0) {
+                               if (!p->is_del) {
+                                       c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+                                       if (!tv->is_nucl && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
+                               } else c = '*';
+                       } else { // padding
+                               if (j > p->indel) c = '*';
+                               else { // insertion
+                                       c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+                               }
+                       }
+                       if (row > TV_MIN_ALNROW && row < tv->mrow) {
+                               int x;
+                               attr = 0;
+                               if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR))
+                                       || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE;
+                               if (tv->color_for == TV_COLOR_BASEQ) {
+                                       x = bam1_qual(p->b)[p->qpos]/10 + 1;
+                                       if (x > 4) x = 4;
+                                       attr |= COLOR_PAIR(x);
+                               } else if (tv->color_for == TV_COLOR_MAPQ) {
+                                       x = p->b->core.qual/10 + 1;
+                                       if (x > 4) x = 4;
+                                       attr |= COLOR_PAIR(x);
+                               } else if (tv->color_for == TV_COLOR_NUCL) {
+                                       x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5;
+                                       attr |= COLOR_PAIR(x);
+                               }
+                               attron(attr);
+                               mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c));
+                               attroff(attr);
+                       }
+               }
+               c = j? '*' : rb; // reference row: the base itself, then '*' padding under insertions
+               if (c == '*') {
+                       attr = COLOR_PAIR(8);
+                       attron(attr);
+                       mvaddch(1, tv->ccol++, c);
+                       attroff(attr);
+               } else mvaddch(1, tv->ccol++, c);
+       }
+       tv->last_pos = pos;
+       return 0;
+}
+
+/*
+ * Create the viewer for BAM file fn, with an optional indexed FASTA reference fn_fa
+ * (may be 0), and switch the terminal into curses mode.
+ * The result must be released with tv_destroy().
+ */
+tview_t *tv_init(const char *fn, const char *fn_fa)
+{
+       tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t));
+       tv->idx = bam_index_load(fn);
+       tv->fp = bam_open(fn, "r");
+       assert(tv->fp);
+       tv->header = bam_header_read(tv->fp);
+       tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv);
+       if (fn_fa) tv->fai = fai_load(fn_fa);
+       tv->bmc = bam_maqcns_init();
+       bam_maqcns_prepare(tv->bmc);
+
+       // curses set-up
+       initscr();
+       keypad(stdscr, TRUE);
+    clear();
+    noecho();
+    cbreak();
+#ifdef NCURSES_VERSION
+       getmaxyx(stdscr, tv->mrow, tv->mcol);
+#else
+       // NOTE(review): fixed fallback of 80 rows x 40 columns -- the two values look swapped; confirm
+       tv->mrow = 80; tv->mcol = 40;
+#endif
+       tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5);
+       tv->whelp = newwin(22,40, 5, 5);
+       tv->color_for = TV_COLOR_MAPQ;
+       start_color();
+       // pairs 1-4 are selected from a quality value (quality/10 + 1, capped at 4)
+       init_pair(1, COLOR_BLUE, COLOR_BLACK);
+       init_pair(2, COLOR_GREEN, COLOR_BLACK);
+       init_pair(3, COLOR_YELLOW, COLOR_BLACK);
+       init_pair(4, COLOR_WHITE, COLOR_BLACK);
+       // pairs 5-9 are selected by nucleotide (bam_nt16_nt4_table value + 5); pair 8 also marks '*' padding in the reference row
+       init_pair(5, COLOR_GREEN, COLOR_BLACK);
+       init_pair(6, COLOR_CYAN, COLOR_BLACK);
+       init_pair(7, COLOR_YELLOW, COLOR_BLACK);
+       init_pair(8, COLOR_RED, COLOR_BLACK);
+       init_pair(9, COLOR_BLUE, COLOR_BLACK);
+       return tv;
+}
+
+/* Leave curses mode and release everything owned by the viewer. */
+void tv_destroy(tview_t *tv)
+{
+       // screen first
+       delwin(tv->wgoto);
+       delwin(tv->whelp);
+       endwin();
+
+       // then the data structures
+       bam_lplbuf_destroy(tv->lplbuf);
+       bam_maqcns_destroy(tv->bmc);
+       bam_index_destroy(tv->idx);
+       if (tv->fai) {
+               fai_destroy(tv->fai);
+       }
+       free(tv->ref);
+       bam_header_destroy(tv->header);
+       bam_close(tv->fp);
+       free(tv);
+}
+
+/* bam_fetch() callback: hand each fetched alignment to the pileup buffer of the
+ * viewer passed through `data`. Always returns 0. */
+int tv_fetch_func(const bam1_t *b, void *data)
+{
+       bam_lplbuf_push(b, ((tview_t*)data)->lplbuf);
+       return 0;
+}
+
+/* Redraw the screen so that column 0 shows position `pos` of target `tid`.
+ * Clears the screen, resets the drawing state, refetches the reference slice
+ * (when a FASTA index is loaded) and pushes every alignment overlapping the
+ * window through the pileup buffer. Always returns 0. */
+int tv_draw_aln(tview_t *tv, int tid, int pos)
+{
+       // reset
+       clear();
+       tv->curr_tid = tid; tv->left_pos = pos;
+       tv->last_pos = tv->left_pos - 1;
+       tv->ccol = 0;
+       // print ref and consensus
+       if (tv->fai) {
+               const char *name = tv->header->target_name[tv->curr_tid];
+               // room for "name:beg-end" with two decimal ints
+               size_t cap = strlen(name) + 30;
+               char *str = (char*)calloc(cap, 1);
+               free(tv->ref);
+               tv->ref = 0;
+               tv->l_ref = 0;
+               if (str) {
+                       // region string is 1-based and inclusive
+                       snprintf(str, cap, "%s:%d-%d", name, tv->left_pos + 1, tv->left_pos + tv->mcol);
+                       tv->ref = fai_fetch(tv->fai, str, &tv->l_ref);
+                       free(str);
+               }
+       }
+       // draw aln
+       bam_lplbuf_reset(tv->lplbuf);
+       bam_fetch(tv->fp, tv->idx, tv->curr_tid, tv->left_pos, tv->left_pos + tv->mcol, tv, tv_fetch_func);
+       // a null record flushes the pileup buffer
+       bam_lplbuf_push(0, tv->lplbuf);
+       return 0;
+}
+
+/* Prompt for a region in the "goto" window.
+ * On Enter with a region that bam_parse_region() resolves to a target, stores
+ * the target id and start in *tid / *pos and returns. Escape returns without
+ * touching them. Backspace deletes one character, ctrl-W clears the input. */
+static void tv_win_goto(tview_t *tv, int *tid, int *pos)
+{
+       char str[256];
+       int i, l = 0;
+       str[0] = '\0'; // Enter may arrive before anything was typed
+       wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+');
+       mvwprintw(tv->wgoto, 1, 2, "Goto: ");
+       for (;;) {
+               int c = wgetch(tv->wgoto);
+               wrefresh(tv->wgoto);
+               if (c == KEY_BACKSPACE || c == '\010' || c == '\177') {
+                       if (l > 0) --l; // never step before the start of the buffer
+               } else if (c == KEY_ENTER || c == '\012' || c == '\015') {
+                       int _tid = -1, _beg, _end;
+                       bam_parse_region(tv->header, str, &_tid, &_beg, &_end);
+                       if (_tid >= 0) {
+                               *tid = _tid; *pos = _beg;
+                               return;
+                       }
+               } else if (c >= 0 && c <= 0xff && isgraph(c)) {
+                       // curses function-key codes are above 0xff and must not reach isgraph()
+                       if (l < TV_MAX_GOTO && l < (int)sizeof(str) - 1) str[l++] = c;
+               } else if (c == '\027') l = 0;
+               else if (c == '\033') return;
+               str[l] = '\0';
+               for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' ');
+               mvwprintw(tv->wgoto, 1, 8, "%s", str);
+       }
+}
+
+/* Draw the key-binding help into the help window and wait for one key press. */
+static void tv_win_help(tview_t *tv) {
+       /* One entry per window row, starting at row 1; a null entry leaves the row blank. */
+       static const char *const rows[] = {
+               "        -=-    Help    -=- ",
+               0,
+               "?          This window",
+               "Arrows     Small scroll movement",
+               "h,j,k,l    Small scroll movement",
+               "H,J,K,L    Large scroll movement",
+               "ctrl-H     Scroll 1k left",
+               "ctrl-L     Scroll 1k right",
+               "space      Scroll one screen",
+               "backspace  Scroll back one screen",
+               "g          Go to specific location",
+               "b          Color for base quality",
+               "m          Color for mapping qual",
+               "n          Color for nucleotide",
+               ".          Toggle on/off dot view",
+               "q          Exit",
+               0,
+               "Underline:      Secondary or orphan",
+               "Blue:    0-9    Green: 10-19",
+               "Yellow: 20-29   White: >=30",
+       };
+       const int n_rows = (int)(sizeof(rows) / sizeof(rows[0]));
+       WINDOW *win = tv->whelp;
+       int i;
+       wborder(win, '|', '|', '-', '-', '+', '+', '+', '+');
+       for (i = 0; i < n_rows; ++i) {
+               if (rows[i] != 0) mvwprintw(win, i + 1, 2, "%s", rows[i]);
+       }
+       wrefresh(win);
+       wgetch(win);
+}
+
+/* Interactive key loop: translate each key press into a change of position,
+ * row shift or colouring mode and redraw; returns on 'q' or Escape. */
+void tv_loop(tview_t *tv)
+{
+       int tid = tv->curr_tid;
+       int pos = tv->left_pos;
+       for (;;) {
+               const int key = getch();
+               switch (key) {
+               case '?':
+                       tv_win_help(tv);
+                       break;
+               case '\033':
+               case 'q':
+                       return;
+               case 'g':
+                       tv_win_goto(tv, &tid, &pos);
+                       break;
+               case 'b':
+                       tv->color_for = TV_COLOR_BASEQ;
+                       break;
+               case 'm':
+                       tv->color_for = TV_COLOR_MAPQ;
+                       break;
+               case 'n':
+                       tv->color_for = TV_COLOR_NUCL;
+                       break;
+               case KEY_LEFT:
+               case 'h':
+                       pos -= 1;
+                       break;
+               case KEY_RIGHT:
+               case 'l':
+                       pos += 1;
+                       break;
+               case KEY_SLEFT:
+               case 'H':
+                       pos -= 20;
+                       break;
+               case KEY_SRIGHT:
+               case 'L':
+                       pos += 20;
+                       break;
+               case '.':
+                       tv->is_nucl = !tv->is_nucl;
+                       break;
+               case '\010':
+                       pos -= 1000;
+                       break;
+               case '\014':
+                       pos += 1000;
+                       break;
+               case ' ':
+                       pos += tv->mcol;
+                       break;
+               case KEY_UP:
+               case 'j':
+                       tv->row_shift -= 1;
+                       break;
+               case KEY_DOWN:
+               case 'k':
+                       tv->row_shift += 1;
+                       break;
+               case KEY_BACKSPACE:
+               case '\177':
+                       pos -= tv->mcol;
+                       break;
+#ifdef KEY_RESIZE
+               case KEY_RESIZE:
+                       getmaxyx(stdscr, tv->mrow, tv->mcol);
+                       break;
+#endif
+               default:
+                       continue; /* unbound key: wait for the next one without redrawing */
+               }
+               /* clamp before redrawing */
+               if (pos < 0) pos = 0;
+               if (tv->row_shift < 0) tv->row_shift = 0;
+               tv_draw_aln(tv, tid, pos);
+       }
+}
+
+/* Entry point of the `tview` sub-command: argv[1] is the BAM file, argv[2] the
+ * optional reference FASTA. Returns 1 after printing usage, 0 otherwise. */
+int bam_tview_main(int argc, char *argv[])
+{
+       const char *fn_fa;
+       tview_t *tv;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: bamtk tview <aln.bam> [ref.fasta]\n");
+               return 1;
+       }
+       fn_fa = (argc == 2)? 0 : argv[2];
+       tv = tv_init(argv[1], fn_fa);
+       tv_draw_aln(tv, 0, 0); /* start at the first base of the first target */
+       tv_loop(tv);
+       tv_destroy(tv);
+       return 0;
+}
+#endif
diff --git a/bamtk.c b/bamtk.c
new file mode 100644 (file)
index 0000000..54ef455
--- /dev/null
+++ b/bamtk.c
@@ -0,0 +1,112 @@
+#include <stdio.h>
+#include <unistd.h>
+#include "bam.h"
+
+#ifndef PACKAGE_VERSION
+#define PACKAGE_VERSION "0.1.1"
+#endif
+
+int bam_taf2baf(int argc, char *argv[]);
+int bam_pileup(int argc, char *argv[]);
+int bam_merge(int argc, char *argv[]);
+int bam_index(int argc, char *argv[]);
+int bam_sort(int argc, char *argv[]);
+int bam_tview_main(int argc, char *argv[]);
+int faidx_main(int argc, char *argv[]);
+
+/* bam_fetch() callback: print one alignment through bam_view1(), using the
+ * header passed in `data`. */
+static int view_aux(const bam1_t *b, void *data)
+{
+       const bam_header_t *header = (bam_header_t*)data;
+       bam_view1(header, b);
+       return 0;
+}
+/* bam_fetch() callback: write one alignment to the BAM handle passed in `data`. */
+static int view_auxb(const bam1_t *b, void *data)
+{
+       bamFile out = (bamFile)data;
+       bam_write1(out, b);
+       return 0;
+}
+
+/* `view` sub-command: dump a BAM file, or the given regions of it, either as
+ * text (default) or as BAM on stdout (-b). "-" as input reads from stdin.
+ * Returns 0 on success and 1 on a usage or I/O set-up error. */
+int bam_view(int argc, char *argv[])
+{
+       bamFile fp, fpout = 0;
+       bam_header_t *header;
+       int c, is_bam = 0, status = 0;
+       while ((c = getopt(argc, argv, "b")) >= 0) {
+               switch (c) {
+               case 'b': is_bam = 1; break;
+               // getopt() returns '?' here; the offending letter is in optopt
+               default: fprintf(stderr, "Unrecognized option: -%c\n", optopt); return 1;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "Usage: samtools view [-b] <in.bam> [<region> [...]]\n");
+               return 1;
+       }
+       fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+       if (fp == 0) {
+               fprintf(stderr, "[bam_view] fail to open '%s' for reading\n", argv[optind]);
+               return 1;
+       }
+       header = bam_header_read(fp);
+       if (is_bam) {
+               // keep the open out of assert(): it would disappear under NDEBUG
+               fpout = bam_dopen(fileno(stdout), "w");
+               if (fpout == 0) {
+                       fprintf(stderr, "[bam_view] fail to open stdout for writing\n");
+                       bam_header_destroy(header);
+                       bam_close(fp);
+                       return 1;
+               }
+               bam_header_write(fpout, header);
+       }
+       if (optind + 1 == argc) { // no region: stream the whole file
+               bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
+               if (b == 0) {
+                       fprintf(stderr, "[bam_view] out of memory\n");
+                       status = 1;
+               } else {
+                       int ret;
+                       // honour -b here as well, exactly like the region branch below
+                       while ((ret = bam_read1(fp, b)) >= 0) {
+                               if (is_bam) view_auxb(b, fpout);
+                               else view_aux(b, header);
+                       }
+                       if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. (%d)\n", ret);
+                       free(b->data); free(b);
+               }
+       } else {
+               int i;
+               bam_index_t *idx;
+               idx = bam_index_load(argv[optind]);
+               if (idx == 0) {
+                       fprintf(stderr, "[bam_view] fail to load the index of '%s'\n", argv[optind]);
+                       status = 1;
+               } else {
+                       for (i = optind + 1; i < argc; ++i) {
+                               int tid = -1, beg = 0, end = 0;
+                               bam_parse_region(header, argv[i], &tid, &beg, &end);
+                               if (tid < 0) { // region did not resolve to a target
+                                       fprintf(stderr, "[bam_view] fail to parse region '%s'\n", argv[i]);
+                                       status = 1;
+                                       continue;
+                               }
+                               if (is_bam) bam_fetch(fp, idx, tid, beg, end, fpout, view_auxb);
+                               else bam_fetch(fp, idx, tid, beg, end, header, view_aux);
+                       }
+                       bam_index_destroy(idx);
+               }
+       }
+       bam_header_destroy(header);
+       bam_close(fp);
+       if (is_bam) bam_close(fpout);
+       return status;
+}
+
+/* Print the top-level command summary to stderr; always returns 1 so callers
+ * can `return usage();`. */
+static int usage(void)
+{
+       fputs("\n"
+             "Program: samtools (Tools for alignments in the SAM format)\n", stderr);
+       fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION);
+       fputs("Usage:   samtools <command> [options]\n\n"
+             "Command: import      import from the text format\n"
+             "         view        export to the text format\n"
+             "         sort        sort alignment file\n"
+             "         merge       merge multiple sorted alignment files\n"
+             "         pileup      generate pileup output\n"
+             "         faidx       index/extract FASTA\n", stderr);
+#ifndef _NO_CURSES
+       fputs("         tview       text alignment viewer\n", stderr);
+#endif
+       fputs("         index       index alignment\n"
+             "\n", stderr);
+       return 1;
+}
+
+/* Dispatch argv[1] to the matching sub-command, passing it the remaining
+ * arguments; prints the usage when no command is given. */
+int main(int argc, char *argv[])
+{
+       static const struct {
+               const char *name;
+               int (*run)(int, char **);
+       } commands[] = {
+               { "view", bam_view },
+               { "import", bam_taf2baf },
+               { "pileup", bam_pileup },
+               { "merge", bam_merge },
+               { "sort", bam_sort },
+               { "index", bam_index },
+               { "faidx", faidx_main },
+#ifndef _NO_CURSES
+               { "tview", bam_tview_main },
+#endif
+       };
+       unsigned k;
+       if (argc < 2) return usage();
+       for (k = 0; k < sizeof(commands) / sizeof(commands[0]); ++k) {
+               if (strcmp(argv[1], commands[k].name) == 0)
+                       return commands[k].run(argc-1, argv+1);
+       }
+       fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]);
+       return 1;
+}
diff --git a/bgzf.c b/bgzf.c
new file mode 100644 (file)
index 0000000..4314c70
--- /dev/null
+++ b/bgzf.c
@@ -0,0 +1,488 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "bgzf.h"
+
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+
+// Signed 8-bit type used for the raw block buffers in this file.
+typedef int8_t byte;
+
+// Sizes, in bytes, of the uncompressed and compressed block buffers.
+static const int DEFAULT_BLOCK_SIZE = 64 * 1024;
+static const int MAX_BLOCK_SIZE = 64 * 1024;
+
+// Bytes written before the deflate data (header plus extra field) and after it
+// (CRC32 plus input length).
+static const int BLOCK_HEADER_LENGTH = 18;
+static const int BLOCK_FOOTER_LENGTH = 8;
+
+// Values stored in the fixed part of each block header by deflate_block().
+static const int GZIP_ID1 = 31;
+static const int GZIP_ID2 = 139;
+static const int CM_DEFLATE = 8;
+static const int FLG_FEXTRA = 4;
+static const int OS_UNKNOWN = 255;
+static const int BGZF_ID1 = 66; // 'B'
+static const int BGZF_ID2 = 67; // 'C'
+static const int BGZF_LEN = 2;
+static const int BGZF_XLEN = 6; // BGZF_LEN+4
+
+// Parameters passed to deflateInit2()/inflateInit2().
+static const int GZIP_WINDOW_BITS = -15; // no zlib header
+static const int Z_DEFAULT_MEM_LEVEL = 8;
+
+
+// Store the low 16 bits of `value` at buffer[0..1], least significant byte first.
+// `static` gives the helper internal linkage: a plain C99 `inline` definition
+// provides no external definition, so non-inlined calls would fail to link.
+static inline
+void
+packInt16(uint8_t* buffer, uint16_t value)
+{
+    buffer[0] = (uint8_t)(value & 0xff);
+    buffer[1] = (uint8_t)(value >> 8);
+}
+
+// Read a 16-bit little-endian value from buffer[0..1]; the result is 0..65535.
+// `static` gives the helper internal linkage: a plain C99 `inline` definition
+// provides no external definition, so non-inlined calls would fail to link.
+static inline
+int
+unpackInt16(const uint8_t* buffer)
+{
+    return (buffer[0] | (buffer[1] << 8));
+}
+
+// Store `value` at buffer[0..3], least significant byte first.
+// `static` gives the helper internal linkage: a plain C99 `inline` definition
+// provides no external definition, so non-inlined calls would fail to link.
+static inline
+void
+packInt32(uint8_t* buffer, uint32_t value)
+{
+    buffer[0] = (uint8_t)(value & 0xff);
+    buffer[1] = (uint8_t)((value >> 8) & 0xff);
+    buffer[2] = (uint8_t)((value >> 16) & 0xff);
+    buffer[3] = (uint8_t)(value >> 24);
+}
+
+// Return the smaller of two ints.
+// `static` gives the helper internal linkage: a plain C99 `inline` definition
+// provides no external definition, so non-inlined calls would fail to link.
+static inline
+int
+min(int x, int y)
+{
+    return (x < y) ? x : y;
+}
+
+// Record `message` as the handle's current error; the pointer is stored, not copied.
+static
+void
+report_error(BGZF* fp, const char* message)
+{
+    fp->error = message;
+}
+
+// Build a read handle on top of the descriptor `fd`.
+// Returns NULL, leaving `fd` open, when memory cannot be allocated or when
+// fdopen() rejects the descriptor (for example fd < 0).
+static
+BGZF*
+open_read(int fd)
+{
+    BGZF* fp = malloc(sizeof(BGZF));
+    if (fp == NULL) {
+        return NULL;
+    }
+    fp->uncompressed_block = malloc(MAX_BLOCK_SIZE);
+    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+    // wrap the descriptor last so no FILE is left behind on an allocation failure
+    fp->file = NULL;
+    if (fp->uncompressed_block != NULL && fp->compressed_block != NULL) {
+        fp->file = fdopen(fd, "r");
+    }
+    if (fp->file == NULL) {
+        free(fp->uncompressed_block);
+        free(fp->compressed_block);
+        free(fp);
+        return NULL;
+    }
+    fp->file_descriptor = fd;
+    fp->open_mode = 'r';
+    fp->owned_file = 0;
+    fp->uncompressed_block_size = MAX_BLOCK_SIZE;
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+// Build a write handle on top of the descriptor `fd`.
+// The uncompressed buffer is allocated lazily by bgzf_write().
+// Returns NULL, leaving `fd` open, when memory cannot be allocated or when
+// fdopen() rejects the descriptor (for example fd < 0).
+static
+BGZF*
+open_write(int fd)
+{
+    BGZF* fp = malloc(sizeof(BGZF));
+    if (fp == NULL) {
+        return NULL;
+    }
+    fp->compressed_block = malloc(MAX_BLOCK_SIZE);
+    // wrap the descriptor last so no FILE is left behind on an allocation failure
+    fp->file = (fp->compressed_block != NULL) ? fdopen(fd, "w") : NULL;
+    if (fp->file == NULL) {
+        free(fp->compressed_block);
+        free(fp);
+        return NULL;
+    }
+    fp->file_descriptor = fd;
+    fp->open_mode = 'w';
+    fp->owned_file = 0;
+    fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE;
+    fp->uncompressed_block = NULL;
+    fp->compressed_block_size = MAX_BLOCK_SIZE;
+    fp->block_address = 0;
+    fp->block_offset = 0;
+    fp->block_length = 0;
+    fp->error = NULL;
+    return fp;
+}
+
+// Open `path` for reading ("r") or writing ("w", created/truncated, mode 0644).
+// The returned handle owns the file: bgzf_close() closes it.
+// Returns NULL when the mode is unknown, the file cannot be opened, or the
+// handle cannot be created.
+BGZF*
+bgzf_open(const char* __restrict path, const char* __restrict mode)
+{
+    BGZF* fp = NULL;
+    int fd = -1;
+    if (strcasecmp(mode, "r") == 0) {
+        fd = open(path, O_RDONLY);
+        if (fd < 0) {
+            return NULL;
+        }
+        fp = open_read(fd);
+    } else if (strcasecmp(mode, "w") == 0) {
+        fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+        if (fd < 0) {
+            return NULL;
+        }
+        fp = open_write(fd);
+    } else {
+        return NULL;
+    }
+    if (fp == NULL) {
+        close(fd); // do not leak the descriptor we opened ourselves
+        return NULL;
+    }
+    fp->owned_file = 1;
+    return fp;
+}
+
+// Wrap an already-open descriptor; `mode` is "r" or "w" (case-insensitive).
+// Any other mode yields NULL.
+BGZF*
+bgzf_fdopen(int fd, const char * __restrict mode)
+{
+    BGZF* handle = NULL;
+    if (strcasecmp(mode, "r") == 0) {
+        handle = open_read(fd);
+    } else if (strcasecmp(mode, "w") == 0) {
+        handle = open_write(fd);
+    }
+    return handle;
+}
+
+// Compress the first block_length bytes of fp->uncompressed_block into
+// fp->compressed_block as one gzip member with the BGZF extra field.
+// If the input does not fit, it is shortened in 1024-byte steps and the
+// uncompressed tail is moved to the front of the buffer; fp->block_offset is
+// set to the number of bytes left over.
+// Returns the total length of the compressed block, or -1 with fp->error set.
+static
+int
+deflate_block(BGZF* fp, int block_length)
+{
+    // Deflate the block in fp->uncompressed_block into fp->compressed_block.
+    // Also adds an extra field that stores the compressed block length.
+
+    byte* buffer = fp->compressed_block;
+    int buffer_size = fp->compressed_block_size;
+
+    // Init gzip header
+    buffer[0] = GZIP_ID1;
+    buffer[1] = GZIP_ID2;
+    buffer[2] = CM_DEFLATE;
+    buffer[3] = FLG_FEXTRA;
+    buffer[4] = 0; // mtime
+    buffer[5] = 0;
+    buffer[6] = 0;
+    buffer[7] = 0;
+    buffer[8] = 0;
+    buffer[9] = OS_UNKNOWN;
+    buffer[10] = BGZF_XLEN;
+    buffer[11] = 0;
+    buffer[12] = BGZF_ID1;
+    buffer[13] = BGZF_ID2;
+    buffer[14] = BGZF_LEN;
+    buffer[15] = 0;
+    buffer[16] = 0; // placeholder for block length
+    buffer[17] = 0;
+
+    // loop to retry for blocks that do not compress enough
+    int input_length = block_length;
+    int compressed_length = 0;
+    while (1) {
+
+        // NOTE(review): zs.opaque is left unset; the zlib manual asks for zalloc,
+        // zfree and opaque to be initialised before deflateInit2() -- confirm.
+        z_stream zs;
+        zs.zalloc = NULL;
+        zs.zfree = NULL;
+        zs.next_in = fp->uncompressed_block;
+        zs.avail_in = input_length;
+        zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH];
+        // leave room for the footer written after the loop
+        zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
+
+        int status = deflateInit2(&zs, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+                                  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+        if (status != Z_OK) {
+            report_error(fp, "deflate init failed");
+            return -1;
+        }
+        status = deflate(&zs, Z_FINISH);
+        if (status != Z_STREAM_END) {
+            deflateEnd(&zs);
+            if (status == Z_OK) {
+                // Not enough space in buffer.
+                // Can happen in the rare case the input doesn't compress enough.
+                // Reduce the amount of input until it fits.
+                input_length -= 1024;
+                if (input_length <= 0) {
+                    // should never happen
+                    report_error(fp, "input reduction failed");
+                    return -1;
+                }
+                continue;
+            }
+            report_error(fp, "deflate failed");
+            return -1;
+        }
+        status = deflateEnd(&zs);
+        if (status != Z_OK) {
+            report_error(fp, "deflate end failed");
+            return -1;
+        }
+        compressed_length = zs.total_out;
+        compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH;
+        if (compressed_length > MAX_BLOCK_SIZE) {
+            // should never happen
+            report_error(fp, "deflate overflow");
+            return -1;
+        }
+        break;
+    }
+
+    // the length field holds the block length minus one; read_block() adds it back
+    packInt16((uint8_t*)&buffer[16], compressed_length-1);
+    // footer: CRC32 of the consumed input, then its length
+    uint32_t crc = crc32(0L, NULL, 0L);
+    crc = crc32(crc, fp->uncompressed_block, input_length);
+    packInt32((uint8_t*)&buffer[compressed_length-8], crc);
+    packInt32((uint8_t*)&buffer[compressed_length-4], input_length);
+
+    // move whatever was not compressed to the front for the next call
+    int remaining = block_length - input_length;
+    if (remaining > 0) {
+        if (remaining > input_length) {
+            // should never happen (check so we can use memcpy)
+            report_error(fp, "remainder too large");
+            return -1;
+        }
+        memcpy(fp->uncompressed_block,
+               fp->uncompressed_block + input_length,
+               remaining);
+    }
+    fp->block_offset = remaining;
+    return compressed_length;
+}
+
+// Inflate the block held in fp->compressed_block (block_length bytes including
+// header and footer) into fp->uncompressed_block.
+// Returns the number of uncompressed bytes, or -1 with fp->error set.
+static
+int
+inflate_block(BGZF* fp, int block_length)
+{
+    // Inflate the block in fp->compressed_block into fp->uncompressed_block
+
+    z_stream zs;
+    zs.zalloc = NULL;
+    zs.zfree = NULL;
+    zs.opaque = NULL; // zlib wants all three allocator fields set before init
+    // The deflate data starts right after the header. Offer only what is left
+    // of this block so inflate can never look past its end; it stops at the
+    // end of the stream, before the footer.
+    zs.next_in = (Bytef*)fp->compressed_block + BLOCK_HEADER_LENGTH;
+    zs.avail_in = block_length - BLOCK_HEADER_LENGTH;
+    zs.next_out = fp->uncompressed_block;
+    zs.avail_out = fp->uncompressed_block_size;
+
+    int status = inflateInit2(&zs, GZIP_WINDOW_BITS);
+    if (status != Z_OK) {
+        report_error(fp, "inflate init failed");
+        return -1;
+    }
+    status = inflate(&zs, Z_FINISH);
+    if (status != Z_STREAM_END) {
+        inflateEnd(&zs);
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    status = inflateEnd(&zs);
+    if (status != Z_OK) {
+        report_error(fp, "inflate failed");
+        return -1;
+    }
+    return zs.total_out;
+}
+
+// Return 1 when the 18-byte block header carries the gzip magic, the deflate
+// method, the FEXTRA flag and the expected extra sub-field; 0 otherwise.
+static
+int
+check_header(const byte* header)
+{
+    if (header[0] != GZIP_ID1) return 0;
+    if (header[1] != (byte) GZIP_ID2) return 0;
+    if (header[2] != Z_DEFLATED) return 0;
+    if ((header[3] & FLG_FEXTRA) == 0) return 0;
+    if (unpackInt16((uint8_t*)&header[10]) != BGZF_XLEN) return 0;
+    if (header[12] != BGZF_ID1) return 0;
+    if (header[13] != BGZF_ID2) return 0;
+    return unpackInt16((uint8_t*)&header[14]) == BGZF_LEN;
+}
+
+// Read and inflate the next block from fp->file.
+// At end of file sets fp->block_length to 0 and returns 0.
+// Returns 0 on success, -1 with fp->error set on a short read, a bad header,
+// an impossible block length or an inflate failure.
+static
+int
+read_block(BGZF* fp)
+{
+    byte header[BLOCK_HEADER_LENGTH];
+    int64_t block_address = ftello(fp->file);
+    int count = fread(header, 1, sizeof(header), fp->file);
+    if (count == 0) {
+        fp->block_length = 0;
+        return 0;
+    }
+    if (count != sizeof(header)) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    if (!check_header(header)) {
+        report_error(fp, "invalid block header");
+        return -1;
+    }
+    // the stored value is the block length minus one
+    int block_length = unpackInt16((uint8_t*)&header[16]) + 1;
+    if (block_length < BLOCK_HEADER_LENGTH ||
+        block_length > fp->compressed_block_size) {
+        // a shorter length would make `remaining` negative and fread() would
+        // then be asked for a huge byte count
+        report_error(fp, "invalid block length");
+        return -1;
+    }
+    byte* compressed_block = (byte*) fp->compressed_block;
+    memcpy(compressed_block, header, BLOCK_HEADER_LENGTH);
+    int remaining = block_length - BLOCK_HEADER_LENGTH;
+    count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file);
+    if (count != remaining) {
+        report_error(fp, "read failed");
+        return -1;
+    }
+    count = inflate_block(fp, block_length);
+    if (count < 0) {
+        return -1;
+    }
+    if (fp->block_length != 0) {
+        // Do not reset offset if this read follows a seek.
+        fp->block_offset = 0;
+    }
+    fp->block_address = block_address;
+    fp->block_length = count;
+    return 0;
+}
+
+// Copy up to `length` uncompressed bytes into `data`, loading further blocks
+// as needed. Returns the number of bytes copied (less than `length` only at
+// end of file), 0 for a non-positive length, or -1 on error.
+int
+bgzf_read(BGZF* fp, void* data, int length)
+{
+    byte* dst = data;
+    int done = 0;
+
+    if (length <= 0) {
+        return 0;
+    }
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for reading");
+        return -1;
+    }
+
+    while (done < length) {
+        int avail = fp->block_length - fp->block_offset;
+        if (avail <= 0) {
+            // current block exhausted (or none loaded): fetch the next one
+            if (read_block(fp) != 0) {
+                return -1;
+            }
+            avail = fp->block_length - fp->block_offset;
+            if (avail <= 0) {
+                break;  // end of file
+            }
+        }
+        const int wanted = length - done;
+        const int chunk = (wanted < avail) ? wanted : avail;
+        const byte* src = (const byte*) fp->uncompressed_block + fp->block_offset;
+        memcpy(dst + done, src, chunk);
+        fp->block_offset += chunk;
+        done += chunk;
+    }
+    if (fp->block_offset == fp->block_length) {
+        // block fully consumed: point the handle at the start of the next block
+        fp->block_address = ftello(fp->file);
+        fp->block_offset = 0;
+        fp->block_length = 0;
+    }
+    return done;
+}
+
+// Compress and write out everything buffered in fp->uncompressed_block.
+// Loops because deflate_block() may leave a remainder behind.
+// Returns 0 on success, -1 on error.
+static
+int
+flush_block(BGZF* fp)
+{
+    while (fp->block_offset > 0) {
+        const int deflated = deflate_block(fp, fp->block_offset);
+        if (deflated < 0) {
+            return -1;
+        }
+        const int written = fwrite(fp->compressed_block, 1, deflated, fp->file);
+        if (written != deflated) {
+            report_error(fp, "write failed");
+            return -1;
+        }
+        fp->block_address += deflated;
+    }
+    return 0;
+}
+
+// Append `length` bytes from `data` to the stream, compressing and writing a
+// block each time the uncompressed buffer fills up.
+// Returns the number of bytes accepted, or -1 with fp->error set when the
+// handle is not writable, memory runs out, or a block cannot be flushed.
+int
+bgzf_write(BGZF* fp, const void* data, int length)
+{
+    if (fp->open_mode != 'w') {
+        report_error(fp, "file not open for writing");
+        return -1;
+    }
+
+    // the uncompressed buffer is allocated on first use
+    if (fp->uncompressed_block == NULL) {
+        fp->uncompressed_block = malloc(fp->uncompressed_block_size);
+        if (fp->uncompressed_block == NULL) {
+            report_error(fp, "out of memory");
+            return -1;
+        }
+    }
+
+    const byte* input = data;
+    int block_length = fp->uncompressed_block_size;
+    int bytes_written = 0;
+    while (bytes_written < length) {
+        int copy_length = min(block_length - fp->block_offset, length - bytes_written);
+        byte* buffer = fp->uncompressed_block;
+        memcpy(buffer + fp->block_offset, input, copy_length);
+        fp->block_offset += copy_length;
+        input += copy_length;
+        bytes_written += copy_length;
+        if (fp->block_offset == block_length) {
+            if (flush_block(fp) != 0) {
+                // report the failure instead of a misleading byte count
+                return -1;
+            }
+        }
+    }
+    return bytes_written;
+}
+
+// Flush pending output (write mode), close the stream if the handle owns it,
+// and free the handle. On failure returns -1 and leaves the handle allocated.
+int
+bgzf_close(BGZF* fp)
+{
+    const int writing = (fp->open_mode == 'w');
+
+    if (writing && flush_block(fp) != 0) {
+        return -1;
+    }
+    if (writing && fflush(fp->file) != 0) {
+        report_error(fp, "flush failed");
+        return -1;
+    }
+    if (fp->owned_file && fclose(fp->file) != 0) {
+        return -1;
+    }
+    free(fp->uncompressed_block);
+    free(fp->compressed_block);
+    free(fp);
+    return 0;
+}
+
+// Virtual file pointer: block address in the upper bits, offset inside the
+// uncompressed block in the low 16 bits.
+int64_t
+bgzf_tell(BGZF* fp)
+{
+    const int64_t upper = fp->block_address << 16;
+    const int64_t lower = fp->block_offset & 0xFFFF;
+    return upper | lower;
+}
+
+// Position a read handle at the virtual file pointer `pos` (see bgzf_tell()).
+// Only SEEK_SET is supported. The target block is loaded by the next read.
+// Returns 0 on success, -1 with fp->error set otherwise.
+int64_t
+bgzf_seek(BGZF* fp, int64_t pos, int where)
+{
+    if (fp->open_mode != 'r') {
+        report_error(fp, "file not open for read");
+        return -1;
+    }
+    if (where != SEEK_SET) {
+        report_error(fp, "unimplemented seek option");
+        return -1;
+    }
+
+    // split the virtual pointer into its two halves
+    const int64_t address = (pos >> 16) & 0xFFFFFFFFFFFFLL;
+    const int offset = pos & 0xFFFF;
+
+    if (fseeko(fp->file, address, SEEK_SET) != 0) {
+        report_error(fp, "seek failed");
+        return -1;
+    }
+    fp->block_address = address;
+    fp->block_offset = offset;
+    fp->block_length = 0;  // indicates current block is not loaded
+    return 0;
+}
+
diff --git a/bgzf.h b/bgzf.h
new file mode 100644 (file)
index 0000000..4ed5c29
--- /dev/null
+++ b/bgzf.h
@@ -0,0 +1,102 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+
+/* Include guard: the tested and the defined macro must be the same name. */
+#ifndef __BGZF_H
+#define __BGZF_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include "zlib.h"
+#include <stdbool.h>
+//#include "zutil.h"
+
+//typedef int8_t bool;
+
+/* State of one open BGZF stream. */
+typedef struct {
+    int file_descriptor;          // descriptor the handle was created from
+    char open_mode;  // 'r' or 'w'
+    bool owned_file;              // set by bgzf_open(); bgzf_close() closes `file` only when set
+    FILE* file;                   // stdio stream wrapped around file_descriptor
+    int uncompressed_block_size;  // capacity of uncompressed_block in bytes
+    int compressed_block_size;    // capacity of compressed_block in bytes
+    void* uncompressed_block;     // in write mode allocated on the first bgzf_write()
+    void* compressed_block;
+    int64_t block_address;        // file offset of the current block
+    int block_length;             // uncompressed length of the loaded block; 0 = none loaded
+    int block_offset;             // read mode: position in the block; write mode: bytes buffered
+    const char* error;            // last error message, or NULL
+} BGZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Open an existing file descriptor for reading or writing.
+ * Mode must be either "r" or "w".
+ * A subsequent bgzf_close will not close the file descriptor.
+ * Returns null on error.
+ */
+BGZF* bgzf_fdopen(int fd, const char* __restrict mode);
+
+/*
+ * Open the specified file for reading or writing.
+ * Mode must be either "r" or "w".
+ * Returns null on error.
+ */
+BGZF* bgzf_open(const char* path, const char* __restrict mode);
+
+/*
+ * Close the BGZ file and free all associated resources.
+ * Does not close the underlying file descriptor if created with bgzf_fdopen.
+ * Returns zero on success, -1 on error.
+ */
+int bgzf_close(BGZF* fp);
+
+/*
+ * Read up to length bytes from the file storing into data.
+ * Returns the number of bytes actually read.
+ * Returns zero on end of file.
+ * Returns -1 on error.
+ */
+int bgzf_read(BGZF* fp, void* data, int length);
+
+/*
+ * Write length bytes from data to the file.
+ * Returns the number of bytes written.
+ * Returns -1 on error.
+ */
+int bgzf_write(BGZF* fp, const void* data, int length);
+
+/*
+ * Return a virtual file pointer to the current location in the file.
+ * No interpretation of the value should be made, other than a subsequent
+ * call to bgzf_seek can be used to position the file at the same point.
+ * Return value is non-negative on success.
+ * Returns -1 on error.
+ */
+int64_t bgzf_tell(BGZF* fp);
+
+/*
+ * Set the file to read from the location specified by pos, which must
+ * be a value previously returned by bgzf_tell for this file (but not
+ * necessarily one returned by this file handle).
+ * The where argument must be SEEK_SET.
+ * Seeking on a file opened for write is not supported.
+ * Returns zero on success, -1 on error.
+ */
+int64_t bgzf_seek(BGZF* fp, int64_t pos, int where);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/bgzip.c b/bgzip.c
new file mode 100644 (file)
index 0000000..c58d55d
--- /dev/null
+++ b/bgzip.c
@@ -0,0 +1,166 @@
+/*
+ * The Broad Institute
+ * SOFTWARE COPYRIGHT NOTICE AGREEMENT
+ * This software and its documentation are copyright 2008 by the
+ * Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
+ *
+ * This software is supplied without any warranty or guaranteed support whatsoever.
+ * Neither the Broad Institute nor MIT can be responsible for its use, misuse,
+ * or functionality.
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include "bgzf.h"
+
+static const int WINDOW_SIZE = 64 * 1024;
+
+/* Print the bgzip option summary on stdout; always returns 0. */
+static int bgzip_main_usage(void)
+{
+       fputs("\n"
+             "Usage:   bgzip [options] [file] ...\n\n"
+             "Options: -c      write on standard output, keep original files unchanged\n"
+             "         -d      decompress\n"
+             /* "         -l      list compressed file contents\n" */
+             "         -b INT  decompress at virtual file pointer INT\n"
+             "         -s INT  decompress INT bytes in the uncompressed file\n"
+             "         -h      give this help\n"
+             "\n", stdout);
+       return 0;
+}
+
+/* Open `fn` for writing and return its descriptor.
+ * Unless `is_forced` is set, an existing file is only overwritten after the
+ * user answers 'y' on stdin. Exits with status 1 when the user declines, when
+ * no answer can be read, or when the file cannot be opened. */
+static int write_open(const char *fn, int is_forced)
+{
+       int fd = -1;
+       char c = 'n'; /* treated as "no" if nothing can be read */
+       if (!is_forced) {
+               if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644)) < 0 && errno == EEXIST) {
+                       printf("bgzip: %s already exists; do you wish to overwrite (y or n)? ", fn);
+                       fflush(stdout); /* the prompt has no newline; show it before blocking on stdin */
+                       if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) {
+                               printf("bgzip: not overwritten\n");
+                               exit(1);
+                       }
+               }
+       }
+       /* forced, confirmed, or the exclusive open failed for another reason */
+       if (fd < 0) {
+               if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) {
+                       fprintf(stderr, "bgzip: %s: Fail to write\n", fn);
+                       exit(1);
+               }
+       }
+       return fd;
+}
+
+/* Print the handle's error message on stdout and terminate with status 1. */
+static
+void
+fail(BGZF* fp)
+{
+    const char *reason = fp->error;
+    printf("Error: %s\n", reason);
+    exit(1);
+}
+
+/* bgzip: compress a file (or stdin with -c) into BGZF, or decompress with -d.
+ * -b/-s select the starting virtual offset and the number of bytes to
+ * decompress, -f overwrites without asking, -c writes to stdout and keeps the
+ * input file. Returns 0 on success, 1 on error. */
+int main(int argc, char **argv)
+{
+       int c, compress, pstdout, is_forced;
+       BGZF *rz;
+       void *buffer;
+       long start, end, size;
+
+       compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+       while((c  = getopt(argc, argv, "cdlhfb:s:")) >= 0){
+               switch(c){
+               case 'h': return bgzip_main_usage();
+               case 'd': compress = 0; break;
+               case 'c': pstdout = 1; break;
+                // case 'l': compress = 2; break;
+               case 'b': start = atol(optarg); break;
+               case 's': size = atol(optarg); break;
+               case 'f': is_forced = 1; break;
+               }
+       }
+       if (size >= 0) end = start + size;
+       if(end >= 0 && end < start){
+               fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
+               return 1;
+       }
+       if(compress == 1){
+               int f_src, f_dst = -1;
+               int read_failed;
+               if(argc > optind){
+                       if((f_src = open(argv[optind], O_RDONLY)) < 0){
+                               fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                               return 1;
+                       }
+                       if(pstdout){
+                               f_dst = fileno(stdout);
+                       } else {
+                               // room for the name, ".gz" and the terminator
+                               char *name = malloc(strlen(argv[optind]) + 5);
+                               if (name == NULL) {
+                                       fprintf(stderr, "bgzip: out of memory\n");
+                                       return 1;
+                               }
+                               strcpy(name, argv[optind]);
+                               strcat(name, ".gz");
+                               f_dst = write_open(name, is_forced);
+                               free(name);
+                               if (f_dst < 0) return 1;
+                       }
+               } else if(pstdout){
+                       f_src = fileno(stdin);
+                       f_dst = fileno(stdout);
+               } else return bgzip_main_usage();
+               rz = bgzf_fdopen(f_dst, "w");
+               buffer = malloc(WINDOW_SIZE);
+               if (rz == NULL || buffer == NULL) {
+                       fprintf(stderr, "bgzip: out of memory\n");
+                       return 1;
+               }
+               while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) {
+                       if (bgzf_write(rz, buffer, c) < 0) {
+                               fail(rz);
+                       }
+               }
+               read_failed = (c < 0);
+               // flushes the remaining data to f_dst
+               if (bgzf_close(rz) < 0) {
+                       fail(rz);
+               }
+               free(buffer);
+               close(f_src);
+               if (read_failed) {
+                       // keep the input: the output is incomplete
+                       fprintf(stderr, "bgzip: read error\n");
+                       return 1;
+               }
+               // -c promises to keep the original file
+               if (argc > optind && !pstdout) unlink(argv[optind]);
+               return 0;
+       } else {
+               int f_dst;
+               if(argc <= optind) return bgzip_main_usage();
+               if (argc > optind && !pstdout) {
+                       char *name;
+                       size_t len = strlen(argv[optind]);
+                       // the name must END in ".gz"
+                       if (len < 3 || strcmp(argv[optind] + len - 3, ".gz") != 0) {
+                               printf("bgzip: %s: unknown suffix -- ignored\n", argv[optind]);
+                               return 1;
+                       }
+                       name = strdup(argv[optind]);
+                       if (name == NULL) {
+                               fprintf(stderr, "bgzip: out of memory\n");
+                               return 1;
+                       }
+                       name[len - 3] = '\0';
+                       f_dst = write_open(name, is_forced);
+                       free(name);
+               } else f_dst = fileno(stdout);
+               rz = bgzf_open(argv[optind], "r");
+               if (rz == NULL) {
+                       printf("Could not open file: %s\n", argv[optind]);
+                       return 1;
+               }
+               buffer = malloc(WINDOW_SIZE);
+               if (buffer == NULL) {
+                       fprintf(stderr, "bgzip: out of memory\n");
+                       return 1;
+               }
+               if (bgzf_seek(rz, start, SEEK_SET) < 0) {
+                       fail(rz);
+               }
+               while(1){
+                       char *out;
+                       long left;
+                       if(end < 0) c = bgzf_read(rz, buffer, WINDOW_SIZE);
+                       else c = bgzf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+                       if(c == 0) break;
+                       if (c < 0) fail(rz);
+                       start += c;
+                       // write() may be partial; never delete the input after a failed write
+                       for (out = buffer, left = c; left > 0; ) {
+                               ssize_t w = write(f_dst, out, left);
+                               if (w < 0) {
+                                       fprintf(stderr, "bgzip: write error\n");
+                                       return 1;
+                               }
+                               out += w;
+                               left -= w;
+                       }
+                       if(end >= 0 && start >= end) break;
+               }
+               free(buffer);
+               if (bgzf_close(rz) < 0) {
+                       fail(rz);
+               }
+               if (!pstdout) unlink(argv[optind]);
+               return 0;
+       }
+}
+
diff --git a/examples/00README.txt b/examples/00README.txt
new file mode 100644 (file)
index 0000000..5dd123c
--- /dev/null
@@ -0,0 +1,28 @@
+NA18507_part.fa contains two sequences cut from the human genome
+build36. They were extracted with the command:
+
+  samtools faidx human_b36.fa 2:2043966-2045540 20:67967-69550
+
+Sequence names were changed manually for simplicity. ex1.fa.fai is the
+index for the sequence file, generated by:
+
+  samtools faidx ex1.fa
+
+This index file also works as the reference list file used by `import'
+and `pileup' commands of samtools. ex1.sam.gz contains MAQ alignments
+extracted with:
+
+  (samtools view NA18507_maq.bam 2:2044001-2045500;
+   samtools view NA18507_maq.bam 20:68001-69500)
+
+and processed with an awk command to make everything consistent as a
+standalone alignment.
+
+To try samtools, you may run the following commands:
+
+  samtools import ex1.fa.fai ex1.sam.gz ex1.bam
+  samtools index ex1.bam
+  samtools tview ex1.bam ex1.fa
+  samtools pileup -cf ex1.fa ex1.bam
+  samtools pileup -cf ex1.fa -t ex1.fa.fai ex1.sam.gz
+
diff --git a/examples/ex1.fa b/examples/ex1.fa
new file mode 100644 (file)
index 0000000..ef611b4
--- /dev/null
@@ -0,0 +1,56 @@
+>seq1
+CACTAGTGGCTCATTGTAAATGTGTGGTTTAACTCGTCCATGGCCCAGCATTAGGGAGCT
+GTGGACCCTGCAGCCTGGCTGTGGGGGCCGCAGTGGCTGAGGGGTGCAGAGCCGAGTCAC
+GGGGTTGCCAGCACAGGGGCTTAACCTCTGGTGACTGCCAGAGCTGCTGGCAAGCTAGAG
+TCCCATTTGGAGCCCCTCTAAGCCGTTCTATTTGTAATGAAAACTATATTTATGCTATTC
+AGTTCTAAATATAGAAATTGAAACAGCTGTGTTTAGTGCCTTTGTTCAACCCCCTTGCAA
+CAACCTTGAGAACCCCAGGGAATTTGTCAATGTCAGGGAAGGAGCATTTTGTCAGTTACC
+AAATGTGTTTATTACCAGAGGGATGGAGGGAAGAGGGACGCTGAAGAACTTTGATGCCCT
+CTTCTTCCAAAGATGAAACGCGTAACTGCGCTCTCATTCACTCCAGCTCCCTGTCACCCA
+ATGGACCTGTGATATCTGGATTCTGGGAAATTCTTCATCCTGGACCCTGAGAGATTCTGC
+AGCCCAGCTCCAGATTGCTTGTGGTCTGACAGGCTGCAACTGTGAGCCATCACAATGAAC
+AACAGGAAGAAAAGGTCTTTCAAAAGGTGATGTGTGTTCTCATCAACCTCATACACACAC
+ATGGTTTAGGGGTATAATACCTCTACATGGCTGATTATGAAAACAATGTTCCCCAGATAC
+CATCCCTGTCTTACTTCCAGCTCCCCAGAGGGAAAGCTTTCAACGCTTCTAGCCATTTCT
+TTTGGCATTTGCCTTCAGACCCTACACGAATGCGTCTCTACCACAGGGGGCTGCGCGGTT
+TCCCATCATGAAGCACTGAACTTCCACGTCTCATCTAGGGGAACAGGGAGGTGCACTAAT
+GCGCTCCACGCCCAAGCCCTTCTCACAGTTTCTGCCCCCAGCATGGTTGTACTGGGCAAT
+ACATGAGATTATTAGGAAATGCTTTACTGTCATAACTATGAAGAGACTATTGCCAGATGA
+ACCACACATTAATACTATGTTTCTTATCTGCACATTACTACCCTGCAATTAATATAATTG
+TGTCCATGTACACACGCTGTCCTATGTACTTATCATGACTCTATCCCAAATTCCCAATTA
+CGTCCTATCTTCTTCTTAGGGAAGAACAGCTTAGGTATCAATTTGGTGTTCTGTGTAAAG
+TCTCAGGGAGCCGTCCGTGTCCTCCCATCTGGCCTCGTCCACACTGGTTCTCTTGAAAGC
+TTGGGCTGTAATGATGCCCCTTGGCCATCACCCAGTCCCTGCCCCATCTCTTGTAATCTC
+TCTCCTTTTTGCTGCATCCCTGTCTTCCTCTGTCTTGATTTACTTGTTGTTGGTTTTCTG
+TTTCTTTGTTTGATTTGGTGGAAGACATAATCCCACGCTTCCTATGGAAAGGTTGTTGGG
+AGATTTTTAATGATTCCTCAATGTTAAAATGTCTATTTTTGTCTTGACACCCAACTAATA
+TTTGTCTGAGCAAAACAGTCTAGATGAGAGAGAACTTCCCTGGAGGTCTGATGGCGTTTC
+TCCCTCGTCTTCTTA
+>seq2
+TTCAAATGAACTTCTGTAATTGAAAAATTCATTTAAGAAATTACAAAATATAGTTGAAAG
+CTCTAACAATAGACTAAACCAAGCAGAAGAAAGAGGTTCAGAACTTGAAGACAAGTCTCT
+TATGAATTAACCCAGTCAGACAAAAATAAAGAAAAAAATTTTAAAAATGAACAGAGCTTT
+CAAGAAGTATGAGATTATGTAAAGTAACTGAACCTATGAGTCACAGGTATTCCTGAGGAA
+AAAGAAAAAGTGAGAAGTTTGGAAAAACTATTTGAGGAAGTAATTGGGGAAAACCTCTTT
+AGTCTTGCTAGAGATTTAGACATCTAAATGAAAGAGGCTCAAAGAATGCCAGGAAGATAC
+ATTGCAAGACAGACTTCATCAAGATATGTAGTCATCAGACTATCTAAAGTCAACATGAAG
+GAAAAAAATTCTAAAATCAGCAAGAGAAAAGCATACAGTCATCTATAAAGGAAATCCCAT
+CAGAATAACAATGGGCTTCTCAGCAGAAACCTTACAAGCCAGAAGAGATTGGATCTAATT
+TTTGGACTTCTTAAAGAAAAAAAAACCTGTCAAACACGAATGTTATGCCCTGCTAAACTA
+AGCATCATAAATGAAGGGGAAATAAAGTCAAGTCTTTCCTGACAAGCAAATGCTAAGATA
+ATTCATCATCACTAAACCAGTCCTATAAGAAATGCTCAAAAGAATTGTAAAAGTCAAAAT
+TAAAGTTCAATACTCACCATCATAAATACACACAAAAGTACAAAACTCACAGGTTTTATA
+AAACAATTGAGACTACAGAGCAACTAGGTAAAAAATTAACATTACAACAGGAACAAAACC
+TCATATATCAATATTAACTTTGAATAAAAAGGGATTAAATTCCCCCACTTAAGAGATATA
+GATTGGCAGAACAGATTTAAAAACATGAACTAACTATATGCTGTTTACAAGAAACTCATT
+AATAAAGACATGAGTTCAGGTAAAGGGGTGGAAAAAGATGTTCTACGCAAACAGAAACCA
+AATGAGAGAAGGAGTAGCTATACTTATATCAGATAAAGCACACTTTAAATCAACAACAGT
+AAAATAAAACAAAGGAGGTCATCATACAATGATAAAAAGATCAATTCAGCAAGAAGATAT
+AACCATCCTACTAAATACATATGCACCTAACACAAGACTACCCAGATTCATAAAACAAAT
+ACTACTAGACCTAAGAGGGATGAGAAATTACCTAATTGGTACAATGTACAATATTCTGAT
+GATGGTTACACTAAAAGCCCATACTTTACTGCTACTCAATATATCCATGTAACAAATCTG
+CGCTTGTACTTCTAAATCTATAAAAAAATTAAAATTTAACAAAAGTAAATAAAACACATA
+GCTAAAACTAAAAAAGCAAAAACAAAAACTATGCTAAGTATTGGTAAAGATGTGGGGAAA
+AAAGTAAACTCTCAAATATTGCTAGTGGGAGTATAAATTGTTTTCCACTTTGGAAAACAA
+TTTGGTAATTTCGTTTTTTTTTTTTTCTTTTCTCTTTTTTTTTTTTTTTTTTTTGCATGC
+CAGAAAAAAATATTTACAGTAACT
diff --git a/examples/ex1.fa.fai b/examples/ex1.fa.fai
new file mode 100644 (file)
index 0000000..bac151a
--- /dev/null
@@ -0,0 +1,2 @@
+seq1   1575    6       60      61
+seq2   1584    1614    60      61
diff --git a/examples/ex1.sam.gz b/examples/ex1.sam.gz
new file mode 100644 (file)
index 0000000..1a213d1
Binary files /dev/null and b/examples/ex1.sam.gz differ
diff --git a/faidx.c b/faidx.c
new file mode 100644 (file)
index 0000000..44e7f57
--- /dev/null
+++ b/faidx.c
@@ -0,0 +1,287 @@
+#include <ctype.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "faidx.h"
+#include "khash.h"
+
+/*
+ * Index record for one FASTA sequence; this is the value type of the
+ * name -> record hash table instantiated below under the khash name `s`.
+ */
+typedef struct {
+       /* len: number of graphic characters in the sequence (fai_build_core sums them per line);
+        * line_len: characters per line counting the line terminator;
+        * line_blen: graphic characters per line */
+       uint64_t len:32, line_len:16, line_blen:16;
+       /* file offset of the first byte following the header line */
+       uint64_t offset;
+} faidx1_t;
+KHASH_MAP_INIT_STR(s, faidx1_t)
+
+/*
+ * I/O layer selection. With HAVE_RAZF the razf_* functions come from razf.h;
+ * otherwise RAZF is an alias for FILE and every razf_* call used in this file
+ * is mapped onto the corresponding stdio function.
+ */
+#ifdef HAVE_RAZF
+#include "razf.h"
+#else
+/* declared by hand here instead of relying on <stdio.h> exposing them */
+extern off_t ftello(FILE *stream);
+extern int fseeko(FILE *stream, off_t offset, int whence);
+#define RAZF FILE
+/* note the argument order differs from fread(): (fp, buf, size), reading `size` single bytes */
+#define razf_read(fp, buf, size) fread(buf, 1, size, fp)
+#define razf_open(fn, mode) fopen(fn, mode)
+#define razf_close(fp) fclose(fp)
+#define razf_seek(fp, offset, whence) fseeko(fp, offset, whence)
+#define razf_tell(fp) ftello(fp)
+#endif
+
+/* In-memory FASTA index (the opaque faidx_t of faidx.h). */
+struct __faidx_t {
+       /* handle on the FASTA file; set by fai_load(), left NULL by fai_build_core()/fai_read() */
+       RAZF *rz;
+       /* n: number of entries used in `name`; m: allocated capacity of `name` */
+       int n, m;
+       /* sequence names in insertion order; each one is a strdup()'ed copy */
+       char **name;
+       /* maps a sequence name to its faidx1_t record */
+       khash_t(s) *hash;
+};
+
+/*
+ * Round x up, in place, to the next power of two by smearing the highest set
+ * bit into all lower bits of (x - 1) and adding one. x must be a modifiable
+ * lvalue; the shifts only cover 32 bits.
+ */
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+/*
+ * Append one sequence to the index.
+ *
+ * A private copy of `name` is stored in idx->name (in insertion order) and is
+ * also used as the hash key; the record built from len / line_len /
+ * line_blen / offset becomes the hash value for that key.
+ */
+static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int line_len, int line_blen, uint64_t offset)
+{
+       faidx1_t rec;
+       khint_t slot;
+       int absent;
+       char *key;
+
+       /* the name array starts at 16 slots and doubles whenever it is full */
+       if (idx->n == idx->m) {
+               if (idx->m) idx->m <<= 1;
+               else idx->m = 16;
+               idx->name = (char**)realloc(idx->name, sizeof(void*) * idx->m);
+       }
+
+       rec.len = len;
+       rec.line_len = line_len;
+       rec.line_blen = line_blen;
+       rec.offset = offset;
+
+       key = strdup(name);
+       idx->name[idx->n] = key;
+       slot = kh_put(s, idx->hash, key, &absent);
+       kh_value(idx->hash, slot) = rec;
+       idx->n++;
+}
+
+/*
+ * Scan a FASTA stream byte by byte and build an in-memory index for it.
+ *
+ * For every '>' header the name (text up to the first whitespace) is
+ * collected and the offset of the first byte after the header line is
+ * remembered. For the lines that follow, l1 counts all characters including
+ * the line terminator and l2 counts graphic characters only.
+ *
+ * `state` tracks line-length consistency within the current sequence:
+ *   1  the next line is the first one of the sequence and defines
+ *      line_len / line_blen
+ *   0  all lines so far match the first one
+ *   2  one line differed (tolerated as long as it is the last line)
+ *   3  another line followed the differing one; the function aborts the
+ *      process via exit(1) if that line has any graphic character or if yet
+ *      another line follows
+ * Lines holding 0x10000 or more graphic characters also abort the process.
+ *
+ * Returns a newly allocated index (rz field unset); the caller releases it
+ * with fai_destroy(). Input without any header yields an empty index.
+ */
+faidx_t *fai_build_core(RAZF *rz)
+{
+       char c, *name;
+       int l_name, m_name, ret;
+       int len, line_len, line_blen, state;
+       int l1, l2;
+       faidx_t *idx;
+       uint64_t offset;
+
+       idx = (faidx_t*)calloc(1, sizeof(faidx_t));
+       idx->hash = kh_init(s);
+       name = 0; l_name = m_name = 0;
+       len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
+       while (razf_read(rz, &c, 1)) {
+               if (c == '>') { // fasta header
+                       // flush the previous sequence; name is still NULL if only stray text preceded this header
+                       if (len >= 0 && name != 0)
+                               fai_insert_index(idx, name, len, line_len, line_blen, offset);
+                       l_name = 0;
+                       if (name == 0) { // guarantee a buffer so that an empty name can still be terminated
+                               m_name = 16;
+                               name = (char*)realloc(name, m_name);
+                       }
+                       // ctype functions need an unsigned char value; a plain char may be negative
+                       while ((ret = razf_read(rz, &c, 1)) != 0 && !isspace((unsigned char)c)) {
+                               if (m_name < l_name + 2) {
+                                       m_name = l_name + 2;
+                                       kroundup32(m_name);
+                                       name = (char*)realloc(name, m_name);
+                               }
+                               name[l_name++] = c;
+                       }
+                       name[l_name] = '\0';
+                       assert(ret);
+                       // skip the rest of the header line (the description after the name)
+                       if (c != '\n') while (razf_read(rz, &c, 1) && c != '\n');
+                       state = 1; len = 0;
+                       offset = razf_tell(rz);
+               } else {
+                       if (state == 3) {
+                               fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'. Abort!\n", name);
+                               exit(1);
+                       }
+                       if (state == 2) state = 3;
+                       l1 = l2 = 0;
+                       do {
+                               ++l1;
+                               if (isgraph((unsigned char)c)) ++l2;
+                       } while ((ret = razf_read(rz, &c, 1)) && c != '\n');
+                       if (state == 3 && l2) {
+                               fprintf(stderr, "[fai_build_core] different line length in sequence '%s'. Abort!\n", name);
+                               exit(1);
+                       }
+                       ++l1; len += l2; // the extra ++l1 accounts for the line terminator
+                       if (l2 >= 0x10000) {
+                               fprintf(stderr, "[fai_build_core] line length exceeds 65535 in sequence '%s'. Abort!\n", name);
+                               exit(1);
+                       }
+                       if (state == 1) line_len = l1, line_blen = l2, state = 0;
+                       else if (state == 0) {
+                               if (l1 != line_len || l2 != line_blen) state = 2;
+                       }
+               }
+       }
+       // name stays NULL when the input never contained a header: nothing to record then
+       if (name != 0)
+               fai_insert_index(idx, name, len, line_len, line_blen, offset);
+       free(name);
+       return idx;
+}
+
+/*
+ * Write the index to fp, one tab-separated line per sequence in insertion
+ * order: name, length, offset, graphic characters per line, characters per
+ * line.
+ */
+void fai_save(const faidx_t *fai, FILE *fp)
+{
+       int i = 0;
+       while (i < fai->n) {
+               char *seq_name = fai->name[i];
+               khint_t it = kh_get(s, fai->hash, seq_name);
+               faidx1_t rec = kh_value(fai->hash, it);
+               fprintf(fp, "%s\t%d\t%lld\t%d\t%d\n", seq_name, (int)rec.len, (long long)rec.offset, (int)rec.line_blen, (int)rec.line_len);
+               ++i;
+       }
+}
+
+/*
+ * Parse an index previously written by fai_save().
+ *
+ * Each line holds a name followed by four numbers: length, offset, graphic
+ * characters per line and characters per line. Lines that do not provide all
+ * four numbers are skipped (with a warning unless the line is blank) instead
+ * of being inserted with stale or uninitialised values.
+ *
+ * Returns a newly allocated index whose rz field is unset; the caller
+ * releases it with fai_destroy().
+ */
+faidx_t *fai_read(FILE *fp)
+{
+       faidx_t *fai;
+       char *buf, *p;
+       int len, line_len, line_blen;
+       long long offset;
+       fai = (faidx_t*)calloc(1, sizeof(faidx_t));
+       fai->hash = kh_init(s);
+       buf = (char*)calloc(0x10000, 1);
+       while (fgets(buf, 0x10000, fp)) {
+               // the name is the leading run of graphic characters
+               for (p = buf; *p && isgraph((unsigned char)*p); ++p);
+               // terminate the name, but never step past the end of the string
+               if (*p) *p++ = 0;
+               if (sscanf(p, "%d%lld%d%d", &len, &offset, &line_blen, &line_len) != 4) {
+                       if (buf[0]) fprintf(stderr, "[fai_read] skip malformed index line for '%s'.\n", buf);
+                       continue;
+               }
+               fai_insert_index(fai, buf, len, line_len, line_blen, offset);
+       }
+       free(buf);
+       return fai;
+}
+
+/*
+ * Release an index: every stored name, the name array, the hash table, the
+ * FASTA handle if one is open, and finally the struct itself.
+ */
+void fai_destroy(faidx_t *fai)
+{
+       int idx = 0;
+       while (idx < fai->n) {
+               free(fai->name[idx]);
+               ++idx;
+       }
+       free(fai->name);
+       kh_destroy(s, fai->hash);
+       if (fai->rz != 0) razf_close(fai->rz);
+       free(fai);
+}
+
+/*
+ * Index the FASTA file `fn` and write the result to "<fn>.fai".
+ * Failure to open either file trips an assert().
+ */
+void fai_build(const char *fn)
+{
+       faidx_t *index;
+       RAZF *in;
+       FILE *out;
+       char *fai_name;
+
+       /* room for fn, the 4-character suffix and the terminating NUL */
+       fai_name = (char*)calloc(strlen(fn) + 5, 1);
+       sprintf(fai_name, "%s.fai", fn);
+
+       in = razf_open(fn, "r");
+       assert(in);
+       index = fai_build_core(in);
+       razf_close(in);
+
+       out = fopen(fai_name, "w");
+       assert(out);
+       free(fai_name);
+       fai_save(index, out);
+       fclose(out);
+       fai_destroy(index);
+}
+
+/*
+ * Load the index "<fn>.fai" and open the FASTA file `fn` for fetching.
+ *
+ * If the index file cannot be opened it is built first with fai_build().
+ * Returns a newly allocated index to be released with fai_destroy(), or 0
+ * when the index still cannot be opened or the FASTA file cannot be opened;
+ * nothing is leaked on those failure paths.
+ */
+faidx_t *fai_load(const char *fn)
+{
+       char *str;
+       FILE *fp;
+       faidx_t *fai;
+       /* room for fn, the 4-character suffix and the terminating NUL */
+       str = (char*)calloc(strlen(fn) + 5, 1);
+       sprintf(str, "%s.fai", fn);
+       fp = fopen(str, "r");
+       if (fp == 0) {
+               fprintf(stderr, "[fai_load] build FASTA index.\n");
+               fai_build(fn);
+               fp = fopen(str, "r");
+       }
+       free(str); /* the file name is not needed past this point on any path */
+       if (fp == 0) return 0;
+       fai = fai_read(fp);
+       fclose(fp);
+       fai->rz = razf_open(fn, "r");
+       if (fai->rz == 0) {
+               /* fai_destroy() copes with a NULL rz, so it can clean up the half-built index */
+               fai_destroy(fai);
+               return 0;
+       }
+       return fai;
+}
+
+/*
+ * Fetch the sequence of a region.
+ *
+ * `str` is "name", "name:beg" or "name:beg-end"; commas and whitespace are
+ * removed before parsing. A positive beg is converted to a 0-based start,
+ * and beg / end are clamped to the sequence length.
+ *
+ * Returns a malloc()'ed NUL-terminated string holding the graphic characters
+ * read from the region, with *len set to its length; the caller frees it.
+ * Returns 0 with *len == 0 when the name is not in the index. A record whose
+ * per-line base count is zero yields an empty string rather than a division
+ * by zero.
+ */
+char *fai_fetch(const faidx_t *fai, const char *str, int *len)
+{
+       char *s, *p, c;
+       int i, l, k;
+       khiter_t iter;
+       faidx1_t val;
+       khash_t(s) *h;
+       int beg, end;
+
+       beg = end = -1;
+       h = fai->hash;
+       l = strlen(str);
+       p = s = (char*)malloc(l+1);
+       /* squeeze out "," */
+       for (i = k = 0; i != l; ++i)
+               if (str[i] != ',' && !isspace((unsigned char)str[i])) s[k++] = str[i];
+       s[k] = 0;
+       /* cut the string at the first ':' so that s holds only the name */
+       for (i = 0; i != k; ++i) if (s[i] == ':') break;
+       s[i] = 0;
+       iter = kh_get(s, h, s); /* get the ref_id */
+       if (iter == kh_end(h)) {
+               *len = 0;
+               free(s); return 0;
+       }
+       val = kh_value(h, iter);
+       if (i == k) { /* dump the whole sequence */
+               beg = 0; end = val.len;
+       } else {
+               for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+               beg = atoi(p);
+               if (i < k) {
+                       p = s + i + 1;
+                       end = atoi(p);
+               } else end = val.len;
+       }
+       if (beg > 0) --beg;
+       if (beg >= val.len) beg = val.len;
+       if (end >= val.len) end = val.len;
+       if (beg > end) beg = end;
+       free(s);
+
+       // now retrieve the sequence
+       l = 0;
+       s = (char*)malloc(end - beg + 2);
+       // line_blen can be 0 (blank first line of a sequence, or a damaged index); the
+       // offset arithmetic below would then divide by zero, so return an empty string
+       if (val.line_blen != 0) {
+               razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET);
+               // test the limit first so that no byte is read once the region is complete
+               while (l < end - beg && razf_read(fai->rz, &c, 1) == 1)
+                       if (isgraph((unsigned char)c)) s[l++] = c;
+       }
+       s[l] = '\0';
+       *len = l;
+       return s;
+}
+
+/*
+ * Command-line driver.
+ *
+ *   faidx <in.fasta>            build "<in.fasta>.fai"
+ *   faidx <in.fasta> <reg>...   print each region as FASTA, 60 characters per line
+ *
+ * Returns 1 after printing the usage message, 0 otherwise.
+ */
+int faidx_main(int argc, char *argv[])
+{
+       faidx_t *fai;
+       int a;
+
+       if (argc == 1) {
+               fprintf(stderr, "Usage: faidx <in.fasta> [<reg> [...]]\n");
+               return 1;
+       }
+       if (argc == 2) {
+               fai_build(argv[1]);
+               return 0;
+       }
+
+       fai = fai_load(argv[1]);
+       assert(fai);
+       for (a = 2; a != argc; ++a) {
+               int n_bases, pos;
+               char *seq;
+               printf(">%s\n", argv[a]);
+               seq = fai_fetch(fai, argv[a], &n_bases);
+               for (pos = 0; pos < n_bases; ++pos) {
+                       putchar(seq[pos]);
+                       /* break the line after every 60th character and after the last one */
+                       if (pos % 60 == 59 || pos == n_bases - 1) putchar('\n');
+               }
+               free(seq);
+       }
+       fai_destroy(fai);
+       return 0;
+}
+
+/* Standalone entry point, compiled only when FAIDX_MAIN is defined; it forwards to faidx_main(). */
+#ifdef FAIDX_MAIN
+int main(int argc, char *argv[]) { return faidx_main(argc, argv); }
+#endif
diff --git a/faidx.h b/faidx.h
new file mode 100644 (file)
index 0000000..98c60e4
--- /dev/null
+++ b/faidx.h
@@ -0,0 +1,81 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef FAIDX_H
+#define FAIDX_H
+
+/*!
+  @header
+
+  Index FASTA files and extract subsequence.
+
+  @copyright The Wellcome Trust Sanger Institute.
+ */
+
+struct __faidx_t;
+typedef struct __faidx_t faidx_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       /*!
+         @abstract   Build index for a FASTA or razip compressed FASTA file.
+         @param  fn  FASTA file name
+         @discussion File "fn.fai" will be generated.
+        */
+       void fai_build(const char *fn);
+
+       /*!
+         @abstract    Destroy a faidx_t struct.
+         @param  fai  Pointer to the struct to be destroyed
+        */
+       void fai_destroy(faidx_t *fai);
+
+       /*!
+         @abstract   Load index from "fn.fai".
+         @param  fn  File name of the FASTA file
+        */
+       faidx_t *fai_load(const char *fn);
+
+       /*!
+         @abstract    Fetch the sequence in a region.
+         @param  fai  Pointer to the faidx_t struct
+         @param  reg  Region in the format "chr2:20,000-30,000"
+         @param  len  Length of the region
+         @return      Pointer to the sequence; null on failure
+
+         @discussion The returned sequence is allocated by malloc family
+         and should be destroyed by end users by calling free() on it.
+        */
+       char *fai_fetch(const faidx_t *fai, const char *reg, int *len);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/glf.h b/glf.h
new file mode 100644 (file)
index 0000000..d9d23c6
--- /dev/null
+++ b/glf.h
@@ -0,0 +1,11 @@
+#ifndef GLF_H_
+#define GLF_H_
+
+/*
+ * One glf1_t record: a 4-bit reference base code, the maximum mapping
+ * quality, ten likelihood values, and the minimum likelihood packed together
+ * with the read depth in a single 32-bit bit-field pair.
+ */
+typedef struct {
+       unsigned char ref_base:4, dummy:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */
+       unsigned char max_mapQ; /** maximum mapping quality */
+       unsigned char lk[10];   /** log likelihood ratio, capped at 255 */
+       unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */
+} glf1_t;
+
+#endif
diff --git a/khash.h b/khash.h
new file mode 100644 (file)
index 0000000..1d583ef
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,486 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+       int ret, is_missing;
+       khiter_t k;
+       khash_t(32) *h = kh_init(32);
+       k = kh_put(32, h, 5, &ret);
+       if (!ret) kh_del(32, h, k);
+       kh_value(h, k) = 10;
+       k = kh_get(32, h, 10);
+       is_missing = (k == kh_end(h));
+       k = kh_get(32, h, 5);
+       kh_del(32, h, k);
+       for (k = kh_begin(h); k != kh_end(h); ++k)
+               if (kh_exist(h, k)) kh_value(h, k) = 1;
+       kh_destroy(32, h);
+       return 0;
+}
+*/
+
+/*
+  2008-09-19 (0.2.3):
+
+       * Corrected the example
+       * Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+       * Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+       * Added kh_clear()
+       * Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+       * Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+       * Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+       * Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+
+  @copyright Heng Li
+ */
+
+#define AC_VERSION_KHASH_H "0.2.2"
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+typedef uint32_t khint_t;
+typedef khint_t khiter_t;
+
+/*
+ * Table of allowed bucket counts in increasing order (the first entry is 0).
+ * kh_resize_*() walks it from the top and settles on the smallest entry
+ * that is strictly greater than the requested number of buckets.
+ */
+#define __ac_HASH_PRIME_SIZE 32
+static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
+{
+  0ul,          3ul,          11ul,         23ul,         53ul,
+  97ul,         193ul,        389ul,        769ul,        1543ul,
+  3079ul,       6151ul,       12289ul,      24593ul,      49157ul,
+  98317ul,      196613ul,     393241ul,     786433ul,     1572869ul,
+  3145739ul,    6291469ul,    12582917ul,   25165843ul,   50331653ul,
+  100663319ul,  201326611ul,  402653189ul,  805306457ul,  1610612741ul,
+  3221225473ul, 4294967291ul
+};
+
+/*
+ * Bucket states are packed two bits per bucket, 16 buckets per uint32_t:
+ * bucket i lives in word flag[i>>4] at bit position (i&0xf)<<1. Within the
+ * pair, value 2 is the "empty" bit and value 1 the "deleted" bit; a bucket
+ * with neither bit set holds a live entry. Filling the array with 0xaa bytes
+ * (as kh_clear/kh_resize do) therefore marks every bucket empty.
+ * The macro arguments are not parenthesized, so pass simple expressions only.
+ */
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+/* Load factor: a table's upper_bound is n_buckets multiplied by this value, rounded to nearest. */
+static const double __ac_HASH_UPPER = 0.77;
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+       typedef struct {                                                                                                        \
+               khint_t n_buckets, size, n_occupied, upper_bound;                               \
+               uint32_t *flags;                                                                                                \
+               khkey_t *keys;                                                                                                  \
+               khval_t *vals;                                                                                                  \
+       } kh_##name##_t;                                                                                                        \
+       static inline kh_##name##_t *kh_init_##name() {                                         \
+               return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t));                \
+       }                                                                                                                                       \
+       static inline void kh_destroy_##name(kh_##name##_t *h)                          \
+       {                                                                                                                                       \
+               if (h) {                                                                                                                \
+                       free(h->keys); free(h->flags);                                                          \
+                       free(h->vals);                                                                                          \
+                       free(h);                                                                                                        \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       static inline void kh_clear_##name(kh_##name##_t *h)                            \
+       {                                                                                                                                       \
+               if (h && h->flags) {                                                                                    \
+                       memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
+                       h->size = h->n_occupied = 0;                                                            \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       static inline khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \
+       {                                                                                                                                       \
+               if (h->n_buckets) {                                                                                             \
+                       khint_t inc, k, i, last;                                                                        \
+                       k = __hash_func(key); i = k % h->n_buckets;                                     \
+                       inc = 1 + k % (h->n_buckets - 1); last = i;                                     \
+                       while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+                               if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
+                               else i += inc;                                                                                  \
+                               if (i == last) return h->n_buckets;                                             \
+                       }                                                                                                                       \
+                       return __ac_iseither(h->flags, i)? h->n_buckets : i;            \
+               } else return 0;                                                                                                \
+       }                                                                                                                                       \
+       static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+       {                                                                                                                                       \
+               uint32_t *new_flags = 0;                                                                                \
+               khint_t j = 1;                                                                                                  \
+               {                                                                                                                               \
+                       khint_t t = __ac_HASH_PRIME_SIZE - 1;                                           \
+                       while (__ac_prime_list[t] > new_n_buckets) --t;                         \
+                       new_n_buckets = __ac_prime_list[t+1];                                           \
+                       if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
+                       else {                                                                                                          \
+                               new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t));     \
+                               memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
+                               if (h->n_buckets < new_n_buckets) {                                             \
+                                       h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+                                       if (kh_is_map)                                                                          \
+                                               h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (j) {                                                                                                                \
+                       for (j = 0; j != h->n_buckets; ++j) {                                           \
+                               if (__ac_iseither(h->flags, j) == 0) {                                  \
+                                       khkey_t key = h->keys[j];                                                       \
+                                       khval_t val;                                                                            \
+                                       if (kh_is_map) val = h->vals[j];                                        \
+                                       __ac_set_isdel_true(h->flags, j);                                       \
+                                       while (1) {                                                                                     \
+                                               khint_t inc, k, i;                                                              \
+                                               k = __hash_func(key);                                                   \
+                                               i = k % new_n_buckets;                                                  \
+                                               inc = 1 + k % (new_n_buckets - 1);                              \
+                                               while (!__ac_isempty(new_flags, i)) {                   \
+                                                       if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
+                                                       else i += inc;                                                          \
+                                               }                                                                                               \
+                                               __ac_set_isempty_false(new_flags, i);                   \
+                                               if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
+                                                       { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+                                                       if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+                                                       __ac_set_isdel_true(h->flags, i);                       \
+                                               } else {                                                                                \
+                                                       h->keys[i] = key;                                                       \
+                                                       if (kh_is_map) h->vals[i] = val;                        \
+                                                       break;                                                                          \
+                                               }                                                                                               \
+                                       }                                                                                                       \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+                       if (h->n_buckets > new_n_buckets) {                                                     \
+                               h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
+                               if (kh_is_map)                                                                                  \
+                                       h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
+                       }                                                                                                                       \
+                       free(h->flags);                                                                                         \
+                       h->flags = new_flags;                                                                           \
+                       h->n_buckets = new_n_buckets;                                                           \
+                       h->n_occupied = h->size;                                                                        \
+                       h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+       {                                                                                                                                       \
+               khint_t x;                                                                                                              \
+               if (h->n_occupied >= h->upper_bound) {                                                  \
+                       if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
+                       else kh_resize_##name(h, h->n_buckets + 1);                                     \
+               }                                                                                                                               \
+               {                                                                                                                               \
+                       khint_t inc, k, i, site, last;                                                          \
+                       x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
+                       if (__ac_isempty(h->flags, i)) x = i;                                           \
+                       else {                                                                                                          \
+                               inc = 1 + k % (h->n_buckets - 1); last = i;                             \
+                               while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+                                       if (__ac_isdel(h->flags, i)) site = i;                          \
+                                       if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
+                                       else i += inc;                                                                          \
+                                       if (i == last) { x = site; break; }                                     \
+                               }                                                                                                               \
+                               if (x == h->n_buckets) {                                                                \
+                                       if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+                                       else x = i;                                                                                     \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (__ac_isempty(h->flags, x)) {                                                                \
+                       h->keys[x] = key;                                                                                       \
+                       __ac_set_isboth_false(h->flags, x);                                                     \
+                       ++h->size; ++h->n_occupied;                                                                     \
+                       *ret = 1;                                                                                                       \
+               } else if (__ac_isdel(h->flags, x)) {                                                   \
+                       h->keys[x] = key;                                                                                       \
+                       __ac_set_isboth_false(h->flags, x);                                                     \
+                       ++h->size;                                                                                                      \
+                       *ret = 2;                                                                                                       \
+               } else *ret = 0;                                                                                                \
+               return x;                                                                                                               \
+       }                                                                                                                                       \
+       static inline void kh_del_##name(kh_##name##_t *h, khint_t x)           \
+       {                                                                                                                                       \
+               if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {                 \
+                       __ac_set_isdel_true(h->flags, x);                                                       \
+                       --h->size;                                                                                                      \
+               }                                                                                                                               \
+       }
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [uint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (uint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [uint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static inline khint_t __ac_X31_hash_string(const char *s)
+{
+       /* X31 hash: seed with the first character, then fold every following
+          character in as hash = hash * 31 + c, written as a shift and a
+          subtraction. */
+       khint_t hash = *s;
+       const char *p = s;
+       if (hash == 0) return hash; /* empty string hashes to 0 */
+       while (*++p)
+               hash = (hash << 5) - hash + *p;
+       return hash;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other necessary macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: 0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+                               the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)                                                                               \
+       KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)                                                              \
+       KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)                                                                             \
+       KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)                                                            \
+       KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash set containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)                                                                               \
+       KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)                                                              \
+       KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/kseq.h b/kseq.h
new file mode 100644 (file)
index 0000000..25f31a3
--- /dev/null
+++ b/kseq.h
@@ -0,0 +1,207 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+#ifndef AC_KSEQ_H
+#define AC_KSEQ_H
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define __KS_TYPE(type_t) /* declares kstream_t, a buffered reader over a handle of type type_t */ \
+       typedef struct __kstream_t {                            \
+               char *buf; /* read buffer, allocated by ks_init() */ \
+               int begin, end, is_eof; /* buf[begin..end) is the unread data; is_eof is set once a read comes back short */ \
+               type_t f; /* underlying handle, passed as first argument to the read function */ \
+       } kstream_t;
+
+#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) /* true once the source is exhausted AND the buffer is drained */
+#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) /* resets buffer state only; the underlying handle is not touched here */
+
+/* Constructor/destructor pair for kstream_t.
+   ks_init()    returns a zero-initialised stream wrapping f with a buffer of
+                __bufsize bytes, or NULL if either allocation fails.
+   ks_destroy() frees the buffer and the stream itself; it does not close f.
+                Passing NULL is allowed. */
+#define __KS_BASIC(type_t, __bufsize)                                                          \
+       static inline kstream_t *ks_init(type_t f)                                              \
+       {                                                                                                                               \
+               kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t));       \
+               if (ks == 0) return 0;                                                                          \
+               ks->f = f;                                                                                                      \
+               ks->buf = (char*)malloc(__bufsize);                                                     \
+               if (ks->buf == 0) { /* never hand out a stream without a buffer */ \
+                       free(ks);                                                                                               \
+                       return 0;                                                                                               \
+               }                                                                                                                       \
+               return ks;                                                                                                      \
+       }                                                                                                                               \
+       static inline void ks_destroy(kstream_t *ks)                                    \
+       {                                                                                                                               \
+               if (ks) {                                                                                                       \
+                       free(ks->buf);                                                                                  \
+                       free(ks);                                                                                               \
+               }                                                                                                                       \
+       }
+
+/* Defines ks_getc(): returns the next byte of the stream as a value in
+   0..255, or -1 when no more input is available. __read is called as
+   __read(handle, buffer, size) and must return the number of bytes stored;
+   a zero or negative result (end of input or read error) ends the stream. */
+#define __KS_GETC(__read, __bufsize)                                           \
+       static inline int ks_getc(kstream_t *ks)                                \
+       {                                                                                                               \
+               if (ks->is_eof && ks->begin >= ks->end) return -1;      \
+               if (ks->begin >= ks->end) { /* buffer drained: refill it */ \
+                       ks->begin = 0;                                                                  \
+                       ks->end = __read(ks->f, ks->buf, __bufsize);    \
+                       if (ks->end < __bufsize) ks->is_eof = 1;                \
+                       if (ks->end <= 0) { /* EOF or read error: nothing to return */ \
+                               ks->end = 0;                                                            \
+                               return -1;                                                                      \
+                       }                                                                                               \
+               }                                                                                                       \
+               /* convert via unsigned char so byte 0xFF is not mistaken for -1 */ \
+               return (int)(unsigned char)ks->buf[ks->begin++];                \
+       }
+
+typedef struct __kstring_t { /* growable, NUL-terminated character buffer */
+       size_t l, m; /* l: bytes currently stored (terminator excluded); m: bytes allocated for s */
+       char *s; /* heap buffer grown with realloc(); may be NULL before the first write */
+} kstring_t;
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) /* rounds x up, in place, to the next power of two; the bit smearing covers 32 bits; x must be an lvalue and is evaluated repeatedly */
+#endif
+
+/* Defines ks_getuntil(): read bytes from ks into str up to, but not
+   including, the next delimiter.
+     delimiter  the byte to stop at; 0 means "stop at any whitespace"
+     str        receives the token; always NUL-terminated on a non-negative
+                return; its previous content is discarded
+     dret       if not NULL, receives the delimiter byte that ended the token,
+                or 0 if input ran out first
+   Returns the token length, or -1 if the stream was already exhausted on
+   entry. The results of realloc()/calloc() are not checked. */
+#define __KS_GETUNTIL(__read, __bufsize)                                                               \
+       static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
+       {                                                                                                                                       \
+               if (dret) *dret = 0;                                                                                    \
+               str->l = 0;                                                                                                             \
+               if (ks->begin >= ks->end && ks->is_eof) return -1;                              \
+               for (;;) {                                                                                                              \
+                       int i;                                                                                                          \
+                       if (ks->begin >= ks->end) {                                                                     \
+                               if (!ks->is_eof) {                                                                              \
+                                       ks->begin = 0;                                                                          \
+                                       ks->end = __read(ks->f, ks->buf, __bufsize);            \
+                                       if (ks->end < __bufsize) ks->is_eof = 1;                        \
+                                       if (ks->end <= 0) { /* EOF or read error */                     \
+                                               ks->end = 0;                                                                    \
+                                               break;                                                                                  \
+                                       }                                                                                                       \
+                               } else break;                                                                                   \
+                       }                                                                                                                       \
+                       if (delimiter) {                                                                                        \
+                               for (i = ks->begin; i < ks->end; ++i)                                   \
+                                       if (ks->buf[i] == delimiter) break;                                     \
+                       } else {                                                                                                        \
+                               /* <ctype.h> functions need an unsigned char value */   \
+                               for (i = ks->begin; i < ks->end; ++i)                                   \
+                                       if (isspace((unsigned char)ks->buf[i])) break;          \
+                       }                                                                                                                       \
+                       /* make room for this chunk plus the terminating NUL */         \
+                       if (str->m - str->l < i - ks->begin + 1) {                                      \
+                               str->m = str->l + (i - ks->begin) + 1;                                  \
+                               kroundup32(str->m);                                                                             \
+                               str->s = (char*)realloc(str->s, str->m);                                \
+                       }                                                                                                                       \
+                       memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
+                       str->l = str->l + (i - ks->begin);                                                      \
+                       ks->begin = i + 1;                                                                                      \
+                       if (i < ks->end) { /* stopped on a delimiter, not on buffer end */ \
+                               if (dret) *dret = ks->buf[i];                                                   \
+                               break;                                                                                                  \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (str->s == 0) { /* nothing was ever stored: still return "" */ \
+                       str->m = 1;                                                                                                     \
+                       str->s = (char*)calloc(1, 1);                                                           \
+               }                                                                                                                               \
+               str->s[str->l] = '\0';                                                                                  \
+               return str->l;                                                                                                  \
+       }
+
+#define KSTREAM_INIT(type_t, __read, __bufsize) /* instantiates kstream_t plus ks_init, ks_destroy, ks_getc and ks_getuntil for one handle type; __read is called as __read(handle, buf, __bufsize) */ \
+       __KS_TYPE(type_t)                                                       \
+       __KS_BASIC(type_t, __bufsize)                           \
+       __KS_GETC(__read, __bufsize)                            \
+       __KS_GETUNTIL(__read, __bufsize)
+
+/* Lifecycle helpers for kseq_t.
+   kseq_init()    allocates a zeroed record reader over fd; returns NULL if
+                  an allocation fails.
+   kseq_rewind()  clears the parser and buffer state; the underlying handle
+                  is not repositioned here.
+   kseq_destroy() releases the four string buffers, the stream and the reader
+                  itself; passing NULL is allowed. */
+#define __KSEQ_BASIC(type_t)                                                                                   \
+       static inline kseq_t *kseq_init(type_t fd)                                                      \
+       {                                                                                                                                       \
+               kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t));                                 \
+               if (s == 0) return 0;                                                                                   \
+               s->f = ks_init(fd);                                                                                             \
+               if (s->f == 0) { /* no stream: do not return a half-built reader */ \
+                       free(s);                                                                                                        \
+                       return 0;                                                                                                       \
+               }                                                                                                                               \
+               return s;                                                                                                               \
+       }                                                                                                                                       \
+       static inline void kseq_rewind(kseq_t *ks)                                                      \
+       {                                                                                                                                       \
+               ks->last_char = 0;                                                                                              \
+               ks->f->is_eof = ks->f->begin = ks->f->end = 0;                                  \
+       }                                                                                                                                       \
+       static inline void kseq_destroy(kseq_t *ks)                                                     \
+       {                                                                                                                                       \
+               if (!ks) return;                                                                                                \
+               free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
+               ks_destroy(ks->f);                                                                                              \
+               free(ks);                                                                                                               \
+       }
+
+/* Defines kseq_read(): parse the next record ('>' or '@' header line,
+   sequence lines, and for '+' records a quality string) from seq->f into
+   seq->name, seq->comment, seq->seq and seq->qual. All four strings are
+   reset at the start of each call.
+   Return value:
+   >=0  length of the sequence (normal)
+   -1   end-of-file
+   -2   truncated quality string
+ */
+#define __KSEQ_READ                                                                                                            \
+       static int kseq_read(kseq_t *seq)                                                                       \
+       {                                                                                                                                       \
+               int c;                                                                                                                  \
+               kstream_t *ks = seq->f;                                                                                 \
+               if (seq->last_char == 0) { /* then jump to the next header line */ \
+                       while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@');        \
+                       if (c == -1) return -1; /* end of file */                                       \
+                       seq->last_char = c;                                                                                     \
+               } /* the first header char has been read */                                             \
+               seq->comment.l = seq->seq.l = seq->qual.l = 0;                                  \
+               if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1;                  \
+               if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0);                 \
+               if (seq->seq.s == 0) { /* first record: the terminator written below needs a buffer even if the sequence is empty */ \
+                       seq->seq.m = 256;                                                                                       \
+                       seq->seq.s = (char*)malloc(seq->seq.m);                                         \
+               }                                                                                                                               \
+               while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
+                       if (isgraph(c)) { /* printable non-space character */           \
+                               if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
+                                       seq->seq.m = seq->seq.l + 2;                                            \
+                                       kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
+                                       seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
+                               }                                                                                                               \
+                               seq->seq.s[seq->seq.l++] = (char)c;                                             \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+               if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
+               seq->seq.s[seq->seq.l] = 0;     /* null terminated string */            \
+               if (c != '+') return seq->seq.l; /* FASTA */                                    \
+               if (seq->qual.m < seq->seq.m) { /* allocate enough memory */    \
+                       seq->qual.m = seq->seq.m;                                                                       \
+                       seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m);         \
+               }                                                                                                                               \
+               while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
+               if (c == -1) return -2; /* we should not stop here */                   \
+               /* collect exactly seq.l quality characters, ignoring anything outside 33..127 (e.g. line breaks) */ \
+               while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l)             \
+                       if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
+               seq->qual.s[seq->qual.l] = 0; /* null terminated string */              \
+               seq->last_char = 0;     /* we have not come to the next header line */ \
+               if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
+               return seq->seq.l;                                                                                              \
+       }
+
+#define __KSEQ_TYPE(type_t) /* declares kseq_t, the state of one record reader */ \
+       typedef struct {                                                        \
+               kstring_t name, comment, seq, qual; /* fields of the most recently parsed record */ \
+               int last_char; /* header character ('>' or '@') already consumed for the next record, or 0 if none */ \
+               kstream_t *f; /* buffered input stream created by kseq_init() */ \
+       } kseq_t;
+
+#define KSEQ_INIT(type_t, __read) /* instantiates the whole reader (kstream_t, kseq_t and their functions) with a 4096-byte read buffer */ \
+       KSTREAM_INIT(type_t, __read, 4096)                      \
+       __KSEQ_TYPE(type_t)                                                     \
+       __KSEQ_BASIC(type_t)                                            \
+       __KSEQ_READ
+
+#endif
diff --git a/ksort.h b/ksort.h
new file mode 100644 (file)
index 0000000..16a03fd
--- /dev/null
+++ b/ksort.h
@@ -0,0 +1,271 @@
+/* The MIT License
+
+   Copyright (c) 2008 Genome Research Ltd (GRL).
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@sanger.ac.uk> */
+
+/*
+  2008-11-16 (0.1.4):
+
+    * Fixed a bug in introsort() that happens in rare cases.
+
+  2008-11-05 (0.1.3):
+
+    * Fixed a bug in introsort() for complex comparisons.
+
+       * Fixed a bug in mergesort(). The previous version is not stable.
+
+  2008-09-15 (0.1.2):
+
+       * Accelerated introsort. On my Mac (not on another Linux machine),
+         my implementation is as fast as std::sort on random input.
+
+       * Added combsort and in introsort, switch to combsort if the
+         recursion is too deep.
+
+  2008-09-13 (0.1.1):
+
+       * Added k-small algorithm
+
+  2008-09-05 (0.1.0):
+
+       * Initial version
+
+*/
+
+#ifndef AC_KSORT_H
+#define AC_KSORT_H
+
+#include <stdlib.h>
+#include <string.h>
+
+/* One pending sub-range on the explicit stack used by ks_introsort_<name>():
+ * `left` and `right` are the first and last element of the range (stored as
+ * void* and cast back to the element type), `depth` is the remaining depth
+ * budget saved for that range. */
+typedef struct {
+       void *left, *right;
+       int depth;
+} ks_isort_stack_t;
+
+/* Swap two lvalues `a` and `b` of type `type_t`.
+ * The temporary has a deliberately unusual name: with a temporary called `t`,
+ * an argument that itself mentions a variable `t` (e.g. KSORT_SWAP(int, t, u))
+ * would be captured by the macro's own local and the swap would silently fail.
+ * Both arguments are evaluated twice, so they must be free of side effects.
+ * The expansion stays a plain braced block so existing call sites keep working. */
+#define KSORT_SWAP(type_t, a, b) { type_t ksort_swap_tmp_=(a); (a)=(b); (b)=ksort_swap_tmp_; }
+
+#define KSORT_INIT(name, type_t, __sort_lt)                                                            \
+       /* Bottom-up merge sort of array[0..n) with __sort_lt as "less than". */ \
+       /* `temp` is scratch space for n elements; when it is NULL a buffer is */ \
+       /* malloc'ed here and freed before returning.  On ties the element of */ \
+       /* the left run is emitted first, so equal elements keep their order. */ \
+       /* NOTE(review): the malloc result is not checked before use. */ \
+       void ks_mergesort_##name(size_t n, type_t array[], type_t temp[])       \
+       {                                                                                                                                       \
+               type_t *a2[2], *a, *b;                                                                                  \
+               int curr, shift;                                                                                                \
+                                                                                                                                               \
+               a2[0] = array;                                                                                                  \
+               a2[1] = temp? temp : (type_t*)malloc(sizeof(type_t) * n);               \
+               /* each pass reads from a2[curr] and writes to the other buffer */ \
+               for (curr = 0, shift = 0; (1ul<<shift) < n; ++shift) {                  \
+                       a = a2[curr]; b = a2[1-curr];                                                           \
+                       if (shift == 0) {                                                                                       \
+                               /* first pass: order each adjacent pair; a trailing odd element is copied as is */ \
+                               type_t *p = b, *i, *eb = a + n;                                                 \
+                               for (i = a; i < eb; i += 2) {                                                   \
+                                       if (i == eb - 1) *p++ = *i;                                                     \
+                                       else {                                                                                          \
+                                               if (__sort_lt(*(i+1), *i)) {                                    \
+                                                       *p++ = *(i+1); *p++ = *i;                                       \
+                                               } else {                                                                                \
+                                                       *p++ = *i; *p++ = *(i+1);                                       \
+                                               }                                                                                               \
+                                       }                                                                                                       \
+                               }                                                                                                               \
+                       } else {                                                                                                        \
+                               /* later passes: merge neighbouring runs of `step` elements */ \
+                               size_t i, step = 1ul<<shift;                                                    \
+                               for (i = 0; i < n; i += step<<1) {                                              \
+                                       type_t *p, *j, *k, *ea, *eb;                                            \
+                                       if (n < i + step) {                                                                     \
+                                               /* only a partial left run is left; eb = a makes the right run empty */ \
+                                               ea = a + n; eb = a;                                                             \
+                                       } else {                                                                                        \
+                                               ea = a + i + step;                                                              \
+                                               eb = a + (n < i + (step<<1)? n : i + (step<<1)); \
+                                       }                                                                                                       \
+                                       j = a + i; k = a + i + step; p = b + i;                         \
+                                       while (j < ea && k < eb) {                                                      \
+                                               if (__sort_lt(*k, *j)) *p++ = *k++;                             \
+                                               else *p++ = *j++;                                                               \
+                                       }                                                                                                       \
+                                       while (j < ea) *p++ = *j++;                                                     \
+                                       while (k < eb) *p++ = *k++;                                                     \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+                       curr = 1 - curr;                                                                                        \
+               }                                                                                                                               \
+               /* the sorted data ended up in the scratch buffer: copy it back */ \
+               if (curr == 1) {                                                                                                \
+                       type_t *p = a2[0], *i = a2[1], *eb = array + n;                         \
+                       for (; p < eb; ++i) *p++ = *i;                                                          \
+               }                                                                                                                               \
+               if (temp == 0) free(a2[1]);                                                                             \
+       }                                                                                                                                       \
+       /* Sift l[i] down inside the heap l[0..n): the element moves below its */ \
+       /* larger child (as ordered by __sort_lt) until neither child is */ \
+       /* greater than it, so the largest element stays at the root. */ \
+       void ks_heapadjust_##name(size_t i, size_t n, type_t l[])                       \
+       {                                                                                                                                       \
+               type_t held = l[i];                                                                                             \
+               size_t child;                                                                                                   \
+               for (child = (i << 1) + 1; child < n; child = (child << 1) + 1) { \
+                       /* prefer the right child when it exists and is larger */ \
+                       if (child + 1 < n && __sort_lt(l[child], l[child+1])) child += 1; \
+                       if (__sort_lt(l[child], held)) break;                                           \
+                       l[i] = l[child];                                                                                        \
+                       i = child;                                                                                                      \
+               }                                                                                                                               \
+               l[i] = held;                                                                                                    \
+       }                                                                                                                                       \
+       /* Turn l[0..lsize) into a heap by sifting down every node that can */ \
+       /* have a child, from index lsize/2 - 1 back to the root. */ \
+       void ks_heapmake_##name(size_t lsize, type_t l[])                                       \
+       {                                                                                                                                       \
+               size_t node = lsize >> 1;                                                                               \
+               while (node-- > 0)                                                                                              \
+                       ks_heapadjust_##name(node, lsize, l);                                           \
+       }                                                                                                                                       \
+       /* Sort l[0..lsize) in ascending __sort_lt order.  The array must */ \
+       /* already be a heap (see ks_heapmake_<name>); this function only */ \
+       /* performs the extraction phase: swap the root with the last unsorted */ \
+       /* element, shrink the heap by one and restore it. */ \
+       void ks_heapsort_##name(size_t lsize, type_t l[])                                       \
+       {                                                                                                                                       \
+               size_t i;                                                                                                               \
+               /* nothing to do for 0 or 1 elements; without this guard */ \
+               /* lsize == 0 makes lsize - 1 wrap around to SIZE_MAX */ \
+               if (lsize < 2) return;                                                                                  \
+               for (i = lsize - 1; i > 0; --i) {                                                               \
+                       type_t tmp;                                                                                                     \
+                       tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       /* Insertion sort of the half-open range [s, t).  Internal helper of */ \
+       /* the comb sort and introsort below. */ \
+       /* Declared `static inline`: a plain `inline` definition provides no */ \
+       /* external definition in C99/C11, so calls that are not inlined */ \
+       /* (e.g. at -O0) would fail to link. */ \
+       static inline void __ks_insertsort_##name(type_t *s, type_t *t)         \
+       {                                                                                                                                       \
+               type_t *i, *j, swap_tmp;                                                                                \
+               for (i = s + 1; i < t; ++i)                                                                             \
+                       for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) {                      \
+                               swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp;                  \
+                       }                                                                                                                       \
+       }                                                                                                                                       \
+       /* Comb sort of a[0..n): repeated compare-and-swap passes over pairs */ \
+       /* `gap` apart, with the gap divided by shrink_factor before each pass */ \
+       /* while it is larger than 2.  A final insertion sort finishes the job */ \
+       /* unless the loop ended with gap == 1. */ \
+       void ks_combsort_##name(size_t n, type_t a[])                                           \
+       {                                                                                                                                       \
+               const double shrink_factor = 1.2473309501039786540366528676643; \
+               int do_swap;                                                                                                    \
+               size_t gap = n;                                                                                                 \
+               type_t tmp, *i, *j;                                                                                             \
+               do {                                                                                                                    \
+                       if (gap > 2) {                                                                                          \
+                               gap = (size_t)(gap / shrink_factor);                                    \
+                               /* gaps of 9 and 10 are replaced by 11 */ \
+                               if (gap == 9 || gap == 10) gap = 11;                                    \
+                       }                                                                                                                       \
+                       do_swap = 0;                                                                                            \
+                       for (i = a; i < a + n - gap; ++i) {                                                     \
+                               j = i + gap;                                                                                    \
+                               if (__sort_lt(*j, *i)) {                                                                \
+                                       tmp = *i; *i = *j; *j = tmp;                                            \
+                                       do_swap = 1;                                                                            \
+                               }                                                                                                               \
+                       }                                                                                                                       \
+               /* keep going while a pass swapped something or the gap can still shrink */ \
+               } while (do_swap || gap > 2);                                                                   \
+               if (gap != 1) __ks_insertsort_##name(a, a + n);                                 \
+       }                                                                                                                                       \
+       /* Introsort of a[0..n): an iterative quicksort driven by an explicit */ \
+       /* stack of pending ranges.  Ranges whose depth budget runs out are */ \
+       /* handed to ks_combsort_<name>; partitions of 16 elements or fewer */ \
+       /* are left unsorted and fixed by one insertion sort over the whole */ \
+       /* array at the very end. */ \
+       /* NOTE(review): the malloc result for the stack is not checked. */ \
+       void ks_introsort_##name(size_t n, type_t a[])                                          \
+       {                                                                                                                                       \
+               int d;                                                                                                                  \
+               ks_isort_stack_t *top, *stack;                                                                  \
+               type_t rp, swap_tmp;                                                                                    \
+               type_t *s, *t, *i, *j, *k;                                                                              \
+                                                                                                                                               \
+               if (n < 1) return;                                                                                              \
+               else if (n == 2) {                                                                                              \
+                       if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \
+                       return;                                                                                                         \
+               }                                                                                                                               \
+               /* d: smallest value >= 2 with 2^d >= n; it sizes the stack and, doubled, is the depth budget */ \
+               for (d = 2; 1ul<<d < n; ++d);                                                                   \
+               stack = (ks_isort_stack_t*)malloc(sizeof(ks_isort_stack_t) * ((sizeof(size_t)*d)+2)); \
+               top = stack; s = a; t = a + (n-1); d <<= 1;                                             \
+               while (1) {                                                                                                             \
+                       if (s < t) {                                                                                            \
+                               /* budget exhausted: sort [s, t] with comb sort and mark the range as done */ \
+                               if (--d == 0) {                                                                                 \
+                                       ks_combsort_##name(t - s + 1, s);                                       \
+                                       t = s;                                                                                          \
+                                       continue;                                                                                       \
+                               }                                                                                                               \
+                               /* pick the pivot from three candidates: first, middle and last element */ \
+                               i = s; j = t; k = i + ((j-i)>>1) + 1;                                   \
+                               if (__sort_lt(*k, *i)) {                                                                \
+                                       if (__sort_lt(*k, *j)) k = j;                                           \
+                               } else k = __sort_lt(*j, *i)? i : j;                                    \
+                               rp = *k;                                                                                                \
+                               /* park the pivot at the right end while partitioning */ \
+                               if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; }  \
+                               for (;;) {                                                                                              \
+                                       do ++i; while (__sort_lt(*i, rp));                                      \
+                                       do --j; while (i <= j && __sort_lt(rp, *j));            \
+                                       if (j <= i) break;                                                                      \
+                                       swap_tmp = *i; *i = *j; *j = swap_tmp;                          \
+                               }                                                                                                               \
+                               /* move the pivot to its final position i */ \
+                               swap_tmp = *i; *i = *t; *t = swap_tmp;                                  \
+                               /* push the larger side (if > 16 elements) and continue with the smaller one */ \
+                               if (i-s > t-i) {                                                                                \
+                                       if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \
+                                       s = t-i > 16? i+1 : t;                                                          \
+                               } else {                                                                                                \
+                                       if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \
+                                       t = i-s > 16? i-1 : s;                                                          \
+                               }                                                                                                               \
+                       } else {                                                                                                        \
+                               if (top == stack) {                                                                             \
+                                       /* no pending ranges left: finish the small partitions and return */ \
+                                       free(stack);                                                                            \
+                                       __ks_insertsort_##name(a, a+n);                                         \
+                                       return;                                                                                         \
+                               } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \
+                       }                                                                                                                       \
+               }                                                                                                                               \
+       }                                                                                                                                       \
+       /* This function is adapted from: http://ndevilla.free.fr/median/ */ \
+       /* 0 <= kk < n */                                                                                                       \
+       /* Returns the element that would be at index kk if arr[0..n) were */ \
+       /* sorted by __sort_lt.  The array is partially reordered in place. */ \
+       type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk)                      \
+       {                                                                                                                                       \
+               type_t *low, *high, *k, *ll, *hh, *mid;                                                 \
+               low = arr; high = arr + n - 1; k = arr + kk;                                    \
+               for (;;) {                                                                                                              \
+                       /* one element left: it is the answer */ \
+                       if (high <= low) return *k;                                                                     \
+                       /* two elements left: order them, then read the answer */ \
+                       if (high == low + 1) {                                                                          \
+                               if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+                               return *k;                                                                                              \
+                       }                                                                                                                       \
+                       /* arrange *mid <= *low <= *high so that *low becomes the pivot */ \
+                       mid = low + (high - low) / 2;                                                           \
+                       if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \
+                       if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \
+                       if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low);      \
+                       KSORT_SWAP(type_t, *mid, *(low+1));                                                     \
+                       ll = low + 1; hh = high;                                                                        \
+                       /* partition (low+1, high) around the pivot *low */ \
+                       for (;;) {                                                                                                      \
+                               do ++ll; while (__sort_lt(*ll, *low));                                  \
+                               do --hh; while (__sort_lt(*low, *hh));                                  \
+                               if (hh < ll) break;                                                                             \
+                               KSORT_SWAP(type_t, *ll, *hh);                                                   \
+                       }                                                                                                                       \
+                       /* put the pivot in its final place, then keep only the side containing k */ \
+                       KSORT_SWAP(type_t, *low, *hh);                                                          \
+                       if (hh <= k) low = ll;                                                                          \
+                       if (hh >= k) high = hh - 1;                                                                     \
+               }                                                                                                                               \
+       }
+
+/* Convenience wrappers: ks_xxx(name, ...) calls the ks_xxx_<name>() function
+ * generated by KSORT_INIT(name, ...). */
+#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
+#define ks_introsort(name, n, a) ks_introsort_##name(n, a)
+#define ks_combsort(name, n, a) ks_combsort_##name(n, a)
+#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a)
+#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
+#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
+#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+
+/* Ready-made "less than" comparators: operator < and strcmp() order. */
+#define ks_lt_generic(a, b) ((a) < (b))
+#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
+
+typedef const char *ksstr_t;
+
+/* KSORT_INIT_GENERIC(T) instantiates the functions for type T under the name
+ * T, so T must be a single identifier because it is pasted into the function
+ * names.  KSORT_INIT_STR instantiates them for ksstr_t under the name `str`. */
+#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic)
+#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str)
+
+#endif
diff --git a/misc/Makefile b/misc/Makefile
new file mode 100644 (file)
index 0000000..8a38f54
--- /dev/null
@@ -0,0 +1,52 @@
+# Build rules for the small helper programs listed in $(PROG).
+CC=                    gcc
+CXX=           g++
+CFLAGS=                -g -Wall -O2 -m64 #-arch ppc
+CXXFLAGS=      $(CFLAGS)
+DFLAGS=                #-D_FILE_OFFSET_BITS=64
+OBJS=          
+PROG=          faidx md5sum-lite md5fa maq2sam-short maq2sam-long
+INCLUDES=
+LIBS=          -lm -lz
+SUBDIRS=       .
+
+.SUFFIXES:.c .o
+
+.c.o:
+               $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all:$(PROG)
+
+# "<target>-recur" runs "$(MAKE) <target>" in every directory of $(SUBDIRS),
+# forwarding the compiler and flag variables, and stops at the first failure.
+lib-recur all-recur clean-recur cleanlocal-recur install-recur:
+               @target=`echo $@ | sed s/-recur//`; \
+               wdir=`pwd`; \
+               list='$(SUBDIRS)'; for subdir in $$list; do \
+                       cd $$subdir; \
+                       $(MAKE) CC="$(CC)" CXX="$(CXX)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+                               INCLUDES="$(INCLUDES)" $$target || exit 1; \
+                       cd $$wdir; \
+               done;
+
+lib:
+
+# faidx and md5sum-lite are built from library sources with their *_MAIN
+# macro defined.
+faidx:../faidx.c ../faidx.h
+               $(CC) $(CFLAGS) -DFAIDX_MAIN -o $@ ../faidx.c
+
+md5fa:md5.o md5fa.o md5.h ../kseq.h
+               $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz
+
+md5sum-lite:md5.c md5.h
+               $(CC) $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c
+
+# The two maq2sam programs come from the same source; the "long" variant is
+# compiled with MAQ_LONGREADS defined.
+maq2sam-short:maq2sam.c
+               $(CC) $(CFLAGS) -o $@ maq2sam.c -lz
+
+maq2sam-long:maq2sam.c
+               $(CC) $(CFLAGS) -DMAQ_LONGREADS -o $@ maq2sam.c -lz
+
+# md5fa.c needs -I.. for headers in the parent directory.
+md5fa.o:md5.h md5fa.c
+               $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c
+
+cleanlocal:
+               rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a
+
+clean:cleanlocal-recur
diff --git a/misc/export2sam.pl b/misc/export2sam.pl
new file mode 100755 (executable)
index 0000000..ae82123
--- /dev/null
@@ -0,0 +1,107 @@
+#!/usr/bin/perl -w
+
+# Contact: lh3
+# Version: 0.1.0
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+# Entry point: convert the export file(s) named on the command line, then exit.
+&export2sam;
+exit;
+
+# Convert one export file (single-end) or two parallel export files
+# (paired-end: line N of both files is treated as one pair) to SAM records
+# on STDOUT.  File names are taken from @ARGV.
+sub export2sam {
+  my ($fh1, $fh2, $is_paired);
+  $is_paired = (@ARGV >= 2);
+  die("export2sam.pl <read1.export> [<read2.export>]\n") if (@ARGV == 0);
+  # two-argument open is kept so that '-' and 'command |' arguments keep working
+  open($fh1, $ARGV[0]) || die("export2sam.pl: fail to open '$ARGV[0]': $!\n");
+  if ($is_paired) {
+       open($fh2, $ARGV[1]) || die("export2sam.pl: fail to open '$ARGV[1]': $!\n");
+  }
+  # conversion table: entry q+64 (q = -64..64) holds the character
+  # chr(33 + round(10*log10(1 + 10^(q/10)))); export2sam_aux() looks it up
+  # by the character code of each input quality character
+  my @conv_table;
+  for (-64..64) {
+       $conv_table[$_+64] = chr(int(33 + 10*log(1+10**($_/10.0))/log(10)+.499));
+  }
+  # core loop
+  while (<$fh1>) {
+       my (@s1, @s2);
+       &export2sam_aux($_, \@s1, \@conv_table, $is_paired);
+       if ($is_paired) {
+         $_ = <$fh2>;
+         # a shorter second file would silently break the pairing
+         defined($_) || die("export2sam.pl: '$ARGV[1]' has fewer lines than '$ARGV[0]'\n");
+         &export2sam_aux($_, \@s2, \@conv_table, $is_paired);
+         if (@s1 && @s2) { # then set mate coordinate
+               my $isize = 0;
+               if ($s1[2] ne '*' && $s1[2] eq $s2[2]) { # then calculate $isize
+                 # for a reverse-strand read (0x10) use the position past its last base
+                 my $x1 = ($s1[1] & 0x10)? $s1[3] + length($s1[9]) : $s1[3];
+                 my $x2 = ($s2[1] & 0x10)? $s2[3] + length($s2[9]) : $s2[3];
+                 $isize = $x2 - $x1;
+               }
+               # update mate coordinate
+               if ($s2[2] ne '*') {
+                 @s1[6..8] = (($s2[2] eq $s1[2])? "=" : $s2[2], $s2[3], $isize);
+                 $s1[1] |= 0x20 if ($s2[1] & 0x10);
+               } else {
+                 $s1[1] |= 0x8;
+               }
+               if ($s1[2] ne '*') {
+                 @s2[6..8] = (($s1[2] eq $s2[2])? "=" : $s1[2], $s1[3], -$isize);
+                 $s2[1] |= 0x20 if ($s1[1] & 0x10);
+               } else {
+                 $s2[1] |= 0x8;
+               }
+         }
+       }
+       # records rejected by export2sam_aux() come back empty and are skipped
+       print join("\t", @s1), "\n" if (@s1);
+       print join("\t", @s2), "\n" if (@s2 && $is_paired);
+  }
+  close($fh1);
+  close($fh2) if ($is_paired);
+}
+
+# Convert one line of an export file into the fields of a SAM record.
+#   $line      - the raw, tab-separated input line
+#   $s         - array ref that receives the SAM fields (indices 0..10 are the
+#                mandatory columns, optional tags follow); it is left empty
+#                when the line is rejected
+#   $ct        - array ref: quality conversion table indexed by character code
+#   $is_paired - true when the read belongs to a pair
+sub export2sam_aux {
+  my ($line, $s, $ct, $is_paired) = @_;
+  chomp($line);
+  my @t = split("\t", $line);
+  @$s = ();
+  # skip truncated lines and lines whose column 22 is not 'Y'
+  return if (@t < 22 || $t[21] ne 'Y');
+  # read name
+  $s->[0] = $t[1]? "$t[0]_$t[1]:$t[2]:$t[3]:$t[4]:$t[5]" : "$t[0]:$t[2]:$t[3]:$t[4]:$t[5]";
+  # initial flag (will be updated later)
+  $s->[1] = 0;
+  # paired (0x1) plus 0x40 when $t[7] is 1 or 0x80 when it is 2
+  $s->[1] |= 1 | 1<<(5 + $t[7]) if ($is_paired);
+  # read & quality
+  $s->[9] = $t[8]; $s->[10] = $t[9];
+  if ($t[13] eq 'R') { # then reverse the sequence and quality
+       $s->[9] = reverse($t[8]);
+       $s->[9] =~ tr/ACGTacgt/TGCAtgca/;
+       $s->[10] = reverse($t[9]);
+  }
+  $s->[10] =~ s/(.)/$ct->[ord($1)]/eg; # change coding
+  # cigar
+  $s->[5] = length($s->[9]) . "M";
+  # coor
+  my $has_coor = 0;
+  $s->[2] = "*";
+  if ($t[10] eq 'NM') {
+       # 0x4 marks the read itself as unmapped; 0x8 is the mate-unmapped bit
+       # and is set by the caller from the mate's record
+       $s->[1] |= 0x4; # unmapped
+  } elsif ($t[10] =~ /(\d+):(\d+):(\d+)/) {
+       # three counts instead of a reference name: no coordinate is available,
+       # so the read is reported as unmapped and the counts are kept as tags
+       $s->[1] |= 0x4;
+       push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
+  } else {
+       $s->[2] = $t[10];
+       $has_coor = 1;
+  }
+  $s->[3] = $has_coor? $t[12] : 0;
+  $s->[1] |= 0x10 if ($has_coor && $t[13] eq 'R');
+  # mapQ: the larger of $t[15] and $t[16] when both are present
+  $s->[4] = 0;
+  $s->[4] = $t[15] if ($t[15] ne '');
+  $s->[4] = $t[16] if ($t[16] ne '' && $s->[4] < $t[16]);
+  # mate coordinate
+  $s->[6] = '*'; $s->[7] = $s->[8] = 0;
+  # aux
+  push(@$s, "BC:Z:$t[6]") if ($t[6]);
+  push(@$s, "MD:Z:$t[14]") if ($has_coor);
+  push(@$s, "SM:i:$t[15]") if ($is_paired && $has_coor);
+}
diff --git a/misc/maq2sam.c b/misc/maq2sam.c
new file mode 100644 (file)
index 0000000..e30aa92
--- /dev/null
@@ -0,0 +1,168 @@
+#include <string.h>
+#include <zlib.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <assert.h>
+
+//#define MAQ_LONGREADS
+
+#ifdef MAQ_LONGREADS
+#  define MAX_READLEN 128
+#else
+#  define MAX_READLEN 64
+#endif
+
+#define MAX_NAMELEN 36
+#define MAQMAP_FORMAT_OLD 0
+#define MAQMAP_FORMAT_NEW -1
+
+#define PAIRFLAG_FF      0x01
+#define PAIRFLAG_FR      0x02
+#define PAIRFLAG_RF      0x04
+#define PAIRFLAG_RR      0x08
+#define PAIRFLAG_PAIRED  0x10
+#define PAIRFLAG_DIFFCHR 0x20
+#define PAIRFLAG_NOMATCH 0x40
+#define PAIRFLAG_SW      0x80
+
+/*
+ * One alignment record. maq2tam_core() fills it with a single gzread() of
+ * sizeof(maqmap1_t) bytes, so the field order and sizes must stay exactly
+ * as they are.
+ */
+typedef struct
+{
+       /* Per-base bytes: the converter prints bits 6-7 as the base (index into
+        * "ACGT"), the low 6 bits as the quality, and 'N' for a zero byte. For
+        * records whose flag is 130 it reads seq[MAX_READLEN-1] as a signed
+        * indel length. */
+       uint8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. */
+       /* size: number of seq[] entries printed; map_qual: printed as mapping
+        * quality (for flag 130 it is used as the length before the indel and
+        * alt_qual is printed instead); info1 (low 4 bits), info2, c[0], c[1]
+        * and flag are printed as the NM, UQ, H0, H1 and MF tags; alt_qual as Aq. */
+       uint8_t size, map_qual, info1, info2, c[2], flag, alt_qual;
+       /* seqid indexes maqmap_t.ref_name; pos>>1 is the 0-based position and
+        * pos&1 selects the reverse-strand flag. */
+       uint32_t seqid, pos;
+       /* printed in the insert-size column */
+       int dist;
+       /* read name; passed to strlen() and "%s" by the converter */
+       char name[MAX_NAMELEN];
+} maqmap1_t;
+
+/* Header of a map file as loaded by maqmap_read_header(). */
+typedef struct
+{
+       /* format: magic word read from the file (MAQMAP_FORMAT_NEW expected);
+        * n_ref: number of entries in ref_name */
+       int format, n_ref;
+       /* reference names, each a separately allocated string */
+       char **ref_name;
+       /* record count stored in the header */
+       uint64_t n_mapped_reads;
+       /* never filled in this file; only released by maq_delete_maqmap() */
+       maqmap1_t *mapped_reads;
+} maqmap_t;
+
+/*
+ * Allocate a zero-initialised map header whose format is preset to
+ * MAQMAP_FORMAT_NEW. Never returns NULL: on allocation failure a message is
+ * printed and the program exits, since maqmap_read_header() uses the result
+ * straight away. Release the object with maq_delete_maqmap().
+ */
+maqmap_t *maq_new_maqmap()
+{
+       maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t));
+       if (mm == 0) {
+               fprintf(stderr, "** Out of memory.\n");
+               exit(1);
+       }
+       mm->format = MAQMAP_FORMAT_NEW;
+       return mm;
+}
+/*
+ * Free a map header together with its reference names and record array.
+ * Passing a null pointer is allowed and does nothing.
+ */
+void maq_delete_maqmap(maqmap_t *mm)
+{
+       int k = 0;
+       if (!mm) return;
+       while (k < mm->n_ref) {
+               free(mm->ref_name[k]);
+               ++k;
+       }
+       free(mm->ref_name);
+       free(mm->mapped_reads);
+       free(mm);
+}
+/* Read exactly len bytes into buf, or report a damaged header and exit. */
+static void maqmap_read_exact(gzFile fp, void *buf, unsigned len)
+{
+       if (gzread(fp, buf, len) != (int)len) {
+               fprintf(stderr, "** Truncated or unreadable map header.\n");
+               exit(3);
+       }
+}
+
+/*
+ * Read the map header from fp: format word, reference count, the
+ * length-prefixed reference names and the number of mapped reads.
+ *
+ * Returns a newly allocated maqmap_t (free with maq_delete_maqmap()).
+ * Any short read, unknown format, negative count/length or allocation
+ * failure prints a message and terminates the program, because the rest of
+ * the stream cannot be interpreted without a valid header.
+ */
+maqmap_t *maqmap_read_header(gzFile fp)
+{
+       maqmap_t *mm;
+       int k, len;
+       mm = maq_new_maqmap();
+       if (mm == 0) {
+               fprintf(stderr, "** Out of memory.\n");
+               exit(1);
+       }
+       maqmap_read_exact(fp, &mm->format, sizeof(int));
+       if (mm->format != MAQMAP_FORMAT_NEW) {
+               if (mm->format > 0)
+                       fprintf(stderr, "** Obsolete map format is detected. Please use 'mapass2maq' command to convert the format.\n");
+               else
+                       fprintf(stderr, "** Unrecognized map format (%d).\n", mm->format);
+               exit(3);
+       }
+       maqmap_read_exact(fp, &mm->n_ref, sizeof(int));
+       if (mm->n_ref < 0) {
+               fprintf(stderr, "** Invalid number of reference sequences (%d).\n", mm->n_ref);
+               exit(3);
+       }
+       if (mm->n_ref > 0) {
+               mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*));
+               if (mm->ref_name == 0) {
+                       fprintf(stderr, "** Out of memory.\n");
+                       exit(1);
+               }
+       }
+       for (k = 0; k != mm->n_ref; ++k) {
+               maqmap_read_exact(fp, &len, sizeof(int));
+               if (len < 0) {
+                       fprintf(stderr, "** Invalid reference name length (%d).\n", len);
+                       exit(3);
+               }
+               /* one spare byte so the name is a valid string whatever the file holds */
+               mm->ref_name[k] = (char*)malloc((size_t)len + 1);
+               if (mm->ref_name[k] == 0) {
+                       fprintf(stderr, "** Out of memory.\n");
+                       exit(1);
+               }
+               maqmap_read_exact(fp, mm->ref_name[k], (unsigned)len);
+               mm->ref_name[k][len] = '\0';
+       }
+       /* read number of mapped reads */
+       maqmap_read_exact(fp, &mm->n_mapped_reads, sizeof(uint64_t));
+       return mm;
+}
+
+/*
+ * Read the map header and then every alignment record from fp, printing one
+ * tab-delimited SAM line per record on stdout.
+ *
+ * SAM FLAG bits produced: 0x1 paired (any non-zero maq flag), 0x2 proper
+ * pair, 0x4 unmapped (maq flag 192), 0x8 mate unmapped (maq flag 64),
+ * 0x10 reverse strand, 0x20 mate reverse strand, 0x40/0x80 first/second read
+ * (taken from a trailing "/1" or "/2" in the name, which is then removed).
+ *
+ * A record whose read length or reference index is out of range means the
+ * stream is corrupt (or was written with a different MAX_READLEN); the
+ * function reports it and exits rather than reading out of bounds.
+ */
+void maq2tam_core(gzFile fp)
+{
+       maqmap_t *mm;
+       maqmap1_t mm1, *m1;
+       int ret;
+       m1 = &mm1;
+       mm = maqmap_read_header(fp);
+       while ((ret = gzread(fp, m1, sizeof(maqmap1_t))) == (int)sizeof(maqmap1_t)) {
+               int j, flag = 0;
+               if (m1->size > MAX_READLEN || mm->n_ref <= 0 || m1->seqid >= (uint32_t)mm->n_ref) {
+                       fprintf(stderr, "** Corrupted alignment record (read length %d, reference index %u).\n",
+                                       m1->size, (unsigned)m1->seqid);
+                       maq_delete_maqmap(mm);
+                       exit(3);
+               }
+               /* the name comes straight from the file: make sure it is terminated */
+               m1->name[MAX_NAMELEN-1] = '\0';
+               if (m1->flag) flag |= 1;
+               if ((m1->flag&PAIRFLAG_PAIRED) || ((m1->flag&PAIRFLAG_SW) && m1->flag != 192)) flag |= 2;
+               if (m1->flag == 192) flag |= 4;
+               if (m1->flag == 64) flag |= 8;
+               if (m1->pos&1) flag |= 0x10;
+               if ((flag&1) && m1->dist != 0) {
+                       /* c is the strand of the mate, derived from the pair orientation */
+                       int c;
+                       if (m1->dist > 0) {
+                               if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_RF)) c = 0;
+                               else if (m1->flag&(PAIRFLAG_FR|PAIRFLAG_RR)) c = 1;
+                               else c = m1->pos&1;
+                       } else {
+                               if (m1->flag&(PAIRFLAG_FF|PAIRFLAG_FR)) c = 0;
+                               else if (m1->flag&(PAIRFLAG_RF|PAIRFLAG_RR)) c = 1;
+                               else c = m1->pos&1;
+                       }
+                       /* "flag |= c" only re-set bit 0x1; the mate strand belongs in 0x20 */
+                       if (c) flag |= 0x20;
+               }
+               if (flag) {
+                       size_t l = strlen(m1->name);
+                       if (l >= 2 && m1->name[l-2] == '/') {
+                               flag |= (m1->name[l-1] == '1')? 0x40 : 0x80;
+                               m1->name[l-2] = '\0';
+                       }
+               }
+               printf("%s\t%d\t", m1->name, flag);
+               printf("%s\t%u\t", mm->ref_name[m1->seqid], (unsigned)(m1->pos>>1)+1);
+               if (m1->flag == 130) {
+                       /* gapped alignment: map_qual holds the length before the indel and
+                        * the last seq byte the signed indel length (>0 insertion, <0 deletion) */
+                       int c = (int8_t)m1->seq[MAX_READLEN-1];
+                       printf("%d\t", m1->alt_qual);
+                       if (c == 0) printf("%dM\t", m1->size);
+                       else {
+                               if (c > 0) printf("%dM%dI%dM\t", m1->map_qual, c, m1->size - m1->map_qual - c);
+                               else printf("%dM%dD%dM\t", m1->map_qual, -c, m1->size - m1->map_qual);
+                       }
+               } else {
+                       if (flag&4) printf("0\t*\t");
+                       else printf("%d\t%dM\t", m1->map_qual, m1->size);
+               }
+               printf("*\t0\t%d\t", m1->dist);
+               for (j = 0; j != m1->size; ++j) {
+                       if (m1->seq[j] == 0) putchar('N');
+                       else putchar("ACGT"[m1->seq[j]>>6&3]);
+               }
+               putchar('\t');
+               for (j = 0; j != m1->size; ++j)
+                       putchar((m1->seq[j]&0x3f) + 33);
+               putchar('\t');
+               if (flag&4) {
+                       printf("MF:i:%d\n", m1->flag);
+               } else {
+                       printf("MF:i:%d\t", m1->flag);
+                       if (m1->flag) printf("Aq:i:%d\t", m1->alt_qual);
+                       printf("NM:i:%d\tUQ:i:%d\tH0:i:%d\tH1:i:%d\n", m1->info1&0xf, m1->info2, m1->c[0], m1->c[1]);
+               }
+       }
+       if (ret > 0)
+               fprintf(stderr, "Truncated! Continue anyway.\n");
+       else if (ret < 0)
+               fprintf(stderr, "Read error! Output may be incomplete.\n");
+       maq_delete_maqmap(mm);
+}
+
+/*
+ * Usage: maq2tam <in.map>   ("-" reads the map from standard input).
+ * Returns 1 when no input is given or the input cannot be opened.
+ */
+int main(int argc, char *argv[])
+{
+       gzFile fp;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: maq2tam <in.map>\n");
+               return 1;
+       }
+       /* the map file is binary: open it with "b" so no platform translates it */
+       fp = strcmp(argv[1], "-")? gzopen(argv[1], "rb") : gzdopen(fileno(stdin), "rb");
+       if (fp == 0) {
+               fprintf(stderr, "maq2tam: fail to open '%s'\n", argv[1]);
+               return 1;
+       }
+       maq2tam_core(fp);
+       gzclose(fp);
+       return 0;
+}
diff --git a/misc/md5.c b/misc/md5.c
new file mode 100644 (file)
index 0000000..ccead0e
--- /dev/null
@@ -0,0 +1,307 @@
+/*
+ **********************************************************************
+ ** md5.c                                                            **
+ ** RSA Data Security, Inc. MD5 Message Digest Algorithm             **
+ ** Created: 2/17/90 RLR                                             **
+ ** Revised: 1/91 SRD,AJ,BSK,JT Reference C Version                  **
+ **********************************************************************
+ */
+
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ **                                                                  **
+ ** License to copy and use this software is granted provided that   **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function.                                       **
+ **                                                                  **
+ ** License is also granted to make and use derivative works         **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
+ ** material mentioning or referencing the derived work.             **
+ **                                                                  **
+ ** RSA Data Security, Inc. makes no representations concerning      **
+ ** either the merchantability of this software or the suitability   **
+ ** of this software for any particular purpose.  It is provided "as **
+ ** is" without express or implied warranty of any kind.             **
+ **                                                                  **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software.                                   **
+ **********************************************************************
+ */
+
+#include "md5.h"
+
+/* forward declaration (with a full prototype so calls are type-checked) */
+static void Transform (UINT4 *buf, UINT4 *in);
+
+/* Padding appended by MD5Final: one 0x80 byte followed by zero bytes. */
+static unsigned char PADDING[64] = {
+  0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+/* F, G, H and I are the basic MD5 functions, one per round.
+   Every argument is parenthesised so that compound expressions are safe. */
+#define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+
+/* ROTATE_LEFT rotates x left n bits (x must be a 32-bit unsigned value) */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
+/* Rotation is separate from addition to prevent recomputation */
+/* Each expands to a single statement, so it is safe in any context */
+#define FF(a, b, c, d, x, s, ac) \
+  do { \
+   (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  } while (0)
+#define GG(a, b, c, d, x, s, ac) \
+  do { \
+   (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  } while (0)
+#define HH(a, b, c, d, x, s, ac) \
+  do { \
+   (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  } while (0)
+#define II(a, b, c, d, x, s, ac) \
+  do { \
+   (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
+   (a) = ROTATE_LEFT ((a), (s)); \
+   (a) += (b); \
+  } while (0)
+
+/* Start a new digest: clear the bit counter and load the initial state.
+   mdContext must point to caller-owned storage. */
+void MD5Init (MD5_CTX *mdContext)
+{
+  mdContext->i[0] = mdContext->i[1] = (UINT4)0;
+
+  /* Load magic initialization constants.
+   */
+  mdContext->buf[0] = (UINT4)0x67452301;
+  mdContext->buf[1] = (UINT4)0xefcdab89;
+  mdContext->buf[2] = (UINT4)0x98badcfe;
+  mdContext->buf[3] = (UINT4)0x10325476;
+}
+
+/* Feed inLen bytes from inBuf into the digest. Bytes are collected in
+   mdContext->in and a Transform is run each time 64 of them are available;
+   the 64-bit bit counter in mdContext->i is advanced first. */
+void MD5Update (MD5_CTX *mdContext, unsigned char *inBuf, unsigned int inLen)
+{
+  UINT4 in[16];
+  int mdi;
+  unsigned int i, ii;
+
+  /* compute number of bytes mod 64 */
+  mdi = (int)((mdContext->i[0] >> 3) & 0x3F);
+
+  /* update number of bits (carry into the high word on wrap-around) */
+  if ((mdContext->i[0] + ((UINT4)inLen << 3)) < mdContext->i[0])
+    mdContext->i[1]++;
+  mdContext->i[0] += ((UINT4)inLen << 3);
+  mdContext->i[1] += ((UINT4)inLen >> 29);
+
+  while (inLen--) {
+    /* add new character to buffer, increment mdi */
+    mdContext->in[mdi++] = *inBuf++;
+
+    /* transform if necessary */
+    if (mdi == 0x40) {
+      /* assemble the 64 buffered bytes into 16 little-endian words */
+      for (i = 0, ii = 0; i < 16; i++, ii += 4)
+        in[i] = (((UINT4)mdContext->in[ii+3]) << 24) |
+                (((UINT4)mdContext->in[ii+2]) << 16) |
+                (((UINT4)mdContext->in[ii+1]) << 8) |
+                ((UINT4)mdContext->in[ii]);
+      Transform (mdContext->buf, in);
+      mdi = 0;
+    }
+  }
+}
+
+/* Finish the digest: pad the message, append its length in bits, run the
+   last Transform and store the 16 result bytes in mdContext->digest. */
+void MD5Final (MD5_CTX *mdContext)
+{
+  UINT4 in[16];
+  int mdi;
+  unsigned int i, ii;
+  unsigned int padLen;
+
+  /* save number of bits (the padding below goes through MD5Update,
+     which changes the counter) */
+  in[14] = mdContext->i[0];
+  in[15] = mdContext->i[1];
+
+  /* compute number of bytes mod 64 */
+  mdi = (int)((mdContext->i[0] >> 3) & 0x3F);
+
+  /* pad out to 56 mod 64 */
+  padLen = (mdi < 56) ? (56 - mdi) : (120 - mdi);
+  MD5Update (mdContext, PADDING, padLen);
+
+  /* append length in bits and transform */
+  for (i = 0, ii = 0; i < 14; i++, ii += 4)
+    in[i] = (((UINT4)mdContext->in[ii+3]) << 24) |
+            (((UINT4)mdContext->in[ii+2]) << 16) |
+            (((UINT4)mdContext->in[ii+1]) << 8) |
+            ((UINT4)mdContext->in[ii]);
+  Transform (mdContext->buf, in);
+
+  /* store buffer in digest, least significant byte of each word first */
+  for (i = 0, ii = 0; i < 4; i++, ii += 4) {
+    mdContext->digest[ii] = (unsigned char)(mdContext->buf[i] & 0xFF);
+    mdContext->digest[ii+1] =
+      (unsigned char)((mdContext->buf[i] >> 8) & 0xFF);
+    mdContext->digest[ii+2] =
+      (unsigned char)((mdContext->buf[i] >> 16) & 0xFF);
+    mdContext->digest[ii+3] =
+      (unsigned char)((mdContext->buf[i] >> 24) & 0xFF);
+  }
+}
+
+/* Basic MD5 step. Transform buf based on in.
+ * buf holds the four 32-bit state words and is updated in place;
+ * in holds the sixteen 32-bit words of one 64-byte block.
+ */
+static void Transform (UINT4 *buf, UINT4 *in)
+{
+  UINT4 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
+
+  /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+  FF ( a, b, c, d, in[ 0], S11, 3614090360u); /* 1 */
+  FF ( d, a, b, c, in[ 1], S12, 3905402710u); /* 2 */
+  FF ( c, d, a, b, in[ 2], S13,  606105819u); /* 3 */
+  FF ( b, c, d, a, in[ 3], S14, 3250441966u); /* 4 */
+  FF ( a, b, c, d, in[ 4], S11, 4118548399u); /* 5 */
+  FF ( d, a, b, c, in[ 5], S12, 1200080426u); /* 6 */
+  FF ( c, d, a, b, in[ 6], S13, 2821735955u); /* 7 */
+  FF ( b, c, d, a, in[ 7], S14, 4249261313u); /* 8 */
+  FF ( a, b, c, d, in[ 8], S11, 1770035416u); /* 9 */
+  FF ( d, a, b, c, in[ 9], S12, 2336552879u); /* 10 */
+  FF ( c, d, a, b, in[10], S13, 4294925233u); /* 11 */
+  FF ( b, c, d, a, in[11], S14, 2304563134u); /* 12 */
+  FF ( a, b, c, d, in[12], S11, 1804603682u); /* 13 */
+  FF ( d, a, b, c, in[13], S12, 4254626195u); /* 14 */
+  FF ( c, d, a, b, in[14], S13, 2792965006u); /* 15 */
+  FF ( b, c, d, a, in[15], S14, 1236535329u); /* 16 */
+
+  /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+  GG ( a, b, c, d, in[ 1], S21, 4129170786u); /* 17 */
+  GG ( d, a, b, c, in[ 6], S22, 3225465664u); /* 18 */
+  GG ( c, d, a, b, in[11], S23,  643717713u); /* 19 */
+  GG ( b, c, d, a, in[ 0], S24, 3921069994u); /* 20 */
+  GG ( a, b, c, d, in[ 5], S21, 3593408605u); /* 21 */
+  GG ( d, a, b, c, in[10], S22,   38016083u); /* 22 */
+  GG ( c, d, a, b, in[15], S23, 3634488961u); /* 23 */
+  GG ( b, c, d, a, in[ 4], S24, 3889429448u); /* 24 */
+  GG ( a, b, c, d, in[ 9], S21,  568446438u); /* 25 */
+  GG ( d, a, b, c, in[14], S22, 3275163606u); /* 26 */
+  GG ( c, d, a, b, in[ 3], S23, 4107603335u); /* 27 */
+  GG ( b, c, d, a, in[ 8], S24, 1163531501u); /* 28 */
+  GG ( a, b, c, d, in[13], S21, 2850285829u); /* 29 */
+  GG ( d, a, b, c, in[ 2], S22, 4243563512u); /* 30 */
+  GG ( c, d, a, b, in[ 7], S23, 1735328473u); /* 31 */
+  GG ( b, c, d, a, in[12], S24, 2368359562u); /* 32 */
+
+  /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+  HH ( a, b, c, d, in[ 5], S31, 4294588738u); /* 33 */
+  HH ( d, a, b, c, in[ 8], S32, 2272392833u); /* 34 */
+  HH ( c, d, a, b, in[11], S33, 1839030562u); /* 35 */
+  HH ( b, c, d, a, in[14], S34, 4259657740u); /* 36 */
+  HH ( a, b, c, d, in[ 1], S31, 2763975236u); /* 37 */
+  HH ( d, a, b, c, in[ 4], S32, 1272893353u); /* 38 */
+  HH ( c, d, a, b, in[ 7], S33, 4139469664u); /* 39 */
+  HH ( b, c, d, a, in[10], S34, 3200236656u); /* 40 */
+  HH ( a, b, c, d, in[13], S31,  681279174u); /* 41 */
+  HH ( d, a, b, c, in[ 0], S32, 3936430074u); /* 42 */
+  HH ( c, d, a, b, in[ 3], S33, 3572445317u); /* 43 */
+  HH ( b, c, d, a, in[ 6], S34,   76029189u); /* 44 */
+  HH ( a, b, c, d, in[ 9], S31, 3654602809u); /* 45 */
+  HH ( d, a, b, c, in[12], S32, 3873151461u); /* 46 */
+  HH ( c, d, a, b, in[15], S33,  530742520u); /* 47 */
+  HH ( b, c, d, a, in[ 2], S34, 3299628645u); /* 48 */
+
+  /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+  II ( a, b, c, d, in[ 0], S41, 4096336452u); /* 49 */
+  II ( d, a, b, c, in[ 7], S42, 1126891415u); /* 50 */
+  II ( c, d, a, b, in[14], S43, 2878612391u); /* 51 */
+  II ( b, c, d, a, in[ 5], S44, 4237533241u); /* 52 */
+  II ( a, b, c, d, in[12], S41, 1700485571u); /* 53 */
+  II ( d, a, b, c, in[ 3], S42, 2399980690u); /* 54 */
+  II ( c, d, a, b, in[10], S43, 4293915773u); /* 55 */
+  II ( b, c, d, a, in[ 1], S44, 2240044497u); /* 56 */
+  II ( a, b, c, d, in[ 8], S41, 1873313359u); /* 57 */
+  II ( d, a, b, c, in[15], S42, 4264355552u); /* 58 */
+  II ( c, d, a, b, in[ 6], S43, 2734768916u); /* 59 */
+  II ( b, c, d, a, in[13], S44, 1309151649u); /* 60 */
+  II ( a, b, c, d, in[ 4], S41, 4149444226u); /* 61 */
+  II ( d, a, b, c, in[11], S42, 3174756917u); /* 62 */
+  II ( c, d, a, b, in[ 2], S43,  718787259u); /* 63 */
+  II ( b, c, d, a, in[ 9], S44, 3951481745u); /* 64 */
+
+  /* add this block's result to the running state */
+  buf[0] += a;
+  buf[1] += b;
+  buf[2] += c;
+  buf[3] += d;
+}
+
+/* lh3: the following code is added by me */
+
+#ifdef MD5SUM_MAIN
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define HEX_STR "0123456789abcdef"
+
+/*
+ * Print the MD5 digest of file fn ("-" means standard input) as 32 hex
+ * digits followed by two spaces and the name. Exits with status 1 when the
+ * file cannot be opened or a read error occurs.
+ */
+static void md5_one(const char *fn)
+{
+       unsigned char buf[4096];
+       MD5_CTX md5;
+       size_t n;
+       int l;
+       FILE *fp;
+
+       /* binary mode: the digest must cover the bytes exactly as stored */
+       fp = strcmp(fn, "-")? fopen(fn, "rb") : stdin;
+       if (fp == 0) {
+               fprintf(stderr, "md5sum: %s: No such file or directory\n", fn);
+               exit(1);
+       }
+       MD5Init(&md5);
+       while ((n = fread(buf, 1, sizeof(buf), fp)) > 0)
+               MD5Update(&md5, buf, (unsigned int)n);
+       if (ferror(fp)) {
+               /* a digest of partially read data would be silently wrong */
+               fprintf(stderr, "md5sum: %s: read error\n", fn);
+               if (fp != stdin) fclose(fp);
+               exit(1);
+       }
+       MD5Final(&md5);
+       if (fp != stdin) fclose(fp);
+       for (l = 0; l < 16; ++l)
+               printf("%c%c", HEX_STR[md5.digest[l]>>4&0xf], HEX_STR[md5.digest[l]&0xf]);
+       printf("  %s\n", fn);
+}
+/* Digest every file named on the command line, or standard input when
+   no file is given. */
+int main(int argc, char *argv[])
+{
+       int k;
+       if (argc == 1) {
+               md5_one("-");
+               return 0;
+       }
+       for (k = 1; k < argc; ++k)
+               md5_one(argv[k]);
+       return 0;
+}
+#endif
diff --git a/misc/md5.h b/misc/md5.h
new file mode 100644 (file)
index 0000000..678ac27
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ **********************************************************************
+ ** md5.h -- Header file for implementation of MD5                   **
+ ** RSA Data Security, Inc. MD5 Message Digest Algorithm             **
+ ** Created: 2/17/90 RLR                                             **
+ ** Revised: 12/27/90 SRD,AJ,BSK,JT Reference C version              **
+ ** Revised (for MD5): RLR 4/27/91                                   **
+ **   -- G modified to have y&~z instead of y&z                      **
+ **   -- FF, GG, HH modified to add in last register done            **
+ **   -- Access pattern: round 2 works mod 5, round 3 works mod 3    **
+ **   -- distinct additive constant for each step                    **
+ **   -- round 4 added, working mod 7                                **
+ **********************************************************************
+ */
+
+/*
+ **********************************************************************
+ ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
+ **                                                                  **
+ ** License to copy and use this software is granted provided that   **
+ ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
+ ** Digest Algorithm" in all material mentioning or referencing this **
+ ** software or this function.                                       **
+ **                                                                  **
+ ** License is also granted to make and use derivative works         **
+ ** provided that such works are identified as "derived from the RSA **
+ ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
+ ** material mentioning or referencing the derived work.             **
+ **                                                                  **
+ ** RSA Data Security, Inc. makes no representations concerning      **
+ ** either the merchantability of this software or the suitability   **
+ ** of this software for any particular purpose.  It is provided "as **
+ ** is" without express or implied warranty of any kind.             **
+ **                                                                  **
+ ** These notices must be retained in any copies of any part of this **
+ ** documentation and/or software.                                   **
+ **********************************************************************
+ */
+
+#ifndef MD5_H
+#define MD5_H
+
+#include <stdint.h>
+
+/* typedef a 32 bit type */
+typedef uint32_t UINT4;
+
+/* Data structure for MD5 (Message Digest) computation.
+   Initialise with MD5Init, feed data with MD5Update, and read `digest`
+   only after MD5Final has been called. */
+typedef struct {
+  UINT4 i[2];                   /* number of _bits_ handled mod 2^64 */
+  UINT4 buf[4];                                    /* scratch buffer */
+  unsigned char in[64];                              /* input buffer */
+  unsigned char digest[16];     /* actual digest after MD5Final call */
+} MD5_CTX;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       /* Reset mdContext so that a new digest can be computed. */
+       void MD5Init(MD5_CTX *mdContext);
+       /* Add inLen bytes starting at inBuf to the digest in progress. */
+       void MD5Update(MD5_CTX *mdContext, unsigned char *inBuf, unsigned int inLen);
+       /* Finish the computation; the result is left in mdContext->digest. */
+       void MD5Final(MD5_CTX *mdContext);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/misc/md5fa.c b/misc/md5fa.c
new file mode 100644 (file)
index 0000000..c41db2d
--- /dev/null
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <zlib.h>
+#include "md5.h"
+#include "kseq.h"
+
+#define HEX_STR "0123456789abcdef"
+
+KSEQ_INIT(gzFile, gzread)
+
+/*
+ * Print MD5 digests for the sequences in file fn ("-" reads standard input;
+ * the stream is opened through zlib).
+ *
+ * Each sequence is reduced to its alphabetic characters, converted to upper
+ * case, and gets one output line. Two summary lines follow: ">ordered" is
+ * the digest of all reduced sequences concatenated in file order and
+ * ">unordered" is the XOR of the per-sequence digests.
+ */
+static void md5_one(const char *fn)
+{
+       MD5_CTX md5_one, md5_all;
+       int l, i, k;
+       gzFile fp;
+       kseq_t *seq;
+       unsigned char unordered[16];
+
+       for (l = 0; l < 16; ++l) unordered[l] = 0;
+       fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+       if (fp == 0) {
+               fprintf(stderr, "md5fa: %s: No such file or directory\n", fn);
+               exit(1);
+       }
+       
+       MD5Init(&md5_all);
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               for (i = k = 0; i < seq->seq.l; ++i) {
+                       /* <ctype.h> functions need an unsigned char value, not a plain char */
+                       int c = (unsigned char)seq->seq.s[i];
+                       if (islower(c)) seq->seq.s[k++] = toupper(c);
+                       else if (isupper(c)) seq->seq.s[k++] = c;
+               }
+               MD5Init(&md5_one);
+               MD5Update(&md5_one, (unsigned char*)seq->seq.s, k);
+               MD5Final(&md5_one);
+               for (l = 0; l < 16; ++l) {
+                       printf("%c%c", HEX_STR[md5_one.digest[l]>>4&0xf], HEX_STR[md5_one.digest[l]&0xf]);
+                       unordered[l] ^= md5_one.digest[l];
+               }
+               printf("  %s  %s\n", fn, seq->name.s);
+               MD5Update(&md5_all, (unsigned char*)seq->seq.s, k);
+       }
+       MD5Final(&md5_all);
+       kseq_destroy(seq);
+       /* the stream was opened here, so it is released here as well */
+       gzclose(fp);
+       for (l = 0; l < 16; ++l)
+               printf("%c%c", HEX_STR[md5_all.digest[l]>>4&0xf], HEX_STR[md5_all.digest[l]&0xf]);
+       printf("  %s  >ordered\n", fn);
+       for (l = 0; l < 16; ++l)
+               printf("%c%c", HEX_STR[unordered[l]>>4&0xf], HEX_STR[unordered[l]&0xf]);
+       printf("  %s  >unordered\n", fn);
+}
+
+/* Process each file given on the command line; with no arguments the
+   sequences are read from standard input. */
+int main(int argc, char *argv[])
+{
+       int arg = 1;
+       if (argc == 1) {
+               md5_one("-");
+       } else {
+               while (arg < argc) {
+                       md5_one(argv[arg]);
+                       ++arg;
+               }
+       }
+       return 0;
+}
diff --git a/razf.c b/razf.c
new file mode 100644 (file)
index 0000000..6611f0b
--- /dev/null
+++ b/razf.c
@@ -0,0 +1,647 @@
+/*
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * To compile razf.c, zlib-1.2.3(or greater) is required.
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include "razf.h"
+
+/* Return v with its four bytes in reverse order. */
+static inline uint32_t byte_swap_4(uint32_t v){
+       return (v >> 24)
+               | ((v >> 8) & 0x0000FF00U)
+               | ((v << 8) & 0x00FF0000U)
+               | (v << 24);
+}
+
+/* Return v with its eight bytes in reverse order. */
+static inline uint64_t byte_swap_8(uint64_t v){
+       uint64_t swapped = 0;
+       int k;
+       /* move the lowest remaining byte of v to the bottom of the result */
+       for(k=0;k<8;k++){
+               swapped = (swapped << 8) | (v & 0xFFU);
+               v >>= 8;
+       }
+       return swapped;
+}
+
+/* Return non-zero when the host stores the most significant byte of an
+   int first, zero otherwise. */
+static inline int is_big_endian(){
+       const int probe = 0x01;
+       const char *first = (const char*)&probe;
+       /* on a little-endian host the low byte (0x01) comes first in memory */
+       return *first != 0x01;
+}
+
+/*
+ * Append one entry to the block index of rz.
+ *   in   uncompressed offset of the block (not used by the index itself)
+ *   out  compressed offset of the block
+ * Every RZ_BIN_SIZE-th entry stores `out` in full in bin_offsets; all entries
+ * store their distance from the owning bin in cell_offsets. The arrays grow
+ * by a factor of 1.5 when full. Running out of memory is fatal: continuing
+ * would either lose the index or write through a null pointer.
+ */
+static void add_zindex(RAZF *rz, int64_t in, int64_t out){
+       (void)in;
+       if(rz->index->size == rz->index->cap){
+               int new_cap = rz->index->cap * 1.5 + 2;
+               void *cells, *bins;
+               /* keep the old pointers until each realloc is known to have worked */
+               cells = realloc(rz->index->cell_offsets, sizeof(*rz->index->cell_offsets) * new_cap);
+               if(cells == NULL){
+                       fprintf(stderr, "[add_zindex] out of memory\n");
+                       exit(1);
+               }
+               rz->index->cell_offsets = cells;
+               bins = realloc(rz->index->bin_offsets, sizeof(*rz->index->bin_offsets) * (new_cap/RZ_BIN_SIZE + 1));
+               if(bins == NULL){
+                       fprintf(stderr, "[add_zindex] out of memory\n");
+                       exit(1);
+               }
+               rz->index->bin_offsets = bins;
+               rz->index->cap = new_cap;
+       }
+       if(rz->index->size % RZ_BIN_SIZE == 0) rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE] = out;
+       rz->index->cell_offsets[rz->index->size] = out - rz->index->bin_offsets[rz->index->size / RZ_BIN_SIZE];
+       rz->index->size ++;
+}
+
+/*
+ * Write the block index of rz to fd in big-endian byte order: the entry
+ * count, then the bin offsets, then the cell offsets. On a little-endian
+ * host the two arrays are byte-swapped in place before being written, so
+ * the in-memory index is left in file byte order afterwards.
+ */
+static void save_zindex(RAZF *rz, int fd){
+       int32_t k, n_bins, be_size;
+       const int host_is_be = is_big_endian();
+       if(!host_is_be){
+               be_size = byte_swap_4((uint32_t)rz->index->size);
+               write(fd, &be_size, sizeof(uint32_t));
+       } else {
+               write(fd, &rz->index->size, sizeof(int));
+       }
+       n_bins = rz->index->size / RZ_BIN_SIZE + 1;
+       if(!host_is_be){
+               for(k=0;k<n_bins;k++){
+                       rz->index->bin_offsets[k] = byte_swap_8((uint64_t)rz->index->bin_offsets[k]);
+               }
+               for(k=0;k<rz->index->size;k++){
+                       rz->index->cell_offsets[k] = byte_swap_4((uint32_t)rz->index->cell_offsets[k]);
+               }
+       }
+       write(fd, rz->index->bin_offsets, sizeof(int64_t) * n_bins);
+       write(fd, rz->index->cell_offsets, sizeof(int32_t) * rz->index->size);
+}
+
+/*
+ * Read a block index from fd into rz->index (allocated here when absent).
+ * Does nothing unless rz->load_index is set. The file stores the entry
+ * count, the bin offsets and the cell offsets in big-endian order; values
+ * are byte-swapped after reading on a little-endian host.
+ */
+static void load_zindex(RAZF *rz, int fd){
+       int32_t k, n_bins;
+       int host_is_be;
+       if(!rz->load_index) return;
+       if(rz->index == NULL) rz->index = malloc(sizeof(ZBlockIndex));
+       host_is_be = is_big_endian();
+       /* entry count */
+       read(fd, &rz->index->size, sizeof(int));
+       if(!host_is_be) rz->index->size = byte_swap_4((uint32_t)rz->index->size);
+       rz->index->cap = rz->index->size;
+       n_bins = rz->index->size / RZ_BIN_SIZE + 1;
+       /* bin offsets first, cell offsets second, as laid out in the file */
+       rz->index->bin_offsets  = malloc(sizeof(int64_t) * n_bins);
+       read(fd, rz->index->bin_offsets, sizeof(int64_t) * n_bins);
+       rz->index->cell_offsets = malloc(sizeof(int) * rz->index->size);
+       read(fd, rz->index->cell_offsets, sizeof(int) * rz->index->size);
+       if(host_is_be) return;
+       for(k=0;k<n_bins;k++){
+               rz->index->bin_offsets[k] = byte_swap_8((uint64_t)rz->index->bin_offsets[k]);
+       }
+       for(k=0;k<rz->index->size;k++){
+               rz->index->cell_offsets[k] = byte_swap_4((uint32_t)rz->index->cell_offsets[k]);
+       }
+}
+
+/*
+ * Create a RAZF handle that writes a gzip stream to file descriptor fd.
+ * The gzip header carries a 7-byte extra field: the tag "RAZF", one obsolete
+ * byte set to 1, and RZ_BLOCK_SIZE as a big-endian 16-bit number.
+ *
+ * Returns the new handle, or NULL when memory cannot be allocated or zlib
+ * refuses to initialise; nothing is leaked in that case.
+ */
+static RAZF* razf_open_w(int fd){
+       RAZF *rz;
+       rz = calloc(1, sizeof(RAZF));
+       if(rz == NULL) return NULL;
+       rz->mode = 'w';
+       rz->filedes = fd;
+       rz->stream = calloc(sizeof(z_stream), 1);
+       rz->inbuf  = malloc(RZ_BUFFER_SIZE);
+       rz->outbuf = malloc(RZ_BUFFER_SIZE);
+       rz->index = calloc(sizeof(ZBlockIndex), 1);
+       rz->header = calloc(sizeof(gz_header), 1);
+       if(rz->header) rz->header->extra = malloc(7);
+       if(!rz->stream || !rz->inbuf || !rz->outbuf || !rz->index || !rz->header || !rz->header->extra) goto fail;
+       if(deflateInit2(rz->stream, RZ_COMPRESS_LEVEL, Z_DEFLATED, WINDOW_BITS + 16, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK) goto fail;
+       rz->stream->avail_out = RZ_BUFFER_SIZE;
+       rz->stream->next_out  = rz->outbuf;
+       rz->header->os    = 0x03; //Unix
+       rz->header->text  = 0;
+       rz->header->time  = 0;
+       /* a 4-byte tag, deliberately without a terminating NUL */
+       memcpy(rz->header->extra, "RAZF", 4);
+       rz->header->extra[4] = 1; // obsolete field
+       // block size = RZ_BLOCK_SIZE, Big-Endian
+       rz->header->extra[5] = RZ_BLOCK_SIZE >> 8;
+       rz->header->extra[6] = RZ_BLOCK_SIZE & 0xFF;
+       rz->header->extra_len = 7;
+       rz->header->name = rz->header->comment  = 0;
+       rz->header->hcrc = 0;
+       if(deflateSetHeader(rz->stream, rz->header) != Z_OK){
+               deflateEnd(rz->stream);
+               goto fail;
+       }
+       rz->block_pos = rz->block_off = 0;
+       return rz;
+fail:
+       if(rz->header) free(rz->header->extra);
+       free(rz->header);
+       free(rz->index);
+       free(rz->outbuf);
+       free(rz->inbuf);
+       free(rz->stream);
+       free(rz);
+       return NULL;
+}
+
+/*
+ * Compress `size` bytes from `data` through rz->stream with Z_NO_FLUSH.
+ * Whenever the output buffer becomes full it is written to rz->filedes and
+ * reset. rz->out grows by the number of compressed bytes produced; rz->in
+ * and rz->block_off grow by the number of input bytes consumed.
+ */
+static void _razf_write(RAZF* rz, const void *data, int size){
+       int tout;
+       rz->stream->avail_in = size;
+       rz->stream->next_in  = (void*)data;
+       while(1){
+               /* remember the free space so the bytes produced can be counted */
+               tout = rz->stream->avail_out;
+               deflate(rz->stream, Z_NO_FLUSH);
+               rz->out += tout - rz->stream->avail_out;
+               /* room left in the output buffer: stop; the bytes stay buffered */
+               if(rz->stream->avail_out) break;
+               /* output buffer full (avail_out == 0): write all of it and start over */
+               write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+               rz->stream->avail_out = RZ_BUFFER_SIZE;
+               rz->stream->next_out  = rz->outbuf;
+               if(rz->stream->avail_in == 0) break;
+       };
+       rz->in += size - rz->stream->avail_in;
+       rz->block_off += size - rz->stream->avail_in;
+}
+
+/*
+ * End the current compressed block: push any buffered input through
+ * _razf_write(), write out what is pending in the output buffer, then call
+ * deflate() with Z_FULL_FLUSH until a call leaves room in the output
+ * buffer. Afterwards rz->block_pos is set to the compressed offset rz->out
+ * and rz->block_off is reset to 0.
+ */
+static void razf_flush(RAZF *rz){
+       uint32_t tout;
+       if(rz->buf_len){
+               _razf_write(rz, rz->inbuf, rz->buf_len);
+               rz->buf_off = rz->buf_len = 0;
+       }
+       /* NOTE(review): this is entered whenever avail_out is non-zero, i.e. also
+          when the buffer is empty, in which case write() is asked for 0 bytes */
+       if(rz->stream->avail_out){
+               write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+               rz->stream->avail_out = RZ_BUFFER_SIZE;
+               rz->stream->next_out  = rz->outbuf;
+       }
+       while(1){
+               tout = rz->stream->avail_out;
+               deflate(rz->stream, Z_FULL_FLUSH);
+               rz->out += tout - rz->stream->avail_out;
+               if(rz->stream->avail_out == 0){
+                       /* buffer filled completely: write it and call deflate again */
+                       write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+                       rz->stream->avail_out = RZ_BUFFER_SIZE;
+                       rz->stream->next_out  = rz->outbuf;
+               } else break; /* bytes produced by this last call remain in outbuf */
+       }
+       rz->block_pos = rz->out;
+       rz->block_off = 0;
+}
+
+/*
+ * Finish the compressed stream: push any buffered input through
+ * _razf_write(), then call deflate() with Z_FINISH repeatedly, writing the
+ * output buffer to rz->filedes after each call, until the buffer is still
+ * empty after a call.
+ */
+static void razf_end_flush(RAZF *rz){
+       uint32_t tout;
+       if(rz->buf_len){
+               _razf_write(rz, rz->inbuf, rz->buf_len);
+               rz->buf_off = rz->buf_len = 0;
+       }
+       while(1){
+               tout = rz->stream->avail_out;
+               deflate(rz->stream, Z_FINISH);
+               rz->out += tout - rz->stream->avail_out;
+               if(rz->stream->avail_out < RZ_BUFFER_SIZE){
+                       /* something is in the buffer: write it and try once more */
+                       write(rz->filedes, rz->outbuf, RZ_BUFFER_SIZE - rz->stream->avail_out);
+                       rz->stream->avail_out = RZ_BUFFER_SIZE;
+                       rz->stream->next_out  = rz->outbuf;
+               } else break; /* output buffer is empty: stop */
+       }
+}
+
+/*
+ * Append `size` bytes from `data` to the input buffer of rz. Each time the
+ * buffer holds RZ_BUFFER_SIZE bytes it is handed to _razf_write() and
+ * emptied; the final partial chunk stays buffered.
+ */
+static void _razf_buffered_write(RAZF *rz, const void *data, int size){
+       /* byte cursor: pointer arithmetic on void* is not standard C */
+       const char *src = data;
+       int n;
+       while(1){
+               if(rz->buf_len == RZ_BUFFER_SIZE){
+                       _razf_write(rz, rz->inbuf, rz->buf_len);
+                       rz->buf_len = 0;
+               }
+               if(size + rz->buf_len < RZ_BUFFER_SIZE){
+                       /* the rest fits without filling the buffer */
+                       if(size > 0) memcpy((char*)rz->inbuf + rz->buf_len, src, size);
+                       rz->buf_len += size;
+                       return;
+               }
+               /* fill the buffer to the brim and go round again */
+               n = RZ_BUFFER_SIZE - rz->buf_len;
+               if(n > 0) memcpy((char*)rz->inbuf + rz->buf_len, src, n);
+               size -= n;
+               src += n;
+               rz->buf_len += n;
+       }
+}
+
+/*
+ * Write size bytes of uncompressed data to a writer.
+ *
+ * The input is cut at every multiple of RZ_BLOCK_SIZE of the uncompressed
+ * position: the bytes up to the boundary are buffered, razf_flush() is
+ * called, and an index entry is recorded with add_zindex() before carrying on.
+ *
+ * Returns size (no error is reported by this function).
+ *
+ * The source is advanced through a char pointer because arithmetic on
+ * void pointers is not ISO C.
+ */
+int razf_write(RAZF* rz, const void *data, int size){
+       const char *src = data;
+       int ori_size, n;
+       int64_t next_block;
+       ori_size = size;
+       next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+       while(rz->in + rz->buf_len + size >= next_block){
+               /* n = bytes still missing to reach the block boundary */
+               n = next_block - rz->in - rz->buf_len;
+               _razf_buffered_write(rz, src, n);
+               src += n;
+               size -= n;
+               razf_flush(rz);
+               add_zindex(rz, rz->in, rz->out);
+               next_block = ((rz->in / RZ_BLOCK_SIZE) + 1) * RZ_BLOCK_SIZE;
+       }
+       _razf_buffered_write(rz, src, size);
+       return ori_size;
+}
+
+/* gzip flag byte */
+#define ASCII_FLAG   0x01 /* bit 0 set: file probably ascii text */
+#define HEAD_CRC     0x02 /* bit 1 set: header CRC present */
+#define EXTRA_FIELD  0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME    0x08 /* bit 3 set: original file name present */
+#define COMMENT      0x10 /* bit 4 set: file comment present */
+#define RESERVED     0xE0 /* bits 5..7: reserved */
+
+/*
+ * Parse the gzip header found at the start of data[0..size).
+ *
+ * Returns the header length in bytes (the offset of the first byte after
+ * the header), or 0 when the buffer does not hold a usable gzip header:
+ * wrong magic, method other than deflate, reserved flag bits set, or the
+ * buffer is too short for the fixed 10-byte part, the extra field or the
+ * header CRC.
+ *
+ * On success *extra_off is the offset of the FEXTRA payload and *extra_len
+ * its length (0 when the header has no extra field).
+ */
+static int _read_gz_header(unsigned char *data, int size, int *extra_off, int *extra_len){
+       int method, flags, n, len;
+       if(size < 2) return 0;
+       if(data[0] != 0x1f || data[1] != 0x8b) return 0;
+       if(size < 4) return 0;
+       method = data[2];
+       flags  = data[3];
+       if(method != Z_DEFLATED || (flags & RESERVED)) return 0;
+       n = 4 + 6; // Skip 6 bytes
+       /* the fixed part must be complete, otherwise the returned length would exceed size */
+       if(size < n) return 0;
+       *extra_off = n + 2;
+       *extra_len = 0;
+       if(flags & EXTRA_FIELD){
+               if(size < n + 2) return 0;
+               len = ((int)data[n + 1] << 8) | data[n]; /* little-endian 16-bit length */
+               n += 2;
+               *extra_off = n;
+               if(len > size - n) return 0; /* extra field runs past the buffer */
+               n += len;
+               *extra_len = len;
+       }
+       /* file name and comment are NUL-terminated strings */
+       if(flags & ORIG_NAME) while(n < size && data[n++]);
+       if(flags & COMMENT) while(n < size && data[n++]);
+       if(flags & HEAD_CRC){
+               if(n + 2 > size) return 0;
+               n += 2;
+       }
+       return n;
+}
+
+/*
+ * Create a reader on file descriptor fd.
+ *
+ * The first RZ_BUFFER_SIZE bytes are read and classified:
+ *   - no gzip header (or inflateInit2 fails): FILE_TYPE_PLAIN, the bytes
+ *     already read are served from outbuf;
+ *   - gzip header without a matching "RAZF" extra field: FILE_TYPE_GZ;
+ *   - gzip header with the "RAZF" extra field and the expected block size:
+ *     FILE_TYPE_RZ. The 16-byte trailer (src_end, end; stored big-endian)
+ *     is then read and the index loaded with load_zindex(). If the file
+ *     cannot be seeked or the trailer is implausible, the handle is marked
+ *     unseekable and has no index.
+ *
+ * _load_index is stored in rz->load_index.
+ *
+ * Returns the new handle, or NULL if memory cannot be allocated or the
+ * initial read fails (for instance because fd is invalid). fd is not closed
+ * on failure.
+ */
+static RAZF* razf_open_r(int fd, int _load_index){
+       RAZF *rz;
+       int ext_off, ext_len;
+       int n, is_be, ret;
+       int64_t end;
+       unsigned char c[] = "RAZF";
+       unsigned char *in;
+       rz = calloc(1, sizeof(RAZF));
+       if(rz == NULL) return NULL;
+       rz->mode = 'r';
+       rz->filedes = fd;
+       rz->stream = calloc(sizeof(z_stream), 1);
+       rz->inbuf  = malloc(RZ_BUFFER_SIZE);
+       rz->outbuf = malloc(RZ_BUFFER_SIZE);
+       rz->end = rz->src_end = 0x7FFFFFFFFFFFFFFFLL;
+       n = -1;
+       if(rz->stream && rz->inbuf && rz->outbuf) n = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+       if(n < 0){
+               /* allocation or read failure: a negative n must never be used as a length */
+               free(rz->outbuf);
+               free(rz->inbuf);
+               free(rz->stream);
+               free(rz);
+               return NULL;
+       }
+       in = rz->inbuf;
+       ret = _read_gz_header(in, n, &ext_off, &ext_len);
+       if(ret <= 0 || ret > n){
+               /* not gzip, or the header does not fit in what was read */
+               PLAIN_FILE:
+               rz->in = n;
+               rz->file_type = FILE_TYPE_PLAIN;
+               memcpy(rz->outbuf, rz->inbuf, n);
+               rz->buf_len = n;
+               free(rz->stream);
+               rz->stream = NULL;
+               return rz;
+       }
+       rz->header_size = ret;
+       ret = inflateInit2(rz->stream, -WINDOW_BITS);
+       if(ret != Z_OK){ inflateEnd(rz->stream); goto PLAIN_FILE;}
+       rz->stream->avail_in = n - rz->header_size;
+       rz->stream->next_in  = in + rz->header_size;
+       rz->stream->avail_out = RZ_BUFFER_SIZE;
+       rz->stream->next_out  = rz->outbuf;
+       rz->file_type = FILE_TYPE_GZ;
+       rz->in = rz->header_size;
+       rz->block_pos = rz->header_size;
+       rz->next_block_pos = rz->header_size;
+       rz->block_off = 0;
+       if(ext_len < 7 || memcmp(in + ext_off, c, 4) != 0) return rz;
+       if(((in[ext_off + 5] << 8) | in[ext_off + 6]) != RZ_BLOCK_SIZE){
+               fprintf(stderr, " -- WARNING: RZ_BLOCK_SIZE is not %d, treat source as gz file.  in %s -- %s:%d --\n", RZ_BLOCK_SIZE, __FUNCTION__, __FILE__, __LINE__);
+               return rz;
+       }
+       rz->load_index = _load_index;
+       rz->file_type = FILE_TYPE_RZ;
+       if(lseek(fd, -16, SEEK_END) == -1){
+               UNSEEKABLE:
+               rz->seekable = 0;
+               rz->index = NULL;
+               rz->src_end = rz->end = 0x7FFFFFFFFFFFFFFFLL;
+       } else {
+               is_be = is_big_endian();
+               rz->seekable = 1;
+               /*
+                * Every fallback below returns the descriptor to offset n, i.e.
+                * just past the bytes already sitting in inbuf, so the next
+                * refill continues where the initial read stopped.
+                */
+               if(read(fd, &end, sizeof(int64_t)) != (ssize_t)sizeof(int64_t)){
+                       lseek(fd, n, SEEK_SET);
+                       goto UNSEEKABLE;
+               }
+               if(!is_be) rz->src_end = (int64_t)byte_swap_8((uint64_t)end);
+               else rz->src_end = end;
+               if(read(fd, &end, sizeof(int64_t)) != (ssize_t)sizeof(int64_t)){
+                       lseek(fd, n, SEEK_SET);
+                       goto UNSEEKABLE;
+               }
+               if(!is_be) rz->end = (int64_t)byte_swap_8((uint64_t)end);
+               else rz->end = end;
+               if(rz->end > rz->src_end || rz->end < rz->header_size){
+                       lseek(fd, n, SEEK_SET);
+                       goto UNSEEKABLE;
+               }
+               if(lseek(fd, rz->end, SEEK_SET) != rz->end){
+                       lseek(fd, n, SEEK_SET);
+                       goto UNSEEKABLE;
+               }
+               if(n > rz->end){
+                       /* the first read ran into the index: keep only the data part */
+                       rz->stream->avail_in -= n - rz->end;
+                       n = rz->end;
+               }
+               load_zindex(rz, fd);
+               lseek(fd, n, SEEK_SET);
+       }
+       return rz;
+}
+
+/*
+ * Wrap an already open descriptor. mode is compared case-insensitively:
+ * "r" gives a reader (index loading requested), "w" a writer; any other
+ * mode yields NULL.
+ */
+RAZF* razf_dopen(int fd, const char *mode){
+       if(!strcasecmp(mode, "r")) return razf_open_r(fd, 1);
+       if(!strcasecmp(mode, "w")) return razf_open_w(fd);
+       return NULL;
+}
+
+/*
+ * Same as razf_dopen(), except that a reader is created with its
+ * load_index flag set to 0.
+ */
+RAZF* razf_dopen2(int fd, const char *mode)
+{
+       if(!strcasecmp(mode, "r")) return razf_open_r(fd, 0);
+       if(!strcasecmp(mode, "w")) return razf_open_w(fd);
+       return NULL;
+}
+
+/*
+ * Open filename for reading ("r") or writing ("w", created/truncated with
+ * permissions 0644); mode is compared case-insensitively.
+ *
+ * Returns the new handle, or NULL when the mode is not recognised or the
+ * file cannot be opened.
+ */
+static inline RAZF* _razf_open(const char *filename, const char *mode, int _load_index){
+       int fd;
+       RAZF *rz;
+       if(strcasecmp(mode, "r") == 0){
+               fd = open(filename, O_RDONLY);
+               if(fd < 0) return NULL; /* do not build a handle around an invalid descriptor */
+               rz = razf_open_r(fd, _load_index);
+               if(rz == NULL) close(fd); /* no handle owns fd, so release it here */
+       } else if(strcasecmp(mode, "w") == 0){
+               fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+               if(fd < 0) return NULL;
+               rz = razf_open_w(fd);
+       } else return NULL;
+       return rz;
+}
+
+/* Open a file by name; a reader is created with load_index set to 1. */
+RAZF* razf_open(const char *filename, const char *mode){
+       RAZF *handle = _razf_open(filename, mode, 1);
+       return handle;
+}
+
+/* Open a file by name; a reader is created with load_index set to 0. */
+RAZF* razf_open2(const char *filename, const char *mode){
+       RAZF *handle = _razf_open(filename, mode, 0);
+       return handle;
+}
+
+/*
+ * Report the uncompressed (*u_size) and compressed (*c_size) sizes of a
+ * reader's source.
+ *
+ * Returns 1 when the sizes were stored, 0 otherwise (handle not in read
+ * mode, plain gzip input, RZ input whose src_end equals end, or the current
+ * offset of a plain file cannot be queried).
+ * For a plain file both sizes are the file length, which is measured with
+ * lseek() the first time and cached in rz->end.
+ */
+int razf_get_data_size(RAZF *rz, int64_t *u_size, int64_t *c_size){
+       int64_t cur;
+       if(!(rz->mode == 'r' || rz->mode == 'R')) return 0;
+       if(rz->file_type == FILE_TYPE_PLAIN){
+               if(rz->end == 0x7fffffffffffffffLL){
+                       /* length not known yet: measure it and restore the offset */
+                       cur = lseek(rz->filedes, 0, SEEK_CUR);
+                       if(cur == -1) return 0;
+                       rz->end = lseek(rz->filedes, 0, SEEK_END);
+                       lseek(rz->filedes, cur, SEEK_SET);
+               }
+               *c_size = rz->end;
+               *u_size = *c_size;
+               return 1;
+       }
+       if(rz->file_type == FILE_TYPE_RZ){
+               if(rz->src_end == rz->end) return 0;
+               *u_size = rz->src_end;
+               *c_size = rz->end;
+               return 1;
+       }
+       /* FILE_TYPE_GZ and unknown types: sizes are not available */
+       return 0;
+}
+
+/*
+ * Produce up to size bytes of uncompressed data into data.
+ *
+ * Plain files are read directly. Otherwise compressed input is pulled from
+ * the file into rz->inbuf as needed (never past rz->end) and inflated with
+ * Z_BLOCK. The function stops early when the input ends, when the deflate
+ * stream ends, on an error, or when inflate() has just finished a deflate
+ * block that is not the last one; in that last case rz->buf_flush is set
+ * and rz->next_block_pos records the compressed offset of the boundary.
+ *
+ * Returns the number of bytes stored in data (0 once z_eof or z_err is
+ * set). A failing read() sets rz->z_err instead of letting its -1 result
+ * be used as a byte count.
+ */
+static int _razf_read(RAZF* rz, void *data, int size){
+       int ret, tin, got;
+       if(rz->z_eof || rz->z_err) return 0;
+       if (rz->file_type == FILE_TYPE_PLAIN) {
+               ret = read(rz->filedes, data, size);
+               if (ret == 0) rz->z_eof = 1;
+               else if (ret < 0) { rz->z_err = 1; ret = 0; }
+               return ret;
+       }
+       rz->stream->avail_out = size;
+       rz->stream->next_out  = data;
+       while(rz->stream->avail_out){
+               if(rz->stream->avail_in == 0){
+                       if(rz->in >= rz->end){ rz->z_eof = 1; break; }
+                       if(rz->end - rz->in < RZ_BUFFER_SIZE){
+                               got = read(rz->filedes, rz->inbuf, rz->end -rz->in);
+                       } else {
+                               got = read(rz->filedes, rz->inbuf, RZ_BUFFER_SIZE);
+                       }
+                       /* avail_in is unsigned: a -1 from read() must not be stored in it */
+                       if(got < 0){
+                               rz->z_err = 1;
+                               break;
+                       }
+                       if(got == 0){
+                               rz->z_eof = 1;
+                               break;
+                       }
+                       rz->stream->avail_in = got;
+                       rz->stream->next_in = rz->inbuf;
+               }
+               tin = rz->stream->avail_in;
+               ret = inflate(rz->stream, Z_BLOCK);
+               rz->in += tin - rz->stream->avail_in; /* compressed bytes consumed */
+               if(ret == Z_NEED_DICT || ret == Z_MEM_ERROR || ret == Z_DATA_ERROR){
+                       fprintf(stderr, "[_razf_read] inflate error: %d (at %s:%d)\n", ret, __FILE__, __LINE__);
+                       rz->z_err = 1;
+                       break;
+               }
+               if(ret == Z_STREAM_END){
+                       rz->z_eof = 1;
+                       break;
+               }
+               /* zlib data_type: bit 7 = stopped right after an end-of-block code, bit 6 = in the last block */
+               if ((rz->stream->data_type&128) && !(rz->stream->data_type&64)){
+                       rz->buf_flush = 1;
+                       rz->next_block_pos = rz->in;
+                       break;
+               }
+       }
+       return size - rz->stream->avail_out;
+}
+
+/*
+ * Read up to size uncompressed bytes into data.
+ *
+ * Data is served from rz->outbuf, which is refilled with _razf_read()
+ * whenever it has been emptied. rz->block_off counts the bytes delivered
+ * from the current block; when the emptied buffer ended on a block boundary
+ * (rz->buf_flush) the position moves to the next block: block_pos becomes
+ * next_block_pos and block_off restarts at 0.
+ *
+ * Returns the number of bytes copied, which is smaller than size when the
+ * end of the data or an error (rz->z_err) is reached. rz->out is advanced
+ * by the same amount.
+ */
+int razf_read(RAZF *rz, void *data, int size){
+       char *dst = data; /* char pointer: void pointer arithmetic is not ISO C */
+       int ori_size;
+       ori_size = size;
+       while(size > 0){
+               if(rz->buf_len){
+                       if(size < rz->buf_len){
+                               /* the request is satisfied from the buffer alone */
+                               memcpy(dst, (char*)rz->outbuf + rz->buf_off, size);
+                               rz->buf_off += size;
+                               rz->buf_len -= size;
+                               dst += size;
+                               rz->block_off += size;
+                               size = 0;
+                               break;
+                       }
+                       /* drain the whole buffer */
+                       memcpy(dst, (char*)rz->outbuf + rz->buf_off, rz->buf_len);
+                       dst += rz->buf_len;
+                       size -= rz->buf_len;
+                       rz->block_off += rz->buf_len;
+                       rz->buf_off = 0;
+                       rz->buf_len = 0;
+               }
+               if(rz->buf_flush){
+                       /* the buffer is empty and ended on a block boundary */
+                       rz->block_pos = rz->next_block_pos;
+                       rz->block_off = 0;
+                       rz->buf_flush = 0;
+               }
+               rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+               if(rz->buf_len < 0){
+                       /* never keep a negative length around */
+                       rz->buf_len = 0;
+                       rz->z_err = 1;
+               }
+               /* stop on errors too: _razf_read() returns 0 forever once z_err is set */
+               if(rz->buf_len == 0 && (rz->z_eof || rz->z_err)) break;
+       }
+       rz->out += ori_size - size;
+       return ori_size - size;
+}
+
+/*
+ * Discard up to size uncompressed bytes, with the same bookkeeping as
+ * razf_read() (buffer offsets, block_off, block_pos, out) but without
+ * copying.
+ *
+ * Returns the number of bytes skipped, which is smaller than size when the
+ * end of the data or an error (rz->z_err) is reached.
+ */
+int razf_skip(RAZF* rz, int size){
+       int ori_size;
+       ori_size = size;
+       while(size > 0){
+               if(rz->buf_len){
+                       if(size < rz->buf_len){
+                               rz->buf_off += size;
+                               rz->buf_len -= size;
+                               rz->block_off += size;
+                               size = 0;
+                               break;
+                       }
+                       size -= rz->buf_len;
+                       /* account for the drained bytes before buf_len is cleared */
+                       rz->block_off += rz->buf_len;
+                       rz->buf_off = 0;
+                       rz->buf_len = 0;
+               }
+               if(rz->buf_flush){
+                       /* the buffer is empty and ended on a block boundary */
+                       rz->block_pos = rz->next_block_pos;
+                       rz->block_off = 0;
+                       rz->buf_flush = 0;
+               }
+               rz->buf_len = _razf_read(rz, rz->outbuf, RZ_BUFFER_SIZE);
+               if(rz->buf_len < 0){
+                       rz->buf_len = 0;
+                       rz->z_err = 1;
+               }
+               /*
+                * Only stop once nothing is buffered: the call that hits the end of
+                * the stream may still have delivered data that has to be skipped.
+                * Errors end the loop as well.
+                */
+               if(rz->buf_len == 0 && (rz->z_eof || rz->z_err)) break;
+       }
+       rz->out += ori_size - size;
+       return ori_size - size;
+}
+
+/*
+ * Restart decompression at compressed offset in, declaring that the
+ * uncompressed position there is out: seek the file, reset the inflate
+ * stream, drop all buffered input and output, and clear the EOF/error flags.
+ */
+static void _razf_reset_read(RAZF *rz, int64_t in, int64_t out){
+       lseek(rz->filedes, in, SEEK_SET);
+       /* positions */
+       rz->in  = in;
+       rz->out = out;
+       rz->block_pos = in;
+       rz->next_block_pos = in;
+       rz->block_off = 0;
+       /* state flags */
+       rz->buf_flush = 0;
+       rz->z_err = 0;
+       rz->z_eof = 0;
+       /* inflate state and buffers */
+       inflateReset(rz->stream);
+       rz->stream->avail_in = 0;
+       rz->buf_len = 0;
+       rz->buf_off = 0;
+}
+
+/*
+ * Move a reader to byte block_offset of the block that starts at compressed
+ * offset block_start.
+ *
+ * Plain files are simply seeked to block_start + block_offset and the
+ * lseek() result is returned. Otherwise, if the reader is already inside
+ * the requested block at or before the target, only the remaining distance
+ * is skipped; else decompression restarts at block_start (a block_start of
+ * 0 is replaced by the header size) and block_offset bytes are skipped.
+ * Returns rz->block_off in that case.
+ */
+int64_t razf_jump(RAZF *rz, int64_t block_start, int block_offset){
+       rz->z_eof = 0;
+       if(rz->file_type == FILE_TYPE_PLAIN){
+               int64_t target;
+               rz->buf_len = 0;
+               rz->buf_off = 0;
+               target = lseek(rz->filedes, block_start + block_offset, SEEK_SET);
+               rz->in = target;
+               rz->out = target;
+               return target;
+       }
+       if(block_start == rz->block_pos && block_offset >= rz->block_off){
+               /* same block, target not behind us: no need to reset inflate */
+               block_offset -= rz->block_off;
+       } else {
+               /* a block cannot start at 0: data begins after the gzip header */
+               if(block_start == 0) block_start = rz->header_size;
+               _razf_reset_read(rz, block_start, 0);
+       }
+       if(block_offset != 0) razf_skip(rz, block_offset);
+       return rz->block_off;
+}
+
+/*
+ * Seek to uncompressed position pos. where is SEEK_SET, SEEK_CUR (relative
+ * to rz->out) or SEEK_END (relative to rz->src_end).
+ *
+ * Plain files are seeked directly. Gzip input, and RZ input without a
+ * usable index, can only move forward, by decompressing and discarding;
+ * a backward request leaves the position unchanged. With an index, reading
+ * restarts at the indexed block boundary preceding pos unless skipping
+ * forward from the current position is possible without a restart.
+ *
+ * Returns the resulting uncompressed position (the lseek() result for plain
+ * files); callers should compare it with the requested position.
+ */
+int64_t razf_seek(RAZF* rz, int64_t pos, int where){
+       int64_t idx;
+       int64_t seek_pos, new_out;
+       rz->z_eof = 0;
+       if (where == SEEK_CUR) pos += rz->out;
+       else if (where == SEEK_END) pos += rz->src_end;
+       if(rz->file_type == FILE_TYPE_PLAIN){
+               seek_pos = lseek(rz->filedes, pos, SEEK_SET);
+               rz->buf_off = rz->buf_len = 0;
+               rz->out = rz->in = seek_pos;
+               return seek_pos;
+       } else if(rz->file_type == FILE_TYPE_GZ){
+               if(pos >= rz->out) goto SKIP;
+               return rz->out;
+       }
+       if(pos == rz->out) return pos;
+       if(pos > rz->src_end) return rz->out;
+       if(!rz->seekable || !rz->load_index){
+               if(pos >= rz->out) goto SKIP;
+       }
+       if(rz->index == NULL){
+               /* no index (e.g. unseekable source): forward by skipping, never backward */
+               if(pos > rz->out) goto SKIP;
+               return rz->out;
+       }
+       idx = pos / RZ_BLOCK_SIZE - 1;
+       seek_pos = (idx < 0)? rz->header_size:(rz->index->cell_offsets[idx] + rz->index->bin_offsets[idx / RZ_BIN_SIZE]);
+       new_out  = (idx + 1) * RZ_BLOCK_SIZE;
+       if(pos > rz->out && new_out <= rz->out) goto SKIP;
+       _razf_reset_read(rz, seek_pos, new_out);
+       SKIP:
+       /* razf_skip() takes an int, so cover long distances in bounded steps */
+       while(pos > rz->out){
+               int64_t left = pos - rz->out;
+               int step = (left > (1 << 30))? (1 << 30) : (int)left;
+               if(razf_skip(rz, step) != step) break; /* end of data or error */
+       }
+       return rz->out;
+}
+
+/*
+ * Return the current position as a packed "virtual offset": the compressed
+ * start offset of the current block (rz->block_pos) in the bits above bit
+ * 15 and the low 16 bits of the offset inside that block (rz->block_off)
+ * below. razf_seek2() takes a value of this form.
+ */
+uint64_t razf_tell2(RAZF *rz)
+{
+       uint64_t block_part  = (uint64_t)rz->block_pos << 16;
+       uint64_t offset_part = (uint64_t)(rz->block_off & 0xffff);
+       return block_part | offset_part;
+}
+
+/*
+ * Seek to a packed virtual offset (block start in the bits above bit 15,
+ * in-block offset in the low 16 bits). Only SEEK_SET is supported; any
+ * other value of where returns -1. Otherwise returns what razf_jump() returns.
+ */
+int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where)
+{
+       if (where == SEEK_SET) {
+               uint64_t block_start = voffset >> 16;
+               uint64_t block_offset = voffset & 0xffff;
+               return razf_jump(rz, block_start, block_offset);
+       }
+       return -1;
+}
+
+/*
+ * Close a handle and release everything it owns, including the file
+ * descriptor.
+ *
+ * In write mode the deflate stream is finished first, the index is saved
+ * with save_zindex(), and a 16-byte trailer is appended: rz->in followed by
+ * rz->out, each as a big-endian 64-bit integer.
+ */
+void razf_close(RAZF *rz){
+       if(rz->mode == 'w'){
+               uint64_t tail_in, tail_out;
+               razf_end_flush(rz);
+               deflateEnd(rz->stream);
+               save_zindex(rz, rz->filedes);
+               tail_in  = (uint64_t)rz->in;
+               tail_out = (uint64_t)rz->out;
+               if(!is_big_endian()){
+                       /* the trailer is stored big-endian */
+                       tail_in  = byte_swap_8(tail_in);
+                       tail_out = byte_swap_8(tail_out);
+               }
+               write(rz->filedes, &tail_in, sizeof(int64_t));
+               write(rz->filedes, &tail_out, sizeof(int64_t));
+       } else if(rz->mode == 'r'){
+               /* plain-file readers have no stream */
+               if(rz->stream != NULL) inflateEnd(rz->stream);
+       }
+       free(rz->inbuf);
+       free(rz->outbuf);
+       if(rz->header != NULL){
+               free(rz->header->extra);
+               free(rz->header->name);
+               free(rz->header->comment);
+               free(rz->header);
+       }
+       if(rz->index != NULL){
+               free(rz->index->bin_offsets);
+               free(rz->index->cell_offsets);
+               free(rz->index);
+       }
+       free(rz->stream);
+       close(rz->filedes);
+       free(rz);
+}
diff --git a/razf.h b/razf.h
new file mode 100644 (file)
index 0000000..d391776
--- /dev/null
+++ b/razf.h
@@ -0,0 +1,117 @@
+ /*-
+ * RAZF : Random Access compressed(Z) File
+ * Version: 1.0
+ * Release Date: 2008-10-27
+ *
+ * Copyright 2008, Jue Ruan <ruanjue@gmail.com>, Heng Li <lh3@sanger.ac.uk>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#ifndef __RAZF_RJ_H
+#define __RAZF_RJ_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include "zlib.h"
+#include "zutil.h"
+
+#define WINDOW_BITS   15
+
+#ifndef RZ_BLOCK_SIZE
+#define RZ_BLOCK_SIZE (1<<WINDOW_BITS)
+#endif
+
+#ifndef RZ_BUFFER_SIZE
+#define RZ_BUFFER_SIZE 4096
+#endif
+
+#ifndef RZ_COMPRESS_LEVEL
+#define RZ_COMPRESS_LEVEL 6
+#endif
+
+#define RZ_BIN_SIZE ((1LLU << 32) / RZ_BLOCK_SIZE)
+
+/*
+ * Block index of an RAZF file. razf_seek() computes the compressed-file
+ * position of an indexed block as
+ *     cell_offsets[i] + bin_offsets[i / RZ_BIN_SIZE]
+ */
+typedef struct {
+       uint32_t *cell_offsets; // indexed by i: 32-bit part of the offset, added to the bin's base
+       int64_t  *bin_offsets; // indexed by i / RZ_BIN_SIZE: 64-bit base offset of the bin
+       int size; // presumably the number of entries in use -- TODO confirm
+       int cap; // presumably the allocated capacity -- TODO confirm
+} ZBlockIndex;
+/* When storing index, output bytes in Big-Endian everywhere */
+
+#define FILE_TYPE_RZ   1
+#define FILE_TYPE_PLAIN        2
+#define FILE_TYPE_GZ   3
+
+/* State of one open RAZF reader or writer. */
+typedef struct RandomAccessZFile  {
+       char mode; /* 'w' : write mode; 'r' : read mode */
+       int file_type;
+       /* one of FILE_TYPE_RZ, FILE_TYPE_PLAIN or FILE_TYPE_GZ; razf_read() also accepts plain files as input, in which case it works as a buffered read */
+       int filedes; /* the file descriptor */
+       z_stream *stream;
+       ZBlockIndex *index;
+       int64_t in, out, end, src_end;
+       /* in: total number of bytes in; out: total number of bytes out */
+       /* end: the end of all data blocks, which is also the start of the index; src_end: the true end position in the uncompressed file */
+       int buf_flush; // the buffer should be flushed: suspend inflate until the buffer is empty
+       int64_t block_pos, block_off, next_block_pos;
+       /* block_pos: the start position of the current block in the compressed file */
+       /* block_off: how many bytes have been read from the current block */
+       void *inbuf, *outbuf;
+       int header_size;
+       gz_header *header;
+       /* header is used to transfer inflate_state->mode from HEAD to TYPE after calling inflateReset */
+       int buf_off, buf_len;
+       int z_err, z_eof;
+       int seekable;
+       /* indicates whether the source is seekable */
+       int load_index;
+       /* NOTE(review): the original comment here read "set has_index to 0 in mode 'w', then index will be discarded"; no field named has_index exists, presumably load_index was meant -- confirm */
+} RAZF;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       /* Open from a descriptor or a file name. mode is "r" or "w" (case-insensitive); NULL is returned for any other mode. */
+       RAZF* razf_dopen(int data_fd, const char *mode);
+       RAZF *razf_open(const char *fn, const char *mode);
+       /* Write size bytes of uncompressed data; returns size. */
+       int razf_write(RAZF* rz, const void *data, int size);
+       /* Read up to size uncompressed bytes; returns the number of bytes copied. */
+       int razf_read(RAZF* rz, void *data, int size);
+       /* Seek to an uncompressed position (where: SEEK_SET, SEEK_CUR or SEEK_END); returns the resulting position. */
+       int64_t razf_seek(RAZF* rz, int64_t pos, int where);
+       /* Finish the stream (write mode), free the handle and close its file descriptor. */
+       void razf_close(RAZF* rz);
+
+/* Value of rz->out, the total number of bytes out so far. */
+#define razf_tell(rz) ((rz)->out)
+
+       /* Like razf_open()/razf_dopen(), but a reader is created with load_index set to 0. */
+       RAZF* razf_open2(const char *filename, const char *mode);
+       RAZF* razf_dopen2(int fd, const char *mode);
+       /* Packed position: (block_pos << 16) | (block_off & 0xffff). */
+       uint64_t razf_tell2(RAZF *rz);
+       /* Seek to a packed position; only SEEK_SET is supported, any other where returns -1. */
+       int64_t razf_seek2(RAZF *rz, uint64_t voffset, int where);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/razip.c b/razip.c
new file mode 100644 (file)
index 0000000..0b67c6c
--- /dev/null
+++ b/razip.c
@@ -0,0 +1,139 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include "razf.h"
+
+#define WINDOW_SIZE 4096
+
+/*
+ * Print the command-line help on standard output. Always returns 0, so
+ * that main() can "return razf_main_usage();".
+ * The -f option accepted by main() is listed as well.
+ */
+static int razf_main_usage(void)
+{
+       printf("\n");
+       printf("Usage:   razip [options] [file] ...\n\n");
+       printf("Options: -c      write on standard output, keep original files unchanged\n");
+       printf("         -d      decompress\n");
+       printf("         -l      list compressed file contents\n");
+       printf("         -b INT  decompress at INT position in the uncompressed file\n");
+       printf("         -s INT  decompress INT bytes in the uncompressed file\n");
+       printf("         -f      overwrite files without asking\n");
+       printf("         -h      give this help\n");
+       printf("\n");
+       return 0;
+}
+
+/*
+ * Open fn for writing and return its descriptor.
+ *
+ * Unless is_forced is set, the file is first created exclusively; if it
+ * already exists the user is asked for confirmation and the program exits
+ * with status 1 unless the answer starts with 'y' or 'Y' (no answer at all,
+ * e.g. end of input, counts as a refusal). The program also exits with
+ * status 1 when the file cannot be opened.
+ */
+static int write_open(const char *fn, int is_forced)
+{
+       int fd = -1;
+       char c;
+       if (!is_forced) {
+               if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0644)) < 0 && errno == EEXIST) {
+                       printf("razip: %s already exists; do you wish to overwrite (y or n)? ", fn);
+                       fflush(stdout); /* make sure the prompt is visible before waiting */
+                       /* c is only meaningful when scanf actually converted a character */
+                       if (scanf("%c", &c) != 1 || (c != 'Y' && c != 'y')) {
+                               printf("razip: not overwritten\n");
+                               exit(1);
+                       }
+               }
+       }
+       if (fd < 0) {
+               /* forced, confirmed, or the exclusive create failed for another reason */
+               if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0644)) < 0) {
+                       fprintf(stderr, "razip: %s: Fail to write\n", fn);
+                       exit(1);
+               }
+       }
+       return fd;
+}
+
+/*
+ * razip: compress a file to <file>.rz (default), decompress it (-d) or list
+ * its sizes (-l).
+ *
+ * Options: -c write to standard output and keep the input file, -f overwrite
+ * without asking, -b/-s restrict decompression to a region of the
+ * uncompressed data, -h print the help.
+ *
+ * The input file is removed only when output went to a file (no -c) and
+ * every read and write succeeded. Returns 0 on success, 1 on error.
+ */
+int main(int argc, char **argv)
+{
+       int c, compress, pstdout, is_forced;
+       RAZF *rz;
+       void *buffer;
+       long start, end, size;
+
+       compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
+       while((c  = getopt(argc, argv, "cdlhfb:s:")) >= 0){
+               switch(c){
+               case 'h': return razf_main_usage();
+               case 'd': compress = 0; break;
+               case 'c': pstdout = 1; break;
+               case 'l': compress = 2; break;
+               case 'b': start = atol(optarg); break;
+               case 's': size = atol(optarg); break;
+               case 'f': is_forced = 1; break;
+               }
+       }
+       if (size >= 0) end = start + size;
+       if(end >= 0 && end < start){
+               fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
+               return 1;
+       }
+       if(compress == 1){
+               int f_src, f_dst = -1;
+               if(argc > optind){
+                       if((f_src = open(argv[optind], O_RDONLY)) < 0){
+                               fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                               return 1;
+                       }
+                       if(pstdout){
+                               f_dst = fileno(stdout);
+                       } else {
+                               /* room for the name, the ".rz" suffix and the terminating NUL */
+                               char *name = malloc(strlen(argv[optind]) + 4);
+                               if (name == NULL) {
+                                       fprintf(stderr, "razip: out of memory\n");
+                                       return 1;
+                               }
+                               strcpy(name, argv[optind]);
+                               strcat(name, ".rz");
+                               f_dst = write_open(name, is_forced);
+                               free(name);
+                               if (f_dst < 0) return 1;
+                       }
+               } else if(pstdout){ 
+                       f_src = fileno(stdin);
+                       f_dst = fileno(stdout);
+               } else return razf_main_usage();
+               rz = razf_dopen(f_dst, "w");
+               buffer = malloc(WINDOW_SIZE);
+               if (rz == NULL || buffer == NULL) {
+                       fprintf(stderr, "razip: fail to initialize the compressor\n");
+                       return 1;
+               }
+               while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c);
+               razf_close(rz); // f_dst will be closed here
+               free(buffer);
+               close(f_src);
+               if (c < 0) {
+                       /* incomplete output: keep the source file */
+                       fprintf(stderr, "razip: read error\n");
+                       return 1;
+               }
+               /* -c promises to keep the original file unchanged */
+               if (argc > optind && !pstdout) unlink(argv[optind]);
+               return 0;
+       } else {
+               if(argc <= optind) return razf_main_usage();
+               if(compress == 2){
+                       rz = razf_open(argv[optind], "r");
+                       if (rz == NULL) {
+                               fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                               return 1;
+                       }
+                       if(rz->file_type == FILE_TYPE_RZ) {
+                               printf("%20s%20s%7s %s\n", "compressed", "uncompressed", "ratio", "name");
+                               printf("%20lld%20lld%6.1f%% %s\n", (long long)rz->end, (long long)rz->src_end, rz->end * 100.0f / rz->src_end,
+                                          argv[optind]);
+                       } else fprintf(stdout, "%s is not a regular rz file\n", argv[optind]);
+               } else {
+                       int f_dst, failed = 0;
+                       char *name = NULL;
+                       if (!pstdout) {
+                               /* the name must END in ".rz" */
+                               size_t len = strlen(argv[optind]);
+                               if (len < 3 || strcmp(argv[optind] + len - 3, ".rz") != 0) {
+                                       printf("razip: %s: unknown suffix -- ignored\n", argv[optind]);
+                                       return 1;
+                               }
+                               name = strdup(argv[optind]);
+                               if (name == NULL) {
+                                       fprintf(stderr, "razip: out of memory\n");
+                                       return 1;
+                               }
+                               name[len - 3] = '\0';
+                       }
+                       /* open the input before creating (and truncating) the output */
+                       rz = razf_open(argv[optind], "r");
+                       if (rz == NULL) {
+                               fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+                               free(name);
+                               return 1;
+                       }
+                       if (name != NULL) {
+                               f_dst = write_open(name, is_forced);
+                               free(name);
+                       } else f_dst = fileno(stdout);
+                       buffer = malloc(WINDOW_SIZE);
+                       if (buffer == NULL) {
+                               fprintf(stderr, "razip: out of memory\n");
+                               razf_close(rz);
+                               return 1;
+                       }
+                       razf_seek(rz, start, SEEK_SET);
+                       while(1){
+                               if(end < 0) c = razf_read(rz, buffer, WINDOW_SIZE);
+                               else c = razf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+                               if(c <= 0) break;
+                               start += c;
+                               if(write(f_dst, buffer, c) != c){
+                                       fprintf(stderr, "razip: write error\n");
+                                       failed = 1;
+                                       break;
+                               }
+                               if(end >= 0 && start >= end) break;
+                       }
+                       free(buffer);
+                       if (!pstdout) close(f_dst);
+                       if (failed) {
+                               /* incomplete output: keep the compressed file */
+                               razf_close(rz);
+                               return 1;
+                       }
+                       /* NOTE(review): the input is also removed after a partial extraction (-b/-s) -- confirm this is intended */
+                       if (!pstdout) unlink(argv[optind]);
+               }
+               razf_close(rz);
+               return 0;
+       }
+}
+
diff --git a/samtools.1 b/samtools.1
new file mode 100644 (file)
index 0000000..91f627f
--- /dev/null
@@ -0,0 +1,258 @@
+.TH samtools 1 "22 December 2008" "samtools-0.1.1" "Bioinformatics tools"
+.SH NAME
+.PP
+samtools - Utilities for the Sequence Alignment/Map (SAM) format
+.SH SYNOPSIS
+.PP
+samtools import ref_list.txt aln.sam.gz aln.bam
+.PP
+samtools sort aln.bam aln.sorted
+.PP
+samtools index aln.sorted.bam
+.PP
+samtools view aln.sorted.bam chr2:20,100,000-20,200,000
+.PP
+samtools merge out.bam in1.bam in2.bam in3.bam
+.PP
+samtools faidx ref.fasta
+.PP
+samtools pileup -f ref.fasta aln.sorted.bam
+.PP
+samtools tview aln.sorted.bam ref.fasta
+
+.SH DESCRIPTION
+.PP
+Samtools is a set of utilities that manipulate alignments in the BAM
+format. It imports from and exports to the SAM (Sequence
+Alignment/Map) format, does sorting, merging and indexing, and
+allows reads in any region to be retrieved swiftly.
+
+.SH COMMANDS AND OPTIONS
+.TP 10
+.B import
+samtools import <in.ref_list> <in.sam> <out.bam>
+
+Convert alignments in SAM format to BAM format. File
+.I <in.ref_list>
+is TAB-delimited. Each line must contain the reference name and the
+length of the reference, one line for each distinct reference;
+additional fields are ignored. This file also defines the order of the
+reference sequences in sorting. File
+.I <in.sam>
+can be optionally compressed by zlib or gzip. A single hyphen is
+recognized as stdin or stdout, depending on the context.
+
+.TP
+.B sort
+samtools sort [-n] [-m maxMem] <in.bam> <out.prefix>
+
+Sort alignments based on the leftmost coordinate. File
+.I <out.prefix>.bam
+will be created. This command may also create temporary files
+.I <out.prefix>.%d.bam
+when the whole alignment cannot be fitted into memory (controlled by
+option -m).
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -n
+Sort by read names rather than by chromosomal coordinates
+.TP
+.B -m INT
+Approximately the maximum required memory.
+.RE
+
+.TP
+.B merge
+samtools merge [-n] <out.bam> <in1.bam> <in2.bam> [...]
+
+Merge multiple sorted alignments. The header of
+.I <in1.bam>
+will be copied to
+.I <out.bam>
+and the headers of other files will be ignored.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -n
+The input alignments are sorted by read names rather than by chromosomal
+coordinates
+.RE
+
+.TP
+.B index
+samtools index <aln.bam>
+
+Index sorted alignment for fast random access. Index file
+.I <aln.bam>.bai
+will be created.
+
+.TP
+.B view
+samtools view [-b] <in.bam> [region1 [...]]
+
+Extract/print all or sub alignments in SAM or BAM format. If no region
+is specified, all the alignments will be printed; otherwise only
+alignments overlapping with the specified regions will be output. An
+alignment may be given multiple times if it is overlapping several
+regions. A region can be presented, for example, in the following
+format: `chr2', `chr2:1000000' or `chr2:1,000,000-2,000,000'.
+
+.B OPTIONS:
+.RS
+.TP 8
+.B -b
+Output in the BAM format.
+.RE
+
+.TP
+.B faidx
+samtools faidx <ref.fasta> [region1 [...]]
+
+Index reference sequence in the FASTA format or extract subsequence from
+indexed reference sequence. If no region is specified,
+.B faidx
+will index the file and create
+.I <ref.fasta>.fai
+on the disk. If regions are specified, the subsequences will be
+retrieved and printed to stdout in the FASTA format. The input file can
+be compressed in the
+.B RAZF
+format.
+
+.TP
+.B pileup
+samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list]
+[-s] [-c] [-T theta] [-N nHap] [-r pairDiffRate] <in.alignment>
+
+Print the alignment in the pileup format. In the pileup format, each
+line represents a genomic position, consisting of chromosome name,
+coordinate, reference base, read bases, read qualities and alignment
+mapping qualities. Information on match, mismatch, indel, strand,
+mapping quality and start and end of a read are all encoded at the read
+base column. At this column, a dot stands for a match to the reference
+base on the forward strand, a comma for a match on the reverse strand,
+`ACGTN' for a mismatch on the forward strand and `acgtn' for a mismatch
+on the reverse strand. A pattern `\\+[0-9]+[ACGTNacgtn]+' indicates
+there is an insertion between this reference position and the next
+reference position. The length of the insertion is given by the integer
+in the pattern, followed by the inserted sequence. Similarly, a pattern
+`-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. Also at
+the read base column, a symbol `^' marks the start of a read segment
+which is a contiguous subsequence on the read separated by `N/S/H' CIGAR
+operations. The ASCII of the character following `^' minus 33 gives the
+mapping quality. A symbol `$' marks the end of a read segment.
+
+If option
+.B -c
+is applied, the consensus base, consensus quality, SNP quality and
+maximum mapping quality of the reads covering the site will be inserted
+between the `reference base' and the `read bases' columns. An indel
+occupies an additional line. Each indel line consists of chromosome
+name, coordinate, a star, top two high-scoring ins/del sequences, the
+number of reads strongly supporting the first indel, the number of reads
+strongly supporting the second indel, the number of reads that confer
+little information on distinguishing indels and the number of reads that
+contain indels different from the top two ones.
+
+.B OPTIONS:
+.RS
+
+.TP 10
+.B -s
+Print the mapping quality as the last column. This option makes the
+output easier to parse, although this format is not space efficient.
+
+.TP
+.B -f FILE
+The reference sequence in the FASTA format. Index file
+.I FILE.fai
+will be created if
+absent.
+
+.TP
+.B -t FILE
+List of reference names and sequence lengths, in the format described
+for the
+.B import
+command. If this option is present, samtools assumes the input
+.I <in.alignment>
+is in SAM format; otherwise it assumes BAM format.
+
+.TP
+.B -l FILE
+List of sites at which pileup is output. This file is space
+delimited. The first two columns are required to be chromosome and
+1-based coordinate. Additional columns are ignored. It is
+recommended to use option
+.B -s
+together with
+.B -l
+as in the default format we may not know the mapping quality.
+
+.TP
+.B -c
+Call the consensus sequence using the MAQ consensus model. Options
+.B -T,
+.B -N
+and
+.B -r
+are only effective when
+.B -c
+is in use.
+
+.TP
+.B -T FLOAT
+The theta parameter (error dependency coefficient) in the maq consensus
+calling model [0.85]
+
+.TP
+.B -N INT
+Number of haplotypes in the sample (>=2) [2]
+
+.TP
+.B -r FLOAT
+Expected fraction of differences between a pair of haplotypes [0.001]
+
+.RE
+
+.TP
+.B tview
+samtools tview <in.sorted.bam> [ref.fasta]
+
+Text alignment viewer (based on the ncurses library). In the viewer,
+press `?' for help and press `g' to check the alignment start from a
+region in the format like `chr10:10,000,000'. Note that if the region
+shown on the screen contains no mapped reads, a blank screen will be
+seen. This is a known issue and will be improved later.
+
+.RE
+
+.SH LIMITATIONS
+.PP
+.IP o 2
+In general, more testing is needed to ensure there is no severe bug.
+.IP o 2
+PCR duplicate removal has not been implemented.
+.IP o 2
+Only MAQ->SAM converter is implemented. More converters are needed.
+.IP o 2
+Reference sequence names and lengths are not acquired from the BAM/SAM header.
+.IP o 2
+CIGAR operations N and P may not be properly handled.
+.IP o 2
+There is a small known memory leak in the viewer.
+
+.SH AUTHOR
+.PP
+Heng Li from the Sanger Institute is the author of samtools. Bob
+Handsaker from the Broad Institute implemented the BGZF library and Jue
+Ruan from Beijing Genomics Institute wrote the RAZF library. Various
+people in the 1000Genomes Project contributed to the SAM format
+specification.
+
+.SH SEE ALSO
+.PP
+Samtools website: http://samtools.sourceforge.net
diff --git a/source.dot b/source.dot
new file mode 100644 (file)
index 0000000..cfa2222
--- /dev/null
@@ -0,0 +1,15 @@
+digraph { // graph of the samtools source files; NOTE(review): edges appear to run from a file to the files that depend on it -- confirm
+  faidx[label="faidx.c\n(faidx)"]
+  import[label="bam_import.c\n(import)"]
+  plcmd[label="bam_plcmd.c\n(pileup)"]
+  sort[label="bam_sort.c\n(sort, merge)"]
+  index[label="bam_index.c\n(index)"]
+  tview[label="bam_tview.c\n(tview)"]
+  "bam_aux.c" -> {"bam.c", import}
+  "bgzf.c" -> "bam.c"
+  "bam.c" -> {index, "bam_pileup.c", sort, import}
+  "bam_pileup.c" -> {"bam_lpileup.c", plcmd}
+  {"bam_lpileup.c", index, faidx, "bam_maqcns.c"} -> tview
+  {import, faidx, "bam_maqcns.c"} -> plcmd
+  {tview, plcmd, faidx, sort, import, index} -> "bamtk.c\n(view)"
+}
\ No newline at end of file
diff --git a/zutil.h b/zutil.h
new file mode 100644 (file)
index 0000000..b7d5eff
--- /dev/null
+++ b/zutil.h
@@ -0,0 +1,269 @@
+/* zutil.h -- internal interface and configuration of the compression library
+ * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+/* WARNING: this file should *not* be used by applications. It is
+   part of the implementation of the compression library and is
+   subject to change. Applications should only use zlib.h.
+ */
+
+/* @(#) $Id$ */
+
+#ifndef ZUTIL_H /* include guard; the matching #endif is the last line of this header */
+#define ZUTIL_H
+
+#define ZLIB_INTERNAL /* defined before zlib.h is included; NOTE(review): presumably tested inside zlib.h -- confirm there */
+#include "zlib.h"
+
+#ifdef STDC /* the standard library headers below are only included when STDC is defined */
+#  ifndef _WIN32_WCE /* <stddef.h> is skipped on Windows CE */
+#    include <stddef.h>
+#  endif
+#  include <string.h>
+#  include <stdlib.h>
+#endif /* STDC */
+#ifdef NO_ERRNO_H /* no <errno.h> available: declare errno by hand */
+#   ifdef _WIN32_WCE
+      /* The Microsoft C Run-Time Library for Windows CE doesn't have
+       * errno.  We define it as a global variable to simplify porting.
+       * Its value is always 0 and should not be used.  We rename it to
+       * avoid conflict with other libraries that use the same workaround.
+       */
+#     define errno z_errno
+#   endif
+    extern int errno;
+#else /* !NO_ERRNO_H */
+#  ifndef _WIN32_WCE /* <errno.h> is never included on Windows CE */
+#    include <errno.h>
+#  endif
+#endif /* NO_ERRNO_H */
+
+#ifndef local /* 'local' expands to static unless the build already defines it */
+#  define local static
+#endif
+/* compile with -Dlocal if your debugger can't find static symbols */
+
+typedef unsigned char  uch; /* short aliases for the unsigned integer types */
+typedef uch FAR uchf; /* FAR-qualified variant; FAR is not defined in this header (NOTE(review): presumably comes in via zlib.h -- confirm) */
+typedef unsigned short ush;
+typedef ush FAR ushf;
+typedef unsigned long  ulg;
+
+extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
+/* (size given to avoid silly warnings with Visual C++) */
+
+#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)] /* message string for error code err */
+
+#define ERR_RETURN(strm,err) \
+  return (strm->msg = (char*)ERR_MSG(err), (err))
+/* To be used only when the state is known to be valid. Stores the message for err in strm->msg and returns err from the calling function; err is expanded twice. */
+
+        /* common constants */
+
+#ifndef DEF_WBITS /* may be predefined by the build; otherwise falls back to MAX_WBITS */
+#  define DEF_WBITS MAX_WBITS
+#endif /* DEF_WBITS */
+/* default windowBits for decompression. MAX_WBITS is for compression only */
+
+#if MAX_MEM_LEVEL >= 8
+#  define DEF_MEM_LEVEL 8
+#else /* MAX_MEM_LEVEL < 8 */
+#  define DEF_MEM_LEVEL  MAX_MEM_LEVEL
+#endif
+/* default memLevel: 8, or MAX_MEM_LEVEL when that is smaller */
+
+#define STORED_BLOCK 0
+#define STATIC_TREES 1
+#define DYN_TREES    2
+/* The three kinds of block type */
+
+#define MIN_MATCH  3
+#define MAX_MATCH  258
+/* The minimum and maximum match lengths */
+
+#define PRESET_DICT 0x20 /* preset dictionary flag in zlib header */
+
+        /* target dependencies: per-platform OS_CODE values and C library workarounds */
+
+#if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32))
+#  define OS_CODE  0x00
+#  if defined(__TURBOC__) || defined(__BORLANDC__)
+#    if(__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__))
+       /* Allow compilation with ANSI keywords only enabled */
+       void _Cdecl farfree( void *block );
+       void *_Cdecl farmalloc( unsigned long nbytes );
+#    else
+#      include <alloc.h>
+#    endif
+#  else /* MSC or DJGPP */
+#    include <malloc.h>
+#  endif
+#endif /* MSDOS || (WINDOWS && !WIN32) */
+
+#ifdef AMIGA
+#  define OS_CODE  0x01
+#endif
+
+#if defined(VAXC) || defined(VMS)
+#  define OS_CODE  0x02
+#  define F_OPEN(name, mode) \
+     fopen((name), (mode), "mbc=60", "ctx=stm", "rfm=fix", "mrs=512") /* extra fopen() arguments are passed on VAXC/VMS only */
+#endif
+
+#if defined(ATARI) || defined(atarist)
+#  define OS_CODE  0x05
+#endif
+
+#ifdef OS2
+#  define OS_CODE  0x06
+#  ifdef M_I86
+     #include <malloc.h>
+#  endif
+#endif /* OS2 */
+
+#if defined(MACOS) || defined(TARGET_OS_MAC)
+#  define OS_CODE  0x07
+#  if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os
+#    include <unix.h> /* for fdopen */
+#  else
+#    ifndef fdopen
+#      define fdopen(fd,mode) NULL /* No fdopen() */
+#    endif
+#  endif
+#endif /* MACOS || TARGET_OS_MAC */
+
+#ifdef TOPS20
+#  define OS_CODE  0x0a
+#endif
+
+#ifdef WIN32
+#  ifndef __CYGWIN__  /* Cygwin is Unix, not Win32 */
+#    define OS_CODE  0x0b
+#  endif
+#endif /* WIN32 */
+
+#ifdef __50SERIES /* Prime/PRIMOS */
+#  define OS_CODE  0x0f
+#endif
+
+#if defined(_BEOS_) || defined(RISCOS)
+#  define fdopen(fd,mode) NULL /* No fdopen() */
+#endif
+
+#if (defined(_MSC_VER) && (_MSC_VER > 600))
+#  if defined(_WIN32_WCE)
+#    define fdopen(fd,mode) NULL /* No fdopen() */
+#    ifndef _PTRDIFF_T_DEFINED /* supply ptrdiff_t only if nothing has defined it yet */
+       typedef int ptrdiff_t;
+#      define _PTRDIFF_T_DEFINED
+#    endif
+#  else /* other MSC targets: map fdopen onto _fdopen */
+#    define fdopen(fd,type)  _fdopen(fd,type)
+#  endif
+#endif /* _MSC_VER > 600 */
+
+        /* common defaults: used when no platform section above set the macro */
+
+#ifndef OS_CODE
+#  define OS_CODE  0x03  /* assume Unix */
+#endif
+
+#ifndef F_OPEN
+#  define F_OPEN(name, mode) fopen((name), (mode))
+#endif
+
+         /* functions: decide which C library routines (vsnprintf, memcpy family) can be used */
+
+#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550)
+#  ifndef HAVE_VSNPRINTF
+#    define HAVE_VSNPRINTF
+#  endif
+#endif
+#if defined(__CYGWIN__)
+#  ifndef HAVE_VSNPRINTF
+#    define HAVE_VSNPRINTF
+#  endif
+#endif
+#ifndef HAVE_VSNPRINTF /* not known to be present: flag the platforms assumed to lack it */
+#  ifdef MSDOS
+     /* vsnprintf may exist on some MS-DOS compilers (DJGPP?),
+        but for now we just assume it doesn't. */
+#    define NO_vsnprintf
+#  endif
+#  ifdef __TURBOC__
+#    define NO_vsnprintf
+#  endif
+#  ifdef WIN32
+     /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
+#    if !defined(vsnprintf) && !defined(NO_vsnprintf)
+#      define vsnprintf _vsnprintf
+#    endif
+#  endif
+#  ifdef __SASC
+#    define NO_vsnprintf
+#  endif
+#endif /* !HAVE_VSNPRINTF */
+#ifdef VMS /* applied regardless of HAVE_VSNPRINTF */
+#  define NO_vsnprintf
+#endif
+
+#if defined(pyr)
+#  define NO_MEMCPY
+#endif
+#if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__)
+ /* Use our own functions for small and medium model with MSC <= 5.0.
+  * You may have to use the same strategy for Borland C (untested).
+  * The __SC__ check is for Symantec.
+  */
+#  define NO_MEMCPY
+#endif
+#if defined(STDC) && !defined(HAVE_MEMCPY) && !defined(NO_MEMCPY)
+#  define HAVE_MEMCPY
+#endif
+#ifdef HAVE_MEMCPY /* map the zmem* names straight onto library routines */
+#  ifdef SMALL_MEDIUM /* MSDOS small or medium model */
+#    define zmemcpy _fmemcpy
+#    define zmemcmp _fmemcmp
+#    define zmemzero(dest, len) _fmemset(dest, 0, len)
+#  else
+#    define zmemcpy memcpy
+#    define zmemcmp memcmp
+#    define zmemzero(dest, len) memset(dest, 0, len)
+#  endif
+#else /* !HAVE_MEMCPY: the zmem* routines are external functions, not defined in this header */
+   extern void zmemcpy  OF((Bytef* dest, const Bytef* source, uInt len));
+   extern int  zmemcmp  OF((const Bytef* s1, const Bytef* s2, uInt len));
+   extern void zmemzero OF((Bytef* dest, uInt len));
+#endif /* HAVE_MEMCPY */
+
+/* Diagnostic functions: active only when DEBUG is defined; otherwise every macro below expands to nothing */
+#ifdef DEBUG
+#  include <stdio.h>
+   extern int z_verbose;
+   extern void z_error    OF((char *m));
+#  define Assert(cond,msg) {if(!(cond)) z_error(msg);} /* calls z_error(msg) when cond is false */
+#  define Trace(x) {if (z_verbose>=0) fprintf x ;} /* x must be a parenthesized fprintf argument list */
+#  define Tracev(x) {if (z_verbose>0) fprintf x ;} /* printed only when z_verbose > 0 */
+#  define Tracevv(x) {if (z_verbose>1) fprintf x ;} /* printed only when z_verbose > 1 */
+#  define Tracec(c,x) {if (z_verbose>0 && (c)) fprintf x ;} /* like Tracev, additionally gated on c */
+#  define Tracecv(c,x) {if (z_verbose>1 && (c)) fprintf x ;} /* like Tracevv, additionally gated on c */
+#else /* !DEBUG */
+#  define Assert(cond,msg)
+#  define Trace(x)
+#  define Tracev(x)
+#  define Tracevv(x)
+#  define Tracec(c,x)
+#  define Tracecv(c,x)
+#endif /* DEBUG */
+
+
+voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size)); /* allocator entry point; declared here, defined elsewhere */
+void   zcfree  OF((voidpf opaque, voidpf ptr)); /* matching release entry point; declared here, defined elsewhere */
+
+#define ZALLOC(strm, items, size) \
+           (*((strm)->zalloc))((strm)->opaque, (items), (size)) /* calls the stream's zalloc callback with its opaque pointer */
+#define ZFREE(strm, addr)  (*((strm)->zfree))((strm)->opaque, (voidpf)(addr)) /* calls the stream's zfree callback with its opaque pointer */
+#define TRY_FREE(s, p) {if (p) ZFREE(s, p);} /* frees p only when it is non-null; p is expanded twice */
+
+#endif /* ZUTIL_H */