From: Bo Li Date: Thu, 9 Jan 2014 00:26:18 +0000 (-0600) Subject: Updated samtools to 0.1.19 X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=dbcf1cfb8ad1086c21d64e249f012809403e7ddc Updated samtools to 0.1.19 --- diff --git a/EBSeq/BiocInstaller/DESCRIPTION b/EBSeq/BiocInstaller/DESCRIPTION new file mode 100644 index 0000000..269c539 --- /dev/null +++ b/EBSeq/BiocInstaller/DESCRIPTION @@ -0,0 +1,13 @@ +Package: BiocInstaller +Title: Install/Update Bioconductor and CRAN Packages +Description: Installs/updates Bioconductor and CRAN packages +Version: 1.12.0 +Author: Dan Tenenbaum and Biocore Team +Maintainer: Bioconductor Package Maintainer + +biocViews: Software +Depends: R (>= 3.0.0) +Suggests: RUnit, BiocGenerics +License: Artistic-2.0 +Packaged: 2013-10-15 04:03:53 UTC; biocbuild +Built: R 3.0.2; ; 2013-10-15 11:47:50 UTC; unix diff --git a/EBSeq/BiocInstaller/INDEX b/EBSeq/BiocInstaller/INDEX new file mode 100644 index 0000000..ec33cf3 --- /dev/null +++ b/EBSeq/BiocInstaller/INDEX @@ -0,0 +1,13 @@ +BiocUpgrade Upgrade Bioconductor to the latest version + available for this version of R +biocLite Install or update Bioconductor and CRAN + packages +biocValid Validate installed package versions against + biocLite versions. +biocVersion Bioconductor version +biocases_group Convenience functions to return package names + associated with Bioconductor publications. +biocinstallRepos Display current Bioconductor and CRAN + repositories. +useDevel Get the 'devel' version of the BiocInstaller + package. 
diff --git a/EBSeq/BiocInstaller/Meta/Rd.rds b/EBSeq/BiocInstaller/Meta/Rd.rds new file mode 100644 index 0000000..e07c822 Binary files /dev/null and b/EBSeq/BiocInstaller/Meta/Rd.rds differ diff --git a/EBSeq/BiocInstaller/Meta/hsearch.rds b/EBSeq/BiocInstaller/Meta/hsearch.rds new file mode 100644 index 0000000..e320f87 Binary files /dev/null and b/EBSeq/BiocInstaller/Meta/hsearch.rds differ diff --git a/EBSeq/BiocInstaller/Meta/links.rds b/EBSeq/BiocInstaller/Meta/links.rds new file mode 100644 index 0000000..cc993be Binary files /dev/null and b/EBSeq/BiocInstaller/Meta/links.rds differ diff --git a/EBSeq/BiocInstaller/Meta/nsInfo.rds b/EBSeq/BiocInstaller/Meta/nsInfo.rds new file mode 100644 index 0000000..f32ede4 Binary files /dev/null and b/EBSeq/BiocInstaller/Meta/nsInfo.rds differ diff --git a/EBSeq/BiocInstaller/Meta/package.rds b/EBSeq/BiocInstaller/Meta/package.rds new file mode 100644 index 0000000..8dee4bd Binary files /dev/null and b/EBSeq/BiocInstaller/Meta/package.rds differ diff --git a/EBSeq/BiocInstaller/NAMESPACE b/EBSeq/BiocInstaller/NAMESPACE new file mode 100644 index 0000000..2415823 --- /dev/null +++ b/EBSeq/BiocInstaller/NAMESPACE @@ -0,0 +1,4 @@ +import("utils") + +export(biocLite, biocVersion, biocinstallRepos, useDevel, biocValid, + monograph_group, RBioinf_group, biocases_group, all_group) diff --git a/EBSeq/BiocInstaller/NEWS b/EBSeq/BiocInstaller/NEWS new file mode 100644 index 0000000..67c3189 --- /dev/null +++ b/EBSeq/BiocInstaller/NEWS @@ -0,0 +1,16 @@ +CHANGES IN VERSION 1.10.0 +------------------------- + +NEW FEATURES + + o biocValid() checks that installed packages are consistent with + those available via biocLite(). + + o biocVersion() returns the version of Bioconductor expected with + this version of the BiocInstaller package. + +USER-VISIBLE CHANGES + + o biocLite() invoked with no arguments updates currently installed + packages to their most-recent version. 
+ diff --git a/EBSeq/BiocInstaller/R/BiocInstaller b/EBSeq/BiocInstaller/R/BiocInstaller new file mode 100644 index 0000000..3b65e3c --- /dev/null +++ b/EBSeq/BiocInstaller/R/BiocInstaller @@ -0,0 +1,27 @@ +# File share/R/nspackloader.R +# Part of the R package, http://www.R-project.org +# +# Copyright (C) 1995-2012 The R Core Team +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# A copy of the GNU General Public License is available at +# http://www.r-project.org/Licenses/ + +local({ + info <- loadingNamespaceInfo() + pkg <- info$pkgname + ns <- .getNamespace(as.name(pkg)) + if (is.null(ns)) + stop("cannot find namespace environment for ", pkg, domain = NA); + dbbase <- file.path(info$libname, pkg, "R", pkg) + lazyLoad(dbbase, ns, filter = function(n) n != ".__NAMESPACE__.") +}) diff --git a/EBSeq/BiocInstaller/R/BiocInstaller.rdb b/EBSeq/BiocInstaller/R/BiocInstaller.rdb new file mode 100644 index 0000000..2e70858 Binary files /dev/null and b/EBSeq/BiocInstaller/R/BiocInstaller.rdb differ diff --git a/EBSeq/BiocInstaller/R/BiocInstaller.rdx b/EBSeq/BiocInstaller/R/BiocInstaller.rdx new file mode 100644 index 0000000..bbce4ec Binary files /dev/null and b/EBSeq/BiocInstaller/R/BiocInstaller.rdx differ diff --git a/EBSeq/BiocInstaller/help/AnIndex b/EBSeq/BiocInstaller/help/AnIndex new file mode 100644 index 0000000..93ce51c --- /dev/null +++ b/EBSeq/BiocInstaller/help/AnIndex @@ -0,0 +1,10 @@ +all_group packageGroups +biocases_group packageGroups +biocinstallRepos biocinstallRepos +biocLite biocLite +BiocUpgrade 
BiocUpgrade +biocValid biocValid +biocVersion biocVersion +monograph_group packageGroups +RBioinf_group packageGroups +useDevel useDevel diff --git a/EBSeq/BiocInstaller/help/BiocInstaller.rdb b/EBSeq/BiocInstaller/help/BiocInstaller.rdb new file mode 100644 index 0000000..e88fe48 Binary files /dev/null and b/EBSeq/BiocInstaller/help/BiocInstaller.rdb differ diff --git a/EBSeq/BiocInstaller/help/BiocInstaller.rdx b/EBSeq/BiocInstaller/help/BiocInstaller.rdx new file mode 100644 index 0000000..82d8f93 Binary files /dev/null and b/EBSeq/BiocInstaller/help/BiocInstaller.rdx differ diff --git a/EBSeq/BiocInstaller/help/aliases.rds b/EBSeq/BiocInstaller/help/aliases.rds new file mode 100644 index 0000000..9b2af13 Binary files /dev/null and b/EBSeq/BiocInstaller/help/aliases.rds differ diff --git a/EBSeq/BiocInstaller/help/paths.rds b/EBSeq/BiocInstaller/help/paths.rds new file mode 100644 index 0000000..7b4a225 Binary files /dev/null and b/EBSeq/BiocInstaller/help/paths.rds differ diff --git a/EBSeq/BiocInstaller/html/00Index.html b/EBSeq/BiocInstaller/html/00Index.html new file mode 100644 index 0000000..33a3a2e --- /dev/null +++ b/EBSeq/BiocInstaller/html/00Index.html @@ -0,0 +1,44 @@ + +R: Install/Update Bioconductor and CRAN Packages + + + +

Install/Update Bioconductor and CRAN Packages + +

+
+
+[Up] +[Top] +

Documentation for package ‘BiocInstaller’ version 1.12.0

+ + + +

Help Pages

+ + + + + + + + + + + + + + + + + + + + + + + +
all_group — Convenience functions to return package names associated with Bioconductor publications.
biocases_group — Convenience functions to return package names associated with Bioconductor publications.
biocinstallRepos — Display current Bioconductor and CRAN repositories.
biocLite — Install or update Bioconductor and CRAN packages
BiocUpgrade — Upgrade Bioconductor to the latest version available for this version of R
biocValid — Validate installed package versions against biocLite versions.
biocVersion — Bioconductor version
monograph_group — Convenience functions to return package names associated with Bioconductor publications.
RBioinf_group — Convenience functions to return package names associated with Bioconductor publications.
useDevel — Get the 'devel' version of the BiocInstaller package.
+ diff --git a/EBSeq/BiocInstaller/html/R.css b/EBSeq/BiocInstaller/html/R.css new file mode 100644 index 0000000..6f058f3 --- /dev/null +++ b/EBSeq/BiocInstaller/html/R.css @@ -0,0 +1,57 @@ +BODY{ background: white; + color: black } + +A:link{ background: white; + color: blue } +A:visited{ background: white; + color: rgb(50%, 0%, 50%) } + +H1{ background: white; + color: rgb(55%, 55%, 55%); + font-family: monospace; + font-size: x-large; + text-align: center } + +H2{ background: white; + color: rgb(40%, 40%, 40%); + font-family: monospace; + font-size: large; + text-align: center } + +H3{ background: white; + color: rgb(40%, 40%, 40%); + font-family: monospace; + font-size: large } + +H4{ background: white; + color: rgb(40%, 40%, 40%); + font-family: monospace; + font-style: italic; + font-size: large } + +H5{ background: white; + color: rgb(40%, 40%, 40%); + font-family: monospace } + +H6{ background: white; + color: rgb(40%, 40%, 40%); + font-family: monospace; + font-style: italic } + +IMG.toplogo{ vertical-align: middle } + +IMG.arrow{ width: 30px; + height: 30px; + border: 0 } + +span.acronym{font-size: small} +span.env{font-family: monospace} +span.file{font-family: monospace} +span.option{font-family: monospace} +span.pkg{font-weight: bold} +span.samp{font-family: monospace} + +div.vignettes a:hover { + background: rgb(85%, 85%, 85%); +} + diff --git a/EBSeq/BiocInstaller/scripts/biocLite.R b/EBSeq/BiocInstaller/scripts/biocLite.R new file mode 100644 index 0000000..9940cfb --- /dev/null +++ b/EBSeq/BiocInstaller/scripts/biocLite.R @@ -0,0 +1,86 @@ +## Mirrors: uncomment the following and change to your favorite CRAN mirror +## if you don't want to use the default (cran.fhcrc.org, Seattle, USA). 
+## options("repos" = c(CRAN="http://cran.fhcrc.org")) + +## Mirrors: uncomment the following and change to your favorite Bioconductor +## mirror, if you don't want to use the default (www.bioconductor.org, +## Seattle, USA) +## options("BioC_mirror" = "http://www.bioconductor.org") + +local({ + currBiocVers <- + package_version(readLines("http://bioconductor.org/bioc-version", + warn=FALSE)) + vers <- getRversion() + biocVers <- tryCatch({ + BiocInstaller::biocVersion() # recent BiocInstaller + }, error=function(...) { # no / older BiocInstaller + tools:::.BioC_version_associated_with_R_version + }) + + if (biocVers < currBiocVers) { + txt <- strwrap(sprintf("Your Bioconductor is out-of-date, upgrade + to version %s by following instructions at + http://bioconductor.org/install.", currBiocVers)) + message(paste(txt, collapse="\n")) + } + + if (vers > "2.13" && biocVers > "2.8") { + + if (exists("biocLite", .GlobalEnv, inherits=FALSE)) { + txt <- strwrap("There is an outdated biocLite() function in the + global environment; run 'rm(biocLite)' and try again.") + stop("\n", paste(txt, collapse="\n")) + } + + if (!suppressWarnings(require("BiocInstaller", quietly=TRUE))) { + a <- NULL + p <- file.path(Sys.getenv("HOME"), ".R", "repositories") + if (file.exists(p)) { + a <- tools:::.read_repositories(p) + if (!"BioCsoft" %in% rownames(a)) + a <- NULL + } + if (is.null(a)) { + p <- file.path(R.home("etc"), "repositories") + a <- tools:::.read_repositories(p) + } + if (!"package:utils" %in% search()) { + url <- "http://bioconductor.org/biocLite.R" + txt <- sprintf("use 'source(\"%s\")' to update 'BiocInstaller' + after 'utils' package is attached", + url) + message(paste(strwrap(txt), collapse="\n ")) + } else { + ## add a conditional for Bioc releases occuring WITHIN + ## a single R minor version + if (vers >= "2.15" && vers < "2.16") { + a["BioCsoft", "URL"] <- sub(as.character(biocVers), "2.11", + a["BioCsoft", "URL"]) + biocVers <- numeric_version("2.11") + } + 
install.packages("BiocInstaller", repos=a["BioCsoft", "URL"]) + if (!suppressWarnings(require("BiocInstaller", + quietly=TRUE))) { + url0 <- "http://www.bioconductor.org/packages" + url <- sprintf("%s/%s/bioc", + url0, as.character(biocVers)) + txt0 <- "'biocLite.R' failed to install 'BiocInstaller', + use 'install.packages(\"%s\", repos=\"%s\")'" + txt <- sprintf(txt0, "BiocInstaller", url) + message(paste(strwrap(txt), collapse="\n ")) + } + } + } + } else { + source("http://bioconductor.org/getBioC.R") + biocLite <<- + function(pkgs, groupName="lite", ...) + { + if (missing(pkgs)) + biocinstall(groupName=groupName, ...) + else + biocinstall(pkgs=pkgs, groupName=groupName, ...) + } + } +}) diff --git a/EBSeq/BiocInstaller/unitTests/test_BiocUpgrade.R b/EBSeq/BiocInstaller/unitTests/test_BiocUpgrade.R new file mode 100644 index 0000000..d4cb4e7 --- /dev/null +++ b/EBSeq/BiocInstaller/unitTests/test_BiocUpgrade.R @@ -0,0 +1,29 @@ +test_useDevel <- function() +{ + if (!BiocInstaller:::IS_END_OF_LIFE) { + checkException(useDevel(), silent=TRUE) + } else if (!BiocInstaller:::IS_DOWNGRADEABLE) { + checkException(useDevel(FALSE), silent=TRUE) + } + if (!BiocInstaller:::IS_UPGRADEABLE) { + checkException(useDevel(), silent=TRUE) + opts <- options(warn=2); on.exit(options(opts)) + checkException(biocLite("BiocUpgrade")) + } +} + +test_getContribUrl_exist <- function() +{ + fun <- BiocInstaller:::.getContribUrl + + vers <- BiocInstaller:::BIOC_VERSION + checkTrue(grepl(vers, fun(vers))) + if (BiocInstaller:::IS_UPGRADEABLE) { + vers <- BiocInstaller:::UPGRADE_VERSION + checkTrue(grepl(vers, fun(vers))) + } + if (BiocInstaller:::IS_DOWNGRADEABLE) { + vers <- BiocInstaller:::DOWNGRADE_VERSION + checkTrue(grepl(vers, fun(vers))) + } +} diff --git a/EBSeq/BiocInstaller/unitTests/test_biocinstallRepos.R b/EBSeq/BiocInstaller/unitTests/test_biocinstallRepos.R new file mode 100644 index 0000000..e6fb520 --- /dev/null +++ b/EBSeq/BiocInstaller/unitTests/test_biocinstallRepos.R @@ 
-0,0 +1,27 @@ +repos <- biocinstallRepos() + +test_biocinstallRepos_named_repositories <- function() +{ + + allOS <- c("BioCsoft", "CRAN", "BioCann", "BioCexp", "BioCextra") + windowsOnly <- "CRANextra" + + checkTrue(all(allOS %in% names(repos))) + if (.Platform$OS.type == "windows") + { + checkTrue(windowsOnly %in% names(repos)) + } else { + checkTrue(!windowsOnly %in% names(repos)) + } + +} + +test_biocinstallRepos_noNA_repositories <- function() +{ + checkTrue(!any(is.na(repos))) +} + +test_biocinstallRepos_order <- function() +{ + checkIdentical("BioCsoft", names(repos)[[1]]) +} diff --git a/EBSeq/calcClusteringInfo.cpp b/EBSeq/calcClusteringInfo.cpp index 2103f61..c1d64bb 100644 --- a/EBSeq/calcClusteringInfo.cpp +++ b/EBSeq/calcClusteringInfo.cpp @@ -65,18 +65,17 @@ string convert(const string& rawseq) { void loadRef(char* inpF) { ifstream fin(inpF); string tag, line, rawseq; - void *pt; assert(fin.is_open()); names.clear(); names.push_back(""); seqs.clear(); seqs.push_back(""); - pt = getline(fin, line); - while (pt != 0 && line[0] == '>') { + getline(fin, line); + while ((fin) && (line[0] == '>')) { tag = line.substr(1); rawseq = ""; - while((pt = getline(fin, line)) && line[0] != '>') { + while((getline(fin, line)) && (line[0] != '>')) { rawseq += line; } if (rawseq.size() <= 0) { diff --git a/README.md b/README.md index 5c6943b..e3e890c 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,13 @@ To compile RSEM, simply run make +For cygwin users, please uncomment the 3rd and 7th line in +'sam/Makefile' before you run 'make'. + +To compile EBSeq, which is included in the RSEM package, run + + make ebseq + To install, simply put the rsem directory in your environment's PATH variable. @@ -111,7 +118,7 @@ using an alternative aligner, you may also want to provide the indices are not built. RSEM requires all alignments of the same read group together. 
For -paired-end reads, RSEM also requires the two mates of any alignment be +paired-end reads, RSEM also requires the two mates of any alignment be adjacent. To check if your SAM/BAM file satisfy the requirements, please run @@ -181,7 +188,7 @@ Usage: sorted_bam_input : Input BAM format file, must be sorted wig_output : Output wiggle file's name, e.g. output.wig -wiggle_name : The name of this wiggle plot +wiggle_name : the name of this wiggle plot --no-fractional-weight : If this is set, RSEM will not look for "ZW" tag and each alignment appeared in the BAM file has weight 1. Set this if your BAM file is not generated by RSEM. Please note that this option must be at the end of the command line #### c) Loading a BAM and/or Wiggle file into the UCSC Genome Browser or Integrative Genomics Viewer(IGV) @@ -243,7 +250,7 @@ Histogram of reads with different number of alignments: x-axis is the number of ## Example Suppose we download the mouse genome from UCSC Genome Browser. We -will use a reference_name of 'mouse_125'. We have a FASTQ-formatted file, +will use a reference_name of 'mm9'. We have a FASTQ-formatted file, 'mmliver.fq', containing single-end reads from one sample, which we call 'mmliver_single_quals'. We want to estimate expression values by using the single-end model with a fragment length distribution. We @@ -259,69 +266,36 @@ list is 'gene_ids.txt'. 
We will visualize the models learned in The commands for this scenario are as follows: - rsem-prepare-reference --gtf mm9.gtf --mapping knownIsoforms.txt --bowtie-path /sw/bowtie /data/mm9 /ref/mouse_125 - rsem-calculate-expression --bowtie-path /sw/bowtie --phred64-quals --fragment-length-mean 150.0 --fragment-length-sd 35.0 -p 8 --output-genome-bam --calc-ci --memory-allocate 1024 /data/mmliver.fq /ref/mouse_125 mmliver_single_quals + rsem-prepare-reference --gtf mm9.gtf --mapping knownIsoforms.txt --bowtie-path /sw/bowtie /data/mm9 /ref/mm9 + rsem-calculate-expression --bowtie-path /sw/bowtie --phred64-quals --fragment-length-mean 150.0 --fragment-length-sd 35.0 -p 8 --output-genome-bam --calc-ci --memory-allocate 1024 /data/mmliver.fq /ref/mm9 mmliver_single_quals rsem-bam2wig mmliver_single_quals.sorted.bam mmliver_single_quals.sorted.wig mmliver_single_quals rsem-plot-transcript-wiggles --gene-list --show-unique mmliver_single_quals gene_ids.txt output.pdf rsem-plot-model mmliver_single_quals mmliver_single_quals.models.pdf ## Simulation -RSEM provides users the 'rsem-simulate-reads' program to simulate RNA-Seq data based on parameters learned from real data sets. Run - - rsem-simulate-reads - -to get usage information or read the following subsections. - ### Usage: rsem-simulate-reads reference_name estimated_model_file estimated_isoform_results theta0 N output_name [-q] -__reference_name:__ The name of RSEM references, which should be already generated by 'rsem-prepare-reference' - -__estimated_model_file:__ This file describes how the RNA-Seq reads will be sequenced given the expression levels. It determines what kind of reads will be simulated (single-end/paired-end, w/o quality score) and includes parameters for fragment length distribution, read start position distribution, sequencing error models, etc. Normally, this file should be learned from real data using 'rsem-calculate-expression'. 
The file can be found under the 'sample_name.stat' folder with the name of 'sample_name.model' - -__estimated_isoform_results:__ This file contains expression levels for all isoforms recorded in the reference. It can be learned using 'rsem-calculate-expression' from real data. The corresponding file users want to use is 'sample_name.isoforms.results'. If simulating from user-designed expression profile is desired, start from a learned 'sample_name.isoforms.results' file and only modify the 'TPM' column. The simulator only reads the TPM column. But keeping the file format the same is required. - -__theta0:__ This parameter determines the fraction of reads that are coming from background "noise" (instead of from a transcript). It can also be estimated using 'rsem-calculate-expression' from real data. Users can find it as the first value of the third line of the file 'sample_name.stat/sample_name.theta'. - -__N:__ The total number of reads to be simulated. If 'rsem-calculate-expression' is executed on a real data set, the total number of reads can be found as the 4th number of the first line of the file 'sample_name.stat/sample_name.cnt'. - -__output_name:__ Prefix for all output files. - -__-q:__ Set it will stop outputting intermediate information. +estimated_model_file: file containing model parameters. Generated by +rsem-calculate-expression. +estimated_isoform_results: file containing isoform expression levels. +Generated by rsem-calculate-expression. +theta0: fraction of reads that are "noise" (not derived from a transcript). +N: number of reads to simulate. +output_name: prefix for all output files. +[-q] : set it will stop outputting intermediate information. ### Outputs: -output_name.sim.isoforms.results, output_name.sim.genes.results: Expression levels estimated by counting where each simulated read comes from. 
- output_name.fa if single-end without quality score; output_name.fq if single-end with quality score; output_name_1.fa & output_name_2.fa if paired-end without quality score; output_name_1.fq & output_name_2.fq if paired-end with quality score. -**Format of the header line**: Each simulated read's header line encodes where it comes from. The header line has the format: - - {>/@}_rid_dir_sid_pos[_insertL] - -__{>/@}:__ Either '>' or '@' must appear. '>' appears if FASTA files are generated and '@' appears if FASTQ files are generated - -__rid:__ Simulated read's index, numbered from 0 - -__dir:__ The direction of the simulated read. 0 refers to forward strand ('+') and 1 refers to reverse strand ('-') - -__sid:__ Represent which transcript this read is simulated from. It ranges between 0 and M, where M is the total number of transcripts. If sid=0, the read is simulated from the background noise. Otherwise, the read is simulated from a transcript with index sid. Transcript sid's transcript name can be found in the 'transcript_id' column of the 'sample_name.isoforms.results' file (at line sid + 1, line 1 is for column names) - -__pos:__ The start position of the simulated read in strand dir of transcript sid. It is numbered from 0 - -__insertL:__ Only appear for paired-end reads. It gives the insert length of the simulated read. - -### Example: - -Suppose we want to simulate 50 millon single-end reads with quality scores and use the parameters learned from [Example](#example). In addition, we set theta0 as 0.2 and output_name as 'simulated_reads'. The command is: - - rsem-simulate-reads /ref/mouse_125 mmliver_single_quals.stat/mmliver_single_quals.model mmliver_single_quals.isoforms.results 0.2 50000000 simulated_reads +output_name.sim.isoforms.results, output_name.sim.genes.results : Results estimated based on sample values. 
## Generate Transcript-to-Gene-Map from Trinity Output diff --git a/ReadReader.h b/ReadReader.h index 9cd88e9..141585a 100644 --- a/ReadReader.h +++ b/ReadReader.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "utils.h" #include "SingleRead.h" @@ -94,7 +95,7 @@ bool ReadReader::locate(READ_INT_TYPE rid) { if (crid < rid) return false; - std::streampos tmp[s]; + std::vector tmp(s); for (int i = 0; i < s; i++) { tmp[i] = arr[i]->tellg(); } if (!read.read(s, (std::istream**)arr, 0)) return false; diff --git a/Refs.h b/Refs.h index 008d275..711f64b 100644 --- a/Refs.h +++ b/Refs.h @@ -84,7 +84,6 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) { //read standard fasta format here std::ifstream fin; std::string tag, line, rawseq; - void* pt; // istream& is indeed a pointer, that's why I can use void* here seqs.clear(); seqs.push_back(RefSeq()); // noise isoform @@ -94,11 +93,11 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) { fin.open(inpF); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); } - pt = getline(fin, line); - while (pt != 0 && line[0] == '>') { + getline(fin, line); + while ((fin) && (line[0] == '>')) { tag = line.substr(1); rawseq = ""; - while((pt = getline(fin, line)) && line[0] != '>') { + while((getline(fin, line)) && (line[0] != '>')) { rawseq += line; } if (rawseq.size() <= 0) { diff --git a/extractRef.cpp b/extractRef.cpp index 2d2b17c..fb7d3ec 100644 --- a/extractRef.cpp +++ b/extractRef.cpp @@ -260,7 +260,6 @@ int main(int argc, char* argv[]) { ifstream fin; string line, gseq, seqname; - void* pt; chrvec.clear(); @@ -270,31 +269,31 @@ int main(int argc, char* argv[]) { for (int i = start; i < argc; i++) { fin.open(argv[i]); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! 
It may not exist.\n", argv[i]); exit(-1); } - pt = getline(fin, line); - while (pt != 0 && line[0] == '>') { + getline(fin, line); + while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; - while((pt = getline(fin, line)) && line[0] != '>') { - gseq += line; - } - - size_t len = gseq.length(); - assert(len > 0); - for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]); - - iter = sn2tr.find(seqname); - if (iter == sn2tr.end()) continue; - - chrvec.push_back(ChrInfo(seqname, len)); - - vector& vec = iter->second; - int s = vec.size(); - for (int j = 0; j < s; j++) { - assert(vec[j] > 0 && vec[j] <= M); - transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); - } + while((getline(fin, line)) && (line[0] != '>')) { + gseq += line; + } + + size_t len = gseq.length(); + assert(len > 0); + for (size_t j = 0; j < len; j++) gseq[j] = check(gseq[j]); + + iter = sn2tr.find(seqname); + if (iter == sn2tr.end()) continue; + + chrvec.push_back(ChrInfo(seqname, len)); + + vector& vec = iter->second; + int s = vec.size(); + for (int j = 0; j < s; j++) { + assert(vec[j] > 0 && vec[j] <= M); + transcripts.getTranscriptAt(vec[j]).extractSeq(gseq, seqs[vec[j]]); + } } fin.close(); @@ -304,8 +303,8 @@ int main(int argc, char* argv[]) { for (int i = 1; i <= M; i++) { if (seqs[i] == "") { const Transcript& transcript = transcripts.getTranscriptAt(i); - fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s! Loading chromosome %s's sequence is failed. Please check if 1) the chromosome directory is set correctly; 2) the list of chromosome files is complete; 3) the FASTA files containing chromosome sequences are not truncated or having wrong format.\n", \ - transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str(), transcript.getSeqName().c_str()); + fprintf(stderr, "Cannot extract transcript %s's sequence from chromosome %s, whose information might not be provided! 
Please check if the chromosome directory is set correctly or the list of chromosome files is complete.\n", \ + transcript.getTranscriptID().c_str(), transcript.getSeqName().c_str()); exit(-1); } } diff --git a/makefile b/makefile index d706f55..3c48e69 100644 --- a/makefile +++ b/makefile @@ -135,6 +135,8 @@ rsem-sam-validator : sam/bam.h sam/sam.h my_assert.h samValidator.cpp sam/libbam rsem-scan-for-paired-end-reads : sam/bam.h sam/sam.h my_assert.h scanForPairedEndReads.cpp sam/libbam.a $(CC) -O3 -Wall scanForPairedEndReads.cpp sam/libbam.a -lz -o $@ +.PHONY: ebseq + ebseq : cd EBSeq ; ${MAKE} all diff --git a/sam/.DS_Store b/sam/.DS_Store new file mode 100644 index 0000000..ee99731 Binary files /dev/null and b/sam/.DS_Store differ diff --git a/sam/._.DS_Store b/sam/._.DS_Store new file mode 100644 index 0000000..09fa6bd Binary files /dev/null and b/sam/._.DS_Store differ diff --git a/sam/._.gitignore b/sam/._.gitignore new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._.gitignore differ diff --git a/sam/._AUTHORS b/sam/._AUTHORS new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._AUTHORS differ diff --git a/sam/._COPYING b/sam/._COPYING new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._COPYING differ diff --git a/sam/._ChangeLog.old b/sam/._ChangeLog.old new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._ChangeLog.old differ diff --git a/sam/._INSTALL b/sam/._INSTALL new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._INSTALL differ diff --git a/sam/._Makefile.mingw b/sam/._Makefile.mingw new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._Makefile.mingw differ diff --git a/sam/._NEWS b/sam/._NEWS new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._NEWS differ diff --git a/sam/._bam.c b/sam/._bam.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and 
b/sam/._bam.c differ diff --git a/sam/._bam.h b/sam/._bam.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam.h differ diff --git a/sam/._bam2bcf.c b/sam/._bam2bcf.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam2bcf.c differ diff --git a/sam/._bam2bcf.h b/sam/._bam2bcf.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam2bcf.h differ diff --git a/sam/._bam2bcf_indel.c b/sam/._bam2bcf_indel.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam2bcf_indel.c differ diff --git a/sam/._bam2depth.c b/sam/._bam2depth.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam2depth.c differ diff --git a/sam/._bam_aux.c b/sam/._bam_aux.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_aux.c differ diff --git a/sam/._bam_cat.c b/sam/._bam_cat.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_cat.c differ diff --git a/sam/._bam_color.c b/sam/._bam_color.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_color.c differ diff --git a/sam/._bam_endian.h b/sam/._bam_endian.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_endian.h differ diff --git a/sam/._bam_import.c b/sam/._bam_import.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_import.c differ diff --git a/sam/._bam_index.c b/sam/._bam_index.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_index.c differ diff --git a/sam/._bam_lpileup.c b/sam/._bam_lpileup.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_lpileup.c differ diff --git a/sam/._bam_mate.c b/sam/._bam_mate.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_mate.c differ diff --git a/sam/._bam_md.c b/sam/._bam_md.c new file mode 
100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_md.c differ diff --git a/sam/._bam_pileup.c b/sam/._bam_pileup.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_pileup.c differ diff --git a/sam/._bam_plcmd.c b/sam/._bam_plcmd.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_plcmd.c differ diff --git a/sam/._bam_reheader.c b/sam/._bam_reheader.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_reheader.c differ diff --git a/sam/._bam_rmdup.c b/sam/._bam_rmdup.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_rmdup.c differ diff --git a/sam/._bam_rmdupse.c b/sam/._bam_rmdupse.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_rmdupse.c differ diff --git a/sam/._bam_sort.c b/sam/._bam_sort.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_sort.c differ diff --git a/sam/._bam_stat.c b/sam/._bam_stat.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_stat.c differ diff --git a/sam/._bam_tview.c b/sam/._bam_tview.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_tview.c differ diff --git a/sam/._bam_tview.h b/sam/._bam_tview.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_tview.h differ diff --git a/sam/._bam_tview_curses.c b/sam/._bam_tview_curses.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_tview_curses.c differ diff --git a/sam/._bam_tview_html.c b/sam/._bam_tview_html.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bam_tview_html.c differ diff --git a/sam/._bamshuf.c b/sam/._bamshuf.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bamshuf.c differ diff --git a/sam/._bamtk.c b/sam/._bamtk.c new file mode 100644 index 0000000..94286bb 
Binary files /dev/null and b/sam/._bamtk.c differ diff --git a/sam/._bcftools b/sam/._bcftools new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/._bcftools differ diff --git a/sam/._bedcov.c b/sam/._bedcov.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bedcov.c differ diff --git a/sam/._bedidx.c b/sam/._bedidx.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bedidx.c differ diff --git a/sam/._bgzf.c b/sam/._bgzf.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bgzf.c differ diff --git a/sam/._bgzf.h b/sam/._bgzf.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bgzf.h differ diff --git a/sam/._bgzip.c b/sam/._bgzip.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._bgzip.c differ diff --git a/sam/._cut_target.c b/sam/._cut_target.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._cut_target.c differ diff --git a/sam/._errmod.c b/sam/._errmod.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._errmod.c differ diff --git a/sam/._errmod.h b/sam/._errmod.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._errmod.h differ diff --git a/sam/._examples b/sam/._examples new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/._examples differ diff --git a/sam/._faidx.c b/sam/._faidx.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._faidx.c differ diff --git a/sam/._faidx.h b/sam/._faidx.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._faidx.h differ diff --git a/sam/._kaln.c b/sam/._kaln.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._kaln.c differ diff --git a/sam/._kaln.h b/sam/._kaln.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._kaln.h differ diff --git 
a/sam/._khash.h b/sam/._khash.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._khash.h differ diff --git a/sam/._klist.h b/sam/._klist.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._klist.h differ diff --git a/sam/._knetfile.c b/sam/._knetfile.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._knetfile.c differ diff --git a/sam/._knetfile.h b/sam/._knetfile.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._knetfile.h differ diff --git a/sam/._kprobaln.c b/sam/._kprobaln.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._kprobaln.c differ diff --git a/sam/._kprobaln.h b/sam/._kprobaln.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._kprobaln.h differ diff --git a/sam/._kseq.h b/sam/._kseq.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._kseq.h differ diff --git a/sam/._ksort.h b/sam/._ksort.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._ksort.h differ diff --git a/sam/._kstring.c b/sam/._kstring.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._kstring.c differ diff --git a/sam/._kstring.h b/sam/._kstring.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._kstring.h differ diff --git a/sam/._misc b/sam/._misc new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/._misc differ diff --git a/sam/._padding.c b/sam/._padding.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._padding.c differ diff --git a/sam/._phase.c b/sam/._phase.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._phase.c differ diff --git a/sam/._razf.c b/sam/._razf.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._razf.c differ diff --git a/sam/._razf.h b/sam/._razf.h new file 
mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._razf.h differ diff --git a/sam/._razip.c b/sam/._razip.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._razip.c differ diff --git a/sam/._sam.c b/sam/._sam.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._sam.c differ diff --git a/sam/._sam.h b/sam/._sam.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._sam.h differ diff --git a/sam/._sam_header.c b/sam/._sam_header.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._sam_header.c differ diff --git a/sam/._sam_header.h b/sam/._sam_header.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._sam_header.h differ diff --git a/sam/._sam_view.c b/sam/._sam_view.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._sam_view.c differ diff --git a/sam/._sample.c b/sam/._sample.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._sample.c differ diff --git a/sam/._sample.h b/sam/._sample.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._sample.h differ diff --git a/sam/._samtools.1 b/sam/._samtools.1 new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/._samtools.1 differ diff --git a/sam/._win32 b/sam/._win32 new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/._win32 differ diff --git a/sam/ChangeLog b/sam/ChangeLog deleted file mode 100644 index a471838..0000000 --- a/sam/ChangeLog +++ /dev/null @@ -1,5948 +0,0 @@ ------------------------------------------------------------------------- -r925 | lh3lh3 | 2011-02-28 15:45:17 -0500 (Mon, 28 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/phase.c - -minor changes to a heuristic rule - ------------------------------------------------------------------------- -r924 | lh3lh3 | 2011-02-28 15:24:04 -0500 (Mon, 28 Feb 2011) 
| 4 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/phase.c - - * 0.1.12-r924:126 - * fixed a bug in phase (due to recent changes) - * fixed a bug in vcf2fq - ------------------------------------------------------------------------- -r923 | lh3lh3 | 2011-02-28 12:57:39 -0500 (Mon, 28 Feb 2011) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/phase.c - - * put version number in bam.h - * write version to BCF - * in phase, change the default -q to 37 - * output a little more information during phasing - ------------------------------------------------------------------------- -r922 | lh3lh3 | 2011-02-25 16:40:09 -0500 (Fri, 25 Feb 2011) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.tex - M /trunk/samtools/bcftools/bcf2qcall.c - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/ld.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/vcf.c - M /trunk/samtools/cut_target.c - - * change the order of PL/GL according to the latest VCF spec - * change the type of SP to int32_t - ------------------------------------------------------------------------- -r921 | lh3lh3 | 2011-02-25 14:40:56 -0500 (Fri, 25 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.tex - -update the BCF spec - ------------------------------------------------------------------------- -r920 | lh3lh3 | 2011-02-25 00:59:27 -0500 (Fri, 25 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/cut_target.c - M /trunk/samtools/errmod.h - M /trunk/samtools/faidx.c - M /trunk/samtools/khash.h - M /trunk/samtools/kstring.c - M 
/trunk/samtools/kstring.h - A /trunk/samtools/phase.c - M /trunk/samtools/samtools.1 - -added the phase command - ------------------------------------------------------------------------- -r918 | lh3lh3 | 2011-02-24 10:05:54 -0500 (Thu, 24 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -added "const" to bcf_p1_cal() - ------------------------------------------------------------------------- -r917 | lh3lh3 | 2011-02-24 09:36:30 -0500 (Thu, 24 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bam.c - -more meaningful BAM truncation message - ------------------------------------------------------------------------- -r916 | lh3lh3 | 2011-02-24 09:35:06 -0500 (Thu, 24 Feb 2011) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/vcf.c - - * automatically fix errors in GL - * output unrecognized FORMAT as "." - ------------------------------------------------------------------------- -r913 | lh3lh3 | 2011-02-10 22:59:47 -0500 (Thu, 10 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcf.c - -finished VCF->BCF conversion - ------------------------------------------------------------------------- -r910 | petulda | 2011-02-03 03:13:48 -0500 (Thu, 03 Feb 2011) | 1 line -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -Prevent division by zero ------------------------------------------------------------------------- -r909 | lh3lh3 | 2011-02-02 11:29:20 -0500 (Wed, 02 Feb 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - -fixed a typo in the VCF header - ------------------------------------------------------------------------- -r908 | lh3lh3 | 2011-02-02 11:28:24 -0500 (Wed, 02 Feb 2011) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam_index.c - - * fixed an out-of-boundary bug - * improved sorting order checking in index 
- ------------------------------------------------------------------------- -r907 | lh3lh3 | 2011-01-29 22:59:20 -0500 (Sat, 29 Jan 2011) | 4 lines -Changed paths: - M /trunk/samtools/INSTALL - M /trunk/samtools/bam_tview.c - M /trunk/samtools/knetfile.c - - * avoid a segfault when network connect fails - * update INSTALL - * fixed a bug in tview on big-endian by Nathan Weeks - ------------------------------------------------------------------------- -r903 | lh3lh3 | 2011-01-27 14:50:02 -0500 (Thu, 27 Jan 2011) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_md.c - - * fixed a rare memory issue in bam_md.c - * fixed a bug in indel calling related to unmapped and refskip reads - ------------------------------------------------------------------------- -r902 | lh3lh3 | 2011-01-23 21:46:18 -0500 (Sun, 23 Jan 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/fet.c - -fixed two minor bugs in Fisher's exact test - ------------------------------------------------------------------------- -r899 | petulda | 2011-01-19 09:28:02 -0500 (Wed, 19 Jan 2011) | 1 line -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -Skip sites with unknown ref ------------------------------------------------------------------------- -r898 | lh3lh3 | 2011-01-15 12:56:05 -0500 (Sat, 15 Jan 2011) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_md.c - -move bam_nt16_nt4_table[] from bam_maqcns.c to bam_md.c - ------------------------------------------------------------------------- -r896 | lh3lh3 | 2011-01-06 10:52:15 -0500 (Thu, 06 Jan 2011) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - - * samtools-0.1.12-10 (r896) - * allow to exclude read groups in mpileup - 
------------------------------------------------------------------------- -r895 | lh3lh3 | 2011-01-04 11:31:29 -0500 (Tue, 04 Jan 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.tex - -sorry. It is SP not ST - ------------------------------------------------------------------------- -r894 | lh3lh3 | 2011-01-04 11:29:06 -0500 (Tue, 04 Jan 2011) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.tex - -added ST - ------------------------------------------------------------------------- -r893 | petulda | 2011-01-04 06:55:56 -0500 (Tue, 04 Jan 2011) | 1 line -Changed paths: - M /trunk/samtools/bcftools/call1.c - -Fixed a typo in read_samples ------------------------------------------------------------------------- -r892 | jmarshall | 2010-12-28 08:06:49 -0500 (Tue, 28 Dec 2010) | 9 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/examples/Makefile - -System libraries go *after* user libraries in link commands, because -the user libraries may themselves have dependencies that are satisfied -by the system libraries. It's not rocket science! - -This makes a difference with some linkers; or with -static or --as-needed. - -The examples/Makefile fix is from Charles Plessy. 
-See also http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=606004 - ------------------------------------------------------------------------- -r891 | lh3lh3 | 2010-12-21 12:16:33 -0500 (Tue, 21 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - - * samtools-0.1.12-9 (r891) - * allow to call SNPs from a subset of samples - ------------------------------------------------------------------------- -r889 | lh3lh3 | 2010-12-15 11:28:16 -0500 (Wed, 15 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.12-12 (r889) - * set mapQ as 20 if it equals 255 - ------------------------------------------------------------------------- -r888 | lh3lh3 | 2010-12-14 22:41:09 -0500 (Tue, 14 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - -When -B is applied to mpileup, still use paired reads only unless -A is flagged. - ------------------------------------------------------------------------- -r887 | lh3lh3 | 2010-12-14 22:37:05 -0500 (Tue, 14 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.12-6 (r887) - * added a hidden option -E to mpileup/calmd. -E triggers an alternative way to apply BAQ. - ------------------------------------------------------------------------- -r886 | lh3lh3 | 2010-12-14 12:51:03 -0500 (Tue, 14 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - -(Arguably) improved the indel caller a tiny bit for lowCov data. 
- ------------------------------------------------------------------------- -r885 | petulda | 2010-12-14 04:55:46 -0500 (Tue, 14 Dec 2010) | 1 line -Changed paths: - M /trunk/samtools/bcftools/call1.c - -Fixed the VCF header to pass validation ------------------------------------------------------------------------- -r884 | lh3lh3 | 2010-12-12 23:02:19 -0500 (Sun, 12 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/vcfutils.pl - - * samtools-0.1.12-4 (r884) - * fixed a long-existing flaw in the INDEL calling model - ------------------------------------------------------------------------- -r883 | lh3lh3 | 2010-12-11 20:05:42 -0500 (Sat, 11 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcfutils.pl - -compute max SP and max GQ from sample genotypes - ------------------------------------------------------------------------- -r880 | lh3lh3 | 2010-12-10 10:50:54 -0500 (Fri, 10 Dec 2010) | 2 lines -Changed paths: - D /trunk/samtools/bcftools/bcf-fix.pl - -drop bcf-fix.pl as it is redundant by the latest changes - ------------------------------------------------------------------------- -r879 | lh3lh3 | 2010-12-10 10:50:29 -0500 (Fri, 10 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcf.c - - * fixed a minor issue in printing VCFs - * write bcftools specific INFO and FORMAT in the header - ------------------------------------------------------------------------- -r878 | lh3lh3 | 2010-12-10 10:09:14 -0500 (Fri, 10 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - -Make sure that the GT genotype field is the first - ------------------------------------------------------------------------- -r877 | lh3lh3 | 2010-12-08 17:27:05 -0500 (Wed, 08 Dec 2010) | 7 
lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.12-2 (r877) - - * allow to fine control the selection of indel candidates. The current - setting is okay for lowCov and highCov with ~100 samples, but it - skips too many indels for highCov with >250 samples. - - ------------------------------------------------------------------------- -r874 | lh3lh3 | 2010-12-07 22:40:35 -0500 (Tue, 07 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -a spelling error.. - ------------------------------------------------------------------------- -r873 | lh3lh3 | 2010-12-07 22:39:57 -0500 (Tue, 07 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.12-1 (r873) - * added a switch to allow anomalous read pairs in calling - ------------------------------------------------------------------------- -r872 | lh3lh3 | 2010-12-07 14:43:54 -0500 (Tue, 07 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -fixed a bug in vcf2fq - ------------------------------------------------------------------------- -r869 | lh3lh3 | 2010-12-05 01:18:06 -0500 (Sun, 05 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -added a warning for the Windows version - ------------------------------------------------------------------------- -r868 | lh3lh3 | 2010-12-05 01:05:51 -0500 (Sun, 05 Dec 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - -In ksprintf(), change "%lf" and "%lg" to "%f" and "%g", respectively. -According to the manual page, this change is valid. However, MinGW seems -to interpret "%lf" as "%Lf". 
- ------------------------------------------------------------------------- -r867 | lh3lh3 | 2010-12-05 00:35:43 -0500 (Sun, 05 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - M /trunk/samtools/bam_aux.c - -bring back the windows support - ------------------------------------------------------------------------- -r866 | lh3lh3 | 2010-12-04 23:33:51 -0500 (Sat, 04 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_reheader.c - M /trunk/samtools/bcftools/vcfutils.pl - -Fixed a compiling error when knetfile is not used. - ------------------------------------------------------------------------- -r865 | lh3lh3 | 2010-12-04 00:13:22 -0500 (Sat, 04 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -vcf->fastq - ------------------------------------------------------------------------- -r864 | lh3lh3 | 2010-12-03 17:12:30 -0500 (Fri, 03 Dec 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - - * remove "-f". 
Instead always compute consensus quality - * increase the upper limit of quality - ------------------------------------------------------------------------- -r863 | lh3lh3 | 2010-12-03 15:28:15 -0500 (Fri, 03 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - -more informative error message - ------------------------------------------------------------------------- -r862 | lh3lh3 | 2010-12-02 16:16:08 -0500 (Thu, 02 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - -Release samtools-0.1.12a - ------------------------------------------------------------------------- -r861 | lh3lh3 | 2010-12-02 15:55:06 -0500 (Thu, 02 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - -a possible fix to DP4=0,0,0,0; have not tested, but should have no side-effect - ------------------------------------------------------------------------- -r859 | lh3lh3 | 2010-12-02 11:39:57 -0500 (Thu, 02 Dec 2010) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.12 - ------------------------------------------------------------------------- -r858 | lh3lh3 | 2010-12-02 11:24:41 -0500 (Thu, 02 Dec 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.c - - * samtools-0.1.11-1 (r858) - * fixed a bug in mpileup which causes segfaults - * bcftools: do not segfault when BCF contains errors - ------------------------------------------------------------------------- -r857 | lh3lh3 | 2010-11-30 23:52:50 -0500 (Tue, 30 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -fixed a memory leak in bam_fetch() - ------------------------------------------------------------------------- -r856 | lh3lh3 | 2010-11-26 00:07:31 -0500 (Fri, 26 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M 
/trunk/samtools/bcftools/vcfutils.pl - - * fixed a memory violation - * added splitchr to vcfutils.pl - ------------------------------------------------------------------------- -r854 | lh3lh3 | 2010-11-23 09:05:08 -0500 (Tue, 23 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/ld.c - -fixed a typo/bug in r^2 computation - ------------------------------------------------------------------------- -r852 | lh3lh3 | 2010-11-21 22:20:20 -0500 (Sun, 21 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -forget to change the version information - ------------------------------------------------------------------------- -r851 | lh3lh3 | 2010-11-21 22:16:52 -0500 (Sun, 21 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bcftools/bcftools.1 - M /trunk/samtools/samtools.1 - -Release samtools-0.1.11 - ------------------------------------------------------------------------- -r844 | lh3lh3 | 2010-11-19 23:16:08 -0500 (Fri, 19 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - - * samtools-0.1.10-9 (r844) - * added the "folded" or reference-free mode for variant calling - ------------------------------------------------------------------------- -r843 | lh3lh3 | 2010-11-19 22:26:36 -0500 (Fri, 19 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bam_sort.c - -In merging, if -R is specified, do not abort if the sequence dictionary is different. - ------------------------------------------------------------------------- -r842 | jmarshall | 2010-11-19 21:24:28 -0500 (Fri, 19 Nov 2010) | 5 lines -Changed paths: - M /trunk/samtools/bam_sort.c - -When merging BAM headers, compare the list of target reference sequences -strictly (and fail/abort if there is a mismatch), but allow one list to be a -prefix of the other. 
(i.e., check that the lists are identical up until the -shorter runs out, and add the excess targets from the longer to the output.) - ------------------------------------------------------------------------- -r841 | lh3lh3 | 2010-11-19 14:49:27 -0500 (Fri, 19 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10 (r841) - * fixed a bug in pileup when the first CIGAR operation is D - * fixed a bug in view with range query - ------------------------------------------------------------------------- -r840 | lh3lh3 | 2010-11-19 13:45:51 -0500 (Fri, 19 Nov 2010) | 10 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10-4 (r840) - - * drop the MNP caller. It is slow while does not diliver too much - benefit. Possibly I will work on it in future given more time. - - * there is a segfault in pileup - - * someone has reported segfault from view/index/sort - - ------------------------------------------------------------------------- -r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines -Changed paths: - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10-6 (r839) - - * call MNPs without realignment because it seems to me that it is not - worthwhile to significantly slow down SNP calling. - - * the result looks quite different from the previous version. I have - work to do... - - ------------------------------------------------------------------------- -r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -Apply a patch by Rob Davis, which improves fault detection. 
- ------------------------------------------------------------------------- -r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-r836 - * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow) - ------------------------------------------------------------------------- -r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - - * modify the filtering rule: also filter SNPs around filtered indels - * added MNP filter - ------------------------------------------------------------------------- -r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10-4 (r834) - * fixed a silly bug in printing MNP - * restrict to at most 1 alternative allele - ------------------------------------------------------------------------- -r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - -fixed a bug in printing MNPs - ------------------------------------------------------------------------- -r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - -minor change to how seqQ is applied - ------------------------------------------------------------------------- -r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.10 (r831) - * initial MNP caller - ------------------------------------------------------------------------- -r829 | lh3lh3 | 
2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - -Release samtools-0.1.10 (r829) - ------------------------------------------------------------------------- -r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -update version information: samtools-0.1.9-20 (r828) - ------------------------------------------------------------------------- -r827 | lh3lh3 | 2010-11-16 15:32:50 -0500 (Tue, 16 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - -bcftools: allow to skip indels - ------------------------------------------------------------------------- -r826 | lh3lh3 | 2010-11-16 14:11:58 -0500 (Tue, 16 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - -remove ZQ if both BQ and ZQ are present - ------------------------------------------------------------------------- -r825 | lh3lh3 | 2010-11-16 13:51:33 -0500 (Tue, 16 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.9-18 (r825) - * change the behaviour of calmd such that by default it does not change the base quality - ------------------------------------------------------------------------- -r824 | lh3lh3 | 2010-11-15 23:31:53 -0500 (Mon, 15 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.9-17 (r824) - * added command line options to change the default parameters in indel calling - * update the manual - ------------------------------------------------------------------------- -r823 | lh3lh3 | 2010-11-15 12:20:13 -0500 (Mon, 15 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c 
- - * samtools-0.1.9-r823 - * the BQ tag is now 64 shifted, not 33 shifted - ------------------------------------------------------------------------- -r822 | lh3lh3 | 2010-11-15 00:30:18 -0500 (Mon, 15 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.9-16 (r822) - * keep the raw depth because in indel calling, DP4 may be way off the true depth - ------------------------------------------------------------------------- -r821 | lh3lh3 | 2010-11-13 01:18:31 -0500 (Sat, 13 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-15 (r821) - * calmd: write BQ - * skip realignment if BQ is present - ------------------------------------------------------------------------- -r820 | lh3lh3 | 2010-11-13 01:08:26 -0500 (Sat, 13 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-14 (r820) - * penalize reads with excessive differences in indel calling - ------------------------------------------------------------------------- -r819 | lh3lh3 | 2010-11-12 21:36:27 -0500 (Fri, 12 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-13 (r819) - * fixed a bug in pileup given refskip - ------------------------------------------------------------------------- -r818 | lh3lh3 | 2010-11-12 13:04:53 -0500 (Fri, 12 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-r818 - * for indel calling, do two rounds of probabilistic realignments - ------------------------------------------------------------------------- -r817 | lh3lh3 | 2010-11-11 20:04:07 -0500 (Thu, 11 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - M 
/trunk/samtools/bcftools/vcfutils.pl - - * samtools-0.1.19-11 (r817) - * only initiate indel calling when 0.2% of reads contain a gap - ------------------------------------------------------------------------- -r816 | lh3lh3 | 2010-11-11 01:22:59 -0500 (Thu, 11 Nov 2010) | 7 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-10 (r816) - - * I know why the forward method fails. it is because of zero base - qualities. when that is fixed, the forward method seems to give - better results than Viterbi, as it should be. I am tired... - - ------------------------------------------------------------------------- -r815 | lh3lh3 | 2010-11-11 00:57:15 -0500 (Thu, 11 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam2bcf_indel.c - -effectively revert to the viterbi version. The forward realignment gives too many false positives. - ------------------------------------------------------------------------- -r814 | lh3lh3 | 2010-11-11 00:18:02 -0500 (Thu, 11 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-9 (r810) - * use forward, instead of viterbi, for realignment - * realignment is now quality aware - ------------------------------------------------------------------------- -r813 | lh3lh3 | 2010-11-10 22:45:24 -0500 (Wed, 10 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/kprobaln.c - M /trunk/samtools/kprobaln.h - - * prepare to replace kaln with kprobaln in realignment - ------------------------------------------------------------------------- -r812 | lh3lh3 | 2010-11-10 17:28:50 -0500 (Wed, 10 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - -fixed a typo - ------------------------------------------------------------------------- -r811 | lh3lh3 | 2010-11-10 16:54:46 -0500 (Wed, 10 
Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - -use zlib for direct reading when BCF_LITE is in use - ------------------------------------------------------------------------- -r810 | lh3lh3 | 2010-11-10 16:32:13 -0500 (Wed, 10 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - - * do not use reads containing too many mismatches for indel calling - * fixed a trivial bug in case of multi-allelic indels - ------------------------------------------------------------------------- -r809 | lh3lh3 | 2010-11-10 13:23:02 -0500 (Wed, 10 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-8 (r809) - * fixed a bug in the indel caller - ------------------------------------------------------------------------- -r808 | lh3lh3 | 2010-11-10 12:24:10 -0500 (Wed, 10 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -minor change to makefile - ------------------------------------------------------------------------- -r807 | lh3lh3 | 2010-11-10 12:10:21 -0500 (Wed, 10 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/vcfutils.pl - - * samtools-0.1.9-8 (r807) - * collect indel candidates only from specified platforms (@RG-PL) - * merge varFilter and filter4vcf in vcfutils.pl - ------------------------------------------------------------------------- -r806 | lh3lh3 | 2010-11-09 22:05:46 -0500 (Tue, 09 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -bcftools: compute equal-tail (Bayesian) credible interval - ------------------------------------------------------------------------- -r805 | 
lh3lh3 | 2010-11-09 16:28:39 -0500 (Tue, 09 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -added a double-hit filter to avoid overestimated indel likelihood - ------------------------------------------------------------------------- -r804 | lh3lh3 | 2010-11-09 14:12:06 -0500 (Tue, 09 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-7 (r804) - * fixed a bug in the gap caller - ------------------------------------------------------------------------- -r803 | lh3lh3 | 2010-11-09 10:45:33 -0500 (Tue, 09 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/prob1.c - - * samtools-0.1.9-6 (r803) - * mpileup: apply homopolymer correction when calculating GL, instead of before - * bcftools: apply a different prior to indels - ------------------------------------------------------------------------- -r802 | lh3lh3 | 2010-11-08 23:53:15 -0500 (Mon, 08 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-5 (r802) - * relax tandem penalty. this will be made a command-line option in future. 
- ------------------------------------------------------------------------- -r801 | lh3lh3 | 2010-11-08 23:35:52 -0500 (Mon, 08 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-4 (r801) - * fixed a minor issue in printing indel VCF - ------------------------------------------------------------------------- -r800 | lh3lh3 | 2010-11-08 15:28:14 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bcftools/vcfutils.pl - -fixed another silly bug in mpileup's indel caller - ------------------------------------------------------------------------- -r799 | lh3lh3 | 2010-11-08 14:28:27 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - -fixed a silly bug in the indel caller - ------------------------------------------------------------------------- -r798 | lh3lh3 | 2010-11-08 14:07:33 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/sam_view.c - M /trunk/samtools/samtools.1 - -Incorporate patches by Marcel Martin for read counting. 
- ------------------------------------------------------------------------- -r797 | lh3lh3 | 2010-11-08 13:39:52 -0500 (Mon, 08 Nov 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-2 (r797) - * mpileup: indel calling seems to be working - ------------------------------------------------------------------------- -r796 | lh3lh3 | 2010-11-08 10:54:46 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/kaln.c - -indel calling is apparently working, but more information needs to be collected - ------------------------------------------------------------------------- -r795 | lh3lh3 | 2010-11-08 00:39:18 -0500 (Mon, 08 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf_indel.c - -fixed a few bugs in the indel caller. Probably there are more. - ------------------------------------------------------------------------- -r794 | lh3lh3 | 2010-11-07 22:23:16 -0500 (Sun, 07 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.h - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - A /trunk/samtools/bam2bcf_indel.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - -prepare for the indel caller. It is not ready yet. - ------------------------------------------------------------------------- -r793 | lh3lh3 | 2010-11-05 11:28:23 -0400 (Fri, 05 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - -Revert to r790. The recent changes are not good... 
- ------------------------------------------------------------------------- -r792 | lh3lh3 | 2010-11-05 00:19:14 -0400 (Fri, 05 Nov 2010) | 6 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - - * this revision is UNSTABLE - - * indel caller seems working, but it is very insensitive and has - several things I do not quite understand. - - ------------------------------------------------------------------------- -r791 | lh3lh3 | 2010-11-04 22:58:43 -0400 (Thu, 04 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - -for backup. no effective changes - ------------------------------------------------------------------------- -r790 | lh3lh3 | 2010-11-03 15:51:24 -0400 (Wed, 03 Nov 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/kprobaln.c - -fixed a minor problem in the example coming with kprobaln.c - ------------------------------------------------------------------------- -r789 | lh3lh3 | 2010-11-02 15:41:27 -0400 (Tue, 02 Nov 2010) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_md.c - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - A /trunk/samtools/kprobaln.c - A /trunk/samtools/kprobaln.h - -Separate kaln and kprobaln as I am preparing further changes. At -present, the results should be identical to the previous. 
- - ------------------------------------------------------------------------- -r788 | petulda | 2010-11-02 12:19:04 -0400 (Tue, 02 Nov 2010) | 1 line -Changed paths: - M /trunk/samtools/bam_plcmd.c - -Added -b option: read file names from a file ------------------------------------------------------------------------- -r787 | lh3lh3 | 2010-10-29 23:17:22 -0400 (Fri, 29 Oct 2010) | 7 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.9-2 (r787) - - * Allow to set a maximum per-sample depth to reduce memory. However, - BAQ computation is still applied to every read. The speed is not - improved. - - ------------------------------------------------------------------------- -r786 | lh3lh3 | 2010-10-29 12:10:40 -0400 (Fri, 29 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/vcf.c - - * samtools-0.1.9-1 (r786) - * samtools: optionally perform exact test for each sample - ------------------------------------------------------------------------- -r785 | lh3lh3 | 2010-10-29 09:42:25 -0400 (Fri, 29 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bcftools/bcf.c - -Optionally output "DP", the individual read depth - ------------------------------------------------------------------------- -r784 | lh3lh3 | 2010-10-27 23:10:27 -0400 (Wed, 27 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -acknowledge Petr and John who have greatly contributed to the project. 
- ------------------------------------------------------------------------- -r783 | lh3lh3 | 2010-10-27 22:47:47 -0400 (Wed, 27 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.9 (r783) - ------------------------------------------------------------------------- -r782 | lh3lh3 | 2010-10-27 19:58:54 -0400 (Wed, 27 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -fixed a silly bug in pileup - ------------------------------------------------------------------------- -r781 | lh3lh3 | 2010-10-27 14:39:48 -0400 (Wed, 27 Oct 2010) | 5 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.8-22 (r781) - * made BAQ the default behavior of mpileup - * updated manual - * in merge, force to exit given inconsistent header when "-R" is not in use. 
- ------------------------------------------------------------------------- -r780 | lh3lh3 | 2010-10-27 11:01:11 -0400 (Wed, 27 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-21 (r780) - * minor speedup to pileup - ------------------------------------------------------------------------- -r779 | lh3lh3 | 2010-10-27 09:58:56 -0400 (Wed, 27 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/examples/toy.sam - -improve pileup a little bit - ------------------------------------------------------------------------- -r778 | lh3lh3 | 2010-10-27 00:14:43 -0400 (Wed, 27 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-20 (r778) - * speed up pileup, although I do not know how much is the improvement - ------------------------------------------------------------------------- -r777 | lh3lh3 | 2010-10-26 17:26:04 -0400 (Tue, 26 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/examples/Makefile - - * samtools-0.1.8-19 (r777) - * integrate mpileup features to pileup: min_baseQ, capQ, prob_realn, paired-only and biased prior - ------------------------------------------------------------------------- -r776 | lh3lh3 | 2010-10-26 15:27:46 -0400 (Tue, 26 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - -remove local realignment (probabilistic realignment is still there) - ------------------------------------------------------------------------- -r774 | jmarshall | 2010-10-21 06:52:38 -0400 (Thu, 21 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/sam_view.c - -Add the relevant filename or region to error 
messages, and cause a failure -exit status where appropriate. Based on a patch provided by Marcel Martin. - ------------------------------------------------------------------------- -r773 | lh3lh3 | 2010-10-19 19:44:31 -0400 (Tue, 19 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/examples/toy.sam - M /trunk/samtools/kaln.c - - * Minor code changes. No real effect. - * change quality to 30 in toy.sam - ------------------------------------------------------------------------- -r772 | lh3lh3 | 2010-10-18 23:40:13 -0400 (Mon, 18 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/examples/toy.fa - M /trunk/samtools/examples/toy.sam - -added another toy example - ------------------------------------------------------------------------- -r771 | lh3lh3 | 2010-10-13 23:32:12 -0400 (Wed, 13 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/ld.c - M /trunk/samtools/bcftools/vcfutils.pl - -improve the LD statistics - ------------------------------------------------------------------------- -r770 | lh3lh3 | 2010-10-12 23:49:26 -0400 (Tue, 12 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcfutils.pl - - * a minor fix to the -L option - * add ldstats to vcfutils.pl - ------------------------------------------------------------------------- -r769 | lh3lh3 | 2010-10-12 15:51:57 -0400 (Tue, 12 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - -a minor change - ------------------------------------------------------------------------- -r768 | lh3lh3 | 2010-10-12 15:49:06 -0400 (Tue, 12 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - A /trunk/samtools/bcftools/ld.c - -forget to add the key file - ------------------------------------------------------------------------- -r767 | lh3lh3 | 2010-10-12 15:48:46 -0400 (Tue, 12 Oct 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M 
/trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/vcfutils.pl - - * vcfutils.pl: fixed a typo in help message - * added APIs: bcf_append_info() and bcf_cpy() - * calculate adjacent LD - ------------------------------------------------------------------------- -r766 | lh3lh3 | 2010-10-11 11:06:40 -0400 (Mon, 11 Oct 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -added filter for samtools/bcftools generated VCFs - ------------------------------------------------------------------------- -r765 | lh3lh3 | 2010-10-05 14:05:18 -0400 (Tue, 05 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/kaln.c - - * removed a comment line in kaln.c - * vcfutils.pl fillac works when GT is not the first field - ------------------------------------------------------------------------- -r764 | petulda | 2010-10-05 08:59:36 -0400 (Tue, 05 Oct 2010) | 1 line -Changed paths: - A /trunk/samtools/bcftools/bcf-fix.pl - -Convert VCF output of "bcftools view -bgcv" to a valid VCF file ------------------------------------------------------------------------- -r763 | lh3lh3 | 2010-10-02 22:51:03 -0400 (Sat, 02 Oct 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/bcftools/bcftools.1 - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.8-18 (r763) - * added bcftools manual page - * minor fix to mpileup and view command lines - ------------------------------------------------------------------------- -r762 | lh3lh3 | 2010-10-02 21:46:25 -0400 (Sat, 02 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcfutils.pl - - * vcfutils.pl qstats: calculate marginal ts/tv - * allow to call genotypes at variant sites -
------------------------------------------------------------------------- -r761 | lh3lh3 | 2010-10-01 00:29:55 -0400 (Fri, 01 Oct 2010) | 3 lines -Changed paths: - M /trunk/samtools/kaln.c - M /trunk/samtools/misc/HmmGlocal.java - -I am changing the gap open probability back to 0.001. It seems that -being conservative here is a good thing... - ------------------------------------------------------------------------- -r760 | lh3lh3 | 2010-10-01 00:11:27 -0400 (Fri, 01 Oct 2010) | 5 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/kaln.c - A /trunk/samtools/misc/HmmGlocal.java - - * samtools-0.1.8-17 (r760) - * the default gap open penalty is too small (a typo) - * added comments on hmm_realn - * Java implementation - ------------------------------------------------------------------------- -r759 | lh3lh3 | 2010-09-30 10:12:54 -0400 (Thu, 30 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -mark samtools-0.1.8-16 (r759) - ------------------------------------------------------------------------- -r758 | lh3lh3 | 2010-09-30 10:12:02 -0400 (Thu, 30 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -round to the nearest integer - ------------------------------------------------------------------------- -r757 | lh3lh3 | 2010-09-28 17:16:43 -0400 (Tue, 28 Sep 2010) | 4 lines -Changed paths: - M /trunk/samtools/kaln.c - -I was trying to accelerate ka_prob_glocal() as this will be the -bottleneck. After an hour, the only gain is to change division to -multiplication. OK. I will stop. - ------------------------------------------------------------------------- -r756 | lh3lh3 | 2010-09-28 16:57:49 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -this is interesting. 
multiplication is much faster than division, at least on my Mac - ------------------------------------------------------------------------- -r755 | lh3lh3 | 2010-09-28 16:19:13 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -minor changes - ------------------------------------------------------------------------- -r754 | lh3lh3 | 2010-09-28 15:44:16 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/kaln.c - -prob_realn() seems working! - ------------------------------------------------------------------------- -r753 | lh3lh3 | 2010-09-28 12:48:23 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -minor - ------------------------------------------------------------------------- -r752 | lh3lh3 | 2010-09-28 12:47:41 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - -Convert phredQ to probabilities - ------------------------------------------------------------------------- -r751 | lh3lh3 | 2010-09-28 12:32:08 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - -Implement the glocal HMM; discard the extension HMM - ------------------------------------------------------------------------- -r750 | lh3lh3 | 2010-09-28 00:06:11 -0400 (Tue, 28 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -improve numerical stability - ------------------------------------------------------------------------- -r749 | lh3lh3 | 2010-09-27 23:27:54 -0400 (Mon, 27 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -more comments - ------------------------------------------------------------------------- -r748 | lh3lh3 | 2010-09-27 23:17:16 -0400 (Mon, 27 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -fixed a bug in banded DP -
------------------------------------------------------------------------- -r747 | lh3lh3 | 2010-09-27 23:05:12 -0400 (Mon, 27 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/kaln.c - - * fixed that weird issue. - * the banded version is NOT working - ------------------------------------------------------------------------- -r746 | lh3lh3 | 2010-09-27 22:57:05 -0400 (Mon, 27 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/kaln.c - -More comments. This version seems working, but something is a little weird... - ------------------------------------------------------------------------- -r745 | lh3lh3 | 2010-09-27 17:21:40 -0400 (Mon, 27 Sep 2010) | 6 lines -Changed paths: - M /trunk/samtools/kaln.c - -A little code cleanup. Now the forward and backward algorithms give -nearly identical P(x), which means both are close to the correct -forms. However, I have only tested on toy examples. Minor errors in -the implementation may not be obvious. - - ------------------------------------------------------------------------- -r744 | lh3lh3 | 2010-09-27 16:55:15 -0400 (Mon, 27 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - -... - ------------------------------------------------------------------------- -r743 | jmarshall | 2010-09-27 08:19:06 -0400 (Mon, 27 Sep 2010) | 6 lines -Changed paths: - M /trunk/samtools/bam_sort.c - -Abort if merge -h's INH.SAM cannot be opened, just as we abort -if any of the IN#.BAM input files cannot be opened. - -Also propagate any error indication returned by bam_merge_core() -to samtools merge's exit status.
- ------------------------------------------------------------------------- -r741 | jmarshall | 2010-09-24 11:08:24 -0400 (Fri, 24 Sep 2010) | 5 lines -Changed paths: - M /trunk/samtools/bam_index.c - -Use bam_validate1() to detect garbage records in the event of a corrupt -BAI index file that causes a bam_seek() to an invalid position. At most -one record (namely, the bam_iter_read terminator) is tested per bam_fetch() -call, so the cost is insignificant in the normal case. - ------------------------------------------------------------------------- -r740 | jmarshall | 2010-09-24 11:00:19 -0400 (Fri, 24 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - -Add bam_validate1(). - ------------------------------------------------------------------------- -r739 | lh3lh3 | 2010-09-22 12:07:50 -0400 (Wed, 22 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-15 (r379) - * allow to change capQ parameter in calmd - ------------------------------------------------------------------------- -r738 | jmarshall | 2010-09-22 11:15:33 -0400 (Wed, 22 Sep 2010) | 13 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/sam_view.c - -When bam_read1() returns an error (return value <= -2), propagate that error -to bam_iter_read()'s own return value. Similarly, also propagate it up to -bam_fetch()'s return value. Previously bam_fetch() always returned 0, and -callers ignored its return value anyway. With this change, 0 continues to -indicate success, while <= -2 (which can be written as < 0, as -1 is never -returned) indicates corrupted input. - -bam_iter_read() ought also to propagate errors returned by bam_seek(). - -main_samview() can now print an error message and fail when bam_fetch() -detects that a .bai index file is corrupted or otherwise does not correspond -to the .bam file it is being used with. 
- ------------------------------------------------------------------------- -r737 | jmarshall | 2010-09-22 10:47:42 -0400 (Wed, 22 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - -0 is a successful return value from bam_read1(). (In practice, it never -returns 0 anyway; but all the other callers treat 0 as successful.) - ------------------------------------------------------------------------- -r736 | lh3lh3 | 2010-09-20 17:43:08 -0400 (Mon, 20 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_sort.c - - * merge files region-by-region. work on small examples but more tests are needed. - ------------------------------------------------------------------------- -r735 | lh3lh3 | 2010-09-20 16:56:24 -0400 (Mon, 20 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -improve qstats by checking the alleles as well - ------------------------------------------------------------------------- -r734 | lh3lh3 | 2010-09-17 18:12:13 -0400 (Fri, 17 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -convert UCSC SNP SQL dump to VCF - ------------------------------------------------------------------------- -r733 | lh3lh3 | 2010-09-17 13:02:11 -0400 (Fri, 17 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -hapmap2vcf convertor - ------------------------------------------------------------------------- -r732 | lh3lh3 | 2010-09-17 10:11:37 -0400 (Fri, 17 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/vcf.c - - * added comments - * VCF->BCF is not possible without knowing the sequence dictionary before hand... 
- ------------------------------------------------------------------------- -r731 | lh3lh3 | 2010-09-17 09:15:53 -0400 (Fri, 17 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/vcf.c - - * put n_smpl to "bcf1_t" to simplify API a little - ------------------------------------------------------------------------- -r730 | lh3lh3 | 2010-09-16 21:36:01 -0400 (Thu, 16 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/index.c - -fixed a bug in indexing - ------------------------------------------------------------------------- -r729 | lh3lh3 | 2010-09-16 16:54:48 -0400 (Thu, 16 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_pileup.c - - * fixed a bug in capQ - * valgrind identifies a use of uninitialised value, but I have not fixed it. 
- ------------------------------------------------------------------------- -r728 | lh3lh3 | 2010-09-16 15:03:59 -0400 (Thu, 16 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bgzip.c - M /trunk/samtools/razip.c - - * fixed a bug in razip: -c will delete the input file - * copy tabix/bgzip to here - ------------------------------------------------------------------------- -r727 | lh3lh3 | 2010-09-16 13:45:49 -0400 (Thu, 16 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-14 (r727) - * allow to change the capQ parameter at the command line - ------------------------------------------------------------------------- -r726 | lh3lh3 | 2010-09-16 13:38:43 -0400 (Thu, 16 Sep 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bcftools/vcfutils.pl - M /trunk/samtools/misc/samtools.pl - - * added varFilter to vcfutils.pl - * reimplement realn(). 
now it performs a local alignment - * added cap_mapQ() to cap mapping quality when there are many substitutions - ------------------------------------------------------------------------- -r724 | lh3lh3 | 2010-09-15 00:18:31 -0400 (Wed, 15 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - A /trunk/samtools/bcftools/bcf2qcall.c - M /trunk/samtools/bcftools/call1.c - - * convert BCF to QCALL input - ------------------------------------------------------------------------- -r723 | lh3lh3 | 2010-09-14 22:41:50 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - -dynamic band width in realignment - ------------------------------------------------------------------------- -r722 | lh3lh3 | 2010-09-14 22:05:32 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - -fixed a bug in realignment - ------------------------------------------------------------------------- -r721 | lh3lh3 | 2010-09-14 20:54:09 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/prob1.c - -fixed a minor issue - ------------------------------------------------------------------------- -r720 | lh3lh3 | 2010-09-14 19:25:10 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_md.c - -fixed a bug in realignment - ------------------------------------------------------------------------- -r719 | lh3lh3 | 2010-09-14 19:18:24 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -minor changes. It is BUGGY now! - ------------------------------------------------------------------------- -r718 | lh3lh3 | 2010-09-14 16:32:33 -0400 (Tue, 14 Sep 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/kaln.c - M /trunk/samtools/kaln.h - - * aggressive gapped aligner is implemented in calmd. 
- * distinguish gap_open and gap_end_open in banded alignment - * make tview accepts alignment with heading and tailing D - ------------------------------------------------------------------------- -r717 | jmarshall | 2010-09-14 09:04:28 -0400 (Tue, 14 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools - -Add svn:ignore properties for generated files that don't appear in "make all". - ------------------------------------------------------------------------- -r716 | jmarshall | 2010-09-13 08:37:53 -0400 (Mon, 13 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools - M /trunk/samtools/bcftools - M /trunk/samtools/misc - -Add svn:ignore properties listing the generated files. -(Except for *.o, which we'll assume is in global-ignores.) - ------------------------------------------------------------------------- -r715 | lh3lh3 | 2010-09-08 12:53:55 -0400 (Wed, 08 Sep 2010) | 5 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/sample.c - M /trunk/samtools/sample.h - - * samtools-0.1.8-13 (r715) - * fixed a bug in identifying SM across files - * bcftools: estimate heterozygosity - * bcftools: allow to skip sites without reference bases - ------------------------------------------------------------------------- -r713 | lh3lh3 | 2010-09-03 17:19:12 -0400 (Fri, 03 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -quite a lot changes to the contrast caller, but I still feel something is missing... 
- ------------------------------------------------------------------------- -r711 | lh3lh3 | 2010-09-03 00:30:48 -0400 (Fri, 03 Sep 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/vcfutils.pl - - * changed 3.434 to 4.343 (typo!) - * fixed a bug in the contrast caller - * calculate heterozygosity - ------------------------------------------------------------------------- -r710 | lh3lh3 | 2010-09-01 23:24:47 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - -SNP calling from the GL field - ------------------------------------------------------------------------- -r709 | lh3lh3 | 2010-09-01 18:52:30 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcf.c - -fixed another problem - ------------------------------------------------------------------------- -r708 | lh3lh3 | 2010-09-01 18:31:17 -0400 (Wed, 01 Sep 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/vcf.c - - * fixed bugs in parsing VCF - * parser now works with GT/GQ/DP/PL/GL - ------------------------------------------------------------------------- -r707 | lh3lh3 | 2010-09-01 15:28:29 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/prob1.c - -Do not compile _BCF_QUAD by default - ------------------------------------------------------------------------- -r706 | lh3lh3 | 2010-09-01 15:21:41 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/call1.c - -Write the correct ALT and PL in the SNP calling mode. 
- ------------------------------------------------------------------------- -r705 | lh3lh3 | 2010-09-01 12:50:33 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfutils.pl - -more commands for my own uses - ------------------------------------------------------------------------- -r704 | lh3lh3 | 2010-09-01 09:26:10 -0400 (Wed, 01 Sep 2010) | 2 lines -Changed paths: - A /trunk/samtools/bcftools/vcfutils.pl - -Utilities for processing VCF - ------------------------------------------------------------------------- -r703 | lh3lh3 | 2010-08-31 16:44:57 -0400 (Tue, 31 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -preliminary contrast variant caller - ------------------------------------------------------------------------- -r702 | lh3lh3 | 2010-08-31 12:28:39 -0400 (Tue, 31 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/call1.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - -z' and z'' can be calculated - ------------------------------------------------------------------------- -r701 | lh3lh3 | 2010-08-31 10:20:57 -0400 (Tue, 31 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - A /trunk/samtools/bcftools/call1.c (from /trunk/samtools/bcftools/vcfout.c:699) - M /trunk/samtools/bcftools/prob1.c - D /trunk/samtools/bcftools/vcfout.c - - * rename vcfout.c as call1.c - * prepare to add two-sample comparison - ------------------------------------------------------------------------- -r699 | lh3lh3 | 2010-08-24 15:28:16 -0400 (Tue, 24 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfout.c - -fixed a bug in calculating the t statistics - ------------------------------------------------------------------------- -r698 | lh3lh3 | 2010-08-24 14:05:50 -0400 (Tue, 24 Aug 2010) | 3 lines -Changed paths: - M 
/trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcftools/kfunc.c - M /trunk/samtools/bcftools/vcfout.c - - * samtools-0.1.8-13 (r698) - * perform one-tailed t-test for baseQ, mapQ and endDist - ------------------------------------------------------------------------- -r697 | lh3lh3 | 2010-08-24 12:30:13 -0400 (Tue, 24 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/kfunc.c - -added regularized incomplete beta function - ------------------------------------------------------------------------- -r695 | lh3lh3 | 2010-08-23 17:36:17 -0400 (Mon, 23 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - -change the default correlation coefficient - ------------------------------------------------------------------------- -r694 | lh3lh3 | 2010-08-23 14:46:52 -0400 (Mon, 23 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/vcfout.c - -print QUAL as floating numbers - ------------------------------------------------------------------------- -r693 | lh3lh3 | 2010-08-23 14:06:07 -0400 (Mon, 23 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/examples/Makefile - A /trunk/samtools/sample.c - A /trunk/samtools/sample.h - - * samtools-0.1.8-12 (r692) - * group data by samples in "mpileup -g" - ------------------------------------------------------------------------- -r692 | lh3lh3 | 2010-08-23 10:58:53 -0400 (Mon, 23 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - D /trunk/samtools/bam_mcns.c - D /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - -remove VCF output in mpileup - ------------------------------------------------------------------------- -r691 | lh3lh3 | 2010-08-23 10:48:20 -0400 (Mon, 23 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M 
/trunk/samtools/bam2bcf.h - - * use the revised MAQ error model for mpileup - * prepare to remove the independent model from mpileup - ------------------------------------------------------------------------- -r690 | lh3lh3 | 2010-08-20 15:46:40 -0400 (Fri, 20 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - A /trunk/samtools/errmod.c - A /trunk/samtools/errmod.h - M /trunk/samtools/ksort.h - -added revised MAQ error model - ------------------------------------------------------------------------- -r689 | lh3lh3 | 2010-08-18 09:55:20 -0400 (Wed, 18 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - M /trunk/samtools/bcftools/vcfout.c - -allow to read the prior from the error output. EM iteration is working. - ------------------------------------------------------------------------- -r688 | lh3lh3 | 2010-08-17 12:12:20 -0400 (Tue, 17 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/main.c - M /trunk/samtools/bcftools/vcf.c - - * write a little more VCF header - * concatenate BCFs - ------------------------------------------------------------------------- -r687 | lh3lh3 | 2010-08-16 20:53:16 -0400 (Mon, 16 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcf.tex - -use float for QUAL - ------------------------------------------------------------------------- -r686 | lh3lh3 | 2010-08-14 00:11:13 -0400 (Sat, 14 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/prob1.c - -faster for large sample size (in principle) - ------------------------------------------------------------------------- -r685 | lh3lh3 | 2010-08-13 23:28:31 -0400 (Fri, 13 Aug 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/prob1.c - - * a numerically stable method 
to calculate z_{jk} - * currently slower than the old method but will be important for large sample size - * in principle, we can speed up for large n, but have not tried - ------------------------------------------------------------------------- -r684 | lh3lh3 | 2010-08-11 21:58:31 -0400 (Wed, 11 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfout.c - -fixed an issue in parsing integer - ------------------------------------------------------------------------- -r683 | lh3lh3 | 2010-08-09 13:05:07 -0400 (Mon, 09 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - -do not print refname if file is converted from VCF - ------------------------------------------------------------------------- -r682 | lh3lh3 | 2010-08-09 12:59:47 -0400 (Mon, 09 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/vcf.c - - * parse PL - * fixed a bug in parsing VCF - ------------------------------------------------------------------------- -r681 | lh3lh3 | 2010-08-09 12:49:23 -0400 (Mon, 09 Aug 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/main.c - M /trunk/samtools/bcftools/vcf.c - M /trunk/samtools/bcftools/vcfout.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/kstring.c - - * fixed a bug in kstrtok@kstring.c - * preliminary VCF parser (not parse everything for now) - * improved view interface - ------------------------------------------------------------------------- -r680 | lh3lh3 | 2010-08-09 10:43:13 -0400 (Mon, 09 Aug 2010) | 4 lines -Changed paths: - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/vcfout.c - M /trunk/samtools/kstring.c - M /trunk/samtools/kstring.h - - * improved kstring (added kstrtok) - * removed the limit on the format string length in bcftools - * use kstrtok to parse format which fixed a bug in the old code - 
------------------------------------------------------------------------- -r679 | lh3lh3 | 2010-08-09 01:12:05 -0400 (Mon, 09 Aug 2010) | 2 lines -Changed paths: - A /trunk/samtools/bcftools/README - M /trunk/samtools/bcftools/vcfout.c - -help messages - ------------------------------------------------------------------------- -r678 | lh3lh3 | 2010-08-09 00:01:52 -0400 (Mon, 09 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcfout.c - -perform single-tail test for ED4 - ------------------------------------------------------------------------- -r677 | lh3lh3 | 2010-08-08 23:48:35 -0400 (Sun, 08 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/kfunc.c - M /trunk/samtools/bcftools/vcfout.c - - * test depth, end distance and HWE - ------------------------------------------------------------------------- -r676 | lh3lh3 | 2010-08-08 02:04:15 -0400 (Sun, 08 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/kfunc.c - -reimplement incomplete gamma functions. 
no copy-paste - ------------------------------------------------------------------------- -r675 | lh3lh3 | 2010-08-06 22:42:54 -0400 (Fri, 06 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bcftools/fet.c - M /trunk/samtools/bcftools/prob1.c - M /trunk/samtools/bcftools/prob1.h - M /trunk/samtools/bcftools/vcfout.c - - * bcftools: add HWE (no testing for now) - * record end dist in a 2x2 table, not avg, std any more - ------------------------------------------------------------------------- -r674 | lh3lh3 | 2010-08-06 17:30:16 -0400 (Fri, 06 Aug 2010) | 3 lines -Changed paths: - A /trunk/samtools/bcftools/kfunc.c - - * Special functions: log(gamma()), erfc(), P(a,x) (incomplete gamma) - * Not using Numerical Recipe due to licensing issues - ------------------------------------------------------------------------- -r673 | lh3lh3 | 2010-08-05 23:46:53 -0400 (Thu, 05 Aug 2010) | 2 lines -Changed paths: - A /trunk/samtools/bcftools/fet.c - -Fisher's exact test - ------------------------------------------------------------------------- -r672 | lh3lh3 | 2010-08-05 21:48:33 -0400 (Thu, 05 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bamtk.c - M /trunk/samtools/examples/Makefile - - * samtools-0.1.8-11 (r672) - * collect more stats for allele balance test in bcftools (not yet) - ------------------------------------------------------------------------- -r671 | lh3lh3 | 2010-08-05 16:17:58 -0400 (Thu, 05 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/main.c - - * the code base is stablized again. 
- * I will delay the vcf parser, which is quite complicated but with little value for now - ------------------------------------------------------------------------- -r670 | lh3lh3 | 2010-08-05 16:03:23 -0400 (Thu, 05 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/examples/Makefile - -minor - ------------------------------------------------------------------------- -r669 | lh3lh3 | 2010-08-05 16:03:08 -0400 (Thu, 05 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcftools/vcf.c - -unfinished vcf parser - ------------------------------------------------------------------------- -r668 | lh3lh3 | 2010-08-05 15:46:40 -0400 (Thu, 05 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bcftools/Makefile - M /trunk/samtools/bcftools/bcf.c - M /trunk/samtools/bcftools/bcf.h - M /trunk/samtools/bcftools/bcfutils.c - M /trunk/samtools/bcftools/index.c - M /trunk/samtools/bcftools/main.c - A /trunk/samtools/bcftools/vcf.c - M /trunk/samtools/bcftools/vcfout.c - - * added preliminary VCF parser (not finished) - * change struct a bit - ------------------------------------------------------------------------- -r667 | lh3lh3 | 2010-08-03 22:35:27 -0400 (Tue, 03 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bcftools/bcf.c - - * allow to set min base q - * fixed a bug in mpileup -u - ------------------------------------------------------------------------- -r666 | lh3lh3 | 2010-08-03 22:08:44 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - A /trunk/samtools/bcftools/bcf.tex - -spec - ------------------------------------------------------------------------- -r665 | lh3lh3 | 2010-08-03 21:18:57 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/examples/Makefile - -added more examples - ------------------------------------------------------------------------- -r664 | lh3lh3 | 2010-08-03 21:13:00 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed
paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bcftools/Makefile - -fixed compilation error - ------------------------------------------------------------------------- -r662 | lh3lh3 | 2010-08-03 21:04:00 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - D /trunk/samtools/bcf.c - D /trunk/samtools/bcf.h - A /trunk/samtools/bcftools - A /trunk/samtools/bcftools/Makefile - A /trunk/samtools/bcftools/bcf.c - A /trunk/samtools/bcftools/bcf.h - A /trunk/samtools/bcftools/bcfutils.c - A /trunk/samtools/bcftools/index.c - A /trunk/samtools/bcftools/main.c - A /trunk/samtools/bcftools/prob1.c - A /trunk/samtools/bcftools/prob1.h - A /trunk/samtools/bcftools/vcfout.c - -move bcftools to samtools - ------------------------------------------------------------------------- -r660 | lh3lh3 | 2010-08-03 15:58:32 -0400 (Tue, 03 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - -fixed another minor bug - ------------------------------------------------------------------------- -r658 | lh3lh3 | 2010-08-03 15:06:45 -0400 (Tue, 03 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bcf.c - - * samtools-0.1.8-10 (r658) - * fixed a bug in bam2bcf when the reference is N - ------------------------------------------------------------------------- -r657 | lh3lh3 | 2010-08-03 14:50:23 -0400 (Tue, 03 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - - * fixed a bug - * treat ambiguous ref base as the fifth base - ------------------------------------------------------------------------- -r654 | lh3lh3 | 2010-08-02 17:38:27 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/bcftools/bcf.c - M /trunk/samtools/bcf.c - -missing a column in VCF output... 
- ------------------------------------------------------------------------- -r653 | lh3lh3 | 2010-08-02 17:31:33 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcf.c - -fixed a memory leak - ------------------------------------------------------------------------- -r651 | lh3lh3 | 2010-08-02 17:27:31 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bcf.c - -fixed a bug in bcf reader - ------------------------------------------------------------------------- -r650 | lh3lh3 | 2010-08-02 17:00:41 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam2bcf.c - -fixed a bug - ------------------------------------------------------------------------- -r649 | lh3lh3 | 2010-08-02 16:49:35 -0400 (Mon, 02 Aug 2010) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam2bcf.c - M /trunk/samtools/bam2bcf.h - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-9 (r649) - * lossless representation of PL in BCF output - ------------------------------------------------------------------------- -r648 | lh3lh3 | 2010-08-02 16:07:25 -0400 (Mon, 02 Aug 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - A /trunk/samtools/bam2bcf.c - A /trunk/samtools/bam2bcf.h - M /trunk/samtools/bam_plcmd.c - A /trunk/samtools/bcf.c - A /trunk/samtools/bcf.h - -Generate binary VCF - ------------------------------------------------------------------------- -r644 | lh3lh3 | 2010-07-28 11:59:19 -0400 (Wed, 28 Jul 2010) | 5 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-8 (r644) - * mpileup becomes a little stable again - * the method is slightly different, but is more theoretically correct - * snp calling is O(n^2) instead of O(n^3) - ------------------------------------------------------------------------- -r643 | lh3lh3 | 2010-07-28 11:54:15 -0400 (Wed, 28 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - - * fixed a STUPID bug, which 
cost me a lot of time. - * I am going to clean up mcns a little bit - ------------------------------------------------------------------------- -r642 | lh3lh3 | 2010-07-27 23:23:07 -0400 (Tue, 27 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - -supposedly this is THE correct implementation, but more testing is needed - ------------------------------------------------------------------------- -r641 | lh3lh3 | 2010-07-27 22:43:39 -0400 (Tue, 27 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - -NOT ready yet. Going to make further changes... - ------------------------------------------------------------------------- -r639 | lh3lh3 | 2010-07-25 22:18:38 -0400 (Sun, 25 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-7 (r639) - * fixed the reference allele assignment - ------------------------------------------------------------------------- -r638 | lh3lh3 | 2010-07-25 12:01:26 -0400 (Sun, 25 Jul 2010) | 5 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-6 (r638) - * skip isnan/isinf in case of float underflow - * added the flat prior - * fixed an issue where there are no reads supporting the reference - ------------------------------------------------------------------------- -r637 | lh3lh3 | 2010-07-24 14:16:27 -0400 (Sat, 24 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -minor changes - ------------------------------------------------------------------------- -r636 | lh3lh3 | 2010-07-24 14:07:27 -0400 (Sat, 24 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - -minor tweaks - 
------------------------------------------------------------------------- -r635 | lh3lh3 | 2010-07-24 01:49:49 -0400 (Sat, 24 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - -posterior expectation FINALLY working. I am so tired... - ------------------------------------------------------------------------- -r633 | lh3lh3 | 2010-07-23 13:50:48 -0400 (Fri, 23 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -another minor fix to mpileup - ------------------------------------------------------------------------- -r632 | lh3lh3 | 2010-07-23 13:43:31 -0400 (Fri, 23 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -added the format column - ------------------------------------------------------------------------- -r631 | lh3lh3 | 2010-07-23 13:25:44 -0400 (Fri, 23 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - -added an alternative prior - ------------------------------------------------------------------------- -r628 | lh3lh3 | 2010-07-23 11:48:51 -0400 (Fri, 23 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - -calculate posterior allele frequency - ------------------------------------------------------------------------- -r627 | lh3lh3 | 2010-07-22 21:39:13 -0400 (Thu, 22 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-3 (r627) - * multi-sample snp calling appears to work. More tests needed. 
- ------------------------------------------------------------------------- -r626 | lh3lh3 | 2010-07-22 16:37:56 -0400 (Thu, 22 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_tview.c - - * preliminary multisample SNP caller. - * something looks not so right, but it largely works - ------------------------------------------------------------------------- -r617 | lh3lh3 | 2010-07-14 16:26:27 -0400 (Wed, 14 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_mcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.8-2 (r617) - * allele frequency calculation apparently works... - ------------------------------------------------------------------------- -r616 | lh3lh3 | 2010-07-14 13:33:51 -0400 (Wed, 14 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - A /trunk/samtools/bam_mcns.c - A /trunk/samtools/bam_mcns.h - M /trunk/samtools/bam_plcmd.c - - * added multi-sample framework. It is not working, yet.
- * improved the mpileup interface - ------------------------------------------------------------------------- -r615 | lh3lh3 | 2010-07-13 14:50:12 -0400 (Tue, 13 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/Makefile - - * samtools-0.1.8-1 (r615) - * allow to get mpileup at required sites - ------------------------------------------------------------------------- -r613 | lh3lh3 | 2010-07-11 22:40:56 -0400 (Sun, 11 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.8 - ------------------------------------------------------------------------- -r612 | lh3lh3 | 2010-07-11 21:08:56 -0400 (Sun, 11 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -fixed a compiling issue for Windows - ------------------------------------------------------------------------- -r611 | lh3lh3 | 2010-07-11 20:59:15 -0400 (Sun, 11 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_sort.c - -fixed a bug in sorting when output to stdout (by Peter Chines) - ------------------------------------------------------------------------- -r610 | lh3lh3 | 2010-07-09 17:05:10 -0400 (Fri, 09 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bam_plcmd.c - -change the command line option of pileup - ------------------------------------------------------------------------- -r609 | lh3lh3 | 2010-07-09 00:39:34 -0400 (Fri, 09 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - A /trunk/samtools/examples/toy.fa - A /trunk/samtools/examples/toy.sam - -make pileup work with CIGAR with I/D at the beginning or in the end - ------------------------------------------------------------------------- -r608 | lh3lh3 | 2010-07-08 22:36:12 -0400 (Thu, 08 Jul 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c 
- M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_tview.c - - * make tview more friendly - * a temporary remedy for an issue in indel calling - ------------------------------------------------------------------------- -r607 | lh3lh3 | 2010-07-08 14:43:52 -0400 (Thu, 08 Jul 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-r607 - * improved the genotype accuracy for indels - * use the SOAPsnp model for SNP calling by default. - ------------------------------------------------------------------------- -r606 | lh3lh3 | 2010-07-08 01:05:19 -0400 (Thu, 08 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/misc/Makefile - -removed a debugging example - ------------------------------------------------------------------------- -r605 | lh3lh3 | 2010-07-08 01:04:09 -0400 (Thu, 08 Jul 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-18 (r605) - * fixed an issue when a deletion and mismatch occur at the same time - and the base quality is higher than 40 (if -I40).
- ------------------------------------------------------------------------- -r604 | lh3lh3 | 2010-07-02 19:32:24 -0400 (Fri, 02 Jul 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/misc/Makefile - -fixed a minor bug in idxstats - ------------------------------------------------------------------------- -r601 | lh3lh3 | 2010-06-16 09:03:59 -0400 (Wed, 16 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -fixed a minor bug in indexing - ------------------------------------------------------------------------- -r600 | lh3lh3 | 2010-06-15 10:17:53 -0400 (Tue, 15 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam.c - -change printf() to puts in exporting - ------------------------------------------------------------------------- -r599 | lh3lh3 | 2010-06-13 21:41:11 -0400 (Sun, 13 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -minor fix. No actual effect. 
- ------------------------------------------------------------------------- -r598 | lh3lh3 | 2010-06-13 21:32:45 -0400 (Sun, 13 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -added Makefile targets to compile shared/dynamic library - ------------------------------------------------------------------------- -r596 | lh3lh3 | 2010-06-13 19:48:07 -0400 (Sun, 13 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-17 (r596) - * also keep the number of coor-less reads in the index file - ------------------------------------------------------------------------- -r595 | lh3lh3 | 2010-06-13 18:54:26 -0400 (Sun, 13 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-16 (r595) - * write additional information to bam index - ------------------------------------------------------------------------- -r594 | lh3lh3 | 2010-06-13 17:29:52 -0400 (Sun, 13 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -fixed a bug for unmapped sequences in indexing - ------------------------------------------------------------------------- -r593 | lh3lh3 | 2010-06-12 18:11:32 -0400 (Sat, 12 Jun 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/samtools.1 - -rename iterf as iter - ------------------------------------------------------------------------- -r592 | lh3lh3 | 2010-06-12 18:02:38 -0400 (Sat, 12 Jun 2010) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-15 (r592) - * fixed a few minor memory leaks in the new pileup code - * improved the functionality of mpileup - 
------------------------------------------------------------------------- -r591 | lh3lh3 | 2010-06-12 14:09:22 -0400 (Sat, 12 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-14 (r591) - * elementary multi-way pileup. More testing and more functionality to be done. - ------------------------------------------------------------------------- -r590 | lh3lh3 | 2010-06-12 01:00:24 -0400 (Sat, 12 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-13 (r590) - * added mpileup APIs. No compiling errors, but not tested at all. It is late. - ------------------------------------------------------------------------- -r589 | lh3lh3 | 2010-06-11 22:37:09 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-12 (r589) - * added iterator-like APIs for pileup - ------------------------------------------------------------------------- -r588 | lh3lh3 | 2010-06-11 17:41:13 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-11 (r588) - * ported a few improvements from tabix back to samtools - ------------------------------------------------------------------------- -r587 | lh3lh3 | 2010-06-11 17:33:16 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-10 (r587) - * added iterator interface for bam_fetch (ported back from tabix) - ------------------------------------------------------------------------- -r586 | lh3lh3 | 2010-06-11 13:23:53 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - A /trunk/samtools/bam_reheader.c - M /trunk/samtools/bamtk.c - M 
/trunk/samtools/bgzf.c - - * samtools-0.1.7-9 (r586) - * added "reheader" to replace the BAM header - ------------------------------------------------------------------------- -r585 | lh3lh3 | 2010-06-11 12:22:06 -0400 (Fri, 11 Jun 2010) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kstring.h - - * samtools-0.1.7-8 (r585) - * speed up "view" - ------------------------------------------------------------------------- -r584 | lh3lh3 | 2010-06-11 12:00:41 -0400 (Fri, 11 Jun 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/kstring.h - M /trunk/samtools/misc/wgsim_eval.pl - - * samtools-0.1.7-7 (r584) - * ported tabix BGZF to samtools - * flush BGZF after writing the BAM header and between alignment boundaries - ------------------------------------------------------------------------- -r583 | petulda | 2010-06-11 11:58:20 -0400 (Fri, 11 Jun 2010) | 1 line -Changed paths: - A /trunk/samtools/misc/varfilter.py - -Initial release on behalf of Aylwyn Scally ------------------------------------------------------------------------- -r561 | petulda | 2010-05-07 08:41:56 -0400 (Fri, 07 May 2010) | 1 line -Changed paths: - M /trunk/samtools/samtools.1 - -Added a note about the indels coordinates ------------------------------------------------------------------------- -r551 | petulda | 2010-04-23 09:42:13 -0400 (Fri, 23 Apr 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Added the possibility to print or not to print the reference allele ------------------------------------------------------------------------- -r546 | petulda | 2010-04-15 04:33:55 -0400 (Thu, 15 Apr 2010) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - -More descriptive message for space separated tags ------------------------------------------------------------------------- -r545 | petulda | 2010-04-14 11:44:50 
-0400 (Wed, 14 Apr 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Speedup with -i, no need to query the reference all the time ------------------------------------------------------------------------- -r541 | petulda | 2010-03-15 10:03:51 -0400 (Mon, 15 Mar 2010) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - -Fixed the order of sequences in the header ------------------------------------------------------------------------- -r540 | petulda | 2010-03-04 06:28:35 -0500 (Thu, 04 Mar 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Added possibility to select indels only and fixed a bug in reporting homozygous indels. ------------------------------------------------------------------------- -r539 | jmarshall | 2010-02-27 06:48:17 -0500 (Sat, 27 Feb 2010) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - -Improve the invalid 'BAM\1' magic number error message, and also print it -when no bytes can be read from the alleged BAM file, e.g., in the common -user error case when a SAM file has accidentally been supplied. - ------------------------------------------------------------------------- -r538 | petulda | 2010-02-26 10:51:40 -0500 (Fri, 26 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/AUTHORS - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/sam_header.c - -Improved efficiency of header parsing ------------------------------------------------------------------------- -r537 | lh3lh3 | 2010-02-23 21:08:48 -0500 (Tue, 23 Feb 2010) | 3 lines -Changed paths: - M /trunk/samtools/misc/export2sam.pl - -Updated export2sam.pl by Chris Saunders from Illumina. - - ------------------------------------------------------------------------- -r536 | petulda | 2010-02-17 08:32:53 -0500 (Wed, 17 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/samtools.pl - -Fixed filtering of SNPs near indels. Added min indel and SNP quality filter. 
------------------------------------------------------------------------- -r535 | petulda | 2010-02-12 04:52:37 -0500 (Fri, 12 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Print an error for pileups in simple format ------------------------------------------------------------------------- -r534 | lh3lh3 | 2010-02-11 14:01:41 -0500 (Thu, 11 Feb 2010) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - -added a hidden option in pileup to output the base position (for Erin) - ------------------------------------------------------------------------- -r533 | petulda | 2010-02-09 10:12:14 -0500 (Tue, 09 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Added possibility to specify a custom column title for the data column ------------------------------------------------------------------------- -r532 | petulda | 2010-02-09 09:46:09 -0500 (Tue, 09 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/bam_plcmd.c - -Added the -d option to limit maximum depth for indels. ------------------------------------------------------------------------- -r531 | petulda | 2010-02-03 07:57:27 -0500 (Wed, 03 Feb 2010) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Added VCF header ------------------------------------------------------------------------- -r530 | lh3lh3 | 2010-02-01 09:13:19 -0500 (Mon, 01 Feb 2010) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - M /trunk/samtools/misc/wgsim.c - - * samtools-0.1.7-6 - * fixed a bug in faidx - ------------------------------------------------------------------------- -r529 | jmarshall | 2010-01-11 18:51:49 -0500 (Mon, 11 Jan 2010) | 2 lines -Changed paths: - M /trunk/samtools/faidx.c - -Put the right filename in the error message. 
- ------------------------------------------------------------------------- -r528 | lh3lh3 | 2009-12-14 11:26:47 -0500 (Mon, 14 Dec 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-5 (r528) - * further add new consensus generation strategy - ------------------------------------------------------------------------- -r527 | petulda | 2009-12-11 12:31:05 -0500 (Fri, 11 Dec 2009) | 1 line -Changed paths: - M /trunk/samtools/knetfile.c - -Fixed a bug in knet_seek ------------------------------------------------------------------------- -r526 | petulda | 2009-12-11 07:51:18 -0500 (Fri, 11 Dec 2009) | 1 line -Changed paths: - M /trunk/samtools/misc/sam2vcf.pl - -Small fix in VCF format: dot for the empty INFO field ------------------------------------------------------------------------- -r525 | petulda | 2009-12-11 04:36:18 -0500 (Fri, 11 Dec 2009) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - -Allow tabs in the CO header field ------------------------------------------------------------------------- -r524 | jmarshall | 2009-12-10 10:03:58 -0500 (Thu, 10 Dec 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/Makefile.mingw - -Depend on libbam.a rather than the phony target, so that samtools is not -unnecessarily rebuilt every time. Also clean bgzip. - ------------------------------------------------------------------------- -r523 | jmarshall | 2009-12-10 09:45:32 -0500 (Thu, 10 Dec 2009) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/Makefile.mingw - -Fix a bug in compiling bgzip: this also needs knetfile.o when _USE_KNETFILE -is defined. Also introduce $(KNETFILE_O) which can be set to empty to -facilitate non-knet builds. 
- ------------------------------------------------------------------------- -r522 | lh3lh3 | 2009-12-01 13:02:36 -0500 (Tue, 01 Dec 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.7-4 (r522) - * fixed a bug in "view -r" - * added a new option "view -R" to read required read groups from a file - ------------------------------------------------------------------------- -r521 | lh3lh3 | 2009-12-01 10:00:12 -0500 (Tue, 01 Dec 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.7-3 (r521) - * calmd: optionally mask matching bases as N - ------------------------------------------------------------------------- -r520 | lh3lh3 | 2009-12-01 09:37:17 -0500 (Tue, 01 Dec 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.7-2 (r520) - * fixed a few issues with compilation in Windows (on behalf of John) - * choose a random base as the consensus (for population genetics studies) - ------------------------------------------------------------------------- -r519 | jmarshall | 2009-11-30 10:53:02 -0500 (Mon, 30 Nov 2009) | 6 lines -Changed paths: - M /trunk/samtools/Makefile - -Put libraries at the end, so they can resolve references from libbam.a -as well, even with old-fashioned linkers. - -Also use libbam.a explicitly rather than "-L. -lbam" to ensure that we get -the freshly built library, not some other libbam.a lying around the system. - ------------------------------------------------------------------------- -r518 | jmarshall | 2009-11-30 08:44:56 -0500 (Mon, 30 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/misc/Makefile - -Also clean *.exe (for Cygwin users using this makefile). 
- ------------------------------------------------------------------------- -r517 | jmarshall | 2009-11-30 07:09:04 -0500 (Mon, 30 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -Index files should be opened in binary mode, not text mode. - ------------------------------------------------------------------------- -r516 | lh3lh3 | 2009-11-27 15:18:59 -0500 (Fri, 27 Nov 2009) | 2 lines -Changed paths: - A /trunk/samtools/examples/bam2bed.c - -another example program - ------------------------------------------------------------------------- -r515 | lh3lh3 | 2009-11-27 10:44:56 -0500 (Fri, 27 Nov 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/wgsim_eval.pl - M /trunk/samtools/sam.c - - * samtools-0.1.7-1 (r515) - * report an error when .fai contains duplicated names, instead of segfault - ------------------------------------------------------------------------- -r514 | jmarshall | 2009-11-24 09:45:35 -0500 (Tue, 24 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam.c - -Format 'c'-encoded auxiliary fields correctly, as *signed* integers. 
- ------------------------------------------------------------------------- -r513 | lh3lh3 | 2009-11-16 10:13:07 -0500 (Mon, 16 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - -Update Makefile.mingw for the same reason - ------------------------------------------------------------------------- -r512 | lh3lh3 | 2009-11-16 10:00:08 -0500 (Mon, 16 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -Fixed a bug in compiling razip - ------------------------------------------------------------------------- -r510 | lh3lh3 | 2009-11-10 10:55:41 -0500 (Tue, 10 Nov 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.7 (r510) - ------------------------------------------------------------------------- -r509 | lh3lh3 | 2009-11-06 09:17:09 -0500 (Fri, 06 Nov 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-22 (r509) - * forget to fix a similar problem in glfgen - ------------------------------------------------------------------------- -r508 | lh3lh3 | 2009-11-06 09:06:40 -0500 (Fri, 06 Nov 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.6-21 (r508) - * fixed a potential bug in the indel caller towards the end of a chromosome - ------------------------------------------------------------------------- -r494 | lh3lh3 | 2009-10-26 11:38:00 -0400 (Mon, 26 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.6-19 (r494) - * allow to convert Illumina quality (64 based) to the BAM quality - ------------------------------------------------------------------------- -r493 | lh3lh3 | 2009-10-26 10:24:39 -0400 (Mon, 26 Oct 2009) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M 
/trunk/samtools/bam.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_header.c - - * samtools-0.1.6-18 (r493) - * fixed the bugs due to improperly incorporating Petr's header parser - * a little code clean up in sam_header.c - ------------------------------------------------------------------------- -r492 | petulda | 2009-10-24 09:43:25 -0400 (Sat, 24 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - -Added sam_header_line_free call for sam_header_parse2 ------------------------------------------------------------------------- -r491 | lh3lh3 | 2009-10-24 00:50:16 -0400 (Sat, 24 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/sam_view.c - - * BUGGY VERSION - * fixed a minor bug - ------------------------------------------------------------------------- -r490 | lh3lh3 | 2009-10-24 00:45:12 -0400 (Sat, 24 Oct 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/sam.c - - * BUGGY VERSION - * improved the interface a bit - * bug unfixed - ------------------------------------------------------------------------- -r489 | lh3lh3 | 2009-10-24 00:41:50 -0400 (Sat, 24 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/sam_header.c - M /trunk/samtools/sam_header.h - - * BUGGY VERSION. Please NOT use it. - * Fixed a minor bug, but the major bug is still there. 
- ------------------------------------------------------------------------- -r488 | lh3lh3 | 2009-10-24 00:17:10 -0400 (Sat, 24 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/kaln.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_header.c - M /trunk/samtools/sam_header.h - M /trunk/samtools/sam_view.c - - * This revision is SERIOUSLY BUGGY. Please NOT use it. - * Start to incorporate header parsing from Petr Danecek - ------------------------------------------------------------------------- -r487 | petulda | 2009-10-23 11:44:32 -0400 (Fri, 23 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - M /trunk/samtools/sam_header.h - -Now possible to merge multiple HeaderDict dictionaries ------------------------------------------------------------------------- -r486 | petulda | 2009-10-22 11:46:58 -0400 (Thu, 22 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/sam_header.c - - ------------------------------------------------------------------------- -r485 | petulda | 2009-10-22 11:41:56 -0400 (Thu, 22 Oct 2009) | 1 line -Changed paths: - A /trunk/samtools/sam_header.c - A /trunk/samtools/sam_header.h - - ------------------------------------------------------------------------- -r484 | lh3lh3 | 2009-10-19 14:31:32 -0400 (Mon, 19 Oct 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/examples/Makefile - - * samtools-0.1.6-17 (r484) - * fixed a memory leak in rmdupse - * fixed a bug in parsing @RG header lines - * test rmdup in examples/ - ------------------------------------------------------------------------- -r483 | lh3lh3 | 2009-10-19 13:22:48 -0400 (Mon, 19 Oct 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_rmdup.c - M 
/trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-16 (r483) - * unify the interface of rmdup and rmdupse - * a new bug found in rg2lib(). Have not been fixed yet. - ------------------------------------------------------------------------- -r482 | lh3lh3 | 2009-10-19 13:03:34 -0400 (Mon, 19 Oct 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/klist.h - - * samtools-0.1.6-15 (r482) - * rewrite rmdupse - * rmdupse is now library aware - ------------------------------------------------------------------------- -r481 | lh3lh3 | 2009-10-18 00:07:21 -0400 (Sun, 18 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-14 (r480) - * rmdup is now RG aware - ------------------------------------------------------------------------- -r480 | lh3lh3 | 2009-10-17 22:05:20 -0400 (Sat, 17 Oct 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -added a small unitity to parse SRA XML files - ------------------------------------------------------------------------- -r479 | lh3lh3 | 2009-10-17 20:57:26 -0400 (Sat, 17 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-13 (r479) - * merge: optionally use file names as RG tags - ------------------------------------------------------------------------- -r478 | lh3lh3 | 2009-10-14 14:18:12 -0400 (Wed, 14 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kaln.c - - * samtools-0.1.6-12 (r478) - * fixed a bug in the indel caller - ------------------------------------------------------------------------- -r477 | lh3lh3 | 2009-10-10 
06:12:26 -0400 (Sat, 10 Oct 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-11 (r477) - * fixed a bug due to recent change in bam_index.c (thank Nicole Washington for the patch) - ------------------------------------------------------------------------- -r476 | petulda | 2009-10-09 11:45:36 -0400 (Fri, 09 Oct 2009) | 1 line -Changed paths: - A /trunk/samtools/misc/sam2vcf.pl - -Added the sam2vcf.pl script. ------------------------------------------------------------------------- -r475 | lh3lh3 | 2009-10-08 10:19:16 -0400 (Thu, 08 Oct 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/kaln.c - A /trunk/samtools/kaln.h - -Unfinished modification. Please do not use this revision... - ------------------------------------------------------------------------- -r474 | petulda | 2009-10-08 06:39:54 -0400 (Thu, 08 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/knetfile.c - -Removed the offending knet_seek message. ------------------------------------------------------------------------- -r473 | petulda | 2009-10-06 09:26:35 -0400 (Tue, 06 Oct 2009) | 1 line -Changed paths: - M /trunk/samtools/knetfile.c - M /trunk/samtools/razf.c - -Bug fix - faidx on RAZF compressed files now working. ------------------------------------------------------------------------- -r472 | lh3lh3 | 2009-10-02 08:42:57 -0400 (Fri, 02 Oct 2009) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -Clarify the meaning of a region like "chr2:1,000,000". 
- ------------------------------------------------------------------------- -r471 | lh3lh3 | 2009-10-02 05:42:19 -0400 (Fri, 02 Oct 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/novo2sam.pl - -Fixed minor bugs in novo2sam.pl (on behalf of Ken Chen and Colin Hercus) - ------------------------------------------------------------------------- -r470 | lh3lh3 | 2009-09-29 15:01:27 -0400 (Tue, 29 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.6-9 (r470) - * make knetfile.c compatible with MinGW (thank Martin Morgan for the patch) - ------------------------------------------------------------------------- -r469 | lh3lh3 | 2009-09-29 08:07:44 -0400 (Tue, 29 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-9 (r469) - * refactor bam_fetch() for Python binding. On behalf of Leo Goodstadt. - ------------------------------------------------------------------------- -r468 | lh3lh3 | 2009-09-28 05:18:29 -0400 (Mon, 28 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.6-7 (r468) - * make merge stable - ------------------------------------------------------------------------- -r467 | petulda | 2009-09-28 04:51:29 -0400 (Mon, 28 Sep 2009) | 1 line -Changed paths: - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzip.c - M /trunk/samtools/razf.c - M /trunk/samtools/razip.c - -Changed the mode for newly created files to 0666. This allows less strict permissions with umask properly set (e.g. 0002 vs. 0022). 
------------------------------------------------------------------------- -r466 | lh3lh3 | 2009-09-24 06:29:19 -0400 (Thu, 24 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-6 (r466) - * do not crash calmd when some sequences are absent from the reference. - ------------------------------------------------------------------------- -r464 | jmarshall | 2009-09-23 06:14:32 -0400 (Wed, 23 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/knetfile.c - -Suppress bgzf_check_EOF() messages when reading from a pipe, as there is -no way to seek on a pipe and the messages always appear. - ------------------------------------------------------------------------- -r463 | petulda | 2009-09-16 07:05:41 -0400 (Wed, 16 Sep 2009) | 1 line -Changed paths: - M /trunk/samtools/knetfile.c - M /trunk/samtools/razf.c - -A bug fix, "samtools view" is now working again. ------------------------------------------------------------------------- -r462 | lh3lh3 | 2009-09-16 04:51:07 -0400 (Wed, 16 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - M /trunk/samtools/razf.c - M /trunk/samtools/razf.h - - * samtools-0.1.6-5 (r462) - * Added knetfile support in razf and faidx (on behalf of Petr Danecek) - ------------------------------------------------------------------------- -r460 | lh3lh3 | 2009-09-09 07:06:22 -0400 (Wed, 09 Sep 2009) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -fixed a formatting issue - ------------------------------------------------------------------------- -r459 | lh3lh3 | 2009-09-08 18:14:08 -0400 (Tue, 08 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-4 (r459) - * make sort output the result to stdout when -o is in use - 
------------------------------------------------------------------------- -r458 | lh3lh3 | 2009-09-07 05:10:28 -0400 (Mon, 07 Sep 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - M /trunk/samtools/faidx.h - M /trunk/samtools/samtools.1 - - * samtools-0.1.6-2 (r458) - * added more interface to faidx (by Nils) - * updated documentation - ------------------------------------------------------------------------- -r457 | lh3lh3 | 2009-09-05 16:12:04 -0400 (Sat, 05 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.6-2 (r457) - * get rid of three assert() in bam_sort.c - ------------------------------------------------------------------------- -r456 | jmarshall | 2009-09-04 12:46:25 -0400 (Fri, 04 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/razf.c - -Return NULL from _razf_open() (and hence razf_open()/razf_open2()) -when opening the file fails. - ------------------------------------------------------------------------- -r453 | lh3lh3 | 2009-09-02 08:56:33 -0400 (Wed, 02 Sep 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - D /trunk/samtools/source.dot - -Release samtools-0.1.6 - ------------------------------------------------------------------------- -r451 | lh3lh3 | 2009-09-02 05:44:48 -0400 (Wed, 02 Sep 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.5-34 (r451) - * applied the patch by John - * improved the help message a little bit - ------------------------------------------------------------------------- -r450 | lh3lh3 | 2009-09-02 04:55:55 -0400 (Wed, 02 Sep 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_color.c - M /trunk/samtools/bamtk.c - - * 
samtools-0.1.5-33 (r450) - * fixed a bug in bam_color.c (on behalf of Nils Homer) - ------------------------------------------------------------------------- -r449 | lh3lh3 | 2009-08-29 15:36:41 -0400 (Sat, 29 Aug 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.5-32 (r449) - * fillmd: fixed a bug in modifying MD/NM tags - * in import, give a warning if the read is aligned but there is no CIGAR. - ------------------------------------------------------------------------- -r448 | lh3lh3 | 2009-08-19 04:44:28 -0400 (Wed, 19 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/wgsim_eval.pl - - * samtools-0.1.5-31 (r448) - * fixed an issue when the last CIGAR is I or D - ------------------------------------------------------------------------- -r447 | lh3lh3 | 2009-08-17 04:34:57 -0400 (Mon, 17 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-30 (r447) - * fixed a bug in bam_aux_get(): 'A' is not checked - ------------------------------------------------------------------------- -r446 | lh3lh3 | 2009-08-17 04:33:17 -0400 (Mon, 17 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * - ------------------------------------------------------------------------- -r444 | lh3lh3 | 2009-08-11 05:02:36 -0400 (Tue, 11 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-28 (r444) - * bug in "merge -n" - ------------------------------------------------------------------------- -r443 | lh3lh3 | 2009-08-11 04:29:11 -0400 (Tue, 11 Aug 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-27 (r443) - * 
SEQ and QUAL can be "*" - * parse CIGAR "=" and "X" as "M" - ------------------------------------------------------------------------- -r442 | lh3lh3 | 2009-08-07 16:56:38 -0400 (Fri, 07 Aug 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/md5.c - M /trunk/samtools/misc/md5.h - M /trunk/samtools/misc/md5fa.c - - * samtools-0.1.5-26 (r442) - * replace RSA Inc md5.* with ones under permissive lincense - * fixed a bug in detecting unsorted bam in pileup - ------------------------------------------------------------------------- -r441 | bhandsaker | 2009-08-05 09:41:28 -0400 (Wed, 05 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/bgzip.c - -Change copyright notices now that MIT has approved open source distribution. - ------------------------------------------------------------------------- -r440 | lh3lh3 | 2009-08-05 05:44:24 -0400 (Wed, 05 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-25 (r436) - * in flagstats, do not report singletons if both ends are unmapped - ------------------------------------------------------------------------- -r439 | lh3lh3 | 2009-08-04 17:16:51 -0400 (Tue, 04 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/maq2sam.c - -fixed a SERIOUS bug in setting 0x20 flag - ------------------------------------------------------------------------- -r438 | lh3lh3 | 2009-08-04 16:50:43 -0400 (Tue, 04 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -fixed two minor bugs (suggested by Tim M Storm) - ------------------------------------------------------------------------- -r437 | lh3lh3 | 2009-08-04 04:13:24 -0400 (Tue, 04 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-24 (r435) - * fixed a typo - 
------------------------------------------------------------------------- -r434 | lh3lh3 | 2009-08-03 05:40:42 -0400 (Mon, 03 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-23 (r434) - * in tview, press 'r' to show read names rather than sequences - ------------------------------------------------------------------------- -r433 | lh3lh3 | 2009-08-02 14:13:35 -0400 (Sun, 02 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/knetfile.c - - * tried to fixed the buggy FTP random access in Windows. FAILED. - * anyway, MinGW seems to have problem with "%lld". - ------------------------------------------------------------------------- -r432 | lh3lh3 | 2009-08-01 19:32:07 -0400 (Sat, 01 Aug 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - M /trunk/samtools/razf.c - A /trunk/samtools/win32/libcurses.a - A /trunk/samtools/win32/xcurses.h - - * samtools-0.1.5-22 (r432) - * faidx: fixed compitability issue with _WIN32 - * razf: fixed potential compitability issue with _WIN32 - * PDCurses support in Windows - ------------------------------------------------------------------------- -r431 | lh3lh3 | 2009-08-01 18:34:54 -0400 (Sat, 01 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/win32/libz.a - -replace the GnuWin32 version of libz.a with my own build with MinGW. 
- ------------------------------------------------------------------------- -r430 | lh3lh3 | 2009-08-01 18:21:07 -0400 (Sat, 01 Aug 2009) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -add comments - ------------------------------------------------------------------------- -r429 | lh3lh3 | 2009-08-01 17:41:19 -0400 (Sat, 01 Aug 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.5-21 (r428) - * knetfile.c is now compatible with mingw-winsock - ------------------------------------------------------------------------- -r428 | lh3lh3 | 2009-07-31 19:39:07 -0400 (Fri, 31 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.mingw - -simplify MinGW Makefile - ------------------------------------------------------------------------- -r427 | lh3lh3 | 2009-07-31 19:30:54 -0400 (Fri, 31 Jul 2009) | 5 lines -Changed paths: - A /trunk/samtools/Makefile.mingw - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/win32 - A /trunk/samtools/win32/libz.a - A /trunk/samtools/win32/zconf.h - A /trunk/samtools/win32/zlib.h - - * samtools-0.1.5-20 (r427) - * MinGW support. At least SAM<->BAM conversion is working. Other - functionality are not tested at the moment. - * zlib headers and Windows version of libz.a are included in win32/ - ------------------------------------------------------------------------- -r426 | lh3lh3 | 2009-07-31 18:32:09 -0400 (Fri, 31 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-19 (r426) - * fixed a bug caused by recent modifications. Sorry. 
- ------------------------------------------------------------------------- -r425 | lh3lh3 | 2009-07-31 18:23:51 -0400 (Fri, 31 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/bgzf.c - -compatible with Windows binary files - ------------------------------------------------------------------------- -r424 | lh3lh3 | 2009-07-31 05:19:59 -0400 (Fri, 31 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - - * samtools-0.1.5-18 (r423) - * output additional information in pileup indel lines, for the purepose - of debugging at the moment - * in tview, optionally allow to treat reference skip as deletion - ------------------------------------------------------------------------- -r423 | lh3lh3 | 2009-07-30 17:00:36 -0400 (Thu, 30 Jul 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/psl2sam.pl - -convert BLAT psl to SAM. 
- ------------------------------------------------------------------------- -r422 | lh3lh3 | 2009-07-30 06:24:39 -0400 (Thu, 30 Jul 2009) | 6 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/knetfile.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-17 (r422) - * fixed a but in knetfile.c when seek type is not SEEK_SET - * write an empty BGZF block to every BGZF file - * check BGZF EOF marker in bam_header_read() - * update ChangeLog - ------------------------------------------------------------------------- -r421 | lh3lh3 | 2009-07-30 05:03:39 -0400 (Thu, 30 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-16 (r421) - * in view and pileup, load header from FASTA index if the input is SAM. - ------------------------------------------------------------------------- -r420 | lh3lh3 | 2009-07-29 04:18:55 -0400 (Wed, 29 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/maq2sam.c - -do not set "read 1" if reads are not mapped in the PE mode of maq - ------------------------------------------------------------------------- -r419 | lh3lh3 | 2009-07-28 04:52:33 -0400 (Tue, 28 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/samtools.pl - M /trunk/samtools/misc/wgsim_eval.pl - - * samtools-0.1.5-15 (r419) - * in sam_open(), return NULL when the file cannot be opened. 
- * make wgsim_eval.pl more robust to imperfect SAM - * add "unique" command to samtools.pl - ------------------------------------------------------------------------- -r418 | lh3lh3 | 2009-07-24 09:04:19 -0400 (Fri, 24 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/wgsim_eval.pl - -skip @header lines in SAM - ------------------------------------------------------------------------- -r417 | lh3lh3 | 2009-07-24 07:42:38 -0400 (Fri, 24 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-14 (r417) - * more help in "samtools view" due to the recent changes. - ------------------------------------------------------------------------- -r416 | lh3lh3 | 2009-07-24 07:34:30 -0400 (Fri, 24 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-17 (r416) - * support import/export SAM with string tags - ------------------------------------------------------------------------- -r415 | lh3lh3 | 2009-07-24 06:39:26 -0400 (Fri, 24 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - M /trunk/samtools/sam_view.c - - * samtools-0.1.5-12 (r415) - * FLAG now can be in HEX - ------------------------------------------------------------------------- -r414 | lh3lh3 | 2009-07-22 17:03:49 -0400 (Wed, 22 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/kstring.h - -fixed a compiling error (thank Ken for fixing it) - ------------------------------------------------------------------------- -r412 | lh3lh3 | 2009-07-21 17:19:40 -0400 (Tue, 21 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/kstring.c - M /trunk/samtools/kstring.h - -Implemented Boyer-Moore search 
in the kstring library. - ------------------------------------------------------------------------- -r409 | lh3lh3 | 2009-07-17 12:10:20 -0400 (Fri, 17 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - -do not include knetfile.h when _USE_KNETFILE is not defined - ------------------------------------------------------------------------- -r408 | lh3lh3 | 2009-07-17 10:29:21 -0400 (Fri, 17 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - - * samtools-0.1.5-11 (r408) - * force to overwirte existing MD if it is different from the one calculated - from fillmd. - * bgzf.c: improved the compatibility with Windows headers - ------------------------------------------------------------------------- -r407 | lh3lh3 | 2009-07-17 09:46:56 -0400 (Fri, 17 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.h - - * samtools-0.1.5-10 (r407) - * implemented bam_aux_del() to remove a tag - * fillmd: generate the NM tag - * fillmd: cmd interface improvement - ------------------------------------------------------------------------- -r406 | lh3lh3 | 2009-07-16 18:30:40 -0400 (Thu, 16 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -Sorry. The old Makefile is for PDCurses... 
- ------------------------------------------------------------------------- -r405 | lh3lh3 | 2009-07-16 18:30:11 -0400 (Thu, 16 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-9 (r405) - * improved the compatibility with PDCurses a little bit - ------------------------------------------------------------------------- -r404 | lh3lh3 | 2009-07-16 18:23:52 -0400 (Thu, 16 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-8 (r404) - * compatible with PDCurses - ------------------------------------------------------------------------- -r403 | lh3lh3 | 2009-07-16 17:39:39 -0400 (Thu, 16 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/kseq.h - - * samtools-0.1.5-7 (r403) - * fixed a bug in kseq.h for binary files (text files are fine) - ------------------------------------------------------------------------- -r402 | lh3lh3 | 2009-07-16 06:49:53 -0400 (Thu, 16 Jul 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - - * samtools-0.1.5-6 (r402) - * fixed compiling error when "-D_USE_NETFILE" is not applied - * improve portability to MinGW - ------------------------------------------------------------------------- -r398 | lh3lh3 | 2009-07-13 05:21:36 -0400 (Mon, 13 Jul 2009) | 3 lines -Changed paths: - A /trunk/bam-lite/bam.h (from /trunk/samtools/bam.h:395) - A /trunk/bam-lite/bam_lite.c (from /trunk/samtools/bam_lite.c:395) - D /trunk/samtools/bam_lite.c - - * move bam_lite.c to bam-lite - * copy bam.h to bam-lite - ------------------------------------------------------------------------- -r395 | lh3lh3 | 2009-07-13 05:12:57 -0400 (Mon, 13 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_lite.c - M 
/trunk/samtools/bam_lpileup.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-5 (r395) - * added bam_pileup_file() and removed bam_lpileup_file() - ------------------------------------------------------------------------- -r394 | lh3lh3 | 2009-07-12 19:35:10 -0400 (Sun, 12 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.5-4 (r394) - * http_proxy support in knetfile library (check http_proxy ENV) - ------------------------------------------------------------------------- -r393 | lh3lh3 | 2009-07-12 18:57:07 -0400 (Sun, 12 Jul 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.5-3 (r393) - * knetfile now supports HTTP (no proxy at the moment) - * fixed a potential issue in knetfile on opening ordinary file, although I have - not seen the sideeffect so far. - ------------------------------------------------------------------------- -r392 | lh3lh3 | 2009-07-12 13:50:55 -0400 (Sun, 12 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -Remove the warning in tview - ------------------------------------------------------------------------- -r391 | lh3lh3 | 2009-07-12 13:42:43 -0400 (Sun, 12 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-2 (r391) - * do not show a blank screen when no reads mapped - ------------------------------------------------------------------------- -r390 | lh3lh3 | 2009-07-09 09:01:42 -0400 (Thu, 09 Jul 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.h - A /trunk/samtools/bam_lite.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.5-1 (r390) - * removed useless _IOLIB in bam.h. This should cause no change at all. 
- * added bam_lite.c for light-weight BAM reading - ------------------------------------------------------------------------- -r385 | lh3lh3 | 2009-07-07 11:53:29 -0400 (Tue, 07 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - -Release samtools-0.1.5c (fixed a bug in piping) - ------------------------------------------------------------------------- -r383 | lh3lh3 | 2009-07-07 06:39:55 -0400 (Tue, 07 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - -Release samtools-0.1.5b (BUG! so embarrassing!) - ------------------------------------------------------------------------- -r381 | lh3lh3 | 2009-07-07 06:20:06 -0400 (Tue, 07 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - -Release samtools-0.1.5a (for compatibility with Bio::DB::Sam) - ------------------------------------------------------------------------- -r373 | lh3lh3 | 2009-07-07 05:26:57 -0400 (Tue, 07 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.5 - ------------------------------------------------------------------------- -r372 | lh3lh3 | 2009-07-07 04:49:27 -0400 (Tue, 07 Jul 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - - * samtools-0.1.4-23 (r372) - * keep header text if "view -t" is used (by Gerton) - ------------------------------------------------------------------------- -r371 | lh3lh3 | 2009-07-06 20:13:32 -0400 (Mon, 06 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/samtools.1 - -update documentation - ------------------------------------------------------------------------- -r370 | bhandsaker | 2009-07-02 17:24:34 -0400 (Thu, 02 Jul 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -Introduced LIBPATH variable so this 
could be overridden to allow samtools to build correct at the Broad. - ------------------------------------------------------------------------- -r369 | lh3lh3 | 2009-07-02 08:36:53 -0400 (Thu, 02 Jul 2009) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-22 (r369) - * in pileup, optionally print E2 and U2 - * remove the debugging code in bam_aux_get() (Drat!) - ------------------------------------------------------------------------- -r368 | lh3lh3 | 2009-07-02 06:32:26 -0400 (Thu, 02 Jul 2009) | 6 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - M /trunk/samtools/faidx.h - M /trunk/samtools/glf.c - - * samtools-0.1.4-21 (r368) - * propagate errors rather than exit or complain assertion failure. Assertion - should be only used for checking internal bugs, but not for external input - inconsistency. I was just a bit lazy. 
- * small memory leak may be present on failure, though - ------------------------------------------------------------------------- -r367 | lh3lh3 | 2009-06-30 11:18:42 -0400 (Tue, 30 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -reduce the chance of blocking in FTP connection - ------------------------------------------------------------------------- -r366 | lh3lh3 | 2009-06-30 10:35:21 -0400 (Tue, 30 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - -minor changes to knetfile: invalid fd equals -1 rather than 0 - ------------------------------------------------------------------------- -r365 | lh3lh3 | 2009-06-30 09:04:30 -0400 (Tue, 30 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.4-20 (r365) - * download the BAM index file if it is not found in the current working directory. - ------------------------------------------------------------------------- -r364 | lh3lh3 | 2009-06-30 07:39:07 -0400 (Tue, 30 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/knetfile.c - - * samtools-0.1.4-19 (r364) - * knetfile: report error when the file is not present on FTP - ------------------------------------------------------------------------- -r363 | lh3lh3 | 2009-06-29 18:23:32 -0400 (Mon, 29 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.4-18 (r363) - * knetfile: do not trigger network communication in FTP seek (lazy seek) - * bgzf: cache recent blocks (disabled by default) - ------------------------------------------------------------------------- -r362 | lh3lh3 | 2009-06-25 16:04:34 -0400 (Thu, 25 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/bgzf.c - -write changelog - 
------------------------------------------------------------------------- -r361 | lh3lh3 | 2009-06-25 16:03:10 -0400 (Thu, 25 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-17 (r361) - * if a file is given on FTP, search locally for the BAM index - ------------------------------------------------------------------------- -r360 | lh3lh3 | 2009-06-25 15:44:52 -0400 (Thu, 25 Jun 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - - * samtools-0.1.4-16 (r360) - * report more information in index when the input is not sorted - * change the behaviour of knet_seek() such that it returns 0 on success - * support knetfile library in BGZF - ------------------------------------------------------------------------- -r359 | lh3lh3 | 2009-06-25 12:10:55 -0400 (Thu, 25 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/knetfile.c - M /trunk/samtools/knetfile.h - -fixed bugs in knetfile.* - ------------------------------------------------------------------------- -r358 | lh3lh3 | 2009-06-25 08:53:19 -0400 (Thu, 25 Jun 2009) | 2 lines -Changed paths: - A /trunk/samtools/knetfile.h - -this is the header file - ------------------------------------------------------------------------- -r357 | lh3lh3 | 2009-06-25 08:52:03 -0400 (Thu, 25 Jun 2009) | 3 lines -Changed paths: - A /trunk/samtools/knetfile.c - - * open a file at FTP - * preliminary version - ------------------------------------------------------------------------- -r354 | lh3lh3 | 2009-06-24 09:02:25 -0400 (Wed, 24 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-15 (r354) - * fixed a memory leak in bam_view1(), although samtools is not using this routine. 
- ------------------------------------------------------------------------- -r351 | lh3lh3 | 2009-06-17 19:16:26 -0400 (Wed, 17 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/faidx.c - - * samtools-0.1.4-13 (r351) - * make faidx more tolerant to empty lines right before or after > lines - * hope this does not introduce new bugs... - ------------------------------------------------------------------------- -r350 | lh3lh3 | 2009-06-16 09:37:01 -0400 (Tue, 16 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-13 (r350) - * fixed a small memory leak in pileup, caused by recent modifications - ------------------------------------------------------------------------- -r347 | lh3lh3 | 2009-06-13 16:20:49 -0400 (Sat, 13 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.4-12 (r347) - * added `-S' to pileup, similar to `view -S' - ------------------------------------------------------------------------- -r346 | lh3lh3 | 2009-06-13 12:52:31 -0400 (Sat, 13 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.4-11 (r346) - * allow to select a read group at view command-line - ------------------------------------------------------------------------- -r344 | lh3lh3 | 2009-06-13 09:06:24 -0400 (Sat, 13 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/examples/calDepth.c - -added more comments - ------------------------------------------------------------------------- -r343 | lh3lh3 | 2009-06-13 09:01:22 -0400 (Sat, 13 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/examples/calDepth.c - -nothing really - ------------------------------------------------------------------------- -r342 | lh3lh3 | 2009-06-13 08:58:48 -0400 (Sat, 13 Jun 2009) | 2 lines 
-Changed paths: - M /trunk/samtools/examples/Makefile - A /trunk/samtools/examples/calDepth.c - -added an example of calculating read depth - ------------------------------------------------------------------------- -r341 | lh3lh3 | 2009-06-13 08:00:08 -0400 (Sat, 13 Jun 2009) | 6 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - A /trunk/samtools/bam_color.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - - * samtools-0.1.4-10 (r341) - * only include key APIs in libbam.a - * move color-specific routines to bam_color.c - * update documentations - * remove the support of -q in pileup - ------------------------------------------------------------------------- -r340 | lh3lh3 | 2009-06-13 06:17:14 -0400 (Sat, 13 Jun 2009) | 6 lines -Changed paths: - M /trunk/samtools/INSTALL - M /trunk/samtools/Makefile - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/razf.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.4-9 (r340) - * added a warning to razf.c if zlib<1.2.2.1 - * fixed a compilation warning - * fixed a segfault caused by @RG parsing - * detect NCURSES in bam_tview.c - ------------------------------------------------------------------------- -r339 | lh3lh3 | 2009-06-13 05:35:19 -0400 (Sat, 13 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/INSTALL - -update INSTALL - ------------------------------------------------------------------------- -r338 | lh3lh3 | 2009-06-12 19:15:24 -0400 (Fri, 12 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kstring.h - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.4-8 (r338) 
- * parse the @RG header lines and allow to choose library at the "samtools view" - command line - ------------------------------------------------------------------------- -r337 | lh3lh3 | 2009-06-12 16:25:50 -0400 (Fri, 12 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.4-7 (r337) - * bgzf.c: support mode string "wu": uncompressed output - * "samtools view" support "-u" command-line option - ------------------------------------------------------------------------- -r336 | lh3lh3 | 2009-06-12 12:20:12 -0400 (Fri, 12 Jun 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/misc/Makefile - M /trunk/samtools/razf.c - M /trunk/samtools/razf.h - M /trunk/samtools/razip.c - - * no changes to samtools itself - * remove zlib source codes - * make RAZF reading compatible with old version of zlib - * on old version of zlib, writing is not available - ------------------------------------------------------------------------- -r335 | lh3lh3 | 2009-06-12 11:47:33 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - D /trunk/samtools/zlib - -remove zlib for simplification... - ------------------------------------------------------------------------- -r334 | lh3lh3 | 2009-06-12 10:43:36 -0400 (Fri, 12 Jun 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-6 (r334) - * do not export bam_aux_get_core() for Bio::DB::Sam because it has already - been implemented in that. 
- * this version works with the latest Bio::DB::Sam (20090612) - ------------------------------------------------------------------------- -r333 | lh3lh3 | 2009-06-12 10:33:42 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - -update ChangeLog - ------------------------------------------------------------------------- -r332 | lh3lh3 | 2009-06-12 10:21:21 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/AUTHORS - M /trunk/samtools/Makefile - M /trunk/samtools/misc/Makefile - -fixed minor things in Makefile - ------------------------------------------------------------------------- -r331 | lh3lh3 | 2009-06-12 10:07:05 -0400 (Fri, 12 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-5 (r3310 - * no change to samtools itself. Version number is increased to reflect the - changes in the Makefile building system. - ------------------------------------------------------------------------- -r330 | lh3lh3 | 2009-06-12 10:03:38 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/AUTHORS - D /trunk/samtools/README - -update information... - ------------------------------------------------------------------------- -r329 | lh3lh3 | 2009-06-12 09:52:21 -0400 (Fri, 12 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/novo2sam.pl - - * updated novoalign converter by Colin Hercus et al. 
- * this version works with indels - ------------------------------------------------------------------------- -r328 | lh3lh3 | 2009-06-12 09:50:53 -0400 (Fri, 12 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/INSTALL - M /trunk/samtools/Makefile - M /trunk/samtools/misc/Makefile - M /trunk/samtools/zlib/Makefile - - * update Makefile - * update INSTALL instruction - ------------------------------------------------------------------------- -r327 | lh3lh3 | 2009-06-12 09:18:29 -0400 (Fri, 12 Jun 2009) | 4 lines -Changed paths: - A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.generic:325) - D /trunk/samtools/Makefile.am - D /trunk/samtools/Makefile.generic - D /trunk/samtools/Makefile.lite - D /trunk/samtools/autogen.sh - D /trunk/samtools/cleanup.sh - D /trunk/samtools/configure.ac - A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.generic:305) - D /trunk/samtools/misc/Makefile.am - D /trunk/samtools/misc/Makefile.generic - M /trunk/samtools/razf.c - A /trunk/samtools/zlib - A /trunk/samtools/zlib/Makefile - A /trunk/samtools/zlib/adler32.c - A /trunk/samtools/zlib/compress.c - A /trunk/samtools/zlib/crc32.c - A /trunk/samtools/zlib/crc32.h - A /trunk/samtools/zlib/deflate.c - A /trunk/samtools/zlib/deflate.h - A /trunk/samtools/zlib/gzio.c - A /trunk/samtools/zlib/infback.c - A /trunk/samtools/zlib/inffast.c - A /trunk/samtools/zlib/inffast.h - A /trunk/samtools/zlib/inffixed.h - A /trunk/samtools/zlib/inflate.c - A /trunk/samtools/zlib/inflate.h - A /trunk/samtools/zlib/inftrees.c - A /trunk/samtools/zlib/inftrees.h - A /trunk/samtools/zlib/trees.c - A /trunk/samtools/zlib/trees.h - A /trunk/samtools/zlib/uncompr.c - A /trunk/samtools/zlib/zconf.h - A /trunk/samtools/zlib/zlib.h - A /trunk/samtools/zlib/zutil.c - A /trunk/samtools/zlib/zutil.h - D /trunk/samtools/zutil.h - - * added zlib-1.2.3 as razip requires that - * prepare to changed back to the Makefile building system - * unfinished! 
(will be soon) - ------------------------------------------------------------------------- -r326 | lh3lh3 | 2009-06-12 09:12:03 -0400 (Fri, 12 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -Unfinished - ------------------------------------------------------------------------- -r325 | lh3lh3 | 2009-06-10 11:27:59 -0400 (Wed, 10 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.4-4 (r325) - * further avoid wrong consensus calls in repetitive regions. - ------------------------------------------------------------------------- -r324 | lh3lh3 | 2009-06-10 10:56:17 -0400 (Wed, 10 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - - * samtools-0.1.4-3 (r324) - * make maqcns generate the correct call in repetitive regions. - * allow filtering on mapQ at the pileup command line - ------------------------------------------------------------------------- -r323 | lh3lh3 | 2009-06-10 05:04:21 -0400 (Wed, 10 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.3.2 (r322) - * indels and SNPs use different mapping quality threshold - ------------------------------------------------------------------------- -r322 | lh3lh3 | 2009-06-10 05:03:22 -0400 (Wed, 10 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/export2sam.pl - -fixed a typo - ------------------------------------------------------------------------- -r321 | lh3lh3 | 2009-06-09 04:21:48 -0400 (Tue, 09 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -just typo. 
no real change - ------------------------------------------------------------------------- -r320 | lh3lh3 | 2009-06-08 09:32:51 -0400 (Mon, 08 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - -a little bit code cleanup - ------------------------------------------------------------------------- -r319 | lh3lh3 | 2009-06-08 09:22:33 -0400 (Mon, 08 Jun 2009) | 4 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.3.1 - * change default parameters - * optionally print filtered variants - ------------------------------------------------------------------------- -r318 | lh3lh3 | 2009-06-08 09:14:26 -0400 (Mon, 08 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.3.0 - * combine snpFilter and indelFilter - ------------------------------------------------------------------------- -r317 | lh3lh3 | 2009-06-08 06:31:42 -0400 (Mon, 08 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.2.3 - * change a default parameter - ------------------------------------------------------------------------- -r316 | lh3lh3 | 2009-06-08 06:11:06 -0400 (Mon, 08 Jun 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - - * samtools-0.1.4-2 (r316) - * pileup: cap mapping quality at 60 (by default) - * pileup: always calculate RMS mapq - * pileup: allow to output variant sites only - ------------------------------------------------------------------------- -r312 | lh3lh3 | 2009-06-04 08:01:10 -0400 (Thu, 04 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.2.2 - * added pileup2fq - ------------------------------------------------------------------------- -r311 | lh3lh3 | 2009-06-03 04:40:40 -0400 (Wed, 03 Jun 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * in 
snpFilter, suppress non-SNP sites - ------------------------------------------------------------------------- -r310 | lh3lh3 | 2009-06-01 09:35:13 -0400 (Mon, 01 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.2.1 - * fixed a typo - ------------------------------------------------------------------------- -r309 | lh3lh3 | 2009-06-01 09:04:39 -0400 (Mon, 01 Jun 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.2.0 - * snpFilter - ------------------------------------------------------------------------- -r306 | lh3lh3 | 2009-05-28 06:49:35 -0400 (Thu, 28 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bgzf.c - - * minor changes to bgzf: return NULL if fd == -1 - * suggested by {kdj,jm18}@sanger.ac.uk - ------------------------------------------------------------------------- -r305 | lh3lh3 | 2009-05-28 06:16:08 -0400 (Thu, 28 May 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/interpolate_sam.pl - -Script for paired-end pileup, contributed by Stephen Montgomery. 
- ------------------------------------------------------------------------- -r304 | lh3lh3 | 2009-05-28 06:08:49 -0400 (Thu, 28 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - - * samtools-0.1.4-1 (r304) - * fixed a minor bug in printing headers - ------------------------------------------------------------------------- -r297 | lh3lh3 | 2009-05-21 11:06:16 -0400 (Thu, 21 May 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/maq2sam.c - M /trunk/samtools/samtools.1 - -Release samtools-0.1.4 - ------------------------------------------------------------------------- -r296 | lh3lh3 | 2009-05-21 07:53:14 -0400 (Thu, 21 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-24 (r296) - * another similar bug in the indel caller - ------------------------------------------------------------------------- -r295 | lh3lh3 | 2009-05-21 07:50:28 -0400 (Thu, 21 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-23 (r295) - * fixed a critical bug in the indel caller - ------------------------------------------------------------------------- -r294 | lh3lh3 | 2009-05-20 08:00:20 -0400 (Wed, 20 May 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_stat.c - -added a missing header file - ------------------------------------------------------------------------- -r293 | lh3lh3 | 2009-05-19 18:44:25 -0400 (Tue, 19 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-22 (r293) - * open tview in the dot-view mode by default - ------------------------------------------------------------------------- -r292 | lh3lh3 | 2009-05-18 16:01:23 -0400 (Mon, 18 May 2009) | 6 lines -Changed paths: - M /trunk/samtools/samtools.1 - -Added a 
note to the manual. Currently SAMtools used unaligned words in -several places. Although this does not cause bus errors to me, it may -affect portability. Please see the "Bus error" wiki page for more -information. Also thank James Bonfields for pointing this out. - - ------------------------------------------------------------------------- -r286 | lh3lh3 | 2009-05-14 10:23:13 -0400 (Thu, 14 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-21 (286) - * declare bam_aux_get_core() in bam.h - ------------------------------------------------------------------------- -r276 | lh3lh3 | 2009-05-13 05:07:55 -0400 (Wed, 13 May 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-20 (r276) - * remove bam1_t::hash again. We need to modify the Perl API anyway to - make it work with the latest SVN. - * As is suggested by Tim, scan "{base}.bai" and "{base}.bam.bai" for index - ------------------------------------------------------------------------- -r275 | lh3lh3 | 2009-05-12 16:14:10 -0400 (Tue, 12 May 2009) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam.h - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-19 (r275) - * a minor change to the bam1_t struct: added back "void *hash" for the - backward compatibility with Bio::DB::Sam - ------------------------------------------------------------------------- -r273 | lh3lh3 | 2009-05-12 09:28:39 -0400 (Tue, 12 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-18 (r273) - * rmdupse: do not remove unmapped reads - ------------------------------------------------------------------------- -r272 | lh3lh3 | 2009-05-12 09:20:00 -0400 (Tue, 12 May 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_rmdupse.c - -change a parameter. 
It does nothing - ------------------------------------------------------------------------- -r271 | lh3lh3 | 2009-05-12 09:17:58 -0400 (Tue, 12 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.am - M /trunk/samtools/Makefile.generic - M /trunk/samtools/Makefile.lite - A /trunk/samtools/bam_rmdupse.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/configure.ac - - * samtools-0.1.3-17 (r271) - * added 'rmdupse' command - ------------------------------------------------------------------------- -r267 | lh3lh3 | 2009-05-05 17:31:41 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.3-16 (r267) - * in sam_view.c, changed g_flag_on based on the suggestion by Angie Hinrichs - ------------------------------------------------------------------------- -r266 | lh3lh3 | 2009-05-05 17:23:27 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-15 (r266) - * report an error if a non-* reference is present while @SQ is absent - ------------------------------------------------------------------------- -r265 | lh3lh3 | 2009-05-05 17:09:00 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.3-14 (r262) - * make samopen() recognize @SQ header lines - ------------------------------------------------------------------------- -r261 | lh3lh3 | 2009-05-05 10:10:30 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam_view.c - - * samtools-0.1.3-13 (r260) - * report error for file I/O error - ------------------------------------------------------------------------- -r260 | lh3lh3 | 2009-05-05 10:01:16 -0400 (Tue, 05 May 2009) | 2 
lines -Changed paths: - M /trunk/samtools/Makefile.am - -update Makefile.am - ------------------------------------------------------------------------- -r259 | lh3lh3 | 2009-05-05 09:52:25 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/sam.c - M /trunk/samtools/sam.h - - * samtools-0.1.3-12 (r259) - * use the new I/O interface in pileup - ------------------------------------------------------------------------- -r258 | lh3lh3 | 2009-05-05 09:33:22 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.generic - M /trunk/samtools/Makefile.lite - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/sam.c - A /trunk/samtools/sam.h - A /trunk/samtools/sam_view.c - - * samtools-0.1.3-11 (r258) - * unify the interface to BAM and SAM I/O - ------------------------------------------------------------------------- -r257 | lh3lh3 | 2009-05-05 04:53:35 -0400 (Tue, 05 May 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile.lite - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-10 (r257) - * allow hex with "pileup -m" - ------------------------------------------------------------------------- -r256 | lh3lh3 | 2009-05-04 14:16:50 -0400 (Mon, 04 May 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-9 (r256) - * fixed a bug in bam_lpileup.c - * I do not know if this also fixes the bug causing assertion failure in the tview - ------------------------------------------------------------------------- -r251 | lh3lh3 | 2009-04-28 08:53:23 -0400 (Tue, 28 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-8 (r251) - * fixed a bug when there are reads without coordinates - 
------------------------------------------------------------------------- -r250 | lh3lh3 | 2009-04-28 08:43:33 -0400 (Tue, 28 Apr 2009) | 2 lines -Changed paths: - A /trunk/samtools/AUTHORS - A /trunk/samtools/README - M /trunk/samtools/cleanup.sh - -added missing files - ------------------------------------------------------------------------- -r249 | lh3lh3 | 2009-04-28 08:37:16 -0400 (Tue, 28 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.generic - M /trunk/samtools/Makefile.lite - M /trunk/samtools/configure.ac - M /trunk/samtools/misc/Makefile.generic - -improve large file support in compilation - ------------------------------------------------------------------------- -r248 | lh3lh3 | 2009-04-28 08:33:24 -0400 (Tue, 28 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/INSTALL - -update INSTALL - ------------------------------------------------------------------------- -r247 | lh3lh3 | 2009-04-28 08:28:50 -0400 (Tue, 28 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.am - M /trunk/samtools/autogen.sh - M /trunk/samtools/cleanup.sh - M /trunk/samtools/configure.ac - A /trunk/samtools/misc/Makefile.am - -fixed various issues about the GNU building scripts - ------------------------------------------------------------------------- -r246 | lh3lh3 | 2009-04-28 08:10:23 -0400 (Tue, 28 Apr 2009) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - D /trunk/samtools/Makefile - A /trunk/samtools/Makefile.am - A /trunk/samtools/Makefile.generic - A /trunk/samtools/autogen.sh - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - A /trunk/samtools/cleanup.sh - A /trunk/samtools/configure.ac - D /trunk/samtools/misc/Makefile - A /trunk/samtools/misc/Makefile.generic (from /trunk/samtools/misc/Makefile:245) - - * samtools-0.1.3-7 (r246) - * incorporated revisions from Nils Homer - * enhanced support of displaying color-space reads - 
------------------------------------------------------------------------- -r244 | lh3lh3 | 2009-04-25 06:49:40 -0400 (Sat, 25 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-6 (r244) - * fixed segfault for unmapped reads - ------------------------------------------------------------------------- -r243 | lh3lh3 | 2009-04-24 16:27:26 -0400 (Fri, 24 Apr 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-5 (r243) - * fixed a long existing bug which may cause memory leak - * check MD - * consensus calling now works with "=", but indel calling not - ------------------------------------------------------------------------- -r242 | lh3lh3 | 2009-04-24 15:44:46 -0400 (Fri, 24 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_md.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-4 (r242) - * fixed a memory leak - ------------------------------------------------------------------------- -r240 | lh3lh3 | 2009-04-24 11:40:18 -0400 (Fri, 24 Apr 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/Makefile.lite - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - A /trunk/samtools/bam_md.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-3 (r240) - * generate MD tag - * generate "=" bases - * the plain pileup now support "=" bases, but consensus calling and glfgen may fail - ------------------------------------------------------------------------- -r239 | lh3lh3 | 2009-04-24 07:08:20 -0400 (Fri, 24 Apr 2009) | 5 lines -Changed paths: - M /trunk/samtools/bam.h - M /trunk/samtools/bam_aux.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-2 (r239) - * fixed bugs in bam_aux.c (these functions nevered used by samtools) - * removed bam_aux_init()/bam_aux_destroy() - * added tagview for testing bam_aux - 
------------------------------------------------------------------------- -r235 | lh3lh3 | 2009-04-21 18:17:39 -0400 (Tue, 21 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_pileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.3-1 - * fixed a bug in pileup: the first read in a chromosome may not be printed - ------------------------------------------------------------------------- -r232 | lh3lh3 | 2009-04-16 10:25:43 -0400 (Thu, 16 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.lite - -a missing file in Makefile.lite - ------------------------------------------------------------------------- -r227 | lh3lh3 | 2009-04-15 17:02:53 -0400 (Wed, 15 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - -Release samtools-0.1.3 - ------------------------------------------------------------------------- -r223 | lh3lh3 | 2009-04-15 09:31:32 -0400 (Wed, 15 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-28 - * make samtools more robust to weird input such as empty file - ------------------------------------------------------------------------- -r222 | lh3lh3 | 2009-04-15 09:05:33 -0400 (Wed, 15 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/samtools.1 - -prepare for release 0.1.3 - ------------------------------------------------------------------------- -r221 | lh3lh3 | 2009-04-15 08:32:14 -0400 (Wed, 15 Apr 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/blast2sam.pl - -convert NCBI-BLASTN to SAM - ------------------------------------------------------------------------- -r220 | lh3lh3 | 2009-04-15 08:18:19 -0400 (Wed, 15 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-27 - * fixed a small memory leak in tview - ------------------------------------------------------------------------- -r219 | 
lh3lh3 | 2009-04-15 08:00:08 -0400 (Wed, 15 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_rmdup.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-26 - * fixed a bug in rmdup when there are unmapped reads - ------------------------------------------------------------------------- -r218 | lh3lh3 | 2009-04-14 17:28:58 -0400 (Tue, 14 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - -proposed NEWS for the new release (have not yet) - ------------------------------------------------------------------------- -r216 | lh3lh3 | 2009-04-14 17:10:46 -0400 (Tue, 14 Apr 2009) | 4 lines -Changed paths: - M /trunk/samtools/misc/samtools.pl - - * samtools.pl-0.1.1 - * improve indelFilter to avoid filtering true indels. The new filter relies - on the new pileup indel line implemented in samtools-0.1.2-25 - ------------------------------------------------------------------------- -r215 | lh3lh3 | 2009-04-14 17:04:19 -0400 (Tue, 14 Apr 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - - * samtools-0.1.2-25 - * change the pileup indel line to shows the number of alignments actually - containing indels - ------------------------------------------------------------------------- -r211 | lh3lh3 | 2009-04-13 07:07:13 -0400 (Mon, 13 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - -update ChangeLog from "svn log" - ------------------------------------------------------------------------- -r210 | lh3lh3 | 2009-04-12 15:57:05 -0400 (Sun, 12 Apr 2009) | 4 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kseq.h - - * samtools-0.1.2-24 - * in merge, gives a warning rather than error if the target sequence length is different - * allow empty header - 
------------------------------------------------------------------------- -r209 | lh3lh3 | 2009-04-12 15:32:44 -0400 (Sun, 12 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam.c - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-23 - * recognize '*' at the QUAL field - ------------------------------------------------------------------------- -r208 | lh3lh3 | 2009-04-12 15:08:02 -0400 (Sun, 12 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_import.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kseq.h - - * samtools-0.1.2-22 - * the field separater is TAB only, now - ------------------------------------------------------------------------- -r207 | lh3lh3 | 2009-04-08 10:18:03 -0400 (Wed, 08 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/examples/ex1.sam.gz - - * fixed the problem in the example alignment due to the bug in fixmate - ------------------------------------------------------------------------- -r206 | lh3lh3 | 2009-04-08 10:15:05 -0400 (Wed, 08 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_mate.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/soap2sam.pl - - * samtools-0.1.2-21 - * fixed a nasty bug in `fixmate' - ------------------------------------------------------------------------- -r205 | lh3lh3 | 2009-04-08 05:57:08 -0400 (Wed, 08 Apr 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/bowtie2sam.pl - M /trunk/samtools/misc/soap2sam.pl - M /trunk/samtools/misc/wgsim_eval.pl - -make the script robust to the bugs in SOAP-2.1.7 - ------------------------------------------------------------------------- -r200 | lh3lh3 | 2009-04-02 10:14:56 -0400 (Thu, 02 Apr 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-20 - * check if file is truncated in flagstat - ------------------------------------------------------------------------- -r199 | lh3lh3 | 2009-04-02 10:09:10 -0400 (Thu, 02 Apr 2009) | 3 lines 
-Changed paths: - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-19 - * print the header if requested - ------------------------------------------------------------------------- -r193 | lh3lh3 | 2009-03-27 11:09:50 -0400 (Fri, 27 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-18 - * fixed a minor bug reported by Nils Homer - ------------------------------------------------------------------------- -r185 | lh3lh3 | 2009-03-24 07:50:32 -0400 (Tue, 24 Mar 2009) | 2 lines -Changed paths: - A /trunk/samtools/Makefile (from /trunk/samtools/Makefile.std:184) - D /trunk/samtools/Makefile.std - A /trunk/samtools/misc/Makefile (from /trunk/samtools/misc/Makefile.std:184) - D /trunk/samtools/misc/Makefile.std - -rename Makefile.std as Makefile. GNU building systerm is not ready and may take some time... - ------------------------------------------------------------------------- -r184 | lh3lh3 | 2009-03-24 06:36:38 -0400 (Tue, 24 Mar 2009) | 4 lines -Changed paths: - D /trunk/samtools/Makefile - A /trunk/samtools/Makefile.std (from /trunk/samtools/Makefile:183) - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bam_tview.c - M /trunk/samtools/bamtk.c - D /trunk/samtools/misc/Makefile - A /trunk/samtools/misc/Makefile.std (from /trunk/samtools/misc/Makefile:182) - M /trunk/samtools/samtools.1 - - * samtools-0.1.2-17 - * incorporating Nils' changes - * rename Makefile to Makefile.std and prepare to add the GNU building systerms (also by Nils) - ------------------------------------------------------------------------- -r183 | lh3lh3 | 2009-03-24 06:30:23 -0400 (Tue, 24 Mar 2009) | 4 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/kseq.h - A /trunk/samtools/kstring.c - A /trunk/samtools/kstring.h - - * samtools-0.1.2-16 - * made 
pileup take a list of proposed indels. An insertion is N at the moment. - * added my kstring library for a bit complex parsing of the position list. - ------------------------------------------------------------------------- -r169 | lh3lh3 | 2009-03-12 09:40:14 -0400 (Thu, 12 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/soap2sam.pl - - * soap2sam.pl-0.1.2 - * more robust to truncated soap output - ------------------------------------------------------------------------- -r168 | lh3lh3 | 2009-03-11 06:49:00 -0400 (Wed, 11 Mar 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile.lite - -added bam_stat.o to Makefile.lite - ------------------------------------------------------------------------- -r167 | lh3lh3 | 2009-03-10 18:11:31 -0400 (Tue, 10 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-15 - * generate RMS of mapQ instead of max mapQ - ------------------------------------------------------------------------- -r166 | lh3lh3 | 2009-03-10 18:06:45 -0400 (Tue, 10 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/glf.c - M /trunk/samtools/glf.h - M /trunk/samtools/misc/Makefile - - * samtools-0.1.2-14 - * implemented GLFv3 - ------------------------------------------------------------------------- -r159 | lh3lh3 | 2009-03-03 06:26:08 -0500 (Tue, 03 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-13 - * fixed a minor bug in displaying pileup - ------------------------------------------------------------------------- -r158 | lh3lh3 | 2009-03-03 06:24:16 -0500 (Tue, 03 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-12 - * optionally print SAM header - ------------------------------------------------------------------------- -r153 | lh3lh3 | 2009-03-02 05:45:28 -0500 (Mon, 02 
Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/bamtk.c - M /trunk/samtools/glf.c - - * samtools-0.1.2-11 - * use "GLF\3" as the magic for GLFv3 files - ------------------------------------------------------------------------- -r152 | lh3lh3 | 2009-03-02 05:39:09 -0500 (Mon, 02 Mar 2009) | 5 lines -Changed paths: - M /trunk/samtools/Makefile - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/glf.c - M /trunk/samtools/glf.h - - * samtools-0.1.2-10 - * fixed a bug in import: core.bin is undefined for unmapped reads - * this bug can be alleviated (not completely solved) in bam_index.c - * update to GLFv3: pos is changed to offset for better compression - ------------------------------------------------------------------------- -r151 | lh3lh3 | 2009-03-01 10:18:43 -0500 (Sun, 01 Mar 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/wgsim.c - - * wgsim-0.2.3 - * fixed a bug in simulating indels - ------------------------------------------------------------------------- -r145 | lh3lh3 | 2009-02-26 14:43:57 -0500 (Thu, 26 Feb 2009) | 4 lines -Changed paths: - M /trunk/samtools/misc/wgsim.c - - * wgsim-0.2.2 - * allow to print mismatch information as fastq comment. MAQ does - not like long read names. 
- ------------------------------------------------------------------------- -r141 | lh3lh3 | 2009-02-26 09:53:03 -0500 (Thu, 26 Feb 2009) | 6 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/misc/wgsim.c - M /trunk/samtools/misc/wgsim_eval.pl - - * wgsim-0.2.1 - * fixed a bug about color read coordinates - * fixed a bug in read names - * wgsim_eval.pl-0.1.3 - * make the script work with color reads - ------------------------------------------------------------------------- -r140 | lh3lh3 | 2009-02-26 09:02:57 -0500 (Thu, 26 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/Makefile - M /trunk/samtools/misc/wgsim.c - - * wgsim: added a note - ------------------------------------------------------------------------- -r139 | lh3lh3 | 2009-02-26 06:39:08 -0500 (Thu, 26 Feb 2009) | 7 lines -Changed paths: - M /trunk/samtools/misc/wgsim.c - M /trunk/samtools/misc/wgsim_eval.pl - - * wgsim-0.2.0 - * considerable code clean up - * print number of substitutions/indels/errors on each read - * potentially support SOLiD simulation, though not tested at the moment - * wgsim_eval.pl-0.1.2 - * change in accordant with wgsim - ------------------------------------------------------------------------- -r129 | lh3lh3 | 2009-02-18 17:23:27 -0500 (Wed, 18 Feb 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-9 - * fixed a bug in bam_fetch, caused by completely contained adjacent chunks - ------------------------------------------------------------------------- -r128 | bhandsaker | 2009-02-18 14:06:57 -0500 (Wed, 18 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/bamtk.c - -Fix annoying segv when invalid region specified. 
- ------------------------------------------------------------------------- -r127 | lh3lh3 | 2009-02-17 05:49:55 -0500 (Tue, 17 Feb 2009) | 2 lines -Changed paths: - D /trunk/samtools/misc/indel_filter.pl - A /trunk/samtools/misc/samtools.pl - - * move indel_filter.pl to samtools.pl - ------------------------------------------------------------------------- -r126 | lh3lh3 | 2009-02-14 16:22:30 -0500 (Sat, 14 Feb 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_mate.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-7 - * fixed a bug in fixmate: SE reads are flagged as BAM_FMUNMAP - ------------------------------------------------------------------------- -r125 | lh3lh3 | 2009-02-13 04:54:45 -0500 (Fri, 13 Feb 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-7 - * fixed a minor bug in flagstat - ------------------------------------------------------------------------- -r124 | lh3lh3 | 2009-02-12 06:15:32 -0500 (Thu, 12 Feb 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/misc/indel_filter.pl - - * samtools-0.1.2-6 - * improve indel caller by setting maximum window size - ------------------------------------------------------------------------- -r123 | lh3lh3 | 2009-02-12 05:30:29 -0500 (Thu, 12 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bamtk.c - - * output max mapping quality in indel line - ------------------------------------------------------------------------- -r122 | lh3lh3 | 2009-02-11 05:59:10 -0500 (Wed, 11 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/misc/maq2sam.c - -fixed a bug in generating tag AM - ------------------------------------------------------------------------- -r121 | lh3lh3 | 2009-02-03 05:43:11 -0500 (Tue, 03 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_index.c - M /trunk/samtools/bamtk.c - -fixed a potential memory problem in indexing - 
------------------------------------------------------------------------- -r120 | bhandsaker | 2009-02-02 10:52:52 -0500 (Mon, 02 Feb 2009) | 2 lines -Changed paths: - M /trunk/samtools/Makefile - -Pass LIBS to recursive targets to facilitate building at Broad. - ------------------------------------------------------------------------- -r119 | lh3lh3 | 2009-02-02 05:12:15 -0500 (Mon, 02 Feb 2009) | 4 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/bam_plcmd.c - M /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-3 - * fixed a bug in generating GLFv2 for indels - * improve flagstat report a little bit - ------------------------------------------------------------------------- -r118 | lh3lh3 | 2009-01-29 07:33:23 -0500 (Thu, 29 Jan 2009) | 3 lines -Changed paths: - M /trunk/samtools/Makefile - A /trunk/samtools/bam_stat.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.2-1 - * added flagstat command - ------------------------------------------------------------------------- -r116 | lh3lh3 | 2009-01-28 08:31:12 -0500 (Wed, 28 Jan 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/NEWS - M /trunk/samtools/bamtk.c - M /trunk/samtools/samtools.1 - -Release SAMtools-0.1.2 - ------------------------------------------------------------------------- -r115 | lh3lh3 | 2009-01-28 07:54:08 -0500 (Wed, 28 Jan 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/indel_filter.pl - -Script for filtering indel results - ------------------------------------------------------------------------- -r114 | lh3lh3 | 2009-01-25 06:45:37 -0500 (Sun, 25 Jan 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/zoom2sam.pl - -convert ZOOM to SAM - ------------------------------------------------------------------------- -r113 | lh3lh3 | 2009-01-24 09:25:07 -0500 (Sat, 24 Jan 2009) | 2 lines -Changed paths: - A /trunk/samtools/misc/novo2sam.pl - -add a script to convert novo alignment to SAM - 
------------------------------------------------------------------------- -r112 | lh3lh3 | 2009-01-23 15:57:39 -0500 (Fri, 23 Jan 2009) | 2 lines -Changed paths: - M /trunk/samtools/ChangeLog - M /trunk/samtools/ChangeLog.old - M /trunk/samtools/samtools.1 - -update documentation and ChangeLog - ------------------------------------------------------------------------- -r111 | lh3lh3 | 2009-01-23 14:22:59 -0500 (Fri, 23 Jan 2009) | 3 lines -Changed paths: - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - - * samtools-0.1.1-19 - * fixed a bug in "merge" command line - ------------------------------------------------------------------------- -r110 | lh3lh3 | 2009-01-22 10:36:48 -0500 (Thu, 22 Jan 2009) | 3 lines -Changed paths: - M /trunk/samtools/misc/Makefile - A /trunk/samtools/misc/bowtie2sam.pl (from /branches/dev/samtools/misc/bowtie2sam.pl:108) - M /trunk/samtools/misc/export2sam.pl - A /trunk/samtools/misc/soap2sam.pl (from /branches/dev/samtools/misc/soap2sam.pl:108) - A /trunk/samtools/misc/wgsim.c (from /branches/dev/samtools/misc/wgsim.c:108) - A /trunk/samtools/misc/wgsim_eval.pl (from /branches/dev/samtools/misc/wgsim_eval.pl:108) - - * merge from branches/dev/ - * all future development will happen here - ------------------------------------------------------------------------- -r109 | lh3lh3 | 2009-01-22 10:14:27 -0500 (Thu, 22 Jan 2009) | 3 lines -Changed paths: - M /trunk/samtools/COPYING - M /trunk/samtools/ChangeLog - A /trunk/samtools/INSTALL (from /branches/dev/samtools/INSTALL:108) - M /trunk/samtools/Makefile - A /trunk/samtools/Makefile.lite (from /branches/dev/samtools/Makefile.lite:108) - M /trunk/samtools/bam.c - M /trunk/samtools/bam.h - M /trunk/samtools/bam_import.c - M /trunk/samtools/bam_index.c - M /trunk/samtools/bam_lpileup.c - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_maqcns.h - A /trunk/samtools/bam_mate.c (from /branches/dev/samtools/bam_mate.c:108) - M /trunk/samtools/bam_pileup.c - M 
/trunk/samtools/bam_plcmd.c - A /trunk/samtools/bam_rmdup.c (from /branches/dev/samtools/bam_rmdup.c:108) - M /trunk/samtools/bam_sort.c - M /trunk/samtools/bamtk.c - M /trunk/samtools/bgzf.h - M /trunk/samtools/examples/00README.txt - A /trunk/samtools/examples/Makefile (from /branches/dev/samtools/examples/Makefile:108) - D /trunk/samtools/examples/ex1.fa.fai - M /trunk/samtools/examples/ex1.sam.gz - M /trunk/samtools/faidx.c - A /trunk/samtools/glf.c (from /branches/dev/samtools/glf.c:108) - M /trunk/samtools/glf.h - M /trunk/samtools/misc/Makefile - M /trunk/samtools/misc/maq2sam.c - M /trunk/samtools/razf.c - M /trunk/samtools/source.dot - - * Merge from branches/dev/ - * all future development will happen here at trunk/ - ------------------------------------------------------------------------- -r79 | bhandsaker | 2009-01-07 16:42:15 -0500 (Wed, 07 Jan 2009) | 2 lines -Changed paths: - M /trunk/samtools/bam_maqcns.c - M /trunk/samtools/bam_tview.c - -Fix problem with compiling without curses. 
- ------------------------------------------------------------------------- -r63 | lh3lh3 | 2008-12-22 10:58:02 -0500 (Mon, 22 Dec 2008) | 2 lines -Changed paths: - A /trunk/samtools (from /branches/dev/samtools:62) - -Create trunk copy - ------------------------------------------------------------------------- -r62 | lh3lh3 | 2008-12-22 10:55:13 -0500 (Mon, 22 Dec 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/NEWS - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/samtools.1 - -Release samtools-0.1.1 - ------------------------------------------------------------------------- -r61 | lh3lh3 | 2008-12-22 10:46:08 -0500 (Mon, 22 Dec 2008) | 10 lines -Changed paths: - M /branches/dev/samtools/bam_aux.c - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bam_plcmd.c - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/razf.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-66 - * fixed a bug in razf.c: reset z_eof when razf_seek() is called - * fixed a memory leak in parsing a region - * changed pileup a little bit when -s is in use: output ^ and $ - * when a bam is not indexed, output more meaningful error message - * fixed a bug in indexing for small alignment - * fixed a bug in the viewer when we come to the end of a reference file - * updated documentation - * prepare to release 0.1.1 - ------------------------------------------------------------------------- -r60 | lh3lh3 | 2008-12-22 10:10:16 -0500 (Mon, 22 Dec 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/examples - A /branches/dev/samtools/examples/00README.txt - A /branches/dev/samtools/examples/ex1.fa - A /branches/dev/samtools/examples/ex1.fa.fai - A /branches/dev/samtools/examples/ex1.sam.gz - -example - ------------------------------------------------------------------------- -r59 | lh3lh3 | 2008-12-22 04:38:15 -0500 (Mon, 22 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/ChangeLog - 
-update ChangeLog - ------------------------------------------------------------------------- -r58 | lh3lh3 | 2008-12-20 18:06:00 -0500 (Sat, 20 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/misc/export2sam.pl - - * added comments - * fixed several bugs - ------------------------------------------------------------------------- -r57 | lh3lh3 | 2008-12-20 10:44:20 -0500 (Sat, 20 Dec 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/misc/export2sam.pl - -convert Export format to SAM; not thoroughly tested - ------------------------------------------------------------------------- -r56 | lh3lh3 | 2008-12-19 17:13:28 -0500 (Fri, 19 Dec 2008) | 6 lines -Changed paths: - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bam_plcmd.c - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - A /branches/dev/samtools/source.dot - - * samtools-0.1.0-65 - * pileup: generate maq-like simple output - * pileup: allow to output pileup at required sites - * source.dot: source file relationship graph - * tview: fixed a minor bug - ------------------------------------------------------------------------- -r55 | lh3lh3 | 2008-12-19 15:10:26 -0500 (Fri, 19 Dec 2008) | 2 lines -Changed paths: - D /branches/dev/samtools/misc/all2sam.pl - -remove all2sam.pl - ------------------------------------------------------------------------- -r54 | lh3lh3 | 2008-12-16 17:34:25 -0500 (Tue, 16 Dec 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/COPYING - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/faidx.h - M /branches/dev/samtools/khash.h - M /branches/dev/samtools/kseq.h - M /branches/dev/samtools/ksort.h - M /branches/dev/samtools/samtools.1 - -Added copyright information and a bit more documentation. No code change. 
- ------------------------------------------------------------------------- -r53 | lh3lh3 | 2008-12-16 08:40:18 -0500 (Tue, 16 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam.c - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-64 - * improved efficiency of the indel caller for spliced alignments - ------------------------------------------------------------------------- -r52 | lh3lh3 | 2008-12-16 05:28:20 -0500 (Tue, 16 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam.c - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_aux.c - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-63 - * a bit code cleanup: reduce the dependency between source files - ------------------------------------------------------------------------- -r51 | lh3lh3 | 2008-12-15 09:29:32 -0500 (Mon, 15 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bam_plcmd.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-62 - * fixed a memory leak - ------------------------------------------------------------------------- -r50 | lh3lh3 | 2008-12-15 09:00:13 -0500 (Mon, 15 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/ChangeLog - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/samtools.1 - -update documentation, ChangeLog and a comment - ------------------------------------------------------------------------- -r49 | lh3lh3 | 2008-12-15 08:36:43 -0500 (Mon, 15 Dec 2008) | 6 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bam_maqcns.h - M /branches/dev/samtools/bam_pileup.c - A /branches/dev/samtools/bam_plcmd.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-61 - 
* moved pileup command to a separate source file - * added indel caller - * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!) - * updated documentation - ------------------------------------------------------------------------- -r48 | lh3lh3 | 2008-12-12 08:55:36 -0500 (Fri, 12 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-60 - * fixed another bug in maqcns when there is a nearby deletion - ------------------------------------------------------------------------- -r47 | lh3lh3 | 2008-12-12 08:42:16 -0500 (Fri, 12 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-59 - * pileup: outputing consensus is now optional - * fixed a bug in glfgen. This bug also exists in maq's glfgen. However, - I am not quite sure why the previous version may have problem. - ------------------------------------------------------------------------- -r46 | lh3lh3 | 2008-12-12 06:44:56 -0500 (Fri, 12 Dec 2008) | 6 lines -Changed paths: - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-58 - * add maq consensus to pileup. However, I will move this part to a new - command as strictly speaking, consensus callin is not part of pileup, - and imposing it would make it harder to generate for other language - bindings. - ------------------------------------------------------------------------- -r45 | bhandsaker | 2008-12-11 15:43:56 -0500 (Thu, 11 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/bgzf.c - -Fix bug in tell() after reads that consume to the exact end of a block. 
- ------------------------------------------------------------------------- -r44 | lh3lh3 | 2008-12-11 04:36:53 -0500 (Thu, 11 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/samtools.1 - -update manual - ------------------------------------------------------------------------- -r43 | lh3lh3 | 2008-12-11 04:25:36 -0500 (Thu, 11 Dec 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-57 - * fixed a bug in parser when there is auxiliary fields - * made the parser a bit more robust - ------------------------------------------------------------------------- -r42 | lh3lh3 | 2008-12-10 09:57:29 -0500 (Wed, 10 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/bgzf.c - - * samtools-0.1.0-56 - * fixed a bug in bgzf (only reading is affected) - * fixed a typo in bam_index.c - * in bam_index.c, check potential bugs in the underlying I/O library - ------------------------------------------------------------------------- -r41 | lh3lh3 | 2008-12-10 07:53:08 -0500 (Wed, 10 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/samtools.1 - -update manual - ------------------------------------------------------------------------- -r40 | lh3lh3 | 2008-12-10 06:52:10 -0500 (Wed, 10 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-55 - * tried to make pileup work with clipping (previously not), though NOT tested - * removed -v from pileup - * made pileup take the reference sequence - ------------------------------------------------------------------------- -r39 | lh3lh3 | 2008-12-09 06:59:28 -0500 (Tue, 09 Dec 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-54 - * in 
parser, recognize "=", rather than ",", as a match - * in parser, correctly parse "=" at the MRNM field. - ------------------------------------------------------------------------- -r38 | lh3lh3 | 2008-12-09 06:39:07 -0500 (Tue, 09 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/maq2sam.c - -fixed a bug in handling maq flag 64 and 192 - ------------------------------------------------------------------------- -r37 | lh3lh3 | 2008-12-09 04:53:46 -0500 (Tue, 09 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/md5fa.c - -also calculate unordered md5sum check - ------------------------------------------------------------------------- -r36 | lh3lh3 | 2008-12-09 04:46:21 -0500 (Tue, 09 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/md5fa.c - -fixed a minor bug when there are spaces in the sequence - ------------------------------------------------------------------------- -r35 | lh3lh3 | 2008-12-09 04:40:45 -0500 (Tue, 09 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/md5fa.c - -fixed a potential memory leak - ------------------------------------------------------------------------- -r34 | lh3lh3 | 2008-12-08 09:52:17 -0500 (Mon, 08 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bamtk.c - - * fixed a bug in import: bin is wrongly calculated - ------------------------------------------------------------------------- -r33 | lh3lh3 | 2008-12-08 09:08:01 -0500 (Mon, 08 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/misc/all2sam.pl - -nothing, really - ------------------------------------------------------------------------- -r32 | lh3lh3 | 2008-12-08 07:56:02 -0500 (Mon, 08 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/kseq.h - M /branches/dev/samtools/misc/Makefile - A /branches/dev/samtools/misc/md5.c - A 
/branches/dev/samtools/misc/md5.h - A /branches/dev/samtools/misc/md5fa.c - - * fixed two warnings in kseq.h - * added md5sum utilities - ------------------------------------------------------------------------- -r31 | lh3lh3 | 2008-12-08 06:35:29 -0500 (Mon, 08 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bamtk.c - A /branches/dev/samtools/kseq.h - D /branches/dev/samtools/kstream.h - - * samtools-0.1.0-52 - * replace kstream with kseq. kseq is a superset of kstream. I need the - extra functions in kseq.h. - * also compile stand-alone faidx - ------------------------------------------------------------------------- -r30 | lh3lh3 | 2008-12-08 06:17:04 -0500 (Mon, 08 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_sort.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-51 - * sorting by read names is available - ------------------------------------------------------------------------- -r29 | lh3lh3 | 2008-12-08 05:29:02 -0500 (Mon, 08 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam.c - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bam_sort.c - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/misc/maq2sam.c - - * samtools-0.1.0-50 - * format change to meet the latest specification - ------------------------------------------------------------------------- -r28 | lh3lh3 | 2008-12-04 11:09:21 -0500 (Thu, 04 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/misc/maq2sam.c - - * minor change in maqcns: special care when n==0 - * change maq2sam to meet the latest specification - ------------------------------------------------------------------------- -r27 | lh3lh3 | 
2008-12-04 10:55:44 -0500 (Thu, 04 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/razf.c - M /branches/dev/samtools/razf.h - -considerable code clean up in razf - ------------------------------------------------------------------------- -r26 | lh3lh3 | 2008-12-04 10:08:18 -0500 (Thu, 04 Dec 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/ChangeLog - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/faidx.c - -make RAZF optional in faidx.c - ------------------------------------------------------------------------- -r25 | lh3lh3 | 2008-12-01 10:27:22 -0500 (Mon, 01 Dec 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/bam_aux.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-49 - * added routines for retrieving aux data, NOT TESTED YET! - ------------------------------------------------------------------------- -r24 | lh3lh3 | 2008-12-01 09:29:43 -0500 (Mon, 01 Dec 2008) | 5 lines -Changed paths: - M /branches/dev/samtools/bam.c - M /branches/dev/samtools/bam_import.c - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/bgzf.c - M /branches/dev/samtools/samtools.1 - - * samtools-0.1.0-48 - * bgzf: fixed a potential integer overflow on 32-bit machines - * maqcns: set the minimum combined quality as 0 - * supporting hex strings - ------------------------------------------------------------------------- -r23 | lh3lh3 | 2008-11-27 12:14:37 -0500 (Thu, 27 Nov 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/bam_maqcns.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-47 - * fixed the bug in maqcns - ------------------------------------------------------------------------- -r22 | lh3lh3 | 2008-11-27 12:08:11 -0500 (Thu, 27 Nov 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam.h - A 
/branches/dev/samtools/bam_maqcns.c - A /branches/dev/samtools/bam_maqcns.h - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - A /branches/dev/samtools/glf.h - - * samtools-0.1.0-46 - * add MAQ consensus caller, currently BUGGY! - ------------------------------------------------------------------------- -r21 | lh3lh3 | 2008-11-27 08:51:28 -0500 (Thu, 27 Nov 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bam_pileup.c - M /branches/dev/samtools/bam_tview.c - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-45 - * tview: display padded alignment (but not P operation) - * better coordinates and reference sequence - ------------------------------------------------------------------------- -r19 | lh3lh3 | 2008-11-27 04:26:05 -0500 (Thu, 27 Nov 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/ChangeLog - -new ChangeLog - ------------------------------------------------------------------------- -r18 | lh3lh3 | 2008-11-27 04:24:45 -0500 (Thu, 27 Nov 2008) | 3 lines -Changed paths: - D /branches/dev/samtools/ChangeLog - A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6) - -Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from -the log of my personal SVN repository. 
- ------------------------------------------------------------------------- -r17 | lh3lh3 | 2008-11-27 04:22:55 -0500 (Thu, 27 Nov 2008) | 6 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/bgzf.c - - * samtools-0.1.0-44 - * declare fseeko and ftello as some Linux may not do this by default and - missing these declarations will make bgzf buggy - * get rid of some harmless warnings - * use BGZF by default, now - ------------------------------------------------------------------------- -r16 | lh3lh3 | 2008-11-26 16:19:11 -0500 (Wed, 26 Nov 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bam_index.c - M /branches/dev/samtools/bamtk.c - M /branches/dev/samtools/razf.c - - * samtools-0.1.0-43 - * fixed a bug in razf_read() - * give more warnings when the file is truncated (or due to bugs in I/O library) - ------------------------------------------------------------------------- -r15 | lh3lh3 | 2008-11-26 15:41:39 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/bgzf.c - -fixed a bug in bgzf.c at the end of the file - ------------------------------------------------------------------------- -r14 | lh3lh3 | 2008-11-26 12:05:18 -0500 (Wed, 26 Nov 2008) | 4 lines -Changed paths: - M /branches/dev/samtools/bamtk.c - - * samtools-0.1.0-42 - * a lot happened to RAZF, although samtools itself is untouched. Better - also update the version number anyway to avoid confusion - ------------------------------------------------------------------------- -r13 | lh3lh3 | 2008-11-26 12:03:48 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/razf.c - -a change from Jue, but I think it should not matter - ------------------------------------------------------------------------- -r12 | lh3lh3 | 2008-11-26 11:48:14 -0500 (Wed, 26 Nov 2008) | 3 lines -Changed paths: - M /branches/dev/samtools/razf.c - -fixed a potential bug in razf. 
However, it seems still buggy, just -rarely happens, very rarely. - ------------------------------------------------------------------------- -r11 | lh3lh3 | 2008-11-26 09:02:56 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/razf.c - -fixed a bug in razf, with the help of Jue - ------------------------------------------------------------------------- -r10 | lh3lh3 | 2008-11-26 06:55:32 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/bam_index.c - -remove a comment - ------------------------------------------------------------------------- -r9 | lh3lh3 | 2008-11-26 06:37:05 -0500 (Wed, 26 Nov 2008) | 2 lines -Changed paths: - M /branches/dev/samtools/Makefile - M /branches/dev/samtools/bam.h - M /branches/dev/samtools/razf.c - M /branches/dev/samtools/razf.h - - * Jue has updated razf to realize Bob's scheme - ------------------------------------------------------------------------- -r7 | lh3lh3 | 2008-11-25 15:37:37 -0500 (Tue, 25 Nov 2008) | 2 lines -Changed paths: - A /branches/dev/samtools/samtools.1 - -the manual page - ------------------------------------------------------------------------- -r6 | lh3lh3 | 2008-11-25 15:37:16 -0500 (Tue, 25 Nov 2008) | 3 lines -Changed paths: - A /branches/dev/samtools/ChangeLog - A /branches/dev/samtools/Makefile - A /branches/dev/samtools/bam.c - A /branches/dev/samtools/bam.h - A /branches/dev/samtools/bam_aux.c - A /branches/dev/samtools/bam_endian.h - A /branches/dev/samtools/bam_import.c - A /branches/dev/samtools/bam_index.c - A /branches/dev/samtools/bam_lpileup.c - A /branches/dev/samtools/bam_pileup.c - A /branches/dev/samtools/bam_sort.c - A /branches/dev/samtools/bam_tview.c - A /branches/dev/samtools/bamtk.c - A /branches/dev/samtools/bgzf.c - A /branches/dev/samtools/bgzf.h - A /branches/dev/samtools/bgzip.c - A /branches/dev/samtools/faidx.c - A /branches/dev/samtools/faidx.h - A /branches/dev/samtools/khash.h - A /branches/dev/samtools/ksort.h - A 
/branches/dev/samtools/kstream.h - A /branches/dev/samtools/misc - A /branches/dev/samtools/misc/Makefile - A /branches/dev/samtools/misc/all2sam.pl - A /branches/dev/samtools/misc/maq2sam.c - A /branches/dev/samtools/razf.c - A /branches/dev/samtools/razf.h - A /branches/dev/samtools/razip.c - A /branches/dev/samtools/zutil.h - -The initial version of samtools, replicated from my local SVN repository. -The current version is: 0.1.0-42. All future development will happen here. - ------------------------------------------------------------------------- -r5 | lh3lh3 | 2008-11-25 15:30:49 -0500 (Tue, 25 Nov 2008) | 2 lines -Changed paths: - A /branches/dev/samtools - -samtools (C version) - ------------------------------------------------------------------------- diff --git a/sam/ChangeLog.old b/sam/ChangeLog.old new file mode 100644 index 0000000..19aefae --- /dev/null +++ b/sam/ChangeLog.old @@ -0,0 +1,3875 @@ +commit db2ad3e19068cbafde72ecde75d0638bbb3598ba +Author: Heng Li +Date: Thu Feb 16 14:45:17 2012 -0500 + + removed downsample.c + +commit 6c55c576903992c6fef148fe3b606fbc8bd10655 +Author: Heng Li +Date: Thu Feb 16 14:45:06 2012 -0500 + + print to output + +commit db1044a34e6049c87eaa63c39ed6e56f03e7d4c1 +Author: Heng Li +Date: Thu Feb 16 14:39:34 2012 -0500 + + removed sample + + Downsampling already exists in "view". View also keeps pairing while "sample" does not. 
+ +commit ffdeed3e5d4a530bfdf6f9ba97fff0ba7add6cba +Merge: 2daad7b accf026 +Author: Heng Li +Date: Thu Feb 16 14:22:15 2012 -0500 + + Merge branch 'master' of github.com:lh3/samtools + +commit accf0260fd1117e10047344345d40b31a9ec31bb +Merge: 9134e0d c554160 +Author: Heng Li +Date: Thu Feb 16 11:21:14 2012 -0800 + + Merge pull request #8 from nh13/master + + Patches + +commit c554160df16ec7748cfdda4c7b54c641be7b809f +Author: Nils Homer +Date: Thu Feb 16 14:06:52 2012 -0500 + + * more README.md work + +commit 2a81ffe349208d917666808fbc9f3041e0cb57de +Author: Nils Homer +Date: Thu Feb 16 14:06:10 2012 -0500 + + * more README work + +commit fb3125f732715f62cded8685a23a002a96ce009b +Author: Nils Homer +Date: Thu Feb 16 14:05:19 2012 -0500 + + * more README work + +commit 444d41002c37e1c3d0f9208b4a88126c47276386 +Author: Nils Homer +Date: Thu Feb 16 14:02:13 2012 -0500 + + * updating README + +commit dec53cb1043fe7efadfde75fa2fd39b76de22e54 +Author: Nils Homer +Date: Thu Feb 16 13:55:01 2012 -0500 + + updating the README for markdown syntax + +commit 798da18c346dca8ec6005582a0ddb1d5420b04ca +Author: Nils Homer +Date: Thu Feb 16 13:48:35 2012 -0500 + + adding a README with the current differences between this repository and + the official one + +commit 4d22d86c0f28636662f2144a88cd168e104c4275 +Author: Nils Homer +Date: Thu Feb 16 13:35:03 2012 -0500 + + adding "samtools sample" to the main + +commit 893c25a37c21005dc42f45d45e9ad78ddc5f29bb +Author: Nils Homer +Date: Thu Feb 16 13:33:51 2012 -0500 + + * removing some compile flags to work with OS X + +commit 7ac22f72fdc32edd5c24af6baebfa7db5faf8e7b +Author: Jonathan Manning +Date: Thu Feb 16 10:47:14 2012 -0500 + + Check write filehandle after opening for write. tamw/tamr is a union type, so change is only semantic. 
+ + Signed-off-by: Nils Homer + +commit fef53330416631690f60fdff42b6e43d764170dc +Author: Jonathan Manning +Date: Thu Feb 16 10:44:59 2012 -0500 + + Catch and report invalid BAM header, instead of segfaulting later on. + + Signed-off-by: Nils Homer + +commit 5cc013fe4930bf9b6e7963aab1cd4a3c94f695bc +Author: Jonathan Manning +Date: Thu Feb 16 10:44:16 2012 -0500 + + Add downsample to examples. + + Signed-off-by: Nils Homer + +commit b3fa9e7071532905a81dc7aa48eadc24b8c8846b +Author: Jonathan Manning +Date: Thu Feb 16 10:43:48 2012 -0500 + + Adjust for leading hard clip on colorspace reads. + + Signed-off-by: Nils Homer + +commit 1a9296c1389469d1c1db5b8069f0e11ffcc8abb2 +Author: Jonathan Manning +Date: Thu Feb 16 10:42:52 2012 -0500 + + Add samtools sample command, contributed by Davide Cittaro . + + Signed-off-by: Nils Homer + +commit 2a804f3379748aeba944f1dec306dd726ff3235e +Author: Jonathan Manning +Date: Thu Feb 16 10:42:07 2012 -0500 + + Add samtools qa command, contributed by Roman Valls Guimera . + + Signed-off-by: Nils Homer + +commit 0f3207fe8fd93e44d40fcf57204079c8c06d24a6 +Author: Jonathan Manning +Date: Thu Feb 16 10:39:08 2012 -0500 + + Makefile cleanup - allow CC, CFLAGS, LDFLAGS to be passed on make command line. Use LDFLAGS in samtools compile. + + Signed-off-by: Nils Homer + +commit 6e7df604025f6a86881bf7f4a16f30e15d31538a +Author: Jonathan Manning +Date: Thu Feb 16 10:31:15 2012 -0500 + + Allow max_mem for sort to be specified with units. + + Signed-off-by: Nils Homer + +commit f12ebcaf6e60d34180a27d70e09b743cef140b98 +Author: Jonathan Manning +Date: Thu Feb 16 10:29:11 2012 -0500 + + Allow user defined [lowercase] tags in header elements. + + Signed-off-by: Nils Homer + +commit 50b931fa3312dc109537a4260698ddecd0f06a05 +Author: Jonathan Manning +Date: Thu Feb 16 10:27:11 2012 -0500 + + Check lowerbound in text entry box to avoid segfault in tview. Remove redundant call to bam_aux_get. 
+ + Signed-off-by: Nils Homer + +commit 5e729da5190949a813d20d329eab7ddb661816bd +Author: Nils Homer +Date: Thu Feb 16 10:31:48 2012 -0500 + + * fixing overflow/underflow in integer parsing + +commit fa50a4330b9abedaf07c26e13d31f05e57f1d319 +Author: Nils Homer +Date: Thu Feb 16 10:30:40 2012 -0500 + + * updating help message for samtools depth + +commit 79e52c9624b6dd3bdfdf439f4b4bc6f774c230a4 +Author: Nils Homer +Date: Thu Feb 16 10:29:32 2012 -0500 + + * adding support for outputting a circos histogram file in "samtools depth". Use + the "-c/-B" options. + +commit 2daad7b52daa86561c0fb65fe366691fad9f5ed3 +Author: Heng Li +Date: Thu Feb 16 09:31:57 2012 -0500 + + bugfix: wrong SP; missing DV in the VCF hdr + +commit 9134e0d5047c281ef3bd53da91771d4814a5131c +Author: Heng Li +Date: Wed Feb 8 11:19:12 2012 -0500 + + missing support of DV + +commit 34ebf12078c1d1015a0b8b9a9221243a60b22893 +Author: Heng Li +Date: Wed Feb 8 11:08:56 2012 -0500 + + new BCF DV format: number of variant reads + +commit 9589d3312fa2d076f48bdd68e2a5edd419c8070c +Author: Heng Li +Date: Tue Jan 10 10:30:27 2012 -0500 + + scale depth to quality (hidden option) + +commit 704473e14668333ecaca5fb7b238af405c43e3b1 +Author: Heng Li +Date: Tue Jan 10 10:18:17 2012 -0500 + + really nothing + +commit 01b307fd287962372bbf07461c88b54f41636817 +Author: Heng Li +Date: Wed Dec 7 13:07:42 2011 -0500 + + added an example containing 'B' + +commit c678791f0451ceb9205c1ab5c52c84641863c99a +Author: Heng Li +Date: Sat Dec 3 12:10:30 2011 -0500 + + 'B' now moves backward w.r.t. 
the query + +commit 152119bc06a073933ca830e8e1407538e44626cc +Author: Heng Li +Date: Fri Dec 2 10:50:12 2011 -0500 + + better consensus; a little more robust + +commit 454da4754ac503edda5b1329b67757d797e46e07 +Author: Heng Li +Date: Fri Dec 2 00:20:22 2011 -0500 + + in pileup call remove_B() + +commit ff2bcac1cc078ba1879f18c89cfae314439d7086 +Author: Heng Li +Date: Fri Dec 2 00:17:32 2011 -0500 + + working on a few toy examples + +commit 745ca7260158d6df7897b52598033ffb055a9e4f +Author: Heng Li +Date: Thu Dec 1 22:55:39 2011 -0500 + + bam_remove_B(); not tested + +commit 07e4cdc7300abfcc82e03105b4689f95cab551cd +Author: Heng Li +Date: Thu Nov 10 12:58:55 2011 -0500 + + baseQ threshold on plain pipleup; removed -E + +commit 322ebf2082dfa91df44b3a996d26c85357e5d5a2 +Author: Heng Li +Date: Wed Oct 19 09:28:04 2011 -0400 + + fixed two gcc warnings + +commit a632457b4c4adc50d833b56b5a5231feafaf8193 +Author: Heng Li +Date: Tue Oct 4 10:13:23 2011 -0400 + + change size_t to uint32_t in bam_header_t + + This may cause issues on 64-bit big-endian machines. Reported and fixed by Paolo Emilio Mazzon. 
+ +commit af31bf5a78aea03baf6eb90fe50076549d499f6e +Author: Heng Li +Date: Mon Sep 26 20:17:57 2011 -0400 + + rename pad2unpad to depad + +commit 77b198b73dfad1048e5d1c5a64aa75ee7b90f596 +Author: Heng Li +Date: Fri Sep 23 01:22:40 2011 -0400 + + convert padded BAM to unpadded BAM + +commit adb9e2342b7b7501d9527d3c23afab10469ae2c6 +Author: Heng Li +Date: Wed Sep 7 11:40:50 2011 -0400 + + generate template cigar with "fixmate" + +commit 46e5ab445a0fe880216cbc0daf1225725b569d7a +Author: Heng Li +Date: Fri Sep 2 12:50:18 2011 -0400 + + update kseq.h to the latest version + +commit 68e9e4a73eb91405bb3e56bf0cdaf12d1b487abb +Author: Heng Li +Date: Fri Sep 2 12:44:45 2011 -0400 + + Release samtools-0.1.18 + +commit aa06bdadb2d109a79f927f478102f96a1f5fd258 +Author: Heng Li +Date: Fri Sep 2 12:14:17 2011 -0400 + + updated the revision number + +commit 267e1e1b6e54c0ab24f94cd9aee9cbd2d1923f9f +Merge: 19ff1d3 aebab30 +Author: Heng Li +Date: Fri Sep 2 12:13:08 2011 -0400 + + Merge https://github.com/lh3/samtools into reduce + + Conflicts: + bam_md.c + + Fixed a few typos in the merge + +commit aebab302399c24eaa6c5ab79d13d6bd5e2e9ea9a +Merge: c2c63d0 da62663 +Author: Heng Li +Date: Fri Sep 2 09:03:49 2011 -0700 + + Merge pull request #4 from peterjc/x_equals2 + + Implement basic support for =/X CIGAR operations + +commit 19ff1d3d7f47d7e61b121292aefe5a74bb8a18d2 +Author: Heng Li +Date: Thu Aug 25 16:38:12 2011 -0400 + + reduce BAM size (experimental) + +commit da626630fd98fd4e07ceb4d58c5c9a42d312a85d +Author: peterjc +Date: Mon Aug 22 06:58:08 2011 +0100 + + Support =/X CIGAR operations (treated like M) + +commit 461d8003529db77a4d5ecbd108312e868b051a3d +Author: peterjc +Date: Mon Aug 22 05:52:56 2011 +0100 + + Define CIGAR equals and X operationss (7 and 8) + +commit c2c63d067113baab41f3bc35fb28f4f00578accb +Merge: 7ab3ef3 9a0ed9a +Author: Heng Li +Date: Thu Aug 18 17:21:54 2011 -0700 + + Merge pull request #3 from peterjc/x_equals + + Accept SAM files using = in CIGAR (treats X 
and = as M) + +commit 9a0ed9a6b85c7981465f459300208dbd93e3c6f5 +Author: peterjc +Date: Thu Aug 18 19:28:52 2011 +0100 + + Accept SAM files using = in CIGAR (treats X and = as M) + +commit 7ab3ef388c1eb34d7912fd70cc5656c955240263 +Author: Heng Li +Date: Mon Aug 8 10:22:22 2011 -0400 + + bugfix: indexing takes huge memory + + This happens when an unmapped mate has coordinate 1. Thank Joel Martin for the fix. + +commit a3f6738593e944354a8f75306687d8b3acf08bf1 +Merge: a8bdca9 bc67ea2 +Author: Heng Li +Date: Mon Aug 8 09:52:26 2011 -0400 + + Merge branch 'master' of github.com:lh3/samtools + +commit bc67ea225da653f36a70b38382d6111dd494f659 +Author: Petr Danecek +Date: Thu Jul 28 20:03:16 2011 +0100 + + Variant Distance Bias + +commit deb578f0c49d0b7d8c3bc6be220b4d67e2e7dfdf +Author: Petr Danecek +Date: Tue Jul 26 09:57:37 2011 +0100 + + If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but use the tag instead. + +commit a8bdca9cf482a637b89ee4f98469a93e0ab5e69b +Author: Heng Li +Date: Mon Jul 25 10:10:55 2011 -0400 + + bugfix: LRT2=nan + +commit 0afe33137d046a3e849eeb4a54590f27cbad4228 +Author: Heng Li +Date: Fri Jul 22 21:55:38 2011 -0400 + + fixed a bug/typo + +commit 62d5849658c10222d40308c6b53ab4f99a448494 +Author: Heng Li +Date: Fri Jul 15 16:04:19 2011 -0400 + + allow to set see in subsampling + +commit 5f46243824cc9435b167973e1d51e13128794ea1 +Author: Heng Li +Date: Fri Jul 15 15:54:47 2011 -0400 + + support subsampling + +commit 5e55b6f34fc86cba7cf98d52ccaed405c3ffabbc +Author: Heng Li +Date: Fri Jul 15 15:53:38 2011 -0400 + + support indels + +commit f31c162926d6f43e8b60171789a258d02e1f9be5 +Author: Heng Li +Date: Thu Jul 7 17:02:33 2011 -0400 + + do not count indel with "view -Y" + +commit e412dae587883b4c17e5fbf4b7c33f38bfa8458a +Author: Heng Li +Date: Thu Jul 7 00:35:25 2011 -0400 + + for WIN32 compatibility + +commit 70a52501bcfa63824749893a5ab8ed3c38e34958 +Author: Heng Li +Date: Thu Jul 7 00:32:46 2011 
-0400 + + for WIN32 compatibility + +commit 00438f14ed5984f08e8f7645a9b95644a812f969 +Author: Heng Li +Date: Wed Jul 6 23:41:45 2011 -0400 + + fixed an uninitialized variable + +commit 7609c4a01059c326544b3d0142dfe9c4229d68c6 +Author: Heng Li +Date: Wed Jul 6 23:39:31 2011 -0400 + + fixed an uninitialized variable + +commit cec7189a412f80ccb068a73bd28528915c16b0bf +Author: Heng Li +Date: Wed Jul 6 22:53:19 2011 -0400 + + Release samtools-0.1.17 + +commit 93c06a249de3bb666029bf07b66de5e8e5e314fa +Author: Heng Li +Date: Wed Jul 6 09:46:09 2011 -0400 + + bugfix: incorrect idxstats for the last seq + + Again, this bug is caused by 3rd-party code for the sorting order checking. + +commit 84f6ca62db6e27b8c4c711e7b5f3ca704bf27b4f +Author: Heng Li +Date: Tue Jul 5 23:30:23 2011 -0400 + + output mapping quality in the old pileup format + +commit 362e05fd670886acaede69b864903d730b9db3ca +Author: Heng Li +Date: Tue Jul 5 21:59:22 2011 -0400 + + added a brief description of the VCF format + +commit e690a696468205e0cc4560016361c997660dd496 +Author: Heng Li +Date: Tue Jul 5 16:23:10 2011 -0400 + + improved samtools manual page + +commit 362b4a1408ef3c32311d638aa8d85ce39c1c7b2d +Author: Heng Li +Date: Tue Jul 5 15:58:29 2011 -0400 + + merge bcftools.1 to samtools.1 + +commit 643e0e61ba7266efbc9e5bfcb8e41f369ba2ce0a +Author: Heng Li +Date: Tue Jul 5 13:39:02 2011 -0400 + + mpileup: when region set, set reference properly + +commit 613e4d67624a94f62563935fbd5cc294df69605a +Author: Heng Li +Date: Mon Jul 4 23:29:02 2011 -0400 + + compute the min PL diff + +commit 5b7d5d3f52b97ca42c8500eede808dab88a46a53 +Author: Heng Li +Date: Mon Jul 4 22:57:48 2011 -0400 + + rename trio.c to mut.c + +commit 84fe96ad64b0365ead93a4115d1684b9bebb98fc +Author: Heng Li +Date: Sun Jul 3 15:38:51 2011 -0400 + + added pair caller interface; not tested + +commit 2f2867b87b84c35319cc416d6173819d5c8a4e8c +Author: Heng Li +Date: Sun Jul 3 15:24:23 2011 -0400 + + inital implementation of a pair caller + 
+commit e97653cf2ad653c95886933c42a2b5492ccab5ff +Author: Heng Li +Date: Sun Jul 3 00:06:28 2011 -0400 + + convert bam to single-end fastq + +commit e8013e11f7a8db0a8d18c60d130169cca39bf2bd +Author: Heng Li +Date: Sat Jul 2 14:39:18 2011 -0400 + + improve BED parsing + +commit 1025714325fdc636aeee47a76db8dafbbbfde64b +Author: Heng Li +Date: Fri Jul 1 14:19:54 2011 -0400 + + update the manual page + +commit 8022d0039dff47b1c11b2421357d510c1f28ae15 +Author: Heng Li +Date: Fri Jul 1 14:17:03 2011 -0400 + + output the best constrained genotypes in trio + +commit 18c87295e12f5bebafdcae00d52000fb94c8a566 +Author: Heng Li +Date: Fri Jul 1 11:18:14 2011 -0400 + + added documentations for view -T + +commit daf7a8d96bd495296bf7c7d99cddb808a3ced7d5 +Author: Heng Li +Date: Thu Jun 30 22:45:20 2011 -0400 + + fixed a bug in writing SP + +commit e5c32bf9b28c6e3e861db88de56b5dbe11058b61 +Author: Heng Li +Date: Thu Jun 30 22:35:25 2011 -0400 + + optionally output read positions in mpileup + +commit 1008051155ec994c1901e18f3eb03ea32a62e5d7 +Author: Heng Li +Date: Thu Jun 30 22:17:25 2011 -0400 + + make faidx works with <2GB lines + +commit 2daebb63762425dd3074ddf71582ad189001e394 +Author: Heng Li +Date: Thu Jun 30 17:28:58 2011 -0400 + + fixed an issue in the trio caller and the indel caller + +commit 9fdd52cf0716fb342a94946433d564b28b230835 +Author: Heng Li +Date: Thu Jun 30 13:34:01 2011 -0400 + + Added trio caller; NOT tested yet + +commit ea22a8ed83625e9c82382b56acc42a2d9cfd17e5 +Author: Heng Li +Date: Thu Jun 30 11:42:29 2011 -0400 + + convert PL to 10-likelihood GL + +commit 10d7065267b0d12c2bfcb6c70204fb6944cd395d +Author: Heng Li +Date: Thu Jun 30 10:49:05 2011 -0400 + + fix a compatibility issue with the new bcftools + +commit d340f01f609c61b719d38a6a55629a3fc899e1cd +Author: Heng Li +Date: Sun Jun 26 23:41:20 2011 -0400 + + allow to ignore RG + +commit d6321faf98ebfe899b9409fb23c90a4aa8c6b542 +Author: Heng Li +Date: Sun Jun 5 23:05:21 2011 -0400 + + fixed a bug in SO 
checking due to a recent change + +commit bc995abf666d0c9ab4258f6c1b3518a45a89209f +Author: Heng Li +Date: Fri Jun 3 14:45:36 2011 -0400 + + update the version number + +commit 9e7cd83a08383858d008e0ccb2238a2b93831d6c +Author: Heng Li +Date: Fri Jun 3 14:43:12 2011 -0400 + + smarter way to parse a region string + +commit e58a90a0fde54053dac65352b34c13c3fea815fc +Author: Heng Li +Date: Wed Jun 1 14:36:22 2011 -0400 + + output LRT2 instead of LRT1 + +commit 08f78c9af3e5661f04f80bef424232de721dba03 +Author: Heng Li +Date: Wed Jun 1 14:02:28 2011 -0400 + + genotype test, but assuming 1-degree + +commit 587b852340d7e60f6f7cf474a92ef77aeab46018 +Author: Heng Li +Date: Wed Jun 1 12:55:19 2011 -0400 + + perform 2-degree test by default + +commit 3d38e403c5c830478b7eb157a484776997440501 +Author: Heng Li +Date: Wed Jun 1 12:44:34 2011 -0400 + + fixed a typo; but the result is still not good + +commit 06291624f7dcc57445676f3be25d0bc355dd7110 +Author: Heng Li +Date: Wed Jun 1 12:24:18 2011 -0400 + + fixed a typo + +commit 63b98aa33636b0d82a435bf49153c8c1502e7d42 +Author: Heng Li +Date: Wed Jun 1 12:23:37 2011 -0400 + + added HWE+F<0 filter + +commit 37d926e8999999b593d0637ab7dc379dbd3d6006 +Author: Heng Li +Date: Wed May 4 10:11:59 2011 -0400 + + improved sorting order checking in index + + Patches from Jonathan Manning + +commit 1c2dc6762c5f7cd946046b53346513f2f9761dbf +Author: Heng Li +Date: Tue May 3 23:09:05 2011 -0400 + + added r^2 estimate; added Brent's method + +commit c2d3bcd8f98e31668b5f1321222fbc6fd6336e75 +Author: Heng Li +Date: Sun May 1 23:45:23 2011 -0400 + + combine several utilites into vcfutils.lua + +commit be2e7362d7593ea4d03fb33cdb6af2aa096ca6c4 +Author: Heng Li +Date: Wed Apr 27 21:09:22 2011 -0400 + + minor warning + +commit 683ef0443860813d743cf84fa86dda9bfaf5445a +Author: Heng Li +Date: Wed Apr 27 10:10:38 2011 -0400 + + added versioning + +commit ed72f25ec85671f7646dbc92fa7b5b1dda427f7d +Author: Heng Li +Date: Wed Apr 27 10:04:02 2011 -0400 + + Output 
ML allele count + +commit 2a9e36d2d6c405b2411ca47458f028ada8fe1000 +Author: Heng Li +Date: Tue Apr 26 16:14:20 2011 -0400 + + use ar -s + +commit 7a4f54e6dbcd7c94acbb3f1050a93f94b8a07949 +Author: Heng Li +Date: Sat Apr 23 01:22:31 2011 -0400 + + added another type of LRT + +commit b9c5e84762a4aacce3a3771b51ea80967c79a2e5 +Author: Heng Li +Date: Fri Apr 22 16:00:31 2011 -0400 + + added version + +commit 8fad6677c5952efd67391581d64e67e02e7f6e68 +Author: Heng Li +Date: Fri Apr 22 00:30:19 2011 -0400 + + remove the pileup command + +commit 3a962fb6ebf779de70f9e6effb2d8701a9aa3dd9 +Author: Heng Li +Date: Thu Apr 21 23:10:45 2011 -0400 + + Release 0.1.16 (r963:234) + +commit b4d683cffbd98c43f05aff8610b37d63dd7e54aa +Author: Heng Li +Date: Thu Apr 21 12:44:44 2011 -0400 + + fixed a bug when coordinate-less reads are on the reverse strand + +commit c5ec45a128f409debc6a56a798024f53004037dc +Author: Heng Li +Date: Wed Apr 20 11:36:52 2011 -0400 + + added option '-f' to merge to avoid overwritting + +commit 68d431531370d24907c01a27f166f2341d7c4d35 +Author: Heng Li +Date: Wed Apr 20 10:26:58 2011 -0400 + + do not print a warning + +commit 32922607e51ad2260c337eb022b9e4aedacb049f +Author: Heng Li +Date: Wed Apr 20 10:21:06 2011 -0400 + + Added ldpair to compute LD between requested pairs + +commit b8d6fa71b91678fa02338257e0707d1e5ca098dd +Author: Heng Li +Date: Sun Apr 17 21:51:43 2011 -0400 + + On a toy sample, type "B" seems to be accepted + +commit 0e7ee9a6bb4029184202aa6e6738105ba0c0510b +Author: Heng Li +Date: Sun Apr 17 21:21:20 2011 -0400 + + added type "B"; not tested yet + +commit a513dfad0ac0062b03871eb6ecf26cb8d18dc895 +Author: Heng Li +Date: Sun Apr 17 19:25:54 2011 -0400 + + fixed a bug in bedidx.c: input BED not sorted + +commit de1e192bb0a8a762a54a6eee81d882fab01c3d32 +Author: Heng Li +Date: Sun Apr 17 18:51:08 2011 -0400 + + by default, always perform posterior chi^2 + +commit df6e0d1099895fc6cd7a19dc89fba95ed6654d35 +Author: Heng Li +Date: Sat Apr 16 12:33:28 
2011 -0400 + + added debugging + +commit 8ce52e024dc2ef361dbd5399c232163055057e70 +Author: Heng Li +Date: Sat Apr 16 00:59:05 2011 -0400 + + avoid a segfault given wrong input + +commit e66b6684fc9a397f91ec29fdeecae9f8eb986a55 +Author: Heng Li +Date: Fri Apr 15 19:55:39 2011 -0400 + + do not segfault when there is no PL + +commit 9ce3c584ec0cebfa45576f2ef538df4dad2b7e55 +Author: Heng Li +Date: Fri Apr 15 11:59:55 2011 -0400 + + remove another unused part + +commit f53a051d68bf312ac8d5865210fae7a9808c0fb9 +Author: Heng Li +Date: Fri Apr 15 10:41:25 2011 -0400 + + print G3 if HWE is small + +commit 4b2c08bb86ca4ed4959e4cb77a28f7d6fc19f5c9 +Author: Heng Li +Date: Fri Apr 15 10:04:34 2011 -0400 + + fixed a bug + + actually not fix, but hide it + +commit 088e13c32453fb533b7bb1c65a573f9b90a23625 +Author: Heng Li +Date: Fri Apr 15 09:48:47 2011 -0400 + + added LRT based permutation; not used though + +commit 1e3c2001afcb80b5eaa4c3f88df9da7b01b62524 +Author: Heng Li +Date: Fri Apr 15 09:28:55 2011 -0400 + + Perform posterior contrast for small LRT + + Posterior contrast is much slower than LRT. Nonetheless, posterior P-value is + more robust to sequencing artifacts. Thus we may combine the two to achieve a + balance between speed and low FPR. + +commit 6f1b066270902198a7175ff6c1b05ebc8d1919be +Author: Heng Li +Date: Fri Apr 15 01:36:06 2011 -0400 + + Added Brent's method + +commit 3d061e5db25b67b25f6ff87afe4162e121354232 +Author: Heng Li +Date: Thu Apr 14 23:30:10 2011 -0400 + + fixed a typo in printing + +commit 7fd14ceb5990bb350b8e97346ef3537d80058def +Author: Heng Li +Date: Thu Apr 14 23:14:23 2011 -0400 + + fixed a stupid bug + +commit f5b2c3459ec098b3cafd9619b9077132516baf58 +Author: Heng Li +Date: Thu Apr 14 22:42:35 2011 -0400 + + separate EM and posterior + + Now, contrast is not performed unless -C is in use. EM can be invoked + independently with -e without computing the posterior. 
 + +commit 9eefcac963697fae554789b11ae3cb2c23f224d0 +Author: Heng Li +Date: Thu Apr 14 22:00:19 2011 -0400 + + further code cleanup; prepare to add EM interface + +commit c2cce52355262743711e4742b0c8542bfcab1cdd +Author: Heng Li +Date: Thu Apr 14 21:44:03 2011 -0400 + + drop EM from prob1 + +commit 24016f04bd3bdffb7eeb50cb25854f5007feb70f +Author: Heng Li +Date: Thu Apr 14 21:08:33 2011 -0400 + + drop posterior LRT; prepare for clean up + +commit 3670d8bd88c3eb22873f0a80e2a5913f64ca8c9a +Author: Heng Li +Date: Thu Apr 14 20:57:43 2011 -0400 + + better initial values for LD + +commit d48a8873c060b18b57799cfe3a0e5496ba069457 +Author: Heng Li +Date: Thu Apr 14 20:36:25 2011 -0400 + + finished EM + +commit b101f2db476188a950c23f5c1b6185fdb7f8f40b +Author: Heng Li +Date: Wed Apr 13 01:19:04 2011 -0400 + + genotype frequency estimate + +commit d79bdcbf6242ecfb8accba9ac9a22fbcbd543cf2 +Author: Heng Li +Date: Wed Apr 13 00:37:22 2011 -0400 + + prepare for code clean up + +commit e0ce416abfc094f0c090957080b1404fd0edf752 +Author: Heng Li +Date: Wed Apr 13 00:34:15 2011 -0400 + + rename ld.c to em.c + +commit 45ede3ad181f35c1be24bed5d75841e472357ab7 +Author: Heng Li +Date: Wed Apr 13 00:22:10 2011 -0400 + + implemented EM likelihood ratio test + + The idea is learned from a brief chat with Rasmus Nielsen. 
+ +commit 0454a346b60e42b75a2f742272089810279c7131 +Author: Heng Li +Date: Tue Apr 12 15:45:52 2011 -0400 + + added likelihood-ratio test (idea from Nick) + +commit f6287c8646c690440a1554c8958e7268f4134dc2 +Author: Heng Li +Date: Sun Apr 10 18:24:37 2011 -0400 + + Release samtools-0.1.15 (r949:203) + +commit de6023f38f4d652438557cf7a0ac6eec324e7416 +Author: Heng Li +Date: Sun Apr 10 15:54:58 2011 -0400 + + improved help information + +commit d3b337f2b7eda1e6f8f5575a19d1b5ed55cae279 +Author: Heng Li +Date: Sat Apr 9 16:28:01 2011 -0400 + + fixed a minor issue + +commit 82f6e4f49247e75fbd8ec08c285b8d3047b3d235 +Author: Heng Li +Date: Sat Apr 9 15:49:04 2011 -0400 + + separate QC-pass and QC-fail reads + +commit 8362b4a255081ee7ca0a4ca2eabc8c76758b6863 +Author: Heng Li +Date: Fri Apr 8 17:45:19 2011 -0400 + + added verbose level + +commit f7bf419c290462be7d289249a4a6d28f825b4c93 +Author: Heng Li +Date: Fri Apr 8 16:08:14 2011 -0400 + + fixed a bug + +commit 890cbb1ac93b3004fb6cf42ff47195077dcfc8ad +Author: Heng Li +Date: Fri Apr 8 16:00:37 2011 -0400 + + drop unrelated @RG when "-R" is in use + +commit a62dc929c950fb51311b705f5b5bfba8e3f704d7 +Author: Heng Li +Date: Fri Apr 8 16:00:14 2011 -0400 + + skip header validation + +commit 39da810e2c56c8f0eff1ab726600b41f26d3d8e9 +Author: Heng Li +Date: Tue Apr 5 23:52:22 2011 -0400 + + change error message + +commit c0c50a34df250ef8a7a29b172058cd229be582b5 +Author: Heng Li +Date: Tue Apr 5 23:50:46 2011 -0400 + + fixed a bug caused by recent modifications + +commit 25226e8c468404cb5e1b5272efcea57e4193c762 +Author: Heng Li +Date: Tue Apr 5 13:31:19 2011 -0400 + + reduce the indel filtering window + +commit 5e18d7014437734f9dac9ab45a95e43ec2526101 +Author: Heng Li +Date: Mon Apr 4 13:56:20 2011 -0400 + + only output hwe if it is small enough + +commit 614941fb7dd276de662e7820eb8c7bae871a18cc +Author: Heng Li +Date: Mon Apr 4 13:34:02 2011 -0400 + + added HWE back + +commit 7abe8825aa0bacccdeb38125934ae94d18f0ad4d +Author: Heng 
Li +Date: Mon Apr 4 12:46:24 2011 -0400 + + EM estimate of genotype frequency + +commit 2bfeff9c645d177416664f1cb811e85cac3ff9e3 +Author: Heng Li +Date: Mon Apr 4 11:29:12 2011 -0400 + + minor + +commit 401e40647e7e3abbac6e4ec3d8bb68eb6f2d401b +Author: Heng Li +Date: Mon Apr 4 11:24:04 2011 -0400 + + Added genotype freq estimate and association test + +commit 6cc226df6e3b480f1bd6e763ce8ef47f785bbb74 +Author: Heng Li +Date: Sun Apr 3 20:57:23 2011 -0400 + + minor changes + +commit 7e47a39630e812f09b80369f14606245976f687e +Author: Heng Li +Date: Fri Apr 1 15:21:59 2011 -0400 + + print the grayscale + +commit 2f675d9c0dde3c166c99e335fa17c7873a5ae8d5 +Author: Heng Li +Date: Fri Apr 1 08:55:16 2011 -0400 + + change to comment + +commit 0592bb514994544ed84f51e509b233cf8821e0cf +Author: Heng Li +Date: Fri Apr 1 08:54:35 2011 -0400 + + added base quality filtering + +commit fc1b47e04a7b94f6362c45856cbeb89d9d0b5ca5 +Author: Heng Li +Date: Thu Mar 31 23:31:14 2011 -0400 + + fixed a few typos in comments + +commit 60be79bc8f0d24656e5e8a329af7e9b5b91d4c8b +Author: Heng Li +Date: Thu Mar 31 23:13:23 2011 -0400 + + comments + +commit 2432864acc25ebe5cee4217dbb0120439077a7f8 +Author: Heng Li +Date: Thu Mar 31 22:42:46 2011 -0400 + + added bam2depth.c, a demo program + +commit 39625f7c6bea9ccbfd9af0feb22348d52079f012 +Author: Heng Li +Date: Thu Mar 31 16:37:22 2011 -0400 + + added bgzf_check_bgzf() (used by tabix) + +commit 6de6bd3fb67fd22753a5f07d4cc25bf94e1b5a8c +Author: Heng Li +Date: Thu Mar 31 16:37:08 2011 -0400 + + fixed a bug in bedidx.c + +commit 3b9e257d25b2e81eed1625bc5d2882ed486ef20e +Author: Heng Li +Date: Wed Mar 30 13:27:15 2011 -0400 + + added bed support to bcftools + +commit 47bcce3d14ec4d205283b61e5e653803996c42e0 +Author: Heng Li +Date: Wed Mar 30 12:56:40 2011 -0400 + + Added BED support to "samtools view" + +commit a812386017faedfc86c0e6562adbb2138329cfeb +Author: Heng Li +Date: Wed Mar 30 12:47:04 2011 -0400 + + support BED file + +commit 
3052dddc929f1825e6e7f7f6f6724d9465d6cf9a +Author: Heng Li +Date: Mon Mar 28 15:51:55 2011 -0400 + + relax RG matching; proper mismatching message + +commit f86d60c8fe25785523f01fae1486d2a6df4ee6ef +Author: Heng Li +Date: Sat Mar 26 10:38:23 2011 -0400 + + Avoid reporting association when something unexpected, which I do not understand, happens. + +commit dd41e6b26fd9fe30218748b9a0a1f49bdb1862b9 +Author: Heng Li +Date: Sat Mar 26 10:38:01 2011 -0400 + + Added -1 to merge + +commit 4a0364b0d7f87f1c88d71ec5857a1f1d40710681 +Author: Heng Li +Date: Wed Mar 23 16:56:55 2011 -0400 + + plot pairwise r^2 + +commit 452629a711582e612bec22b3b082e234bd37039b +Author: Heng Li +Date: Wed Mar 23 14:31:01 2011 -0400 + + pairwise LD; case-control AF2 + +commit 52862951adcaecde26ba8f0d9c1897944640a674 +Author: Heng Li +Date: Mon Mar 21 23:03:14 2011 -0400 + + Release samtools-0.1.14 (r933:170) + +commit 59a5a8ba8e2940f0e38238f9339f02c91a8a0ce4 +Author: Heng Li +Date: Mon Mar 21 13:52:55 2011 -0400 + + optionally skip loci with too low sample coverage + +commit 6434264b5c69514d4fafe62cbd30b3bbaddc1d41 +Author: Heng Li +Date: Sat Mar 19 14:38:25 2011 -0400 + + mpileup support Illumina1.3+ quality; skip non-variant sites when "view -v" is in use + +commit 5f59e01987e1d5eca7d6359cae64a9734b18beea +Author: Heng Li +Date: Fri Mar 18 17:19:18 2011 -0400 + + update version to r933:167 + +commit 4d2c3c950910aa3d2c87760c3532e458fe01c0fa +Author: Heng Li +Date: Fri Mar 18 16:25:01 2011 -0400 + + added "-1" to the command-line help + +commit 55313a015a7bd6369cf5a66fed7fab2333201dc9 +Author: Heng Li +Date: Fri Mar 18 16:22:12 2011 -0400 + + added the "cat" command (by Chris Saunders) + +commit b670272cadf3efa4dc456ac4c76104f73477d60d +Author: Heng Li +Date: Fri Mar 18 15:59:46 2011 -0400 + + support varying the compression level + +commit c5dd3c9ca5f75f880e52c8cd2beae983bcb8d3b1 +Author: Heng Li +Date: Wed Mar 16 14:33:45 2011 -0400 + + update the manual pages + +commit 
12fb4b596dc51bccd154fc4bd0593442f7937a46 +Author: Heng Li +Date: Wed Mar 16 12:49:26 2011 -0400 + + update changelog + +commit e7fe4fd66e02d60a1ca7952ad1938809e77729a9 +Author: Heng Li +Date: Wed Mar 16 12:10:05 2011 -0400 + + do not call indels when the depth is very high + +commit 7455eeaa32b949bb3856f75810890aabf7cacb18 +Author: Heng Li +Date: Wed Mar 16 11:56:56 2011 -0400 + + code clean up + +commit 5f16679e54ced8e67a75d949f9175c50480b914e +Author: Heng Li +Date: Tue Mar 15 14:45:24 2011 -0400 + + when -s is specified, change the sample order + +commit 7ba95adee09d3b06a7eaf797d25efef837e592f5 +Author: Heng Li +Date: Tue Mar 15 14:11:42 2011 -0400 + + compute the rank in permutation + +commit d219783cea7643fc7e10e1bd3a98e9b3165b4506 +Author: Heng Li +Date: Sun Mar 13 21:35:13 2011 -0400 + + I have found a SERIOUS BUG!!! + +commit 8e20d04ecdac1a7788eef71c4bb91b8479cf7150 +Author: Heng Li +Date: Sun Mar 13 17:04:04 2011 -0400 + + optionally shuffle samples in a BCF (debugging) + +commit fc7b261f181f2a411427bc9ee5d586c883ca9cdc +Author: Heng Li +Date: Fri Mar 11 09:34:20 2011 -0500 + + fixed a bug + +commit b3bbcc3d40994ae85705ab6fef9866ec8c142201 +Author: Heng Li +Date: Thu Mar 10 20:25:59 2011 -0500 + + use mode instead of mean + +commit f1161262d137098a19143b5cb0de810e5db3243e +Author: Heng Li +Date: Thu Mar 10 20:09:16 2011 -0500 + + start from the mean instead of the mode + +commit 2ba56f5e99e90674855c4ffc8bf583340b932e1e +Author: Heng Li +Date: Thu Mar 10 17:13:34 2011 -0500 + + fixed an error in Chi^2 test + +commit b4ce7ae400290bc43dd287240479667f99b3b11e +Author: Heng Li +Date: Thu Mar 10 00:23:39 2011 -0500 + + minor + +commit 8487fa5d3a73a43443964e731ea2a4c873c9d4e5 +Author: Heng Li +Date: Wed Mar 9 21:33:19 2011 -0500 + + added -F to accept BCFs generated by old samtools + +commit fd51d2093f7fd775a7eaaeea57fa34716ab59ac2 +Author: Heng Li +Date: Wed Mar 9 17:39:09 2011 -0500 + + update version + +commit b6da54335df943015a998a934075331b467abb5b +Author: 
Heng Li +Date: Wed Mar 9 17:37:14 2011 -0500 + + compute pseudo-chi2 probability + +commit 9f73cefdb8935421d872b989dd98fbc8e1295029 +Author: Heng Li +Date: Wed Mar 9 15:54:04 2011 -0500 + + remove a comment which is wrong + +commit b10b1e47ece522e97ab8ef23417bcb6454f8b9db +Author: Heng Li +Date: Wed Mar 9 15:51:12 2011 -0500 + + clean up + +commit 353bfae2c6ff59205bd9223db04084cf7f507f01 +Author: Heng Li +Date: Wed Mar 9 15:45:29 2011 -0500 + + for backup + +commit 53915d1c6410c2537d18bfa8eb8c657a2233c35e +Author: Heng Li +Date: Wed Mar 9 15:27:56 2011 -0500 + + having debugging code + +commit 0d0dbf66995b1511390d593981eae7b5d36fe17b +Author: Heng Li +Date: Wed Mar 9 14:58:23 2011 -0500 + + temporary backup + +commit 5b74a174a8b637dee43b7f30250df6fb96580e12 +Author: Heng Li +Date: Tue Mar 8 15:46:11 2011 -0500 + + the output makes sense, but there may be a typo... + +commit d81ec654b6c0c1eef6b0625d96f14b3155cee7c6 +Author: Heng Li +Date: Tue Mar 8 15:19:09 2011 -0500 + + added contrast2(); fixed a bug in haploid mode + +commit 0cfd896fad5f7737cca49efa94a11892dafcd812 +Author: Heng Li +Date: Mon Mar 7 21:40:17 2011 -0500 + + fixed a bug in haploid genotyping + +commit ccd52155ef61273f2b42ad9c7b31ff1915f81b24 +Author: Heng Li +Date: Sat Mar 5 18:10:35 2011 -0500 + + fixed a few bugs; still not fully working + +commit edc3af753f96f831968ae32f2e0f915b74f31e6e +Author: Heng Li +Date: Fri Mar 4 17:31:33 2011 -0500 + + drop HWE calculation + +commit 92dac194debb66ca0718c21c871822dda2dd5bc1 +Author: Heng Li +Date: Fri Mar 4 17:28:35 2011 -0500 + + implemented hap/diploid mode; probably BUGGY! 
 + +commit 7f26804bc27937e36fdc967e5c76514653ea40f5 +Author: Heng Li +Date: Fri Mar 4 16:01:27 2011 -0500 + + read ploidy + +commit e7b7213475b5e61a69aab77ffb02b4983c8e7678 +Author: Heng Li +Date: Fri Mar 4 14:12:14 2011 -0500 + + added math notes + +commit 46023e2f21321da83fc8e83e9229757a4e821acb +Author: Heng Li +Date: Fri Mar 4 13:34:10 2011 -0500 + + update BCF spec + +commit 13190c49eeb006ad7013b7f1e9fc1b3beca3ae78 +Author: Heng Li +Date: Tue Mar 1 14:45:19 2011 -0500 + + Release samtools-0.1.13 (r926:134) + +commit be8fabbb6001d9fd5263a70a3e21ed6dfe5a9837 +Author: Heng Li +Date: Tue Mar 1 14:07:15 2011 -0500 + + prepare to finalize 0.1.13 + +commit 1e8c753660978bed7e9289fe50becd596d9314bb +Author: Heng Li +Date: Tue Mar 1 09:40:17 2011 -0500 + + allow to change whether to drop ambiguous reads + +commit 412210bfdb46606023f2e4b9086f2787f0cf1c62 +Author: Heng Li +Date: Mon Feb 28 22:01:29 2011 -0500 + + revert to the old behavior of phase + +commit 46035589518cf84738de8666b866e2619457c1fb +Author: Heng Li +Date: Mon Feb 28 16:46:23 2011 -0500 + + change version number + +commit 7f40c33e37fc16fcb0a375ce46ae1d09cafb6d50 +Author: Heng Li +Date: Mon Feb 28 16:37:42 2011 -0500 + + bugfix in indel calling: integer overflow + +commit 75849470efbe30042e5ddd516f9bcbe3b9bf6062 +Author: Heng Li +Date: Mon Feb 28 15:35:47 2011 -0500 + + fixed a typo + +commit 9e6fb569885f906fabaab7fc2f02eae82f4bd602 +Author: Heng Li +Date: Mon Feb 28 15:34:09 2011 -0500 + + minor changes to heuristic rules + +commit 30a799a91f5e2c10b761aa5437f902c6649fceb3 +Author: Heng Li +Date: Mon Feb 28 15:20:26 2011 -0500 + + fixed a bug in the latest change + +commit e21ba9df950ea37f5c1b35c2af9ba9a4e0bba02a +Author: Heng Li +Date: Mon Feb 28 12:47:06 2011 -0500 + + put version in bam.h + +commit 918b14780c1dceb39c7010638ecd61c626e17166 +Author: Heng Li +Date: Mon Feb 28 12:00:38 2011 -0500 + + frag_t::phased==0 reads are dumped to chimera.bam + +commit 657293c7bdba3ac69f53cd1ffa2874ed8756475e +Author: 
Heng Li +Date: Mon Feb 28 11:05:29 2011 -0500 + + change default -q to 37 (previously 40) + +commit 33d8d3bea76e466798ea322d68d34deb8d2dff06 +Author: Heng Li +Date: Mon Feb 28 10:39:57 2011 -0500 + + fixed a minor bug in BAM reading + +commit daa25d426d42465d76c7317c95772bbb36bb3f47 +Author: Heng Li +Date: Sat Feb 26 21:07:24 2011 -0500 + + suppress gzopen64() warning + +commit 9cec4256eb9e7848d4711adb67b540659c141e32 +Author: Heng Li +Date: Fri Feb 25 22:14:52 2011 -0500 + + fixed a long existing bug in vcf2fq + +commit 304487c83067a733add71cbc3886fa8c49f7ef2a +Author: Heng Li +Date: Fri Feb 25 16:37:40 2011 -0500 + + change version number + +commit 10ba6bf4f16692760f696f7b17f3719065786f77 +Author: Heng Li +Date: Fri Feb 25 16:34:08 2011 -0500 + + Change the order of PL; change SP to int32_t + +commit c5cc2a8036a9c3579fbfde651efec4f6763b0228 +Author: Heng Li +Date: Fri Feb 25 14:52:03 2011 -0500 + + claim X defined in the header + +commit 4ee8cb29f6092fd14a89f0cc5d3575112a204f39 +Author: Heng Li +Date: Fri Feb 25 14:40:24 2011 -0500 + + minor changes + +commit 00065e9336a2831dc53bee7da2f4719845be1a2a +Author: Heng Li +Date: Fri Feb 25 11:39:06 2011 -0500 + + fixed an error in the BCF spec + +commit 1e2a73afcb72a02aa448718cb017c0438de89f90 +Author: Heng Li +Date: Fri Feb 25 11:36:40 2011 -0500 + + update BCF spec + +commit dbf8eedaa38a405cb2fba5b3952b85776f51d035 +Author: Heng Li +Date: Fri Feb 25 11:28:43 2011 -0500 + + update BCF spec + +commit eed1d91af9fad3c9d965333a55e623757f9c4e9d +Author: Heng Li +Date: Fri Feb 25 09:51:39 2011 -0500 + + fixed a flaw in targetcut + +commit 59bc980bb832b92a8b0cc244cf106e6150e4db6f +Author: Heng Li +Date: Fri Feb 25 00:54:35 2011 -0500 + + update manual page + +commit fcc4738c4abdca79e3de159e21208df1b98ac76c +Author: Heng Li +Date: Fri Feb 25 00:45:39 2011 -0500 + + update version format + +commit 5748639ae542b7f6b853562edc2bb3faf43030e4 +Author: Heng Li +Date: Fri Feb 25 00:45:12 2011 -0500 + + update version number + +commit 
06b44cc366cf27ce8976ee6a05810a0b3c48b56d +Author: Heng Li +Date: Fri Feb 25 00:44:21 2011 -0500 + + update version number + +commit ab7f4529d12739ff66fd4c09af9d992ab59c53ef +Author: Heng Li +Date: Fri Feb 25 00:42:55 2011 -0500 + + various help message + +commit a092e1f6f963272f8bb23616986ddaf604fd0f82 +Author: Heng Li +Date: Thu Feb 24 23:43:13 2011 -0500 + + disable unfinished functionality + +commit f00a78db72b14ee4c6689fc13f20ed31aeaecd40 +Author: Heng Li +Date: Thu Feb 24 10:04:56 2011 -0500 + + added "const" to bcf_p1_cal() + +commit 91049c4a8db3bf50dcc9d07506f22fa4ca5b5a96 +Author: Heng Li +Date: Wed Feb 23 11:53:47 2011 -0500 + + randomly allocate unphased reads + +commit f4405354a8d4cb3441141fa734573031059d7f57 +Author: Heng Li +Date: Tue Feb 22 15:36:07 2011 -0500 + + fixed a typo + +commit 3075e4dc5c7c9d954426aabda6a73fa788357100 +Author: Heng Li +Date: Tue Feb 22 15:33:40 2011 -0500 + + make output more informative + +commit 628cf3235e2815a40acf089fb1d3357be6437787 +Author: Heng Li +Date: Tue Feb 22 14:50:06 2011 -0500 + + change the scoring rule; change default k to 13 + +commit f22fd99831e4b5c74f898719216f359dbe987bbf +Author: Heng Li +Date: Tue Feb 22 14:45:15 2011 -0500 + + update scoring in masking + +commit 2f23547b81984555032aa0eefd064b8e07986fdc +Author: Heng Li +Date: Tue Feb 22 14:37:17 2011 -0500 + + remove dropreg() + +commit 4d8b6b1f1f331ca9041983c66e34a857c3b8f1bb +Author: Heng Li +Date: Tue Feb 22 13:10:16 2011 -0500 + + accept files from stdin + +commit 9b50c5038e6fc0185e29ca5b50fe0806a9a939b9 +Author: Heng Li +Date: Tue Feb 22 11:16:57 2011 -0500 + + fixed a bug in consensus generation + +commit 1332ab32fb788fdc81b2ba8653b905d106238fad +Author: Heng Li +Date: Mon Feb 21 22:53:23 2011 -0500 + + print dropped fragments + +commit a288761b4ca1584e51076a71cbc4d72fe923dda1 +Author: Heng Li +Date: Mon Feb 21 22:37:04 2011 -0500 + + bugfix: singletons are not phased + +commit 683365f534c0223dea7d72532015ac16a45ba22b +Author: Heng Li +Date: Mon 
Feb 21 17:27:10 2011 -0500 + + output singleton blocks + +commit 841a4609084d81f1bc81e0b00dd806002461e7d9 +Author: Heng Li +Date: Mon Feb 21 15:58:55 2011 -0500 + + fixed a bug; not working with -l right now + +commit fdd57ea31732b5516dc212d72174b60206952636 +Author: Heng Li +Date: Mon Feb 21 15:17:00 2011 -0500 + + skip mapQ==0 reads + +commit 4eb6ba75c23c1c9be5f76814fa1b93a2e304b2af +Author: Heng Li +Date: Mon Feb 21 14:03:03 2011 -0500 + + print the "targetcut" command + +commit 0123d9559ba58b026c0dfd15bc26019a193cd21a +Author: Heng Li +Date: Mon Feb 21 11:22:13 2011 -0500 + + allow to set the maximum depth + +commit 0f92eb248a4d06645b2c3d736a0faea8a7a9f731 +Author: Heng Li +Date: Mon Feb 21 09:56:41 2011 -0500 + + use a proper error model to call hets + +commit 587a01504af5aea6288740d121dccf48fb8a75f4 +Author: Heng Li +Date: Mon Feb 21 09:16:38 2011 -0500 + + phase is UNFINISHED; strip RG when merging + +commit 723bf3cd79e4f4a558373d4c707fa6b3db0fb357 +Author: Heng Li +Date: Sat Feb 19 23:38:11 2011 -0500 + + use a proper model to compute consensus + +commit 891a6b02d4a9af2ed98fbaac4915bf1f0da4f6c8 +Author: Heng Li +Date: Sat Feb 19 22:14:19 2011 -0500 + + added comment + +commit 8b55e0a581ecc9e4ba754d1f3c8784f3038b6e48 +Author: Heng Li +Date: Fri Feb 18 17:23:39 2011 -0500 + + change the output format + +commit 75c36e8c563eddd0a362ba3b38cf0aea21aafb1f +Author: Heng Li +Date: Tue Feb 15 20:31:00 2011 -0500 + + fixed a bug in writing BAM + +commit bb0ce52f066cfebaa35a125d57b353bb717a5165 +Author: Heng Li +Date: Mon Feb 14 23:39:09 2011 -0500 + + skip uncovered; unknown alleles taken as X + +commit ba67f4d119c7d06907db3015d337d9a01a3fc9fe +Author: Heng Li +Date: Mon Feb 14 23:21:19 2011 -0500 + + fixed a bug + +commit e4448d49e6129a5e1ee9c7f04f43612f12d6aad6 +Author: Heng Li +Date: Mon Feb 14 22:43:09 2011 -0500 + + prepare to read hets from a list; unfinished + +commit 129ea29c1f12177c0a7c3e21676f6210370fc59b +Author: Heng Li +Date: Mon Feb 14 16:32:22 2011 
-0500 + + updated khash.h to 0.2.5 + +commit 15b44ed93bd949dffcf79ac8dbea6d9b7dfcb58c +Author: Heng Li +Date: Mon Feb 14 16:15:04 2011 -0500 + + use the latest version of khash + +commit 486c05f06f44d981dfb2069bcb43e4b35fd8389c +Author: Heng Li +Date: Mon Feb 14 15:04:40 2011 -0500 + + change the default -k to 11 + +commit 07cf9d1e443d73cf053de38dd01671e3781f6e29 +Author: Heng Li +Date: Mon Feb 14 14:50:51 2011 -0500 + + sort fragments by vpos instead of by beg + +commit d0d3e7faabf5cbb7e5ff7b294f7e220da807c4c0 +Author: Heng Li +Date: Mon Feb 14 14:45:41 2011 -0500 + + shuffling the two haplotypes for better randomness + +commit 3be28eaf5f6033229aedf12ddb11a0084ba01cd8 +Author: Heng Li +Date: Mon Feb 14 14:09:17 2011 -0500 + + write chimeras to a separate BAM + +commit 80ccbc26f43918fe42be123cc1da9d3d7ce30816 +Author: Heng Li +Date: Mon Feb 14 13:54:13 2011 -0500 + + no mem leak/violation on small files; correctness is not checked + +commit 5c923867432fa14c26a19e3782e7f48d4080f6ac +Author: Heng Li +Date: Mon Feb 14 13:50:25 2011 -0500 + + bam separation; at least not immediate segfault + +commit cea2643ec30a59735bf89b2f562b563bf7263e79 +Author: Heng Li +Date: Sun Feb 13 23:24:11 2011 -0500 + + on the way to implement BAM separation; unfinished + +commit 964269cd15036a470ca89e43d0952201a0825671 +Author: Heng Li +Date: Sun Feb 13 18:07:56 2011 -0500 + + keep singletons in the hash table + +commit 2d4aa649bd670d5e038a1acaefd33c5fe24ae0e8 +Author: Heng Li +Date: Sun Feb 13 17:42:24 2011 -0500 + + Revert "prepare to add bam separation" + + This reverts commit ed6957e5211c2c4cf684dcb8bbb661052c74df6f. 
 + +commit ed6957e5211c2c4cf684dcb8bbb661052c74df6f +Author: Heng Li +Date: Sun Feb 13 00:24:28 2011 -0500 + + prepare to add bam separation + +commit d211e652d93791d2e112d334added243ffe5fc3e +Author: Heng Li +Date: Sat Feb 12 18:50:20 2011 -0500 + + accelerate kstrtok + +commit 2d6af49d331ff5afe7b9e9b102e79d7d4512fdbe +Author: Heng Li +Date: Fri Feb 11 21:08:21 2011 -0500 + + split unlinked blocks + +commit 68e4cd1b560b0a6fd4c77e5e51eadde9fda26ea4 +Author: Heng Li +Date: Fri Feb 11 10:47:58 2011 -0500 + + remove leading and trailing ambiguous positions + +commit d2b685141426a902ae76660c1fbe8020da150cf8 +Author: Heng Li +Date: Fri Feb 11 10:02:21 2011 -0500 + + code clean up for further features + +commit c6980e062d55928b59f287c03e599dd5a37ed509 +Author: Heng Li +Date: Fri Feb 11 08:00:08 2011 -0500 + + change /64 to >>6; the latter is faster + +commit 91635b9c2687f24d72ee6a8aad2050a79bb8400f +Merge: 41d4df2 9a7e155 +Author: Heng Li +Date: Fri Feb 11 01:22:55 2011 -0500 + + Merge branch 'master' into devel + +commit 9a7e155cc591c1b6c9f7f9cb939364a6becb65b2 +Author: Heng Li +Date: Fri Feb 11 01:21:07 2011 -0500 + + output an unrecognized field as '.'; autofix GL/PL + +commit 41d4df2e9545e9abe97151cfe5d6c763f3d00db1 +Merge: c00c41c aacce0c +Author: Heng Li +Date: Thu Feb 10 23:00:14 2011 -0500 + + Merge branch 'master' into devel + +commit aacce0ce7276f451e4fddf81832f9e5f7f65198b +Author: Heng Li +Date: Thu Feb 10 22:57:53 2011 -0500 + + finished VCF->BCF conversion + +commit 0e875df643e41d848b709e2fa877de8ae53cdd4c +Author: Heng Li +Date: Thu Feb 10 21:57:28 2011 -0500 + + fixed a bug in reading VCF files + +commit c00c41c2a5da69cccea64adb542a0b365e56b4fc +Author: Heng Li +Date: Thu Feb 10 16:28:37 2011 -0500 + + suppress one-allele blocks + +commit 2e2354b673722e2f00d72970a043f80a66270da1 +Author: Heng Li +Date: Thu Feb 10 16:06:56 2011 -0500 + + fixed the bug in filtering + +commit d971e1fe24de4ecaf94055efffc5f641e2bdb563 +Author: Heng Li +Date: Thu Feb 10 12:24:23 
2011 -0500 + + prepare to add filtering; buggy right now + +commit a0a5a3fbf504c3b02f7b9212e72315c1047cc249 +Author: Heng Li +Date: Thu Feb 10 11:55:02 2011 -0500 + + make masking optional + +commit 28db71ccd95054a5f8a47c2332794f8968f6a822 +Author: Heng Li +Date: Thu Feb 10 11:40:47 2011 -0500 + + routine to mask poorly called regions + +commit a3f6c439262bc10a4067860440f4d4dde9e0c515 +Author: Heng Li +Date: Wed Feb 9 17:18:33 2011 -0500 + + code clean up: remove globals + +commit 0b711978492f6ad39d459d78723c299468906818 +Author: Heng Li +Date: Wed Feb 9 16:52:54 2011 -0500 + + output more information + +commit f69d217ae5b691bf42ad07a97f29a7cc6456046f +Author: Heng Li +Date: Wed Feb 9 16:11:54 2011 -0500 + + fixed another bug in flipping + +commit d47882d549337fbcc251597508a2c7faf1bb92e2 +Author: Heng Li +Date: Wed Feb 9 16:01:35 2011 -0500 + + fixed a stupid bug in flipping + +commit e33f89de499496537f5fbde396a66557f0353f1b +Author: Heng Li +Date: Wed Feb 9 15:54:42 2011 -0500 + + fix chimeras; a little weird... + +commit 03d3c1d0b945245108ce0942d4772536a32212c7 +Author: Heng Li +Date: Wed Feb 9 13:27:35 2011 -0500 + + no effective change; prepare to fix chimera + +commit 6bc0a4676dd2252085a6e67bb06daa5ae05a554f +Author: Heng Li +Date: Wed Feb 9 11:52:58 2011 -0500 + + better count output + +commit dcac515439d25f71125d6de8111da417776ab9ce +Author: Heng Li +Date: Wed Feb 9 10:31:07 2011 -0500 + + prepare for another way of filtering + +commit ca7e4f1899b86d2e077994c789e8f69d699b3cd9 +Author: Heng Li +Date: Tue Feb 8 16:10:08 2011 -0500 + + fixed the bug; I can do better. + +commit 0733f77b98af121bdcb198cea6151d159831bb9c +Author: Heng Li +Date: Tue Feb 8 15:55:38 2011 -0500 + + fixed two bugs; still not working... 
+ +commit 80f18cba9ba73c9592380fc1ecd53c351d294782 +Author: Heng Li +Date: Tue Feb 8 15:42:58 2011 -0500 + + filter false SNPs; NOT working right now + +commit 69a66e2f96d5b102cd712ff1527a3802fa84c590 +Author: Heng Li +Date: Tue Feb 8 14:39:09 2011 -0500 + + write sequence in the SAM format for debugging + +commit b6f1c9d160822af2b713be206f37bd6dde00546a +Author: Heng Li +Date: Mon Feb 7 11:51:21 2011 -0500 + + fixed two bugs + +commit 400aa5c06100af9c47cd5e4ce8b95b7deb84f54b +Author: Heng Li +Date: Mon Feb 7 11:22:38 2011 -0500 + + Optionally apply BAQ + +commit 4c82e0e19682e424f5cdb8381364114c307b329e +Author: Heng Li +Date: Mon Feb 7 01:23:31 2011 -0500 + + improved output; the result makes sense at a glance + +commit dc7853a581ab24bcc496e96b123ccf637e32ed1d +Author: Heng Li +Date: Sun Feb 6 14:12:43 2011 -0500 + + process per linked block instead of per chr + +commit e867d9c6c2e61d9e748e78163e5481dca5697a36 +Author: Heng Li +Date: Sun Feb 6 00:45:46 2011 -0500 + + DP seems to work on toy examples + +commit 445ad72fc43d4354d56f5f759790e8ae0be73d02 +Author: Heng Li +Date: Sat Feb 5 01:24:42 2011 -0500 + + implemented backtrack; not tested + +commit ba38e180b9cd545956583b22e97e09b4bb12073e +Author: Heng Li +Date: Fri Feb 4 23:55:23 2011 -0500 + + More "correct" DP; backtrack not implemented + +commit d69761fd9351273ccd37ea431b10509add91e7cf +Author: Heng Li +Date: Fri Feb 4 17:22:31 2011 -0500 + + scratch of dynamic programming; unfinished... + +commit 769ffcb44e26e59300791658801d321559b33858 +Author: Heng Li +Date: Fri Feb 4 16:29:55 2011 -0500 + + UNFINISHED commit. 
+ +commit 9adab9591317c3467f3d8cdf2d19ec1f65d1b5b7 +Author: Heng Li +Date: Thu Feb 3 16:20:59 2011 -0500 + + another way of counting; can be even faster + +commit bbafbdc01ed1ceaab44927def1ad47c4c78aeb9c +Author: Heng Li +Date: Thu Feb 3 14:48:20 2011 -0500 + + for backup + +commit eba7446389cad62a19133bced1386a4334dcab79 +Merge: a44a98e f01a593 +Author: Heng Li +Date: Wed Feb 2 14:06:07 2011 -0500 + + Merge branch 'master' into devel + +commit f01a5930445b5fda7e6b5b813ed63c652160ada2 +Author: Heng Li +Date: Wed Feb 2 11:31:54 2011 -0500 + + Better truncation warning when EOF is absent + +commit dd3ee5ed26c8bbef4a62fa5b2bfb0a75833f2c31 +Author: Heng Li +Date: Wed Feb 2 10:38:28 2011 -0500 + + fixed a typo in BCF/VCF headers + +commit b9d1137c55f401387113d1ad8a387489afe741db +Author: Heng Li +Date: Wed Feb 2 09:13:44 2011 -0500 + + fixed an out-of-boundary bug (fixed by Roel Kluin) + +commit a44a98e16559b9672e8a3492c8f8c640074b7ee2 +Merge: ef68a14 d0443d5 +Author: Heng Li +Date: Tue Feb 1 21:54:48 2011 -0500 + + Merge branch 'master' into devel + +commit d0443d5c2f648e0f69bd4c56eaac7868e501c18b +Author: Heng Li +Date: Tue Feb 1 17:31:52 2011 -0500 + + improved sorting order checking + +commit ef68a14fab91399b2ecd38345936c3d6e7391cf3 +Merge: 1e597b3 1a39a2e +Author: Heng Li +Date: Tue Feb 1 15:12:37 2011 -0500 + + Merge branch 'master' into devel + +commit 1a39a2eb08a270e20a34a0983e8bed6ffb3e2008 +Author: Heng Li +Date: Tue Feb 1 15:12:14 2011 -0500 + + more precise error message + +commit e028e7a47c02232e06a9dd3009262c00dede1060 +Author: Heng Li +Date: Tue Feb 1 14:48:01 2011 -0500 + + improved sorting order validation in index + +commit 1e597b3356744e2b791b12c9187f91c8054511d5 +Author: Heng Li +Date: Tue Feb 1 14:44:27 2011 -0500 + + testing only; not working + +commit 5753ace1e54228822d8ee95f69943f586e42f6e8 +Author: Heng Li +Date: Mon Jan 31 17:37:08 2011 -0500 + + reduce the effect of seq errors at the cost of SN + +commit 
6f239ce5e0abd47babee33174476d48b723260d8 +Author: Heng Li +Date: Mon Jan 31 17:29:34 2011 -0500 + + added testing code + +commit 3db42fe22d27d61ab5735cd2308f73d93def8ebe +Author: Heng Li +Date: Mon Jan 31 14:33:21 2011 -0500 + + routine for phasing fosmid resequencing (incomplete) + +commit ed88f2797323229ae8f38fbcd107b231007956a8 +Author: Heng Li +Date: Mon Jan 31 10:12:53 2011 -0500 + + SAM output + +commit abc6acae28dc4794f6422255f077cf370d34e414 +Merge: f1985a9 b133dbf +Author: Heng Li +Date: Sat Jan 29 22:56:10 2011 -0500 + + Merge branch 'master' into devel + +commit b133dbf82de4e8cea5eb56e5bbf0c4b3e9368fd5 +Author: Heng Li +Date: Sat Jan 29 22:37:11 2011 -0500 + + fixed a bug in tview on big-endian by Nathan Weeks + +commit 9d3fdaef29f91e21dbfcb9ff0165b9573e7c1042 +Author: Heng Li +Date: Sat Jan 29 22:24:00 2011 -0500 + + update INSTALL + +commit 9d074a38bde53961f96157b6fb3683b6dded38d7 +Author: Heng Li +Date: Sat Jan 29 21:56:25 2011 -0500 + + avoid a segfault when network connect fails + +commit f1985a93f7455b3ea1b0ef9b959d50b896ccd620 +Author: Heng Li +Date: Sat Jan 29 21:53:18 2011 -0500 + + fixed a bug about bit ordering + +commit d09797db6fef648a6823cbe718d67664660c6ebe +Author: Heng Li +Date: Thu Jan 27 16:53:19 2011 -0500 + + point out there are 4 or fewer free parameters + +commit 5fd1717650ed68ab6c55d094d1648c16a054891a +Author: Heng Li +Date: Thu Jan 27 16:09:18 2011 -0500 + + updated .gitignore + +commit fccb19fbe8f9de91f59d85bb49a248683dc6266c +Author: Heng Li +Date: Thu Jan 27 16:08:14 2011 -0500 + + fixed a bug; better scoring + +commit b4dcb844bde3d09eedcd9f6832186ece60ae5afd +Merge: ffc3e89 6f502de +Author: Heng Li +Date: Thu Jan 27 14:50:30 2011 -0500 + + Merge branch 'master' into devel + +commit 6f502dec46b18dae4bb5b2319715d028b5e193d0 +Author: Heng Li +Date: Thu Jan 27 14:47:31 2011 -0500 + + skip unmapped and ref-skip reads in indel calling + +commit 3639f37dd8257b24560c35effcc3b6c16c3c1bcb +Author: Heng Li +Date: Thu Jan 27 14:19:15 
2011 -0500 + + fixed an out-of-boundary bug in rare cases + +commit ffc3e89678ab9052b84f403da1e43044b045e73f +Author: Heng Li +Date: Thu Jan 27 14:00:17 2011 -0500 + + targetcut can be compiled, though probably buggy + +commit f452b3ac51306865ddde31a8d715b155d4d3e6e6 +Author: Heng Li +Date: Wed Jan 26 18:58:43 2011 -0500 + + this is for a very special application... + +commit ca1451c6406c7ee757cb31349ea0b8de70db0656 +Author: Heng Li +Date: Wed Jan 26 18:48:09 2011 -0500 + + fixed compiling errors + +commit 085b87a7642865f17239fb6a436e626e25417838 +Author: Heng Li +Date: Wed Jan 26 18:45:09 2011 -0500 + + This script was put in a wrong place... + +commit 090d360828622520de60385af4928ce1aebe0e48 +Author: Heng Li +Date: Wed Jan 26 18:33:58 2011 -0500 + + Imported from samtools-r902 +------------------------------------------------------------------------ +r108 | lh3lh3 | 2009-01-20 11:56:45 +0000 (Tue, 20 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/Makefile + +made it a little more convenient + +------------------------------------------------------------------------ +r107 | lh3lh3 | 2009-01-20 11:53:30 +0000 (Tue, 20 Jan 2009) | 2 lines +Changed paths: + A /branches/dev/samtools/examples/Makefile + +added a Makefile + +------------------------------------------------------------------------ +r106 | lh3lh3 | 2009-01-20 11:25:05 +0000 (Tue, 20 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/maq2sam.c + +support RG tag + +------------------------------------------------------------------------ +r105 | lh3lh3 | 2009-01-18 17:37:20 +0000 (Sun, 18 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + +update changelog + +------------------------------------------------------------------------ +r104 | lh3lh3 | 2009-01-18 17:31:21 +0000 (Sun, 18 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_lpileup.c + M /branches/dev/samtools/bamtk.c + + * 
samtools-0.1.1-18 + * fixed a bug in bam_lpileup.c: segment start and end are not correctly recognized + +------------------------------------------------------------------------ +r103 | lh3lh3 | 2009-01-18 16:34:03 +0000 (Sun, 18 Jan 2009) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-17 + * fixed a bug when there are reads without coordinates + * also recognize type 'c' as 'A' + * found a bug in bam_lpileup.c; NOT fixed yet + +------------------------------------------------------------------------ +r102 | lh3lh3 | 2009-01-17 19:46:49 +0000 (Sat, 17 Jan 2009) | 2 lines +Changed paths: + A /branches/dev/samtools/INSTALL + +Instruction for compilation + +------------------------------------------------------------------------ +r101 | lh3lh3 | 2009-01-17 19:31:36 +0000 (Sat, 17 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + A /branches/dev/samtools/Makefile.lite + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/faidx.c + M /branches/dev/samtools/misc/Makefile + M /branches/dev/samtools/razf.c + + * replaced HAVE_RAZF with _NO_RAZF + * added Makefile.lite for people who have trouble with razf.c + +------------------------------------------------------------------------ +r100 | lh3lh3 | 2009-01-16 10:03:37 +0000 (Fri, 16 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/misc/wgsim.c + + * samtools-0.1.1-15 + * fixed another bug in fixmate: unmapped pair has non-zero isize + +------------------------------------------------------------------------ +r99 | lh3lh3 | 2009-01-16 09:13:36 +0000 (Fri, 16 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-14 + * fixed a bug in fixmate: isize not equal to zero if two ends mapped 
to + different chr + +------------------------------------------------------------------------ +r98 | lh3lh3 | 2009-01-15 16:47:41 +0000 (Thu, 15 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-13 + * fixed the prior for hom indels (Richard pointed this out) + +------------------------------------------------------------------------ +r97 | lh3lh3 | 2009-01-15 16:38:47 +0000 (Thu, 15 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/COPYING + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/source.dot + + * samtools-0.1.1-12 + * fixed a bug in sort + * update source file graph and copyright information + +------------------------------------------------------------------------ +r96 | lh3lh3 | 2009-01-14 21:46:14 +0000 (Wed, 14 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/glf.c + +fixed a typo + +------------------------------------------------------------------------ +r95 | lh3lh3 | 2009-01-14 21:44:53 +0000 (Wed, 14 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/glf.c + +added a main function for glf.c + +------------------------------------------------------------------------ +r94 | lh3lh3 | 2009-01-14 17:14:59 +0000 (Wed, 14 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.h + A /branches/dev/samtools/glf.c + M /branches/dev/samtools/glf.h + + * samtools-0.1.1-11 + * generate binary GLFv2 + * added glfview command to dump GLFv2 binary file + +------------------------------------------------------------------------ +r93 | lh3lh3 | 2009-01-14 15:07:44 +0000 (Wed, 14 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_rmdup.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/glf.h + + * samtools-0.1.1-10 + * fixed several bugs in rmdup 
+ * prepare to generate GLF2 + +------------------------------------------------------------------------ +r92 | lh3lh3 | 2009-01-14 13:27:44 +0000 (Wed, 14 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_import.c + A /branches/dev/samtools/bam_rmdup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-9 + * implemented rmdup; NOT tested yet + +------------------------------------------------------------------------ +r91 | lh3lh3 | 2009-01-13 20:15:43 +0000 (Tue, 13 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/00README.txt + +update README for typos + +------------------------------------------------------------------------ +r90 | lh3lh3 | 2009-01-13 19:57:50 +0000 (Tue, 13 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/ex1.sam.gz + +update example + +------------------------------------------------------------------------ +r89 | lh3lh3 | 2009-01-13 17:21:38 +0000 (Tue, 13 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.c + A /branches/dev/samtools/bam_mate.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-8 + * added fixmate command + +------------------------------------------------------------------------ +r88 | lh3lh3 | 2009-01-13 10:48:23 +0000 (Tue, 13 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-7 + * change the reported indel position to the previous way + +------------------------------------------------------------------------ +r87 | lh3lh3 | 2009-01-12 22:12:12 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-6 + * addd glt output + * allow to change indel calling parameters at the command line + +------------------------------------------------------------------------ 
+r86 | lh3lh3 | 2009-01-12 21:16:48 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-5 + * added two more flags + * allowed to select reads shown in pileup with a mask + +------------------------------------------------------------------------ +r85 | lh3lh3 | 2009-01-12 20:47:51 +0000 (Mon, 12 Jan 2009) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-4 + * fixed a bug in indexing (linear index) + * prepare to add glt output from pileup + +------------------------------------------------------------------------ +r84 | lh3lh3 | 2009-01-12 09:22:35 +0000 (Mon, 12 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-3 + * fixed a bug in outputing the coordinate of an indel + +------------------------------------------------------------------------ +r83 | lh3lh3 | 2009-01-11 15:18:01 +0000 (Sun, 11 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-2 + * pileup: allows to output indel sites only + +------------------------------------------------------------------------ +r82 | lh3lh3 | 2009-01-10 23:34:31 +0000 (Sat, 10 Jan 2009) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.1-1 + * implemented a Bayesian indel caller + +------------------------------------------------------------------------ +r81 | lh3lh3 | 2009-01-09 09:54:28 +0000 (Fri, 09 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/examples/00README.txt + D /branches/dev/samtools/examples/ex1.fa.fai + +Let users 
generate ex1.fa.fai. + +------------------------------------------------------------------------ +r80 | lh3lh3 | 2009-01-08 16:10:08 +0000 (Thu, 08 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/bowtie2sam.pl + +make the bowtie converter works for "-k 2" + +------------------------------------------------------------------------ +r78 | lh3lh3 | 2009-01-03 17:25:24 +0000 (Sat, 03 Jan 2009) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/export2sam.pl + +fixed a bug for "QC" reads + +------------------------------------------------------------------------ +r77 | lh3lh3 | 2009-01-01 18:32:06 +0000 (Thu, 01 Jan 2009) | 3 lines +Changed paths: + A /branches/dev/samtools/misc/bowtie2sam.pl + M /branches/dev/samtools/misc/soap2sam.pl + + * soap2sam.pl: added NM tag + * bowtie2sam.pl: converter for bowtie + +------------------------------------------------------------------------ +r76 | lh3lh3 | 2008-12-31 23:24:24 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/soap2sam.pl + +soap2sam.pl: convert soap output to SAM + +------------------------------------------------------------------------ +r75 | lh3lh3 | 2008-12-31 17:54:32 +0000 (Wed, 31 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/misc/wgsim_eval.pl + + * wgsim_eval.pl-0.1.1 + * fixed a bug for a contig name like "NT_012345" + +------------------------------------------------------------------------ +r74 | lh3lh3 | 2008-12-31 16:38:21 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/wgsim_eval.pl + + * evaluate alignment for reads generated by wgsim + +------------------------------------------------------------------------ +r73 | lh3lh3 | 2008-12-31 15:11:22 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/Makefile + M /branches/dev/samtools/misc/wgsim.c + +fixed compiling warnings for wgsim + +------------------------------------------------------------------------ 
+r72 | lh3lh3 | 2008-12-31 13:40:51 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_tview.c + +remove an unused variable (a compiler warning only) + +------------------------------------------------------------------------ +r71 | lh3lh3 | 2008-12-31 13:37:16 +0000 (Wed, 31 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/wgsim.c + +wgsim: Paired-end reads simulator + +------------------------------------------------------------------------ +r70 | bhandsaker | 2008-12-29 20:27:16 +0000 (Mon, 29 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_tview.c + +Move definition of bam_nt16_nt4_table so we can build without curses. + +------------------------------------------------------------------------ +r62 | lh3lh3 | 2008-12-22 15:55:13 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/NEWS + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + +Release samtools-0.1.1 + +------------------------------------------------------------------------ +r61 | lh3lh3 | 2008-12-22 15:46:08 +0000 (Mon, 22 Dec 2008) | 10 lines +Changed paths: + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-66 + * fixed a bug in razf.c: reset z_eof when razf_seek() is called + * fixed a memory leak in parsing a region + * changed pileup a little bit when -s is in use: output ^ and $ + * when a bam is not indexed, output more meaningful error message + * fixed a bug in indexing for small alignment + * fixed a bug in the viewer when we come to the end of a reference file + * updated documentation + * prepare to release 0.1.1 + 
+------------------------------------------------------------------------ +r60 | lh3lh3 | 2008-12-22 15:10:16 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/examples + A /branches/dev/samtools/examples/00README.txt + A /branches/dev/samtools/examples/ex1.fa + A /branches/dev/samtools/examples/ex1.fa.fai + A /branches/dev/samtools/examples/ex1.sam.gz + +example + +------------------------------------------------------------------------ +r59 | lh3lh3 | 2008-12-22 09:38:15 +0000 (Mon, 22 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + +update ChangeLog + +------------------------------------------------------------------------ +r58 | lh3lh3 | 2008-12-20 23:06:00 +0000 (Sat, 20 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/misc/export2sam.pl + + * added comments + * fixed several bugs + +------------------------------------------------------------------------ +r57 | lh3lh3 | 2008-12-20 15:44:20 +0000 (Sat, 20 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/misc/export2sam.pl + +convert Export format to SAM; not thoroughly tested + +------------------------------------------------------------------------ +r56 | lh3lh3 | 2008-12-19 22:13:28 +0000 (Fri, 19 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/source.dot + + * samtools-0.1.0-65 + * pileup: generate maq-like simple output + * pileup: allow to output pileup at required sites + * source.dot: source file relationship graph + * tview: fixed a minor bug + +------------------------------------------------------------------------ +r55 | lh3lh3 | 2008-12-19 20:10:26 +0000 (Fri, 19 Dec 2008) | 2 lines +Changed paths: + D /branches/dev/samtools/misc/all2sam.pl + +remove all2sam.pl + +------------------------------------------------------------------------ +r54 | lh3lh3 
| 2008-12-16 22:34:25 +0000 (Tue, 16 Dec 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/COPYING + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/faidx.h + M /branches/dev/samtools/khash.h + M /branches/dev/samtools/kseq.h + M /branches/dev/samtools/ksort.h + M /branches/dev/samtools/samtools.1 + +Added copyright information and a bit more documentation. No code change. + +------------------------------------------------------------------------ +r53 | lh3lh3 | 2008-12-16 13:40:18 +0000 (Tue, 16 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-64 + * improved efficiency of the indel caller for spliced alignments + +------------------------------------------------------------------------ +r52 | lh3lh3 | 2008-12-16 10:28:20 +0000 (Tue, 16 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-63 + * a bit code cleanup: reduce the dependency between source files + +------------------------------------------------------------------------ +r51 | lh3lh3 | 2008-12-15 14:29:32 +0000 (Mon, 15 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-62 + * fixed a memory leak + +------------------------------------------------------------------------ +r50 | lh3lh3 | 2008-12-15 14:00:13 +0000 (Mon, 15 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/samtools.1 + +update documentation, ChangeLog and a comment + +------------------------------------------------------------------------ +r49 | lh3lh3 | 
2008-12-15 13:36:43 +0000 (Mon, 15 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_pileup.c + A /branches/dev/samtools/bam_plcmd.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-61 + * moved pileup command to a separate source file + * added indel caller + * added bam_cal_segend(). (NOT WORKING for spliced alignment!!!) + * updated documentation + +------------------------------------------------------------------------ +r48 | lh3lh3 | 2008-12-12 13:55:36 +0000 (Fri, 12 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-60 + * fixed another bug in maqcns when there is a nearby deletion + +------------------------------------------------------------------------ +r47 | lh3lh3 | 2008-12-12 13:42:16 +0000 (Fri, 12 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-59 + * pileup: outputing consensus is now optional + * fixed a bug in glfgen. This bug also exists in maq's glfgen. However, + I am not quite sure why the previous version may have problem. + +------------------------------------------------------------------------ +r46 | lh3lh3 | 2008-12-12 11:44:56 +0000 (Fri, 12 Dec 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-58 + * add maq consensus to pileup. However, I will move this part to a new + command as strictly speaking, consensus callin is not part of pileup, + and imposing it would make it harder to generate for other language + bindings. 
+ +------------------------------------------------------------------------ +r45 | bhandsaker | 2008-12-11 20:43:56 +0000 (Thu, 11 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bgzf.c + +Fix bug in tell() after reads that consume to the exact end of a block. + +------------------------------------------------------------------------ +r44 | lh3lh3 | 2008-12-11 09:36:53 +0000 (Thu, 11 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/samtools.1 + +update manual + +------------------------------------------------------------------------ +r43 | lh3lh3 | 2008-12-11 09:25:36 +0000 (Thu, 11 Dec 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-57 + * fixed a bug in parser when there is auxiliary fields + * made the parser a bit more robust + +------------------------------------------------------------------------ +r42 | lh3lh3 | 2008-12-10 14:57:29 +0000 (Wed, 10 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + + * samtools-0.1.0-56 + * fixed a bug in bgzf (only reading is affected) + * fixed a typo in bam_index.c + * in bam_index.c, check potential bugs in the underlying I/O library + +------------------------------------------------------------------------ +r41 | lh3lh3 | 2008-12-10 12:53:08 +0000 (Wed, 10 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/samtools.1 + +update manual + +------------------------------------------------------------------------ +r40 | lh3lh3 | 2008-12-10 11:52:10 +0000 (Wed, 10 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-55 + * tried to make pileup work with clipping (previously not), though NOT tested + * removed -v from pileup + * made pileup take the reference sequence + 
+------------------------------------------------------------------------ +r39 | lh3lh3 | 2008-12-09 11:59:28 +0000 (Tue, 09 Dec 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-54 + * in parser, recognize "=", rather than ",", as a match + * in parser, correctl parse "=" at the MRNM field. + +------------------------------------------------------------------------ +r38 | lh3lh3 | 2008-12-09 11:39:07 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/maq2sam.c + +fixed a bug in handling maq flag 64 and 192 + +------------------------------------------------------------------------ +r37 | lh3lh3 | 2008-12-09 09:53:46 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +also calculate unordered md5sum check + +------------------------------------------------------------------------ +r36 | lh3lh3 | 2008-12-09 09:46:21 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +fixed a minor bug when there are space in the sequence + +------------------------------------------------------------------------ +r35 | lh3lh3 | 2008-12-09 09:40:45 +0000 (Tue, 09 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/md5fa.c + +fixed a potential memory leak + +------------------------------------------------------------------------ +r34 | lh3lh3 | 2008-12-08 14:52:17 +0000 (Mon, 08 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + + * fixed a bug in import: bin is wrongly calculated + +------------------------------------------------------------------------ +r33 | lh3lh3 | 2008-12-08 14:08:01 +0000 (Mon, 08 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/misc/all2sam.pl + +nothing, really + 
+------------------------------------------------------------------------ +r32 | lh3lh3 | 2008-12-08 12:56:02 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/kseq.h + M /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/md5.c + A /branches/dev/samtools/misc/md5.h + A /branches/dev/samtools/misc/md5fa.c + + * fixed two warnings in kseq.h + * added md5sum utilities + +------------------------------------------------------------------------ +r31 | lh3lh3 | 2008-12-08 11:35:29 +0000 (Mon, 08 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/kseq.h + D /branches/dev/samtools/kstream.h + + * samtools-0.1.0-52 + * replace kstream with kseq. kseq is a superset of kstream. I need the + extra functions in kseq.h. + * also compile stand-alone faidx + +------------------------------------------------------------------------ +r30 | lh3lh3 | 2008-12-08 11:17:04 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-51 + * sorting by read names is available + +------------------------------------------------------------------------ +r29 | lh3lh3 | 2008-12-08 10:29:02 +0000 (Mon, 08 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_sort.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/misc/maq2sam.c + + * samtools-0.1.0-50 + * format change to meet the latest specification + +------------------------------------------------------------------------ +r28 | lh3lh3 | 2008-12-04 16:09:21 +0000 (Thu, 04 Dec 
2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/misc/maq2sam.c + + * minor change in maqcns: special care when n==0 + * change maq2sam to meet the latest specification + +------------------------------------------------------------------------ +r27 | lh3lh3 | 2008-12-04 15:55:44 +0000 (Thu, 04 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/razf.h + +considerable code clean up in razf + +------------------------------------------------------------------------ +r26 | lh3lh3 | 2008-12-04 15:08:18 +0000 (Thu, 04 Dec 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/ChangeLog + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/faidx.c + +make RAZF optional in faidx.c + +------------------------------------------------------------------------ +r25 | lh3lh3 | 2008-12-01 15:27:22 +0000 (Mon, 01 Dec 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/bam_aux.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-49 + * added routines for retrieving aux data, NOT TESTED YET! 
+ +------------------------------------------------------------------------ +r24 | lh3lh3 | 2008-12-01 14:29:43 +0000 (Mon, 01 Dec 2008) | 5 lines +Changed paths: + M /branches/dev/samtools/bam.c + M /branches/dev/samtools/bam_import.c + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + M /branches/dev/samtools/samtools.1 + + * samtools-0.1.0-48 + * bgzf: fixed a potential integer overflow on 32-it machines + * maqcns: set the minimum combined quality as 0 + * supporting hex strings + +------------------------------------------------------------------------ +r23 | lh3lh3 | 2008-11-27 17:14:37 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/bam_maqcns.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-47 + * fixed the bug in maqcns + +------------------------------------------------------------------------ +r22 | lh3lh3 | 2008-11-27 17:08:11 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + A /branches/dev/samtools/bam_maqcns.c + A /branches/dev/samtools/bam_maqcns.h + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/glf.h + + * samtools-0.1.0-46 + * add MAQ consensus caller, currently BUGGY! 
+ +------------------------------------------------------------------------ +r21 | lh3lh3 | 2008-11-27 13:51:28 +0000 (Thu, 27 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_pileup.c + M /branches/dev/samtools/bam_tview.c + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-45 + * tview: display padded alignment (but not P operation) + * better coordinates and reference sequence + +------------------------------------------------------------------------ +r19 | lh3lh3 | 2008-11-27 09:26:05 +0000 (Thu, 27 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/ChangeLog + +new ChangeLog + +------------------------------------------------------------------------ +r18 | lh3lh3 | 2008-11-27 09:24:45 +0000 (Thu, 27 Nov 2008) | 3 lines +Changed paths: + D /branches/dev/samtools/ChangeLog + A /branches/dev/samtools/ChangeLog.old (from /branches/dev/samtools/ChangeLog:6) + +Rename ChangeLog to ChangeLog.old. This old ChangeLog is generated from +the log of my personal SVN repository. 
+ +------------------------------------------------------------------------ +r17 | lh3lh3 | 2008-11-27 09:22:55 +0000 (Thu, 27 Nov 2008) | 6 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/bgzf.c + + * samtools-0.1.0-44 + * declare fseeko and ftello as some Linux may not do this by default and + missing these declarations will make bgzf buggy + * get rid of some harmless warings + * use BGZF by default, now + +------------------------------------------------------------------------ +r16 | lh3lh3 | 2008-11-26 21:19:11 +0000 (Wed, 26 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + M /branches/dev/samtools/bamtk.c + M /branches/dev/samtools/razf.c + + * samtools-0.1.0-43 + * fixed a bug in razf_read() + * give more warnings when the file is truncated (or due to bugs in I/O library) + +------------------------------------------------------------------------ +r15 | lh3lh3 | 2008-11-26 20:41:39 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bgzf.c + +fixed a bug in bgzf.c at the end of the file + +------------------------------------------------------------------------ +r14 | lh3lh3 | 2008-11-26 17:05:18 +0000 (Wed, 26 Nov 2008) | 4 lines +Changed paths: + M /branches/dev/samtools/bamtk.c + + * samtools-0.1.0-42 + * a lot happened to RAZF, although samtools itself is untouched. Better + also update the version number anyway to avoid confusion + +------------------------------------------------------------------------ +r13 | lh3lh3 | 2008-11-26 17:03:48 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + +a change from Jue, but I think it should not matter + +------------------------------------------------------------------------ +r12 | lh3lh3 | 2008-11-26 16:48:14 +0000 (Wed, 26 Nov 2008) | 3 lines +Changed paths: + M /branches/dev/samtools/razf.c + +fixed a potential bug in razf. 
However, it seems still buggy, just +rarely happens, very rarely. + +------------------------------------------------------------------------ +r11 | lh3lh3 | 2008-11-26 14:02:56 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/razf.c + +fixed a bug in razf, with the help of Jue + +------------------------------------------------------------------------ +r10 | lh3lh3 | 2008-11-26 11:55:32 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/bam_index.c + +remove a comment + +------------------------------------------------------------------------ +r9 | lh3lh3 | 2008-11-26 11:37:05 +0000 (Wed, 26 Nov 2008) | 2 lines +Changed paths: + M /branches/dev/samtools/Makefile + M /branches/dev/samtools/bam.h + M /branches/dev/samtools/razf.c + M /branches/dev/samtools/razf.h + + * Jue has updated razf to realize Bob's scheme + +------------------------------------------------------------------------ +r7 | lh3lh3 | 2008-11-25 20:37:37 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools/samtools.1 + +the manual page + +------------------------------------------------------------------------ +r6 | lh3lh3 | 2008-11-25 20:37:16 +0000 (Tue, 25 Nov 2008) | 3 lines +Changed paths: + A /branches/dev/samtools/ChangeLog + A /branches/dev/samtools/Makefile + A /branches/dev/samtools/bam.c + A /branches/dev/samtools/bam.h + A /branches/dev/samtools/bam_aux.c + A /branches/dev/samtools/bam_endian.h + A /branches/dev/samtools/bam_import.c + A /branches/dev/samtools/bam_index.c + A /branches/dev/samtools/bam_lpileup.c + A /branches/dev/samtools/bam_pileup.c + A /branches/dev/samtools/bam_sort.c + A /branches/dev/samtools/bam_tview.c + A /branches/dev/samtools/bamtk.c + A /branches/dev/samtools/bgzf.c + A /branches/dev/samtools/bgzf.h + A /branches/dev/samtools/bgzip.c + A /branches/dev/samtools/faidx.c + A /branches/dev/samtools/faidx.h + A /branches/dev/samtools/khash.h + A /branches/dev/samtools/ksort.h + A 
/branches/dev/samtools/kstream.h + A /branches/dev/samtools/misc + A /branches/dev/samtools/misc/Makefile + A /branches/dev/samtools/misc/all2sam.pl + A /branches/dev/samtools/misc/maq2sam.c + A /branches/dev/samtools/razf.c + A /branches/dev/samtools/razf.h + A /branches/dev/samtools/razip.c + A /branches/dev/samtools/zutil.h + +The initial version of samtools, replicated from my local SVN repository. +The current version is: 0.1.0-42. All future development will happen here. + +------------------------------------------------------------------------ +r5 | lh3lh3 | 2008-11-25 20:30:49 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A /branches/dev/samtools + +samtools (C version) + +------------------------------------------------------------------------ +------------------------------------------------------------------------ +r703 | lh3 | 2008-11-25 20:20:02 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/samtools.1 + +rename bamtk to samtools + +------------------------------------------------------------------------ +r702 | lh3 | 2008-11-25 20:15:09 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + D /branches/prog/bam/bamtk.1 + A /branches/prog/bam/samtools.1 (from /branches/prog/bam/bamtk.1:679) + +rename bamtk.1 to samtools.1 + +------------------------------------------------------------------------ +r701 | lh3 | 2008-11-25 13:29:10 +0000 (Tue, 25 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/misc/Makefile + + * samtools-0.1.0-41 + * small (but a bit dangerous) changes to meet the latest specification + +------------------------------------------------------------------------ +r700 | lh3 | 2008-11-25 13:15:11 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + A 
/branches/prog/bam/misc/all2sam.pl (from /branches/prog/bam/misc/all2tam.pl:649) + D /branches/prog/bam/misc/all2tam.pl + A /branches/prog/bam/misc/maq2sam.c (from /branches/prog/bam/misc/maq2tam.c:699) + D /branches/prog/bam/misc/maq2tam.c + +rename tam to sam + +------------------------------------------------------------------------ +r699 | lh3 | 2008-11-25 13:14:49 +0000 (Tue, 25 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/misc/maq2tam.c + +change for the new specification + +------------------------------------------------------------------------ +r698 | lh3 | 2008-11-24 13:15:20 +0000 (Mon, 24 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/razf.c + M /branches/prog/bam/razf.h + + * add a fake BGZF mode to razf. It is fake in that it loads razf index into + memory but gives BGZF like virtual offset + +------------------------------------------------------------------------ +r697 | lh3 | 2008-11-24 09:53:44 +0000 (Mon, 24 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/ChangeLog + +change log + +------------------------------------------------------------------------ +r696 | lh3 | 2008-11-24 09:53:23 +0000 (Mon, 24 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bgzf.c + +updated bgzf, on behalf of Bob + +------------------------------------------------------------------------ +r695 | lh3 | 2008-11-23 11:40:31 +0000 (Sun, 23 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/razf.c + +fixed a bug in razf + +------------------------------------------------------------------------ +r694 | lh3 | 2008-11-22 16:23:52 +0000 (Sat, 22 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_lpileup.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bam-0.1.0-40 + * fixed two small memory leaks + * fixed a memory problem when seek outside the length of the sequence + 
+------------------------------------------------------------------------ +r693 | lh3 | 2008-11-22 16:10:04 +0000 (Sat, 22 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bam-0.1.0-39 + * fixed an uninitialized warning. This does not matter in fact + +------------------------------------------------------------------------ +r692 | lh3 | 2008-11-22 15:44:05 +0000 (Sat, 22 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/razf.c + M /branches/prog/bam/razf.h + +Jue's new razf + +------------------------------------------------------------------------ +r691 | lh3 | 2008-11-21 21:30:39 +0000 (Fri, 21 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/bgzip.c + + * bam-0.1.0-38 + * get rid of some warings in bgzip.c + * potentially improve performance in indexing for BGZF + +------------------------------------------------------------------------ +r690 | lh3 | 2008-11-21 21:15:51 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bgzf.c + +I think I have fixed the bug in bgzf + +------------------------------------------------------------------------ +r689 | lh3 | 2008-11-21 20:48:56 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bgzf.c + +bug fix by Bob + +------------------------------------------------------------------------ +r688 | lh3 | 2008-11-21 20:37:27 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + +fixed a bug due to the name change in _IOLIB + +------------------------------------------------------------------------ +r687 | lh3 | 2008-11-21 14:42:56 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bgzf.c + +fix small things + +------------------------------------------------------------------------ +r686 | lh3 | 
2008-11-21 14:37:59 +0000 (Fri, 21 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/bgzf.c + A /branches/prog/bam/bgzf.h + A /branches/prog/bam/bgzip.c + +Bob's BGZF format, although currently buggy + +------------------------------------------------------------------------ +r685 | lh3 | 2008-11-21 09:48:20 +0000 (Fri, 21 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bam-0.1.0-37 + * improve interface a little bit + +------------------------------------------------------------------------ +r684 | lh3 | 2008-11-21 09:30:18 +0000 (Fri, 21 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bam-0.1.0-36 + * improve the interface of tview, a little bit + +------------------------------------------------------------------------ +r683 | lh3 | 2008-11-20 22:33:54 +0000 (Thu, 20 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam_tview.c + +a little better viewer + +------------------------------------------------------------------------ +r682 | lh3 | 2008-11-20 22:27:01 +0000 (Thu, 20 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-35 + * better viewer + +------------------------------------------------------------------------ +r681 | lh3 | 2008-11-20 20:51:16 +0000 (Thu, 20 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-34 + * tview is now a component of bamtk + +------------------------------------------------------------------------ +r680 | lh3 | 2008-11-20 19:17:30 +0000 (Thu, 20 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/bam_tview.c + +text alignment viewer + +------------------------------------------------------------------------ +r679 | lh3 | 2008-11-20 19:17:15 +0000 (Thu, 
20 Nov 2008) | 5 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_lpileup.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.1 + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/faidx.c + + * bamtk-0.1.0-33 + * added routines to reset pileup bufferes + * fixed a bug in faidx + * add text alignment viewer + +------------------------------------------------------------------------ +r678 | lh3 | 2008-11-20 11:05:02 +0000 (Thu, 20 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/Makefile + A /branches/prog/bam/bam_lpileup.c (from /branches/prog/bam/bam_tview.c:668) + D /branches/prog/bam/bam_tview.c + +rename tview as lpileup + +------------------------------------------------------------------------ +r677 | lh3 | 2008-11-20 10:08:52 +0000 (Thu, 20 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/razf.c + +fixed a bug in razf + +------------------------------------------------------------------------ +r676 | lh3 | 2008-11-19 22:52:20 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/faidx.h + +add documentations + +------------------------------------------------------------------------ +r674 | lh3 | 2008-11-19 21:39:17 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bamtk.1 + M /branches/prog/bam/faidx.h + +update documentation + +------------------------------------------------------------------------ +r673 | lh3 | 2008-11-19 21:19:03 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/bamtk.1 + +add manual page + +------------------------------------------------------------------------ +r672 | lh3 | 2008-11-19 16:40:49 +0000 (Wed, 19 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/faidx.c + + * bamtk-0.1.0-32 + * make faidx more 
error resistant + +------------------------------------------------------------------------ +r671 | lh3 | 2008-11-19 16:09:55 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/faidx.h + +add index + +------------------------------------------------------------------------ +r670 | lh3 | 2008-11-19 16:02:39 +0000 (Wed, 19 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/faidx.c + + * bamtk-0.1.0-31 + * show reference sequence in pileup -v (not in the default pileup) + +------------------------------------------------------------------------ +r669 | lh3 | 2008-11-19 14:51:17 +0000 (Wed, 19 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/faidx.c + + * bamtk-0.1.0-30 + * put faidx in bamtk and remove faidx_main.c + +------------------------------------------------------------------------ +r668 | lh3 | 2008-11-19 14:15:05 +0000 (Wed, 19 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + A /branches/prog/bam/faidx.c + A /branches/prog/bam/faidx.h + M /branches/prog/bam/razf.c + + * bamtk-0.1.0-29 + * fixed a bug in tview.c + * prepare to add faidx + +------------------------------------------------------------------------ +r667 | lh3 | 2008-11-19 10:20:45 +0000 (Wed, 19 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/razf.c + M /branches/prog/bam/razf.h + +gzip-compatible razf + +------------------------------------------------------------------------ +r664 | lh3 | 2008-11-18 12:50:23 +0000 (Tue, 18 Nov 2008) | 5 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-28 + * fetch: fixed a bug at an array boundary + * fetch: fixed a bug when the whole chromosome is 
retrieved + * add linear index + +------------------------------------------------------------------------ +r663 | lh3 | 2008-11-17 21:29:22 +0000 (Mon, 17 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-27 + * put l_qseq into core and move l_aux to bam1_t + +------------------------------------------------------------------------ +r662 | lh3 | 2008-11-17 20:55:16 +0000 (Mon, 17 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-26 + * save seq and qual separately + +------------------------------------------------------------------------ +r661 | lh3 | 2008-11-17 13:09:37 +0000 (Mon, 17 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + +little + +------------------------------------------------------------------------ +r660 | lh3 | 2008-11-17 13:06:14 +0000 (Mon, 17 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + +more documentations + +------------------------------------------------------------------------ +r659 | lh3 | 2008-11-17 12:55:08 +0000 (Mon, 17 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-25 + * make tview work for TAM + +------------------------------------------------------------------------ +r658 | lh3 | 2008-11-17 12:50:21 +0000 (Mon, 17 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-24 + * make tview as an independent module + 
+------------------------------------------------------------------------ +r657 | lh3 | 2008-11-17 11:26:06 +0000 (Mon, 17 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + +change little + +------------------------------------------------------------------------ +r656 | lh3 | 2008-11-16 21:33:19 +0000 (Sun, 16 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-23 + * also add tview for TAM + +------------------------------------------------------------------------ +r655 | lh3 | 2008-11-16 21:29:46 +0000 (Sun, 16 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-22 + * make tview more efficient for deep depth + +------------------------------------------------------------------------ +r654 | lh3 | 2008-11-16 20:52:19 +0000 (Sun, 16 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_pileup.c + A /branches/prog/bam/bam_tview.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-21 + * fixed bug in the TAM parser: lowercase not recognized + * unfinished function to leveled pileup (tview) + +------------------------------------------------------------------------ +r653 | lh3 | 2008-11-15 12:58:36 +0000 (Sat, 15 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-20 + * pileup now display deleted bases as '*' + +------------------------------------------------------------------------ +r652 | lh3 | 2008-11-15 09:58:39 +0000 (Sat, 15 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-19 + * fixed a bug in fetch() + * reduce memory in indexing + 
+------------------------------------------------------------------------ +r651 | lh3 | 2008-11-14 21:56:05 +0000 (Fri, 14 Nov 2008) | 5 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-18 + * important changes are made to index: the index size is increased, but + now we have no limit on file sizes and the new method potentially + works with BGZF, Bob's new compression format. + +------------------------------------------------------------------------ +r650 | lh3 | 2008-11-14 16:03:22 +0000 (Fri, 14 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-17 + * more comments in bam.h + * fixed a bug in bam_index.c + +------------------------------------------------------------------------ +r649 | lh3 | 2008-11-13 16:04:18 +0000 (Thu, 13 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bam_sort.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-16 + * use macros to retrieve pointers from bam1_t and thus reduce the size + of bam1_t struct. 
+ +------------------------------------------------------------------------ +r648 | lh3 | 2008-11-13 13:21:39 +0000 (Thu, 13 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_sort.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-15 + * make more things work over pipe + +------------------------------------------------------------------------ +r647 | lh3 | 2008-11-13 12:49:28 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/misc/maq2tam.c + +fixed a bug in maq2tam + +------------------------------------------------------------------------ +r646 | lh3 | 2008-11-13 11:46:59 +0000 (Thu, 13 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/Makefile + M /branches/prog/bam/misc/Makefile + M /branches/prog/bam/misc/maq2tam.c + + * bug fix in maq2tam.c + * improve Makefile + +------------------------------------------------------------------------ +r645 | lh3 | 2008-11-13 11:39:46 +0000 (Thu, 13 Nov 2008) | 3 lines +Changed paths: + A /branches/prog/bam/misc/Makefile + M /branches/prog/bam/misc/maq2tam.c + + * corrected maq2tam + * add Makefile + +------------------------------------------------------------------------ +r644 | lh3 | 2008-11-13 11:25:45 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/razf.c + +fixed the bug in buffered write (on behalf of Jue) + +------------------------------------------------------------------------ +r643 | lh3 | 2008-11-13 10:53:42 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + D /branches/prog/bam/all2tam.pl + A /branches/prog/bam/misc/all2tam.pl (from /branches/prog/bam/all2tam.pl:642) + +move to misc + +------------------------------------------------------------------------ +r642 | lh3 | 2008-11-13 10:53:23 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/all2tam.pl + +change tag + +------------------------------------------------------------------------ +r641 | lh3 | 2008-11-13 10:53:12 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed 
paths: + D /branches/prog/bam/utils + +has been renamed + +------------------------------------------------------------------------ +r640 | lh3 | 2008-11-13 10:52:50 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/misc (from /branches/prog/bam/utils:639) + +rename + +------------------------------------------------------------------------ +r639 | lh3 | 2008-11-13 10:52:35 +0000 (Thu, 13 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam/utils + A /branches/prog/bam/utils/maq2tam.c + +utilities (converters and so on) + +------------------------------------------------------------------------ +r638 | lh3 | 2008-11-12 22:24:22 +0000 (Wed, 12 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-14 + * copy the text header to BAM + * add BAM1 header flag + +------------------------------------------------------------------------ +r637 | lh3 | 2008-11-12 14:56:08 +0000 (Wed, 12 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/razf.c + + * bamtk-0.1.0-13 + * fixed a bug in razf + * improved and fixed potential bugs in index + +------------------------------------------------------------------------ +r636 | lh3 | 2008-11-12 11:57:13 +0000 (Wed, 12 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + +update documentation in the HeaderDOC format + +------------------------------------------------------------------------ +r635 | lh3 | 2008-11-12 10:08:38 +0000 (Wed, 12 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + 
* bamtk-0.1.0-12 + * more documentations + * rename baf1_core_t as bam1_core_t + +------------------------------------------------------------------------ +r634 | lh3 | 2008-11-11 23:00:35 +0000 (Tue, 11 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + +documentation + +------------------------------------------------------------------------ +r633 | lh3 | 2008-11-11 21:23:49 +0000 (Tue, 11 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-11 + * give up regional pileup. We can now use pipe to mimic that. + * for index file, change suffix .idx to .bmi + +------------------------------------------------------------------------ +r632 | lh3 | 2008-11-11 21:00:11 +0000 (Tue, 11 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/razf.c + + * bamtk-0.1.0-10 + * make pileup work on TAM + +------------------------------------------------------------------------ +r631 | lh3 | 2008-11-11 09:20:29 +0000 (Tue, 11 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/razf.c + M /branches/prog/bam/razf.h + M /branches/prog/bam/razip.c + + * bamtk-0.1.0-9 + * razf now supports streaming + * prepare to improve pileup (have not yet) + +------------------------------------------------------------------------ +r630 | lh3 | 2008-11-10 18:34:40 +0000 (Mon, 10 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-8 + * improve the interface of TAM parser + +------------------------------------------------------------------------ +r629 | lh3 | 2008-11-10 13:06:13 +0000 (Mon, 10 Nov 2008) | 3 
lines +Changed paths: + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-7 + * almost nothing + +------------------------------------------------------------------------ +r628 | lh3 | 2008-11-10 12:56:36 +0000 (Mon, 10 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-6 + * fixed a bug in bam_pileup.c + +------------------------------------------------------------------------ +r627 | lh3 | 2008-11-10 11:32:46 +0000 (Mon, 10 Nov 2008) | 4 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bamtk.c + M /branches/prog/bam/razf.c + + * bamtk-0.1.0-5 + * fixed a bug in razf.c, caused by my modifications + * improve the interface of pileup. Now it will be slower but more flexible + +------------------------------------------------------------------------ +r626 | lh3 | 2008-11-09 20:51:04 +0000 (Sun, 09 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.h + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-4 + * view: dumping binary output + +------------------------------------------------------------------------ +r625 | lh3 | 2008-11-09 20:31:54 +0000 (Sun, 09 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bam_pileup.c + M /branches/prog/bam/bam_sort.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-3 + * rename functions + +------------------------------------------------------------------------ +r624 | lh3 | 2008-11-09 15:07:32 +0000 (Sun, 09 Nov 2008) | 2 lines +Changed paths: + M /branches/prog/bam/bam.h + +add comments + +------------------------------------------------------------------------ +r623 | lh3 | 2008-11-08 22:32:49 +0000 (Sat, 08 Nov 2008) | 4 lines +Changed paths: + M 
/branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-2 + * improve indexing for a mixture of long and short reads, although currently + I do not know whether it really works... + +------------------------------------------------------------------------ +r622 | lh3 | 2008-11-08 22:13:58 +0000 (Sat, 08 Nov 2008) | 3 lines +Changed paths: + M /branches/prog/bam/bam_index.c + M /branches/prog/bam/bamtk.c + + * bamtk-0.1.0-1 + * prepare for improving indexing algorithm + +------------------------------------------------------------------------ +r621 | lh3 | 2008-11-08 20:28:09 +0000 (Sat, 08 Nov 2008) | 4 lines +Changed paths: + A /branches/prog/bam/all2tam.pl + M /branches/prog/bam/bam.c + M /branches/prog/bam/bam.h + M /branches/prog/bam/bam_import.c + M /branches/prog/bam/bamtk.c + D /branches/prog/bam/tam_utils.pl + + * bamtk-0.1.0 + * smarter integers + * rename tam_utils.pl to all2tam.pl + +------------------------------------------------------------------------ +r620 | lh3 | 2008-11-08 17:17:22 +0000 (Sat, 08 Nov 2008) | 2 lines +Changed paths: + A /branches/prog/bam + A /branches/prog/bam/Makefile + A /branches/prog/bam/bam.c + A /branches/prog/bam/bam.h + A /branches/prog/bam/bam_endian.h + A /branches/prog/bam/bam_import.c + A /branches/prog/bam/bam_index.c + A /branches/prog/bam/bam_pileup.c + A /branches/prog/bam/bam_sort.c + A /branches/prog/bam/bamtk.c + A /branches/prog/bam/khash.h + A /branches/prog/bam/ksort.h + A /branches/prog/bam/kstream.h + A /branches/prog/bam/razf.c + A /branches/prog/bam/razf.h + A /branches/prog/bam/razip.c + A /branches/prog/bam/tam_utils.pl + A /branches/prog/bam/zutil.h + +The Binary Alignment/Mapping format. 
+ +------------------------------------------------------------------------ diff --git a/sam/Makefile b/sam/Makefile index bb89e00..73c8df4 100644 --- a/sam/Makefile +++ b/sam/Makefile @@ -1,21 +1,20 @@ CC= gcc - CFLAGS_EXTRA= #CFLAGS_EXTRA= -L/usr/include/ncurses -CFLAGS= -g -Wall -O2 $(CFLAGS_EXTRA) #-m64 #-arch ppc - +CFLAGS= -g -Wall -O2 $(CFLAGS_EXTRA) +#LDFLAGS= -Wl,-rpath,\$$ORIGIN/../lib DFLAGS_EXTRA= #DFLAGS_EXTRA= -Dexpl=exp -Dlogl=log DFLAGS= -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=1 $(DFLAGS_EXTRA) - KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ bam_pileup.o bam_lpileup.o bam_md.o razf.o faidx.o bedidx.o \ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o bam_cat.o -AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ - bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ +AOBJS= bam_tview.o bam_plcmd.o sam_view.o \ + bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o \ - cut_target.o phase.o bam2depth.o + cut_target.o phase.o bam2depth.o padding.o bedcov.o bamshuf.o \ + bam_tview_curses.o bam_tview_html.o PROG= samtools INCLUDES= -I. SUBDIRS= . 
bcftools misc @@ -23,6 +22,7 @@ LIBPATH= LIBCURSES= -lcurses # -lXCurses .SUFFIXES:.c .o +.PHONY: all lib .c.o: $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ @@ -48,13 +48,16 @@ libbam.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) samtools:lib-recur $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf $(LIBCURSES) -lm -lz + $(CC) $(CFLAGS) -o $@ $(AOBJS) $(LDFLAGS) libbam.a -Lbcftools -lbcf $(LIBPATH) $(LIBCURSES) -lm -lz -lpthread razip:razip.o razf.o $(KNETFILE_O) - $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz + $(CC) $(CFLAGS) -o $@ $^ -lz bgzip:bgzip.o bgzf.o $(KNETFILE_O) - $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o $(KNETFILE_O) -lz + $(CC) $(CFLAGS) -o $@ $^ -lz -lpthread + +bgzf.o:bgzf.c bgzf.h + $(CC) -c $(CFLAGS) $(DFLAGS) -DBGZF_CACHE $(INCLUDES) bgzf.c -o $@ razip.o:razf.h bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h @@ -64,7 +67,9 @@ bam_pileup.o:bam.h razf.h ksort.h bam_plcmd.o:bam.h faidx.h bcftools/bcf.h bam2bcf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h -bam_tview.o:bam.h faidx.h +bam_tview.o:bam.h faidx.h bam_tview.h +bam_tview_curses.o:bam.h faidx.h bam_tview.h +bam_tview_html.o:bam.h faidx.h bam_tview.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h sam_header.o:sam_header.h khash.h diff --git a/sam/NEWS b/sam/NEWS index 41a6cc8..121485e 100644 --- a/sam/NEWS +++ b/sam/NEWS @@ -1,3 +1,33 @@ +Beta Release 0.1.19 (15 March, 2013) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools and bcftools: + + * The latest source code and development moved to github, + http://github.com/samtools/samtools + + * Many important bugfixes and contributions by many people. Thanks to all! 
+ + * Performance improvements (multi-threading) + + * Important changes in calling, see + - samtools mpileup -p + - bcftools view -m + + * New annotations useful for filtering (RPB, HWE, QBD, MDV) + + * New tools, bamcheck and plot-bamcheck + + * New features in samtools tview + + * And much more.. + +For a detailed list of commits, please see +http://github.com/samtools/samtools/commits/master + +(0.1.19: 15 March 2013, commit 96b5f2294ac0054230e88913c4983d548069ea4e) + + Beta Release 0.1.18 (2 September, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/sam/bam.c b/sam/bam.c index 0055e84..b00d6a6 100644 --- a/sam/bam.c +++ b/sam/bam.c @@ -7,7 +7,7 @@ #include "kstring.h" #include "sam_header.h" -int bam_is_be = 0, bam_verbose = 2; +int bam_is_be = 0, bam_verbose = 2, bam_no_B = 0; char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; /************************** @@ -16,12 +16,26 @@ char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; uint32_t bam_calend(const bam1_core_t *c, const uint32_t *cigar) { - uint32_t k, end; - end = c->pos; + int k, end = c->pos; for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) - end += cigar[k] >> BAM_CIGAR_SHIFT; + int op = bam_cigar_op(cigar[k]); + int len = bam_cigar_oplen(cigar[k]); + if (op == BAM_CBACK) { // move backward + int l, u, v; + if (k == c->n_cigar - 1) break; // skip trailing 'B' + for (l = k - 1, u = v = 0; l >= 0; --l) { + int op1 = bam_cigar_op(cigar[l]); + int len1 = bam_cigar_oplen(cigar[l]); + if (bam_cigar_type(op1)&1) { // consume query + if (u + len1 >= len) { // stop + if (bam_cigar_type(op1)&2) v += len - u; + break; + } else u += len1; + } + if (bam_cigar_type(op1)&2) v += len1; + } + end = l < 0? 
c->pos : end - v; + } else if (bam_cigar_type(op)&2) end += bam_cigar_oplen(cigar[k]); } return end; } @@ -30,11 +44,9 @@ int32_t bam_cigar2qlen(const bam1_core_t *c, const uint32_t *cigar) { uint32_t k; int32_t l = 0; - for (k = 0; k < c->n_cigar; ++k) { - int op = cigar[k] & BAM_CIGAR_MASK; - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) - l += cigar[k] >> BAM_CIGAR_SHIFT; - } + for (k = 0; k < c->n_cigar; ++k) + if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) + l += bam_cigar_oplen(cigar[k]); return l; } @@ -206,6 +218,7 @@ int bam_read1(bamFile fp, bam1_t *b) if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; if (bam_is_be) swap_endian_data(c, b->data_len, b->data); + if (bam_no_B) bam_remove_B(b); return 4 + block_len; } @@ -266,9 +279,10 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); if (c->n_cigar == 0) kputc('*', &str); else { + uint32_t *cigar = bam1_cigar(b); for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); - kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); + kputc(bam_cigar_opchr(cigar[i]), &str); } } kputc('\t', &str); @@ -360,3 +374,101 @@ const char *bam_get_library(bam_header_t *h, const bam1_t *b) rg = bam_aux_get(b, "RG"); return (rg == 0)? 
0 : sam_tbl_get(h->rg2lib, (const char*)(rg + 1)); } + +/************ + * Remove B * + ************/ + +int bam_remove_B(bam1_t *b) +{ + int i, j, end_j, k, l, no_qual; + uint32_t *cigar, *new_cigar; + uint8_t *seq, *qual, *p; + // test if removal is necessary + if (b->core.flag & BAM_FUNMAP) return 0; // unmapped; do nothing + cigar = bam1_cigar(b); + for (k = 0; k < b->core.n_cigar; ++k) + if (bam_cigar_op(cigar[k]) == BAM_CBACK) break; + if (k == b->core.n_cigar) return 0; // no 'B' + if (bam_cigar_op(cigar[0]) == BAM_CBACK) goto rmB_err; // cannot be removed + // allocate memory for the new CIGAR + if (b->data_len + (b->core.n_cigar + 1) * 4 > b->m_data) { // not enough memory + b->m_data = b->data_len + b->core.n_cigar * 4; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + cigar = bam1_cigar(b); // after realloc, cigar may be changed + } + new_cigar = (uint32_t*)(b->data + (b->m_data - b->core.n_cigar * 4)); // from the end of b->data + // the core loop + seq = bam1_seq(b); qual = bam1_qual(b); + no_qual = (qual[0] == 0xff); // test whether base quality is available + i = j = 0; end_j = -1; + for (k = l = 0; k < b->core.n_cigar; ++k) { + int op = bam_cigar_op(cigar[k]); + int len = bam_cigar_oplen(cigar[k]); + if (op == BAM_CBACK) { // the backward operation + int t, u; + if (k == b->core.n_cigar - 1) break; // ignore 'B' at the end of CIGAR + if (len > j) goto rmB_err; // an excessively long backward + for (t = l - 1, u = 0; t >= 0; --t) { // look back + int op1 = bam_cigar_op(new_cigar[t]); + int len1 = bam_cigar_oplen(new_cigar[t]); + if (bam_cigar_type(op1)&1) { // consume the query + if (u + len1 >= len) { // stop + new_cigar[t] -= (len - u) << BAM_CIGAR_SHIFT; + break; + } else u += len1; + } + } + if (bam_cigar_oplen(new_cigar[t]) == 0) --t; // squeeze out the zero-length operation + l = t + 1; + end_j = j; j -= len; + } else { // other CIGAR operations + new_cigar[l++] = cigar[k]; + if (bam_cigar_type(op)&1) { // consume the 
query + if (i != j) { // no need to copy if i == j + int u, c, c0; + for (u = 0; u < len; ++u) { // construct the consensus + c = bam1_seqi(seq, i+u); + if (j + u < end_j) { // in an overlap + c0 = bam1_seqi(seq, j+u); + if (c != c0) { // a mismatch; choose the better base + if (qual[j+u] < qual[i+u]) { // the base in the 2nd segment is better + bam1_seq_seti(seq, j+u, c); + qual[j+u] = qual[i+u] - qual[j+u]; + } else qual[j+u] -= qual[i+u]; // the 1st is better; reduce base quality + } else qual[j+u] = qual[j+u] > qual[i+u]? qual[j+u] : qual[i+u]; + } else { // not in an overlap; copy over + bam1_seq_seti(seq, j+u, c); + qual[j+u] = qual[i+u]; + } + } + } + i += len, j += len; + } + } + } + if (no_qual) qual[0] = 0xff; // in very rare cases, this may be modified + // merge adjacent operations if possible + for (k = 1; k < l; ++k) + if (bam_cigar_op(new_cigar[k]) == bam_cigar_op(new_cigar[k-1])) + new_cigar[k] += new_cigar[k-1] >> BAM_CIGAR_SHIFT << BAM_CIGAR_SHIFT, new_cigar[k-1] &= 0xf; + // kill zero length operations + for (k = i = 0; k < l; ++k) + if (new_cigar[k] >> BAM_CIGAR_SHIFT) + new_cigar[i++] = new_cigar[k]; + l = i; + // update b + memcpy(cigar, new_cigar, l * 4); // set CIGAR + p = b->data + b->core.l_qname + l * 4; + memmove(p, seq, (j+1)>>1); p += (j+1)>>1; // set SEQ + memmove(p, qual, j); p += j; // set QUAL + memmove(p, bam1_aux(b), b->l_aux); p += b->l_aux; // set optional fields + b->core.n_cigar = l, b->core.l_qseq = j; // update CIGAR length and query length + b->data_len = p - b->data; // update record length + return 0; + +rmB_err: + b->core.flag |= BAM_FUNMAP; + return -1; +} diff --git a/sam/bam.h b/sam/bam.h index 346c750..80e8703 100644 --- a/sam/bam.h +++ b/sam/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. 
*/ -#define BAM_VERSION "0.1.18 (r982:295)" +#define BAM_VERSION "0.1.19-44428cd" #include #include @@ -89,7 +89,7 @@ typedef struct { char **target_name; uint32_t *target_len; void *dict, *hash, *rg2lib; - size_t l_text, n_text; + uint32_t l_text, n_text; char *text; } bam_header_t; @@ -150,15 +150,24 @@ typedef struct { /*! @abstract CIGAR: P = padding */ #define BAM_CPAD 6 /*! @abstract CIGAR: equals = match */ -#define BAM_CEQUAL 7 +#define BAM_CEQUAL 7 /*! @abstract CIGAR: X = mismatch */ -#define BAM_CDIFF 8 +#define BAM_CDIFF 8 +#define BAM_CBACK 9 + +#define BAM_CIGAR_STR "MIDNSHP=XB" +#define BAM_CIGAR_TYPE 0x3C1A7 + +#define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) +#define bam_cigar_oplen(c) ((c)>>BAM_CIGAR_SHIFT) +#define bam_cigar_opchr(c) (BAM_CIGAR_STR[bam_cigar_op(c)]) +#define bam_cigar_gen(l, o) ((l)<>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference /*! @typedef @abstract Structure for core alignment information. @field tid chromosome ID, defined by bam_header_t @field pos 0-based leftmost coordinate - @field strand strand; 0 for forward and 1 otherwise @field bin bin calculated by bam_reg2bin() @field qual mapping quality @field l_qname length of the query name @@ -183,13 +192,15 @@ typedef struct { @field l_aux length of auxiliary data @field data_len current length of bam1_t::data @field m_data maximum length of bam1_t::data - @field data all variable-length data, concatenated; structure: cigar-qname-seq-qual-aux + @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux @discussion Notes: 1. qname is zero tailing and core.l_qname includes the tailing '\0'. 2. l_qseq is calculated from the total length of an alignment block on reading or from CIGAR. + 3. cigar data is encoded 4 bytes per CIGAR operation. + 4. seq is nybble-encoded according to bam_nt16_table. 
*/ typedef struct { bam1_core_t core; @@ -245,7 +256,10 @@ typedef struct __bam_iter_t *bam_iter_t; @param i The i-th position, 0-based @return 4-bit integer representing the base. */ -#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +//#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) +#define bam1_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) + +#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) ) /*! @function @abstract Get query sequence and quality @@ -275,6 +289,8 @@ extern int bam_is_be; */ extern int bam_verbose; +extern int bam_no_B; + /*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */ extern unsigned char bam_nt16_table[256]; @@ -420,6 +436,8 @@ extern "C" { */ int bam_read1(bamFile fp, bam1_t *b); + int bam_remove_B(bam1_t *b); + /*! @abstract Write an alignment to BAM. @param fp BAM file handler @@ -755,9 +773,21 @@ static inline int bam_aux_type2size(int x) { if (x == 'C' || x == 'c' || x == 'A') return 1; else if (x == 'S' || x == 's') return 2; - else if (x == 'I' || x == 'i' || x == 'f') return 4; + else if (x == 'I' || x == 'i' || x == 'f' || x == 'F') return 4; else return 0; } +/********************************* + *** Compatibility with htslib *** + *********************************/ + +typedef bam_header_t bam_hdr_t; + +#define bam_get_qname(b) bam1_qname(b) +#define bam_get_cigar(b) bam1_cigar(b) + +#define bam_hdr_read(fp) bam_header_read(fp) +#define bam_hdr_write(fp, h) bam_header_write(fp, h) +#define bam_hdr_destroy(fp) bam_header_destroy(fp) #endif diff --git a/sam/bam2bcf.c b/sam/bam2bcf.c index dec3305..340b10b 100644 --- a/sam/bam2bcf.c +++ b/sam/bam2bcf.c @@ -1,5 +1,6 @@ #include #include +#include #include "bam.h" #include "kstring.h" #include "bam2bcf.h" @@ -26,20 +27,55 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->e = errmod_init(1. 
- theta); bca->min_frac = 0.002; bca->min_support = 1; - return bca; + bca->per_sample_flt = 0; + bca->npos = 100; + bca->ref_pos = calloc(bca->npos, sizeof(int)); + bca->alt_pos = calloc(bca->npos, sizeof(int)); + return bca; +} + + +static int get_position(const bam_pileup1_t *p, int *len) +{ + int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; + for (icig=0; icigb->core.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(p->b)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==0 ) + { + n_tot_bases += ncig; + iread += ncig; + } + else if ( cig==1 ) + { + n_tot_bases += ncig; + iread += ncig; + } + else if ( cig==4 ) + { + iread += ncig; + if ( iread<=p->qpos ) edist -= ncig; + } + } + *len = n_tot_bases; + return edist; } void bcf_call_destroy(bcf_callaux_t *bca) { if (bca == 0) return; errmod_destroy(bca->e); + if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } free(bca->bases); free(bca->inscns); free(bca); } /* ref_base is the 4-bit representation of the reference base. It is * negative if we are looking at an indel. 
*/ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) { - static int *var_pos = NULL, nvar_pos = 0; int i, n, ref4, is_indel, ori_depth = 0; memset(r, 0, sizeof(bcf_callret1_t)); if (ref_base >= 0) { @@ -54,8 +90,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } // fill the bases array - memset(r, 0, sizeof(bcf_callret1_t)); - for (i = n = 0; i < _n; ++i) { + for (i = n = r->n_supp = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; // set base @@ -78,6 +113,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t b = p->aux>>16&0x3f; is_diff = (b != 0); } + if (is_diff) ++r->n_supp; bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; // collect annotations if (b < 4) r->qsum[b] += q; @@ -91,97 +127,163 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; r->anno[3<<2|is_diff<<1|0] += min_dist; r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; + + // collect read positions for ReadPosBias + int len, pos = get_position(p, &len); + int epos = (double)pos/(len+1) * bca->npos; + if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) + bca->ref_pos[epos]++; + else + bca->alt_pos[epos]++; } r->depth = n; r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); + return r->depth; +} - // Calculate the Variant Distance Bias (make it optional?) - if ( nvar_pos < _n ) { - nvar_pos = _n; - var_pos = realloc(var_pos,sizeof(int)*nvar_pos); - } - int alt_dp=0, read_len=0; - for (i=0; i<_n; i++) { - const bam_pileup1_t *p = pl + i; - if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) - continue; +double mann_whitney_1947(int n, int m, int U) +{ + if (U<0) return 0; + if (n==0||m==0) return U==0 ? 
1 : 0; + return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U); +} - var_pos[alt_dp] = p->qpos; - if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 ) - var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT; +void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call) +{ + int i, nref = 0, nalt = 0; + unsigned long int U = 0; + for (i=0; inpos; i++) + { + nref += bca->ref_pos[i]; + nalt += bca->alt_pos[i]; + U += nref*bca->alt_pos[i]; + bca->ref_pos[i] = 0; + bca->alt_pos[i] = 0; + } +#if 0 +//todo + double var = 0, avg = (double)(nref+nalt)/bca->npos; + for (i=0; inpos; i++) + { + double ediff = bca->ref_pos[i] + bca->alt_pos[i] - avg; + var += ediff*ediff; + bca->ref_pos[i] = 0; + bca->alt_pos[i] = 0; + } + call->read_pos.avg = avg; + call->read_pos.var = sqrt(var/bca->npos); + call->read_pos.dp = nref+nalt; +#endif + if ( !nref || !nalt ) + { + call->read_pos_bias = -1; + return; + } - alt_dp++; - read_len += p->b->core.l_qseq; + if ( nref>=8 || nalt>=8 ) + { + // normal approximation + double mean = ((double)nref*nalt+1.0)/2.0; + double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0; + double z = (U-mean)/sqrt(var2); + call->read_pos_bias = z; + //fprintf(stderr,"nref=%d nalt=%d U=%ld mean=%e var=%e zval=%e\n", nref,nalt,U,mean,sqrt(var2),call->read_pos_bias); } - float mvd=0; - int j; - n=0; - for (i=0; i= 1./sqrt(var2*2*M_PI) ) z = 0; // equal to mean + else + { + if ( U >= nref*nalt/2. ) z = sqrt(-2*log(sqrt(var2*2*M_PI)*p)); + else z = -sqrt(-2*log(sqrt(var2*2*M_PI)*p)); } + call->read_pos_bias = z; + //fprintf(stderr,"nref=%d nalt=%d U=%ld p=%e var2=%e zval=%e\n", nref,nalt,U, p,var2,call->read_pos_bias); } - r->mvd[0] = n ? mvd/n : 0; - r->mvd[1] = alt_dp; - r->mvd[2] = alt_dp ? read_len/alt_dp : 0; - - return r->depth; } - -void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call) +float mean_diff_to_prob(float mdiff, int dp, int readlen) { - // Variant distance bias. 
Samples merged by means of DP-weighted average. - - float weight=0, tot_prob=0; - - int i; - for (i=0; i2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14); - } - else - { - // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths. - if ( dp>5 ) - dp = 5; - float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1)); - float norm = 1.125*sqrt(2*3.14*sigma2); - float mu = read_len/2.9; - if ( mvd < mu ) - prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm; - else - prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm; - } + float m, v; + if ( dp>=24 ) + { + m = readlen/8.; + if (dp>100) dp = 100; + v = 1.476/(0.182*pow(dp,0.514)); + v = v*(readlen/100.); + } + else + { + m = mv[dp][0]; + v = mv[dp][1]; + m = m*readlen/100.; + v = v*readlen/100.; + v *= 1.2; // allow more variability + } + return 1.0/(v*sqrt(2*M_PI)) * exp(-0.5*((mdiff-m)/v)*((mdiff-m)/v)); +} - //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob); - tot_prob += prob*dp; - weight += dp; +void calc_vdb(bcf_callaux_t *bca, bcf_call_t *call) +{ + int i, dp = 0; + float mean_pos = 0, mean_diff = 0; + for (i=0; inpos; i++) + { + if ( !bca->alt_pos[i] ) continue; + dp += bca->alt_pos[i]; + int j = inpos/2 ? i : bca->npos - i; + mean_pos += bca->alt_pos[i]*j; + } + if ( dp<2 ) + { + call->vdb = -1; + return; } - tot_prob = weight ? tot_prob/weight : 1; - //fprintf(stderr,"prob=%f\n", tot_prob); - call->vdb = tot_prob; + mean_pos /= dp; + for (i=0; inpos; i++) + { + if ( !bca->alt_pos[i] ) continue; + int j = inpos/2 ? 
i : bca->npos - i; + mean_diff += bca->alt_pos[i] * fabs(j - mean_pos); + } + mean_diff /= dp; + call->vdb = mean_diff_to_prob(mean_diff, dp, bca->npos); } -int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) +/** + * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles + * @n: number of samples + * @calls: each sample's calls + * @bca: auxiliary data structure for holding temporary values + * @ref_base: the reference base + * @call: filled with the annotations + */ +int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j, qsum[4]; int64_t tmp; @@ -194,6 +296,8 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, for (i = 0; i < n; ++i) for (j = 0; j < 4; ++j) qsum[j] += calls[i].qsum[j]; + int qsum_tot=0; + for (j=0; j<4; j++) { qsum_tot += qsum[j]; call->qsum[j] = 0; } for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j; // find the top 2 alleles for (i = 1; i < 4; ++i) // insertion sort @@ -205,9 +309,15 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, call->a[0] = ref4; for (i = 3, j = 1; i >= 0; --i) { if ((qsum[i]&3) != ref4) { - if (qsum[i]>>2 != 0) call->a[j++] = qsum[i]&3; + if (qsum[i]>>2 != 0) + { + if ( j<4 ) call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; // ref N can make j>=4 + call->a[j++] = qsum[i]&3; + } else break; } + else + call->qsum[0] = (float)(qsum[i]>>2)/qsum_tot; } if (ref_base >= 0) { // for SNPs, find the "unseen" base if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) @@ -255,12 +365,13 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } - calc_vdb(n, calls, call); + calc_vdb(bca, call); + calc_ReadPosBias(bca, call); return 0; } -int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t 
*bcr, int is_SP, +int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref) { extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); @@ -304,34 +415,39 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc } kputc('\0', &s); // INFO - if (bc->ori_ref < 0) kputs("INDEL;", &s); + if (bc->ori_ref < 0) ksprintf(&s,"INDEL;IS=%d,%f;", bca->max_support, bca->max_frac); kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s); for (i = 0; i < 16; ++i) { if (i) kputc(',', &s); kputw(bc->anno[i], &s); } - if ( bc->vdb!=1 ) - { - ksprintf(&s, ";VDB=%.4f", bc->vdb); - } + //ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var); + ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]); + if (bc->vdb != -1) + ksprintf(&s, ";VDB=%e", bc->vdb); + if (bc->read_pos_bias != -1 ) + ksprintf(&s, ";RPB=%e", bc->read_pos_bias); kputc('\0', &s); // FMT kputs("PL", &s); - if (bcr) { - kputs(":DP", &s); - if (is_SP) kputs(":SP", &s); + if (bcr && fmt_flag) { + if (fmt_flag & B2B_FMT_DP) kputs(":DP", &s); + if (fmt_flag & B2B_FMT_DV) kputs(":DV", &s); + if (fmt_flag & B2B_FMT_SP) kputs(":SP", &s); } kputc('\0', &s); b->m_str = s.m; b->str = s.s; b->l_str = s.l; bcf_sync(b); memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n); - if (bcr) { - uint16_t *dp = (uint16_t*)b->gi[1].data; - int32_t *sp = is_SP? b->gi[2].data : 0; + if (bcr && fmt_flag) { + uint16_t *dp = (fmt_flag & B2B_FMT_DP)? b->gi[1].data : 0; + uint16_t *dv = (fmt_flag & B2B_FMT_DV)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0)].data : 0; + int32_t *sp = (fmt_flag & B2B_FMT_SP)? b->gi[1 + ((fmt_flag & B2B_FMT_DP) != 0) + ((fmt_flag & B2B_FMT_DV) != 0)].data : 0; for (i = 0; i < bc->n; ++i) { bcf_callret1_t *p = bcr + i; - dp[i] = p->depth < 0xffff? 
p->depth : 0xffff; - if (is_SP) { + if (dp) dp[i] = p->depth < 0xffff? p->depth : 0xffff; + if (dv) dv[i] = p->n_supp < 0xffff? p->n_supp : 0xffff; + if (sp) { if (p->anno[0] + p->anno[1] < 2 || p->anno[2] + p->anno[3] < 2 || p->anno[0] + p->anno[2] < 2 || p->anno[1] + p->anno[3] < 2) { diff --git a/sam/bam2bcf.h b/sam/bam2bcf.h index 4af080c..b2b1825 100644 --- a/sam/bam2bcf.h +++ b/sam/bam2bcf.h @@ -7,15 +7,22 @@ #define B2B_INDEL_NULL 10000 +#define B2B_FMT_DP 0x1 +#define B2B_FMT_SP 0x2 +#define B2B_FMT_DV 0x4 + typedef struct __bcf_callaux_t { int capQ, min_baseQ; int openQ, extQ, tandemQ; // for indels - int min_support; // for collecting indel candidates - double min_frac; // for collecting indel candidates + int min_support, max_support; // for collecting indel candidates + double min_frac, max_frac; // for collecting indel candidates + int per_sample_flt; // indel filtering strategy + int *ref_pos, *alt_pos, npos; // for ReadPosBias // for internal uses int max_bases; int indel_types[4]; int maxins, indelreg; + int read_len; char *inscns; uint16_t *bases; errmod_t *e; @@ -23,18 +30,21 @@ typedef struct __bcf_callaux_t { } bcf_callaux_t; typedef struct { - int depth, ori_depth, qsum[4]; - int anno[16]; + int depth, n_supp, ori_depth, qsum[4]; + unsigned int anno[16]; float p[25]; - int mvd[3]; // mean variant distance, number of variant reads, average read length } bcf_callret1_t; typedef struct { int a[5]; // alleles: ref, alt, alt2, alt3 + float qsum[4]; int n, n_alleles, shift, ori_ref, unseen; - int anno[16], depth, ori_depth; + int n_supp; // number of supporting non-reference reads + unsigned int anno[16], depth, ori_depth; uint8_t *PL; float vdb; // variant distance bias + float read_pos_bias; + struct { float avg, var; int dp; } read_pos; } bcf_call_t; #ifdef __cplusplus @@ -44,8 +54,8 @@ extern "C" { bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); void bcf_call_destroy(bcf_callaux_t *bca); int bcf_call_glfgen(int _n, const bam_pileup1_t 
*pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); - int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call); - int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP, + int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); + int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref); int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, const void *rghash); diff --git a/sam/bam2bcf_indel.c b/sam/bam2bcf_indel.c index 5142b3e..30b3f46 100644 --- a/sam/bam2bcf_indel.c +++ b/sam/bam2bcf_indel.c @@ -109,6 +109,9 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) return max_i - pos; } +/* + * @n: number of samples + */ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, const void *rghash) { @@ -142,32 +145,48 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (s == n) return -1; // there is no indel at this position. 
for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads { // find out how many types of indels are present - int m, n_alt = 0, n_tot = 0; + bca->max_support = bca->max_frac = 0; + int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; uint32_t *aux; aux = calloc(N + 1, 4); m = max_rd_len = 0; aux[m++] = MINUS_CONST; // zero indel is always a type for (s = 0; s < n; ++s) { + int na = 0, nt = 0; for (i = 0; i < n_plp[s]; ++i) { const bam_pileup1_t *p = plp[s] + i; if (rghash == 0 || p->aux == 0) { - ++n_tot; + ++nt; if (p->indel != 0) { - ++n_alt; + ++na; aux[m++] = MINUS_CONST + p->indel; } } j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); if (j > max_rd_len) max_rd_len = j; } + float frac = (float)na/nt; + if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; + n_alt += na; + n_tot += nt; } + // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), + // check the number of N's in the sequence and skip places where half or more reference bases are Ns. + int nN=0; for (i=pos; i-posi ) { free(aux); return -1; } + ks_introsort(uint32_t, m, aux); // squeeze out identical types for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; - if (n_types == 1 || (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support) { // then skip - free(aux); return -1; - } + // Taking totals makes it hard to call rare indels + if ( !bca->per_sample_flt ) + indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 
0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); return -1; + } if (n_types >= 64) { free(aux); if (bam_verbose >= 2) @@ -199,6 +218,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla * reduces the power because sometimes, substitutions caused by * indels are not distinguishable from true mutations. Multiple * sequence realignment helps to increase the power. + * + * Masks mismatches present in at least 70% of the reads with 'N'. */ { // construct per-sample consensus int L = right - left + 1, max_i, max2_i; @@ -242,7 +263,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; if (max_i >= 0) r[max_i] = 15; if (max2_i >= 0) r[max2_i] = 15; -// for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); } free(ref0); free(cns); } @@ -259,9 +280,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } } // construct the consensus sequence - max_ins = types[n_types - 1]; // max_ins is at least 0 + max_ins = types[n_types - 1]; // max_ins is at least 0 if (max_ins > 0) { - int *inscns_aux = calloc(4 * n_types * max_ins, sizeof(int)); + int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int)); // count the number of occurrences of each base at each position for each type of insertion for (t = 0; t < n_types; ++t) { if (types[t] > 0) { @@ -272,7 +293,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla uint8_t *seq = bam1_seq(p->b); for (k = 1; k <= p->indel; ++k) { int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)]; - if (c < 4) ++inscns_aux[(t*max_ins+(k-1))*4 + c]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; } } } @@ -283,11 +305,12 @@ int bcf_call_gap_prep(int n, int 
*n_plp, bam_pileup1_t **plp, int pos, bcf_calla inscns = calloc(n_types * max_ins, 1); for (t = 0; t < n_types; ++t) { for (j = 0; j < types[t]; ++j) { - int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*4]; - for (k = 0; k < 4; ++k) + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) if (ia[k] > max) max = ia[k], max_k = k; inscns[t*max_ins + j] = max? max_k : 4; + if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's } } free(inscns_aux); diff --git a/sam/bam2depth.c b/sam/bam2depth.c index ca36b89..02311ef 100644 --- a/sam/bam2depth.c +++ b/sam/bam2depth.c @@ -13,7 +13,7 @@ typedef struct { // auxiliary data structure bamFile fp; // the file handler bam_iter_t iter; // NULL if a region not specified - int min_mapQ; // mapQ filter + int min_mapQ, min_len; // mapQ filter; length filter } aux_t; void *bed_read(const char *fn); // read a BED or position list file @@ -25,40 +25,65 @@ static int read_bam(void *data, bam1_t *b) // read level filters better go here { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret = aux->iter? 
bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); - if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + if (!(b->core.flag&BAM_FUNMAP)) { + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + else if (aux->min_len && bam_cigar2qlen(&b->core, bam1_cigar(b)) < aux->min_len) b->core.flag |= BAM_FUNMAP; + } return ret; } +int read_file_list(const char *file_list,int *n,char **argv[]); + #ifdef _MAIN_BAM2DEPTH int main(int argc, char *argv[]) #else int main_depth(int argc, char *argv[]) #endif { - int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; + int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure + char *file_list = NULL, **fn = NULL; bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line - while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { + while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { switch (n) { + case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold + case 'f': file_list = optarg; break; } } - if (optind == argc) { - fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] [...]\n"); + if (optind == argc && !file_list) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -b list of positions or regions\n"); + fprintf(stderr, " -f list of input BAM filenames, one per line [null]\n"); + fprintf(stderr, " -l minQLen\n"); + fprintf(stderr, " -q base quality threshold\n"); + 
fprintf(stderr, " -Q mapping quality threshold\n"); + fprintf(stderr, " -r region\n"); + fprintf(stderr, "\n"); return 1; } // initialize the auxiliary data structures - n = argc - optind; // the number of BAMs on the command line + if (file_list) + { + if ( read_file_list(file_list,&nfiles,&fn) ) return 1; + n = nfiles; + argv = fn; + optind = 0; + } + else + n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { @@ -66,6 +91,7 @@ int main_depth(int argc, char *argv[]) data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter + data[i]->min_len = min_len; // set the qlen filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM @@ -108,5 +134,10 @@ int main_depth(int argc, char *argv[]) } free(data); free(reg); if (bed) bed_destroy(bed); + if ( file_list ) + { + for (i=0; i #include +#include "knetfile.h" #include "bgzf.h" #include "bam.h" @@ -97,7 +98,7 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); return -1; } - if (in->open_mode != 'r') return -1; + if (in->is_write) return -1; old = bam_header_read(in); if (h == 0 && i == 0) bam_header_write(fp, old); @@ -109,10 +110,10 @@ int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam j=0; #ifdef _USE_KNETFILE - fp_file=fp->x.fpw; - while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { + fp_file = fp->fp; + while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) { #else - fp_file=fp->file; + fp_file = fp->fp; while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { #endif if(len> BAM_CIGAR_SHIFT; + } + } else { i++; } return cs[i]; } @@ -42,7 +48,14 @@ char 
bam_aux_getCQi(bam1_t *b, int i) cq = bam_aux2Z(c); // adjust for strandedness - if(bam1_strand(b)) i = strlen(cq) - 1 - i; + if(bam1_strand(b)) { + i = strlen(cq) - 1 - i; + // adjust for leading hard clip + uint32_t cigar = bam1_cigar(b)[0]; + if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { + i -= (cigar >> BAM_CIGAR_SHIFT); + } + } return cq[i]; } @@ -98,6 +111,11 @@ char bam_aux_getCEi(bam1_t *b, int i) // adjust for strandedness and leading adaptor if(bam1_strand(b)) { //reverse strand cs_i = strlen(cs) - 1 - i; + // adjust for leading hard clip + uint32_t cigar = bam1_cigar(b)[0]; + if((cigar & BAM_CIGAR_MASK) == BAM_CHARD_CLIP) { + cs_i -= cigar >> BAM_CIGAR_SHIFT; + } // get current color cur_color = cs[cs_i]; // get previous base. Note: must rc adaptor diff --git a/sam/bam_import.c b/sam/bam_import.c index 5518a9c..da2bf94 100644 --- a/sam/bam_import.c +++ b/sam/bam_import.c @@ -183,7 +183,7 @@ static inline void append_text(bam_header_t *header, kstring_t *str) // Sanity check if ( header->l_text+str->l+1 >= header->n_text ) { - fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,header->n_text,x,y); + fprintf(stderr,"append_text FIXME: %ld>=%ld, x=%ld,y=%ld\n", header->l_text+str->l+1,(long)header->n_text,x,y); abort(); } strncpy(header->text + header->l_text, str->s, str->l+1); // we cannot use strcpy() here. 
@@ -291,11 +291,13 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) if (ks_getuntil(ks, KS_SEP_TAB, str, &dret) < 0) return -3; z += str->l + 1; if (str->s[0] != '*') { + uint32_t *cigar; for (s = str->s; *s; ++s) { if ((isalpha(*s)) || (*s=='=')) ++c->n_cigar; else if (!isdigit(*s)) parse_error(fp->n_lines, "invalid CIGAR character"); } b->data = alloc_data(b, doff + c->n_cigar * 4); + cigar = bam1_cigar(b); for (i = 0, s = str->s; i != c->n_cigar; ++i) { x = strtol(s, &t, 10); op = toupper(*t); @@ -308,12 +310,13 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) else if (op == 'P') op = BAM_CPAD; else if (op == '=') op = BAM_CEQUAL; else if (op == 'X') op = BAM_CDIFF; + else if (op == 'B') op = BAM_CBACK; else parse_error(fp->n_lines, "invalid CIGAR operation"); s = t + 1; - bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; + cigar[i] = bam_cigar_gen(x, op); } if (*s) parse_error(fp->n_lines, "unmatched CIGAR operation"); - c->bin = bam_reg2bin(c->pos, bam_calend(c, bam1_cigar(b))); + c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar)); doff += c->n_cigar * 4; } else { if (!(c->flag&BAM_FUNMAP)) { @@ -340,9 +343,9 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) if (strcmp(str->s, "*")) { c->l_qseq = strlen(str->s); if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { - fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", - (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); - parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); + fprintf(stderr, "Line %ld, sequence length %i vs %i from CIGAR\n", + (long)fp->n_lines, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); + parse_error(fp->n_lines, "CIGAR and sequence length are inconsistent"); } p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; memset(p, 0, (c->l_qseq+1)/2); @@ -459,6 +462,7 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) } b->l_aux = doff - doff0; 
b->data_len = doff; + if (bam_no_B) bam_remove_B(b); return z; } diff --git a/sam/bam_index.c b/sam/bam_index.c index 9610a26..f916e04 100644 --- a/sam/bam_index.c +++ b/sam/bam_index.c @@ -159,9 +159,14 @@ bam_index_t *bam_index_core(bamFile fp) bam1_core_t *c; uint64_t save_off, last_off, n_mapped, n_unmapped, off_beg, off_end, n_no_coor; + h = bam_header_read(fp); + if(h == 0) { + fprintf(stderr, "[bam_index_core] Invalid BAM header."); + return NULL; + } + idx = (bam_index_t*)calloc(1, sizeof(bam_index_t)); b = (bam1_t*)calloc(1, sizeof(bam1_t)); - h = bam_header_read(fp); c = &b->core; idx->n = h->n_targets; @@ -459,6 +464,7 @@ bam_index_t *bam_index_load(const char *fn) strcat(strcpy(fnidx, fn), ".bai"); fprintf(stderr, "[bam_index_load] attempting to download the remote index file.\n"); download_from_remote(fnidx); + free(fnidx); idx = bam_index_load_local(fn); } if (idx == 0) fprintf(stderr, "[bam_index_load] fail to load BAM index.\n"); @@ -489,6 +495,7 @@ int bam_index_build2(const char *fn, const char *_fnidx) if (fpidx == 0) { fprintf(stderr, "[bam_index_build2] fail to create the index file.\n"); free(fnidx); + bam_index_destroy(idx); return -1; } bam_index_save(idx, fpidx); diff --git a/sam/bam_mate.c b/sam/bam_mate.c index 61f808a..b947c9d 100644 --- a/sam/bam_mate.c +++ b/sam/bam_mate.c @@ -1,14 +1,44 @@ #include #include +#include +#include "kstring.h" #include "bam.h" +void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str) +{ + bam1_t *swap; + int i, end; + uint32_t *cigar; + str->l = 0; + if (b1->core.tid != b2->core.tid || b1->core.tid < 0) return; // coordinateless or not on the same chr; skip + if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate + kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index + kputc((b1->core.flag & BAM_FREVERSE)? 
'R' : 'F', str); // strand + for (i = 0, cigar = bam1_cigar(b1); i < b1->core.n_cigar; ++i) { + kputw(bam_cigar_oplen(cigar[i]), str); + kputc(bam_cigar_opchr(cigar[i]), str); + } + end = bam_calend(&b1->core, cigar); + kputw(b2->core.pos - end, str); + kputc('T', str); + kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index + kputc((b2->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand + for (i = 0, cigar = bam1_cigar(b2); i < b2->core.n_cigar; ++i) { + kputw(bam_cigar_oplen(cigar[i]), str); + kputc(bam_cigar_opchr(cigar[i]), str); + } + bam_aux_append(b1, "CT", 'Z', str->l+1, (uint8_t*)str->s); +} + // currently, this function ONLY works if each read has one hit -void bam_mating_core(bamFile in, bamFile out) +void bam_mating_core(bamFile in, bamFile out, int remove_reads) { bam_header_t *header; bam1_t *b[2]; - int curr, has_prev; + int curr, has_prev, pre_end = 0, cur_end; + kstring_t str; + str.l = str.m = 0; str.s = 0; header = bam_header_read(in); bam_header_write(out, header); @@ -17,16 +47,28 @@ void bam_mating_core(bamFile in, bamFile out) curr = 0; has_prev = 0; while (bam_read1(in, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; + if (cur->core.tid < 0) + { + if ( !remove_reads ) bam_write1(out, cur); + continue; + } + cur_end = bam_calend(&cur->core, bam1_cigar(cur)); + if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; + if (cur->core.flag & BAM_FSECONDARY) + { + if ( !remove_reads ) bam_write1(out, cur); + continue; // skip secondary alignments + } if (has_prev) { if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) - && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) + && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE { uint32_t cur5, pre5; - cur5 = 
(cur->core.flag&BAM_FREVERSE)? bam_calend(&cur->core, bam1_cigar(cur)) : cur->core.pos; - pre5 = (pre->core.flag&BAM_FREVERSE)? bam_calend(&pre->core, bam1_cigar(pre)) : pre->core.pos; + cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; + pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; @@ -35,6 +77,7 @@ void bam_mating_core(bamFile in, bamFile out) else pre->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } + bam_template_cigar(pre, cur, &str); bam_write1(out, pre); bam_write1(out, cur); has_prev = 0; @@ -48,23 +91,38 @@ void bam_mating_core(bamFile in, bamFile out) } } else has_prev = 1; curr = 1 - curr; + pre_end = cur_end; } if (has_prev) bam_write1(out, b[1-curr]); bam_header_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); + free(str.s); +} + +void usage() +{ + fprintf(stderr,"Usage: samtools fixmate \n"); + fprintf(stderr,"Options:\n"); + fprintf(stderr," -r remove unmapped reads and secondary alignments\n"); + exit(1); } int bam_mating(int argc, char *argv[]) { bamFile in, out; - if (argc < 3) { - fprintf(stderr, "samtools fixmate \n"); - return 1; - } - in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); - out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); - bam_mating_core(in, out); + int c, remove_reads=0; + while ((c = getopt(argc, argv, "r")) >= 0) { + switch (c) { + case 'r': remove_reads=1; break; + } + } + if (optind+1 >= argc) usage(); + in = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r"); + out = (strcmp(argv[optind+1], "-") == 0)? 
bam_dopen(fileno(stdout), "w") : bam_open(argv[optind+1], "w"); + bam_mating_core(in, out, remove_reads); bam_close(in); bam_close(out); return 0; } + + diff --git a/sam/bam_md.c b/sam/bam_md.c index d42aa8f..ce40a12 100644 --- a/sam/bam_md.c +++ b/sam/bam_md.c @@ -188,7 +188,7 @@ int bam_cap_mapQ(bam1_t *b, char *ref, int thres) int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) { - int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1; + int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4; uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; kpa_par_t conf = kpa_par_def; @@ -197,6 +197,11 @@ int bam_prob_realn_core(bam1_t *b, const char *ref, int flag) // test if BQ or ZQ is present if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; + if (bq && redo_baq) + { + bam_aux_del(b, bq-1); + bq = 0; + } if (bq && zq) { // remove the ZQ tag bam_aux_del(b, zq-1); zq = 0; diff --git a/sam/bam_plcmd.c b/sam/bam_plcmd.c index cbf6ae8..54a4597 100644 --- a/sam/bam_plcmd.c +++ b/sam/bam_plcmd.c @@ -4,9 +4,12 @@ #include #include #include +#include +#include #include "sam.h" #include "faidx.h" #include "kstring.h" +#include "sam_header.h" static inline int printw(int c, FILE *fp) { @@ -66,24 +69,24 @@ static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, cons #define MPLP_NO_COMP 0x20 #define MPLP_NO_ORPHAN 0x40 #define MPLP_REALN 0x80 -#define MPLP_FMT_DP 0x100 -#define MPLP_FMT_SP 0x200 #define MPLP_NO_INDEL 0x400 -#define MPLP_EXT_BAQ 0x800 +#define MPLP_REDO_BAQ 0x800 #define MPLP_ILLUMINA13 0x1000 #define MPLP_IGNORE_RG 0x2000 #define MPLP_PRINT_POS 0x4000 #define MPLP_PRINT_MAPQ 0x8000 +#define MPLP_PER_SAMPLE 0x10000 void *bed_read(const char *fn); void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); typedef struct { - int max_mq, min_mq, flag, min_baseQ, capQ_thres, 
max_depth, max_indel_depth; + int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int rflag_require, rflag_filter; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels - char *reg, *pl_list; + char *reg, *pl_list, *fai_fname; faidx_t *fai; void *bed, *rghash; } mplp_conf_t; @@ -118,6 +121,8 @@ static int mplp_func(void *data, bam1_t *b) skip = 1; continue; } + if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } + if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } if (ma->conf->bed) { // test overlap skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); if (skip) continue; @@ -135,7 +140,7 @@ static int mplp_func(void *data, bam1_t *b) } has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0; skip = 0; - if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_EXT_BAQ)? 3 : 1); + if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); if (has_ref && ma->conf->capQ_thres > 10) { int q = bam_cap_mapQ(b, ma->ref, ma->conf->capQ_thres); if (q < 0) skip = 1; @@ -209,8 +214,17 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); + if ( !data[i]->fp ) + { + fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); + exit(1); + } data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); + if ( !h_tmp ) { + fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); + exit(1); + } data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); @@ -219,11 +233,11 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_index_t *idx; idx = bam_index_load(fn[i]); if (idx == 0) { - fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); + fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(1); } if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { - fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); + fprintf(stderr, "[%s] malformatted region or wrong seqname for %s\n", __func__, fn[i]); exit(1); } if (i == 0) tid0 = tid, beg0 = beg, end0 = end; @@ -262,9 +276,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); - bh->txt = malloc(strlen(BAM_VERSION) + 64); - bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); - free(s.s); + s.l = 0; + ksprintf(&s, "##samtoolsVersion=%s\n", BAM_VERSION); + if (conf->fai_fname) ksprintf(&s, "##reference=file://%s\n", conf->fai_fname); + h->dict = sam_header_parse2(h->text); + int nseq; + const char *tags[] = {"SN","LN","UR","M5",NULL}; + char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &nseq); + for (i=0; i\n", &s); + } + if (tbl) free(tbl); + bh->txt = s.s; + bh->l_txt = 1 + s.l; bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); @@ -273,6 +302,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; + bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); @@ -307,19 +337,17 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) ref16 = bam_nt16_table[_ref0]; for (i = 0; i < 
gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); - bcf_call_combine(gplp.n, bcr, ref16, &bc); - bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, - (conf->flag&MPLP_FMT_SP), 0, 0); + bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); + bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); // call indels if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { + if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); - bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, - (conf->flag&MPLP_FMT_SP), bca, ref); + bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref); bcf_write(bp, bh, b); bcf_destroy(b); } @@ -327,20 +355,29 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { - int j; - printf("\t%d\t", n_plp[i]); + int j, cnt; + for (j = cnt = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt; + } + printf("\t%d\t", cnt); if (n_plp[i] == 0) { printf("*\t*"); // FIXME: printf() is very slow... 
if (conf->flag & MPLP_PRINT_POS) printf("\t*"); } else { - for (j = 0; j < n_plp[i]; ++j) - pileup_seq(plp[i] + j, pos, ref_len, ref); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + if (bam1_qual(p->b)[p->qpos] >= conf->min_baseQ) + pileup_seq(plp[i] + j, pos, ref_len, ref); + } putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; - int c = bam1_qual(p->b)[p->qpos] + 33; - if (c > 126) c = 126; - putchar(c); + int c = bam1_qual(p->b)[p->qpos]; + if (c >= conf->min_baseQ) { + c = c + 33 < 126? c + 33 : 126; + putchar(c); + } } if (conf->flag & MPLP_PRINT_MAPQ) { putchar('\t'); @@ -381,11 +418,15 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } #define MAX_PATH_LEN 1024 -static int read_file_list(const char *file_list,int *n,char **argv[]) +int read_file_list(const char *file_list,int *n,char **argv[]) { char buf[MAX_PATH_LEN]; - int len, nfiles; - char **files; + int len, nfiles = 0; + char **files = NULL; + struct stat sb; + + *n = 0; + *argv = NULL; FILE *fh = fopen(file_list,"r"); if ( !fh ) @@ -394,28 +435,33 @@ static int read_file_list(const char *file_list,int *n,char **argv[]) return 1; } - // Speed is not an issue here, determine the number of files by reading the file twice - nfiles = 0; - while ( fgets(buf,MAX_PATH_LEN,fh) ) nfiles++; - - if ( fseek(fh, 0L, SEEK_SET) ) - { - fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); - return 1; - } - files = calloc(nfiles,sizeof(char*)); nfiles = 0; while ( fgets(buf,MAX_PATH_LEN,fh) ) { + // allow empty lines and trailing spaces len = strlen(buf); while ( len>0 && isspace(buf[len-1]) ) len--; if ( !len ) continue; - files[nfiles] = malloc(sizeof(char)*(len+1)); - strncpy(files[nfiles],buf,len); - files[nfiles][len] = 0; + // check sanity of the file list + buf[len] = 0; + if (stat(buf, &sb) != 0) + { + // no such file, check if it is safe to print its name + int i, safe_to_print = 1; + for (i=0; i= 0) { + static struct option lopts[] = 
+ { + {"rf",1,0,1}, // require flag + {"ff",1,0,2}, // filter flag + {0,0,0,0} + }; + while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:",lopts,NULL)) >= 0) { switch (c) { + case 1 : mplp.rflag_require = strtol(optarg,0,0); break; + case 2 : mplp.rflag_filter = strtol(optarg,0,0); break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; + mplp.fai_fname = optarg; break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; + case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; case 'B': mplp.flag &= ~MPLP_REALN; break; - case 'D': mplp.flag |= MPLP_FMT_DP; break; - case 'S': mplp.flag |= MPLP_FMT_SP; break; + case 'D': mplp.fmt_flag |= B2B_FMT_DP; break; + case 'S': mplp.fmt_flag |= B2B_FMT_SP; break; + case 'V': mplp.fmt_flag |= B2B_FMT_DV; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; - case 'E': mplp.flag |= MPLP_EXT_BAQ; break; + case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; @@ -500,10 +556,10 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); fprintf(stderr, " -A count anomalous read pairs\n"); fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -b FILE list of input BAM files [null]\n"); + fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n"); fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); - fprintf(stderr, " -E extended BAQ for higher 
sensitivity but lower specificity\n"); + fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n"); fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); @@ -512,6 +568,8 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -R ignore RG tags\n"); fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); + fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n"); + fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n"); fprintf(stderr, "\nOutput options:\n\n"); fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); @@ -527,11 +585,13 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n"); fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; } + bam_no_B = 1; if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mpileup(&mplp,nfiles,fn); diff --git a/sam/bam_reheader.c b/sam/bam_reheader.c index 0b52267..6619428 100644 --- a/sam/bam_reheader.c +++ b/sam/bam_reheader.c @@ -1,5 +1,6 @@ #include #include +#include "knetfile.h" #include "bgzf.h" #include "bam.h" @@ -11,7 +12,7 @@ int bam_reheader(BGZF *in, const 
bam_header_t *h, int fd) bam_header_t *old; int len; uint8_t *buf; - if (in->open_mode != 'r') return -1; + if (in->is_write) return -1; buf = malloc(BUF_SIZE); old = bam_header_read(in); fp = bgzf_fdopen(fd, "w"); @@ -21,8 +22,8 @@ int bam_reheader(BGZF *in, const bam_header_t *h, int fd) bgzf_flush(fp); } #ifdef _USE_KNETFILE - while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) - fwrite(buf, 1, len, fp->x.fpw); + while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) + fwrite(buf, 1, len, fp->fp); #else while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) fwrite(buf, 1, len, fp->file); diff --git a/sam/bam_sort.c b/sam/bam_sort.c index abf8d4f..c46bce3 100644 --- a/sam/bam_sort.c +++ b/sam/bam_sort.c @@ -10,24 +10,28 @@ static int g_is_by_qname = 0; -static inline int strnum_cmp(const char *a, const char *b) +static int strnum_cmp(const char *_a, const char *_b) { - char *pa, *pb; - pa = (char*)a; pb = (char*)b; + const unsigned char *a = (const unsigned char*)_a, *b = (const unsigned char*)_b; + const unsigned char *pa = a, *pb = b; while (*pa && *pb) { if (isdigit(*pa) && isdigit(*pb)) { - long ai, bi; - ai = strtol(pa, &pa, 10); - bi = strtol(pb, &pb, 10); - if (ai != bi) return aibi? 1 : 0; + while (*pa == '0') ++pa; + while (*pb == '0') ++pb; + while (isdigit(*pa) && isdigit(*pb) && *pa == *pb) ++pa, ++pb; + if (isdigit(*pa) && isdigit(*pb)) { + int i = 0; + while (isdigit(pa[i]) && isdigit(pb[i])) ++i; + return isdigit(pa[i])? 1 : isdigit(pb[i])? -1 : (int)*pa - (int)*pb; + } else if (isdigit(*pa)) return 1; + else if (isdigit(*pb)) return -1; + else if (pa - a != pb - b) return pa - a < pb - b? 1 : -1; } else { - if (*pa != *pb) break; + if (*pa != *pb) return (int)*pa - (int)*pb; ++pa; ++pb; } } - if (*pa == *pb) - return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0; - return *pa<*pb? -1 : *pa>*pb? 1 : 0; + return *pa? 1 : *pb? 
-1 : 0; } #define HEAP_EMPTY 0xffffffffffffffffull @@ -46,7 +50,7 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) int t; if (a.b == 0 || b.b == 0) return a.b == 0? 1 : 0; t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); - return (t > 0 || (t == 0 && __pos_cmp(a, b))); + return (t > 0 || (t == 0 && (a.b->core.flag&0xc0) > (b.b->core.flag&0xc0))); } else return __pos_cmp(a, b); } @@ -85,8 +89,7 @@ static void swap_header_text(bam_header_t *h1, bam_header_t *h2) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ -int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, - int flag, const char *reg) +int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads, int level) { bamFile fpout, *fp; heap1_t *heap; @@ -94,7 +97,7 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch bam_header_t *hheaders = NULL; int i, j, *RG_len = 0; uint64_t idx = 0; - char **RG = 0; + char **RG = 0, mode[8]; bam_iter_t *iter = 0; if (headers) { @@ -209,15 +212,17 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch } else h->pos = HEAP_EMPTY; } - if (flag & MERGE_UNCOMP) fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); - else if (flag & MERGE_LEVEL1) fpout = strcmp(out, "-")? bam_open(out, "w1") : bam_dopen(fileno(stdout), "w1"); - else fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); - if (fpout == 0) { + if (flag & MERGE_UNCOMP) level = 0; + else if (flag & MERGE_LEVEL1) level = 1; + strcpy(mode, "w"); + if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9); + if ((fpout = strcmp(out, "-")? 
bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) { fprintf(stderr, "[%s] fail to create the output file.\n", __func__); return -1; } bam_header_write(fpout, hout); bam_header_destroy(hout); + if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256); ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { @@ -252,12 +257,17 @@ int bam_merge_core(int by_qname, const char *out, const char *headers, int n, ch return 0; } +int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg) +{ + return bam_merge_core2(by_qname, out, headers, n, fn, flag, reg, 0, -1); +} + int bam_merge(int argc, char *argv[]) { - int c, is_by_qname = 0, flag = 0, ret = 0; + int c, is_by_qname = 0, flag = 0, ret = 0, n_threads = 0, level = -1; char *fn_headers = NULL, *reg = 0; - while ((c = getopt(argc, argv, "h:nru1R:f")) >= 0) { + while ((c = getopt(argc, argv, "h:nru1R:f@:l:")) >= 0) { switch (c) { case 'r': flag |= MERGE_RG; break; case 'f': flag |= MERGE_FORCE; break; @@ -266,6 +276,8 @@ int bam_merge(int argc, char *argv[]) case '1': flag |= MERGE_LEVEL1; break; case 'u': flag |= MERGE_UNCOMP; break; case 'R': reg = strdup(optarg); break; + case 'l': level = atoi(optarg); break; + case '@': n_threads = atoi(optarg); break; } } if (optind + 2 >= argc) { @@ -276,6 +288,8 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, " -u uncompressed BAM output\n"); fprintf(stderr, " -f overwrite the output BAM if exist\n"); fprintf(stderr, " -1 compress level 1\n"); + fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n"); + fprintf(stderr, " -@ INT number of BAM compression threads [0]\n"); fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. 
Users\n"); @@ -291,51 +305,126 @@ int bam_merge(int argc, char *argv[]) return 1; } } - if (bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg) < 0) ret = 1; + if (bam_merge_core2(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg, n_threads, level) < 0) ret = 1; free(reg); free(fn_headers); return ret; } +/*************** + * BAM sorting * + ***************/ + +#include + typedef bam1_t *bam1_p; +static int change_SO(bam_header_t *h, const char *so) +{ + char *p, *q, *beg = 0, *end = 0, *newtext; + if (h->l_text > 3) { + if (strncmp(h->text, "@HD", 3) == 0) { + if ((p = strchr(h->text, '\n')) == 0) return -1; + *p = '\0'; + if ((q = strstr(h->text, "\tSO:")) != 0) { + *p = '\n'; // change back + if (strncmp(q + 4, so, p - q - 4) != 0) { + beg = q; + for (q += 4; *q != '\n' && *q != '\t'; ++q); + end = q; + } else return 0; // no need to change + } else beg = end = p, *p = '\n'; + } + } + if (beg == 0) { // no @HD + h->l_text += strlen(so) + 15; + newtext = malloc(h->l_text + 1); + sprintf(newtext, "@HD\tVN:1.3\tSO:%s\n", so); + strcat(newtext, h->text); + } else { // has @HD but different or no SO + h->l_text = (beg - h->text) + (4 + strlen(so)) + (h->text + h->l_text - end); + newtext = malloc(h->l_text + 1); + strncpy(newtext, h->text, beg - h->text); + sprintf(newtext + (beg - h->text), "\tSO:%s", so); + strcat(newtext, end); + } + free(h->text); + h->text = newtext; + return 0; +} + static inline int bam1_lt(const bam1_p a, const bam1_p b) { if (g_is_by_qname) { int t = strnum_cmp(bam1_qname(a), bam1_qname(b)); - return (t < 0 || (t == 0 && (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))))); - } else return (((uint64_t)a->core.tid<<32|(a->core.pos+1)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1))); + return (t < 0 || (t == 0 && (a->core.flag&0xc0) < (b->core.flag&0xc0))); + } else return 
(((uint64_t)a->core.tid<<32|(a->core.pos+1)<<1|bam1_strand(a)) < ((uint64_t)b->core.tid<<32|(b->core.pos+1)<<1|bam1_strand(b))); } KSORT_INIT(sort, bam1_p, bam1_lt) -static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam_header_t *h, int is_stdout) +typedef struct { + size_t buf_len; + const char *prefix; + bam1_p *buf; + const bam_header_t *h; + int index; +} worker_t; + +static void write_buffer(const char *fn, const char *mode, size_t l, bam1_p *buf, const bam_header_t *h, int n_threads) { - char *name, mode[3]; - int i; + size_t i; bamFile fp; - ks_mergesort(sort, k, buf, 0); - name = (char*)calloc(strlen(prefix) + 20, 1); - if (n >= 0) { - sprintf(name, "%s.%.4d.bam", prefix, n); - strcpy(mode, "w1"); - } else { - sprintf(name, "%s.bam", prefix); - strcpy(mode, "w"); - } - fp = is_stdout? bam_dopen(fileno(stdout), mode) : bam_open(name, mode); - if (fp == 0) { - fprintf(stderr, "[sort_blocks] fail to create file %s.\n", name); - free(name); - // FIXME: possible memory leak - return; - } - free(name); + fp = strcmp(fn, "-")? 
bam_open(fn, mode) : bam_dopen(fileno(stdout), mode); + if (fp == 0) return; bam_header_write(fp, h); - for (i = 0; i < k; ++i) + if (n_threads > 1) bgzf_mt(fp, n_threads, 256); + for (i = 0; i < l; ++i) bam_write1_core(fp, &buf[i]->core, buf[i]->data_len, buf[i]->data); bam_close(fp); } +static void *worker(void *data) +{ + worker_t *w = (worker_t*)data; + char *name; + ks_mergesort(sort, w->buf_len, w->buf, 0); + name = (char*)calloc(strlen(w->prefix) + 20, 1); + sprintf(name, "%s.%.4d.bam", w->prefix, w->index); + write_buffer(name, "w1", w->buf_len, w->buf, w->h, 0); + free(name); + return 0; +} + +static int sort_blocks(int n_files, size_t k, bam1_p *buf, const char *prefix, const bam_header_t *h, int n_threads) +{ + int i; + size_t rest; + bam1_p *b; + pthread_t *tid; + pthread_attr_t attr; + worker_t *w; + + if (n_threads < 1) n_threads = 1; + if (k < n_threads * 64) n_threads = 1; // use a single thread if we only sort a small batch of records + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + w = calloc(n_threads, sizeof(worker_t)); + tid = calloc(n_threads, sizeof(pthread_t)); + b = buf; rest = k; + for (i = 0; i < n_threads; ++i) { + w[i].buf_len = rest / (n_threads - i); + w[i].buf = b; + w[i].prefix = prefix; + w[i].h = h; + w[i].index = n_files + i; + b += w[i].buf_len; rest -= w[i].buf_len; + pthread_create(&tid[i], &attr, worker, &w[i]); + } + for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); + free(tid); free(w); + return n_files + n_threads; +} + /*! @abstract Sort an unsorted BAM file based on the chromosome order and the leftmost position of an alignment @@ -345,68 +434,94 @@ static void sort_blocks(int n, int k, bam1_p *buf, const char *prefix, const bam @param prefix prefix of the output and the temporary files; upon sucessess, prefix.bam will be written. 
@param max_mem approxiate maximum memory (very inaccurate) + @param full_path the given output path is the full path and not just the prefix @discussion It may create multiple temporary subalignment files and then merge them by calling bam_merge_core(). This function is NOT thread safe. */ -void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout) +void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int full_path) { - int n, ret, k, i; - size_t mem; + int ret, i, n_files = 0; + size_t mem, max_k, k, max_mem; bam_header_t *header; bamFile fp; bam1_t *b, **buf; + char *fnout = 0; + char const *suffix = ".bam"; + if (full_path) suffix += 4; + if (n_threads < 2) n_threads = 1; g_is_by_qname = is_by_qname; - n = k = 0; mem = 0; + max_k = k = 0; mem = 0; + max_mem = _max_mem * n_threads; + buf = 0; fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn); return; } header = bam_header_read(fp); - buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*)); + if (is_by_qname) change_SO(header, "queryname"); + else change_SO(header, "coordinate"); // write sub files for (;;) { + if (k == max_k) { + size_t old_max = max_k; + max_k = max_k? 
max_k<<1 : 0x10000; + buf = realloc(buf, max_k * sizeof(void*)); + memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max)); + } if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t)); b = buf[k]; if ((ret = bam_read1(fp, b)) < 0) break; - mem += ret; + if (b->data_len < b->m_data>>2) { // shrink + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = realloc(b->data, b->m_data); + } + mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays ++k; if (mem >= max_mem) { - sort_blocks(n++, k, buf, prefix, header, 0); - mem = 0; k = 0; + n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + mem = k = 0; } } if (ret != -1) fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n"); - if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout); - else { // then merge - char **fns, *fnout; - fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1); - sort_blocks(n++, k, buf, prefix, header, 0); - fnout = (char*)calloc(strlen(prefix) + 20, 1); - if (is_stdout) sprintf(fnout, "-"); - else sprintf(fnout, "%s.bam", prefix); - fns = (char**)calloc(n, sizeof(char*)); - for (i = 0; i < n; ++i) { + // output file name + fnout = calloc(strlen(prefix) + 20, 1); + if (is_stdout) sprintf(fnout, "-"); + else sprintf(fnout, "%s%s", prefix, suffix); + // write the final output + if (n_files == 0) { // a single block + char mode[8]; + strcpy(mode, "w"); + if (level >= 0) sprintf(mode + 1, "%d", level < 9? 
level : 9); + ks_mergesort(sort, k, buf, 0); + write_buffer(fnout, mode, k, buf, header, n_threads); + } else { // then merge + char **fns; + n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads); + fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files); + fns = (char**)calloc(n_files, sizeof(char*)); + for (i = 0; i < n_files; ++i) { fns[i] = (char*)calloc(strlen(prefix) + 20, 1); - sprintf(fns[i], "%s.%.4d.bam", prefix, i); + sprintf(fns[i], "%s.%.4d%s", prefix, i, suffix); } - bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0); - free(fnout); - for (i = 0; i < n; ++i) { + bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level); + for (i = 0; i < n_files; ++i) { unlink(fns[i]); free(fns[i]); } free(fns); } - for (k = 0; k < max_mem / BAM_CORE_SIZE; ++k) { - if (buf[k]) { - free(buf[k]->data); - free(buf[k]); - } + free(fnout); + // free + for (k = 0; k < max_k; ++k) { + if (!buf[k]) continue; + free(buf[k]->data); + free(buf[k]); } free(buf); bam_header_destroy(header); @@ -415,24 +530,42 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size void bam_sort_core(int is_by_qname, const char *fn, const char *prefix, size_t max_mem) { - bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0); + bam_sort_core_ext(is_by_qname, fn, prefix, max_mem, 0, 0, -1, 0); } int bam_sort(int argc, char *argv[]) { - size_t max_mem = 500000000; - int c, is_by_qname = 0, is_stdout = 0; - while ((c = getopt(argc, argv, "nom:")) >= 0) { + size_t max_mem = 768<<20; // 512MB + int c, is_by_qname = 0, is_stdout = 0, n_threads = 0, level = -1, full_path = 0; + while ((c = getopt(argc, argv, "fnom:@:l:")) >= 0) { switch (c) { + case 'f': full_path = 1; break; case 'o': is_stdout = 1; break; case 'n': is_by_qname = 1; break; - case 'm': max_mem = atol(optarg); break; + case 'm': { + char *q; + max_mem = strtol(optarg, &q, 0); + if (*q == 'k' || *q == 'K') max_mem <<= 10; + else if (*q == 'm' || *q == 'M') max_mem 
<<= 20; + else if (*q == 'g' || *q == 'G') max_mem <<= 30; + break; + } + case '@': n_threads = atoi(optarg); break; + case 'l': level = atoi(optarg); break; } } if (optind + 2 > argc) { - fprintf(stderr, "Usage: samtools sort [-on] [-m ] \n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools sort [options] \n\n"); + fprintf(stderr, "Options: -n sort by read name\n"); + fprintf(stderr, " -f use as full file name instead of prefix\n"); + fprintf(stderr, " -o final output to stdout\n"); + fprintf(stderr, " -l INT compression level, from 0 to 9 [-1]\n"); + fprintf(stderr, " -@ INT number of sorting and compression threads [1]\n"); + fprintf(stderr, " -m INT max memory per thread; suffix K/M/G recognized [768M]\n"); + fprintf(stderr, "\n"); return 1; } - bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout); + bam_sort_core_ext(is_by_qname, argv[optind], argv[optind+1], max_mem, is_stdout, n_threads, level, full_path); return 0; } diff --git a/sam/bam_tview.c b/sam/bam_tview.c index 4eea955..06d5e33 100644 --- a/sam/bam_tview.c +++ b/sam/bam_tview.c @@ -1,61 +1,81 @@ -#undef _HAVE_CURSES - -#if _CURSES_LIB == 0 -#elif _CURSES_LIB == 1 -#include -#ifndef NCURSES_VERSION -#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled" -#else -#define _HAVE_CURSES -#endif -#elif _CURSES_LIB == 2 -#include -#define _HAVE_CURSES -#else -#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled" -#endif - -#ifdef _HAVE_CURSES -#include #include -#include -#include -#include "bam.h" -#include "faidx.h" -#include "bam2bcf.h" - -char bam_aux_getCEi(bam1_t *b, int i); -char bam_aux_getCSi(bam1_t *b, int i); -char bam_aux_getCQi(bam1_t *b, int i); +#include "bam_tview.h" -#define TV_MIN_ALNROW 2 -#define TV_MAX_GOTO 40 -#define TV_LOW_MAPQ 10 +int base_tv_init(tview_t* tv,const char *fn, const char *fn_fa, const char *samples) + { + assert(tv!=NULL); + assert(fn!=NULL); + tv->mrow = 24; tv->mcol = 80; + tv->color_for = 
TV_COLOR_MAPQ; + tv->is_dot = 1; + + tv->fp = bam_open(fn, "r"); + if(tv->fp==0) + { + fprintf(stderr,"bam_open %s. %s\n", fn,fn_fa); + exit(EXIT_FAILURE); + } + bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); + assert(tv->fp); + + tv->header = bam_header_read(tv->fp); + if(tv->header==0) + { + fprintf(stderr,"Cannot read '%s'.\n", fn); + exit(EXIT_FAILURE); + } + tv->idx = bam_index_load(fn); + if (tv->idx == 0) + { + fprintf(stderr,"Cannot read index for '%s'.\n", fn); + exit(EXIT_FAILURE); + } + tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); + if (fn_fa) tv->fai = fai_load(fn_fa); + tv->bca = bcf_call_init(0.83, 13); + tv->ins = 1; -#define TV_COLOR_MAPQ 0 -#define TV_COLOR_BASEQ 1 -#define TV_COLOR_NUCL 2 -#define TV_COLOR_COL 3 -#define TV_COLOR_COLQ 4 + if ( samples ) + { + if ( !tv->header->dict ) tv->header->dict = sam_header_parse2(tv->header->text); + void *iter = tv->header->dict; + const char *key, *val; + int n = 0; + tv->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(samples,key) || (val && !strcmp(samples,val)) ) + { + khiter_t k = kh_get(kh_rg, tv->rg_hash, key); + if ( k != kh_end(tv->rg_hash) ) continue; + int ret; + k = kh_put(kh_rg, tv->rg_hash, key, &ret); + kh_value(tv->rg_hash, k) = val; + n++; + } + } + if ( !n ) + { + fprintf(stderr,"The sample or read group \"%s\" not present.\n", samples); + exit(EXIT_FAILURE); + } + } -#define TV_BASE_NUCL 0 -#define TV_BASE_COLOR_SPACE 1 + return 0; + } -typedef struct { - int mrow, mcol; - WINDOW *wgoto, *whelp; - bam_index_t *idx; - bam_lplbuf_t *lplbuf; - bam_header_t *header; - bamFile fp; - int curr_tid, left_pos; - faidx_t *fai; - bcf_callaux_t *bca; +void base_tv_destroy(tview_t* tv) + { + bam_lplbuf_destroy(tv->lplbuf); + bcf_call_destroy(tv->bca); + bam_index_destroy(tv->idx); + if (tv->fai) fai_destroy(tv->fai); + free(tv->ref); + bam_header_destroy(tv->header); + bam_close(tv->fp); + } - int ccol, last_pos, row_shift, 
base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; - char *ref; -} tview_t; int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) { @@ -67,11 +87,11 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void // print referece rb = (tv->ref && pos - tv->left_pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'; for (i = tv->last_pos + 1; i < pos; ++i) { - if (i%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", i+1); + if (i%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", i+1); c = tv->ref? tv->ref[i - tv->left_pos] : 'N'; - mvaddch(1, tv->ccol++, c); + tv->my_mvaddch(tv,1, tv->ccol++, c); } - if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1); + if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1); { // call consensus bcf_callret1_t bcr; int qsum[4], a1, a2, tmp; @@ -89,15 +109,15 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void else if (p[2] < p[1] && p[2] < p[0]) call = (1<my_underline(tv); c = ",ACMGRSVTWYHKDBN"[call>>16&0xf]; i = (call&0xffff)/10+1; if (i > 4) i = 4; - attr |= COLOR_PAIR(i); + attr |= tv->my_colorpair(tv,i); if (c == toupper(rb)) c = '.'; - attron(attr); - mvaddch(2, tv->ccol, c); - attroff(attr); + tv->my_attron(tv,attr); + tv->my_mvaddch(tv,2, tv->ccol, c); + tv->my_attroff(tv,attr); if(tv->ins) { // calculate maximum insert for (i = 0; i < n; ++i) { @@ -114,7 +134,6 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void if (!p->is_del) { if (tv->base_for == TV_BASE_COLOR_SPACE && (c = bam_aux_getCSi(p->b, p->qpos))) { - c = bam_aux_getCSi(p->b, p->qpos); // assume that if we found one color, we will be able to get the color error if (tv->is_dot && '-' == bam_aux_getCEi(p->b, p->qpos)) c = bam1_strand(p->b)? 
',' : '.'; } else { @@ -148,18 +167,18 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void int x; attr = 0; if (((p->b->core.flag&BAM_FPAIRED) && !(p->b->core.flag&BAM_FPROPER_PAIR)) - || (p->b->core.flag & BAM_FSECONDARY)) attr |= A_UNDERLINE; + || (p->b->core.flag & BAM_FSECONDARY)) attr |= tv->my_underline(tv); if (tv->color_for == TV_COLOR_BASEQ) { x = bam1_qual(p->b)[p->qpos]/10 + 1; if (x > 4) x = 4; - attr |= COLOR_PAIR(x); + attr |= tv->my_colorpair(tv,x); } else if (tv->color_for == TV_COLOR_MAPQ) { x = p->b->core.qual/10 + 1; if (x > 4) x = 4; - attr |= COLOR_PAIR(x); + attr |= tv->my_colorpair(tv,x); } else if (tv->color_for == TV_COLOR_NUCL) { x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)] + 5; - attr |= COLOR_PAIR(x); + attr |= tv->my_colorpair(tv,x); } else if(tv->color_for == TV_COLOR_COL) { x = 0; switch(bam_aux_getCSi(p->b, p->qpos)) { @@ -171,87 +190,44 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void default: x = bam_nt16_nt4_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; break; } x+=5; - attr |= COLOR_PAIR(x); + attr |= tv->my_colorpair(tv,x); } else if(tv->color_for == TV_COLOR_COLQ) { x = bam_aux_getCQi(p->b, p->qpos); if(0 == x) x = bam1_qual(p->b)[p->qpos]; x = x/10 + 1; if (x > 4) x = 4; - attr |= COLOR_PAIR(x); + attr |= tv->my_colorpair(tv,x); } - attron(attr); - mvaddch(row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); - attroff(attr); + tv->my_attron(tv,attr); + tv->my_mvaddch(tv,row, tv->ccol, bam1_strand(p->b)? tolower(c) : toupper(c)); + tv->my_attroff(tv,attr); } } c = j? 
'*' : rb; if (c == '*') { - attr = COLOR_PAIR(8); - attron(attr); - mvaddch(1, tv->ccol++, c); - attroff(attr); - } else mvaddch(1, tv->ccol++, c); + attr = tv->my_colorpair(tv,8); + tv->my_attron(tv,attr); + tv->my_mvaddch(tv,1, tv->ccol++, c); + tv->my_attroff(tv,attr); + } else tv->my_mvaddch(tv,1, tv->ccol++, c); } tv->last_pos = pos; return 0; } -tview_t *tv_init(const char *fn, const char *fn_fa) -{ - tview_t *tv = (tview_t*)calloc(1, sizeof(tview_t)); - tv->is_dot = 1; - tv->fp = bam_open(fn, "r"); - bgzf_set_cache_size(tv->fp, 8 * 1024 *1024); - assert(tv->fp); - tv->header = bam_header_read(tv->fp); - tv->idx = bam_index_load(fn); - if (tv->idx == 0) exit(1); - tv->lplbuf = bam_lplbuf_init(tv_pl_func, tv); - if (fn_fa) tv->fai = fai_load(fn_fa); - tv->bca = bcf_call_init(0.83, 13); - tv->ins = 1; - - initscr(); - keypad(stdscr, TRUE); - clear(); - noecho(); - cbreak(); - tv->mrow = 24; tv->mcol = 80; - getmaxyx(stdscr, tv->mrow, tv->mcol); - tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); - tv->whelp = newwin(29, 40, 5, 5); - tv->color_for = TV_COLOR_MAPQ; - start_color(); - init_pair(1, COLOR_BLUE, COLOR_BLACK); - init_pair(2, COLOR_GREEN, COLOR_BLACK); - init_pair(3, COLOR_YELLOW, COLOR_BLACK); - init_pair(4, COLOR_WHITE, COLOR_BLACK); - init_pair(5, COLOR_GREEN, COLOR_BLACK); - init_pair(6, COLOR_CYAN, COLOR_BLACK); - init_pair(7, COLOR_YELLOW, COLOR_BLACK); - init_pair(8, COLOR_RED, COLOR_BLACK); - init_pair(9, COLOR_BLUE, COLOR_BLACK); - return tv; -} -void tv_destroy(tview_t *tv) -{ - delwin(tv->wgoto); delwin(tv->whelp); - endwin(); - bam_lplbuf_destroy(tv->lplbuf); - bcf_call_destroy(tv->bca); - bam_index_destroy(tv->idx); - if (tv->fai) fai_destroy(tv->fai); - free(tv->ref); - bam_header_destroy(tv->header); - bam_close(tv->fp); - free(tv); -} int tv_fetch_func(const bam1_t *b, void *data) { tview_t *tv = (tview_t*)data; + if ( tv->rg_hash ) + { + const uint8_t *rg = bam_aux_get(b, "RG"); + if ( !rg ) return 0; + khiter_t k = kh_get(kh_rg, 
tv->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(tv->rg_hash) ) return 0; + } if (tv->no_skip) { uint32_t *cigar = bam1_cigar(b); // this is cheating... int i; @@ -264,10 +240,11 @@ int tv_fetch_func(const bam1_t *b, void *data) return 0; } -int tv_draw_aln(tview_t *tv, int tid, int pos) -{ +int base_draw_aln(tview_t *tv, int tid, int pos) + { + assert(tv!=NULL); // reset - clear(); + tv->my_clear(tv); tv->curr_tid = tid; tv->left_pos = pos; tv->last_pos = tv->left_pos - 1; tv->ccol = 0; @@ -275,7 +252,10 @@ int tv_draw_aln(tview_t *tv, int tid, int pos) if (tv->fai) { char *str; if (tv->ref) free(tv->ref); + assert(tv->curr_tid>=0); + str = (char*)calloc(strlen(tv->header->target_name[tv->curr_tid]) + 30, 1); + assert(str!=NULL); sprintf(str, "%s:%d-%d", tv->header->target_name[tv->curr_tid], tv->left_pos + 1, tv->left_pos + tv->mcol); tv->ref = fai_fetch(tv->fai, str, &tv->l_ref); free(str); @@ -287,154 +267,102 @@ int tv_draw_aln(tview_t *tv, int tid, int pos) while (tv->ccol < tv->mcol) { int pos = tv->last_pos + 1; - if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) mvprintw(0, tv->ccol, "%-d", pos+1); - mvaddch(1, tv->ccol++, (tv->ref && pos < tv->l_ref)? tv->ref[pos - tv->left_pos] : 'N'); + if (pos%10 == 0 && tv->mcol - tv->ccol >= 10) tv->my_mvprintw(tv,0, tv->ccol, "%-d", pos+1); + tv->my_mvaddch(tv,1, tv->ccol++, (tv->ref && pos < tv->l_ref)? 
tv->ref[pos - tv->left_pos] : 'N'); ++tv->last_pos; } return 0; } -static void tv_win_goto(tview_t *tv, int *tid, int *pos) -{ - char str[256], *p; - int i, l = 0; - wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); - mvwprintw(tv->wgoto, 1, 2, "Goto: "); - for (;;) { - int c = wgetch(tv->wgoto); - wrefresh(tv->wgoto); - if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { - --l; - } else if (c == KEY_ENTER || c == '\012' || c == '\015') { - int _tid = -1, _beg, _end; - if (str[0] == '=') { - _beg = strtol(str+1, &p, 10) - 1; - if (_beg > 0) { - *pos = _beg; - return; - } - } else { - bam_parse_region(tv->header, str, &_tid, &_beg, &_end); - if (_tid >= 0) { - *tid = _tid; *pos = _beg; - return; - } - } - } else if (isgraph(c)) { - if (l < TV_MAX_GOTO) str[l++] = c; - } else if (c == '\027') l = 0; - else if (c == '\033') return; - str[l] = '\0'; - for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); - mvwprintw(tv->wgoto, 1, 8, "%s", str); - } -} -static void tv_win_help(tview_t *tv) { - int r = 1; - WINDOW *win = tv->whelp; - wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); - mvwprintw(win, r++, 2, " -=- Help -=- "); - r++; - mvwprintw(win, r++, 2, "? This window"); - mvwprintw(win, r++, 2, "Arrows Small scroll movement"); - mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); - mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); - mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); - mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); - mvwprintw(win, r++, 2, "space Scroll one screen"); - mvwprintw(win, r++, 2, "backspace Scroll back one screen"); - mvwprintw(win, r++, 2, "g Go to specific location"); - mvwprintw(win, r++, 2, "m Color for mapping qual"); - mvwprintw(win, r++, 2, "n Color for nucleotide"); - mvwprintw(win, r++, 2, "b Color for base quality"); - mvwprintw(win, r++, 2, "c Color for cs color"); - mvwprintw(win, r++, 2, "z Color for cs qual"); - mvwprintw(win, r++, 2, ". 
Toggle on/off dot view"); - mvwprintw(win, r++, 2, "s Toggle on/off ref skip"); - mvwprintw(win, r++, 2, "r Toggle on/off rd name"); - mvwprintw(win, r++, 2, "N Turn on nt view"); - mvwprintw(win, r++, 2, "C Turn on cs view"); - mvwprintw(win, r++, 2, "i Toggle on/off ins"); - mvwprintw(win, r++, 2, "q Exit"); - r++; - mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); - mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); - mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); - wrefresh(win); - wgetch(win); -} -void tv_loop(tview_t *tv) + +static void error(const char *format, ...) { - int tid, pos; - tid = tv->curr_tid; pos = tv->left_pos; - while (1) { - int c = getch(); - switch (c) { - case '?': tv_win_help(tv); break; - case '\033': - case 'q': goto end_loop; - case '/': - case 'g': tv_win_goto(tv, &tid, &pos); break; - case 'm': tv->color_for = TV_COLOR_MAPQ; break; - case 'b': tv->color_for = TV_COLOR_BASEQ; break; - case 'n': tv->color_for = TV_COLOR_NUCL; break; - case 'c': tv->color_for = TV_COLOR_COL; break; - case 'z': tv->color_for = TV_COLOR_COLQ; break; - case 's': tv->no_skip = !tv->no_skip; break; - case 'r': tv->show_name = !tv->show_name; break; - case KEY_LEFT: - case 'h': --pos; break; - case KEY_RIGHT: - case 'l': ++pos; break; - case KEY_SLEFT: - case 'H': pos -= 20; break; - case KEY_SRIGHT: - case 'L': pos += 20; break; - case '.': tv->is_dot = !tv->is_dot; break; - case 'N': tv->base_for = TV_BASE_NUCL; break; - case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; - case 'i': tv->ins = !tv->ins; break; - case '\010': pos -= 1000; break; - case '\014': pos += 1000; break; - case ' ': pos += tv->mcol; break; - case KEY_UP: - case 'j': --tv->row_shift; break; - case KEY_DOWN: - case 'k': ++tv->row_shift; break; - case KEY_BACKSPACE: - case '\177': pos -= tv->mcol; break; - case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; - default: continue; - } - if (pos < 0) pos = 0; - if (tv->row_shift < 0) tv->row_shift = 0; - 
tv_draw_aln(tv, tid, pos); - } -end_loop: - return; + if ( !format ) + { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bamtk tview [options] [ref.fasta]\n"); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -d display output as (H)tml or (C)urses or (T)ext \n"); + fprintf(stderr, " -p chr:pos go directly to this position\n"); + fprintf(stderr, " -s STR display only reads from this sample or group\n"); + fprintf(stderr, "\n\n"); + } + else + { + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + } + exit(-1); } +enum dipsay_mode {display_ncurses,display_html,display_text}; +extern tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples); +extern tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples); +extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples); + int bam_tview_main(int argc, char *argv[]) -{ - tview_t *tv; - if (argc == 1) { - fprintf(stderr, "Usage: bamtk tview [ref.fasta]\n"); - return 1; + { + int view_mode=display_ncurses; + tview_t* tv=NULL; + char *samples=NULL, *position=NULL; + int c; + while ((c = getopt(argc, argv, "s:p:d:")) >= 0) { + switch (c) { + case 's': samples=optarg; break; + case 'p': position=optarg; break; + case 'd': + { + switch(optarg[0]) + { + case 'H': case 'h': view_mode=display_html;break; + case 'T': case 't': view_mode=display_text;break; + case 'C': case 'c': view_mode=display_ncurses;break; + default: view_mode=display_ncurses;break; + } + break; + } + default: error(NULL); + } + } + if (argc==optind) error(NULL); + + switch(view_mode) + { + case display_ncurses: + { + tv = curses_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples); + break; + } + case display_text: + { + tv = text_tv_init(argv[optind], (optind+1>=argc)? 0 : argv[optind+1], samples); + break; + } + case display_html: + { + tv = html_tv_init(argv[optind], (optind+1>=argc)? 
0 : argv[optind+1], samples); + break; + } + } + if(tv==NULL) + { + error("cannot create view"); + return EXIT_FAILURE; + } + + if ( position ) + { + int _tid = -1, _beg, _end; + bam_parse_region(tv->header, position, &_tid, &_beg, &_end); + if (_tid >= 0) { tv->curr_tid = _tid; tv->left_pos = _beg; } + } + tv->my_drawaln(tv, tv->curr_tid, tv->left_pos); + tv->my_loop(tv); + tv->my_destroy(tv); + + return EXIT_SUCCESS; } - tv = tv_init(argv[1], (argc == 2)? 0 : argv[2]); - tv_draw_aln(tv, 0, 0); - tv_loop(tv); - tv_destroy(tv); - return 0; -} -#else // #ifdef _HAVE_CURSES -#include -#warning "No curses library is available; tview is disabled." -int bam_tview_main(int argc, char *argv[]) -{ - fprintf(stderr, "[bam_tview_main] The ncurses library is unavailable; tview is not compiled.\n"); - return 1; -} -#endif // #ifdef _HAVE_CURSES diff --git a/sam/bam_tview.h b/sam/bam_tview.h new file mode 100644 index 0000000..80f0464 --- /dev/null +++ b/sam/bam_tview.h @@ -0,0 +1,75 @@ +#ifndef BAM_TVIEW_H +#define BAM_TVIEW_H + +#include +#include +#include +#include +#include +#include +#include "bam.h" +#include "faidx.h" +#include "bam2bcf.h" +#include "sam_header.h" +#include "khash.h" + +KHASH_MAP_INIT_STR(kh_rg, const char *) + +typedef struct AbstractTview { + int mrow, mcol; + + bam_index_t *idx; + bam_lplbuf_t *lplbuf; + bam_header_t *header; + bamFile fp; + int curr_tid, left_pos; + faidx_t *fai; + bcf_callaux_t *bca; + + int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; + char *ref; + khash_t(kh_rg) *rg_hash; + /* callbacks */ + void (*my_destroy)(struct AbstractTview* ); + void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); + void (*my_mvaddch)(struct AbstractTview*,int,int,int); + void (*my_attron)(struct AbstractTview*,int); + void (*my_attroff)(struct AbstractTview*,int); + void (*my_clear)(struct AbstractTview*); + int (*my_colorpair)(struct AbstractTview*,int); + int (*my_drawaln)(struct 
AbstractTview*,int,int); + int (*my_loop)(struct AbstractTview*); + int (*my_underline)(struct AbstractTview*); +} tview_t; + + +char bam_aux_getCEi(bam1_t *b, int i); +char bam_aux_getCSi(bam1_t *b, int i); +char bam_aux_getCQi(bam1_t *b, int i); + +#define TV_MIN_ALNROW 2 +#define TV_MAX_GOTO 40 +#define TV_LOW_MAPQ 10 + +#define TV_COLOR_MAPQ 0 +#define TV_COLOR_BASEQ 1 +#define TV_COLOR_NUCL 2 +#define TV_COLOR_COL 3 +#define TV_COLOR_COLQ 4 + +#define TV_BASE_NUCL 0 +#define TV_BASE_COLOR_SPACE 1 + +int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data); +int base_tv_init(tview_t*,const char *fn, const char *fn_fa, const char *samples); +void base_tv_destroy(tview_t*); +int base_draw_aln(tview_t *tv, int tid, int pos); + +typedef struct Tixel + { + int ch; + int attributes; + }tixel_t; + +#endif + diff --git a/sam/bam_tview_curses.c b/sam/bam_tview_curses.c new file mode 100644 index 0000000..4fdd1fb --- /dev/null +++ b/sam/bam_tview_curses.c @@ -0,0 +1,297 @@ +#undef _HAVE_CURSES + +#if _CURSES_LIB == 0 +#elif _CURSES_LIB == 1 +#include +#ifndef NCURSES_VERSION +#warning "_CURSES_LIB=1 but NCURSES_VERSION not defined; tview is NOT compiled" +#else +#define _HAVE_CURSES +#endif +#elif _CURSES_LIB == 2 +#include +#define _HAVE_CURSES +#else +#warning "_CURSES_LIB is not 0, 1 or 2; tview is NOT compiled" +#endif + + +#include "bam_tview.h" + +#ifdef _HAVE_CURSES + + + +typedef struct CursesTview { + tview_t view; + WINDOW *wgoto, *whelp; + } curses_tview_t; + + + + +#define FROM_TV(ptr) ((curses_tview_t*)ptr) + +static void curses_destroy(tview_t* base) + { + curses_tview_t* tv=(curses_tview_t*)base; + + + delwin(tv->wgoto); delwin(tv->whelp); + endwin(); + + base_tv_destroy(base); + + free(tv); + } + +/* + void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); + void (*my_)(struct AbstractTview*,int,int,int); + void (*my_attron)(struct AbstractTview*,int); + void (*my_attroff)(struct AbstractTview*,int); + void 
(*my_clear)(struct AbstractTview*); + int (*my_colorpair)(struct AbstractTview*,int); +*/ + +static void curses_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) + { + unsigned int size=tv->mcol+2; + char* str=malloc(size); + if(str==0) exit(EXIT_FAILURE); + va_list argptr; + va_start(argptr, fmt); + vsnprintf(str,size, fmt, argptr); + va_end(argptr); + mvprintw(y,x,str); + free(str); + } + +static void curses_mvaddch(struct AbstractTview* tv,int y,int x,int ch) + { + mvaddch(y,x,ch); + } + +static void curses_attron(struct AbstractTview* tv,int flag) + { + attron(flag); + } +static void curses_attroff(struct AbstractTview* tv,int flag) + { + attroff(flag); + } +static void curses_clear(struct AbstractTview* tv) + { + clear(); + } + +static int curses_colorpair(struct AbstractTview* tv,int flag) + { + return COLOR_PAIR(flag); + } + +static int curses_drawaln(struct AbstractTview* tv, int tid, int pos) + { + return base_draw_aln(tv, tid, pos); + } + + + +static void tv_win_goto(curses_tview_t *tv, int *tid, int *pos) + { + char str[256], *p; + int i, l = 0; + tview_t *base=(tview_t*)tv; + wborder(tv->wgoto, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(tv->wgoto, 1, 2, "Goto: "); + for (;;) { + int c = wgetch(tv->wgoto); + wrefresh(tv->wgoto); + if (c == KEY_BACKSPACE || c == '\010' || c == '\177') { + if(l > 0) --l; + } else if (c == KEY_ENTER || c == '\012' || c == '\015') { + int _tid = -1, _beg, _end; + if (str[0] == '=') { + _beg = strtol(str+1, &p, 10) - 1; + if (_beg > 0) { + *pos = _beg; + return; + } + } else { + bam_parse_region(base->header, str, &_tid, &_beg, &_end); + if (_tid >= 0) { + *tid = _tid; *pos = _beg; + return; + } + } + } else if (isgraph(c)) { + if (l < TV_MAX_GOTO) str[l++] = c; + } else if (c == '\027') l = 0; + else if (c == '\033') return; + str[l] = '\0'; + for (i = 0; i < TV_MAX_GOTO; ++i) mvwaddch(tv->wgoto, 1, 8 + i, ' '); + mvwprintw(tv->wgoto, 1, 8, "%s", str); + } +} + + + + +static void 
tv_win_help(curses_tview_t *tv) { + int r = 1; + tview_t* base=(tview_t*)base; + WINDOW *win = tv->whelp; + wborder(win, '|', '|', '-', '-', '+', '+', '+', '+'); + mvwprintw(win, r++, 2, " -=- Help -=- "); + r++; + mvwprintw(win, r++, 2, "? This window"); + mvwprintw(win, r++, 2, "Arrows Small scroll movement"); + mvwprintw(win, r++, 2, "h,j,k,l Small scroll movement"); + mvwprintw(win, r++, 2, "H,J,K,L Large scroll movement"); + mvwprintw(win, r++, 2, "ctrl-H Scroll 1k left"); + mvwprintw(win, r++, 2, "ctrl-L Scroll 1k right"); + mvwprintw(win, r++, 2, "space Scroll one screen"); + mvwprintw(win, r++, 2, "backspace Scroll back one screen"); + mvwprintw(win, r++, 2, "g Go to specific location"); + mvwprintw(win, r++, 2, "m Color for mapping qual"); + mvwprintw(win, r++, 2, "n Color for nucleotide"); + mvwprintw(win, r++, 2, "b Color for base quality"); + mvwprintw(win, r++, 2, "c Color for cs color"); + mvwprintw(win, r++, 2, "z Color for cs qual"); + mvwprintw(win, r++, 2, ". Toggle on/off dot view"); + mvwprintw(win, r++, 2, "s Toggle on/off ref skip"); + mvwprintw(win, r++, 2, "r Toggle on/off rd name"); + mvwprintw(win, r++, 2, "N Turn on nt view"); + mvwprintw(win, r++, 2, "C Turn on cs view"); + mvwprintw(win, r++, 2, "i Toggle on/off ins"); + mvwprintw(win, r++, 2, "q Exit"); + r++; + mvwprintw(win, r++, 2, "Underline: Secondary or orphan"); + mvwprintw(win, r++, 2, "Blue: 0-9 Green: 10-19"); + mvwprintw(win, r++, 2, "Yellow: 20-29 White: >=30"); + wrefresh(win); + wgetch(win); +} + +static int curses_underline(tview_t* tv) + { + return A_UNDERLINE; + } + +static int curses_loop(tview_t* tv) + { + int tid, pos; + curses_tview_t *CTV=(curses_tview_t *)tv; + tid = tv->curr_tid; pos = tv->left_pos; + while (1) { + int c = getch(); + switch (c) { + case '?': tv_win_help(CTV); break; + case '\033': + case 'q': goto end_loop; + case '/': + case 'g': tv_win_goto(CTV, &tid, &pos); break; + case 'm': tv->color_for = TV_COLOR_MAPQ; break; + case 'b': tv->color_for = 
TV_COLOR_BASEQ; break; + case 'n': tv->color_for = TV_COLOR_NUCL; break; + case 'c': tv->color_for = TV_COLOR_COL; break; + case 'z': tv->color_for = TV_COLOR_COLQ; break; + case 's': tv->no_skip = !tv->no_skip; break; + case 'r': tv->show_name = !tv->show_name; break; + case KEY_LEFT: + case 'h': --pos; break; + case KEY_RIGHT: + case 'l': ++pos; break; + case KEY_SLEFT: + case 'H': pos -= 20; break; + case KEY_SRIGHT: + case 'L': pos += 20; break; + case '.': tv->is_dot = !tv->is_dot; break; + case 'N': tv->base_for = TV_BASE_NUCL; break; + case 'C': tv->base_for = TV_BASE_COLOR_SPACE; break; + case 'i': tv->ins = !tv->ins; break; + case '\010': pos -= 1000; break; + case '\014': pos += 1000; break; + case ' ': pos += tv->mcol; break; + case KEY_UP: + case 'j': --tv->row_shift; break; + case KEY_DOWN: + case 'k': ++tv->row_shift; break; + case KEY_BACKSPACE: + case '\177': pos -= tv->mcol; break; + case KEY_RESIZE: getmaxyx(stdscr, tv->mrow, tv->mcol); break; + default: continue; + } + if (pos < 0) pos = 0; + if (tv->row_shift < 0) tv->row_shift = 0; + tv->my_drawaln(tv, tid, pos); + } +end_loop: + return 0; +} + + + + +tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples) + { + curses_tview_t *tv = (curses_tview_t*)calloc(1, sizeof(curses_tview_t)); + tview_t* base=(tview_t*)tv; + if(tv==0) + { + fprintf(stderr,"Calloc failed\n"); + return 0; + } + + base_tv_init(base,fn,fn_fa,samples); + /* initialize callbacks */ +#define SET_CALLBACK(fun) base->my_##fun=curses_##fun; + SET_CALLBACK(destroy); + SET_CALLBACK(mvprintw); + SET_CALLBACK(mvaddch); + SET_CALLBACK(attron); + SET_CALLBACK(attroff); + SET_CALLBACK(clear); + SET_CALLBACK(colorpair); + SET_CALLBACK(drawaln); + SET_CALLBACK(loop); + SET_CALLBACK(underline); +#undef SET_CALLBACK + + initscr(); + keypad(stdscr, TRUE); + clear(); + noecho(); + cbreak(); + + getmaxyx(stdscr, base->mrow, base->mcol); + tv->wgoto = newwin(3, TV_MAX_GOTO + 10, 10, 5); + tv->whelp = newwin(29, 40, 5, 5); 
+ + start_color(); + init_pair(1, COLOR_BLUE, COLOR_BLACK); + init_pair(2, COLOR_GREEN, COLOR_BLACK); + init_pair(3, COLOR_YELLOW, COLOR_BLACK); + init_pair(4, COLOR_WHITE, COLOR_BLACK); + init_pair(5, COLOR_GREEN, COLOR_BLACK); + init_pair(6, COLOR_CYAN, COLOR_BLACK); + init_pair(7, COLOR_YELLOW, COLOR_BLACK); + init_pair(8, COLOR_RED, COLOR_BLACK); + init_pair(9, COLOR_BLUE, COLOR_BLACK); + return base; + } + + +#else // #ifdef _HAVE_CURSES +#include +#warning "No curses library is available; tview with curses is disabled." + +extern tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples); + +tview_t* curses_tv_init(const char *fn, const char *fn_fa, const char *samples) + { + return text_tv_init(fn,fn_fa,samples); + } +#endif // #ifdef _HAVE_CURSES + + diff --git a/sam/bam_tview_html.c b/sam/bam_tview_html.c new file mode 100644 index 0000000..f52b4c3 --- /dev/null +++ b/sam/bam_tview_html.c @@ -0,0 +1,349 @@ +#include +#include "bam_tview.h" + +#define UNDERLINE_FLAG 10 + +typedef struct HtmlTview { + tview_t view; + int row_count; + tixel_t** screen; + FILE* out; + int attributes;/* color... */ + } html_tview_t; + +#define FROM_TV(ptr) ((html_tview_t*)ptr) + +static void html_destroy(tview_t* base) + { + int i; + html_tview_t* tv=(html_tview_t*)base; + if(tv->screen!=NULL) + { + for(i=0;i< tv->row_count;++i) free(tv->screen[i]); + free(tv->screen); + } + base_tv_destroy(base); + free(tv); + } + +/* + void (*my_mvprintw)(struct AbstractTview* ,int,int,const char*,...); + void (*my_)(struct AbstractTview*,int,int,int); + void (*my_attron)(struct AbstractTview*,int); + void (*my_attroff)(struct AbstractTview*,int); + void (*my_clear)(struct AbstractTview*); + int (*my_colorpair)(struct AbstractTview*,int); +*/ + +static void html_mvprintw(struct AbstractTview* tv,int y ,int x,const char* fmt,...) 
+ { + int i,nchars=0; + unsigned int size=tv->mcol+2; + char* str=malloc(size); + if(str==0) exit(EXIT_FAILURE); + va_list argptr; + va_start(argptr, fmt); + nchars=vsnprintf(str,size, fmt, argptr); + va_end(argptr); + + for(i=0;i< nchars;++i) + { + tv->my_mvaddch(tv,y,x+i,str[i]); + } + free(str); + } + +static void html_mvaddch(struct AbstractTview* tv,int y,int x,int ch) + { + tixel_t* row=NULL; + html_tview_t* ptr=FROM_TV(tv); + if( x >= tv->mcol ) return; //out of screen + while(ptr->row_count<=y) + { + int x; + row=(tixel_t*)calloc(tv->mcol,sizeof(tixel_t)); + if(row==0) exit(EXIT_FAILURE); + for(x=0;xmcol;++x) {row[x].ch=' ';row[x].attributes=0;} + ptr->screen=(tixel_t**)realloc(ptr->screen,sizeof(tixel_t*)*(ptr->row_count+1)); + ptr->screen[ptr->row_count++]=row; + } + row=ptr->screen[y]; + row[x].ch=ch; + row[x].attributes=ptr->attributes; + } + +static void html_attron(struct AbstractTview* tv,int flag) + { + html_tview_t* ptr=FROM_TV(tv); + ptr->attributes |= flag; + + + } + +static void html_attroff(struct AbstractTview* tv,int flag) + { + html_tview_t* ptr=FROM_TV(tv); + ptr->attributes &= ~(flag); + } + +static void html_clear(struct AbstractTview* tv) + { + html_tview_t* ptr=FROM_TV(tv); + if(ptr->screen!=NULL) + { + int i; + for(i=0;i< ptr->row_count;++i) free(ptr->screen[i]); + free(ptr->screen); + ptr->screen=NULL; + } + ptr->row_count=0; + ptr->attributes=0; + } + +static int html_colorpair(struct AbstractTview* tv,int flag) + { + return (1 << (flag)); + } + +static int html_drawaln(struct AbstractTview* tv, int tid, int pos) + { + int y,x; + html_tview_t* ptr=FROM_TV(tv); + html_clear(tv); + base_draw_aln(tv, tid, pos); + fputs("",ptr->out); + fprintf(ptr->out,"%s:%d", + tv->header->target_name[tid], + pos+1 + ); + //style + + fputs("",ptr->out); + + fputs("",ptr->out); + + fprintf(ptr->out,"
%s:%d
", + tv->header->target_name[tid], + pos+1 + ); + + fputs("
",ptr->out);
+    for(y=0;y< ptr->row_count;++y)
+    	{
+    	
+    	for(x=0;x< tv->mcol;++x)
+	    	{
+	    	
+		
+		if(x== 0 || ptr->screen[y][x].attributes != ptr->screen[y][x-1].attributes)
+	    		{
+	    		int css=0;
+			fprintf(ptr->out,"<span");
+	    		while(css<32)
+	    			{
+	    			//if(y>1) fprintf(stderr,"css=%d pow2=%d vs %d\n",css,(1 << (css)),ptr->screen[y][x].attributes);
+	    			if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0)
+	    				{
+	    				
+	    				fprintf(ptr->out," class='tviewc%s%d'",
+	    					(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)) )!=0?"u":""),
+	    					css);
+	    				break;
+	    				}
+	    			++css;
+	    			}
+
+
+	    		fputs(">",ptr->out);
+	    		}
+		
+		int ch=ptr->screen[y][x].ch;
+		switch(ch)
+			{
+			case '<': fputs("&lt;",ptr->out);break;
+			case '>': fputs("&gt;",ptr->out);break;
+			case '&': fputs("&amp;",ptr->out);break;
+			default: fputc(ch,ptr->out); break;
+			}
+	    	
+	    	
+	    	if(x+1 == tv->mcol  || ptr->screen[y][x].attributes!=ptr->screen[y][x+1].attributes)
+	    		{
+	    		fputs("</span>",ptr->out);
+	    		}
+	    	}
+    	if(y+1 < ptr->row_count) fputs("
",ptr->out); + } + fputs("
",ptr->out); + return 0; + } + + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_BLACK "\x1b[0m" +#define ANSI_COLOR_RESET ANSI_COLOR_BLACK + +#define ANSI_UNDERLINE_SET "\033[4m" +#define ANSI_UNDERLINE_UNSET "\033[0m" + +static int text_drawaln(struct AbstractTview* tv, int tid, int pos) + { + int y,x; + html_tview_t* ptr=FROM_TV(tv); + html_clear(tv); + base_draw_aln(tv, tid, pos); + int is_term= isatty(fileno(ptr->out)); + + for(y=0;y< ptr->row_count;++y) + { + for(x=0;x< tv->mcol;++x) + { + if(is_term) + { + int css=0; + while(css<32) + { + if(( (ptr->screen[y][x].attributes) & (1 << (css)))!=0) + { + break; + } + ++css; + } + switch(css) + { + //CSS(0, "black"); + case 1: fputs(ANSI_COLOR_BLUE,ptr->out); break; + case 2: fputs(ANSI_COLOR_GREEN,ptr->out); break; + case 3: fputs(ANSI_COLOR_YELLOW,ptr->out); break; + //CSS(4, "black"); + case 5: fputs(ANSI_COLOR_GREEN,ptr->out); break; + case 6: fputs(ANSI_COLOR_CYAN,ptr->out); break; + case 7: fputs(ANSI_COLOR_YELLOW,ptr->out); break; + case 8: fputs(ANSI_COLOR_RED,ptr->out); break; + case 9: fputs(ANSI_COLOR_BLUE,ptr->out); break; + default:break; + } + if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0) + { + fputs(ANSI_UNDERLINE_SET,ptr->out); + } + + } + + + int ch=ptr->screen[y][x].ch; + + fputc(ch,ptr->out); + if(is_term) + { + fputs(ANSI_COLOR_RESET,ptr->out); + if(( (ptr->screen[y][x].attributes) & (1 << (UNDERLINE_FLAG)))!=0) + { + fputs(ANSI_UNDERLINE_UNSET,ptr->out); + } + } + } + fputc('\n',ptr->out); + } + return 0; + } + + +static int html_loop(tview_t* tv) + { + //tv->my_drawaln(tv, tv->curr_tid, tv->left_pos); + return 0; + } + +static int html_underline(tview_t* tv) + { + return (1 << UNDERLINE_FLAG); + } + +/* +static void init_pair(html_tview_t *tv,int id_ge_1, const char* pen, const 
char* paper) + { + + } +*/ + +tview_t* html_tv_init(const char *fn, const char *fn_fa, const char *samples) + { + char* colstr=getenv("COLUMNS"); + html_tview_t *tv = (html_tview_t*)calloc(1, sizeof(html_tview_t)); + tview_t* base=(tview_t*)tv; + if(tv==0) + { + fprintf(stderr,"Calloc failed\n"); + return 0; + } + tv->row_count=0; + tv->screen=NULL; + tv->out=stdout; + tv->attributes=0; + base_tv_init(base,fn,fn_fa,samples); + /* initialize callbacks */ +#define SET_CALLBACK(fun) base->my_##fun=html_##fun; + SET_CALLBACK(destroy); + SET_CALLBACK(mvprintw); + SET_CALLBACK(mvaddch); + SET_CALLBACK(attron); + SET_CALLBACK(attroff); + SET_CALLBACK(clear); + SET_CALLBACK(colorpair); + SET_CALLBACK(drawaln); + SET_CALLBACK(loop); + SET_CALLBACK(underline); +#undef SET_CALLBACK + + + if(colstr!=0) + { + base->mcol=atoi(colstr); + if(base->mcol<10) base->mcol=80; + } + base->mrow=99999; + +/* + init_pair(tv,1, "blue", "white"); + init_pair(tv,2, "green", "white"); + init_pair(tv,3, "yellow", "white"); + init_pair(tv,4, "white", "white"); + init_pair(tv,5, "green", "white"); + init_pair(tv,6, "cyan", "white"); + init_pair(tv,7, "yellow", "white"); + init_pair(tv,8, "red", "white"); + init_pair(tv,9, "blue", "white"); + */ + return base; + } + + +tview_t* text_tv_init(const char *fn, const char *fn_fa, const char *samples) + { + tview_t* tv=html_tv_init(fn,fn_fa,samples); + tv->my_drawaln=text_drawaln; + return tv; + } + diff --git a/sam/bamshuf.c b/sam/bamshuf.c new file mode 100644 index 0000000..33a5238 --- /dev/null +++ b/sam/bamshuf.c @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +#include "sam.h" +#include "ksort.h" + +#define DEF_CLEVEL 1 + +static inline unsigned hash_Wang(unsigned key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} + +static inline unsigned hash_X31_Wang(const char *s) +{ + unsigned h = *s; + if (h) { + for (++s ; *s; ++s) h 
= (h << 5) - h + *s; + return hash_Wang(h); + } else return 0; +} + +typedef struct { + unsigned key; + bam1_t *b; +} elem_t; + +static inline int elem_lt(elem_t x, elem_t y) +{ + if (x.key < y.key) return 1; + if (x.key == y.key) { + int t; + t = strcmp(bam_get_qname(x.b), bam_get_qname(y.b)); + if (t < 0) return 1; + return (t == 0 && ((x.b->core.flag>>6&3) < (y.b->core.flag>>6&3))); + } else return 0; +} + +KSORT_INIT(bamshuf, elem_t, elem_lt) + +static void bamshuf(const char *fn, int n_files, const char *pre, int clevel, int is_stdout) +{ + BGZF *fp, *fpw, **fpt; + char **fnt, modew[8]; + bam1_t *b; + int i, l; + bam_hdr_t *h; + int64_t *cnt; + + // split + fp = strcmp(fn, "-")? bgzf_open(fn, "r") : bgzf_dopen(fileno(stdin), "r"); + assert(fp); + h = bam_hdr_read(fp); + fnt = (char**)calloc(n_files, sizeof(void*)); + fpt = (BGZF**)calloc(n_files, sizeof(void*)); + cnt = (int64_t*)calloc(n_files, 8); + l = strlen(pre); + for (i = 0; i < n_files; ++i) { + fnt[i] = (char*)calloc(l + 10, 1); + sprintf(fnt[i], "%s.%.4d.bam", pre, i); + fpt[i] = bgzf_open(fnt[i], "w1"); + bam_hdr_write(fpt[i], h); + } + b = bam_init1(); + while (bam_read1(fp, b) >= 0) { + uint32_t x; + x = hash_X31_Wang(bam_get_qname(b)) % n_files; + bam_write1(fpt[x], b); + ++cnt[x]; + } + bam_destroy1(b); + for (i = 0; i < n_files; ++i) bgzf_close(fpt[i]); + free(fpt); + bgzf_close(fp); + // merge + sprintf(modew, "w%d", (clevel >= 0 && clevel <= 9)? 
clevel : DEF_CLEVEL); + if (!is_stdout) { // output to a file + char *fnw = (char*)calloc(l + 5, 1); + sprintf(fnw, "%s.bam", pre); + fpw = bgzf_open(fnw, modew); + free(fnw); + } else fpw = bgzf_dopen(fileno(stdout), modew); // output to stdout + bam_hdr_write(fpw, h); + bam_hdr_destroy(h); + for (i = 0; i < n_files; ++i) { + int64_t j, c = cnt[i]; + elem_t *a; + fp = bgzf_open(fnt[i], "r"); + bam_hdr_destroy(bam_hdr_read(fp)); + a = (elem_t*)calloc(c, sizeof(elem_t)); + for (j = 0; j < c; ++j) { + a[j].b = bam_init1(); + assert(bam_read1(fp, a[j].b) >= 0); + a[j].key = hash_X31_Wang(bam_get_qname(a[j].b)); + } + bgzf_close(fp); + unlink(fnt[i]); + free(fnt[i]); + ks_introsort(bamshuf, c, a); + for (j = 0; j < c; ++j) { + bam_write1(fpw, a[j].b); + bam_destroy1(a[j].b); + } + free(a); + } + bgzf_close(fpw); + free(fnt); free(cnt); +} + +int main_bamshuf(int argc, char *argv[]) +{ + int c, n_files = 64, clevel = DEF_CLEVEL, is_stdout = 0, is_un = 0; + while ((c = getopt(argc, argv, "n:l:uO")) >= 0) { + switch (c) { + case 'n': n_files = atoi(optarg); break; + case 'l': clevel = atoi(optarg); break; + case 'u': is_un = 1; break; + case 'O': is_stdout = 1; break; + } + } + if (is_un) clevel = 0; + if (optind + 2 > argc) { + fprintf(stderr, "\nUsage: bamshuf [-Ou] [-n nFiles] [-c cLevel] \n\n"); + fprintf(stderr, "Options: -O output to stdout\n"); + fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -l INT compression level [%d]\n", DEF_CLEVEL); + fprintf(stderr, " -n INT number of temporary files [%d]\n", n_files); + fprintf(stderr, "\n"); + return 1; + } + bamshuf(argv[optind], n_files, argv[optind+1], clevel, is_stdout); + return 0; +} diff --git a/sam/bamtk.c b/sam/bamtk.c index 8ba2581..9df7c11 100644 --- a/sam/bamtk.c +++ b/sam/bamtk.c @@ -27,6 +27,9 @@ int main_phase(int argc, char *argv[]); int main_cat(int argc, char *argv[]); int main_depth(int argc, char *argv[]); int main_bam2fq(int argc, char *argv[]); +int main_pad2unpad(int argc, char 
*argv[]); +int main_bedcov(int argc, char *argv[]); +int main_bamshuf(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); @@ -53,8 +56,11 @@ static int usage() fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, " reheader replace BAM header\n"); fprintf(stderr, " cat concatenate BAMs\n"); + fprintf(stderr, " bedcov read depth per BED region\n"); fprintf(stderr, " targetcut cut fosmid regions (for fosmid pool only)\n"); fprintf(stderr, " phase phase heterozygotes\n"); + fprintf(stderr, " bamshuf shuffle and group alignments by name\n"); +// fprintf(stderr, " depad convert padded BAM to unpadded BAM\n"); // not stable fprintf(stderr, "\n"); #ifdef _WIN32 fprintf(stderr, "\ @@ -94,6 +100,10 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "phase") == 0) return main_phase(argc-1, argv+1); else if (strcmp(argv[1], "depth") == 0) return main_depth(argc-1, argv+1); else if (strcmp(argv[1], "bam2fq") == 0) return main_bam2fq(argc-1, argv+1); + else if (strcmp(argv[1], "pad2unpad") == 0) return main_pad2unpad(argc-1, argv+1); + else if (strcmp(argv[1], "depad") == 0) return main_pad2unpad(argc-1, argv+1); + else if (strcmp(argv[1], "bedcov") == 0) return main_bedcov(argc-1, argv+1); + else if (strcmp(argv[1], "bamshuf") == 0) return main_bamshuf(argc-1, argv+1); else if (strcmp(argv[1], "pileup") == 0) { fprintf(stderr, "[main] The `pileup' command has been removed. 
Please use `mpileup' instead.\n"); return 1; diff --git a/sam/bcftools/._Makefile b/sam/bcftools/._Makefile new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._Makefile differ diff --git a/sam/bcftools/._README b/sam/bcftools/._README new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._README differ diff --git a/sam/bcftools/._bcf.c b/sam/bcftools/._bcf.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._bcf.c differ diff --git a/sam/bcftools/._bcf.h b/sam/bcftools/._bcf.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._bcf.h differ diff --git a/sam/bcftools/._bcf.tex b/sam/bcftools/._bcf.tex new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._bcf.tex differ diff --git a/sam/bcftools/._bcf2qcall.c b/sam/bcftools/._bcf2qcall.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._bcf2qcall.c differ diff --git a/sam/bcftools/._bcfutils.c b/sam/bcftools/._bcfutils.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._bcfutils.c differ diff --git a/sam/bcftools/._call1.c b/sam/bcftools/._call1.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._call1.c differ diff --git a/sam/bcftools/._em.c b/sam/bcftools/._em.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._em.c differ diff --git a/sam/bcftools/._fet.c b/sam/bcftools/._fet.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._fet.c differ diff --git a/sam/bcftools/._index.c b/sam/bcftools/._index.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._index.c differ diff --git a/sam/bcftools/._kfunc.c b/sam/bcftools/._kfunc.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._kfunc.c 
differ diff --git a/sam/bcftools/._kmin.c b/sam/bcftools/._kmin.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._kmin.c differ diff --git a/sam/bcftools/._kmin.h b/sam/bcftools/._kmin.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._kmin.h differ diff --git a/sam/bcftools/._main.c b/sam/bcftools/._main.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._main.c differ diff --git a/sam/bcftools/._mut.c b/sam/bcftools/._mut.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._mut.c differ diff --git a/sam/bcftools/._prob1.c b/sam/bcftools/._prob1.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._prob1.c differ diff --git a/sam/bcftools/._prob1.h b/sam/bcftools/._prob1.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._prob1.h differ diff --git a/sam/bcftools/._vcf.c b/sam/bcftools/._vcf.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._vcf.c differ diff --git a/sam/bcftools/._vcfutils.pl b/sam/bcftools/._vcfutils.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/bcftools/._vcfutils.pl differ diff --git a/sam/bcftools/Makefile b/sam/bcftools/Makefile index 9b6f863..be831de 100644 --- a/sam/bcftools/Makefile +++ b/sam/bcftools/Makefile @@ -31,7 +31,7 @@ libbcf.a:$(LOBJS) $(AR) -csru $@ $(LOBJS) bcftools:lib $(AOBJS) - $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz + $(CC) $(CFLAGS) -o $@ $(AOBJS) -L. $(LIBPATH) -lbcf -lm -lz -lpthread bcf.o:bcf.h vcf.o:bcf.h diff --git a/sam/bcftools/bcf.c b/sam/bcftools/bcf.c index 84a8e76..24728db 100644 --- a/sam/bcftools/bcf.c +++ b/sam/bcftools/bcf.c @@ -13,9 +13,6 @@ bcf_t *bcf_open(const char *fn, const char *mode) } else { b->fp = strcmp(fn, "-")? 
bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), mode); } -#ifndef BCF_LITE - b->fp->owned_file = 1; -#endif return b; } @@ -140,7 +137,7 @@ int bcf_sync(bcf1_t *b) for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) { b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { b->gi[i].len = 2; } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) { b->gi[i].len = 1; @@ -233,32 +230,59 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) } x = b->n_alleles * (b->n_alleles + 1) / 2; if (b->n_gi == 0) return; + int iPL = -1; + if ( b->n_alleles > 2 ) { + for (i=0; in_gi; i++) { + if ( b->gi[i].fmt == bcf_str2int("PL", 2) ) { + iPL = i; + break; + } + } + } for (j = 0; j < h->n_smpl; ++j) { + int ploidy = b->ploidy ? 
b->ploidy[j] : 2; kputc('\t', s); for (i = 0; i < b->n_gi; ++i) { if (i) kputc(':', s); if (b->gi[i].fmt == bcf_str2int("PL", 2)) { uint8_t *d = (uint8_t*)b->gi[i].data + j * x; int k; - for (k = 0; k < x; ++k) { - if (k > 0) kputc(',', s); - kputw(d[k], s); - } - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { + if ( ploidy==1 ) + for (k=0; kn_alleles; k++) + { + if (k>0) kputc(',', s); + kputw(d[(k+1)*(k+2)/2-1], s); + } + else + for (k = 0; k < x; ++k) { + if (k > 0) kputc(',', s); + kputw(d[k], s); + } + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { kputw(((uint16_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { kputw(((uint8_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { kputw(((int32_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) { - int y = ((uint8_t*)b->gi[i].data)[j]; - if (y>>7&1) { - kputsn("./.", 3, s); - } else { - kputc('0' + (y>>3&7), s); - kputc("/|"[y>>6&1], s); - kputc('0' + (y&7), s); - } + int y = ((uint8_t*)b->gi[i].data)[j]; + if ( ploidy==1 ) + { + if ( y>>7&1 ) + kputc('.', s); + else + kputc('0' + (y>>3&7), s); + } + else + { + if ( y>>7&1 ) + kputsn("./.", 3, s); + else { + kputc('0' + (y>>3&7), s); + kputc("/|"[y>>6&1], s); + kputc('0' + (y&7), s); + } + } } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { float *d = (float*)b->gi[i].data + j * x; int k; @@ -299,6 +323,50 @@ int bcf_append_info(bcf1_t *b, const char *info, int l) return 0; } +int remove_tag(char *str, const char *tag, char delim) +{ + char *tmp = str, *p; + int len_diff = 0, ori_len = strlen(str); + while ( *tmp && (p = strstr(tmp,tag)) ) + { + if ( p>str ) + { + if ( *(p-1)!=delim ) { tmp=p+1; continue; } // shared substring + p--; + } + char *q=p+1; + while ( *q && *q!=delim ) q++; + if ( p==str && *q ) q++; // the tag is first, don't move the delim char + len_diff += q-p; + if ( ! 
*q ) { *p = 0; break; } // the tag was last, no delim follows + else + memmove(p,q,ori_len-(int)(p-str)-(int)(q-p)); // *q==delim + } + if ( len_diff==ori_len ) + str[0]='.', str[1]=0, len_diff--; + + return len_diff; +} + + +void rm_info(kstring_t *s, const char *key) +{ + char *p = s->s; + int n = 0; + while ( n<4 ) + { + if ( !*p ) n++; + p++; + } + char *q = p+1; + while ( *q && q-s->sl ) q++; + + int nrm = remove_tag(p, key, ';'); + if ( nrm ) + memmove(q-nrm, q, s->s+s->l-q+1); + s->l -= nrm; +} + int bcf_cpy(bcf1_t *r, const bcf1_t *b) { char *t1 = r->str; diff --git a/sam/bcftools/bcf.h b/sam/bcftools/bcf.h index 822ae5c..f722525 100644 --- a/sam/bcftools/bcf.h +++ b/sam/bcftools/bcf.h @@ -28,7 +28,7 @@ #ifndef BCF_H #define BCF_H -#define BCF_VERSION "0.1.17-dev (r973:277)" +#define BCF_VERSION "0.1.19-44428cd" #include #include @@ -73,6 +73,7 @@ typedef struct { bcf_ginfo_t *gi; // array of geno fields int n_alleles, n_smpl; // number of alleles and samples // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl) + uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed. 
} bcf1_t; typedef struct { @@ -122,6 +123,10 @@ extern "C" { char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b); // append more info int bcf_append_info(bcf1_t *b, const char *info, int l); + // remove tag + int remove_tag(char *string, const char *tag, char delim); + // remove info tag, string is the kstring holder of bcf1_t.str + void rm_info(kstring_t *string, const char *key); // copy int bcf_cpy(bcf1_t *r, const bcf1_t *b); @@ -142,6 +147,8 @@ extern "C" { // keep the first n alleles and discard the rest int bcf_shrink_alt(bcf1_t *b, int n); + // keep the masked alleles and discard the rest + void bcf_fit_alt(bcf1_t *b, int mask); // convert GL to PL int bcf_gl2pl(bcf1_t *b); // if the site is an indel diff --git a/sam/bcftools/bcfutils.c b/sam/bcftools/bcfutils.c index 0eab4c1..7638085 100644 --- a/sam/bcftools/bcfutils.c +++ b/sam/bcftools/bcfutils.c @@ -1,5 +1,6 @@ #include #include +#include #include "bcf.h" #include "kstring.h" #include "khash.h" @@ -66,6 +67,112 @@ int bcf_str2id_add(void *_hash, const char *str) return kh_val(hash, k); } +void bcf_fit_alt(bcf1_t *b, int mask) +{ + mask |= 1; // REF must be always present + + int i,j,nals=0; + for (i=0; in_alleles <= nals ) return; + + // update ALT, in principle any of the alleles can be removed + char *p; + if ( nals>1 ) + { + char *dst, *src; + int n=0, nalts=nals-1; + for (src=dst=p=b->alt, i=1; *p; p++) + { + if ( *p!=',' ) continue; + + if ( mask&1<=nalts ) { *dst=0; break; } + src = p+1; + } + if ( nalt, *p = '\0'; + p++; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + + // update PL and GT + int ipl=-1, igt=-1; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) ipl = i; + if (g->fmt == bcf_str2int("GT", 2)) igt = i; + } + + // .. create mapping between old and new indexes + int npl = nals * (nals+1) / 2; + int *map = malloc(sizeof(int)*(npl>b->n_alleles ? 
npl : b->n_alleles)); + int kori=0,knew=0; + for (i=0; in_alleles; i++) + { + for (j=0; j<=i; j++) + { + int skip=0; + if ( i && !(mask&1<n_smpl; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) + { + g->len = npl; + uint8_t *d = (uint8_t*)g->data; + int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2; + for (knew=ismpl=0; ismpln_alleles; i++) + map[i] = mask&1<gi[igt].data)[i]; + int a1 = (gt>>3)&7; + int a2 = gt&7; + assert( map[a1]>=0 && map[a2]>=0 ); + ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2]; + } + free(map); + b->n_alleles = nals; + bcf_sync(b); +} + int bcf_shrink_alt(bcf1_t *b, int n) { char *p; @@ -133,7 +240,7 @@ int bcf_fix_gt(bcf1_t *b) bcf_ginfo_t gt; // check the presence of the GT FMT if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first - if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact + assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact tmp = bcf_str2int("GT", 2); for (i = 0; i < b->n_gi; ++i) if (b->gi[i].fmt == tmp) break; @@ -142,7 +249,10 @@ int bcf_fix_gt(bcf1_t *b) // move GT to the first for (; i > 0; --i) b->gi[i] = b->gi[i-1]; b->gi[0] = gt; - memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt); + if ( s[3]==0 ) + memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT + else + memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT: b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; return 0; } @@ -287,7 +397,11 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int kputs(samples[i], &s); kputc('\0', &s); } } - if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + if (j < n) + { + fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + exit(1); + } kh_destroy(str2id, hash); h = calloc(1, sizeof(bcf_hdr_t)); *h = *h0; diff --git a/sam/bcftools/call1.c b/sam/bcftools/call1.c index 3cc4649..e6373d3 100644 --- a/sam/bcftools/call1.c +++ 
b/sam/bcftools/call1.c @@ -33,13 +33,14 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_EM 0x10000 #define VC_PAIRCALL 0x20000 #define VC_QCNT 0x40000 +#define VC_INDEL_ONLY 0x80000 typedef struct { int flag, prior_type, n1, n_sub, *sublist, n_perm; uint32_t *trio_aux; char *prior_file, **subsam, *fn_dict; uint8_t *ploidy; - double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt; + double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt, min_ma_lrt; void *bed; } viewconf_t; @@ -47,11 +48,6 @@ void *bed_read(const char *fn); void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); -typedef struct { - double p[4]; - int mq, depth, is_tested, d[4]; -} anno16_t; - static double ttest(int n1, int n2, int a[4]) { extern double kf_betai(double a, double b, double x); @@ -82,7 +78,7 @@ static int test16_core(int anno[16], anno16_t *a) return 0; } -static int test16(bcf1_t *b, anno16_t *a) +int test16(bcf1_t *b, anno16_t *a) { char *p; int i, anno[16]; @@ -99,17 +95,6 @@ static int test16(bcf1_t *b, anno16_t *a) return test16_core(anno, a); } -static void rm_info(bcf1_t *b, const char *key) -{ - char *p, *q; - if ((p = strstr(b->info, key)) == 0) return; - for (q = p; *q && *q != ';'; ++q); - if (p > b->info && *(p-1) == ';') --p; - memmove(p, q, b->l_str - (q - b->str)); - b->l_str -= q - p; - bcf_sync(b); -} - static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt) { kstring_t s; @@ -118,7 +103,7 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, anno16_t a; has_I16 = test16(b, &a) >= 0? 1 : 0; - rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! + //rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! 
memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); @@ -169,6 +154,8 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, } if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); kputc('\0', &s); + rm_info(&s, "QS="); + rm_info(&s, "I16="); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; @@ -203,7 +190,27 @@ static char **read_samples(const char *fn, int *_n) *_n = 0; s.l = s.m = 0; s.s = 0; fp = gzopen(fn, "r"); - if (fp == 0) return 0; // fail to open file + if (fp == 0) + { + // interpret as sample names, not as a file name + const char *t = fn, *p = t; + while (*t) + { + t++; + if ( *t==',' || !*t ) + { + sam = realloc(sam, sizeof(void*)*(n+1)); + sam[n] = (char*) malloc(sizeof(char)*(t-p+2)); + memcpy(sam[n], p, t-p); + sam[n][t-p] = 0; + sam[n][t-p+1] = 2; // assume diploid + p = t+1; + n++; + } + } + *_n = n; + return sam; // fail to open file + } ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { int l; @@ -249,6 +256,12 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + //if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); + if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT== 0) { + while ((c = getopt(argc, argv, 
"FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:K:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; - case 'l': vc.bed = bed_read(optarg); break; + case 'l': vc.bed = bed_read(optarg); if (!vc.bed) { fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; } break; case 'D': vc.fn_dict = strdup(optarg); break; case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; @@ -335,8 +361,10 @@ int bcfview(int argc, char *argv[]) case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 'I': vc.flag |= VC_NO_INDEL; break; + case 'w': vc.flag |= VC_INDEL_ONLY; break; case 'M': vc.flag |= VC_ANNO_MAX; break; case 'Y': vc.flag |= VC_QCNT; break; + case 'm': vc.min_ma_lrt = atof(optarg); break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; case 'i': vc.indel_frac = atof(optarg); break; @@ -346,6 +374,7 @@ int bcfview(int argc, char *argv[]) case 'C': vc.min_lrt = atof(optarg); break; case 'X': vc.min_perm_p = atof(optarg); break; case 'd': vc.min_smpl_frac = atof(optarg); break; + case 'K': bcf_p1_fp_lk = gzopen(optarg, "w"); break; case 's': vc.subsam = read_samples(optarg, &vc.n_sub); vc.ploidy = calloc(vc.n_sub + 1, 1); for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; @@ -392,6 +421,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -m FLOAT alternative model for multiallelic and rare-variant calling, include if P(chi^2)>=FLOAT\n"); fprintf(stderr, " -p FLOAT variant if P(ref|D) 0) { int is_indel, cons_llr = -1; int64_t cons_gt = -1; @@ -482,6 +516,7 @@ int bcfview(int argc, char *argv[]) if (vc.flag & VC_FIX_PL) bcf_fix_pl(b); is_indel = bcf_is_indel(b); if ((vc.flag & VC_NO_INDEL) && is_indel) continue; + 
if ((vc.flag & VC_INDEL_ONLY) && !is_indel) continue; if ((vc.flag & VC_ACGT_ONLY) && !is_indel) { int x; if (b->ref[0] == 0 || b->ref[1] != 0) continue; @@ -515,9 +550,19 @@ int bcfview(int argc, char *argv[]) int i; for (i = 0; i < 9; ++i) em[i] = -1.; } - if (vc.flag & VC_CALL) { // call variants + if ( !(vc.flag&VC_KEEPALT) && (vc.flag&VC_CALL) && vc.min_ma_lrt>=0 ) + { + bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions + int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY); + if ( gts<=1 && vc.flag & VC_VARONLY ) continue; + } + else if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; - int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); + int calret; + gzwrite(bcf_p1_fp_lk, &b->tid, 4); + gzwrite(bcf_p1_fp_lk, &b->pos, 4); + gzwrite(bcf_p1_fp_lk, &em[0], sizeof(double)); + calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); if (n_processed % 100000 == 0) { fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); bcf_p1_dump_afs(p1); @@ -562,6 +607,8 @@ int bcfview(int argc, char *argv[]) } else bcf_fix_gt(b); vcf_write(bout, hout, b); } + + if (bcf_p1_fp_lk) gzclose(bcf_p1_fp_lk); if (vc.prior_file) free(vc.prior_file); if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); if (hin != hout) bcf_hdr_destroy(hout); diff --git a/sam/bcftools/index.c b/sam/bcftools/index.c index 014856d..a7db24f 100644 --- a/sam/bcftools/index.c +++ b/sam/bcftools/index.c @@ -259,6 +259,7 @@ int bcf_idx_build2(const char *fn, const char *_fnidx) if (fpidx == 0) { fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n"); free(fnidx); + bcf_idx_destroy(idx); return -1; } bcf_idx_save(idx, fpidx); diff --git a/sam/bcftools/main.c b/sam/bcftools/main.c index fcd94b8..eda6217 100644 --- a/sam/bcftools/main.c +++ b/sam/bcftools/main.c @@ -2,6 +2,7 @@ #include #include #include +#include "knetfile.h" #include "bcf.h" #include "kseq.h" @@ -29,12 +30,12 @@ 
int bcf_cat(int n, char * const *fn) if (i == 0) bcf_hdr_write(out, h); bcf_hdr_destroy(h); #ifdef _USE_KNETFILE - fstat(knet_fileno(in->fp->x.fpr), &s); + fstat(knet_fileno((knetFile*)in->fp->fp), &s); end = s.st_size - 28; - while (knet_tell(in->fp->x.fpr) < end) { - int size = knet_tell(in->fp->x.fpr) + BUF_SIZE < end? BUF_SIZE : end - knet_tell(in->fp->x.fpr); - knet_read(in->fp->x.fpr, buf, size); - fwrite(buf, 1, size, out->fp->x.fpw); + while (knet_tell((knetFile*)in->fp->fp) < end) { + int size = knet_tell((knetFile*)in->fp->fp) + BUF_SIZE < end? BUF_SIZE : end - knet_tell((knetFile*)in->fp->fp); + knet_read(in->fp->fp, buf, size); + fwrite(buf, 1, size, out->fp->fp); } #else abort(); // FIXME: not implemented diff --git a/sam/bcftools/prob1.c b/sam/bcftools/prob1.c index a380484..3539ee3 100644 --- a/sam/bcftools/prob1.c +++ b/sam/bcftools/prob1.c @@ -4,7 +4,10 @@ #include #include #include +#include +#include #include "prob1.h" +#include "kstring.h" #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -13,6 +16,8 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define MC_EM_EPS 1e-5 #define MC_DEF_INDEL 0.15 +gzFile bcf_p1_fp_lk; + unsigned char seq_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -163,6 +168,8 @@ bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy) return ma; } +int bcf_p1_get_M(bcf_p1aux_t *b) { return b->M; } + int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) { if (n1 == 0 || n1 >= b->n) return -1; @@ -174,6 +181,13 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) return 0; } +void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma) +{ + // bcf_p1aux_t fields are not visible outside of prob1.c, hence this wrapper. 
+ // Ideally, this should set ploidy per site to allow pseudo-autosomal regions + b->ploidy = ma->ploidy; +} + void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { @@ -191,28 +205,446 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) } } +extern double kf_gammap(double s, double z); +int test16(bcf1_t *b, anno16_t *a); + +// Wigginton 2005, PMID: 15789306 +// written by Jan Wigginton +double calc_hwe(int obs_hom1, int obs_hom2, int obs_hets) +{ + if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1; + + assert(obs_hom1 >= 0 && obs_hom2 >= 0 && obs_hets >= 0); + + int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; + int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2; + + int rare_copies = 2 * obs_homr + obs_hets; + int genotypes = obs_hets + obs_homc + obs_homr; + + double *het_probs = (double*) calloc(rare_copies+1, sizeof(double)); + + /* start at midpoint */ + int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); + + /* check to ensure that midpoint and rare alleles have same parity */ + if ((rare_copies & 1) ^ (mid & 1)) mid++; + + int curr_hets = mid; + int curr_homr = (rare_copies - mid) / 2; + int curr_homc = genotypes - curr_hets - curr_homr; + + het_probs[mid] = 1.0; + double sum = het_probs[mid]; + for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) + { + het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); + sum += het_probs[curr_hets - 2]; + + /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ + curr_homr++; + curr_homc++; + } + + curr_hets = mid; + curr_homr = (rare_copies - mid) / 2; + curr_homc = genotypes - curr_hets - curr_homr; + for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) + { + het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0)); + sum += het_probs[curr_hets + 2]; + + /* add 2 heterozygotes for next iteration -> subtract one rare, 
one common homozygote */ + curr_homr--; + curr_homc--; + } + int i; + for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum; + + /* p-value calculation for p_hwe */ + double p_hwe = 0.0; + for (i = 0; i <= rare_copies; i++) + { + if (het_probs[i] > het_probs[obs_hets]) + continue; + p_hwe += het_probs[i]; + } + + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + free(het_probs); + return p_hwe; + +} + + +static void _bcf1_set_ref(bcf1_t *b, int idp) +{ + kstring_t s; + int old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + // Call GTs + int isample, an = 0; + for (isample = 0; isample < b->n_smpl; isample++) + { + if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; + else + { + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 0; + an += b->ploidy ? b->ploidy[isample] : 2; + } + } + bcf_fit_alt(b,1); + b->qual = 999; + + // Prepare BCF for output: ref, alt, filter, info, format + memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); + kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + { + ksprintf(&s, "AN=%d;", an); + kputs(b->info, &s); + anno16_t a; + int has_I16 = test16(b, &a) >= 0? 1 : 0; + if (has_I16 ) + { + if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + } + kputc('\0', &s); + rm_info(&s, "I16="); + rm_info(&s, "QS="); + } + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); +} + +int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only) +{ + int nals = 1; + char *p; + for (p=b->alt; *p; p++) + { + if ( *p=='X' || p[0]=='.' 
) break; + if ( p[0]==',' ) nals++; + } + if ( b->alt[0] && !*p ) nals++; + + if ( nals>4 ) + { + if ( *b->ref=='N' ) return 0; + fprintf(stderr,"Not ready for this, more than 4 alleles at %d: %s, %s\n", b->pos+1, b->ref,b->alt); + exit(1); + } + + // find PL, DV and DP FORMAT indexes + uint8_t *pl = NULL; + int i, npl = 0, idp = -1, idv = -1; + for (i = 0; i < b->n_gi; ++i) + { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) + { + pl = (uint8_t*)b->gi[i].data; + npl = b->gi[i].len; + } + else if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + else if (b->gi[i].fmt == bcf_str2int("DV", 2)) idv=i; + } + if ( nals==1 ) + { + if ( !var_only ) _bcf1_set_ref(b, idp); + return 1; + } + if ( !pl ) return -1; + + assert(ma->q2p[0] == 1); + + // Init P(D|G) + int npdg = nals*(nals+1)/2; + double *pdg,*_pdg; + _pdg = pdg = malloc(sizeof(double)*ma->n*npdg); + for (i=0; in; i++) + { + int j; + double sum = 0; + for (j=0; jq2p[pl[j]]; + sum += _pdg[j]; + } + if ( sum ) + for (j=0; jinfo, "QS=")) == 0) { fprintf(stderr,"INFO/QS is required with -m, exiting\n"); exit(1); } + double qsum[4]; + if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); } + + + // Calculate the most likely combination of alleles, remembering the most and second most likely set + int ia,ib,ic, max_als=0, max_als2=0; + double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN, lk_sums[3]; + for (ia=0; ian; isample++) + { + double *p = pdg + isample*npdg; + // assert( log(p[iaa]) <= 0 ); + lk_tot += log(p[iaa]); + } + if ( ia==0 ) ref_lk = lk_tot; + if ( max_lklk_sum ? 
lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + lk_sums[0] = lk_sum; + if ( nals>1 ) + { + for (ia=0; ian; isample++) + { + double *p = pdg + isample*npdg; + //assert( log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]) <= 0 ); + if ( b->ploidy && b->ploidy[isample]==1 ) + lk_tot += log(fa*p[iaa] + fb*p[ibb]); + else + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]); + } + if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + } + lk_sums[1] = lk_sum; + } + if ( nals>2 ) + { + for (ia=0; ian; isample++) + { + double *p = pdg + isample*npdg; + //assert( log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]) <= 0 ); + if ( b->ploidy && b->ploidy[isample]==1 ) + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc]); + else + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]); + } + if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + } + } + lk_sums[2] = lk_sum; + } + + // Should we add another allele, does it increase the likelihood significantly? + int n1=0, n2=0; + for (i=0; in_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT:GQ", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + // Call GTs + int isample, gts=0, ac[4] = {0,0,0,0}; + int nRR = 0, nAA = 0, nRA = 0, max_dv = 0; + for (isample = 0; isample < b->n_smpl; isample++) + { + int ploidy = b->ploidy ? 
b->ploidy[isample] : 2; + double *p = pdg + isample*npdg; + int ia, als = 0; + double lk = 0, lk_s = 0; + for (ia=0; ia lk ) { lk = _lk; als = ia<<3 | ia; } + lk_s += _lk; + } + if ( ploidy==2 ) + { + for (ia=0; ia lk ) { lk = _lk; als = ib<<3 | ia; } + lk_s += _lk; + } + } + } + lk = -log(1-lk/lk_s)/0.2302585; + int dp = 0; + if ( idp>=0 && (dp=((uint16_t*)b->gi[idp].data)[isample])==0 ) + { + // no coverage + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0; + continue; + } + if ( lk>99 ) lk = 99; + ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = (int)lk; + + // For MDV annotation + int dv; + if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) ) + { + if ( max_dv < dv ) max_dv = dv; + } + + // For HWE annotation; multiple ALT alleles treated as one + if ( !als ) nRR++; + else if ( !(als>>3&7) || !(als&7) ) nRA++; + else nAA++; + + gts |= 1<<(als>>3&7) | 1<<(als&7); + ac[ als>>3&7 ]++; + ac[ als&7 ]++; + } + free(pdg); + bcf_fit_alt(b,max_als); + + // The VCF spec is ambiguous about QUAL: is it the probability of anything else + // (that is QUAL(non-ref) = P(ref)+P(any non-ref other than ALT)) or is it + // QUAL(non-ref)=P(ref) and QUAL(ref)=1-P(ref)? Assuming the latter. + b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*log(1-exp(ref_lk - lk_sum)); + if ( b->qual>999 ) b->qual = 999; + + // Prepare BCF for output: ref, alt, filter, info, format + memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); + kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + { + int an=0, nalts=0; + for (i=0; i0 && ac[i] ) nalts++; + } + ksprintf(&s, "AN=%d;", an); + if ( nalts ) + { + kputs("AC=", &s); + for (i=1; i0 ) kputc(',', &s); + } + kputc(';', &s); + } + kputs(b->info, &s); + anno16_t a; + int has_I16 = test16(b, &a) >= 0? 
1 : 0; + if (has_I16 ) + { + if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3])); + if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv); + } + if ( nAA+nRA ) + { + double hwe = calc_hwe(nAA, nRR, nRA); + ksprintf(&s, ";HWE=%e", hwe); + } + kputc('\0', &s); + rm_info(&s, "I16="); + rm_info(&s, "QS="); + } + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + return gts; +} + static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) { - int i, j; - long *p, tmp; - p = alloca(b->n_alleles * sizeof(long)); - memset(p, 0, sizeof(long) * b->n_alleles); - for (j = 0; j < ma->n; ++j) { - const uint8_t *pi = ma->PL + j * ma->PL_len; - double *pdg = ma->pdg + j * 3; - pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; - for (i = 0; i < b->n_alleles; ++i) - p[i] += (int)pi[(i+1)*(i+2)/2-1]; - } - for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; - for (i = 1; i < b->n_alleles; ++i) // insertion sort - for (j = i; j > 0 && p[j] < p[j-1]; --j) - tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; - for (i = b->n_alleles - 1; i >= 0; --i) - if ((p[i]&0xf) == 0) break; - return i; + int i, j; + long *p, tmp; + p = alloca(b->n_alleles * sizeof(long)); + memset(p, 0, sizeof(long) * b->n_alleles); + for (j = 0; j < ma->n; ++j) { + const uint8_t *pi = ma->PL + j * ma->PL_len; + double *pdg = ma->pdg + j * 3; + pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; + for (i = 0; i < b->n_alleles; ++i) + p[i] += (int)pi[(i+1)*(i+2)/2-1]; + } + for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; + for (i = 1; i < b->n_alleles; ++i) // insertion sort + for (j = i; j > 0 && p[j] < p[j-1]; --j) + tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; + for (i = b->n_alleles - 1; i >= 0; --i) + if ((p[i]&0xf) == 0) break; 
+ return i; } + int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; @@ -322,6 +754,8 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) } } if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); + if (bcf_p1_fp_lk) + gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1)); } static void mc_cal_y(bcf_p1aux_t *ma) diff --git a/sam/bcftools/prob1.h b/sam/bcftools/prob1.h index 0a51a0a..6f93155 100644 --- a/sam/bcftools/prob1.h +++ b/sam/bcftools/prob1.h @@ -14,6 +14,11 @@ typedef struct { double cmp[3], p_chi2, lrt; // used by contrast2() } bcf_p1rst_t; +typedef struct { + double p[4]; + int mq, depth, is_tested, d[4]; +} anno16_t; + #define MC_PTYPE_FULL 1 #define MC_PTYPE_COND2 2 #define MC_PTYPE_FLAT 3 @@ -26,7 +31,9 @@ extern "C" { void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_destroy(bcf_p1aux_t *ma); + void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma); int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); + int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only); int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); diff --git a/sam/bcftools/vcf.c b/sam/bcftools/vcf.c index 9daa845..e8526a3 100644 --- a/sam/bcftools/vcf.c +++ b/sam/bcftools/vcf.c @@ -30,7 +30,12 @@ bcf_hdr_t *vcf_hdr_read(bcf_t *bp) memset(&smpl, 0, sizeof(kstring_t)); while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) { if (v->line.l < 2) continue; - if (v->line.s[0] != '#') return 0; // no sample line + if (v->line.s[0] != '#') { + free(meta.s); + free(smpl.s); + free(h); + return 0; // no sample line + } if (v->line.s[0] == '#' && v->line.s[1] == '#') { kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta); } else if (v->line.s[0] == '#') { @@ -186,7 +191,7 @@ int vcf_read(bcf_t 
*bp, bcf_hdr_t *h, bcf1_t *b) ((uint8_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { ((int32_t*)b->gi[i].data)[k-9] = 0; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { ((uint16_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { int y = b->n_alleles * (b->n_alleles + 1) / 2; @@ -210,7 +215,7 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; ((uint32_t*)b->gi[i].data)[k-9] = x; - } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { + } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; ((uint16_t*)b->gi[i].data)[k-9] = x; diff --git a/sam/bedcov.c b/sam/bedcov.c new file mode 100644 index 0000000..3e4b952 --- /dev/null +++ b/sam/bedcov.c @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include +#include +#include "kstring.h" +#include "bgzf.h" +#include "bam.h" + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +typedef struct { + bamFile fp; + bam_iter_t iter; + int min_mapQ; +} aux_t; + +static int read_bam(void *data, bam1_t *b) +{ + aux_t *aux = (aux_t*)data; + int ret = bam_iter_read(aux->fp, aux->iter, b); + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + return ret; +} + +int main_bedcov(int argc, char *argv[]) +{ + extern void bam_init_header_hash(bam_header_t*); + gzFile fp; + kstring_t str; + kstream_t *ks; + bam_index_t **idx; + bam_header_t *h = 0; + aux_t **aux; + int *n_plp, dret, i, n, c, min_mapQ = 0; + int64_t *cnt; + const bam_pileup1_t **plp; + + while ((c = getopt(argc, argv, "Q:")) >= 0) { + switch (c) { + case 'Q': min_mapQ = atoi(optarg); break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "Usage: samtools bedcov [...]\n"); + return 1; + } + memset(&str, 0, sizeof(kstring_t)); + n = argc - 
optind - 1; + aux = calloc(n, sizeof(void*)); + idx = calloc(n, sizeof(void*)); + for (i = 0; i < n; ++i) { + aux[i] = calloc(1, sizeof(aux_t)); + aux[i]->min_mapQ = min_mapQ; + aux[i]->fp = bam_open(argv[i+optind+1], "r"); + idx[i] = bam_index_load(argv[i+optind+1]); + if (aux[i]->fp == 0 || idx[i] == 0) { + fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); + return 2; + } + bgzf_set_cache_size(aux[i]->fp, 20); + if (i == 0) h = bam_header_read(aux[0]->fp); + } + bam_init_header_hash(h); + cnt = calloc(n, 8); + + fp = gzopen(argv[optind], "rb"); + ks = ks_init(fp); + n_plp = calloc(n, sizeof(int)); + plp = calloc(n, sizeof(void*)); + while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { + char *p, *q; + int tid, beg, end, pos; + bam_mplp_t mplp; + + for (p = q = str.s; *p && *p != '\t'; ++p); + if (*p != '\t') goto bed_error; + *p = 0; tid = bam_get_tid(h, q); *p = '\t'; + if (tid < 0) goto bed_error; + for (q = p = p + 1; isdigit(*p); ++p); + if (*p != '\t') goto bed_error; + *p = 0; beg = atoi(q); *p = '\t'; + for (q = p = p + 1; isdigit(*p); ++p); + if (*p == '\t' || *p == 0) { + int c = *p; + *p = 0; end = atoi(q); *p = c; + } else goto bed_error; + + for (i = 0; i < n; ++i) { + if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); + aux[i]->iter = bam_iter_query(idx[i], tid, beg, end); + } + mplp = bam_mplp_init(n, read_bam, (void**)aux); + bam_mplp_set_maxcnt(mplp, 64000); + memset(cnt, 0, 8 * n); + while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) + if (pos >= beg && pos < end) + for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; + for (i = 0; i < n; ++i) { + kputc('\t', &str); + kputl(cnt[i], &str); + } + puts(str.s); + bam_mplp_destroy(mplp); + continue; + +bed_error: + fprintf(stderr, "Errors in BED line '%s'\n", str.s); + } + free(n_plp); free(plp); + ks_destroy(ks); + gzclose(fp); + + free(cnt); + for (i = 0; i < n; ++i) { + if (aux[i]->iter) bam_iter_destroy(aux[i]->iter); + bam_index_destroy(idx[i]); + 
bam_close(aux[i]->fp); + free(aux[i]); + } + bam_header_destroy(h); + free(aux); free(idx); + free(str.s); + return 0; +} diff --git a/sam/bgzf.c b/sam/bgzf.c index 216cd04..880d5af 100644 --- a/sam/bgzf.c +++ b/sam/bgzf.c @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -21,400 +22,235 @@ THE SOFTWARE. */ -/* - 2009-06-29 by lh3: cache recent uncompressed blocks. - 2009-06-25 by lh3: optionally use my knetfile library to access file on a FTP. - 2009-06-12 by lh3: support a mode string like "wu" where 'u' for uncompressed output */ - #include #include #include #include -#include +#include +#include #include -#include #include "bgzf.h" -#include "khash.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +typedef knetFile *_bgzf_file_t; +#define _bgzf_open(fn, mode) knet_open(fn, mode) +#define _bgzf_dopen(fp, mode) knet_dopen(fp, mode) +#define _bgzf_close(fp) knet_close(fp) +#define _bgzf_fileno(fp) ((fp)->fd) +#define _bgzf_tell(fp) knet_tell(fp) +#define _bgzf_seek(fp, offset, whence) knet_seek(fp, offset, whence) +#define _bgzf_read(fp, buf, len) knet_read(fp, buf, len) +#define _bgzf_write(fp, buf, len) knet_write(fp, buf, len) +#else // ~defined(_USE_KNETFILE) +#if defined(_WIN32) || defined(_MSC_VER) +#define ftello(fp) ftell(fp) +#define fseeko(fp, offset, whence) fseek(fp, offset, whence) +#else // ~defined(_WIN32) +extern off_t ftello(FILE *stream); +extern int fseeko(FILE *stream, off_t offset, int whence); +#endif // ~defined(_WIN32) +typedef FILE *_bgzf_file_t; +#define _bgzf_open(fn, mode) fopen(fn, mode) +#define _bgzf_dopen(fp, mode) fdopen(fp, mode) +#define _bgzf_close(fp) fclose(fp) +#define _bgzf_fileno(fp) fileno(fp) +#define _bgzf_tell(fp) ftello(fp) +#define _bgzf_seek(fp, offset, whence) fseeko(fp, offset, 
whence) +#define _bgzf_read(fp, buf, len) fread(buf, 1, len, fp) +#define _bgzf_write(fp, buf, len) fwrite(buf, 1, len, fp) +#endif // ~define(_USE_KNETFILE) + +#define BLOCK_HEADER_LENGTH 18 +#define BLOCK_FOOTER_LENGTH 8 + + +/* BGZF/GZIP header (speciallized from RFC 1952; little endian): + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | 31|139| 8| 4| 0| 0|255| 6| 66| 67| 2|BLK_LEN| + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ +*/ +static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; + +#ifdef BGZF_CACHE typedef struct { int size; uint8_t *block; int64_t end_offset; } cache_t; +#include "khash.h" KHASH_MAP_INIT_INT64(cache, cache_t) - -#if defined(_WIN32) || defined(_MSC_VER) -#define ftello(fp) ftell(fp) -#define fseeko(fp, offset, whence) fseek(fp, offset, whence) -#else -extern off_t ftello(FILE *stream); -extern int fseeko(FILE *stream, off_t offset, int whence); #endif -typedef int8_t bgzf_byte_t; - -static const int DEFAULT_BLOCK_SIZE = 64 * 1024; -static const int MAX_BLOCK_SIZE = 64 * 1024; - -static const int BLOCK_HEADER_LENGTH = 18; -static const int BLOCK_FOOTER_LENGTH = 8; - -static const int GZIP_ID1 = 31; -static const int GZIP_ID2 = 139; -static const int CM_DEFLATE = 8; -static const int FLG_FEXTRA = 4; -static const int OS_UNKNOWN = 255; -static const int BGZF_ID1 = 66; // 'B' -static const int BGZF_ID2 = 67; // 'C' -static const int BGZF_LEN = 2; -static const int BGZF_XLEN = 6; // BGZF_LEN+4 - -static const int GZIP_WINDOW_BITS = -15; // no zlib header -static const int Z_DEFAULT_MEM_LEVEL = 8; - - -inline -void -packInt16(uint8_t* buffer, uint16_t value) +static inline void packInt16(uint8_t *buffer, uint16_t value) { - buffer[0] = value; - buffer[1] = value >> 8; + buffer[0] = value; + buffer[1] = value >> 8; } -inline -int -unpackInt16(const uint8_t* buffer) +static inline int unpackInt16(const uint8_t *buffer) { - return (buffer[0] | (buffer[1] 
<< 8)); + return buffer[0] | buffer[1] << 8; } -inline -void -packInt32(uint8_t* buffer, uint32_t value) +static inline void packInt32(uint8_t *buffer, uint32_t value) { - buffer[0] = value; - buffer[1] = value >> 8; - buffer[2] = value >> 16; - buffer[3] = value >> 24; + buffer[0] = value; + buffer[1] = value >> 8; + buffer[2] = value >> 16; + buffer[3] = value >> 24; } -static inline -int -bgzf_min(int x, int y) -{ - return (x < y) ? x : y; -} - -static -void -report_error(BGZF* fp, const char* message) { - fp->error = message; -} - -int bgzf_check_bgzf(const char *fn) +static BGZF *bgzf_read_init() { - BGZF *fp; - uint8_t buf[10],magic[10]="\037\213\010\4\0\0\0\0\0\377"; - int n; - - if ((fp = bgzf_open(fn, "r")) == 0) - { - fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn); - return -1; - } - -#ifdef _USE_KNETFILE - n = knet_read(fp->x.fpr, buf, 10); -#else - n = fread(buf, 1, 10, fp->file); + BGZF *fp; + fp = calloc(1, sizeof(BGZF)); + fp->is_write = 0; + fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); +#ifdef BGZF_CACHE + fp->cache = kh_init(cache); #endif - bgzf_close(fp); - - if ( n!=10 ) - return -1; - - if ( !memcmp(magic, buf, 10) ) return 1; - return 0; + return fp; } -static BGZF *bgzf_read_init() +static BGZF *bgzf_write_init(int compress_level) // compress_level==-1 for the default level { BGZF *fp; fp = calloc(1, sizeof(BGZF)); - fp->uncompressed_block_size = MAX_BLOCK_SIZE; - fp->uncompressed_block = malloc(MAX_BLOCK_SIZE); - fp->compressed_block_size = MAX_BLOCK_SIZE; - fp->compressed_block = malloc(MAX_BLOCK_SIZE); - fp->cache_size = 0; - fp->cache = kh_init(cache); + fp->is_write = 1; + fp->uncompressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compressed_block = malloc(BGZF_MAX_BLOCK_SIZE); + fp->compress_level = compress_level < 0? 
Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 + if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; return fp; } - -static -BGZF* -open_read(int fd) +// get the compress level from the mode string +static int mode2level(const char *__restrict mode) { -#ifdef _USE_KNETFILE - knetFile *file = knet_dopen(fd, "r"); -#else - FILE* file = fdopen(fd, "r"); -#endif - BGZF* fp; - if (file == 0) return 0; - fp = bgzf_read_init(); - fp->file_descriptor = fd; - fp->open_mode = 'r'; -#ifdef _USE_KNETFILE - fp->x.fpr = file; -#else - fp->file = file; -#endif - return fp; + int i, compress_level = -1; + for (i = 0; mode[i]; ++i) + if (mode[i] >= '0' && mode[i] <= '9') break; + if (mode[i]) compress_level = (int)mode[i] - '0'; + if (strchr(mode, 'u')) compress_level = 0; + return compress_level; } -static -BGZF* -open_write(int fd, int compress_level) // compress_level==-1 for the default level +BGZF *bgzf_open(const char *path, const char *mode) { - FILE* file = fdopen(fd, "w"); - BGZF* fp; - if (file == 0) return 0; - fp = malloc(sizeof(BGZF)); - fp->file_descriptor = fd; - fp->open_mode = 'w'; - fp->owned_file = 0; - fp->compress_level = compress_level < 0? Z_DEFAULT_COMPRESSION : compress_level; // Z_DEFAULT_COMPRESSION==-1 - if (fp->compress_level > 9) fp->compress_level = Z_DEFAULT_COMPRESSION; -#ifdef _USE_KNETFILE - fp->x.fpw = file; -#else - fp->file = file; -#endif - fp->uncompressed_block_size = DEFAULT_BLOCK_SIZE; - fp->uncompressed_block = NULL; - fp->compressed_block_size = MAX_BLOCK_SIZE; - fp->compressed_block = malloc(MAX_BLOCK_SIZE); - fp->block_address = 0; - fp->block_offset = 0; - fp->block_length = 0; - fp->error = NULL; - return fp; -} - -BGZF* -bgzf_open(const char* __restrict path, const char* __restrict mode) -{ - BGZF* fp = NULL; - if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. 
*/ -#ifdef _USE_KNETFILE - knetFile *file = knet_open(path, mode); - if (file == 0) return 0; + BGZF *fp = 0; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r') || strchr(mode, 'R')) { + _bgzf_file_t fpr; + if ((fpr = _bgzf_open(path, "r")) == 0) return 0; fp = bgzf_read_init(); - fp->file_descriptor = -1; - fp->open_mode = 'r'; - fp->x.fpr = file; -#else - int fd, oflag = O_RDONLY; -#ifdef _WIN32 - oflag |= O_BINARY; -#endif - fd = open(path, oflag); - if (fd == -1) return 0; - fp = open_read(fd); -#endif - } else if (strchr(mode, 'w') || strchr(mode, 'W')) { - int fd, compress_level = -1, oflag = O_WRONLY | O_CREAT | O_TRUNC; -#ifdef _WIN32 - oflag |= O_BINARY; -#endif - fd = open(path, oflag, 0666); - if (fd == -1) return 0; - { // set compress_level - int i; - for (i = 0; mode[i]; ++i) - if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = (int)mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = 0; - } - fp = open_write(fd, compress_level); - } - if (fp != NULL) fp->owned_file = 1; - return fp; -} - -BGZF* -bgzf_fdopen(int fd, const char * __restrict mode) -{ - if (fd == -1) return 0; - if (mode[0] == 'r' || mode[0] == 'R') { - return open_read(fd); - } else if (mode[0] == 'w' || mode[0] == 'W') { - int i, compress_level = -1; - for (i = 0; mode[i]; ++i) - if (mode[i] >= '0' && mode[i] <= '9') break; - if (mode[i]) compress_level = (int)mode[i] - '0'; - if (strchr(mode, 'u')) compress_level = 0; - return open_write(fd, compress_level); - } else { - return NULL; - } + fp->fp = fpr; + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { + FILE *fpw; + if ((fpw = fopen(path, "w")) == 0) return 0; + fp = bgzf_write_init(mode2level(mode)); + fp->fp = fpw; + } + return fp; } -static -int -deflate_block(BGZF* fp, int block_length) -{ - // Deflate the block in fp->uncompressed_block into fp->compressed_block. - // Also adds an extra field that stores the compressed block length. 
- - bgzf_byte_t* buffer = fp->compressed_block; - int buffer_size = fp->compressed_block_size; - - // Init gzip header - buffer[0] = GZIP_ID1; - buffer[1] = GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[4] = 0; // mtime - buffer[5] = 0; - buffer[6] = 0; - buffer[7] = 0; - buffer[8] = 0; - buffer[9] = OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[11] = 0; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - buffer[15] = 0; - buffer[16] = 0; // placeholder for block length - buffer[17] = 0; - - // loop to retry for blocks that do not compress enough - int input_length = block_length; - int compressed_length = 0; - while (1) { - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = fp->uncompressed_block; - zs.avail_in = input_length; - zs.next_out = (void*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = buffer_size - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - int status = deflateInit2(&zs, fp->compress_level, Z_DEFLATED, - GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY); - if (status != Z_OK) { - report_error(fp, "deflate init failed"); - return -1; - } - status = deflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - deflateEnd(&zs); - if (status == Z_OK) { - // Not enough space in buffer. - // Can happen in the rare case the input doesn't compress enough. - // Reduce the amount of input until it fits. 
- input_length -= 1024; - if (input_length <= 0) { - // should never happen - report_error(fp, "input reduction failed"); - return -1; - } - continue; - } - report_error(fp, "deflate failed"); - return -1; - } - status = deflateEnd(&zs); - if (status != Z_OK) { - report_error(fp, "deflate end failed"); - return -1; - } - compressed_length = zs.total_out; - compressed_length += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if (compressed_length > MAX_BLOCK_SIZE) { - // should never happen - report_error(fp, "deflate overflow"); - return -1; - } - break; - } - - packInt16((uint8_t*)&buffer[16], compressed_length-1); - uint32_t crc = crc32(0L, NULL, 0L); - crc = crc32(crc, fp->uncompressed_block, input_length); - packInt32((uint8_t*)&buffer[compressed_length-8], crc); - packInt32((uint8_t*)&buffer[compressed_length-4], input_length); - - int remaining = block_length - input_length; - if (remaining > 0) { - if (remaining > input_length) { - // should never happen (check so we can use memcpy) - report_error(fp, "remainder too large"); - return -1; - } - memcpy(fp->uncompressed_block, - fp->uncompressed_block + input_length, - remaining); - } - fp->block_offset = remaining; - return compressed_length; +BGZF *bgzf_dopen(int fd, const char *mode) +{ + BGZF *fp = 0; + assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); + if (strchr(mode, 'r') || strchr(mode, 'R')) { + _bgzf_file_t fpr; + if ((fpr = _bgzf_dopen(fd, "r")) == 0) return 0; + fp = bgzf_read_init(); + fp->fp = fpr; + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { + FILE *fpw; + if ((fpw = fdopen(fd, "w")) == 0) return 0; + fp = bgzf_write_init(mode2level(mode)); + fp->fp = fpw; + } + return fp; } -static -int -inflate_block(BGZF* fp, int block_length) +static int bgzf_compress(void *_dst, int *dlen, void *src, int slen, int level) { - // Inflate the block in fp->compressed_block into fp->uncompressed_block + uint32_t crc; + z_stream zs; + uint8_t *dst = (uint8_t*)_dst; + + // compress the body + 
zs.zalloc = NULL; zs.zfree = NULL; + zs.next_in = src; + zs.avail_in = slen; + zs.next_out = dst + BLOCK_HEADER_LENGTH; + zs.avail_out = *dlen - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + if (deflateInit2(&zs, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) != Z_OK) return -1; // -15 to disable zlib header/footer + if (deflate(&zs, Z_FINISH) != Z_STREAM_END) return -1; + if (deflateEnd(&zs) != Z_OK) return -1; + *dlen = zs.total_out + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + // write the header + memcpy(dst, g_magic, BLOCK_HEADER_LENGTH); // the last two bytes are a place holder for the length of the block + packInt16(&dst[16], *dlen - 1); // write the compressed length; -1 to fit 2 bytes + // write the footer + crc = crc32(crc32(0L, NULL, 0L), src, slen); + packInt32((uint8_t*)&dst[*dlen - 8], crc); + packInt32((uint8_t*)&dst[*dlen - 4], slen); + return 0; +} - z_stream zs; - int status; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = fp->compressed_block + 18; - zs.avail_in = block_length - 16; - zs.next_out = fp->uncompressed_block; - zs.avail_out = fp->uncompressed_block_size; +// Deflate the block in fp->uncompressed_block into fp->compressed_block. Also adds an extra field that stores the compressed block length. 
+static int deflate_block(BGZF *fp, int block_length) +{ + int comp_size = BGZF_MAX_BLOCK_SIZE; + if (bgzf_compress(fp->compressed_block, &comp_size, fp->uncompressed_block, block_length, fp->compress_level) != 0) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + fp->block_offset = 0; + return comp_size; +} - status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if (status != Z_OK) { - report_error(fp, "inflate init failed"); - return -1; - } - status = inflate(&zs, Z_FINISH); - if (status != Z_STREAM_END) { - inflateEnd(&zs); - report_error(fp, "inflate failed"); - return -1; - } - status = inflateEnd(&zs); - if (status != Z_OK) { - report_error(fp, "inflate failed"); - return -1; - } - return zs.total_out; +// Inflate the block in fp->compressed_block into fp->uncompressed_block +static int inflate_block(BGZF* fp, int block_length) +{ + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = fp->compressed_block + 18; + zs.avail_in = block_length - 16; + zs.next_out = fp->uncompressed_block; + zs.avail_out = BGZF_MAX_BLOCK_SIZE; + + if (inflateInit2(&zs, -15) != Z_OK) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + if (inflate(&zs, Z_FINISH) != Z_STREAM_END) { + inflateEnd(&zs); + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + if (inflateEnd(&zs) != Z_OK) { + fp->errcode |= BGZF_ERR_ZLIB; + return -1; + } + return zs.total_out; } -static -int -check_header(const bgzf_byte_t* header) +static int check_header(const uint8_t *header) { - return (header[0] == GZIP_ID1 && - header[1] == (bgzf_byte_t) GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - unpackInt16((uint8_t*)&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - unpackInt16((uint8_t*)&header[14]) == BGZF_LEN); + return (header[0] == 31 && header[1] == 139 && header[2] == 8 && (header[3] & 4) != 0 + && unpackInt16((uint8_t*)&header[10]) == 6 + && header[12] == 'B' && header[13] == 'C' + && unpackInt16((uint8_t*)&header[14]) == 2); } 
+#ifdef BGZF_CACHE static void free_cache(BGZF *fp) { khint_t k; khash_t(cache) *h = (khash_t(cache)*)fp->cache; - if (fp->open_mode != 'r') return; + if (fp->is_write) return; for (k = kh_begin(h); k < kh_end(h); ++k) if (kh_exist(h, k)) free(kh_val(h, k).block); kh_destroy(cache, h); @@ -431,12 +267,8 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address) if (fp->block_length != 0) fp->block_offset = 0; fp->block_address = block_address; fp->block_length = p->size; - memcpy(fp->uncompressed_block, p->block, MAX_BLOCK_SIZE); -#ifdef _USE_KNETFILE - knet_seek(fp->x.fpr, p->end_offset, SEEK_SET); -#else - fseeko(fp->file, p->end_offset, SEEK_SET); -#endif + memcpy(fp->uncompressed_block, p->block, BGZF_MAX_BLOCK_SIZE); + _bgzf_seek((_bgzf_file_t)fp->fp, p->end_offset, SEEK_SET); return p->size; } @@ -446,8 +278,8 @@ static void cache_block(BGZF *fp, int size) khint_t k; cache_t *p; khash_t(cache) *h = (khash_t(cache)*)fp->cache; - if (MAX_BLOCK_SIZE >= fp->cache_size) return; - if ((kh_size(h) + 1) * MAX_BLOCK_SIZE > fp->cache_size) { + if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; + if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > fp->cache_size) { /* A better way would be to remove the oldest block in the * cache, but here we remove a random one for simplicity. This * should not have a big impact on performance. 
*/ @@ -463,201 +295,300 @@ static void cache_block(BGZF *fp, int size) p = &kh_val(h, k); p->size = fp->block_length; p->end_offset = fp->block_address + size; - p->block = malloc(MAX_BLOCK_SIZE); - memcpy(kh_val(h, k).block, fp->uncompressed_block, MAX_BLOCK_SIZE); + p->block = malloc(BGZF_MAX_BLOCK_SIZE); + memcpy(kh_val(h, k).block, fp->uncompressed_block, BGZF_MAX_BLOCK_SIZE); } +#else +static void free_cache(BGZF *fp) {} +static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} +static void cache_block(BGZF *fp, int size) {} +#endif -int -bgzf_read_block(BGZF* fp) +int bgzf_read_block(BGZF *fp) { - bgzf_byte_t header[BLOCK_HEADER_LENGTH]; + uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; int count, size = 0, block_length, remaining; -#ifdef _USE_KNETFILE - int64_t block_address = knet_tell(fp->x.fpr); - if (load_block_from_cache(fp, block_address)) return 0; - count = knet_read(fp->x.fpr, header, sizeof(header)); -#else - int64_t block_address = ftello(fp->file); - if (load_block_from_cache(fp, block_address)) return 0; - count = fread(header, 1, sizeof(header), fp->file); -#endif - if (count == 0) { - fp->block_length = 0; - return 0; - } + int64_t block_address; + block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; + count = _bgzf_read(fp->fp, header, sizeof(header)); + if (count == 0) { // no data read + fp->block_length = 0; + return 0; + } + if (count != sizeof(header) || !check_header(header)) { + fp->errcode |= BGZF_ERR_HEADER; + return -1; + } size = count; - if (count != sizeof(header)) { - report_error(fp, "read failed"); - return -1; - } - if (!check_header(header)) { - report_error(fp, "invalid block header"); - return -1; - } - block_length = unpackInt16((uint8_t*)&header[16]) + 1; - bgzf_byte_t* compressed_block = (bgzf_byte_t*) fp->compressed_block; - memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); - remaining = block_length - 
BLOCK_HEADER_LENGTH; -#ifdef _USE_KNETFILE - count = knet_read(fp->x.fpr, &compressed_block[BLOCK_HEADER_LENGTH], remaining); -#else - count = fread(&compressed_block[BLOCK_HEADER_LENGTH], 1, remaining, fp->file); -#endif - if (count != remaining) { - report_error(fp, "read failed"); - return -1; - } + block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" + compressed_block = (uint8_t*)fp->compressed_block; + memcpy(compressed_block, header, BLOCK_HEADER_LENGTH); + remaining = block_length - BLOCK_HEADER_LENGTH; + count = _bgzf_read(fp->fp, &compressed_block[BLOCK_HEADER_LENGTH], remaining); + if (count != remaining) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } size += count; - count = inflate_block(fp, block_length); - if (count < 0) return -1; - if (fp->block_length != 0) { - // Do not reset offset if this read follows a seek. - fp->block_offset = 0; - } - fp->block_address = block_address; - fp->block_length = count; + if ((count = inflate_block(fp, block_length)) < 0) return -1; + if (fp->block_length != 0) fp->block_offset = 0; // Do not reset offset if this read follows a seek. + fp->block_address = block_address; + fp->block_length = count; cache_block(fp, size); - return 0; + return 0; } -int -bgzf_read(BGZF* fp, void* data, int length) +ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length) { - if (length <= 0) { - return 0; - } - if (fp->open_mode != 'r') { - report_error(fp, "file not open for reading"); - return -1; - } + ssize_t bytes_read = 0; + uint8_t *output = data; + if (length <= 0) return 0; + assert(fp->is_write == 0); + while (bytes_read < length) { + int copy_length, available = fp->block_length - fp->block_offset; + uint8_t *buffer; + if (available <= 0) { + if (bgzf_read_block(fp) != 0) return -1; + available = fp->block_length - fp->block_offset; + if (available <= 0) break; + } + copy_length = length - bytes_read < available? 
length - bytes_read : available; + buffer = fp->uncompressed_block; + memcpy(output, buffer + fp->block_offset, copy_length); + fp->block_offset += copy_length; + output += copy_length; + bytes_read += copy_length; + } + if (fp->block_offset == fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = fp->block_length = 0; + } + return bytes_read; +} - int bytes_read = 0; - bgzf_byte_t* output = data; - while (bytes_read < length) { - int copy_length, available = fp->block_length - fp->block_offset; - bgzf_byte_t *buffer; - if (available <= 0) { - if (bgzf_read_block(fp) != 0) { - return -1; - } - available = fp->block_length - fp->block_offset; - if (available <= 0) { - break; - } - } - copy_length = bgzf_min(length-bytes_read, available); - buffer = fp->uncompressed_block; - memcpy(output, buffer + fp->block_offset, copy_length); - fp->block_offset += copy_length; - output += copy_length; - bytes_read += copy_length; - } - if (fp->block_offset == fp->block_length) { -#ifdef _USE_KNETFILE - fp->block_address = knet_tell(fp->x.fpr); -#else - fp->block_address = ftello(fp->file); -#endif - fp->block_offset = 0; - fp->block_length = 0; - } - return bytes_read; +/***** BEGIN: multi-threading *****/ + +typedef struct { + BGZF *fp; + struct mtaux_t *mt; + void *buf; + int i, errcode, toproc; +} worker_t; + +typedef struct mtaux_t { + int n_threads, n_blks, curr, done; + volatile int proc_cnt; + void **blk; + int *len; + worker_t *w; + pthread_t *tid; + pthread_mutex_t lock; + pthread_cond_t cv; +} mtaux_t; + +static int worker_aux(worker_t *w) +{ + int i, tmp, stop = 0; + // wait for condition: to process or all done + pthread_mutex_lock(&w->mt->lock); + while (!w->toproc && !w->mt->done) + pthread_cond_wait(&w->mt->cv, &w->mt->lock); + if (w->mt->done) stop = 1; + w->toproc = 0; + pthread_mutex_unlock(&w->mt->lock); + if (stop) return 1; // to quit the thread + w->errcode = 0; + for (i = w->i; i < w->mt->curr; i += 
w->mt->n_threads) { + int clen = BGZF_MAX_BLOCK_SIZE; + if (bgzf_compress(w->buf, &clen, w->mt->blk[i], w->mt->len[i], w->fp->compress_level) != 0) + w->errcode |= BGZF_ERR_ZLIB; + memcpy(w->mt->blk[i], w->buf, clen); + w->mt->len[i] = clen; + } + tmp = __sync_fetch_and_add(&w->mt->proc_cnt, 1); + return 0; } -int bgzf_flush(BGZF* fp) +static void *mt_worker(void *data) { - while (fp->block_offset > 0) { - int count, block_length; - block_length = deflate_block(fp, fp->block_offset); - if (block_length < 0) return -1; -#ifdef _USE_KNETFILE - count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); -#else - count = fwrite(fp->compressed_block, 1, block_length, fp->file); -#endif - if (count != block_length) { - report_error(fp, "write failed"); - return -1; - } - fp->block_address += block_length; - } - return 0; + while (worker_aux(data) == 0); + return 0; +} + +int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks) +{ + int i; + mtaux_t *mt; + pthread_attr_t attr; + if (!fp->is_write || fp->mt || n_threads <= 1) return -1; + mt = calloc(1, sizeof(mtaux_t)); + mt->n_threads = n_threads; + mt->n_blks = n_threads * n_sub_blks; + mt->len = calloc(mt->n_blks, sizeof(int)); + mt->blk = calloc(mt->n_blks, sizeof(void*)); + for (i = 0; i < mt->n_blks; ++i) + mt->blk[i] = malloc(BGZF_MAX_BLOCK_SIZE); + mt->tid = calloc(mt->n_threads, sizeof(pthread_t)); // tid[0] is not used, as the worker 0 is launched by the master + mt->w = calloc(mt->n_threads, sizeof(worker_t)); + for (i = 0; i < mt->n_threads; ++i) { + mt->w[i].i = i; + mt->w[i].mt = mt; + mt->w[i].fp = fp; + mt->w[i].buf = malloc(BGZF_MAX_BLOCK_SIZE); + } + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + pthread_mutex_init(&mt->lock, 0); + pthread_cond_init(&mt->cv, 0); + for (i = 1; i < mt->n_threads; ++i) // worker 0 is effectively launched by the master thread + pthread_create(&mt->tid[i], &attr, mt_worker, &mt->w[i]); + fp->mt = mt; + return 0; +} + +static void 
mt_destroy(mtaux_t *mt) +{ + int i; + // signal all workers to quit + pthread_mutex_lock(&mt->lock); + mt->done = 1; mt->proc_cnt = 0; + pthread_cond_broadcast(&mt->cv); + pthread_mutex_unlock(&mt->lock); + for (i = 1; i < mt->n_threads; ++i) pthread_join(mt->tid[i], 0); // worker 0 is effectively launched by the master thread + // free other data allocated on heap + for (i = 0; i < mt->n_blks; ++i) free(mt->blk[i]); + for (i = 0; i < mt->n_threads; ++i) free(mt->w[i].buf); + free(mt->blk); free(mt->len); free(mt->w); free(mt->tid); + pthread_cond_destroy(&mt->cv); + pthread_mutex_destroy(&mt->lock); + free(mt); } -int bgzf_flush_try(BGZF *fp, int size) +static void mt_queue(BGZF *fp) { - if (fp->block_offset + size > fp->uncompressed_block_size) - return bgzf_flush(fp); + mtaux_t *mt = (mtaux_t*)fp->mt; + assert(mt->curr < mt->n_blks); // guaranteed by the caller + memcpy(mt->blk[mt->curr], fp->uncompressed_block, fp->block_offset); + mt->len[mt->curr] = fp->block_offset; + fp->block_offset = 0; + ++mt->curr; +} + +static int mt_flush(BGZF *fp) +{ + int i; + mtaux_t *mt = (mtaux_t*)fp->mt; + if (fp->block_offset) mt_queue(fp); // guaranteed that assertion does not fail + // signal all the workers to compress + pthread_mutex_lock(&mt->lock); + for (i = 0; i < mt->n_threads; ++i) mt->w[i].toproc = 1; + mt->proc_cnt = 0; + pthread_cond_broadcast(&mt->cv); + pthread_mutex_unlock(&mt->lock); + // worker 0 is doing things here + worker_aux(&mt->w[0]); + // wait for all the threads to complete + while (mt->proc_cnt < mt->n_threads); + // dump data to disk + for (i = 0; i < mt->n_threads; ++i) fp->errcode |= mt->w[i].errcode; + for (i = 0; i < mt->curr; ++i) + if (fwrite(mt->blk[i], 1, mt->len[i], fp->fp) != mt->len[i]) + fp->errcode |= BGZF_ERR_IO; + mt->curr = 0; + return 0; +} + +static int mt_lazy_flush(BGZF *fp) +{ + mtaux_t *mt = (mtaux_t*)fp->mt; + if (fp->block_offset) mt_queue(fp); + if (mt->curr == mt->n_blks) + return mt_flush(fp); return -1; } -int 
bgzf_write(BGZF* fp, const void* data, int length) +static ssize_t mt_write(BGZF *fp, const void *data, ssize_t length) { - const bgzf_byte_t *input = data; - int block_length, bytes_written; - if (fp->open_mode != 'w') { - report_error(fp, "file not open for writing"); - return -1; - } + const uint8_t *input = data; + ssize_t rest = length; + while (rest) { + int copy_length = BGZF_BLOCK_SIZE - fp->block_offset < rest? BGZF_BLOCK_SIZE - fp->block_offset : rest; + memcpy(fp->uncompressed_block + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; input += copy_length; rest -= copy_length; + if (fp->block_offset == BGZF_BLOCK_SIZE) mt_lazy_flush(fp); + } + return length - rest; +} - if (fp->uncompressed_block == NULL) - fp->uncompressed_block = malloc(fp->uncompressed_block_size); - - input = data; - block_length = fp->uncompressed_block_size; - bytes_written = 0; - while (bytes_written < length) { - int copy_length = bgzf_min(block_length - fp->block_offset, length - bytes_written); - bgzf_byte_t* buffer = fp->uncompressed_block; - memcpy(buffer + fp->block_offset, input, copy_length); - fp->block_offset += copy_length; - input += copy_length; - bytes_written += copy_length; - if (fp->block_offset == block_length) { - if (bgzf_flush(fp) != 0) { - break; - } - } - } - return bytes_written; +/***** END: multi-threading *****/ + +int bgzf_flush(BGZF *fp) +{ + if (!fp->is_write) return 0; + if (fp->mt) return mt_flush(fp); + while (fp->block_offset > 0) { + int block_length; + block_length = deflate_block(fp, fp->block_offset); + if (block_length < 0) return -1; + if (fwrite(fp->compressed_block, 1, block_length, fp->fp) != block_length) { + fp->errcode |= BGZF_ERR_IO; // possibly truncated file + return -1; + } + fp->block_address += block_length; + } + return 0; +} + +int bgzf_flush_try(BGZF *fp, ssize_t size) +{ + if (fp->block_offset + size > BGZF_BLOCK_SIZE) { + if (fp->mt) return mt_lazy_flush(fp); + else return bgzf_flush(fp); + } + return 
-1; +} + +ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length) +{ + const uint8_t *input = data; + int block_length = BGZF_BLOCK_SIZE, bytes_written = 0; + assert(fp->is_write); + if (fp->mt) return mt_write(fp, data, length); + while (bytes_written < length) { + uint8_t* buffer = fp->uncompressed_block; + int copy_length = block_length - fp->block_offset < length - bytes_written? block_length - fp->block_offset : length - bytes_written; + memcpy(buffer + fp->block_offset, input, copy_length); + fp->block_offset += copy_length; + input += copy_length; + bytes_written += copy_length; + if (fp->block_offset == block_length && bgzf_flush(fp)) break; + } + return bytes_written; } int bgzf_close(BGZF* fp) { - if (fp->open_mode == 'w') { - if (bgzf_flush(fp) != 0) return -1; - { // add an empty block - int count, block_length = deflate_block(fp, 0); -#ifdef _USE_KNETFILE - count = fwrite(fp->compressed_block, 1, block_length, fp->x.fpw); -#else - count = fwrite(fp->compressed_block, 1, block_length, fp->file); -#endif + int ret, count, block_length; + if (fp == 0) return -1; + if (fp->is_write) { + if (bgzf_flush(fp) != 0) return -1; + fp->compress_level = -1; + block_length = deflate_block(fp, 0); // write an empty block + count = fwrite(fp->compressed_block, 1, block_length, fp->fp); + if (fflush(fp->fp) != 0) { + fp->errcode |= BGZF_ERR_IO; + return -1; } -#ifdef _USE_KNETFILE - if (fflush(fp->x.fpw) != 0) { -#else - if (fflush(fp->file) != 0) { -#endif - report_error(fp, "flush failed"); - return -1; - } - } - if (fp->owned_file) { -#ifdef _USE_KNETFILE - int ret; - if (fp->open_mode == 'w') ret = fclose(fp->x.fpw); - else ret = knet_close(fp->x.fpr); - if (ret != 0) return -1; -#else - if (fclose(fp->file) != 0) return -1; -#endif - } - free(fp->uncompressed_block); - free(fp->compressed_block); + if (fp->mt) mt_destroy(fp->mt); + } + ret = fp->is_write? 
fclose(fp->fp) : _bgzf_close(fp->fp); + if (ret != 0) return -1; + free(fp->uncompressed_block); + free(fp->compressed_block); free_cache(fp); - free(fp); - return 0; + free(fp); + return 0; } void bgzf_set_cache_size(BGZF *fp, int cache_size) @@ -670,17 +601,10 @@ int bgzf_check_EOF(BGZF *fp) static uint8_t magic[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; uint8_t buf[28]; off_t offset; -#ifdef _USE_KNETFILE - offset = knet_tell(fp->x.fpr); - if (knet_seek(fp->x.fpr, -28, SEEK_END) != 0) return -1; - knet_read(fp->x.fpr, buf, 28); - knet_seek(fp->x.fpr, offset, SEEK_SET); -#else - offset = ftello(fp->file); - if (fseeko(fp->file, -28, SEEK_END) != 0) return -1; - fread(buf, 1, 28, fp->file); - fseeko(fp->file, offset, SEEK_SET); -#endif + offset = _bgzf_tell((_bgzf_file_t)fp->fp); + if (_bgzf_seek(fp->fp, -28, SEEK_END) < 0) return 0; + _bgzf_read(fp->fp, buf, 28); + _bgzf_seek(fp->fp, offset, SEEK_SET); return (memcmp(magic, buf, 28) == 0)? 1 : 0; } @@ -689,26 +613,82 @@ int64_t bgzf_seek(BGZF* fp, int64_t pos, int where) int block_offset; int64_t block_address; - if (fp->open_mode != 'r') { - report_error(fp, "file not open for read"); - return -1; - } - if (where != SEEK_SET) { - report_error(fp, "unimplemented seek option"); - return -1; + if (fp->is_write || where != SEEK_SET) { + fp->errcode |= BGZF_ERR_MISUSE; + return -1; + } + block_offset = pos & 0xFFFF; + block_address = pos >> 16; + if (_bgzf_seek(fp->fp, block_address, SEEK_SET) < 0) { + fp->errcode |= BGZF_ERR_IO; + return -1; + } + fp->block_length = 0; // indicates current block has not been loaded + fp->block_address = block_address; + fp->block_offset = block_offset; + return 0; +} + +int bgzf_is_bgzf(const char *fn) +{ + uint8_t buf[16]; + int n; + _bgzf_file_t fp; + if ((fp = _bgzf_open(fn, "r")) == 0) return 0; + n = _bgzf_read(fp, buf, 16); + _bgzf_close(fp); + if (n != 16) return 0; + return memcmp(g_magic, buf, 16) == 0? 
1 : 0; +} + +int bgzf_getc(BGZF *fp) +{ + int c; + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) return -2; /* error */ + if (fp->block_length == 0) return -1; /* end-of-file */ + } + c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; + if (fp->block_offset == fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = 0; + fp->block_length = 0; } - block_offset = pos & 0xFFFF; - block_address = (pos >> 16) & 0xFFFFFFFFFFFFLL; -#ifdef _USE_KNETFILE - if (knet_seek(fp->x.fpr, block_address, SEEK_SET) != 0) { -#else - if (fseeko(fp->file, block_address, SEEK_SET) != 0) { + return c; +} + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) #endif - report_error(fp, "seek failed"); - return -1; - } - fp->block_length = 0; // indicates current block is not loaded - fp->block_address = block_address; - fp->block_offset = block_offset; - return 0; + +int bgzf_getline(BGZF *fp, int delim, kstring_t *str) +{ + int l, state = 0; + unsigned char *buf = (unsigned char*)fp->uncompressed_block; + str->l = 0; + do { + if (fp->block_offset >= fp->block_length) { + if (bgzf_read_block(fp) != 0) { state = -2; break; } + if (fp->block_length == 0) { state = -1; break; } + } + for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); + if (l < fp->block_length) state = 1; + l -= fp->block_offset; + if (str->l + l + 1 >= str->m) { + str->m = str->l + l + 2; + kroundup32(str->m); + str->s = (char*)realloc(str->s, str->m); + } + memcpy(str->s + str->l, buf + fp->block_offset, l); + str->l += l; + fp->block_offset += l + 1; + if (fp->block_offset >= fp->block_length) { + fp->block_address = _bgzf_tell((_bgzf_file_t)fp->fp); + fp->block_offset = 0; + fp->block_length = 0; + } + } while (state == 0); + if (str->l == 0 && state < 0) return state; + str->s[str->l] = 0; + return str->l; } diff --git a/sam/bgzf.h b/sam/bgzf.h 
index 7295f37..cb67681 100644 --- a/sam/bgzf.h +++ b/sam/bgzf.h @@ -1,6 +1,7 @@ /* The MIT License Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology + 2011, 2012 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -21,137 +22,186 @@ THE SOFTWARE. */ +/* The BGZF library was originally written by Bob Handsaker from the Broad + * Institute. It was later improved by the SAMtools developers. */ + #ifndef __BGZF_H #define __BGZF_H #include #include #include -#ifdef _USE_KNETFILE -#include "knetfile.h" -#endif +#include + +#define BGZF_BLOCK_SIZE 0xff00 // make sure compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE +#define BGZF_MAX_BLOCK_SIZE 0x10000 -//typedef int8_t bool; +#define BGZF_ERR_ZLIB 1 +#define BGZF_ERR_HEADER 2 +#define BGZF_ERR_IO 4 +#define BGZF_ERR_MISUSE 8 typedef struct { - int file_descriptor; - char open_mode; // 'r' or 'w' - int16_t owned_file, compress_level; -#ifdef _USE_KNETFILE - union { - knetFile *fpr; - FILE *fpw; - } x; -#else - FILE* file; -#endif - int uncompressed_block_size; - int compressed_block_size; - void* uncompressed_block; - void* compressed_block; - int64_t block_address; - int block_length; - int block_offset; + int errcode:16, is_write:2, compress_level:14; int cache_size; - const char* error; + int block_length, block_offset; + int64_t block_address; + void *uncompressed_block, *compressed_block; void *cache; // a pointer to a hash table + void *fp; // actual file handler; FILE* on writing; FILE* or knetFile* on reading + void *mt; // only used for multi-threading } BGZF; +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + #ifdef __cplusplus extern "C" { #endif -/* - * Open an existing file descriptor for reading or writing. - * Mode must be either "r" or "w". 
- * A subsequent bgzf_close will not close the file descriptor. - * Returns null on error. - */ -BGZF* bgzf_fdopen(int fd, const char* __restrict mode); - -/* - * Open the specified file for reading or writing. - * Mode must be either "r" or "w". - * Returns null on error. - */ -BGZF* bgzf_open(const char* path, const char* __restrict mode); - -/* - * Close the BGZ file and free all associated resources. - * Does not close the underlying file descriptor if created with bgzf_fdopen. - * Returns zero on success, -1 on error. - */ -int bgzf_close(BGZF* fp); - -/* - * Read up to length bytes from the file storing into data. - * Returns the number of bytes actually read. - * Returns zero on end of file. - * Returns -1 on error. - */ -int bgzf_read(BGZF* fp, void* data, int length); - -/* - * Write length bytes from data to the file. - * Returns the number of bytes written. - * Returns -1 on error. - */ -int bgzf_write(BGZF* fp, const void* data, int length); - -/* - * Return a virtual file pointer to the current location in the file. - * No interpetation of the value should be made, other than a subsequent - * call to bgzf_seek can be used to position the file at the same point. - * Return value is non-negative on success. - * Returns -1 on error. - */ -#define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) - -/* - * Set the file to read from the location specified by pos, which must - * be a value previously returned by bgzf_tell for this file (but not - * necessarily one returned by this file handle). - * The where argument must be SEEK_SET. - * Seeking on a file opened for write is not supported. - * Returns zero on success, -1 on error. - */ -int64_t bgzf_seek(BGZF* fp, int64_t pos, int where); - -/* - * Set the cache size. Zero to disable. By default, caching is - * disabled. The recommended cache size for frequent random access is - * about 8M bytes. 
- */ -void bgzf_set_cache_size(BGZF *fp, int cache_size); - -int bgzf_check_EOF(BGZF *fp); -int bgzf_read_block(BGZF* fp); -int bgzf_flush(BGZF* fp); -int bgzf_flush_try(BGZF *fp, int size); -int bgzf_check_bgzf(const char *fn); + /****************** + * Basic routines * + ******************/ + + /** + * Open an existing file descriptor for reading or writing. + * + * @param fd file descriptor + * @param mode mode matching /[rwu0-9]+/: 'r' for reading, 'w' for writing and a digit specifies + * the zlib compression level; if both 'r' and 'w' are present, 'w' is ignored. + * @return BGZF file handler; 0 on error + */ + BGZF* bgzf_dopen(int fd, const char *mode); + + #define bgzf_fdopen(fd, mode) bgzf_dopen((fd), (mode)) // for backward compatibility + + /** + * Open the specified file for reading or writing. + */ + BGZF* bgzf_open(const char* path, const char *mode); + + /** + * Close the BGZF and free all associated resources. + * + * @param fp BGZF file handler + * @return 0 on success and -1 on error + */ + int bgzf_close(BGZF *fp); + + /** + * Read up to _length_ bytes from the file storing into _data_. + * + * @param fp BGZF file handler + * @param data data array to read into + * @param length size of data to read + * @return number of bytes actually read; 0 on end-of-file and -1 on error + */ + ssize_t bgzf_read(BGZF *fp, void *data, ssize_t length); + + /** + * Write _length_ bytes from _data_ to the file. + * + * @param fp BGZF file handler + * @param data data array to write + * @param length size of data to write + * @return number of bytes actually written; -1 on error + */ + ssize_t bgzf_write(BGZF *fp, const void *data, ssize_t length); + + /** + * Write the data in the buffer to the file. + */ + int bgzf_flush(BGZF *fp); + + /** + * Return a virtual file pointer to the current location in the file. + * No interpetation of the value should be made, other than a subsequent + * call to bgzf_seek can be used to position the file at the same point. 
+ * Return value is non-negative on success. + */ + #define bgzf_tell(fp) ((fp->block_address << 16) | (fp->block_offset & 0xFFFF)) + + /** + * Set the file to read from the location specified by _pos_. + * + * @param fp BGZF file handler + * @param pos virtual file offset returned by bgzf_tell() + * @param whence must be SEEK_SET + * @return 0 on success and -1 on error + */ + int64_t bgzf_seek(BGZF *fp, int64_t pos, int whence); + + /** + * Check if the BGZF end-of-file (EOF) marker is present + * + * @param fp BGZF file handler opened for reading + * @return 1 if EOF is present; 0 if not or on I/O error + */ + int bgzf_check_EOF(BGZF *fp); + + /** + * Check if a file is in the BGZF format + * + * @param fn file name + * @return 1 if _fn_ is BGZF; 0 if not or on I/O error + */ + int bgzf_is_bgzf(const char *fn); + + /********************* + * Advanced routines * + *********************/ + + /** + * Set the cache size. Only effective when compiled with -DBGZF_CACHE. + * + * @param fp BGZF file handler + * @param size size of cache in bytes; 0 to disable caching (default) + */ + void bgzf_set_cache_size(BGZF *fp, int size); + + /** + * Flush the file if the remaining buffer size is smaller than _size_ + */ + int bgzf_flush_try(BGZF *fp, ssize_t size); + + /** + * Read one byte from a BGZF file. It is faster than bgzf_read() + * @param fp BGZF file handler + * @return byte read; -1 on end-of-file or error + */ + int bgzf_getc(BGZF *fp); + + /** + * Read one line from a BGZF file. It is faster than bgzf_getc() + * + * @param fp BGZF file handler + * @param delim delimitor + * @param str string to write to; must be initialized + * @return length of the string; 0 on end-of-file; negative on error + */ + int bgzf_getline(BGZF *fp, int delim, kstring_t *str); + + /** + * Read the next BGZF block. 
+ */ + int bgzf_read_block(BGZF *fp); + + /** + * Enable multi-threading (only effective on writing) + * + * @param fp BGZF file handler; must be opened for writing + * @param n_threads #threads used for writing + * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended + */ + int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); #ifdef __cplusplus } #endif -static inline int bgzf_getc(BGZF *fp) -{ - int c; - if (fp->block_offset >= fp->block_length) { - if (bgzf_read_block(fp) != 0) return -2; /* error */ - if (fp->block_length == 0) return -1; /* end-of-file */ - } - c = ((unsigned char*)fp->uncompressed_block)[fp->block_offset++]; - if (fp->block_offset == fp->block_length) { -#ifdef _USE_KNETFILE - fp->block_address = knet_tell(fp->x.fpr); -#else - fp->block_address = ftello(fp->file); -#endif - fp->block_offset = 0; - fp->block_length = 0; - } - return c; -} - #endif diff --git a/sam/examples/._00README.txt b/sam/examples/._00README.txt new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._00README.txt differ diff --git a/sam/examples/._Makefile b/sam/examples/._Makefile new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._Makefile differ diff --git a/sam/examples/._bam2bed.c b/sam/examples/._bam2bed.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._bam2bed.c differ diff --git a/sam/examples/._calDepth.c b/sam/examples/._calDepth.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._calDepth.c differ diff --git a/sam/examples/._chk_indel.c b/sam/examples/._chk_indel.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._chk_indel.c differ diff --git a/sam/examples/._ex1.fa b/sam/examples/._ex1.fa new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._ex1.fa differ diff --git a/sam/examples/._ex1.sam.gz 
b/sam/examples/._ex1.sam.gz new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._ex1.sam.gz differ diff --git a/sam/examples/._toy.fa b/sam/examples/._toy.fa new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._toy.fa differ diff --git a/sam/examples/._toy.sam b/sam/examples/._toy.sam new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/examples/._toy.sam differ diff --git a/sam/examples/chk_indel.c b/sam/examples/chk_indel.c new file mode 100644 index 0000000..aaa77e0 --- /dev/null +++ b/sam/examples/chk_indel.c @@ -0,0 +1,83 @@ +/* To compile, copy this file to the samtools source code directory and compile with: + gcc -g -O2 -Wall chk_indel_rg.c -o chk_indel_rg -Wall -I. -L. -lbam -lz +*/ + +#include +#include "bam.h" + +typedef struct { + long cnt[4]; // short:ins, short:del, long:ins, long:del +} rgcnt_t; + +#include "khash.h" +KHASH_MAP_INIT_STR(rgcnt, rgcnt_t) + +#define MAX_LEN 127 +#define Q_THRES 10 +#define L_THRES 6 // short: <=L_THRES; otherwise long + +int main(int argc, char *argv[]) +{ + bamFile fp; + bam1_t *b; + int i, x; + khash_t(rgcnt) *h; + khint_t k; + + if (argc == 1) { + fprintf(stderr, "Usage: chk_indel_rg \n\n"); + fprintf(stderr, "Output: filename, RG, #ins-in-short-homopolymer, #del-in-short, #ins-in-long, #del-in-long\n"); + return 1; + } + + h = kh_init(rgcnt); + fp = bam_open(argv[1], "r"); + bam_header_destroy(bam_header_read(fp)); // we do not need the header + b = bam_init1(); + + while (bam_read1(fp, b) >= 0) { + if (b->core.n_cigar >= 3 && b->core.qual >= Q_THRES) { + const uint8_t *seq; + const uint32_t *cigar = bam1_cigar(b); + char *rg; + for (i = 0; i < b->core.n_cigar; ++i) // check if there are 1bp indels + if (bam_cigar_oplen(cigar[i]) == 1 && (bam_cigar_op(cigar[i]) == BAM_CDEL || bam_cigar_op(cigar[i]) == BAM_CINS)) + break; + if (i == b->core.n_cigar) continue; // no 1bp ins or del + if ((rg = (char*)bam_aux_get(b, "RG")) == 
0) continue; // no RG tag + seq = bam1_seq(b); + for (i = x = 0; i < b->core.n_cigar; ++i) { + int op = bam_cigar_op(cigar[i]); + if (bam_cigar_oplen(cigar[i]) == 1 && (op == BAM_CDEL || op == BAM_CINS)) { + int c, j, hrun, which; + c = bam1_seqi(seq, x); + for (j = x + 1, hrun = 0; j < b->core.l_qseq; ++j, ++hrun) // calculate the hompolymer run length + if (bam1_seqi(seq, j) != c) break; + k = kh_get(rgcnt, h, rg + 1); + if (k == kh_end(h)) { // absent + char *key = strdup(rg + 1); + k = kh_put(rgcnt, h, key, &c); + memset(&kh_val(h, k), 0, sizeof(rgcnt_t)); + } + which = (hrun <= L_THRES? 0 : 1)<<1 | (op == BAM_CINS? 0 : 1); + ++kh_val(h, k).cnt[which]; + } + if (bam_cigar_type(op)&1) ++x; + } + } + } + + for (k = 0; k != kh_end(h); ++k) { + if (!kh_exist(h, k)) continue; + printf("%s\t%s", argv[1], kh_key(h, k)); + for (i = 0; i < 4; ++i) + printf("\t%ld", kh_val(h, k).cnt[i]); + putchar('\n'); + free((char*)kh_key(h, k)); + } + + bam_destroy1(b); + bam_close(fp); + kh_destroy(rgcnt, h); + return 0; +} diff --git a/sam/faidx.c b/sam/faidx.c index f0798fc..51c82ac 100644 --- a/sam/faidx.c +++ b/sam/faidx.c @@ -337,6 +337,11 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) } else s[name_end] = ':', name_end = l; } } else iter = kh_get(s, h, str); + if(iter == kh_end(h)) { + fprintf(stderr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); + free(s); + return 0; + }; val = kh_value(h, iter); // parse the interval if (name_end < l) { diff --git a/sam/kprobaln.c b/sam/kprobaln.c index 894a2ae..04e526a 100644 --- a/sam/kprobaln.c +++ b/sam/kprobaln.c @@ -77,6 +77,8 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer const uint8_t *ref, *query; int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr; + if ( l_ref<=0 || l_query<=0 ) return 0; // FIXME: this may not be an ideal fix, just prevents sefgault + /*** initialization ***/ is_backward = state && q? 
1 : 0; ref = _ref - 1; query = _query - 1; // change to 1-based coordinate @@ -87,7 +89,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] f = calloc(l_query+1, sizeof(void*)); if (is_backward) b = calloc(l_query+1, sizeof(void*)); - for (i = 0; i <= l_query; ++i) { + for (i = 0; i <= l_query; ++i) { // FIXME: this will lead in segfault for l_query==0 f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double)); } diff --git a/sam/kseq.h b/sam/kseq.h index 0bbc7dc..a5cec7c 100644 --- a/sam/kseq.h +++ b/sam/kseq.h @@ -23,7 +23,7 @@ SOFTWARE. */ -/* Last Modified: 18AUG2011 */ +/* Last Modified: 05MAR2012 */ #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -34,7 +34,8 @@ #define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r #define KS_SEP_TAB 1 // isspace() && !' ' -#define KS_SEP_MAX 1 +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 #define __KS_TYPE(type_t) \ typedef struct __kstream_t { \ @@ -51,7 +52,7 @@ { \ kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ ks->f = f; \ - ks->buf = malloc(__bufsize); \ + ks->buf = (unsigned char*)malloc(__bufsize); \ return ks; \ } \ static inline void ks_destroy(kstream_t *ks) \ @@ -103,7 +104,10 @@ typedef struct __kstring_t { if (ks->end == 0) break; \ } else break; \ } \ - if (delimiter > KS_SEP_MAX) { \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ for (i = ks->begin; i < ks->end; ++i) \ if (ks->buf[i] == delimiter) break; \ } else if (delimiter == KS_SEP_SPACE) { \ @@ -113,7 +117,7 @@ typedef struct __kstring_t { for (i = ks->begin; i < ks->end; ++i) \ if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ } else i = 0; /* never come to here! 
*/ \ - if (str->m - str->l < i - ks->begin + 1) { \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ str->m = str->l + (i - ks->begin) + 1; \ kroundup32(str->m); \ str->s = (char*)realloc(str->s, str->m); \ @@ -129,7 +133,7 @@ typedef struct __kstring_t { if (str->s == 0) { \ str->m = 1; \ str->s = (char*)calloc(1, 1); \ - } \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ str->s[str->l] = '\0'; \ return str->l; \ } \ @@ -142,19 +146,16 @@ typedef struct __kstring_t { __KS_GETC(__read, __bufsize) \ __KS_GETUNTIL(__read, __bufsize) -#define __KSEQ_BASIC(type_t) \ - static inline kseq_t *kseq_init(type_t fd) \ +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ { \ kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ s->f = ks_init(fd); \ return s; \ } \ - static inline void kseq_rewind(kseq_t *ks) \ - { \ - ks->last_char = 0; \ - ks->f->is_eof = ks->f->begin = ks->f->end = 0; \ - } \ - static inline void kseq_destroy(kseq_t *ks) \ + SCOPE void kseq_destroy(kseq_t *ks) \ { \ if (!ks) return; \ free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ @@ -167,8 +168,8 @@ typedef struct __kstring_t { -1 end-of-file -2 truncated quality string */ -#define __KSEQ_READ \ - static int kseq_read(kseq_t *seq) \ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ { \ int c; \ kstream_t *ks = seq->f; \ @@ -179,14 +180,15 @@ typedef struct __kstring_t { } /* else: the first header char has been read in the previous call */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ - if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); /* read FASTA/Q comment */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) 
{ /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ - ks_getuntil2(ks, '\n', &seq->seq, 0, 1); /* read the rest of the line */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ } \ if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ @@ -202,7 +204,7 @@ typedef struct __kstring_t { } \ while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, '\n', &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ return seq->seq.l; \ @@ -215,10 +217,19 @@ typedef struct __kstring_t { kstream_t *f; \ } kseq_t; -#define KSEQ_INIT(type_t, __read) \ +#define KSEQ_INIT2(SCOPE, type_t, __read) \ KSTREAM_INIT(type_t, __read, 16384) \ __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(type_t) \ - __KSEQ_READ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); #endif diff --git a/sam/ksort.h b/sam/ksort.h index fa850ab..aa0bb93 100644 --- a/sam/ksort.h +++ b/sam/ksort.h @@ -26,6 +26,10 @@ /* Contact: Heng Li */ /* + 2012-12-11 (0.1.4): + 
+ * Defined __ks_insertsort_##name as static to compile with C99. + 2008-11-16 (0.1.4): * Fixed a bug in introsort() that happens in rare cases. @@ -141,7 +145,7 @@ typedef struct { tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ } \ } \ - inline void __ks_insertsort_##name(type_t *s, type_t *t) \ + static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ { \ type_t *i, *j, swap_tmp; \ for (i = s + 1; i < t; ++i) \ diff --git a/sam/kstring.c b/sam/kstring.c index b2a0dab..b8ff45c 100644 --- a/sam/kstring.c +++ b/sam/kstring.c @@ -98,13 +98,13 @@ typedef unsigned char ubyte_t; static int *ksBM_prep(const ubyte_t *pat, int m) { int i, *suff, *prep, *bmGs, *bmBc; - prep = calloc(m + 256, sizeof(int)); + prep = (int*)calloc(m + 256, sizeof(int)); bmGs = prep; bmBc = prep + m; { // preBmBc() for (i = 0; i < 256; ++i) bmBc[i] = m; for (i = 0; i < m - 1; ++i) bmBc[pat[i]] = m - i - 1; } - suff = calloc(m, sizeof(int)); + suff = (int*)calloc(m, sizeof(int)); { // suffixes() int f = 0, g; suff[m - 1] = m; diff --git a/sam/kstring.h b/sam/kstring.h index ec5775b..abd8236 100644 --- a/sam/kstring.h +++ b/sam/kstring.h @@ -1,3 +1,28 @@ +/* The MIT License + + Copyright (c) by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + #ifndef KSTRING_H #define KSTRING_H @@ -42,7 +67,16 @@ extern "C" { #ifdef __cplusplus } #endif - + +static inline void ks_resize(kstring_t *s, size_t size) +{ + if (s->m < size) { + s->m = size; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } +} + static inline int kputsn(const char *p, int l, kstring_t *s) { if (s->l + l + 1 >= s->m) { @@ -78,7 +112,8 @@ static inline int kputw(int c, kstring_t *s) char buf[16]; int l, x; if (c == 0) return kputc('0', s); - for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if(c < 0) for (l = 0, x = c; x < 0; x /= 10) buf[l++] = '0' - (x%10); + else for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; if (c < 0) buf[l++] = '-'; if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; @@ -107,6 +142,23 @@ static inline int kputuw(unsigned c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) +{ + char buf[32]; + long l, x; + if (c == 0) return kputc('0', s); + for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + if (s->l + l + 1 >= s->m) { + s->m = s->l + l + 2; + kroundup32(s->m); + s->s = (char*)realloc(s->s, s->m); + } + for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; + s->s[s->l] = 0; + return 0; +} + static inline int *ksplit(kstring_t *s, int delimiter, int *n) { int max = 0, *offsets = 0; diff --git a/sam/misc/._HmmGlocal.java b/sam/misc/._HmmGlocal.java new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._HmmGlocal.java differ diff --git a/sam/misc/._Makefile b/sam/misc/._Makefile new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._Makefile differ diff --git a/sam/misc/._ace2sam.c b/sam/misc/._ace2sam.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._ace2sam.c differ diff --git a/sam/misc/._bamcheck.c b/sam/misc/._bamcheck.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._bamcheck.c differ diff --git a/sam/misc/._blast2sam.pl b/sam/misc/._blast2sam.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._blast2sam.pl differ diff --git a/sam/misc/._bowtie2sam.pl b/sam/misc/._bowtie2sam.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._bowtie2sam.pl differ diff --git a/sam/misc/._export2sam.pl b/sam/misc/._export2sam.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._export2sam.pl differ diff --git a/sam/misc/._interpolate_sam.pl b/sam/misc/._interpolate_sam.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._interpolate_sam.pl differ diff --git a/sam/misc/._maq2sam.c b/sam/misc/._maq2sam.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._maq2sam.c differ diff --git a/sam/misc/._md5.c b/sam/misc/._md5.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and 
b/sam/misc/._md5.c differ diff --git a/sam/misc/._md5.h b/sam/misc/._md5.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._md5.h differ diff --git a/sam/misc/._md5fa.c b/sam/misc/._md5fa.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._md5fa.c differ diff --git a/sam/misc/._novo2sam.pl b/sam/misc/._novo2sam.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._novo2sam.pl differ diff --git a/sam/misc/._plot-bamcheck b/sam/misc/._plot-bamcheck new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._plot-bamcheck differ diff --git a/sam/misc/._psl2sam.pl b/sam/misc/._psl2sam.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._psl2sam.pl differ diff --git a/sam/misc/._r2plot.lua b/sam/misc/._r2plot.lua new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._r2plot.lua differ diff --git a/sam/misc/._sam2vcf.pl b/sam/misc/._sam2vcf.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._sam2vcf.pl differ diff --git a/sam/misc/._samtools.pl b/sam/misc/._samtools.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._samtools.pl differ diff --git a/sam/misc/._soap2sam.pl b/sam/misc/._soap2sam.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._soap2sam.pl differ diff --git a/sam/misc/._varfilter.py b/sam/misc/._varfilter.py new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._varfilter.py differ diff --git a/sam/misc/._vcfutils.lua b/sam/misc/._vcfutils.lua new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._vcfutils.lua differ diff --git a/sam/misc/._wgsim.c b/sam/misc/._wgsim.c new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._wgsim.c differ diff --git a/sam/misc/._wgsim_eval.pl 
b/sam/misc/._wgsim_eval.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._wgsim_eval.pl differ diff --git a/sam/misc/._zoom2sam.pl b/sam/misc/._zoom2sam.pl new file mode 100755 index 0000000..94286bb Binary files /dev/null and b/sam/misc/._zoom2sam.pl differ diff --git a/sam/misc/Makefile b/sam/misc/Makefile index d2f8bd8..d36e7ac 100644 --- a/sam/misc/Makefile +++ b/sam/misc/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 #-m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= -D_FILE_OFFSET_BITS=64 OBJS= -PROG= md5sum-lite md5fa maq2sam-short maq2sam-long wgsim seqtk +PROG= md5sum-lite md5fa maq2sam-short maq2sam-long ace2sam wgsim bamcheck INCLUDES= -I.. SUBDIRS= . @@ -27,8 +27,14 @@ lib-recur all-recur clean-recur cleanlocal-recur install-recur: lib: -seqtk:seqtk.o - $(CC) $(CFLAGS) -o $@ seqtk.o -lm -lz +bamcheck:bamcheck.o + $(CC) $(CFLAGS) -o $@ bamcheck.o -L.. -lm -lbam -lpthread -lz + +bamcheck.o:bamcheck.c ../faidx.h ../khash.h ../sam.h ../razf.h + $(CC) $(CFLAGS) -c -I.. -o $@ bamcheck.c + +ace2sam:ace2sam.o + $(CC) $(CFLAGS) -o $@ ace2sam.o -lz wgsim:wgsim.o $(CC) $(CFLAGS) -o $@ wgsim.o -lm -lz @@ -51,12 +57,12 @@ maq2sam-long:maq2sam.c md5fa.o:md5.h md5fa.c $(CC) $(CFLAGS) -c -I.. -o $@ md5fa.c -seqtk.o:seqtk.c ../khash.h ../kseq.h - $(CC) $(CFLAGS) -c -I.. -o $@ seqtk.c - wgsim.o:wgsim.c ../kseq.h $(CC) $(CFLAGS) -c -I.. -o $@ wgsim.c +ace2sam.o:ace2sam.c ../kstring.h ../kseq.h + $(CC) $(CFLAGS) -c -I.. 
-o $@ ace2sam.c + cleanlocal: rm -fr gmon.out *.o a.out *.exe *.dSYM $(PROG) *~ *.a diff --git a/sam/misc/ace2sam.c b/sam/misc/ace2sam.c new file mode 100644 index 0000000..325133d --- /dev/null +++ b/sam/misc/ace2sam.c @@ -0,0 +1,249 @@ +/* The MIT License + + Copyright (c) 2011 Heng Li + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include "kstring.h" +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define N_TMPSTR 5 +#define LINE_LEN 60 + +// append a CIGAR operation plus length +#define write_cigar(_c, _n, _m, _v) do { \ + if (_n == _m) { \ + _m = _m? 
_m<<1 : 4; \ + _c = realloc(_c, _m * sizeof(unsigned)); \ + } \ + _c[_n++] = (_v); \ + } while (0) + +// a fatal error +static void fatal(const char *msg) +{ + fprintf(stderr, "E %s\n", msg); + exit(1); +} +// remove pads +static void remove_pads(const kstring_t *src, kstring_t *dst) +{ + int i, j; + dst->l = 0; + kputsn(src->s, src->l, dst); + for (i = j = 0; i < dst->l; ++i) + if (dst->s[i] != '*') dst->s[j++] = dst->s[i]; + dst->s[j] = 0; + dst->l = j; +} + +int main(int argc, char *argv[]) +{ + gzFile fp; + kstream_t *ks; + kstring_t s, t[N_TMPSTR]; + int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0; + long m_cigar = 0, n_cigar = 0; + unsigned *af, *cigar = 0; + + while ((c = getopt(argc, argv, "pc")) >= 0) { + switch (c) { + case 'p': is_padded = 1; break; + case 'c': write_cns = 1; break; + } + } + if (argc == optind) { + fprintf(stderr, "\nUsage: ace2sam [-pc] \n\n"); + fprintf(stderr, "Options: -p output padded SAM\n"); + fprintf(stderr, " -c write the contig sequence in SAM\n\n"); + fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); + fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n"); + fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); + fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n"); + return 1; + } + + s.l = s.m = 0; s.s = 0; + af_n = af_max = af_i = 0; af = 0; + for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; + fp = strcmp(argv[1], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + if (strcmp(s.s, "CO") == 0) { // contig sequence + kstring_t *cns; + t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line + af_n = af_i = 0; // reset the af array + ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name + ks_getuntil(ks, '\n', &s, &dret); // read the whole line + while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence + remove_pads(&t[1], &t[2]); // construct the unpadded sequence + // compute the array for mapping padded positions to unpadded positions + p2u = realloc(p2u, t[1].m * sizeof(int)); + for (i = k = 0; i < t[1].l; ++i) { + p2u[i] = k; + if (t[1].s[i] != '*') ++k; + } + // write out the SAM header and contig sequences + fprintf(stderr, "H @SQ\tSN:%s\tLN:%ld\n", t[0].s, t[is_padded?1:2].l); // The SAM header line + cns = &t[is_padded?1:2]; + fprintf(stderr, "S >%s\n", t[0].s); + for (i = 0; i < cns->l; i += LINE_LEN) { + fputs("S ", stderr); + for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) + fputc(cns->s[i + k], stderr); + fputc('\n', stderr); + } + +#define __padded2cigar(sp) do { \ + int i, l_M = 0, l_D = 0; \ + for (i = 0; i < sp.l; ++i) { \ + if (sp.s[i] == '*') { \ + if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ + ++l_D; l_M = 0; \ + } else { \ + if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ + ++l_M; l_D = 0; \ + } \ + } \ + if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ + else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ + } while (0) + + if (write_cns) { // write the consensus SAM line (dummy read) + n_cigar = 0; + if (is_padded) __padded2cigar(t[1]); + else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4); + kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", 
&t[4]); + for (i = 0; i < n_cigar; ++i) { + kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); + } + kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]); + } + } else if (strcmp(s.s, "BQ") == 0) { // contig quality + if (t[0].l == 0) fatal("come to 'BQ' before reading 'CO'"); + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire "BQ" line + if (write_cns) t[4].s[--t[4].l] = 0; // remove the trailing "*" + for (i = 0; i < t[2].l; ++i) { // read the consensus quality + int q; + if (ks_getuntil(ks, 0, &s, &dret) < 0) fprintf(stderr, "E truncated contig quality\n"); + if (s.l) { + q = atoi(s.s) + 33; + if (q > 126) q = 126; + if (write_cns) kputc(q, &t[4]); + } else --i; + } + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + ks_getuntil(ks, '\n', &s, &dret); // skip the empty line + if (write_cns) puts(t[4].s); t[4].l = 0; + } else if (strcmp(s.s, "AF") == 0) { // padded read position + int reversed, neg, pos; + if (t[0].l == 0) fatal("come to 'AF' before reading 'CO'"); + if (write_cns) { + if (t[4].l) puts(t[4].s); + t[4].l = 0; + } + ks_getuntil(ks, 0, &s, &dret); // read name + ks_getuntil(ks, 0, &s, &dret); reversed = s.s[0] == 'C'? 1 : 0; // strand + ks_getuntil(ks, 0, &s, &dret); pos = atoi(s.s); neg = pos < 0? 1 : 0; pos = pos < 0? -pos : pos; // position + if (af_n == af_max) { // double the af array + af_max = af_max? 
af_max<<1 : 4; + af = realloc(af, af_max * sizeof(unsigned)); + } + af[af_n++] = pos << 2 | neg << 1 | reversed; // keep the placement information + } else if (strcmp(s.s, "RD") == 0) { // read sequence + if (af_i >= af_n) fatal("more 'RD' records than 'AF'"); + t[2].l = t[3].l = t[4].l = 0; + ks_getuntil(ks, 0, &t[4], &dret); // QNAME + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); // read the entire RD line + while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputs(s.s, &t[2]); // read the read sequence + } else if (strcmp(s.s, "QA") == 0) { // clipping + if (af_i >= af_n) fatal("more 'QA' records than 'AF'"); + int beg, end, pos, op; + ks_getuntil(ks, 0, &s, &dret); ks_getuntil(ks, 0, &s, &dret); // skip quality clipping + ks_getuntil(ks, 0, &s, &dret); beg = atoi(s.s) - 1; // align clipping start + ks_getuntil(ks, 0, &s, &dret); end = atoi(s.s); // clipping end + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + // compute 1-based POS + pos = af[af_i]>>2; // retrieve the position information + if (af[af_i]>>1&1) pos = -pos; + pos += beg; // now pos is the true padded position + // generate CIGAR + remove_pads(&t[2], &t[3]); // backup the unpadded read sequence + n_cigar = 0; + if (beg) write_cigar(cigar, n_cigar, m_cigar, beg<<4|4); + if (is_padded) { + __padded2cigar(t[2]); + if (beg && n_cigar > 1) cigar[1] -= beg<<4; // fix the left-hand CIGAR + if (end < t[2].l && n_cigar) cigar[n_cigar-1] -= (t[2].l - end)<<4; // fix the right-hand CIGAR + } else { + // generate flattened CIGAR string + for (i = beg, k = pos - 1; i < end; ++i, ++k) + t[2].s[i] = t[2].s[i] != '*'? (t[1].s[k] != '*'? 0 : 1) : (t[1].s[k] != '*'? 
2 : 6); + // generate the proper CIGAR + for (i = beg + 1, k = 1, op = t[2].s[beg]; i < end; ++i) { + if (op != t[2].s[i]) { + write_cigar(cigar, n_cigar, m_cigar, k<<4|op); + op = t[2].s[i]; k = 1; + } else ++k; + } + write_cigar(cigar, n_cigar, m_cigar, k<<4|op); + // remove unnecessary "P" and possibly merge adjacent operations + for (i = 2; i < n_cigar; ++i) { + if ((cigar[i]&0xf) != 1 && (cigar[i-1]&0xf) == 6 && (cigar[i-2]&0xf) != 1) { + cigar[i-1] = 0; + if ((cigar[i]&0xf) == (cigar[i-2]&0xf)) // merge operations + cigar[i] += cigar[i-2], cigar[i-2] = 0; + } + } + for (i = k = 0; i < n_cigar; ++i) // squeeze out dumb operations + if (cigar[i]) cigar[k++] = cigar[i]; + n_cigar = k; + } + if (end < t[2].l) write_cigar(cigar, n_cigar, m_cigar, (t[2].l - end)<<4|4); + // write the SAM line for the read + kputc('\t', &t[4]); // QNAME has already been written + kputw((af[af_i]&1)? 16 : 0, &t[4]); kputc('\t', &t[4]); // FLAG + kputsn(t[0].s, t[0].l, &t[4]); kputc('\t', &t[4]); // RNAME + kputw(is_padded? 
pos : p2u[pos-1]+1, &t[4]); // POS + kputs("\t60\t", &t[4]); // MAPQ + for (i = 0; i < n_cigar; ++i) { // CIGAR + kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); + } + kputs("\t*\t0\t0\t", &t[4]); // empty MRNM, MPOS and TLEN + kputsn(t[3].s, t[3].l, &t[4]); // unpadded SEQ + kputs("\t*", &t[4]); // QUAL + puts(t[4].s); // print to stdout + ++af_i; + } else if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ks_destroy(ks); + gzclose(fp); + free(af); free(s.s); free(cigar); free(p2u); + for (i = 0; i < N_TMPSTR; ++i) free(t[i].s); + return 0; +} diff --git a/sam/misc/bamcheck.c b/sam/misc/bamcheck.c new file mode 100644 index 0000000..352db21 --- /dev/null +++ b/sam/misc/bamcheck.c @@ -0,0 +1,1521 @@ +/* + Author: petr.danecek@sanger + gcc -Wall -Winline -g -O2 -I ~/git/samtools bamcheck.c -o bamcheck -lm -lz -L ~/git/samtools -lbam -lpthread + + Assumptions, approximations and other issues: + - GC-depth graph does not split reads, the starting position determines which bin is incremented. + There are small overlaps between bins (max readlen-1). However, the bins are big (20k). + - coverage distribution ignores softclips and deletions + - some stats require sorted BAMs + - GC content graph can have an untidy, step-like pattern when BAM contains multiple read lengths. + - 'bases mapped' (stats->nbases_mapped) is calculated from read lengths given by BAM (core.l_qseq) + - With the -t option, the whole reads are used. Except for the number of mapped bases (cigar) + counts, no splicing is done, no indels or soft clips are considered, even small overlap is + good enough to include the read in the stats. 
+ +*/ + +#define BAMCHECK_VERSION "2012-09-04" + +#define _ISOC99_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "faidx.h" +#include "khash.h" +#include "sam.h" +#include "sam_header.h" +#include "razf.h" + +#define BWA_MIN_RDLEN 35 +#define IS_PAIRED(bam) ((bam)->core.flag&BAM_FPAIRED && !((bam)->core.flag&BAM_FUNMAP) && !((bam)->core.flag&BAM_FMUNMAP)) +#define IS_UNMAPPED(bam) ((bam)->core.flag&BAM_FUNMAP) +#define IS_REVERSE(bam) ((bam)->core.flag&BAM_FREVERSE) +#define IS_MATE_REVERSE(bam) ((bam)->core.flag&BAM_FMREVERSE) +#define IS_READ1(bam) ((bam)->core.flag&BAM_FREAD1) +#define IS_READ2(bam) ((bam)->core.flag&BAM_FREAD2) +#define IS_DUP(bam) ((bam)->core.flag&BAM_FDUP) + +typedef struct +{ + int32_t line_len, line_blen; + int64_t len; + uint64_t offset; +} +faidx1_t; +KHASH_MAP_INIT_STR(kh_faidx, faidx1_t) +KHASH_MAP_INIT_STR(kh_bam_tid, int) +KHASH_MAP_INIT_STR(kh_rg, const char *) +struct __faidx_t { + RAZF *rz; + int n, m; + char **name; + khash_t(kh_faidx) *hash; +}; + +typedef struct +{ + float gc; + uint32_t depth; +} +gc_depth_t; + +// For coverage distribution, a simple pileup +typedef struct +{ + int64_t pos; + int size, start; + int *buffer; +} +round_buffer_t; + +typedef struct { uint32_t from, to; } pos_t; +typedef struct +{ + int npos,mpos,cpos; + pos_t *pos; +} +regions_t; + +typedef struct +{ + // Parameters + int trim_qual; // bwa trim quality + + // Dimensions of the quality histogram holder (quals_1st,quals_2nd), GC content holder (gc_1st,gc_2nd), + // insert size histogram holder + int nquals; // The number of quality bins + int nbases; // The maximum sequence length the allocated array can hold + int nisize; // The maximum insert size that the allocated array can hold + int ngc; // The size of gc_1st and gc_2nd + int nindels; // The maximum indel length for indel distribution + + // Arrays for the histogram data + uint64_t *quals_1st, *quals_2nd; + uint64_t *gc_1st, *gc_2nd; + 
uint64_t *isize_inward, *isize_outward, *isize_other; + uint64_t *acgt_cycles; + uint64_t *read_lengths; + uint64_t *insertions, *deletions; + uint64_t *ins_cycles_1st, *ins_cycles_2nd, *del_cycles_1st, *del_cycles_2nd; + + // The extremes encountered + int max_len; // Maximum read length + int max_qual; // Maximum quality + float isize_main_bulk; // There are always some unrealistically big insert sizes, report only the main part + int is_sorted; + + // Summary numbers + uint64_t total_len; + uint64_t total_len_dup; + uint64_t nreads_1st; + uint64_t nreads_2nd; + uint64_t nreads_filtered; + uint64_t nreads_dup; + uint64_t nreads_unmapped; + uint64_t nreads_unpaired; + uint64_t nreads_paired; + uint64_t nreads_anomalous; + uint64_t nreads_mq0; + uint64_t nbases_mapped; + uint64_t nbases_mapped_cigar; + uint64_t nbases_trimmed; // bwa trimmed bases + uint64_t nmismatches; + uint64_t nreads_QCfailed, nreads_secondary; + + // GC-depth related data + uint32_t ngcd, igcd; // The maximum number of GC depth bins and index of the current bin + gc_depth_t *gcd; // The GC-depth bins holder + int gcd_bin_size; // The size of GC-depth bin + uint32_t gcd_ref_size; // The approximate size of the genome + int32_t tid, gcd_pos; // Position of the current bin + int32_t pos; // Position of the last read + + // Coverage distribution related data + int ncov; // The number of coverage bins + uint64_t *cov; // The coverage frequencies + int cov_min,cov_max,cov_step; // Minimum, maximum coverage and size of the coverage bins + round_buffer_t cov_rbuf; // Pileup round buffer + + // Mismatches by read cycle + uint8_t *rseq_buf; // A buffer for reference sequence to check the mismatches against + int mrseq_buf; // The size of the buffer + int32_t rseq_pos; // The coordinate of the first base in the buffer + int32_t nrseq_buf; // The used part of the buffer + uint64_t *mpc_buf; // Mismatches per cycle + + // Filters + int filter_readlen; + + // Target regions + int nregions, reg_from,reg_to; 
+ regions_t *regions; + + // Auxiliary data + int flag_require, flag_filter; + double sum_qual; // For calculating average quality value + samfile_t *sam; + khash_t(kh_rg) *rg_hash; // Read groups to include, the array is null-terminated + faidx_t *fai; // Reference sequence for GC-depth graph + int argc; // Command line arguments to be printed on the output + char **argv; +} +stats_t; + +void error(const char *format, ...); +void bam_init_header_hash(bam_header_t *header); +int is_in_regions(bam1_t *bam_line, stats_t *stats); + + +// Coverage distribution methods +inline int coverage_idx(int min, int max, int n, int step, int depth) +{ + if ( depth < min ) + return 0; + + if ( depth > max ) + return n-1; + + return 1 + (depth - min) / step; +} + +inline int round_buffer_lidx2ridx(int offset, int size, int64_t refpos, int64_t pos) +{ + return (offset + (pos-refpos) % size) % size; +} + +void round_buffer_flush(stats_t *stats, int64_t pos) +{ + int ibuf,idp; + + if ( pos==stats->cov_rbuf.pos ) + return; + + int64_t new_pos = pos; + if ( pos==-1 || pos - stats->cov_rbuf.pos >= stats->cov_rbuf.size ) + { + // Flush the whole buffer, but in sequential order, + pos = stats->cov_rbuf.pos + stats->cov_rbuf.size - 1; + } + + if ( pos < stats->cov_rbuf.pos ) + error("Expected coordinates in ascending order, got %ld after %ld\n", pos,stats->cov_rbuf.pos); + + int ifrom = stats->cov_rbuf.start; + int ito = round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos-1); + if ( ifrom>ito ) + { + for (ibuf=ifrom; ibufcov_rbuf.size; ibuf++) + { + if ( !stats->cov_rbuf.buffer[ibuf] ) + continue; + idp = coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); + stats->cov[idp]++; + stats->cov_rbuf.buffer[ibuf] = 0; + } + ifrom = 0; + } + for (ibuf=ifrom; ibuf<=ito; ibuf++) + { + if ( !stats->cov_rbuf.buffer[ibuf] ) + continue; + idp = 
coverage_idx(stats->cov_min,stats->cov_max,stats->ncov,stats->cov_step,stats->cov_rbuf.buffer[ibuf]); + stats->cov[idp]++; + stats->cov_rbuf.buffer[ibuf] = 0; + } + stats->cov_rbuf.start = (new_pos==-1) ? 0 : round_buffer_lidx2ridx(stats->cov_rbuf.start,stats->cov_rbuf.size,stats->cov_rbuf.pos,pos); + stats->cov_rbuf.pos = new_pos; +} + +void round_buffer_insert_read(round_buffer_t *rbuf, int64_t from, int64_t to) +{ + if ( to-from >= rbuf->size ) + error("The read length too big (%d), please increase the buffer length (currently %d)\n", to-from+1,rbuf->size); + if ( from < rbuf->pos ) + error("The reads are not sorted (%ld comes after %ld).\n", from,rbuf->pos); + + int ifrom,ito,ibuf; + ifrom = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,from); + ito = round_buffer_lidx2ridx(rbuf->start,rbuf->size,rbuf->pos,to); + if ( ifrom>ito ) + { + for (ibuf=ifrom; ibufsize; ibuf++) + rbuf->buffer[ibuf]++; + ifrom = 0; + } + for (ibuf=ifrom; ibuf<=ito; ibuf++) + rbuf->buffer[ibuf]++; +} + +// Calculate the number of bases in the read trimmed by BWA +int bwa_trim_read(int trim_qual, uint8_t *quals, int len, int reverse) +{ + if ( lenmax_sum ) + { + max_sum = sum; + // This is the correct way, but bwa clips from some reason one base less + // max_l = l+1; + max_l = l; + } + } + return max_l; +} + + +void count_indels(stats_t *stats,bam1_t *bam_line) +{ + int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; + int is_1st = IS_READ1(bam_line) ? 1 : 0; + int icig; + int icycle = 0; + int read_len = bam_line->core.l_qseq; + for (icig=0; icigcore.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; + + if ( cig==1 ) + { + int idx = is_fwd ? 
icycle : read_len-icycle-ncig; + if ( idx<0 ) + error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); + if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + if ( is_1st ) + stats->ins_cycles_1st[idx]++; + else + stats->ins_cycles_2nd[idx]++; + icycle += ncig; + if ( ncig<=stats->nindels ) + stats->insertions[ncig-1]++; + continue; + } + if ( cig==2 ) + { + int idx = is_fwd ? icycle-1 : read_len-icycle-1; + if ( idx<0 ) continue; // discard meaningless deletions + if ( idx >= stats->nbases ) error("FIXME: %d vs %d\n", idx,stats->nbases); + if ( is_1st ) + stats->del_cycles_1st[idx]++; + else + stats->del_cycles_2nd[idx]++; + if ( ncig<=stats->nindels ) + stats->deletions[ncig-1]++; + continue; + } + if ( cig!=3 && cig!=5 ) + icycle += ncig; + } +} + +void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line) +{ + int is_fwd = IS_REVERSE(bam_line) ? 0 : 1; + int icig,iread=0,icycle=0; + int iref = bam_line->core.pos - stats->rseq_pos; + int read_len = bam_line->core.l_qseq; + uint8_t *read = bam1_seq(bam_line); + uint8_t *quals = bam1_qual(bam_line); + uint64_t *mpc_buf = stats->mpc_buf; + for (icig=0; icigcore.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(bam_line)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==1 ) + { + iread += ncig; + icycle += ncig; + continue; + } + if ( cig==2 ) + { + iref += ncig; + continue; + } + if ( cig==4 ) + { + icycle += ncig; + // Soft-clips are present in the sequence, but the position of the read marks a start of non-clipped sequence + // iref += ncig; + iread += ncig; + continue; + } + if ( cig==5 ) + { + icycle += ncig; + continue; + } + // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large + // chunk of refseq in memory. 
Not very frequent and not noticable in the stats. + if ( cig==3 || cig==5 ) continue; + if ( cig!=0 ) + error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + + if ( ncig+iref > stats->nrseq_buf ) + error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1); + + int im; + for (im=0; imrseq_buf[iref]; + + // ---------------15 + // =ACMGRSVTWYHKDBN + if ( cread==15 ) + { + int idx = is_fwd ? icycle : read_len-icycle-1; + if ( idx>stats->max_len ) + error("mpc: %d>%d\n",idx,stats->max_len); + idx = idx*stats->nquals; + if ( idx>=stats->nquals*stats->nbases ) + error("FIXME: mpc_buf overflow\n"); + mpc_buf[idx]++; + } + else if ( cref && cread && cref!=cread ) + { + uint8_t qual = quals[iread] + 1; + if ( qual>=stats->nquals ) + error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + + int idx = is_fwd ? icycle : read_len-icycle-1; + if ( idx>stats->max_len ) + error("mpc: %d>%d\n",idx,stats->max_len); + + idx = idx*stats->nquals + qual; + if ( idx>=stats->nquals*stats->nbases ) + error("FIXME: mpc_buf overflow\n"); + mpc_buf[idx]++; + } + + iref++; + iread++; + icycle++; + } + } +} + +void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos) +{ + khash_t(kh_faidx) *h; + khiter_t iter; + faidx1_t val; + char *chr, c; + faidx_t *fai = stats->fai; + + h = fai->hash; + chr = stats->sam->header->target_name[tid]; + + // ID of the sequence name + iter = kh_get(kh_faidx, h, chr); + if (iter == kh_end(h)) + error("No such reference sequence [%s]?\n", chr); + val = kh_value(h, iter); + + // Check the boundaries + if (pos >= val.len) + error("Was the bam file mapped with the reference sequence supplied?" 
+ " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len); + int size = stats->mrseq_buf; + // The buffer extends beyond the chromosome end. Later the rest will be filled with N's. + if (size+pos > val.len) size = val.len-pos; + + // Position the razf reader + razf_seek(fai->rz, val.offset + pos / val.line_blen * val.line_len + pos % val.line_blen, SEEK_SET); + + uint8_t *ptr = stats->rseq_buf; + int nread = 0; + while ( nreadrz,&c,1) && !fai->rz->z_err ) + { + if ( !isgraph(c) ) + continue; + + // Conversion between uint8_t coding and ACGT + // -12-4---8------- + // =ACMGRSVTWYHKDBN + if ( c=='A' || c=='a' ) + *ptr = 1; + else if ( c=='C' || c=='c' ) + *ptr = 2; + else if ( c=='G' || c=='g' ) + *ptr = 4; + else if ( c=='T' || c=='t' ) + *ptr = 8; + else + *ptr = 0; + ptr++; + nread++; + } + if ( nread < stats->mrseq_buf ) + { + memset(ptr,0, stats->mrseq_buf - nread); + nread = stats->mrseq_buf; + } + stats->nrseq_buf = nread; + stats->rseq_pos = pos; + stats->tid = tid; +} + +float fai_gc_content(stats_t *stats, int pos, int len) +{ + uint32_t gc,count,c; + int i = pos - stats->rseq_pos, ito = i + len; + assert( i>=0 && ito<=stats->nrseq_buf ); + + // Count GC content + gc = count = 0; + for (; irseq_buf[i]; + if ( c==2 || c==4 ) + { + gc++; + count++; + } + else if ( c==1 || c==8 ) + count++; + } + return count ? 
(float)gc/count : 0; +} + +void realloc_rseq_buffer(stats_t *stats) +{ + int n = stats->nbases*10; + if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size; + if ( stats->mrseq_bufrseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n); + stats->mrseq_buf = n; + } +} + +void realloc_gcd_buffer(stats_t *stats, int seq_len) +{ + if ( seq_len >= stats->gcd_bin_size ) + error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len); + + int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len); + if ( n <= stats->igcd ) + error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n"); + + if ( n > stats->ngcd ) + { + stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t)); + if ( !stats->gcd ) + error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n); + memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t)); + stats->ngcd = n; + } + + realloc_rseq_buffer(stats); +} + +void realloc_buffers(stats_t *stats, int seq_len) +{ + int n = 2*(1 + seq_len - stats->nbases) + stats->nbases; + + stats->quals_1st = realloc(stats->quals_1st, n*stats->nquals*sizeof(uint64_t)); + if ( !stats->quals_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->quals_1st + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + + stats->quals_2nd = realloc(stats->quals_2nd, n*stats->nquals*sizeof(uint64_t)); + if ( !stats->quals_2nd ) + error("Could not realloc buffers, the sequence too long: %d (2x%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->quals_2nd + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + + if ( stats->mpc_buf ) + { + stats->mpc_buf = realloc(stats->mpc_buf, n*stats->nquals*sizeof(uint64_t)); + if ( !stats->mpc_buf ) + 
error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*stats->nquals*sizeof(uint64_t)); + memset(stats->mpc_buf + stats->nbases*stats->nquals, 0, (n-stats->nbases)*stats->nquals*sizeof(uint64_t)); + } + + stats->acgt_cycles = realloc(stats->acgt_cycles, n*4*sizeof(uint64_t)); + if ( !stats->acgt_cycles ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*4*sizeof(uint64_t)); + memset(stats->acgt_cycles + stats->nbases*4, 0, (n-stats->nbases)*4*sizeof(uint64_t)); + + stats->read_lengths = realloc(stats->read_lengths, n*sizeof(uint64_t)); + if ( !stats->read_lengths ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->read_lengths + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->insertions = realloc(stats->insertions, n*sizeof(uint64_t)); + if ( !stats->insertions ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->insertions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->deletions = realloc(stats->deletions, n*sizeof(uint64_t)); + if ( !stats->deletions ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,n*sizeof(uint64_t)); + memset(stats->deletions + stats->nbases, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->ins_cycles_1st = realloc(stats->ins_cycles_1st, (n+1)*sizeof(uint64_t)); + if ( !stats->ins_cycles_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->ins_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->ins_cycles_2nd = realloc(stats->ins_cycles_2nd, (n+1)*sizeof(uint64_t)); + if ( !stats->ins_cycles_2nd ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->ins_cycles_2nd + stats->nbases + 1, 0, 
(n-stats->nbases)*sizeof(uint64_t)); + + stats->del_cycles_1st = realloc(stats->del_cycles_1st, (n+1)*sizeof(uint64_t)); + if ( !stats->del_cycles_1st ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->del_cycles_1st + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->del_cycles_2nd = realloc(stats->del_cycles_2nd, (n+1)*sizeof(uint64_t)); + if ( !stats->del_cycles_2nd ) + error("Could not realloc buffers, the sequence too long: %d (%ld)\n", seq_len,(n+1)*sizeof(uint64_t)); + memset(stats->del_cycles_2nd + stats->nbases + 1, 0, (n-stats->nbases)*sizeof(uint64_t)); + + stats->nbases = n; + + // Realloc the coverage distribution buffer + int *rbuffer = calloc(sizeof(int),seq_len*5); + n = stats->cov_rbuf.size-stats->cov_rbuf.start; + memcpy(rbuffer,stats->cov_rbuf.buffer+stats->cov_rbuf.start,n); + if ( stats->cov_rbuf.start>1 ) + memcpy(rbuffer+n,stats->cov_rbuf.buffer,stats->cov_rbuf.start); + stats->cov_rbuf.start = 0; + free(stats->cov_rbuf.buffer); + stats->cov_rbuf.buffer = rbuffer; + stats->cov_rbuf.size = seq_len*5; + + realloc_rseq_buffer(stats); +} + +void collect_stats(bam1_t *bam_line, stats_t *stats) +{ + if ( stats->rg_hash ) + { + const uint8_t *rg = bam_aux_get(bam_line, "RG"); + if ( !rg ) return; + khiter_t k = kh_get(kh_rg, stats->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(stats->rg_hash) ) return; + } + if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require ) + { + stats->nreads_filtered++; + return; + } + if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) ) + { + stats->nreads_filtered++; + return; + } + if ( !is_in_regions(bam_line,stats) ) + return; + if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen ) + return; + + if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++; + if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++; + + int 
seq_len = bam_line->core.l_qseq; + if ( !seq_len ) return; + + if ( seq_len >= stats->nbases ) + realloc_buffers(stats,seq_len); + if ( stats->max_lenmax_len = seq_len; + + stats->read_lengths[seq_len]++; + + // Count GC and ACGT per cycle + uint8_t base, *seq = bam1_seq(bam_line); + int gc_count = 0; + int i; + int reverse = IS_REVERSE(bam_line); + for (i=0; i2 ) base=3; + if ( 4*(reverse ? seq_len-i-1 : i) + base >= stats->nbases*4 ) + error("FIXME: acgt_cycles\n"); + stats->acgt_cycles[ 4*(reverse ? seq_len-i-1 : i) + base ]++; + } + int gc_idx_min = gc_count*(stats->ngc-1)/seq_len; + int gc_idx_max = (gc_count+1)*(stats->ngc-1)/seq_len; + if ( gc_idx_max >= stats->ngc ) gc_idx_max = stats->ngc - 1; + + // Determine which array (1st or 2nd read) will these stats go to, + // trim low quality bases from end the same way BWA does, + // fill GC histogram + uint64_t *quals; + uint8_t *bam_quals = bam1_qual(bam_line); + if ( bam_line->core.flag&BAM_FREAD2 ) + { + quals = stats->quals_2nd; + stats->nreads_2nd++; + for (i=gc_idx_min; igc_2nd[i]++; + } + else + { + quals = stats->quals_1st; + stats->nreads_1st++; + for (i=gc_idx_min; igc_1st[i]++; + } + if ( stats->trim_qual>0 ) + stats->nbases_trimmed += bwa_trim_read(stats->trim_qual, bam_quals, seq_len, reverse); + + // Quality histogram and average quality + for (i=0; i=stats->nquals ) + error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); + if ( qual>stats->max_qual ) + stats->max_qual = qual; + + quals[ i*stats->nquals+qual ]++; + stats->sum_qual += qual; + } + + // Look at the flags and increment appropriate counters (mapped, paired, etc) + if ( IS_UNMAPPED(bam_line) ) + stats->nreads_unmapped++; + else + { + if ( !bam_line->core.qual ) + stats->nreads_mq0++; + + count_indels(stats,bam_line); + + if ( !IS_PAIRED(bam_line) ) + stats->nreads_unpaired++; + else + { + stats->nreads_paired++; + + if ( 
bam_line->core.tid!=bam_line->core.mtid ) + stats->nreads_anomalous++; + + // The insert size is tricky, because for long inserts the libraries are + // prepared differently and the pairs point in other direction. BWA does + // not set the paired flag for them. Similar thing is true also for 454 + // reads. Mates mapped to different chromosomes have isize==0. + int32_t isize = bam_line->core.isize; + if ( isize<0 ) isize = -isize; + if ( isize >= stats->nisize ) + isize = stats->nisize-1; + if ( isize>0 || bam_line->core.tid==bam_line->core.mtid ) + { + int pos_fst = bam_line->core.mpos - bam_line->core.pos; + int is_fst = IS_READ1(bam_line) ? 1 : -1; + int is_fwd = IS_REVERSE(bam_line) ? -1 : 1; + int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1; + + if ( is_fwd*is_mfwd>0 ) + stats->isize_other[isize]++; + else if ( is_fst*pos_fst>0 ) + { + if ( is_fst*is_fwd>0 ) + stats->isize_inward[isize]++; + else + stats->isize_outward[isize]++; + } + else if ( is_fst*pos_fst<0 ) + { + if ( is_fst*is_fwd>0 ) + stats->isize_outward[isize]++; + else + stats->isize_inward[isize]++; + } + } + } + + // Number of mismatches + uint8_t *nm = bam_aux_get(bam_line,"NM"); + if (nm) + stats->nmismatches += bam_aux2i(nm); + + // Number of mapped bases from cigar + // Conversion from uint32_t to MIDNSHP + // 012-4-- + // MIDNSHP + if ( bam_line->core.n_cigar == 0) + error("FIXME: mapped read with no cigar?\n"); + int readlen=seq_len; + if ( stats->regions ) + { + // Count only on-target bases + int iref = bam_line->core.pos + 1; + for (i=0; icore.n_cigar; i++) + { + int cig = bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK; + int ncig = bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + if ( cig==2 ) readlen += ncig; + else if ( cig==0 ) + { + if ( iref < stats->reg_from ) ncig -= stats->reg_from-iref; + else if ( iref+ncig-1 > stats->reg_to ) ncig -= iref+ncig-1 - stats->reg_to; + if ( ncig<0 ) ncig = 0; + stats->nbases_mapped_cigar += ncig; + iref += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + } + else 
if ( cig==1 ) + { + iref += ncig; + if ( iref>=stats->reg_from && iref<=stats->reg_to ) + stats->nbases_mapped_cigar += ncig; + } + } + } + else + { + // Count the whole read + for (i=0; icore.n_cigar; i++) + { + if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==0 || (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==1 ) + stats->nbases_mapped_cigar += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + if ( (bam1_cigar(bam_line)[i]&BAM_CIGAR_MASK)==2 ) + readlen += bam1_cigar(bam_line)[i]>>BAM_CIGAR_SHIFT; + } + } + stats->nbases_mapped += seq_len; + + if ( stats->tid==bam_line->core.tid && bam_line->core.pospos ) + stats->is_sorted = 0; + stats->pos = bam_line->core.pos; + + if ( stats->is_sorted ) + { + if ( stats->tid==-1 || stats->tid!=bam_line->core.tid ) + round_buffer_flush(stats,-1); + + // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins + // are not splitted which results in up to seq_len-1 overlaps. The default bin size is + // 20kbp, so the effect is negligible. 
+ if ( stats->fai ) + { + int inc_ref = 0, inc_gcd = 0; + // First pass or new chromosome + if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; } + // Read goes beyond the end of the rseq buffer + else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; } + // Read overlaps the next gcd bin + else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen ) + { + inc_gcd = 1; + if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1; + } + if ( inc_gcd ) + { + stats->igcd++; + if ( stats->igcd >= stats->ngcd ) + realloc_gcd_buffer(stats, readlen); + if ( inc_ref ) + read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos); + stats->gcd_pos = bam_line->core.pos; + stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size); + } + + count_mismatches_per_cycle(stats,bam_line); + } + // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin + else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size ) + { + // First pass or a new chromosome + stats->tid = bam_line->core.tid; + stats->gcd_pos = bam_line->core.pos; + stats->igcd++; + if ( stats->igcd >= stats->ngcd ) + realloc_gcd_buffer(stats, readlen); + } + stats->gcd[ stats->igcd ].depth++; + // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK) + if ( !stats->fai ) + stats->gcd[ stats->igcd ].gc += (float) gc_count / seq_len; + + // Coverage distribution graph + round_buffer_flush(stats,bam_line->core.pos); + round_buffer_insert_read(&(stats->cov_rbuf),bam_line->core.pos,bam_line->core.pos+seq_len-1); + } + } + + stats->total_len += seq_len; + if ( IS_DUP(bam_line) ) + { + stats->total_len_dup += seq_len; + stats->nreads_dup++; + } +} + +// Sort by GC and depth +#define GCD_t(x) ((gc_depth_t *)x) +static 
int gcd_cmp(const void *a, const void *b) +{ + if ( GCD_t(a)->gc < GCD_t(b)->gc ) return -1; + if ( GCD_t(a)->gc > GCD_t(b)->gc ) return 1; + if ( GCD_t(a)->depth < GCD_t(b)->depth ) return -1; + if ( GCD_t(a)->depth > GCD_t(b)->depth ) return 1; + return 0; +} +#undef GCD_t + +float gcd_percentile(gc_depth_t *gcd, int N, int p) +{ + float n,d; + int k; + + n = p*(N+1)/100; + k = n; + if ( k<=0 ) + return gcd[0].depth; + if ( k>=N ) + return gcd[N-1].depth; + + d = n - k; + return gcd[k-1].depth + d*(gcd[k].depth - gcd[k-1].depth); +} + +void output_stats(stats_t *stats) +{ + // Calculate average insert size and standard deviation (from the main bulk data only) + int isize, ibulk=0; + uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0; + for (isize=0; isizenisize; isize++) + { + // Each pair was counted twice + stats->isize_inward[isize] *= 0.5; + stats->isize_outward[isize] *= 0.5; + stats->isize_other[isize] *= 0.5; + + nisize_inward += stats->isize_inward[isize]; + nisize_outward += stats->isize_outward[isize]; + nisize_other += stats->isize_other[isize]; + nisize += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; + } + + double bulk=0, avg_isize=0, sd_isize=0; + for (isize=0; isizenisize; isize++) + { + bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]; + avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]); + + if ( bulk/nisize > stats->isize_main_bulk ) + { + ibulk = isize+1; + nisize = bulk; + break; + } + } + avg_isize /= nisize ? 
nisize : 1; + for (isize=1; isizeisize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]) * (isize-avg_isize)*(isize-avg_isize) / nisize; + sd_isize = sqrt(sd_isize); + + + printf("# This file was produced by bamcheck (%s)\n",BAMCHECK_VERSION); + printf("# The command line was: %s",stats->argv[0]); + int i; + for (i=1; iargc; i++) + printf(" %s",stats->argv[i]); + printf("\n"); + printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n"); + printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd)); + printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered); + printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd)); + printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0); + printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0); + printf("SN\t1st fragments:\t%ld\n", (long)stats->nreads_1st); + printf("SN\tlast fragments:\t%ld\n", (long)stats->nreads_2nd); + printf("SN\treads mapped:\t%ld\n", (long)(stats->nreads_paired+stats->nreads_unpaired)); + printf("SN\treads unmapped:\t%ld\n", (long)stats->nreads_unmapped); + printf("SN\treads unpaired:\t%ld\n", (long)stats->nreads_unpaired); + printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired); + printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup); + printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0); + printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed); + printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary); + printf("SN\ttotal length:\t%ld\n", (long)stats->total_len); + printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped); + printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar); + printf("SN\tbases trimmed:\t%ld\n", (long)stats->nbases_trimmed); + printf("SN\tbases duplicated:\t%ld\n", (long)stats->total_len_dup); + printf("SN\tmismatches:\t%ld\n", 
(long)stats->nmismatches); + printf("SN\terror rate:\t%e\n", (float)stats->nmismatches/stats->nbases_mapped_cigar); + float avg_read_length = (stats->nreads_1st+stats->nreads_2nd)?stats->total_len/(stats->nreads_1st+stats->nreads_2nd):0; + printf("SN\taverage length:\t%.0f\n", avg_read_length); + printf("SN\tmaximum length:\t%d\n", stats->max_len); + printf("SN\taverage quality:\t%.1f\n", stats->total_len?stats->sum_qual/stats->total_len:0); + printf("SN\tinsert size average:\t%.1f\n", avg_isize); + printf("SN\tinsert size standard deviation:\t%.1f\n", sd_isize); + printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward); + printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward); + printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other); + printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2); + + int ibase,iqual; + if ( stats->max_lennbases ) stats->max_len++; + if ( stats->max_qual+1nquals ) stats->max_qual++; + printf("# First Fragment Qualitites. Use `grep ^FFQ | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); + for (ibase=0; ibasemax_len; ibase++) + { + printf("FFQ\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", (long)stats->quals_1st[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + printf("# Last Fragment Qualitites. Use `grep ^LFQ | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n"); + for (ibase=0; ibasemax_len; ibase++) + { + printf("LFQ\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", (long)stats->quals_2nd[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + if ( stats->mpc_buf ) + { + printf("# Mismatches per cycle and quality. 
Use `grep ^MPC | cut -f 2-` to extract this part.\n"); + printf("# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n"); + printf("# is the number of N's and the rest is the number of mismatches\n"); + for (ibase=0; ibasemax_len; ibase++) + { + printf("MPC\t%d",ibase+1); + for (iqual=0; iqual<=stats->max_qual; iqual++) + { + printf("\t%ld", (long)stats->mpc_buf[ibase*stats->nquals+iqual]); + } + printf("\n"); + } + } + printf("# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n"); + int ibase_prev = 0; + for (ibase=0; ibasengc; ibase++) + { + if ( stats->gc_1st[ibase]==stats->gc_1st[ibase_prev] ) continue; + printf("GCF\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_1st[ibase_prev]); + ibase_prev = ibase; + } + printf("# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n"); + ibase_prev = 0; + for (ibase=0; ibasengc; ibase++) + { + if ( stats->gc_2nd[ibase]==stats->gc_2nd[ibase_prev] ) continue; + printf("GCL\t%.2f\t%ld\n", (ibase+ibase_prev)*0.5*100./(stats->ngc-1), (long)stats->gc_2nd[ibase_prev]); + ibase_prev = ibase; + } + printf("# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle, and A,C,G,T counts [%%]\n"); + for (ibase=0; ibasemax_len; ibase++) + { + uint64_t *ptr = &(stats->acgt_cycles[ibase*4]); + uint64_t sum = ptr[0]+ptr[1]+ptr[2]+ptr[3]; + if ( ! sum ) continue; + printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum); + } + printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n"); + for (isize=0; isizeisize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]), + (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]); + + printf("# Read lengths. 
Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n"); + int ilen; + for (ilen=0; ilenmax_len; ilen++) + { + if ( stats->read_lengths[ilen]>0 ) + printf("RL\t%d\t%ld\n", ilen, (long)stats->read_lengths[ilen]); + } + + printf("# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n"); + for (ilen=0; ilennindels; ilen++) + { + if ( stats->insertions[ilen]>0 || stats->deletions[ilen]>0 ) + printf("ID\t%d\t%ld\t%ld\n", ilen+1, (long)stats->insertions[ilen], (long)stats->deletions[ilen]); + } + + printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n"); + for (ilen=0; ilen<=stats->nbases; ilen++) + { + // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions + // the index of the cycle of the first inserted base (also 1-based) + if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 ) + printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]); + } + + printf("# Coverage distribution. 
Use `grep ^COV | cut -f 2-` to extract this part.\n"); + if ( stats->cov[0] ) + printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]); + int icov; + for (icov=1; icovncov-1; icov++) + if ( stats->cov[icov] ) + printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]); + if ( stats->cov[stats->ncov-1] ) + printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]); + + // Calculate average GC content, then sort by GC and depth + printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n"); + uint32_t igcd; + for (igcd=0; igcdigcd; igcd++) + { + if ( stats->fai ) + stats->gcd[igcd].gc = round(100. * stats->gcd[igcd].gc); + else + if ( stats->gcd[igcd].depth ) + stats->gcd[igcd].gc = round(100. 
* stats->gcd[igcd].gc / stats->gcd[igcd].depth); + } + qsort(stats->gcd, stats->igcd+1, sizeof(gc_depth_t), gcd_cmp); + igcd = 0; + while ( igcd < stats->igcd ) + { + // Calculate percentiles (10,25,50,75,90th) for the current GC content and print + uint32_t nbins=0, itmp=igcd; + float gc = stats->gcd[igcd].gc; + while ( itmpigcd && fabs(stats->gcd[itmp].gc-gc)<0.1 ) + { + nbins++; + itmp++; + } + printf("GCD\t%.1f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n", gc, (igcd+nbins+1)*100./(stats->igcd+1), + gcd_percentile(&(stats->gcd[igcd]),nbins,10) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,25) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,50) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,75) *avg_read_length/stats->gcd_bin_size, + gcd_percentile(&(stats->gcd[igcd]),nbins,90) *avg_read_length/stats->gcd_bin_size + ); + igcd += nbins; + } +} + +size_t mygetline(char **line, size_t *n, FILE *fp) +{ + if (line == NULL || n == NULL || fp == NULL) + { + errno = EINVAL; + return -1; + } + if (*n==0 || !*line) + { + *line = NULL; + *n = 0; + } + + size_t nread=0; + int c; + while ((c=getc(fp))!= EOF && c!='\n') + { + if ( ++nread>=*n ) + { + *n += 255; + *line = realloc(*line, sizeof(char)*(*n)); + } + (*line)[nread-1] = c; + } + if ( nread>=*n ) + { + *n += 255; + *line = realloc(*line, sizeof(char)*(*n)); + } + (*line)[nread] = 0; + return nread>0 ? 
nread : -1; + +} + +void init_regions(stats_t *stats, char *file) +{ + khiter_t iter; + khash_t(kh_bam_tid) *header_hash; + + bam_init_header_hash(stats->sam->header); + header_hash = (khash_t(kh_bam_tid)*)stats->sam->header->hash; + + FILE *fp = fopen(file,"r"); + if ( !fp ) error("%s: %s\n",file,strerror(errno)); + + char *line = NULL; + size_t len = 0; + ssize_t nread; + int warned = 0; + int prev_tid=-1, prev_pos=-1; + while ((nread = mygetline(&line, &len, fp)) != -1) + { + if ( line[0] == '#' ) continue; + + int i = 0; + while ( i=nread ) error("Could not parse the file: %s [%s]\n", file,line); + line[i] = 0; + + iter = kh_get(kh_bam_tid, header_hash, line); + int tid = kh_val(header_hash, iter); + if ( iter == kh_end(header_hash) ) + { + if ( !warned ) + fprintf(stderr,"Warning: Some sequences not present in the BAM, e.g. \"%s\". This message is printed only once.\n", line); + warned = 1; + continue; + } + + if ( tid >= stats->nregions ) + { + stats->regions = realloc(stats->regions,sizeof(regions_t)*(stats->nregions+100)); + int j; + for (j=stats->nregions; jnregions+100; j++) + { + stats->regions[j].npos = stats->regions[j].mpos = stats->regions[j].cpos = 0; + stats->regions[j].pos = NULL; + } + stats->nregions += 100; + } + int npos = stats->regions[tid].npos; + if ( npos >= stats->regions[tid].mpos ) + { + stats->regions[tid].mpos += 1000; + stats->regions[tid].pos = realloc(stats->regions[tid].pos,sizeof(pos_t)*stats->regions[tid].mpos); + } + + if ( (sscanf(line+i+1,"%d %d",&stats->regions[tid].pos[npos].from,&stats->regions[tid].pos[npos].to))!=2 ) error("Could not parse the region [%s]\n"); + if ( prev_tid==-1 || prev_tid!=tid ) + { + prev_tid = tid; + prev_pos = stats->regions[tid].pos[npos].from; + } + if ( prev_pos>stats->regions[tid].pos[npos].from ) + error("The positions are not in chromosomal order (%s:%d comes after %d)\n", line,stats->regions[tid].pos[npos].from,prev_pos); + stats->regions[tid].npos++; + } + if (line) free(line); + if ( 
!stats->regions ) error("Unable to map the -t sequences to the BAM sequences.\n"); + fclose(fp); +} + +void destroy_regions(stats_t *stats) +{ + int i; + for (i=0; inregions; i++) + { + if ( !stats->regions[i].mpos ) continue; + free(stats->regions[i].pos); + } + if ( stats->regions ) free(stats->regions); +} + +static int fetch_read(const bam1_t *bam_line, void *data) +{ + collect_stats((bam1_t*)bam_line,(stats_t*)data); + return 1; +} + +void reset_regions(stats_t *stats) +{ + int i; + for (i=0; inregions; i++) + stats->regions[i].cpos = 0; +} + +int is_in_regions(bam1_t *bam_line, stats_t *stats) +{ + if ( !stats->regions ) return 1; + + if ( bam_line->core.tid >= stats->nregions || bam_line->core.tid<0 ) return 0; + if ( !stats->is_sorted ) error("The BAM must be sorted in order for -t to work.\n"); + + regions_t *reg = &stats->regions[bam_line->core.tid]; + if ( reg->cpos==reg->npos ) return 0; // done for this chr + + // Find a matching interval or skip this read. No splicing of reads is done, no indels or soft clips considered, + // even small overlap is enough to include the read in the stats. 
+ int i = reg->cpos; + while ( inpos && reg->pos[i].to<=bam_line->core.pos ) i++; + if ( i>=reg->npos ) { reg->cpos = reg->npos; return 0; } + if ( bam_line->core.pos + bam_line->core.l_qseq + 1 < reg->pos[i].from ) return 0; + reg->cpos = i; + stats->reg_from = reg->pos[i].from; + stats->reg_to = reg->pos[i].to; + + return 1; +} + +void init_group_id(stats_t *stats, char *id) +{ + if ( !stats->sam->header->dict ) + stats->sam->header->dict = sam_header_parse2(stats->sam->header->text); + void *iter = stats->sam->header->dict; + const char *key, *val; + int n = 0; + stats->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(id,key) || (val && !strcmp(id,val)) ) + { + khiter_t k = kh_get(kh_rg, stats->rg_hash, key); + if ( k != kh_end(stats->rg_hash) ) + fprintf(stderr, "[init_group_id] The group ID not unique: \"%s\"\n", key); + int ret; + k = kh_put(kh_rg, stats->rg_hash, key, &ret); + kh_value(stats->rg_hash, k) = val; + n++; + } + } + if ( !n ) + error("The sample or read group \"%s\" not present.\n", id); +} + + +void error(const char *format, ...) +{ + if ( !format ) + { + printf("Version: %s\n", BAMCHECK_VERSION); + printf("About: The program collects statistics from BAM files. 
The output can be visualized using plot-bamcheck.\n"); + printf("Usage: bamcheck [OPTIONS] file.bam\n"); + printf(" bamcheck [OPTIONS] file.bam chr:from-to\n"); + printf("Options:\n"); + printf(" -c, --coverage ,, Coverage distribution min,max,step [1,1000,1]\n"); + printf(" -d, --remove-dups Exlude from statistics reads marked as duplicates\n"); + printf(" -f, --required-flag Required flag, 0 for unset [0]\n"); + printf(" -F, --filtering-flag Filtering flag, 0 for unset [0]\n"); + printf(" --GC-depth Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n"); + printf(" -h, --help This help message\n"); + printf(" -i, --insert-size Maximum insert size [8000]\n"); + printf(" -I, --id Include only listed read group or sample name\n"); + printf(" -l, --read-length Include in the statistics only reads with the given read length []\n"); + printf(" -m, --most-inserts Report only the main part of inserts [0.99]\n"); + printf(" -q, --trim-quality The BWA trimming parameter [0]\n"); + printf(" -r, --ref-seq Reference sequence (required for GC-depth calculation).\n"); + printf(" -t, --target-regions Do stats in these regions only. 
Tab-delimited file chr,from,to, 1-based, inclusive.\n"); + printf(" -s, --sam Input is SAM\n"); + printf("\n"); + } + else + { + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + } + exit(-1); +} + +int main(int argc, char *argv[]) +{ + char *targets = NULL; + char *bam_fname = NULL; + char *group_id = NULL; + samfile_t *sam = NULL; + char in_mode[5]; + + stats_t *stats = calloc(1,sizeof(stats_t)); + stats->ngc = 200; + stats->nquals = 256; + stats->nbases = 300; + stats->nisize = 8000; + stats->max_len = 30; + stats->max_qual = 40; + stats->isize_main_bulk = 0.99; // There are always outliers at the far end + stats->gcd_bin_size = 20e3; + stats->gcd_ref_size = 4.2e9; + stats->rseq_pos = -1; + stats->tid = stats->gcd_pos = -1; + stats->igcd = 0; + stats->is_sorted = 1; + stats->cov_min = 1; + stats->cov_max = 1000; + stats->cov_step = 1; + stats->argc = argc; + stats->argv = argv; + stats->filter_readlen = -1; + stats->nindels = stats->nbases; + + strcpy(in_mode, "rb"); + + static struct option loptions[] = + { + {"help",0,0,'h'}, + {"remove-dups",0,0,'d'}, + {"sam",0,0,'s'}, + {"ref-seq",1,0,'r'}, + {"coverage",1,0,'c'}, + {"read-length",1,0,'l'}, + {"insert-size",1,0,'i'}, + {"most-inserts",1,0,'m'}, + {"trim-quality",1,0,'q'}, + {"target-regions",0,0,'t'}, + {"required-flag",1,0,'f'}, + {"filtering-flag",0,0,'F'}, + {"id",1,0,'I'}, + {"GC-depth",1,0,1}, + {0,0,0,0} + }; + int opt; + while ( (opt=getopt_long(argc,argv,"?hdsr:c:l:i:t:m:q:f:F:I:1:",loptions,NULL))>0 ) + { + switch (opt) + { + case 'f': stats->flag_require=strtol(optarg,0,0); break; + case 'F': stats->flag_filter=strtol(optarg,0,0); break; + case 'd': stats->flag_filter|=BAM_FDUP; break; + case 's': strcpy(in_mode, "r"); break; + case 'r': stats->fai = fai_load(optarg); + if (stats->fai==0) + error("Could not load faidx: %s\n", optarg); + break; + case 1 : { + float flen,fbin; + if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 ) + error("Unable to parse --GC-depth %s\n", 
optarg); + stats->gcd_bin_size = fbin; + stats->gcd_ref_size = flen; + } + break; + case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 ) + error("Unable to parse -c %s\n", optarg); + break; + case 'l': stats->filter_readlen = atoi(optarg); break; + case 'i': stats->nisize = atoi(optarg); break; + case 'm': stats->isize_main_bulk = atof(optarg); break; + case 'q': stats->trim_qual = atoi(optarg); break; + case 't': targets = optarg; break; + case 'I': group_id = optarg; break; + case '?': + case 'h': error(NULL); + default: error("Unknown argument: %s\n", optarg); + } + } + if ( optindcov_step > stats->cov_max - stats->cov_min + 1 ) + { + stats->cov_step = stats->cov_max - stats->cov_min; + if ( stats->cov_step <= 0 ) + stats->cov_step = 1; + } + stats->ncov = 3 + (stats->cov_max-stats->cov_min) / stats->cov_step; + stats->cov_max = stats->cov_min + ((stats->cov_max-stats->cov_min)/stats->cov_step +1)*stats->cov_step - 1; + stats->cov = calloc(sizeof(uint64_t),stats->ncov); + stats->cov_rbuf.size = stats->nbases*5; + stats->cov_rbuf.buffer = calloc(sizeof(int32_t),stats->cov_rbuf.size); + // .. bam + if ((sam = samopen(bam_fname, in_mode, NULL)) == 0) + error("Failed to open: %s\n", bam_fname); + stats->sam = sam; + if ( group_id ) init_group_id(stats, group_id); + bam1_t *bam_line = bam_init1(); + // .. arrays + stats->quals_1st = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); + stats->quals_2nd = calloc(stats->nquals*stats->nbases,sizeof(uint64_t)); + stats->gc_1st = calloc(stats->ngc,sizeof(uint64_t)); + stats->gc_2nd = calloc(stats->ngc,sizeof(uint64_t)); + stats->isize_inward = calloc(stats->nisize,sizeof(uint64_t)); + stats->isize_outward = calloc(stats->nisize,sizeof(uint64_t)); + stats->isize_other = calloc(stats->nisize,sizeof(uint64_t)); + stats->gcd = calloc(stats->ngcd,sizeof(gc_depth_t)); + stats->mpc_buf = stats->fai ? 
calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL; + stats->acgt_cycles = calloc(4*stats->nbases,sizeof(uint64_t)); + stats->read_lengths = calloc(stats->nbases,sizeof(uint64_t)); + stats->insertions = calloc(stats->nbases,sizeof(uint64_t)); + stats->deletions = calloc(stats->nbases,sizeof(uint64_t)); + stats->ins_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t)); + stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t)); + realloc_rseq_buffer(stats); + if ( targets ) + init_regions(stats, targets); + + // Collect statistics + if ( optindsam->header, argv[i], &tid, &beg, &end); + if ( tid < 0 ) continue; + reset_regions(stats); + bam_fetch(stats->sam->x.bam, bam_idx, tid, beg, end, stats, fetch_read); + } + bam_index_destroy(bam_idx); + } + else + { + // Stream through the entire BAM ignoring off-target regions if -t is given + while (samread(sam,bam_line) >= 0) + collect_stats(bam_line,stats); + } + round_buffer_flush(stats,-1); + + output_stats(stats); + + bam_destroy1(bam_line); + samclose(stats->sam); + if (stats->fai) fai_destroy(stats->fai); + free(stats->cov_rbuf.buffer); free(stats->cov); + free(stats->quals_1st); free(stats->quals_2nd); + free(stats->gc_1st); free(stats->gc_2nd); + free(stats->isize_inward); free(stats->isize_outward); free(stats->isize_other); + free(stats->gcd); + free(stats->rseq_buf); + free(stats->mpc_buf); + free(stats->acgt_cycles); + free(stats->read_lengths); + free(stats->insertions); + free(stats->deletions); + free(stats->ins_cycles_1st); + free(stats->ins_cycles_2nd); + free(stats->del_cycles_1st); + free(stats->del_cycles_2nd); + destroy_regions(stats); + free(stats); + if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash); + + return 0; +} + + + diff --git a/sam/misc/plot-bamcheck b/sam/misc/plot-bamcheck new file mode 100755 index 0000000..1792c6f --- /dev/null +++ 
b/sam/misc/plot-bamcheck @@ -0,0 +1,882 @@ +#!/usr/bin/env perl +# +# Author: petr.danecek@sanger +# + +use strict; +use warnings; +use Carp; + +my $opts = parse_params(); +parse_bamcheck($opts); +plot_qualities($opts); +plot_acgt_cycles($opts); +plot_gc($opts); +plot_gc_depth($opts); +plot_isize($opts); +plot_coverage($opts); +plot_mismatches_per_cycle($opts); +plot_indel_dist($opts); +plot_indel_cycles($opts); + +exit; + +#-------------------------------- + +sub error +{ + my (@msg) = @_; + if ( scalar @msg ) { confess @msg; } + die + "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n", + " plot-bamcheck -p outdir/ file.bam.bc\n", + "Options:\n", + " -k, --keep-files Do not remove temporary files.\n", + " -p, --prefix The output files prefix, add a slash to create new directory.\n", + " -r, --ref-stats Optional reference stats file with expected GC content (created with -s).\n", + " -s, --do-ref-stats Calculate reference sequence GC for later use with -r\n", + " -t, --targets Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n", + " -h, -?, --help This help message.\n", + "\n"; +} + + +sub parse_params +{ + $0 =~ s{^.+/}{}; + my $opts = { args=>join(' ',$0,@ARGV) }; + while (defined(my $arg=shift(@ARGV))) + { + if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; } + if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; } + if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; } + if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; } + if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } + if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } + if ( -e $arg ) { $$opts{bamcheck}=$arg; next; } + error("Unknown parameter or non-existent file \"$arg\". 
Run -h for help.\n"); + } + if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; } + if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") } + if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") } + if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; } + elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } + return $opts; +} + + +# Creates GC stats for either the whole reference or only on target regions for exome QC +sub do_ref_stats +{ + my ($opts) = @_; + + + my %targets = (); + if ( exists($$opts{targets}) ) + { + my ($prev_chr,$prev_pos); + open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!"); + while (my $line=<$fh>) + { + if ( $line=~/^#/ ) { next; } + my ($chr,$from,$to) = split(/\s+/,$line); + chomp($to); + push @{$targets{$chr}}, $from,$to; + if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from } + if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); } + $prev_pos = $from; + } + close($fh); + } + + my $_len = 60; # for now do only standard fasta's with 60 bases per line + my %gc_counts = (); + my ($skip_chr,$pos,$ireg,$regions); + open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!"); + while (my $line=<$fh>) + { + if ( $line=~/^>/ ) + { + if ( !scalar %targets ) { next; } + + if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); } + if ( !exists($targets{$1}) ) { $skip_chr=$1; next; } + undef $skip_chr; + $pos = 0; + $ireg = 0; + $regions = $targets{$1}; + } + if ( defined $skip_chr ) { next; } + + # Only $_len sized lines are considered and no chopping for target regions. 
+ chomp($line); + my $len = length($line); + if ( $len ne $_len ) { next; } + + if ( scalar %targets ) + { + while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; } + $pos += $len; + if ( $ireg==@$regions ) { next; } + if ( $pos < $$regions[$ireg] ) { next; } + } + + my $gc_count = 0; + for (my $i=0; $i<$len; $i++) + { + my $base = substr($line,$i,1); + if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; } + } + $gc_counts{$gc_count}++; + } + + print "# Generated by $$opts{args}\n"; + print "# The columns are: GC content bin, normalized frequency\n"; + my $max; + for my $count (values %gc_counts) + { + if ( !defined $max or $count>$max ) { $max=$count; } + } + for my $gc (sort {$a<=>$b} keys %gc_counts) + { + if ( $gc==0 ) { next; } + printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max; + } +} + +sub plot +{ + my ($cmdfile) = @_; + my $cmd = "gnuplot $cmdfile"; + system($cmd); + if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } +} + + +sub parse_bamcheck +{ + my ($opts) = @_; + open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!"); + my $line = <$fh>; + if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); } + $$opts{dat}{version} = $1; + while ($line=<$fh>) + { + if ( $line=~/^#/ ) { next; } + my @items = split(/\t/,$line); + chomp($items[-1]); + if ( $items[0] eq 'SN' ) + { + $$opts{dat}{$items[1]} = splice(@items,2); + next; + } + push @{$$opts{dat}{$items[0]}}, [splice(@items,1)]; + } + close($fh); + + # Check sanity + if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} ) + { + error("Sanity check failed: no sequences found by bamcheck??\n"); + } +} + +sub older_than +{ + my ($opts,$version) = @_; + my ($year,$month,$day) = split(/-/,$version); + $version = $$opts{dat}{version}; + if ( !($version=~/\((\d+)-(\d+)-(\d+)\)$/) ) { return 1; } + if ( $1<$year ) { return 1; } + 
elsif ( $1>$year ) { return 0; } + if ( $2<$month ) { return 1; } + elsif ( $2>$month ) { return 0; } + if ( $3<$day ) { return 1; } + return 0; +} + +sub get_defaults +{ + my ($opts,$img_fname,%args) = @_; + + if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); } + + # Determine the gnuplot script file name + my $gp_file = $img_fname; + $gp_file =~ s{\.[^.]+$}{.gp}; + if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; } + + # Determine the default title: + # 5446_6/5446_6.bam.bc.gp -> 5446_6 + # test.aaa.png -> test.aaa + if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); } + my $title = $1; + + my $dir = $gp_file; + $dir =~ s{/[^/]+$}{}; + if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; } + + my $wh = exists($args{wh}) ? $args{wh} : '600,400'; + + open(my $fh,'>',$gp_file) or error("$gp_file: $!"); + return { + title => $title, + gp => $gp_file, + img => $img_fname, + fh => $fh, + terminal => qq[set terminal png size $wh truecolor], + grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"', + }; +} + +sub percentile +{ + my ($p,@vals) = @_; + my $N = 0; + for my $val (@vals) { $N += $val; } + my $n = $p*($N+1)/100.; + my $k = int($n); + my $d = $n-$k; + if ( $k<=0 ) { return 0; } + if ( $k>=$N ) { return scalar @vals-1; } + my $cnt; + for (my $i=0; $i<@vals; $i++) + { + $cnt += $vals[$i]; + if ( $cnt>=$k ) { return $i; } + } + error("FIXME: this should not happen [percentile]\n"); +} + +sub plot_qualities +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; } + + my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? 
@{$$opts{dat}{FFQ}[0]} : 50; + my $is_paired = $$opts{dat}{'is paired:'}; + + # Average quality per cycle, forward and reverse reads in one plot + my $args = get_defaults($opts,"$$opts{prefix}quals.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Average Quality" + set xlabel "Cycle" + set yrange [0:$yrange] + set title "$$args{title}" + plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[ + ]; + my (@fp75,@fp50,@fmean); + my (@lp75,@lp50,@lmean); + my ($fmax,$fmax_qual,$fmax_cycle); + my ($lmax,$lmax_qual,$lmax_cycle); + for my $cycle (@{$$opts{dat}{FFQ}}) + { + my $sum=0; my $n=0; + for (my $iqual=1; $iqual<@$cycle; $iqual++) + { + $sum += $$cycle[$iqual]*$iqual; + $n += $$cycle[$iqual]; + if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; } + } + my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); + my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); + my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); + if ( !$n ) { next; } + push @fp75, "$$cycle[0]\t$p25\t$p75\n"; + push @fp50, "$$cycle[0]\t$p50\n"; + push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; + printf $fh $fmean[-1]; + } + print $fh "end\n"; + if ( $is_paired ) + { + for my $cycle (@{$$opts{dat}{LFQ}}) + { + my $sum=0; my $n=0; + for (my $iqual=1; $iqual<@$cycle; $iqual++) + { + $sum += $$cycle[$iqual]*$iqual; + $n += $$cycle[$iqual]; + if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; } + } + my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); + my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); + my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); + if ( !$n ) { next; } + push @lp75, "$$cycle[0]\t$p25\t$p75\n"; + push @lp50, "$$cycle[0]\t$p50\n"; + push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; + printf $fh $lmean[-1]; + } + 
print $fh "end\n"; + } + close($fh); + plot($$args{gp}); + + + + # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots + $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500'); + $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0.1,0.1 + set size 0.4,0.8 + set yrange [0:$yrange] + set ylabel "Quality" + set xlabel "Cycle (fwd reads)" + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean' + ]; + print $fh join('',@fp75),"end\n"; + print $fh join('',@fp50),"end\n"; + print $fh join('',@fmean),"end\n"; + if ( $is_paired ) + { + print $fh qq[ + set origin 0.55,0.1 + set size 0.4,0.8 + unset ytics + set y2tics mirror + set yrange [0:$yrange] + unset ylabel + set xlabel "Cycle (rev reads)" + set label "$$args{title}" at screen 0.5,0.95 center + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean' + ]; + print $fh join('',@lp75),"end\n"; + print $fh join('',@lp50),"end\n"; + print $fh join('',@lmean),"end\n"; + } + close($fh); + plot($$args{gp}); + + + + # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve + $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600'); + $fh = $$args{fh}; + my $nquals = @{$$opts{dat}{FFQ}[0]}-1; + my $ncycles = @{$$opts{dat}{FFQ}}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0.15,0.52 + set size 0.8,0.4 + set title "$$args{title}" + set ylabel "Frequency (fwd reads)" + set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax + unset 
xlabel + set xrange [0:$nquals] + set format x "" + ]; + my @plots; + for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] } + print $fh "plot ", join(",", @plots), "\n"; + for my $cycle (@{$$opts{dat}{FFQ}}) + { + for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; } + print $fh "end\n"; + } + if ( $is_paired ) + { + print $fh qq[ + set origin 0.15,0.1 + set size 0.8,0.4 + unset title + unset format + set xtics + set xlabel "Quality" + unset label + set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax + set ylabel "Frequency (rev reads)" + ]; + print $fh "plot ", join(",", @plots), "\n"; + for my $cycle (@{$$opts{dat}{LFQ}}) + { + for (my $iqual=1; $iqual<$nquals; $iqual++) + { + print $fh "$iqual\t$$cycle[$iqual]\n"; + } + print $fh "end\n"; + } + } + close($fh); + plot($$args{gp}); + + + # Heatmap qualitites + $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500'); + $fh = $$args{fh}; + my $max = defined $lmax && $lmax > $fmax ? 
$lmax : $fmax; + my @ytics; + for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } + my $ytics = join(',', @ytics); + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + unset key + unset colorbox + set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1) + set cbrange [0:$max] + set yrange [0:$ncycles] + set xrange [0:$nquals] + set view map + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0,0.46 + set size 0.95,0.6 + set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles + set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black" + set ylabel "Cycle (fwd reads)" offset character -1,0 + unset ytics + set ytics ($ytics) + unset xtics + set title "$$args{title}" + splot '-' matrix with image + ]; + for my $cycle (@{$$opts{dat}{FFQ}}) + { + for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } + print $fh "\n"; + } + print $fh "end\nend\n"; + @ytics = (); + for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } + $ytics = join(',', @ytics); + print $fh qq[ + set origin 0,0.03 + set size 0.95,0.6 + set ylabel "Cycle (rev reads)" offset character -1,0 + set xlabel "Base Quality" + unset title + unset ytics + set ytics ($ytics) + set xrange [0:$nquals] + set xtics + set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812 + set cblabel "Number of bases" + splot '-' matrix with image + ]; + for my $cycle (@{$$opts{dat}{LFQ}}) + { + for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } + print $fh "\n"; + } + print $fh "end\nend\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_acgt_cycles +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set 
output "$$args{img}" + $$args{grid} + set style line 1 linecolor rgb "green" + set style line 2 linecolor rgb "red" + set style line 3 linecolor rgb "black" + set style line 4 linecolor rgb "blue" + set style increment user + set ylabel "Base content [%]" + set xlabel "Read Cycle" + set yrange [0:100] + set title "$$args{title}" + plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T' + ]; + for my $base (1..4) + { + for my $cycle (@{$$opts{dat}{GCC}}) + { + print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n"; + } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); +} + + +sub plot_gc +{ + my ($opts) = @_; + + my $is_paired = $$opts{dat}{'is paired:'}; + my $args = get_defaults($opts,"$$opts{prefix}gc-content.png"); + my $fh = $$args{fh}; + my ($gcl_max,$gcf_max,$lmax,$fmax); + for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } } + for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } } + my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set title "$$args{title}" + set ylabel "Normalized Frequency" + set xlabel "GC Content [%]" + set yrange [0:1.1] + set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0 + plot ] + . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '') + . q['-' smooth csplines with lines lc 1 title 'First fragments' ] + . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '') + . 
q[ + ]; + if ( exists($$opts{ref_stats}) ) + { + open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!"); + while (my $line=<$ref>) { print $fh $line } + close($ref); + print $fh "end\n"; + } + for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; } + print $fh "end\n"; + if ( $is_paired ) + { + for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); +} + + +sub plot_gc_depth +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; } + + # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics. + my @tics = ( {gc=>30},{gc=>40},{gc=>50} ); + for my $gc (@{$$opts{dat}{GCD}}) + { + for my $tic (@tics) + { + my $diff = abs($$gc[0]-$$tic{gc}); + if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; } + } + } + + my @x2tics; + for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; } + my $x2tics = join(',',@x2tics); + + my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500'); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Mapped depth" + set xlabel "Percentile of mapped sequence ordered by GC content" + set x2label "GC Content [%]" + set title "$$args{title}" + set x2tics ($x2tics) + set xtics nomirror + set xrange [0.1:99.9] + + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\ + '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\ + '-' using 1:2 with lines lc rgb "#0084ff" t 'Median' + ]; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n"; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n"; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh 
"end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_isize +{ + my ($opts) = @_; + + if ( !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}} ) { return; } + + my ($isize_max,$isize_cnt); + for my $isize (@{$$opts{dat}{IS}}) + { + if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; } + } + + my $args = get_defaults($opts,"$$opts{prefix}insert-size.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set rmargin 5 + set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt + set ylabel "Number of pairs" + set xlabel "Insert Size" + set title "$$args{title}" + plot \\ + '-' with lines lc rgb 'black' title 'All pairs', \\ + '-' with lines title 'Inward', \\ + '-' with lines title 'Outward', \\ + '-' with lines title 'Other' + ]; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_coverage +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; } + + my @vals; + for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; } + my $i = percentile(99.8,@vals); + my $p99 = $$opts{dat}{COV}[$i][1]; + + my $args = get_defaults($opts,"$$opts{prefix}coverage.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Number of mapped bases" + set xlabel "Coverage" + set style fill solid border -1 + set title "$$args{title}" + set xrange [:$p99] + plot '-' with lines notitle + ]; + for my $cov (@{$$opts{dat}{COV}}) + { + if ( $$cov[2]==0 ) { next; } + print $fh 
"$$cov[1]\t$$cov[2]\n"; + } + print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_mismatches_per_cycle +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; } + if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); } + + my $nquals = @{$$opts{dat}{MPC}[0]} - 2; + my $ncycles = @{$$opts{dat}{MPC}}; + my ($style,$with); + if ( $ncycles>100 ) { $style = ''; $with = 'w l'; } + else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; } + + my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linecolor rgb "#e40000" + set style line 2 linecolor rgb "#ff9f00" + set style line 3 linecolor rgb "#eeee00" + set style line 4 linecolor rgb "#4ebd68" + set style line 5 linecolor rgb "#0061ff" + set style increment user + set key left top + $style + set ylabel "Number of mismatches" + set xlabel "Read Cycle" + set style fill solid border -1 + set title "$$args{title}" + set xrange [-1:$ncycles] + plot '-' $with ti 'Base Quality>30', \\ + '-' $with ti '30>=Q>20', \\ + '-' $with ti '20>=Q>10', \\ + '-' $with ti '10>=Q', \\ + '-' $with ti "N's" + ]; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; } + print $fh "end\n"; + close($fh); + 
plot($$args{gp}); +} + +sub plot_indel_dist +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linetype 1 linecolor rgb "red" + set style line 2 linetype 2 linecolor rgb "black" + set style line 3 linetype 3 linecolor rgb "green" + set style increment user + set ylabel "Indel count [log]" + set xlabel "Indel length" + set y2label "Insertions/Deletions ratio" + set log y + set y2tics nomirror + set ytics nomirror + set title "$$args{title}" + plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio" + ]; + for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + +sub plot_indel_cycles +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linetype 1 linecolor rgb "red" + set style line 2 linetype 2 linecolor rgb "black" + set style line 3 linetype 3 linecolor rgb "green" + set style line 4 linetype 4 linecolor rgb "blue" + set style increment user + set ylabel "Indel count" + set xlabel "Read Cycle" + set title "$$args{title}" + plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)' + ]; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh 
"$$len[0]\t$$len[2]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + + + + + + +sub has_values +{ + my ($opts,@tags) = @_; + for my $tag (@tags) + { + my (@lines) = `cat $$opts{bamcheck} | grep ^$tag | wc -l`; + chomp($lines[0]); + if ( $lines[0]<2 ) { return 0; } + } + return 1; +} + +sub plot_mismatches_per_cycle_old +{ + my ($opts) = @_; + + my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); + my ($nquals) = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`; + my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`; + chomp($nquals); + chomp($ncycles); + $nquals--; + $ncycles--; + my @gr0_15 = (2..17); + my @gr16_30 = (18..32); + my @gr31_n = (33..$nquals); + my $gr0_15 = '$'. join('+$',@gr0_15); + my $gr16_30 = '$'. join('+$',@gr16_30); + my $gr31_n = '$'. join('+$',@gr31_n); + + open(my $fh,'>',$$args{gp}) or error("$$args{gp}: $!"); + print $fh q[ + set terminal png size 600,400 truecolor font "DejaVuSansMono,9" + set output "] . $$args{img} . q[" + + set key left top + set style data histogram + set style histogram rowstacked + + set grid back lc rgb "#aaaaaa" + set ylabel "Number of mismatches" + set xlabel "Read Cycle" + set style fill solid border -1 + set title "] . $$args{title} . 
qq[" + set xrange [-1:$ncycles] + + plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q' + ]; + close($fh); + + plot($$args{gp}); +} + + diff --git a/sam/misc/r2plot.lua b/sam/misc/r2plot.lua new file mode 100755 index 0000000..0a1b9f1 --- /dev/null +++ b/sam/misc/r2plot.lua @@ -0,0 +1,83 @@ +#!/usr/bin/env luajit + +function string:split(sep, n) + local a, start = {}, 1; + sep = sep or "%s+"; + repeat + local b, e = self:find(sep, start); + if b == nil then + table.insert(a, self:sub(start)); + break + end + a[#a+1] = self:sub(start, b - 1); + start = e + 1; + if n and #a == n then + table.insert(a, self:sub(start)); + break + end + until start > #self; + return a; +end + +function io.xopen(fn, mode) + mode = mode or 'r'; + if fn == nil then return io.stdin; + elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout; + elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w'); + elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. 
fn, 'w'); + else return io.open(fn, mode) end +end + +local eps = {}; + +function eps.func(fp) + fp = fp or io.stdout + fp:write("/C { dup 255 and 255 div exch dup -8 bitshift 255 and 255 div 3 1 roll -16 bitshift 255 and 255 div 3 1 roll setrgbcolor } bind def\n") + fp:write("/L { 4 2 roll moveto lineto } bind def\n") + fp:write("/LX { dup 4 -1 roll exch moveto lineto } bind def\n") + fp:write("/LY { dup 4 -1 roll moveto exch lineto } bind def\n") + fp:write("/LS { 3 1 roll moveto show } bind def\n") + fp:write("/RS { dup stringwidth pop 4 -1 roll exch sub 3 -1 roll moveto show } bind def\n") + fp:write("/B { 4 copy 3 1 roll exch 6 2 roll 8 -2 roll moveto lineto lineto lineto closepath } bind def\n") +end + +function eps.font(ft, size, fp) + fp = fp or io.stdout + fp:write(string.format('/FS %d def\n', size)); + fp:write('/FS4 FS 4 div def\n'); + fp:write('/' .. ft .. ' findfont FS scalefont setfont\n'); +end + +local scale = 8; + +if #arg == 0 then + print("Usage: r2plot.lua "); + os.exit(1) +end + +local fp = io.xopen(arg[1]); +local n = tonumber(fp:read()); + +print('%!PS-Adobe-3.0 EPSF-3.0'); +print('%%' .. 
string.format('BoundingBox: -%d -%d %.3f %.3f\n', 10*scale, scale, (n+1)*scale, (n+1)*scale)); +print(string.format('%.3f setlinewidth', scale)); +print(string.format('/plot { setgray moveto 0 %d rlineto } def', scale)); +print(string.format('/plothalf { setgray moveto 0 %.2f rlineto } def', scale/2)); +eps.func(); +eps.font('Helvetica', scale-1); + +local i = 1; +for l in fp:lines() do + local t = l:split('\t'); + print(string.format("%d %d FS4 add (%s) RS", (i-1)*scale-2, (i-1)*scale, t[1])); + for j = 2, #t do + if tonumber(t[j]) > 0.01 then + print(string.format('%.2f %.2f %.2f plot stroke', (i-1+.5)*scale, (j-2)*scale, 1.-t[j])); + end + end + i = i + 1; +end +for j = 1, 21 do + print(string.format('%.2f %.2f %.2f plothalf stroke', -8*scale, (j-1) * scale/2, 1.-(j-1)/20)); +end +print('showpage'); diff --git a/sam/misc/seqtk.c b/sam/misc/seqtk.c deleted file mode 100644 index 591ddff..0000000 --- a/sam/misc/seqtk.c +++ /dev/null @@ -1,783 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "kseq.h" -KSEQ_INIT(gzFile, gzread) - -typedef struct { - int n, m; - uint64_t *a; -} reglist_t; - -#include "khash.h" -KHASH_MAP_INIT_STR(reg, reglist_t) - -typedef kh_reg_t reghash_t; - -reghash_t *stk_reg_read(const char *fn) -{ - reghash_t *h = kh_init(reg); - gzFile fp; - kstream_t *ks; - int dret; - kstring_t *str; - // read the list - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(fn, "-")? 
gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - int beg = -1, end = -1; - reglist_t *p; - khint_t k = kh_get(reg, h, str->s); - if (k == kh_end(h)) { - int ret; - char *s = strdup(str->s); - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(reglist_t)); - } - p = &kh_val(h, k); - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - beg = atoi(str->s); - if (dret != '\n') { - if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { - end = atoi(str->s); - if (end < 0) end = -1; - } - } - } - } - // skip the rest of the line - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column - if (beg < 0) beg = 0, end = INT_MAX; - if (p->n == p->m) { - p->m = p->m? p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | end; - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - return h; -} - -void stk_reg_destroy(reghash_t *h) -{ - khint_t k; - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); -} - -/* constant table */ - -unsigned char seq_nt16_table[256] = { - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, - 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9, 0,10,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 
15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, - 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 -}; - -char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN"; -unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; -int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; - -/* composition */ -int stk_comp(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l, c, upper_only = 0; - reghash_t *h = 0; - reglist_t dummy; - while ((c = getopt(argc, argv, "ur:")) >= 0) { - switch (c) { - case 'u': upper_only = 1; break; - case 'r': h = stk_reg_read(optarg); break; - } - } - if (argc == optind) { - fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] \n\n"); - fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); - return 1; - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - seq = kseq_init(fp); - dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); - while ((l = kseq_read(seq)) >= 0) { - int i, k; - reglist_t *p = 0; - if (h) { - khint_t k = kh_get(reg, h, seq->name.s); - if (k != kh_end(h)) p = &kh_val(h, k); - } else { - p = &dummy; - dummy.a[0] = l; - } - for (k = 0; p && k < p->n; ++k) { - int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; - int la, lb, lc, na, nb, nc, cnt[11]; - if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; - else la = 'a', lb = -1, lc = 0; - na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; - memset(cnt, 0, 11 * sizeof(int)); - for (i = beg; i < end; ++i) { - int is_CpG = 0, a, b, c; - a = na; b = nb; c = nc; - na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; - if (b == 2 || b == 10) { // C or Y - if (nb == 4 || nb == 5) is_CpG = 1; - } else if (b == 4 || b == 5) { // G or R - if (lb == 2 || lb == 10) is_CpG = 1; - } - if (upper_only == 0 || 
isupper(a)) { - if (c > 1) ++cnt[c+2]; - if (c == 1) ++cnt[seq_nt16to4_table[b]]; - if (b == 10 || b == 5) ++cnt[9]; - else if (c == 2) { - ++cnt[8]; - } - if (is_CpG) { - ++cnt[7]; - if (b == 10 || b == 5) ++cnt[10]; - } - } - la = a; lb = b; lc = c; - } - if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); - else printf("%s\t%d", seq->name.s, l); - for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); - putchar('\n'); - } - fflush(stdout); - } - free(dummy.a); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_randbase(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l; - if (argc == 1) { - fprintf(stderr, "Usage: seqtk randbase \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - int i; - printf(">%s", seq->name.s); - for (i = 0; i < l; ++i) { - int c, b, a, j, k, m; - b = seq->seq.s[i]; - c = seq_nt16_table[b]; - a = bitcnt_table[c]; - if (a == 2) { - m = (drand48() < 0.5); - for (j = k = 0; j < 4; ++j) { - if ((1<seq.s[i] = islower(b)? 
"acgt"[j] : "ACGT"[j]; - } - if (i%60 == 0) putchar('\n'); - putchar(seq->seq.s[i]); - } - putchar('\n'); - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_hety(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0; - char *buf; - uint32_t cnt[3]; - if (argc == 1) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk hety [options] \n\n"); - fprintf(stderr, "Options: -w INT window size [%d]\n", win_size); - fprintf(stderr, " -t INT # start positions in a window [%d]\n", n_start); - fprintf(stderr, " -m treat lowercases as masked\n"); - fprintf(stderr, "\n"); - return 1; - } - while ((c = getopt(argc, argv, "w:t:m")) >= 0) { - switch (c) { - case 'w': win_size = atoi(optarg); break; - case 't': n_start = atoi(optarg); break; - case 'm': is_lower_mask = 1; break; - } - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - seq = kseq_init(fp); - win_step = win_size / n_start; - buf = calloc(win_size, 1); - while ((l = kseq_read(seq)) >= 0) { - int x, i, y, z, next = 0; - cnt[0] = cnt[1] = cnt[2] = 0; - for (i = 0; i <= l; ++i) { - if ((i >= win_size && i % win_step == 0) || i == l) { - if (i == l && l >= win_size) { - for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]]; - } - if (cnt[1] + cnt[2] > 0) - printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i, - (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]); - next = i; - } - if (i < l) { - y = i % win_size; - c = seq->seq.s[i]; - if (is_lower_mask && islower(c)) c = 'N'; - c = seq_nt16_table[c]; - x = bitcnt_table[c]; - if (i >= win_size) --cnt[(int)buf[y]]; - buf[y] = z = x > 2? 0 : x == 2? 
2 : 1; - ++cnt[z]; - } - } - } - free(buf); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* fq2fa */ -int stk_fq2fa(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - char *buf; - int l, i, c, qual_thres = 0, linelen = 60; - while ((c = getopt(argc, argv, "q:l:")) >= 0) { - switch (c) { - case 'q': qual_thres = atoi(optarg); break; - case 'l': linelen = atoi(optarg); break; - } - } - if (argc == optind) { - fprintf(stderr, "Usage: seqtk fq2fa [-q qualThres=0] [-l lineLen=60] \n"); - return 1; - } - buf = linelen > 0? malloc(linelen + 1) : 0; - fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - if (seq->qual.l && qual_thres > 0) { - for (i = 0; i < l; ++i) - if (seq->qual.s[i] - 33 < qual_thres) - seq->seq.s[i] = tolower(seq->seq.s[i]); - } - putchar('>'); - if (seq->comment.l) { - fputs(seq->name.s, stdout); - putchar(' '); - puts(seq->comment.s); - } else puts(seq->name.s); - if (buf) { // multi-line - for (i = 0; i < l; i += linelen) { - int x = i + linelen < l? linelen : l - i; - memcpy(buf, seq->seq.s + i, x); - buf[x] = 0; - puts(buf); - } - } else puts(seq->seq.s); - } - free(buf); - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -int stk_maskseq(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - int l, i, j, c, is_complement = 0, is_lower = 0; - khint_t k; - while ((c = getopt(argc, argv, "cl")) >= 0) { - switch (c) { - case 'c': is_complement = 1; break; - case 'l': is_lower = 1; break; - } - } - if (argc - optind < 2) { - fprintf(stderr, "Usage: seqtk maskseq [-cl] \n\n"); - fprintf(stderr, "Options: -c mask the complement regions\n"); - fprintf(stderr, " -l soft mask (to lower cases)\n"); - return 1; - } - h = stk_reg_read(argv[optind+1]); - // maskseq - fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) { // not found in the hash table - if (is_complement) { - for (j = 0; j < l; ++j) - seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; - } - } else { - reglist_t *p = &kh_val(h, k); - if (!is_complement) { - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32, end = p->a[i]; - if (beg >= seq->seq.l) { - fprintf(stderr, "[maskseq] start position >= the sequence length.\n"); - continue; - } - if (end >= seq->seq.l) end = seq->seq.l; - if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); - else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N'; - } - } else { - int8_t *mask = calloc(seq->seq.l, 1); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32, end = p->a[i]; - if (end >= seq->seq.l) end = seq->seq.l; - for (j = beg; j < end; ++j) mask[j] = 1; - } - for (j = 0; j < l; ++j) - if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; - free(mask); - } - } - printf(">%s", seq->name.s); - for (j = 0; j < seq->seq.l; ++j) { - if (j%60 == 0) putchar('\n'); - putchar(seq->seq.s[j]); - } - putchar('\n'); - } - // free - kseq_destroy(seq); - gzclose(fp); - stk_reg_destroy(h); - return 0; -} - -/* subseq */ - -int stk_subseq(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - int l, i, j, c, is_tab = 0; - khint_t k; - while ((c = getopt(argc, argv, "t")) >= 0) { - switch (c) { - case 't': is_tab = 1; break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: seqtk subseq [-t] \n\n"); - fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n"); - return 1; - } - h = stk_reg_read(argv[optind+1]); - // subseq - fp = strcmp(argv[optind], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - reglist_t *p; - k = kh_get(reg, h, seq->name.s); - if (k == kh_end(h)) continue; - p = &kh_val(h, k); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32, end = p->a[i]; - if (beg >= seq->seq.l) { - fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); - continue; - } - if (end > seq->seq.l) end = seq->seq.l; - if (is_tab == 0) { - printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); - if (end == INT_MAX) { - if (beg) printf(":%d", beg+1); - } else printf(":%d-%d", beg+1, end); - } else printf("%s\t%d\t", seq->name.s, beg + 1); - if (end > seq->seq.l) end = seq->seq.l; - for (j = 0; j < end - beg; ++j) { - if (is_tab == 0 && j % 60 == 0) putchar('\n'); - putchar(seq->seq.s[j + beg]); - } - putchar('\n'); - if (seq->qual.l != seq->seq.l || is_tab) continue; - printf("+"); - for (j = 0; j < end - beg; ++j) { - if (j % 60 == 0) putchar('\n'); - putchar(seq->qual.s[j + beg]); - } - putchar('\n'); - } - } - // free - kseq_destroy(seq); - gzclose(fp); - stk_reg_destroy(h); - return 0; -} - -/* mergefa */ -int stk_mergefa(int argc, char *argv[]) -{ - gzFile fp[2]; - kseq_t *seq[2]; - int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0; - while ((c = getopt(argc, argv, "himq:")) >= 0) { - switch (c) { - case 'i': is_intersect = 1; break; - case 'h': is_haploid = 1; break; - case 'm': is_mask = 1; break; - case 'q': qual = atoi(optarg); break; - } - } - if (is_mask && is_intersect) { - fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__); - return 1; - } - if (optind + 2 > argc) { - fprintf(stderr, "\nUsage: seqtk mergefa [options] \n\n"); - fprintf(stderr, "Options: -q INT quality threshold [0]\n"); - fprintf(stderr, " -i take intersection\n"); - fprintf(stderr, " -m convert to lowercase when one of the input base is N.\n"); - fprintf(stderr, " -h suppress hets in the 
input\n\n"); - return 1; - } - for (i = 0; i < 2; ++i) { - fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); - seq[i] = kseq_init(fp[i]); - } - while (kseq_read(seq[0]) >= 0) { - int min_l, c[2], is_upper; - kseq_read(seq[1]); - if (strcmp(seq[0]->name.s, seq[1]->name.s)) - fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); - if (seq[0]->seq.l != seq[1]->seq.l) - fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); - min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; - printf(">%s", seq[0]->name.s); - for (l = 0; l < min_l; ++l) { - c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; - if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]); - if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]); - if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; - else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0; - else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0; - c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]]; - if (c[0] == 0) c[0] = 15; - if (c[1] == 0) c[1] = 15; - if (is_haploid && (bitcnt_table[c[0]] > 1 || bitcnt_table[c[1]] > 1)) is_upper = 0; - if (is_intersect) { - c[0] = c[0] & c[1]; - if (c[0] == 0) is_upper = 0; - } else if (is_mask) { - if (c[0] == 15 || c[1] == 15) is_upper = 0; - c[0] = c[0] & c[1]; - if (c[0] == 0) is_upper = 0; - } else c[0] = c[0] | c[1]; - c[0] = seq_nt16_rev_table[c[0]]; - if (!is_upper) c[0] = tolower(c[0]); - if (l%60 == 0) putchar('\n'); - putchar(c[0]); - } - putchar('\n'); - } - return 0; -} - -int stk_famask(int argc, char *argv[]) -{ - gzFile fp[2]; - kseq_t *seq[2]; - int i, l; - if (argc < 3) { - fprintf(stderr, "Usage: seqtk famask \n"); - return 1; - } - for (i = 0; i < 2; ++i) { - fp[i] = strcmp(argv[optind+i], "-")? 
gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r"); - seq[i] = kseq_init(fp[i]); - } - while (kseq_read(seq[0]) >= 0) { - int min_l, c[2]; - kseq_read(seq[1]); - if (strcmp(seq[0]->name.s, seq[1]->name.s)) - fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s); - if (seq[0]->seq.l != seq[1]->seq.l) - fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l); - min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l; - printf(">%s", seq[0]->name.s); - for (l = 0; l < min_l; ++l) { - c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l]; - if (c[1] == 'x') c[0] = tolower(c[0]); - else if (c[1] != 'X') c[0] = c[1]; - if (l%60 == 0) putchar('\n'); - putchar(c[0]); - } - putchar('\n'); - } - return 0; -} - -int stk_mutfa(int argc, char *argv[]) -{ - khash_t(reg) *h = kh_init(reg); - gzFile fp; - kseq_t *seq; - kstream_t *ks; - int l, i, dret; - kstring_t *str; - khint_t k; - if (argc < 3) { - fprintf(stderr, "Usage: seqtk mutfa \n\n"); - fprintf(stderr, "Note: contains at least four columns per line which are:\n"); - fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); - return 1; - } - // read the list - str = calloc(1, sizeof(kstring_t)); - fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); - ks = ks_init(fp); - while (ks_getuntil(ks, 0, str, &dret) >= 0) { - char *s = strdup(str->s); - int beg = 0, ret; - reglist_t *p; - k = kh_get(reg, h, s); - if (k == kh_end(h)) { - k = kh_put(reg, h, s, &ret); - memset(&kh_val(h, k), 0, sizeof(reglist_t)); - } - p = &kh_val(h, k); - if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col - ks_getuntil(ks, 0, str, &dret); // 3rd col - ks_getuntil(ks, 0, str, &dret); // 4th col - // skip the rest of the line - if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); - if (isalpha(str->s[0]) && str->l == 1) { - if (p->n == p->m) { - p->m = p->m? 
p->m<<1 : 4; - p->a = realloc(p->a, p->m * 8); - } - p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; - } - } - ks_destroy(ks); - gzclose(fp); - free(str->s); free(str); - // mutfa - fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - reglist_t *p; - k = kh_get(reg, h, seq->name.s); - if (k != kh_end(h)) { - p = &kh_val(h, k); - for (i = 0; i < p->n; ++i) { - int beg = p->a[i]>>32; - if (beg < seq->seq.l) - seq->seq.s[beg] = (int)p->a[i]; - } - } - printf(">%s", seq->name.s); - for (i = 0; i < l; ++i) { - if (i%60 == 0) putchar('\n'); - putchar(seq->seq.s[i]); - } - putchar('\n'); - } - // free - kseq_destroy(seq); - gzclose(fp); - for (k = 0; k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free(kh_val(h, k).a); - free((char*)kh_key(h, k)); - } - } - kh_destroy(reg, h); - return 0; -} - -int stk_listhet(int argc, char *argv[]) -{ - gzFile fp; - kseq_t *seq; - int i, l; - if (argc == 1) { - fprintf(stderr, "Usage: seqtk listhet \n"); - return 1; - } - fp = (strcmp(argv[1], "-") == 0)? 
gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r"); - seq = kseq_init(fp); - while ((l = kseq_read(seq)) >= 0) { - for (i = 0; i < l; ++i) { - int b = seq->seq.s[i]; - if (bitcnt_table[seq_nt16_table[b]] == 2) - printf("%s\t%d\t%c\n", seq->name.s, i+1, b); - } - } - kseq_destroy(seq); - gzclose(fp); - return 0; -} - -/* cutN */ -static int cutN_min_N_tract = 1000; -static int cutN_nonN_penalty = 10; - -static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end) -{ - int i, b, e; - while (k < ks->seq.l) { - if (seq_nt16_table[(int)ks->seq.s[k]] == 15) { - int score, max; - score = 0; e = max = -1; - for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */ - if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; - else score -= cutN_nonN_penalty; - if (score > max) max = score, e = i; - } - score = 0; b = max = -1; - for (i = e; i >= 0 && score >= 0; --i) { /* backward */ - if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score; - else score -= cutN_nonN_penalty; - if (score > max) max = score, b = i; - } - if (e + 1 - b >= cutN_min_N_tract) { - *begin = b; - *end = e + 1; - return *end; - } - k = e + 1; - } else ++k; - } - return -1; -} -static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end) -{ - int i; - if (begin >= end) return; // FIXME: why may this happen? Understand it! 
- fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end); - for (i = begin; i < end && i < ks->seq.l; ++i) { - if ((i - begin)%60 == 0) fputc('\n', fpout); - fputc(ks->seq.s[i], fpout); - } - fputc('\n', fpout); -} -int stk_cutN(int argc, char *argv[]) -{ - int c, l, gap_only = 0; - gzFile fp; - kseq_t *ks; - while ((c = getopt(argc, argv, "n:p:g")) >= 0) { - switch (c) { - case 'n': cutN_min_N_tract = atoi(optarg); break; - case 'p': cutN_nonN_penalty = atoi(optarg); break; - case 'g': gap_only = 1; break; - default: return 1; - } - } - if (argc == optind) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk cutN [options] \n\n"); - fprintf(stderr, "Options: -n INT min size of N tract [%d]\n", cutN_min_N_tract); - fprintf(stderr, " -p INT penalty for a non-N [%d]\n", cutN_nonN_penalty); - fprintf(stderr, " -g print gaps only, no sequence\n\n"); - return 1; - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); - ks = kseq_init(fp); - while ((l = kseq_read(ks)) >= 0) { - int k = 0, begin = 0, end = 0; - while (find_next_cut(ks, k, &begin, &end) >= 0) { - if (begin != 0) { - if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end); - else print_seq(stdout, ks, k, begin); - } - k = end; - } - if (!gap_only) print_seq(stdout, ks, k, l); - } - kseq_destroy(ks); - gzclose(fp); - return 0; -} - -/* main function */ -static int usage() -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: seqtk \n\n"); - fprintf(stderr, "Command: comp get the nucleotide composite of FASTA/Q\n"); - fprintf(stderr, " hety regional heterozygosity\n"); - fprintf(stderr, " fq2fa convert FASTQ to FASTA\n"); - fprintf(stderr, " subseq extract subsequences from FASTA/Q\n"); - fprintf(stderr, " maskseq mask sequences\n"); - fprintf(stderr, " mutfa point mutate FASTA at specified positions\n"); - fprintf(stderr, " mergefa merge two FASTA/Q files\n"); - fprintf(stderr, " randbase choose a random base from hets\n"); - fprintf(stderr, " cutN cut 
sequence at long N\n"); - fprintf(stderr, " listhet extract the position of each het\n"); - fprintf(stderr, "\n"); - return 1; -} - -int main(int argc, char *argv[]) -{ - if (argc == 1) return usage(); - if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1); - else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1); - else if (strcmp(argv[1], "fq2fa") == 0) stk_fq2fa(argc-1, argv+1); - else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1); - else if (strcmp(argv[1], "maskseq") == 0) stk_maskseq(argc-1, argv+1); - else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1); - else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1); - else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1); - else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1); - else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1); - else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1); - else { - fprintf(stderr, "[main] unrecognized commad '%s'. 
Abort!\n", argv[1]); - return 1; - } - return 0; -} diff --git a/sam/misc/vcfutils.lua b/sam/misc/vcfutils.lua new file mode 100755 index 0000000..51d374e --- /dev/null +++ b/sam/misc/vcfutils.lua @@ -0,0 +1,694 @@ +#!/usr/bin/env luajit + +----------------------------------- +-- BEGIN: routines from klib.lua -- +----------------------------------- + +-- Description: getopt() translated from the BSD getopt(); compatible with the default Unix getopt() +--[[ Example: + for o, a in os.getopt(arg, 'a:b') do + print(o, a) + end +]]-- +function os.getopt(args, ostr) + local arg, place = nil, 0; + return function () + if place == 0 then -- update scanning pointer + place = 1 + if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end + if #args[1] >= 2 then + place = place + 1 + if args[1]:sub(2, 2) == '-' then -- found "--" + table.remove(args, 1); + place = 0 + return nil; + end + end + end + local optopt = place <= #args[1] and args[1]:sub(place, place) or nil + place = place + 1; + local oli = optopt and ostr:find(optopt) or nil + if optopt == ':' or oli == nil then -- unknown option + if optopt == '-' then return nil end + if place > #args[1] then + table.remove(args, 1); + place = 0; + end + return '?'; + end + oli = oli + 1; + if ostr:sub(oli, oli) ~= ':' then -- do not need argument + arg = nil; + if place > #args[1] then + table.remove(args, 1); + place = 0; + end + else -- need an argument + if place <= #args[1] then -- no white space + arg = args[1]:sub(place); + else + table.remove(args, 1); + if #args == 0 then -- an option requiring argument is the last one + place = 0; + if ostr:sub(1, 1) == ':' then return ':' end + return '?'; + else arg = args[1] end + end + table.remove(args, 1); + place = 0; + end + return optopt, arg; + end +end + +-- Description: string split +function string:split(sep, n) + local a, start = {}, 1; + sep = sep or "%s+"; + repeat + local b, e = self:find(sep, start); + if b == nil then + table.insert(a, 
self:sub(start)); + break + end + a[#a+1] = self:sub(start, b - 1); + start = e + 1; + if n and #a == n then + table.insert(a, self:sub(start)); + break + end + until start > #self; + return a; +end + +-- Description: smart file open +function io.xopen(fn, mode) + mode = mode or 'r'; + if fn == nil then return io.stdin; + elseif fn == '-' then return (mode == 'r' and io.stdin) or io.stdout; + elseif fn:sub(-3) == '.gz' then return (mode == 'r' and io.popen('gzip -dc ' .. fn, 'r')) or io.popen('gzip > ' .. fn, 'w'); + elseif fn:sub(-4) == '.bz2' then return (mode == 'r' and io.popen('bzip2 -dc ' .. fn, 'r')) or io.popen('bgzip2 > ' .. fn, 'w'); + else return io.open(fn, mode) end +end + +-- Description: log gamma function +-- Required by: math.lbinom() +-- Reference: AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 +function math.lgamma(z) + local x; + x = 0.1659470187408462e-06 / (z+7); + x = x + 0.9934937113930748e-05 / (z+6); + x = x - 0.1385710331296526 / (z+5); + x = x + 12.50734324009056 / (z+4); + x = x - 176.6150291498386 / (z+3); + x = x + 771.3234287757674 / (z+2); + x = x - 1259.139216722289 / (z+1); + x = x + 676.5203681218835 / z; + x = x + 0.9999999999995183; + return math.log(x) - 5.58106146679532777 - z + (z-0.5) * math.log(z+6.5); +end + +-- Description: regularized incomplete gamma function +-- Dependent on: math.lgamma() +--[[ + Formulas are taken from Wiki, with additional input from Numerical + Recipes in C (for modified Lentz's algorithm) and AS245 + (http://lib.stat.cmu.edu/apstat/245). 
+ + A good online calculator is available at: + + http://www.danielsoper.com/statcalc/calc23.aspx + + It calculates upper incomplete gamma function, which equals + math.igamma(s,z,true)*math.exp(math.lgamma(s)) +]]-- +function math.igamma(s, z, complement) + + local function _kf_gammap(s, z) + local sum, x = 1, 1; + for k = 1, 100 do + x = x * z / (s + k); + sum = sum + x; + if x / sum < 1e-14 then break end + end + return math.exp(s * math.log(z) - z - math.lgamma(s + 1.) + math.log(sum)); + end + + local function _kf_gammaq(s, z) + local C, D, f, TINY; + f = 1. + z - s; C = f; D = 0.; TINY = 1e-290; + -- Modified Lentz's algorithm for computing continued fraction. See Numerical Recipes in C, 2nd edition, section 5.2 + for j = 1, 100 do + local d; + local a, b = j * (s - j), j*2 + 1 + z - s; + D = b + a * D; + if D < TINY then D = TINY end + C = b + a / C; + if C < TINY then C = TINY end + D = 1. / D; + d = C * D; + f = f * d; + if math.abs(d - 1) < 1e-14 then break end + end + return math.exp(s * math.log(z) - z - math.lgamma(s) - math.log(f)); + end + + if complement then + return ((z <= 1 or z < s) and 1 - _kf_gammap(s, z)) or _kf_gammaq(s, z); + else + return ((z <= 1 or z < s) and _kf_gammap(s, z)) or (1 - _kf_gammaq(s, z)); + end +end + +function math.brent(func, a, b, tol) + local gold1, gold2, tiny, max_iter = 1.6180339887, 0.3819660113, 1e-20, 100 + + local fa, fb = func(a, data), func(b, data) + if fb > fa then -- swap, such that f(a) > f(b) + a, b, fa, fb = b, a, fb, fa + end + local c = b + gold1 * (b - a) + local fc = func(c) -- golden section extrapolation + while fb > fc do + local bound = b + 100.0 * (c - b) -- the farthest point where we want to go + local r = (b - a) * (fb - fc) + local q = (b - c) * (fb - fa) + if math.abs(q - r) < tiny then -- avoid 0 denominator + tmp = q > r and tiny or 0.0 - tiny + else tmp = q - r end + u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp) -- u is the parabolic extrapolation point + if (b > u and u > c) or (b < 
u and u < c) then -- u lies between b and c + fu = func(u) + if fu < fc then -- (b,u,c) bracket the minimum + a, b, fa, fb = b, u, fb, fu + break + elseif fu > fb then -- (a,b,u) bracket the minimum + c, fc = u, fu + break + end + u = c + gold1 * (c - b) + fu = func(u) -- golden section extrapolation + elseif (c > u and u > bound) or (c < u and u < bound) then -- u lies between c and bound + fu = func(u) + if fu < fc then -- fb > fc > fu + b, c, u = c, u, c + gold1 * (c - b) + fb, fc, fu = fc, fu, func(u) + else -- (b,c,u) bracket the minimum + a, b, c = b, c, u + fa, fb, fc = fb, fc, fu + break + end + elseif (u > bound and bound > c) or (u < bound and bound < c) then -- u goes beyond the bound + u = bound + fu = func(u) + else -- u goes the other way around, use golden section extrapolation + u = c + gold1 * (c - b) + fu = func(u) + end + a, b, c = b, c, u + fa, fb, fc = fb, fc, fu + end + if a > c then a, c = c, a end -- swap + + -- now, afb and fb tol1 then + -- related to parabolic interpolation + local r = (b - w) * (fb - fv) + local q = (b - v) * (fb - fw) + local p = (b - v) * q - (b - w) * r + q = 2.0 * (q - r) + if q > 0.0 then p = 0.0 - p + else q = 0.0 - q end + eold, e = e, d + if math.abs(p) >= math.abs(0.5 * q * eold) or p <= q * (a - b) or p >= q * (c - b) then + e = b >= mid and a - b or c - b + d = gold2 * e + else + d, u = p / q, b + d -- actual parabolic interpolation happens here + if u - a < tol2 or c - u < tol2 then + d = mid > b and tol1 or 0.0 - tol1 + end + end + else -- golden section interpolation + e = b >= min and a - b or c - b + d = gold2 * e + end + u = fabs(d) >= tol1 and b + d or b + (d > 0.0 and tol1 or -tol1); + fu = func(u) + if fu <= fb then -- u is the minimum point so far + if u >= b then a = b + else c = b end + v, w, b = w, b, u + fv, fw, fb = fw, fb, fu + else -- adjust (a,c) and (u,v,w) + if u < b then a = u + else c = u end + if fu <= fw or w == b then + v, w = w, u + fv, fw = fw, fu + elseif fu <= fv or v == b or v == 
w then + v, fv = u, fu; + end + end + end + return fb, b +end + +matrix = {} + +-- Description: chi^2 test for contingency tables +-- Dependent on: math.igamma() +function matrix.chi2(a) + if #a == 2 and #a[1] == 2 then -- 2x2 table + local x, z + x = (a[1][1] + a[1][2]) * (a[2][1] + a[2][2]) * (a[1][1] + a[2][1]) * (a[1][2] + a[2][2]) + if x == 0 then return 0, 1, false end + z = a[1][1] * a[2][2] - a[1][2] * a[2][1] + z = (a[1][1] + a[1][2] + a[2][1] + a[2][2]) * z * z / x + return z, math.igamma(.5, .5 * z, true), true + else -- generic table + local rs, cs, n, m, N, z = {}, {}, #a, #a[1], 0, 0 + for i = 1, n do rs[i] = 0 end + for j = 1, m do cs[j] = 0 end + for i = 1, n do -- compute column sum and row sum + for j = 1, m do cs[j], rs[i] = cs[j] + a[i][j], rs[i] + a[i][j] end + end + for i = 1, n do N = N + rs[i] end + for i = 1, n do -- compute the chi^2 statistics + for j = 1, m do + local E = rs[i] * cs[j] / N; + z = z + (a[i][j] - E) * (a[i][j] - E) / E + end + end + return z, math.igamma(.5 * (n-1) * (m-1), .5 * z, true), true; + end +end + +--------------------------------- +-- END: routines from klib.lua -- +--------------------------------- + + +-------------------------- +-- BEGIN: misc routines -- +-------------------------- + +-- precompute an array for PL->probability conversion +-- @param m maximum PL +function algo_init_q2p(m) + local q2p = {} + for i = 0, m do + q2p[i] = math.pow(10, -i / 10) + end + return q2p +end + +-- given the haplotype frequency, compute r^2 +-- @param f 4 haplotype frequencies; f[] is 0-indexed. 
+-- @return r^2 +function algo_r2(f) + local p = { f[0] + f[1], f[0] + f[2] } + local D = f[0] * f[3] - f[1] * f[2] + return (p[1] == 0 or p[2] == 0 or 1-p[1] == 0 or 1-p[2] == 0) and 0 or D * D / (p[1] * p[2] * (1 - p[1]) * (1 - p[2])) +end + +-- parse a VCF line to get PL +-- @param q2p is computed by algo_init_q2p() +function text_parse_pl(t, q2p, parse_GT) + parse_GT = parse_GT == nil and true or false + local ht, gt, pl = {}, {}, {} + local s, j0 = t[9]:split(':'), 0 + for j = 1, #s do + if s[j] == 'PL' then j0 = j break end + end + local has_GT = (s[1] == 'GT' and parse_GT) and true or false + for i = 10, #t do + if j0 > 0 then + local s = t[i]:split(':') + local a, b = 1, s[j0]:find(',') + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))] + a, b = b + 1, s[j0]:find(',', b + 1) + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, b - 1))] + a, b = b + 1, s[j0]:find(',', b + 1) + pl[#pl+1] = q2p[tonumber(s[j0]:sub(a, (b and b - 1) or nil))] + end + if has_GT then + if t[i]:sub(1, 1) ~= '.' then + local g = tonumber(t[i]:sub(1, 1)) + tonumber(t[i]:sub(3, 3)); + gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6; gt[#gt+1] = 1e-6 + gt[#gt - 2 + g] = 1 + ht[#ht+1] = tonumber(t[i]:sub(1, 1)); ht[#ht+1] = tonumber(t[i]:sub(3, 3)); + else + gt[#gt+1] = 1; gt[#gt+1] = 1; gt[#gt+1] = 1 + ht[#ht+1] = -1; ht[#ht+1] = -1; + end + end +-- print(t[i], pl[#pl-2], pl[#pl-1], pl[#pl], gt[#gt-2], gt[#gt-1], gt[#gt]) + end + if #pl == 0 then pl = nil end + local x = has_GT and { t[1], t[2], ht, gt, pl } or { t[1], t[2], nil, nil, pl } + return x +end + +-- Infer haplotype frequency +-- @param pdg genotype likelihoods P(D|g) generated by text_parse_pl(). pdg[] is 1-indexed. 
+-- @param eps precision [1e-5] +-- @return 2-locus haplotype frequencies, 0-indexed array +function algo_hapfreq2(pdg, eps) + eps = eps or 1e-5 + local n, f = #pdg[1] / 3, {[0]=0.25, 0.25, 0.25, 0.25} + for iter = 1, 100 do + local F = {[0]=0, 0, 0, 0} + for i = 0, n - 1 do + local p1, p2 = {[0]=pdg[1][i*3+1], pdg[1][i*3+2], pdg[1][i*3+3]}, {[0]=pdg[2][i*3+1], pdg[2][i*3+2], pdg[2][i*3+3]} + local u = { [0]= + f[0] * (f[0] * p1[0] * p2[0] + f[1] * p1[0] * p2[1] + f[2] * p1[1] * p2[0] + f[3] * p1[1] * p2[1]), + f[1] * (f[0] * p1[0] * p2[1] + f[1] * p1[0] * p2[2] + f[2] * p1[1] * p2[1] + f[3] * p1[1] * p2[2]), + f[2] * (f[0] * p1[1] * p2[0] + f[1] * p1[1] * p2[1] + f[2] * p1[2] * p2[0] + f[3] * p1[2] * p2[1]), + f[3] * (f[0] * p1[1] * p2[1] + f[1] * p1[1] * p2[2] + f[2] * p1[2] * p2[1] + f[3] * p1[2] * p2[2]) + } + local s = u[0] + u[1] + u[2] + u[3] + s = 1 / (s * n) + F[0] = F[0] + u[0] * s + F[1] = F[1] + u[1] * s + F[2] = F[2] + u[2] * s + F[3] = F[3] + u[3] * s + end + local e = 0 + for k = 0, 3 do + e = math.abs(f[k] - F[k]) > e and math.abs(f[k] - F[k]) or e + end + for k = 0, 3 do f[k] = F[k] end + if e < eps then break end +-- print(f[0], f[1], f[2], f[3]) + end + return f +end + +------------------------ +-- END: misc routines -- +------------------------ + + +--------------------- +-- BEGIN: commands -- +--------------------- + +-- CMD vcf2bgl: convert PL tagged VCF to Beagle input -- +function cmd_vcf2bgl() + if #arg == 0 then + print("\nUsage: vcf2bgl.lua ") + print("\nNB: This command finds PL by matching /(\\d+),(\\d+),(\\d+)/.\n"); + os.exit(1) + end + + local lookup = {} + for i = 0, 10000 do lookup[i] = string.format("%.4f", math.pow(10, -i/10)) end + + local fp = io.xopen(arg[1]) + for l in fp:lines() do + if l:sub(1, 2) == '##' then -- meta lines; do nothing + elseif l:sub(1, 1) == '#' then -- sample lines + local t, s = l:split('\t'), {} + for i = 10, #t do s[#s+1] = t[i]; s[#s+1] = t[i]; s[#s+1] = t[i] end + print('marker', 'alleleA', 
'alleleB', table.concat(s, '\t')) + else -- data line + local t = l:split('\t'); + if t[5] ~= '.' and t[5]:find(",") == nil and #t[5] == 1 and #t[4] == 1 then -- biallic SNP + local x, z = -1, {}; + if t[9]:find('PL') then + for i = 10, #t do + local AA, Aa, aa = t[i]:match('(%d+),(%d+),(%d+)') + AA = tonumber(AA); Aa = tonumber(Aa); aa = tonumber(aa); + if AA ~= nil then + z[#z+1] = lookup[AA]; z[#z+1] = lookup[Aa]; z[#z+1] = lookup[aa]; + else z[#z+1] = 1; z[#z+1] = 1; z[#z+1] = 1; end + end + print(t[1]..':'..t[2], t[4], t[5], table.concat(z, '\t')) + elseif t[9]:find('GL') then + print('Error: not implemented') + os.exit(1) + end + end + end + end + fp:close() +end + +-- CMD bgl2vcf: convert Beagle output to VCF +function cmd_bgl2vcf() + if #arg < 2 then + print('Usage: bgl2vcf.lua ') + os.exit(1) + end + + local fpp = io.xopen(arg[1]); + local fpg = io.xopen(arg[2]); + for lg in fpg:lines() do + local tp, tg, a = fpp:read():split('%s'), lg:split('%s', 4), {} + if tp[1] == 'I' then + for i = 3, #tp, 2 do a[#a+1] = tp[i] end + print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', table.concat(a, '\t')) + else + local chr, pos = tg[1]:match('(%S+):(%d+)$') + a = {chr, pos, '.', tg[2], tg[3], 30, '.', '.', 'GT'} + for i = 3, #tp, 2 do + a[#a+1] = ((tp[i] == tg[2] and 0) or 1) .. '|' .. 
((tp[i+1] == tg[2] and 0) or 1) + end + print(table.concat(a, '\t')) + end + end + fpg:close(); fpp:close(); +end + +-- CMD freq: count alleles in each population +function cmd_freq() + -- parse the command line + local site_only = true; -- print site allele frequency or not + for c in os.getopt(arg, 's') do + if c == 's' then site_only = false end + end + if #arg == 0 then + print("\nUsage: vcfutils.lua freq [-s] [samples.txt]\n") + print("NB: 1) This command only considers biallelic variants.") + print(" 2) Apply '-s' to get the allele frequency spectrum.") + print(" 3) 'samples.txt' is TAB-delimited with each line consisting of sample and population.") + print("") + os.exit(1) + end + + -- read the sample-population pairs + local pop, sample = {}, {} + if #arg > 1 then + local fp = io.xopen(arg[2]); + for l in fp:lines() do + local s, p = l:match("^(%S+)%s+(%S+)"); -- sample, population pair + sample[s] = p; -- FIXME: check duplications + if pop[p] then table.insert(pop[p], s) + else pop[p] = {s} end + end + fp:close(); + end + pop['NA'] = {} + + -- parse VCF + fp = (#arg >= 2 and io.xopen(arg[1])) or io.stdin; + local col, cnt = {}, {}; + for k in pairs(pop) do + col[k], cnt[k] = {}, {[0]=0}; + end + for l in fp:lines() do + if l:sub(1, 2) == '##' then -- meta lines; do nothing + elseif l:sub(1, 1) == '#' then -- the sample line + local t, del_NA = l:split('\t'), true; + for i = 10, #t do + local k = sample[t[i]] + if k == nil then + k, del_NA = 'NA', false + table.insert(pop[k], t[i]) + end + table.insert(col[k], i); + table.insert(cnt[k], 0); + table.insert(cnt[k], 0); + end + if del_NA then pop['NA'], col['NA'], cnt['NA'] = nil, nil, nil end + else -- data lines + local t = l:split('\t'); + if t[5] ~= '.' 
and t[5]:find(",") == nil then -- biallic + if site_only == true then io.write(t[1], '\t', t[2], '\t', t[4], '\t', t[5]) end + for k, v in pairs(col) do + local ac, an = 0, 0; + for i = 1, #v do + local a1, a2 = t[v[i]]:match("^(%d).(%d)"); + if a1 ~= nil then ac, an = ac + a1 + a2, an + 2 end + end + if site_only == true then io.write('\t', k, ':', an, ':', ac) end + if an == #cnt[k] then cnt[k][ac] = cnt[k][ac] + 1 end + end + if site_only == true then io.write('\n') end + end + end + end + fp:close(); + + -- print + if site_only == false then + for k, v in pairs(cnt) do + io.write(k .. "\t" .. #v); + for i = 0, #v do io.write("\t" .. v[i]) end + io.write('\n'); + end + end +end + +function cmd_vcf2chi2() + if #arg < 3 then + print("Usage: vcfutils.lua vcf2chi2 "); + os.exit(1) + end + + local g = {}; + + -- read the list of groups + local fp = io.xopen(arg[2]); + for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 1 end -- FIXME: check duplicate + fp:close() + fp = io.xopen(arg[3]); + for l in fp:lines() do local x = l:match("^(%S+)"); g[x] = 2 end + fp:close() + + -- process VCF + fp = io.xopen(arg[1]) + local h = {{}, {}} + for l in fp:lines() do + if l:sub(1, 2) == '##' then print(l) -- meta lines; do nothing + elseif l:sub(1, 1) == '#' then -- sample lines + local t = l:split('\t'); + for i = 10, #t do + if g[t[i]] == 1 then table.insert(h[1], i) + elseif g[t[i]] == 2 then table.insert(h[2], i) end + end + while #t > 8 do table.remove(t) end + print(table.concat(t, "\t")) + else -- data line + local t = l:split('\t'); + if t[5] ~= '.' 
and t[5]:find(",") == nil then -- biallic + local a = {{0, 0}, {0, 0}} + for i = 1, 2 do + for _, k in pairs(h[i]) do + if t[k]:find("^0.0") then a[i][1] = a[i][1] + 2 + elseif t[k]:find("^1.1") then a[i][2] = a[i][2] + 2 + elseif t[k]:find("^0.1") or t[k]:find("^1.0") then + a[i][1], a[i][2] = a[i][1] + 1, a[i][2] + 1 + end + end + end + local chi2, p, succ = matrix.chi2(a); + while #t > 8 do table.remove(t) end + --print(a[1][1], a[1][2], a[2][1], a[2][2], chi2, p); + if succ then print(table.concat(t, "\t") .. ";PCHI2=" .. string.format("%.3g", p) + .. string.format(';AF1=%.4g;AF2=%.4g,%.4g', (a[1][2]+a[2][2]) / (a[1][1]+a[1][2]+a[2][1]+a[2][2]), + a[1][2]/(a[1][1]+a[1][2]), a[2][2]/(a[2][1]+a[2][2]))) + else print(table.concat(t, "\t")) end + end + end + end + fp:close() +end + +-- CMD: compute r^2 +function cmd_r2() + local w, is_ht, is_gt = 1, false, false + for o, a in os.getopt(arg, 'w:hg') do + if o == 'w' then w = tonumber(a) + elseif o == 'h' then is_ht, is_gt = true, true + elseif o == 'g' then is_gt = true + end + end + if #arg == 0 then + print("Usage: vcfutils.lua r2 [-hg] [-w 1] ") + os.exit(1) + end + local stack, fp, q2p = {}, io.xopen(arg[1]), algo_init_q2p(1023) + for l in fp:lines() do + if l:sub(1, 1) ~= '#' then + local t = l:split('\t') + local x = text_parse_pl(t, q2p) + if #t[5] == 1 and t[5] ~= '.' 
then -- biallelic + local r2 = {} + for k = 1, w do + if is_gt == false then -- use PL + if stack[k] then + local pdg = { stack[k][5], x[5] } + r2[#r2+1] = algo_r2(algo_hapfreq2(pdg)) + else r2[#r2+1] = 0 end + elseif is_ht == false then -- use unphased GT + if stack[k] then + local pdg = { stack[k][4], x[4] } + r2[#r2+1] = algo_r2(algo_hapfreq2(pdg)) + else r2[#r2+1] = 0 end + else -- use phased GT + if stack[k] then + local f, ht = { [0]=0, 0, 0, 0 }, { stack[k][3], x[3] } + for i = 1, #ht[1] do + local j = ht[1][i] * 2 + ht[2][i] + f[j] = f[j] + 1 + end + local sum = f[0] + f[1] + f[2] + f[3] + for k = 0, 3 do f[k] = f[k] / sum end + r2[#r2+1] = algo_r2(f) + else r2[#r2+1] = 0 end + end + end + for k = 1, #r2 do + r2[k] = string.format('%.3f', r2[k]) + end + print(x[1], x[2], table.concat(r2, '\t')) + if #stack == w then table.remove(stack, 1) end + stack[#stack+1] = x + end + end + end + fp:close() +end + +------------------- +-- END: commands -- +------------------- + + +------------------- +-- MAIN FUNCTION -- +------------------- + +if #arg == 0 then + print("\nUsage: vcfutils.lua \n") + print("Command: freq count biallelic alleles in each population") + print(" r2 compute r^2") + print(" vcf2chi2 compute 1-degree chi-square between two groups of samples") + print(" vcf2bgl convert PL annotated VCF to Beagle input") + print(" bgl2vcf convert Beagle input to VCF") + print("") + os.exit(1) +end + +local cmd = arg[1] +table.remove(arg, 1) +if cmd == 'vcf2bgl' then cmd_vcf2bgl() +elseif cmd == 'bgl2vcf' then cmd_bgl2vcf() +elseif cmd == 'freq' then cmd_freq() +elseif cmd == 'r2' then cmd_r2() +elseif cmd == 'vcf2chi2' then cmd_vcf2chi2() +else + print('ERROR: unknown command "' .. cmd .. 
'"') + os.exit(1) +end diff --git a/sam/padding.c b/sam/padding.c new file mode 100644 index 0000000..a8da562 --- /dev/null +++ b/sam/padding.c @@ -0,0 +1,479 @@ +#include +#include +#include +#include "kstring.h" +#include "sam_header.h" +#include "sam.h" +#include "bam.h" +#include "faidx.h" + +bam_header_t *bam_header_dup(const bam_header_t *h0); /*in sam.c*/ + +static void replace_cigar(bam1_t *b, int n, uint32_t *cigar) +{ + if (n != b->core.n_cigar) { + int o = b->core.l_qname + b->core.n_cigar * 4; + if (b->data_len + (n - b->core.n_cigar) * 4 > b->m_data) { + b->m_data = b->data_len + (n - b->core.n_cigar) * 4; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + memmove(b->data + b->core.l_qname + n * 4, b->data + o, b->data_len - o); + memcpy(b->data + b->core.l_qname, cigar, n * 4); + b->data_len += (n - b->core.n_cigar) * 4; + b->core.n_cigar = n; + } else memcpy(b->data + b->core.l_qname, cigar, n * 4); +} + +#define write_cigar(_c, _n, _m, _v) do { \ + if (_n == _m) { \ + _m = _m? 
_m<<1 : 4; \ + _c = (uint32_t*)realloc(_c, _m * 4); \ + } \ + _c[_n++] = (_v); \ + } while (0) + +static void unpad_seq(bam1_t *b, kstring_t *s) +{ + int k, j, i; + int length; + uint32_t *cigar = bam1_cigar(b); + uint8_t *seq = bam1_seq(b); + // b->core.l_qseq gives length of the SEQ entry (including soft clips, S) + // We need the padded length after alignment from the CIGAR (excluding + // soft clips S, but including pads from CIGAR D operations) + length = 0; + for (k = 0; k < b->core.n_cigar; ++k) { + int op, ol; + op= bam_cigar_op(cigar[k]); + ol = bam_cigar_oplen(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF || op == BAM_CDEL) + length += ol; + } + ks_resize(s, length); + for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) { + int op, ol; + op = bam_cigar_op(cigar[k]); + ol = bam_cigar_oplen(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam1_seqi(seq, j); + } else if (op == BAM_CSOFT_CLIP) { + j += ol; + } else if (op == BAM_CHARD_CLIP) { + /* do nothing */ + } else if (op == BAM_CDEL) { + for (i = 0; i < ol; ++i) s->s[s->l++] = 0; + } else { + fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam1_qname(b)); + assert(-1); + } + } + assert(length == s->l); +} + +int load_unpadded_ref(faidx_t *fai, char *ref_name, int ref_len, kstring_t *seq) +{ + char base; + char *fai_ref = 0; + int fai_ref_len = 0, k; + + fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); + if (fai_ref_len != ref_len) { + fprintf(stderr, "[depad] ERROR: FASTA sequence %s length %i, expected %i\n", ref_name, fai_ref_len, ref_len); + free(fai_ref); + return -1; + } + ks_resize(seq, ref_len); + seq->l = 0; + for (k = 0; k < ref_len; ++k) { + base = fai_ref[k]; + if (base == '-' || base == '*') { + // Map gaps to null to match unpad_seq function + seq->s[seq->l++] = 0; + } else { + int i = bam_nt16_table[(int)base]; + if (i == 0 || i==16) { // 
Equals maps to 0, anything unexpected to 16 + fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence %s\n", base, (int)base, ref_name); + free(fai_ref); + return -1; + } + seq->s[seq->l++] = i; + } + } + assert(ref_len == seq->l); + free(fai_ref); + return 0; +} + +int get_unpadded_len(faidx_t *fai, char *ref_name, int padded_len) +{ + char base; + char *fai_ref = 0; + int fai_ref_len = 0, k; + int bases=0, gaps=0; + + fai_ref = fai_fetch(fai, ref_name, &fai_ref_len); + if (fai_ref_len != padded_len) { + fprintf(stderr, "[depad] ERROR: FASTA sequence '%s' length %i, expected %i\n", ref_name, fai_ref_len, padded_len); + free(fai_ref); + return -1; + } + for (k = 0; k < padded_len; ++k) { + //fprintf(stderr, "[depad] checking base %i of %i or %i\n", k+1, ref_len, strlen(fai_ref)); + base = fai_ref[k]; + if (base == '-' || base == '*') { + gaps += 1; + } else { + int i = bam_nt16_table[(int)base]; + if (i == 0 || i==16) { // Equals maps to 0, anything unexpected to 16 + fprintf(stderr, "[depad] ERROR: Invalid character %c (ASCII %i) in FASTA sequence '%s'\n", base, (int)base, ref_name); + free(fai_ref); + return -1; + } + bases += 1; + } + } + free(fai_ref); + assert (padded_len == bases + gaps); + return bases; +} + +inline int * update_posmap(int *posmap, kstring_t ref) +{ + int i, k; + posmap = realloc(posmap, ref.m * sizeof(int)); + for (i = k = 0; i < ref.l; ++i) { + posmap[i] = k; + if (ref.s[i]) ++k; + } + return posmap; +} + +int bam_pad2unpad(samfile_t *in, samfile_t *out, faidx_t *fai) +{ + bam_header_t *h = 0; + bam1_t *b = 0; + kstring_t r, q; + int r_tid = -1; + uint32_t *cigar2 = 0; + int ret = 0, n2 = 0, m2 = 0, *posmap = 0; + + b = bam_init1(); + r.l = r.m = q.l = q.m = 0; r.s = q.s = 0; + int read_ret; + h = in->header; + while ((read_ret = samread(in, b)) >= 0) { // read one alignment from `in' + uint32_t *cigar = bam1_cigar(b); + n2 = 0; + if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), 
h->target_name[b->core.tid]) == 0) { + // fprintf(stderr, "[depad] Found embedded reference '%s'\n", bam1_qname(b)); + r_tid = b->core.tid; + unpad_seq(b, &r); + if (h->target_len[r_tid] != r.l) { + fprintf(stderr, "[depad] ERROR: (Padded) length of '%s' is %d in BAM header, but %ld in embedded reference\n", bam1_qname(b), h->target_len[r_tid], r.l); + return -1; + } + if (fai) { + // Check the embedded reference matches the FASTA file + if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &q)) { + fprintf(stderr, "[depad] ERROR: Failed to load embedded reference '%s' from FASTA\n", h->target_name[b->core.tid]); + return -1; + } + assert(r.l == q.l); + int i; + for (i = 0; i < r.l; ++i) { + if (r.s[i] != q.s[i]) { + // Show gaps as ASCII 45 + fprintf(stderr, "[depad] ERROR: Embedded sequence and reference FASTA don't match for %s base %i, '%c' vs '%c'\n", + h->target_name[b->core.tid], i+1, + r.s[i] ? bam_nt16_rev_table[(int)r.s[i]] : 45, + q.s[i] ? bam_nt16_rev_table[(int)q.s[i]] : 45); + return -1; + } + } + } + write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH)); + replace_cigar(b, n2, cigar2); + posmap = update_posmap(posmap, r); + } else if (b->core.n_cigar > 0) { + int i, k, op; + if (b->core.tid < 0) { + fprintf(stderr, "[depad] ERROR: Read '%s' has CIGAR but no RNAME\n", bam1_qname(b)); + return -1; + } else if (b->core.tid == r_tid) { + ; // good case, reference available + //fprintf(stderr, "[depad] Have ref '%s' for read '%s'\n", h->target_name[b->core.tid], bam1_qname(b)); + } else if (fai) { + if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { + fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); + return -1; + } + posmap = update_posmap(posmap, r); + r_tid = b->core.tid; + // fprintf(stderr, "[depad] Loaded %s from FASTA file\n", h->target_name[b->core.tid]); + } else { + fprintf(stderr, "[depad] ERROR: 
Missing %s embedded reference sequence (and no FASTA file)\n", h->target_name[b->core.tid]); + return -1; + } + unpad_seq(b, &q); + if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[0]); + } else if (bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) { + write_cigar(cigar2, n2, m2, cigar[0]); + if (b->core.n_cigar > 2 && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[1]); + } + } + /* Determine CIGAR operator for each base in the aligned read */ + for (i = 0, k = b->core.pos; i < q.l; ++i, ++k) + q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD); + /* Include any pads if starts with an insert */ + if (q.s[0] == BAM_CINS) { + for (k = 0; k+1 < b->core.pos && !r.s[b->core.pos - k - 1]; ++k); + if (k) write_cigar(cigar2, n2, m2, bam_cigar_gen(k, BAM_CPAD)); + } + /* Count consecutive CIGAR operators to turn into a CIGAR string */ + for (i = k = 1, op = q.s[0]; i < q.l; ++i) { + if (op != q.s[i]) { + write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); + op = q.s[i]; k = 1; + } else ++k; + } + write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op)); + if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); + } else if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CHARD_CLIP) { + if (b->core.n_cigar > 2 && bam_cigar_op(cigar[b->core.n_cigar-2]) == BAM_CSOFT_CLIP) { + write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-2]); + } + write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]); + } + /* Remove redundant P operators between M/X/=/D operators, e.g. 
5M2P10M -> 15M */ + int pre_op, post_op; + for (i = 2; i < n2; ++i) + if (bam_cigar_op(cigar2[i-1]) == BAM_CPAD) { + pre_op = bam_cigar_op(cigar2[i-2]); + post_op = bam_cigar_op(cigar2[i]); + /* Note don't need to check for X/= as code above will use M only */ + if ((pre_op == BAM_CMATCH || pre_op == BAM_CDEL) && (post_op == BAM_CMATCH || post_op == BAM_CDEL)) { + /* This is a redundant P operator */ + cigar2[i-1] = 0; // i.e. 0M + /* If had same operator either side, combine them in post_op */ + if (pre_op == post_op) { + /* If CIGAR M, could treat as simple integers since BAM_CMATCH is zero*/ + cigar2[i] = bam_cigar_gen(bam_cigar_oplen(cigar2[i-2]) + bam_cigar_oplen(cigar2[i]), post_op); + cigar2[i-2] = 0; // i.e. 0M + } + } + } + /* Remove the zero'd operators (0M) */ + for (i = k = 0; i < n2; ++i) + if (cigar2[i]) cigar2[k++] = cigar2[i]; + n2 = k; + replace_cigar(b, n2, cigar2); + b->core.pos = posmap[b->core.pos]; + if (b->core.mtid < 0 || b->core.mpos < 0) { + /* Nice case, no mate to worry about*/ + // fprintf(stderr, "[depad] Read '%s' mate not mapped\n", bam1_qname(b)); + /* TODO - Warning if FLAG says mate should be mapped? 
*/ + /* Clean up funny input where mate position is given but mate reference is missing: */ + b->core.mtid = -1; + b->core.mpos = -1; + } else if (b->core.mtid == b->core.tid) { + /* Nice case, same reference */ + // fprintf(stderr, "[depad] Read '%s' mate mapped to same ref\n", bam1_qname(b)); + b->core.mpos = posmap[b->core.mpos]; + } else { + /* Nasty case, Must load alternative posmap */ + // fprintf(stderr, "[depad] Loading reference '%s' temporarily\n", h->target_name[b->core.mtid]); + if (!fai) { + fprintf(stderr, "[depad] ERROR: Needed reference %s sequence for mate (and no FASTA file)\n", h->target_name[b->core.mtid]); + return -1; + } + /* Temporarily load the other reference sequence */ + if (load_unpadded_ref(fai, h->target_name[b->core.mtid], h->target_len[b->core.mtid], &r)) { + fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.mtid]); + return -1; + } + posmap = update_posmap(posmap, r); + b->core.mpos = posmap[b->core.mpos]; + /* Restore the reference and posmap*/ + if (load_unpadded_ref(fai, h->target_name[b->core.tid], h->target_len[b->core.tid], &r)) { + fprintf(stderr, "[depad] ERROR: Failed to load '%s' from reference FASTA\n", h->target_name[b->core.tid]); + return -1; + } + posmap = update_posmap(posmap, r); + } + } + samwrite(out, b); + } + if (read_ret < -1) { + fprintf(stderr, "[depad] truncated file.\n"); + ret = 1; + } + free(r.s); free(q.s); free(posmap); + bam_destroy1(b); + return ret; +} + +bam_header_t * fix_header(bam_header_t *old, faidx_t *fai) +{ + int i = 0, unpadded_len = 0; + bam_header_t *header = 0 ; + + header = bam_header_dup(old); + for (i = 0; i < old->n_targets; ++i) { + unpadded_len = get_unpadded_len(fai, old->target_name[i], old->target_len[i]); + if (unpadded_len < 0) { + fprintf(stderr, "[depad] ERROR getting unpadded length of '%s', padded length %i\n", old->target_name[i], old->target_len[i]); + } else { + header->target_len[i] = unpadded_len; + //fprintf(stderr, 
"[depad] Recalculating '%s' length %i -> %i\n", old->target_name[i], old->target_len[i], header->target_len[i]); + } + } + /* Duplicating the header allocated new buffer for header string */ + /* After modifying the @SQ lines it will only get smaller, since */ + /* the LN entries will be the same or shorter, and we'll remove */ + /* any MD entries (MD5 checksums). */ + assert(strlen(old->text) == strlen(header->text)); + assert (0==strcmp(old->text, header->text)); + const char *text; + text = old->text; + header->text[0] = '\0'; /* Resuse the allocated buffer */ + char * newtext = header->text; + char * end=NULL; + while (text[0]=='@') { + end = strchr(text, '\n'); + assert(end != 0); + if (text[1]=='S' && text[2]=='Q' && text[3]=='\t') { + /* TODO - edit the @SQ line here to remove MD and fix LN. */ + /* For now just remove the @SQ line, and samtools will */ + /* automatically generate a minimal replacement with LN. */ + /* However, that discards any other tags like AS, SP, UR. */ + //fprintf(stderr, "[depad] Removing @SQ line\n"); + } else { + /* Copy this line to the new header */ + strncat(newtext, text, end - text + 1); + } + text = end + 1; + } + assert (text[0]=='\0'); + /* Check we didn't overflow the buffer */ + assert (strlen(header->text) <= strlen(old->text)); + if (strlen(header->text) < header->l_text) { + //fprintf(stderr, "[depad] Reallocating header buffer\n"); + assert (newtext == header->text); + newtext = malloc(strlen(header->text) + 1); + strcpy(newtext, header->text); + free(header->text); + header->text = newtext; + header->l_text = strlen(newtext); + } + //fprintf(stderr, "[depad] Here is the new header (pending @SQ lines),\n\n%s\n(end)\n", header->text); + return header; +} + +static int usage(int is_long_help); + +int main_pad2unpad(int argc, char *argv[]) +{ + samfile_t *in = 0, *out = 0; + bam_header_t *h = 0; + faidx_t *fai = 0; + int c, is_bamin = 1, compress_level = -1, is_bamout = 1, is_long_help = 0; + char in_mode[5], 
out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0; + int ret=0; + + /* parse command-line options */ + strcpy(in_mode, "r"); strcpy(out_mode, "w"); + while ((c = getopt(argc, argv, "Sso:u1T:?")) >= 0) { + switch (c) { + case 'S': is_bamin = 0; break; + case 's': assert(compress_level == -1); is_bamout = 0; break; + case 'o': fn_out = strdup(optarg); break; + case 'u': assert(is_bamout == 1); compress_level = 0; break; + case '1': assert(is_bamout == 1); compress_level = 1; break; + case 'T': fn_ref = strdup(optarg); break; + case '?': is_long_help = 1; break; + default: return usage(is_long_help); + } + } + if (argc == optind) return usage(is_long_help); + + if (is_bamin) strcat(in_mode, "b"); + if (is_bamout) strcat(out_mode, "b"); + strcat(out_mode, "h"); + if (compress_level >= 0) { + char tmp[2]; + tmp[0] = compress_level + '0'; tmp[1] = '\0'; + strcat(out_mode, tmp); + } + + // Load FASTA reference (also needed for SAM -> BAM if missing header) + if (fn_ref) { + fn_list = samfaipath(fn_ref); + fai = fai_load(fn_ref); + } + // open file handlers + if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { + fprintf(stderr, "[depad] failed to open \"%s\" for reading.\n", argv[optind]); + ret = 1; + goto depad_end; + } + if (in->header == 0) { + fprintf(stderr, "[depad] failed to read the header from \"%s\".\n", argv[optind]); + ret = 1; + goto depad_end; + } + if (in->header->text == 0 || in->header->l_text == 0) { + fprintf(stderr, "[depad] Warning - failed to read any header text from \"%s\".\n", argv[optind]); + assert (0 == in->header->l_text); + assert (0 == in->header->text); + } + if (fn_ref) { + h = fix_header(in->header, fai); + } else { + fprintf(stderr, "[depad] Warning - reference lengths will not be corrected without FASTA reference\n"); + h = in->header; + } + if ((out = samopen(fn_out? fn_out : "-", out_mode, h)) == 0) { + fprintf(stderr, "[depad] failed to open \"%s\" for writing.\n", fn_out? 
fn_out : "standard output"); + ret = 1; + goto depad_end; + } + + // Do the depad + ret = bam_pad2unpad(in, out, fai); + +depad_end: + // close files, free and return + if (fai) fai_destroy(fai); + if (h != in->header) bam_header_destroy(h); + samclose(in); + samclose(out); + free(fn_list); free(fn_out); + return ret; +} + +static int usage(int is_long_help) +{ + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools depad \n\n"); + fprintf(stderr, "Options: -s output is SAM (default is BAM)\n"); + fprintf(stderr, " -S input is SAM (default is BAM)\n"); + fprintf(stderr, " -u uncompressed BAM output (can't use with -s)\n"); + fprintf(stderr, " -1 fast compression BAM output (can't use with -s)\n"); + fprintf(stderr, " -T FILE reference sequence file [null]\n"); + fprintf(stderr, " -o FILE output file name [stdout]\n"); + fprintf(stderr, " -? longer help\n"); + fprintf(stderr, "\n"); + if (is_long_help) + fprintf(stderr, "Notes:\n\ +\n\ + 1. Requires embedded reference sequences (before the reads for that reference),\n\ + with the future aim to also support a FASTA padded reference sequence file.\n\ +\n\ + 2. The input padded alignment read's CIGAR strings must not use P or I operators.\n\ +\n"); + return 1; +} diff --git a/sam/sam.c b/sam/sam.c index f026bc8..fa11df6 100644 --- a/sam/sam.c +++ b/sam/sam.c @@ -36,6 +36,13 @@ static void append_header_text(bam_header_t *header, char* text, int len) header->text[header->l_text] = 0; } +int samthreads(samfile_t *fp, int n_threads, int n_sub_blks) +{ + if (!(fp->type&1) || (fp->type&2)) return -1; + bgzf_mt(fp->x.bam, n_threads, n_sub_blks); + return 0; +} + samfile_t *samopen(const char *fn, const char *mode, const void *aux) { samfile_t *fp; @@ -79,7 +86,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) } else { // text // open file fp->x.tamw = strcmp(fn, "-")? 
fopen(fn, "w") : stdout; - if (fp->x.tamr == 0) goto open_err_ret; + if (fp->x.tamw == 0) goto open_err_ret; if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2; else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2; else fp->type |= BAM_OFDEC<<2; diff --git a/sam/sam.h b/sam/sam.h index 0b87194..0495501 100644 --- a/sam/sam.h +++ b/sam/sam.h @@ -90,6 +90,7 @@ extern "C" { int sampileup(samfile_t *fp, int mask, bam_pileup_f func, void *data); char *samfaipath(const char *fn_ref); + int samthreads(samfile_t *fp, int n_threads, int n_sub_blks); #ifdef __cplusplus } diff --git a/sam/sam_header.c b/sam/sam_header.c index f4c8a3b..88b6a1c 100644 --- a/sam/sam_header.c +++ b/sam/sam_header.c @@ -366,6 +366,7 @@ static HeaderLine *sam_header_line_parse(const char *headerLine) while (*to && *to=='\t') to++; if ( to-from != 1 ) { debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + free(hline); return 0; } from = to; @@ -434,8 +435,14 @@ static int sam_header_line_validate(HeaderLine *hline) tag = tags->data; if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) { - debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); - return 0; + // Lower case tags are user-defined values. + if( !(islower(tag->key[0]) || islower(tag->key[1])) ) + { + // Neither is lower case, but tag was not recognized. 
+ debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); + // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes + } + // else - allow user defined tag } tags = tags->next; } @@ -663,6 +670,36 @@ char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n return ret; } +void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) +{ + list_t *l = iter; + if ( !l ) return NULL; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key, *value; + key = header_line_has_tag(hline,key_tag); + value = header_line_has_tag(hline,value_tag); + if ( !key && !value ) + { + l = l->next; + continue; + } + + *_key = key->value; + *_value = value->value; + return l->next; + } + return l; +} + const char *sam_tbl_get(void *h, const char *key) { khash_t(str) *tbl = (khash_t(str)*)h; @@ -733,4 +770,41 @@ void *sam_header_merge(int n, const void **_dicts) return out_dict; } +char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) +{ + int nout = 0; + char **out = NULL; + + *n = 0; + list_t *l = (list_t *)dict; + if ( !l ) return NULL; + + int i, ntags = 0; + while ( tags[ntags] ) ntags++; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); + for (i=0; ivalue; + } + nout++; + l = l->next; + } + *n = nout; + return out; +} diff --git a/sam/sam_header.h b/sam/sam_header.h index e5c754f..4b0cb03 100644 --- a/sam/sam_header.h +++ b/sam/sam_header.h @@ -10,8 +10,32 @@ extern "C" { void sam_header_free(void *header); char *sam_header_write(const void *headerDict); // returns a newly allocated string + /* + // Usage 
example + const char *key, *val; + void *iter = sam_header_parse2(bam->header->text); + while ( iter = sam_header_key_val(iter, "RG","ID","SM" &key,&val) ) printf("%s\t%s\n", key,val); + */ + void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value); char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); + /* + // Usage example + int i, j, n; + const char *tags[] = {"SN","LN","UR","M5",NULL}; + void *dict = sam_header_parse2(bam->header->text); + char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &n); + for (i=0; i #include #include +#include #include "sam_header.h" #include "sam.h" #include "faidx.h" @@ -14,15 +15,16 @@ KHASH_SET_INIT_STR(rg) // data passed to the bam_fetch callback is encapsulated in this struct. typedef struct { bam_header_t *header; - int *count; + int64_t *count; // int does overflow for very big BAMs } count_func_data_t; typedef khash_t(rg) *rghash_t; // FIXME: we'd better use no global variables... static rghash_t g_rghash = 0; -static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; -static float g_subsam = -1; +static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0, g_qual_scale = 0, g_min_qlen = 0; +static uint32_t g_subsam_seed = 0; +static double g_subsam_frac = -1.; static char *g_library, *g_rg; static void *g_bed; @@ -30,16 +32,31 @@ void *bed_read(const char *fn); void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); -static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) +static int process_aln(const bam_header_t *h, bam1_t *b) { + if (g_qual_scale > 1) { + int i; + uint8_t *qual = bam1_qual(b); + for (i = 0; i < b->core.l_qseq; ++i) { + int c = qual[i] * g_qual_scale; + qual[i] = c < 93? 
c : 93; + } + } + if (g_min_qlen > 0) { + int k, qlen = 0; + uint32_t *cigar = bam1_cigar(b); + for (k = 0; k < b->core.n_cigar; ++k) + if ((bam_cigar_type(bam_cigar_op(cigar[k]))&1) || bam_cigar_op(cigar[k]) == BAM_CHARD_CLIP) + qlen += bam_cigar_oplen(cigar[k]); + if (qlen < g_min_qlen) return 1; + } if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) return 1; if (g_bed && b->core.tid >= 0 && !bed_overlap(g_bed, h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)))) return 1; - if (g_subsam > 0.) { - int x = (int)(g_subsam + .499); - uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + x; - if (k%1024 / 1024.0 >= g_subsam - x) return 1; + if (g_subsam_frac > 0.) { + uint32_t k = __ac_X31_hash_string(bam1_qname(b)) + g_subsam_seed; + if ((double)(k&0xffffff) / 0x1000000 >= g_subsam_frac) return 1; } if (g_rg || g_rghash) { uint8_t *s = bam_aux_get(b, "RG"); @@ -92,7 +109,7 @@ static char *drop_rg(char *hdtxt, rghash_t h, int *len) // callback function for bam_fetch() that prints nonskipped records static int view_func(const bam1_t *b, void *data) { - if (!__g_skip_aln(((samfile_t*)data)->header, b)) + if (!process_aln(((samfile_t*)data)->header, (bam1_t*)b)) samwrite((samfile_t*)data, b); return 0; } @@ -100,7 +117,7 @@ static int view_func(const bam1_t *b, void *data) // callback function for bam_fetch() that counts nonskipped records static int count_func(const bam1_t *b, void *data) { - if (!__g_skip_aln(((count_func_data_t*)data)->header, b)) { + if (!process_aln(((count_func_data_t*)data)->header, (bam1_t*)b)) { (*((count_func_data_t*)data)->count)++; } return 0; @@ -111,16 +128,23 @@ static int usage(int is_long_help); int main_samview(int argc, char *argv[]) { int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0; - int of_type = BAM_OFDEC, is_long_help = 0; - int count = 0; + int of_type = BAM_OFDEC, is_long_help = 0, 
n_threads = 0; + int64_t count = 0; samfile_t *in = 0, *out = 0; - char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0; + char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0, *q; /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:")) >= 0) { + while ((c = getopt(argc, argv, "SbBct:h1Ho:q:f:F:ul:r:xX?T:R:L:s:Q:@:m:")) >= 0) { switch (c) { - case 's': g_subsam = atof(optarg); break; + case 's': + if ((g_subsam_seed = strtol(optarg, &q, 10)) != 0) { + srand(g_subsam_seed); + g_subsam_seed = rand(); + } + g_subsam_frac = strtod(q, &q); + break; + case 'm': g_min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': is_bamin = 0; break; case 'b': is_bamout = 1; break; @@ -141,6 +165,9 @@ int main_samview(int argc, char *argv[]) case 'X': of_type = BAM_OFSTR; break; case '?': is_long_help = 1; break; case 'T': fn_ref = strdup(optarg); is_bamin = 0; break; + case 'B': bam_no_B = 1; break; + case 'Q': g_qual_scale = atoi(optarg); break; + case '@': n_threads = strtol(optarg, 0, 0); break; default: return usage(is_long_help); } } @@ -198,13 +225,14 @@ int main_samview(int argc, char *argv[]) ret = 1; goto view_end; } + if (n_threads > 1) samthreads(out, n_threads, 256); if (is_header_only) goto view_end; // no need to print alignments if (argc == optind + 1) { // convert/print the entire file bam1_t *b = bam_init1(); int r; while ((r = samread(in, b)) >= 0) { // read one alignment from `in' - if (!__g_skip_aln(in->header, b)) { + if (!process_aln(in->header, b)) { if (!is_count) samwrite(out, b); // write the alignment to `out' count++; } @@ -246,9 +274,9 @@ int main_samview(int argc, char *argv[]) } view_end: - if (is_count && ret == 0) { - printf("%d\n", count); - } + if (is_count && ret == 0) + printf("%" PRId64 "\n", count); + // close files, free and return free(fn_list); free(fn_ref); free(fn_out); 
free(g_library); free(g_rg); free(fn_rg); if (g_bed) bed_destroy(g_bed); @@ -277,6 +305,8 @@ static int usage(int is_long_help) fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); fprintf(stderr, " -c print only the count of matching records\n"); + fprintf(stderr, " -B collapse the backward CIGAR operation\n"); + fprintf(stderr, " -@ INT number of BAM compression threads [0]\n"); fprintf(stderr, " -L FILE output alignments overlapping the input BED FILE [null]\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); @@ -347,12 +377,14 @@ int main_bam2fq(int argc, char *argv[]) bam_header_t *h; bam1_t *b; int8_t *buf; - int max_buf; + int max_buf, c, no12 = 0; + while ((c = getopt(argc, argv, "n")) > 0) + if (c == 'n') no12 = 1; if (argc == 1) { fprintf(stderr, "Usage: samtools bam2fq \n"); return 1; } - fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); + fp = strcmp(argv[optind], "-")? 
bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); if (fp == 0) return 1; h = bam_header_read(fp); b = bam_init1(); @@ -362,9 +394,12 @@ int main_bam2fq(int argc, char *argv[]) int i, qlen = b->core.l_qseq; uint8_t *seq; putchar('@'); fputs(bam1_qname(b), stdout); - if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1"); - else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2"); - else putchar('\n'); + if (no12) putchar('\n'); + else { + if ((b->core.flag & 0x40) && !(b->core.flag & 0x80)) puts("/1"); + else if ((b->core.flag & 0x80) && !(b->core.flag & 0x40)) puts("/2"); + else putchar('\n'); + } if (max_buf < qlen + 1) { max_buf = qlen + 1; kroundup32(max_buf); diff --git a/sam/samtools.1 b/sam/samtools.1 index 98ce9d0..5923abd 100644 --- a/sam/samtools.1 +++ b/sam/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "05 July 2011" "samtools-0.1.17" "Bioinformatics tools" +.TH samtools 1 "15 March 2013" "samtools-0.1.19" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format @@ -30,7 +30,7 @@ bcftools index in.bcf .PP bcftools view in.bcf chr2:100-200 > out.vcf .PP -bcftools view -vc in.bcf > out.vcf 2> out.afs +bcftools view -Nvm0.99 in.bcf > out.vcf 2> out.afs .SH DESCRIPTION .PP @@ -69,7 +69,7 @@ format: `chr2' (the whole chr2), `chr2:1000000' (region starting from .B OPTIONS: .RS -.TP 8 +.TP 10 .B -b Output in the BAM format. .TP @@ -103,6 +103,10 @@ Output reads in read groups listed in .I FILE [null] .TP +.BI -s \ FLOAT +Fraction of templates/pairs to subsample; the integer part is treated as the +seed for the random number generator [-1] +.TP .B -S Input is in SAM. If @SQ header lines are absent, the .B `-t' @@ -136,17 +140,38 @@ to another samtools command. .TP .B tview -samtools tview [ref.fasta] +samtools tview +.RB [ \-p +.IR chr:pos ] +.RB [ \-s +.IR STR ] +.RB [ \-d +.IR display ] +.RI +.RI [ref.fasta] Text alignment viewer (based on the ncurses library). In the viewer, press `?' 
for help and press `g' to check the alignment start from a region in the format like `chr10:10,000,000' or `=10,000,000' when viewing the same reference sequence. +.B Options: +.RS +.TP 14 +.BI -d \ display +Output as (H)tml or (C)urses or (T)ext +.TP +.BI -p \ chr:pos +Go directly to this position +.TP +.BI -s \ STR +Display only reads from this sample or read group +.RE + .TP .B mpileup -.B samtools mpileup -.RB [ \-EBug ] +samtools mpileup +.RB [ \-EBugp ] .RB [ \-C .IR capQcoef ] .RB [ \-r @@ -293,6 +318,10 @@ Phred-scaled gap open sequencing error probability. Reducing .I INT leads to more indel calls. [40] .TP +.BI -p +Apply -m and -F thresholds per sample to increase sensitivity of calling. +By default both options are applied to reads pooled from all samples. +.TP .BI -P \ STR Comma dilimited list of platforms (determined by .BR @RG-PL ) @@ -324,7 +353,7 @@ which enables fast BAM concatenation. .TP .B sort -samtools sort [-no] [-m maxMem] +samtools sort [-nof] [-m maxMem] Sort alignments by leftmost coordinates. File .I .bam @@ -342,6 +371,13 @@ Output the final alignment to the standard output. .B -n Sort by read names rather than by chromosomal coordinates .TP +.B -f +Use +.I +as the full output path and do not append +.I .bam +suffix. +.TP .BI -m \ INT Approximately the maximum required memory. [500000000] .RE @@ -566,6 +602,8 @@ Minimum base quality to be used in het calling. [13] .IR mutRate ] .RB [ \-p .IR varThres ] +.RB [ \-m +.IR varThres ] .RB [ \-P .IR prior ] .RB [ \-1 @@ -648,6 +686,12 @@ Call per-sample genotypes at variant sites (force -c) .BI -i \ FLOAT Ratio of INDEL-to-SNP mutation rate [0.15] .TP +.BI -m \ FLOAT +New model for improved multiallelic and rare-variant calling. Another +ALT allele is accepted if P(chi^2) of LRT exceeds the FLOAT threshold. The +parameter seems robust and the actual value usually does not affect the results +much; a good value to use is 0.99. This is the recommended calling method. 
[0] +.TP .BI -p \ FLOAT A site is considered to be a variant if P(ref|D) +.br +Samtools latest source: +.br +VCFtools website with stable link to VCF specification: +.br +HTSlib website: diff --git a/sam/win32/._xcurses.h b/sam/win32/._xcurses.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/win32/._xcurses.h differ diff --git a/sam/win32/._zconf.h b/sam/win32/._zconf.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/win32/._zconf.h differ diff --git a/sam/win32/._zlib.h b/sam/win32/._zlib.h new file mode 100644 index 0000000..94286bb Binary files /dev/null and b/sam/win32/._zlib.h differ diff --git a/synthesisRef.cpp b/synthesisRef.cpp index 8ce268c..aa0d473 100644 --- a/synthesisRef.cpp +++ b/synthesisRef.cpp @@ -118,7 +118,6 @@ int main(int argc, char* argv[]) { ifstream fin; string line, gseq; string seqname, gene_id; - void* pt; vector vec; @@ -127,13 +126,13 @@ int main(int argc, char* argv[]) { for (int i = start; i < argc; i++) { fin.open(argv[i]); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", argv[i]); exit(-1); } - pt = getline(fin, line); - while (pt != 0 && line[0] == '>') { + getline(fin, line); + while ((fin) && (line[0] == '>')) { istringstream strin(line.substr(1)); strin>>seqname; gseq = ""; - while((pt = getline(fin, line)) && line[0] != '>') { + while((getline(fin, line)) && (line[0] != '>')) { gseq += line; }