From a97cc1d4f0111f7fe523227412a2147f7a763d56 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Thu, 17 Feb 2011 08:33:06 -0600 Subject: [PATCH] Normalized Read Fraction(nrf) are eliminated from outputs and posterior mean counts are added. A bug related to RSPD estimating is fixed --- EM.cpp | 20 +- Gibbs.cpp | 30 +- PairedEndModel.h | 2 +- PairedEndQModel.h | 2 +- RSPD.h | 2 +- SingleModel.h | 2 +- SingleQModel.h | 6 +- calcCI.cpp | 37 +- rsem-calculate-expression | 2 +- sam/ChangeLog | 1902 +++++++++++++++++++++++++++++++++++++ sam/Makefile | 20 +- sam/NEWS | 141 +++ sam/bam.c | 28 +- sam/bam.h | 27 +- sam/bam_index.c | 34 +- sam/bam_maqcns.c | 84 +- sam/bam_maqcns.h | 6 +- sam/bam_md.c | 164 +++- sam/bam_pileup.c | 157 +-- sam/bam_plcmd.c | 530 +++++++++-- sam/bam_sort.c | 157 ++- sam/bam_tview.c | 4 +- sam/bamtk.c | 2 +- sam/bgzf.c | 6 +- sam/bgzip.c | 191 ++-- sam/examples/Makefile | 17 +- sam/examples/toy.fa | 2 + sam/examples/toy.sam | 13 +- sam/kaln.c | 144 ++- sam/kaln.h | 20 +- sam/knetfile.c | 5 +- sam/ksort.h | 10 + sam/kstring.c | 89 +- sam/kstring.h | 32 +- sam/misc/Makefile | 2 +- sam/misc/samtools.pl | 41 +- sam/razip.c | 2 +- sam/sam_view.c | 66 +- sam/samtools.1 | 545 +++++++---- sam/samtools.txt | 430 ++++++--- 40 files changed, 4172 insertions(+), 802 deletions(-) diff --git a/EM.cpp b/EM.cpp index 00cb852..e381b47 100644 --- a/EM.cpp +++ b/EM.cpp @@ -315,21 +315,13 @@ void writeResults(ModelType& model, double* counts) { fprintf(fo, "%.15g\n", theta[M]); fclose(fo); - - //calculate normalized read fraction - double *nrf = new double[M + 1]; - memset(nrf, 0, sizeof(double) * (M + 1)); - denom = 1.0 - theta[0]; - if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); } - for (int i = 1; i <= M; i++) nrf[i] = theta[i] / denom; - //calculate tau values double *tau = new double[M + 1]; memset(tau, 0, sizeof(double) * (M + 1)); denom = 0.0; for (int i = 1; i <= M; i++) - if (eel[i] > EPSILON) { + if (eel[i] >= EPSILON) { tau[i] = theta[i] / eel[i]; denom += tau[i]; } @@ -348,8 +340,6 @@ void writeResults(ModelType& model, double* counts) { } for (int i = 1; i <= M; i++) fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.15g%c", nrf[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) { @@ -371,12 +361,6 @@ void writeResults(ModelType& model, double* counts) { for (int j = b; j < e; j++) sumC += counts[j]; fprintf(fo, "%.2f%c", sumC, (i < m - 1 ? '\t' : '\n')); } - for (int i = 0; i < m; i++) { - double sumN = 0.0; // sum of normalized read fraction - int b = gi.spAt(i), e = gi.spAt(i + 1); - for (int j = b; j < e; j++) sumN += nrf[j]; - fprintf(fo, "%.15g%c", sumN, (i < m - 1 ? '\t' : '\n')); - } for (int i = 0; i < m; i++) { double sumT = 0.0; // sum of tau values int b = gi.spAt(i), e = gi.spAt(i + 1); @@ -391,7 +375,6 @@ void writeResults(ModelType& model, double* counts) { } fclose(fo); - delete[] nrf; delete[] tau; if (verbose) { printf("Expression Results are written!\n"); } @@ -472,7 +455,6 @@ void EM() { pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - ROUND = 0; do { ++ROUND; diff --git a/Gibbs.cpp b/Gibbs.cpp index 2d41d08..4b1e2d5 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -41,7 +41,7 @@ char cvsF[STRLEN]; Refs refs; GroupInfo gi; -vector theta, pme_theta, eel; +vector theta, pme_theta, pme_c, eel; vector s, z; vector hits; @@ -183,6 +183,7 @@ void Gibbs(char* imdName) { fprintf(fo, "%d %d\n", CHAINLEN / GAP, M + 1); //fprintf(fo, "%d %d\n", CHAINLEN, M + 1); + pme_c.clear(); pme_c.resize(M + 1, 0.0); pme_theta.clear(); pme_theta.resize(M + 1, 0.0); for (int ROUND = 1; ROUND <= BURNIN + CHAINLEN; ROUND++) { @@ -201,14 +202,20 @@ void Gibbs(char* imdName) { if (ROUND > BURNIN) { if ((ROUND - BURNIN -1) % GAP == 0) writeCountVector(fo); writeCountVector(fo); - for (int i = 0; i <= M; i++) pme_theta[i] += counts[i] / totc; + for (int i = 0; i <= M; i++) { + pme_c[i] += counts[i] - 1; + pme_theta[i] += counts[i] / totc; + } } if (verbose) { printf("ROUND %d is finished!\n", ROUND); } } fclose(fo); - for (int i = 0; i <= M; i++) pme_theta[i] /= CHAINLEN; + for (int i = 0; i <= M; i++) { + pme_c[i] /= CHAINLEN; + pme_theta[i] /= CHAINLEN; + } if (verbose) { printf("Gibbs is finished!\n"); } } @@ -272,14 +279,6 @@ void writeEstimatedParameters(char* modelF, char* imdName) { assert(denom >= EPSILON); for (int i = 0; i <= M; i++) pme_theta[i] /= denom; - //calculate normalized read fraction - double *nrf = new double[M + 1]; - memset(nrf, 0, sizeof(double) * (M + 1)); - - denom = 1.0 - pme_theta[0]; - if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); } - for (int i = 1; i <= M; i++) nrf[i] = pme_theta[i] / denom; - //calculate tau values double *tau = new double[M + 1]; memset(tau, 0, sizeof(double) * (M + 1)); @@ -301,7 +300,7 @@ void writeEstimatedParameters(char* modelF, char* imdName) { fo = fopen(outF, "a"); if (fo == NULL) { fprintf(stderr, "Cannot open %s!\n", outF); exit(-1); } for (int i = 1; i <= M; i++) - fprintf(fo, "%.15g%c", nrf[i], (i < M ? '\t' : '\n')); + fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n')); fclose(fo); @@ -311,12 +310,12 @@ void writeEstimatedParameters(char* modelF, char* imdName) { fo = fopen(outF, "a"); if (fo == NULL) { fprintf(stderr, "Cannot open %s!\n", outF); exit(-1); } for (int i = 0; i < m; i++) { - double sumN = 0.0; // sum of normalized read fraction + double sumC = 0.0; // sum of pme counts int b = gi.spAt(i), e = gi.spAt(i + 1); for (int j = b; j < e; j++) { - sumN += nrf[j]; + sumC += pme_c[j]; } - fprintf(fo, "%.15g%c", sumN, (i < m - 1 ? '\t' : '\n')); + fprintf(fo, "%.15g%c", sumC, (i < m - 1 ? '\t' : '\n')); } for (int i = 0; i < m; i++) { double sumT = 0.0; // sum of tau values @@ -328,7 +327,6 @@ void writeEstimatedParameters(char* modelF, char* imdName) { } fclose(fo); - delete[] nrf; delete[] tau; if (verbose) { printf("Gibbs based expression values are written!\n"); } diff --git a/PairedEndModel.h b/PairedEndModel.h index e73c03d..bd046be 100644 --- a/PairedEndModel.h +++ b/PairedEndModel.h @@ -59,11 +59,11 @@ public: mw = NULL; if (isMaster) { - ori = new Orientation(params.probF); if (!estRSPD) rspd = new RSPD(estRSPD); mld = new LenDist(params.mate_minL, params.mate_maxL); } + ori = new Orientation(params.probF); gld = new LenDist(params.minL, params.maxL); if (estRSPD) rspd = new RSPD(estRSPD, params.B); pro = new Profile(params.maxL); diff --git a/PairedEndQModel.h b/PairedEndQModel.h index bac7135..5437eb4 100644 --- a/PairedEndQModel.h +++ b/PairedEndQModel.h @@ -61,12 +61,12 @@ public: mw = NULL; if (isMaster) { - ori = new Orientation(params.probF); if (!estRSPD) rspd = new RSPD(estRSPD); qd = new QualDist(); mld = new LenDist(params.mate_minL, params.mate_maxL); } + ori = new Orientation(params.probF); gld = new LenDist(params.minL, params.maxL); if (estRSPD) rspd = new RSPD(estRSPD, params.B); qpro = new QProfile(); diff --git a/RSPD.h b/RSPD.h index dd39783..4b8e25f 100644 --- a/RSPD.h +++ b/RSPD.h @@ -18,7 +18,7 @@ public: cdf = new double[B + 2]; //set initial parameters - memset(pdf, 0, sizeof(double) * (B + 2)); + memset(pdf, 0, sizeof(double) * (B + 2)); // use B + 2 for evalCDF memset(cdf, 0, sizeof(double) * (B + 2)); for (int i = 1; i <= B; i++) { pdf[i] = 1.0 / B; diff --git a/SingleModel.h b/SingleModel.h index 532f79c..f1d543f 100644 --- a/SingleModel.h +++ b/SingleModel.h @@ -61,7 +61,6 @@ public: mw = NULL; if (isMaster) { - ori = new Orientation(params.probF); gld = new LenDist(params.minL, params.maxL); if (mean >= EPSILON) { mld = new LenDist(params.mate_minL, params.mate_maxL); @@ -69,6 +68,7 @@ public: if (!estRSPD) { rspd = new RSPD(estRSPD); } } + ori = new Orientation(params.probF); if (estRSPD) { rspd = new RSPD(estRSPD, params.B); } pro = new Profile(params.maxL); npro = new NoiseProfile(); diff --git a/SingleQModel.h b/SingleQModel.h index 0c2ba34..dff9c46 100644 --- a/SingleQModel.h +++ b/SingleQModel.h @@ -63,7 +63,6 @@ public: mw = NULL; if (isMaster) { - ori = new Orientation(params.probF); gld = new LenDist(params.minL, params.maxL); if (mean >= EPSILON) { mld = new LenDist(params.mate_minL, params.mate_maxL); @@ -72,6 +71,7 @@ public: qd = new QualDist(); } + ori = new Orientation(params.probF); if (estRSPD) { rspd = new RSPD(estRSPD, params.B); } qpro = new QProfile(); nqpro = new NoiseQProfile(); @@ -158,7 +158,8 @@ public: void update(const SingleReadQ& read, const SingleHit& hit, double frac) { if (read.isLowQuality() || frac < EPSILON) return; - RefSeq& ref = refs->getRef(hit.getSid()); + const RefSeq& ref = refs->getRef(hit.getSid()); + int dir = hit.getDir(); int pos = hit.getPos(); @@ -465,7 +466,6 @@ void SingleQModel::calcMW() { memset(mw, 0, sizeof(double) * (M + 1)); mw[0] = 1.0; - probF = ori->getProb(0); probR = ori->getProb(1); diff --git a/calcCI.cpp b/calcCI.cpp index 581e434..d1f26e5 100644 --- a/calcCI.cpp +++ b/calcCI.cpp @@ -50,7 +50,7 @@ ofstream ftmpOut; int *cvec; double *theta; -CIType *iso_nrf, *gene_nrf, *iso_tau, *gene_tau; +CIType *iso_tau, *gene_tau; engine_type engine(time(NULL)); distribution_type **gammas; @@ -263,45 +263,37 @@ void calcCI(int nSamples, float *samples, float &lb, float &ub) { } void generateResults(char* imdName) { - float *izsamples, *gzsamples, *itsamples, *gtsamples; + float *itsamples, *gtsamples; ifstream fin; FILE *fo; char outF[STRLEN]; - iso_nrf = new CIType[M + 1]; iso_tau = new CIType[M + 1]; - gene_nrf = new CIType[m]; gene_tau = new CIType[m]; - izsamples = new float[nSamples]; itsamples = new float[nSamples]; - gzsamples = new float[nSamples]; gtsamples = new float[nSamples]; fin.open(tmpF, ios::binary); - for (int k = 0; k < nSamples; k++) fin.read((char*)(&izsamples[k]), FLOATSIZE); - calcCI(nSamples, izsamples, iso_nrf[0].lb, iso_nrf[0].ub); + //read sampled theta0 values + for (int k = 0; k < nSamples; k++) fin.read((char*)(&itsamples[k]), FLOATSIZE); for (int i = 0; i < m; i++) { int b = gi.spAt(i), e = gi.spAt(i + 1); - memset(gzsamples, 0, FLOATSIZE * nSamples); memset(gtsamples, 0, FLOATSIZE * nSamples); for (int j = b; j < e; j++) { for (int k = 0; k < nSamples; k++) { - fin.read((char*)(&izsamples[k]), FLOATSIZE); - if (eel[j] > EPSILON && tau_denoms[k] > EPSILON) { itsamples[k] = izsamples[k] / eel[j] / tau_denoms[k]; } + fin.read((char*)(&itsamples[k]), FLOATSIZE); + if (eel[j] > EPSILON && tau_denoms[k] > EPSILON) { itsamples[k] = itsamples[k] / eel[j] / tau_denoms[k]; } else { - if (izsamples[k] != 0.0) { fprintf(stderr, "K=%d, IZSAMPLES_K=%lf\n", k, izsamples[k]); exit(-1); } + if (itsamples[k] != 0.0) { fprintf(stderr, "Not equal to 0! K=%d, Sampled Theta Value=%lf\n", k, itsamples[k]); exit(-1); } itsamples[k] = 0.0; } - gzsamples[k] += izsamples[k]; gtsamples[k] += itsamples[k]; } - calcCI(nSamples, izsamples, iso_nrf[j].lb, iso_nrf[j].ub); calcCI(nSamples, itsamples, iso_tau[j].lb, iso_tau[j].ub); } - calcCI(nSamples, gzsamples, gene_nrf[i].lb, gene_nrf[i].ub); calcCI(nSamples, gtsamples, gene_tau[i].lb, gene_tau[i].ub); if (verbose && (i + 1) % 1000 == 0) { printf("%d genes are done!\n", i + 1); } @@ -312,10 +304,6 @@ void generateResults(char* imdName) { //isoform level results sprintf(outF, "%s.iso_res", imdName); fo = fopen(outF, "a"); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.6g%c", iso_nrf[i].lb, (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.6g%c", iso_nrf[i].ub, (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) fprintf(fo, "%.6g%c", iso_tau[i].lb, (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) @@ -325,26 +313,16 @@ void generateResults(char* imdName) { //gene level results sprintf(outF, "%s.gene_res", imdName); fo = fopen(outF, "a"); - for (int i = 0; i < m; i++) - fprintf(fo, "%.6g%c", gene_nrf[i].lb, (i < m - 1 ? '\t' : '\n')); - for (int i = 0; i < m; i++) - fprintf(fo, "%.6g%c", gene_nrf[i].ub, (i < m - 1 ? '\t' : '\n')); for (int i = 0; i < m; i++) fprintf(fo, "%.6g%c", gene_tau[i].lb, (i < m - 1 ? '\t' : '\n')); for (int i = 0; i < m; i++) fprintf(fo, "%.6g%c", gene_tau[i].ub, (i < m - 1 ? '\t' : '\n')); fclose(fo); - printf("CI of noise isoform is [%.6g, %.6g]\n", iso_nrf[0].lb, iso_nrf[0].ub); - - delete[] izsamples; delete[] itsamples; - delete[] gzsamples; delete[] gtsamples; - delete[] iso_nrf; delete[] iso_tau; - delete[] gene_nrf; delete[] gene_tau; if (verbose) { printf("All credibility intervals are calculated!\n"); } @@ -355,7 +333,6 @@ void generateResults(char* imdName) { for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]); fprintf(fo, "\n"); fclose(fo); - } int main(int argc, char* argv[]) { diff --git a/rsem-calculate-expression b/rsem-calculate-expression index ddeb8dd..3539fa4 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -344,7 +344,7 @@ sub collectResults { ++$cnt; chomp($line); my @local_arr = split(/\t/, $line); - if ($cnt == 4) { @comment = @local_arr; } + if ($cnt == 3) { @comment = @local_arr; } else { push(@results, \@local_arr); } } diff --git a/sam/ChangeLog b/sam/ChangeLog index 6b0ff6c..dd62b49 100644 --- a/sam/ChangeLog +++ b/sam/ChangeLog @@ -1,3 +1,1905 @@ +------------------------------------------------------------------------ +r844 | lh3lh3 | 2010-11-19 23:16:08 -0500 (Fri, 19 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + + * samtools-0.1.10-9 (r844) + * added the "folded" or reference-free mode for variant calling + +------------------------------------------------------------------------ +r843 | lh3lh3 | 2010-11-19 22:26:36 -0500 (Fri, 19 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/NEWS + M /trunk/samtools/bam_sort.c + +In merging, if -R is specified, do not abort if the sequence dictionary is different. + +------------------------------------------------------------------------ +r842 | jmarshall | 2010-11-19 21:24:28 -0500 (Fri, 19 Nov 2010) | 5 lines +Changed paths: + M /trunk/samtools/bam_sort.c + +When merging BAM headers, compare the list of target reference sequences +strictly (and fail/abort if there is a mismatch), but allow one list to be a +prefix of the other. (i.e., check that the lists are identical up until the +shorter runs out, and add the excess targets from the longer to the output.) + +------------------------------------------------------------------------ +r841 | lh3lh3 | 2010-11-19 14:49:27 -0500 (Fri, 19 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10 (r841) + * fixed a bug in pileup when the first CIGAR operation is D + * fixed a bug in view with range query + +------------------------------------------------------------------------ +r840 | lh3lh3 | 2010-11-19 13:45:51 -0500 (Fri, 19 Nov 2010) | 10 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10-4 (r840) + + * drop the MNP caller. It is slow while does not diliver too much + benefit. Possibly I will work on it in future given more time. + + * there is a segfault in pileup + + * someone has reported segfault from view/index/sort + + +------------------------------------------------------------------------ +r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines +Changed paths: + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10-6 (r839) + + * call MNPs without realignment because it seems to me that it is not + worthwhile to significantly slow down SNP calling. + + * the result looks quite different from the previous version. I have + work to do... + + +------------------------------------------------------------------------ +r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/knetfile.c + +Apply a patch by Rob Davis, which improves fault detection. + +------------------------------------------------------------------------ +r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-r836 + * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow) + +------------------------------------------------------------------------ +r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + + * modify the filtering rule: also filter SNPs around filtered indels + * added MNP filter + +------------------------------------------------------------------------ +r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10-4 (r834) + * fixed a silly bug in printing MNP + * restrict to at most 1 alternative allele + +------------------------------------------------------------------------ +r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bamtk.c + +fixed a bug in printing MNPs + +------------------------------------------------------------------------ +r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + +minor change to how seqQ is applied + +------------------------------------------------------------------------ +r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.10 (r831) + * initial MNP caller + +------------------------------------------------------------------------ +r829 | lh3lh3 | 2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + +Release samtools-0.1.10 (r829) + +------------------------------------------------------------------------ +r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bamtk.c + +update version information: samtools-0.1.9-20 (r828) + +------------------------------------------------------------------------ +r827 | lh3lh3 | 2010-11-16 15:32:50 -0500 (Tue, 16 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + +bcftools: allow to skip indels + +------------------------------------------------------------------------ +r826 | lh3lh3 | 2010-11-16 14:11:58 -0500 (Tue, 16 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_md.c + +remove ZQ if both BQ and ZQ are present + +------------------------------------------------------------------------ +r825 | lh3lh3 | 2010-11-16 13:51:33 -0500 (Tue, 16 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + + * samtools-0.1.9-18 (r825) + * change the behaviour of calmd such that by default it does not change the base quality + +------------------------------------------------------------------------ +r824 | lh3lh3 | 2010-11-15 23:31:53 -0500 (Mon, 15 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/samtools.1 + + * samtools-0.1.9-17 (r824) + * added command line options to change the default parameters in indel calling + * update the manual + +------------------------------------------------------------------------ +r823 | lh3lh3 | 2010-11-15 12:20:13 -0500 (Mon, 15 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-r823 + * the BQ tag is now 64 shifted, not 33 shifted + +------------------------------------------------------------------------ +r822 | lh3lh3 | 2010-11-15 00:30:18 -0500 (Mon, 15 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/vcfutils.pl + M /trunk/samtools/misc/samtools.pl + + * samtools-0.1.9-16 (r822) + * keep the raw depth because in indel calling, DP4 may be way off the true depth + +------------------------------------------------------------------------ +r821 | lh3lh3 | 2010-11-13 01:18:31 -0500 (Sat, 13 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-15 (r821) + * calmd: write BQ + * skip realignment if BQ is present + +------------------------------------------------------------------------ +r820 | lh3lh3 | 2010-11-13 01:08:26 -0500 (Sat, 13 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-14 (r820) + * penalize reads with excessive differences in indel calling + +------------------------------------------------------------------------ +r819 | lh3lh3 | 2010-11-12 21:36:27 -0500 (Fri, 12 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-13 (r819) + * fixed a bug in pileup given refskip + +------------------------------------------------------------------------ +r818 | lh3lh3 | 2010-11-12 13:04:53 -0500 (Fri, 12 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-r818 + * for indel calling, do two rounds of probabilistic realignments + +------------------------------------------------------------------------ +r817 | lh3lh3 | 2010-11-11 20:04:07 -0500 (Thu, 11 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/vcfutils.pl + + * samtools-0.1.19-11 (r817) + * only initiate indel calling when 0.2% of reads contain a gap + +------------------------------------------------------------------------ +r816 | lh3lh3 | 2010-11-11 01:22:59 -0500 (Thu, 11 Nov 2010) | 7 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-10 (r816) + + * I know why the forward method fails. it is because of zero base + qualities. when that is fixed, the forward method seems to give + better results than Viterbi, as it should be. I am tired... + + +------------------------------------------------------------------------ +r815 | lh3lh3 | 2010-11-11 00:57:15 -0500 (Thu, 11 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam2bcf_indel.c + +effectively revert to the viterbi version. The forward realignment gives too many false positives. + +------------------------------------------------------------------------ +r814 | lh3lh3 | 2010-11-11 00:18:02 -0500 (Thu, 11 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-9 (r810) + * use forward, instead of viterbi, for realignment + * realignment is now quality aware + +------------------------------------------------------------------------ +r813 | lh3lh3 | 2010-11-10 22:45:24 -0500 (Wed, 10 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/kprobaln.c + M /trunk/samtools/kprobaln.h + + * prepare to replace kaln with kprobaln in realignment + +------------------------------------------------------------------------ +r812 | lh3lh3 | 2010-11-10 17:28:50 -0500 (Wed, 10 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + +fixed a typo + +------------------------------------------------------------------------ +r811 | lh3lh3 | 2010-11-10 16:54:46 -0500 (Wed, 10 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + +use zlib for direct reading when BCF_LITE is in use + +------------------------------------------------------------------------ +r810 | lh3lh3 | 2010-11-10 16:32:13 -0500 (Wed, 10 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + + * do not use reads containing too many mismatches for indel calling + * fixed a trivial bug in case of multi-allelic indels + +------------------------------------------------------------------------ +r809 | lh3lh3 | 2010-11-10 13:23:02 -0500 (Wed, 10 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-8 (r809) + * fixed a bug in the indel caller + +------------------------------------------------------------------------ +r808 | lh3lh3 | 2010-11-10 12:24:10 -0500 (Wed, 10 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + +minor change to makefile + +------------------------------------------------------------------------ +r807 | lh3lh3 | 2010-11-10 12:10:21 -0500 (Wed, 10 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/vcfutils.pl + + * samtools-0.1.9-8 (r807) + * collect indel candidates only from specified platforms (@RG-PL) + * merge varFilter and filter4vcf in vcfutils.pl + +------------------------------------------------------------------------ +r806 | lh3lh3 | 2010-11-09 22:05:46 -0500 (Tue, 09 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + +bcftools: compute equal-tail (Bayesian) credible interval + +------------------------------------------------------------------------ +r805 | lh3lh3 | 2010-11-09 16:28:39 -0500 (Tue, 09 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +added a double-hit filter to avoid overestimated indel likelihood + +------------------------------------------------------------------------ +r804 | lh3lh3 | 2010-11-09 14:12:06 -0500 (Tue, 09 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-7 (r804) + * fixed a bug in the gap caller + +------------------------------------------------------------------------ +r803 | lh3lh3 | 2010-11-09 10:45:33 -0500 (Tue, 09 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/prob1.c + + * samtools-0.1.9-6 (r803) + * mpileup: apply homopolymer correction when calculating GL, instead of before + * bcftools: apply a different prior to indels + +------------------------------------------------------------------------ +r802 | lh3lh3 | 2010-11-08 23:53:15 -0500 (Mon, 08 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-5 (r802) + * relax tandem penalty. this will be made a command-line option in future. + +------------------------------------------------------------------------ +r801 | lh3lh3 | 2010-11-08 23:35:52 -0500 (Mon, 08 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-4 (r801) + * fixed a minor issue in printing indel VCF + +------------------------------------------------------------------------ +r800 | lh3lh3 | 2010-11-08 15:28:14 -0500 (Mon, 08 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bcftools/vcfutils.pl + +fixed another silly bug in mpileup's indel caller + +------------------------------------------------------------------------ +r799 | lh3lh3 | 2010-11-08 14:28:27 -0500 (Mon, 08 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + +fixed a silly bug in the indel caller + +------------------------------------------------------------------------ +r798 | lh3lh3 | 2010-11-08 14:07:33 -0500 (Mon, 08 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/sam_view.c + M /trunk/samtools/samtools.1 + +Incorporate patches by Marcel Martin for read counting. + +------------------------------------------------------------------------ +r797 | lh3lh3 | 2010-11-08 13:39:52 -0500 (Mon, 08 Nov 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-2 (r797) + * mpileup: indel calling seems to be working + +------------------------------------------------------------------------ +r796 | lh3lh3 | 2010-11-08 10:54:46 -0500 (Mon, 08 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/kaln.c + +indel calling is apparently working, but more information needs to be collected + +------------------------------------------------------------------------ +r795 | lh3lh3 | 2010-11-08 00:39:18 -0500 (Mon, 08 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf_indel.c + +fixed a few bugs in the indel caller. Probably there are more. + +------------------------------------------------------------------------ +r794 | lh3lh3 | 2010-11-07 22:23:16 -0500 (Sun, 07 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam.h + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + A /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/kaln.c + M /trunk/samtools/kaln.h + +prepare for the indel caller. It is not ready yet. + +------------------------------------------------------------------------ +r793 | lh3lh3 | 2010-11-05 11:28:23 -0400 (Fri, 05 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + +Revert to r790. The recent changes are not good... + +------------------------------------------------------------------------ +r792 | lh3lh3 | 2010-11-05 00:19:14 -0400 (Fri, 05 Nov 2010) | 6 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + + * this revision is UNSTABLE + + * indel caller seems working, but it is very insensitive and has + several things I do not quite understand. + + +------------------------------------------------------------------------ +r791 | lh3lh3 | 2010-11-04 22:58:43 -0400 (Thu, 04 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + +for backup. no effective changes + +------------------------------------------------------------------------ +r790 | lh3lh3 | 2010-11-03 15:51:24 -0400 (Wed, 03 Nov 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + M /trunk/samtools/kprobaln.c + +fixed a minor problem in the example coming with kprobaln.c + +------------------------------------------------------------------------ +r789 | lh3lh3 | 2010-11-02 15:41:27 -0400 (Tue, 02 Nov 2010) | 4 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_md.c + M /trunk/samtools/kaln.c + M /trunk/samtools/kaln.h + A /trunk/samtools/kprobaln.c + A /trunk/samtools/kprobaln.h + +Separate kaln and kprobaln as I am preparing further changes. At +present, the results should be identical to the previous. + + +------------------------------------------------------------------------ +r788 | petulda | 2010-11-02 12:19:04 -0400 (Tue, 02 Nov 2010) | 1 line +Changed paths: + M /trunk/samtools/bam_plcmd.c + +Added -b option: read file names from a file +------------------------------------------------------------------------ +r787 | lh3lh3 | 2010-10-29 23:17:22 -0400 (Fri, 29 Oct 2010) | 7 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.9-2 (r787) + + * Allow to set a maximum per-sample depth to reduce memory. However, + BAQ computation is still applied to every read. The speed is not + improved. + + +------------------------------------------------------------------------ +r786 | lh3lh3 | 2010-10-29 12:10:40 -0400 (Fri, 29 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/vcf.c + + * samtools-0.1.9-1 (r786) + * samtools: optionally perform exact test for each sample + +------------------------------------------------------------------------ +r785 | lh3lh3 | 2010-10-29 09:42:25 -0400 (Fri, 29 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bcftools/bcf.c + +Optionally output "DP", the individual read depth + +------------------------------------------------------------------------ +r784 | lh3lh3 | 2010-10-27 23:10:27 -0400 (Wed, 27 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/samtools.1 + +acknowledge Petr and John who have greatly contributed to the project. + +------------------------------------------------------------------------ +r783 | lh3lh3 | 2010-10-27 22:47:47 -0400 (Wed, 27 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + +Release samtools-0.1.9 (r783) + +------------------------------------------------------------------------ +r782 | lh3lh3 | 2010-10-27 19:58:54 -0400 (Wed, 27 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + +fixed a silly bug in pileup + +------------------------------------------------------------------------ +r781 | lh3lh3 | 2010-10-27 14:39:48 -0400 (Wed, 27 Oct 2010) | 5 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + + * samtools-0.1.8-22 (r781) + * made BAQ the default behavior of mpileup + * updated manual + * in merge, force to exit given inconsistent header when "-R" is not in use. + +------------------------------------------------------------------------ +r780 | lh3lh3 | 2010-10-27 11:01:11 -0400 (Wed, 27 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-21 (r780) + * minor speedup to pileup + +------------------------------------------------------------------------ +r779 | lh3lh3 | 2010-10-27 09:58:56 -0400 (Wed, 27 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/examples/toy.sam + +improve pileup a little bit + +------------------------------------------------------------------------ +r778 | lh3lh3 | 2010-10-27 00:14:43 -0400 (Wed, 27 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_tview.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-20 (r778) + * speed up pileup, although I do not know how much is the improvement + +------------------------------------------------------------------------ +r777 | lh3lh3 | 2010-10-26 17:26:04 -0400 (Tue, 26 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/examples/Makefile + + * samtools-0.1.8-19 (r777) + * integrate mpileup features to pileup: min_baseQ, capQ, prob_realn, paired-only and biased prior + +------------------------------------------------------------------------ +r776 | lh3lh3 | 2010-10-26 15:27:46 -0400 (Tue, 26 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_md.c + +remove local realignment (probabilistic realignment is still there) + +------------------------------------------------------------------------ +r774 | jmarshall | 2010-10-21 06:52:38 -0400 (Thu, 21 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/sam_view.c + +Add the relevant filename or region to error messages, and cause a failure +exit status where appropriate. Based on a patch provided by Marcel Martin. + +------------------------------------------------------------------------ +r773 | lh3lh3 | 2010-10-19 19:44:31 -0400 (Tue, 19 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/examples/toy.sam + M /trunk/samtools/kaln.c + + * Minor code changes. No real effect. + * change quality to 30 in toy.sam + +------------------------------------------------------------------------ +r772 | lh3lh3 | 2010-10-18 23:40:13 -0400 (Mon, 18 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/examples/toy.fa + M /trunk/samtools/examples/toy.sam + +added another toy example + +------------------------------------------------------------------------ +r771 | lh3lh3 | 2010-10-13 23:32:12 -0400 (Wed, 13 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/ld.c + M /trunk/samtools/bcftools/vcfutils.pl + +improve the LD statistics + +------------------------------------------------------------------------ +r770 | lh3lh3 | 2010-10-12 23:49:26 -0400 (Tue, 12 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/vcfutils.pl + + * a minor fix to the -L option + * add ldstats to vcfutils.pl + +------------------------------------------------------------------------ +r769 | lh3lh3 | 2010-10-12 15:51:57 -0400 (Tue, 12 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + +a minor change + +------------------------------------------------------------------------ +r768 | lh3lh3 | 2010-10-12 15:49:06 -0400 (Tue, 12 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + A /trunk/samtools/bcftools/ld.c + +forget to add the key file + +------------------------------------------------------------------------ +r767 | lh3lh3 | 2010-10-12 15:48:46 -0400 (Tue, 12 Oct 2010) | 4 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/vcfutils.pl + + * vcfutils.pl: fixed a typo in help message + * added APIs: bcf_append_info() and bcf_cpy() + * calculate adjacent LD + +------------------------------------------------------------------------ +r766 | lh3lh3 | 2010-10-11 11:06:40 -0400 (Mon, 11 Oct 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +added filter for samtools/bcftools genetated VCFs + +------------------------------------------------------------------------ +r765 | lh3lh3 | 2010-10-05 14:05:18 -0400 (Tue, 05 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + M /trunk/samtools/kaln.c + + * removed a comment line in kaln.c + * vcfutils.pl fillac works when GT is not the first field + +------------------------------------------------------------------------ +r764 | petulda | 2010-10-05 08:59:36 -0400 (Tue, 05 Oct 2010) | 1 line +Changed paths: + A /trunk/samtools/bcftools/bcf-fix.pl + +Convert VCF output of "bcftools view -bgcv" to a valid VCF file +------------------------------------------------------------------------ +r763 | lh3lh3 | 2010-10-02 22:51:03 -0400 (Sat, 02 Oct 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + A /trunk/samtools/bcftools/bcftools.1 + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/samtools.1 + + * samtools-0.1.8-18 (r763) + * added bcftools manual page + * minor fix to mpileup and view command lines + +------------------------------------------------------------------------ +r762 | lh3lh3 | 2010-10-02 21:46:25 -0400 (Sat, 02 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/vcfutils.pl + + * vcfutils.pl qstats: calculate marginal ts/tv + * allow to call genotypes at variant sites + +------------------------------------------------------------------------ +r761 | lh3lh3 | 2010-10-01 00:29:55 -0400 (Fri, 01 Oct 2010) | 3 lines +Changed paths: + M /trunk/samtools/kaln.c + M /trunk/samtools/misc/HmmGlocal.java + +I am changing the gap open probability back to 0.001. It seems that +being conservative here is a good thing... + +------------------------------------------------------------------------ +r760 | lh3lh3 | 2010-10-01 00:11:27 -0400 (Fri, 01 Oct 2010) | 5 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/kaln.c + A /trunk/samtools/misc/HmmGlocal.java + + * samtools-0.1.8-17 (r760) + * the default gap open penalty is too small (a typo) + * added comments on hmm_realn + * Java implementation + +------------------------------------------------------------------------ +r759 | lh3lh3 | 2010-09-30 10:12:54 -0400 (Thu, 30 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bamtk.c + +mark samtools-0.1.8-16 (r759) + +------------------------------------------------------------------------ +r758 | lh3lh3 | 2010-09-30 10:12:02 -0400 (Thu, 30 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + +round to the nearest integer + +------------------------------------------------------------------------ +r757 | lh3lh3 | 2010-09-28 17:16:43 -0400 (Tue, 28 Sep 2010) | 4 lines +Changed paths: + M /trunk/samtools/kaln.c + +I was trying to accelerate ka_prob_glocal() as this will be the +bottleneck. After an hour, the only gain is to change division to +multiplication. OK. I will stop. + +------------------------------------------------------------------------ +r756 | lh3lh3 | 2010-09-28 16:57:49 -0400 (Tue, 28 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + +this is interesting. multiplication is much faster than division, at least on my Mac + +------------------------------------------------------------------------ +r755 | lh3lh3 | 2010-09-28 16:19:13 -0400 (Tue, 28 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + +minor changes + +------------------------------------------------------------------------ +r754 | lh3lh3 | 2010-09-28 15:44:16 -0400 (Tue, 28 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/kaln.c + +prob_realn() seems working! + +------------------------------------------------------------------------ +r753 | lh3lh3 | 2010-09-28 12:48:23 -0400 (Tue, 28 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + +minor + +------------------------------------------------------------------------ +r752 | lh3lh3 | 2010-09-28 12:47:41 -0400 (Tue, 28 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + M /trunk/samtools/kaln.h + +Convert phredQ to probabilities + +------------------------------------------------------------------------ +r751 | lh3lh3 | 2010-09-28 12:32:08 -0400 (Tue, 28 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + M /trunk/samtools/kaln.h + +Implement the glocal HMM; discard the extention HMM + +------------------------------------------------------------------------ +r750 | lh3lh3 | 2010-09-28 00:06:11 -0400 (Tue, 28 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + +improve numerical stability + +------------------------------------------------------------------------ +r749 | lh3lh3 | 2010-09-27 23:27:54 -0400 (Mon, 27 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + +more comments + +------------------------------------------------------------------------ +r748 | lh3lh3 | 2010-09-27 23:17:16 -0400 (Mon, 27 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + +fixed a bug in banded DP + +------------------------------------------------------------------------ +r747 | lh3lh3 | 2010-09-27 23:05:12 -0400 (Mon, 27 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools/kaln.c + + * fixed that weird issue. + * the banded version is NOT working + +------------------------------------------------------------------------ +r746 | lh3lh3 | 2010-09-27 22:57:05 -0400 (Mon, 27 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/kaln.c + +More comments. This version seems working, but something is a little weird... + +------------------------------------------------------------------------ +r745 | lh3lh3 | 2010-09-27 17:21:40 -0400 (Mon, 27 Sep 2010) | 6 lines +Changed paths: + M /trunk/samtools/kaln.c + +A little code cleanup. Now the forward and backback algorithms give +nearly identical P(x), which means both are close to the correct +forms. However, I have only tested on toy examples. Minor errors in +the implementation may not be obvious. + + +------------------------------------------------------------------------ +r744 | lh3lh3 | 2010-09-27 16:55:15 -0400 (Mon, 27 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_sort.c + M /trunk/samtools/kaln.c + M /trunk/samtools/kaln.h + +... + +------------------------------------------------------------------------ +r743 | jmarshall | 2010-09-27 08:19:06 -0400 (Mon, 27 Sep 2010) | 6 lines +Changed paths: + M /trunk/samtools/bam_sort.c + +Abort if merge -h's INH.SAM cannot be opened, just as we abort +if any of the IN#.BAM input files cannot be opened. + +Also propagate any error indication returned by bam_merge_core() +to samtools merge's exit status. + +------------------------------------------------------------------------ +r741 | jmarshall | 2010-09-24 11:08:24 -0400 (Fri, 24 Sep 2010) | 5 lines +Changed paths: + M /trunk/samtools/bam_index.c + +Use bam_validate1() to detect garbage records in the event of a corrupt +BAI index file that causes a bam_seek() to an invalid position. At most +one record (namely, the bam_iter_read terminator) is tested per bam_fetch() +call, so the cost is insignificant in the normal case. + +------------------------------------------------------------------------ +r740 | jmarshall | 2010-09-24 11:00:19 -0400 (Fri, 24 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam.h + +Add bam_validate1(). + +------------------------------------------------------------------------ +r739 | lh3lh3 | 2010-09-22 12:07:50 -0400 (Wed, 22 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-15 (r379) + * allow to change capQ parameter in calmd + +------------------------------------------------------------------------ +r738 | jmarshall | 2010-09-22 11:15:33 -0400 (Wed, 22 Sep 2010) | 13 lines +Changed paths: + M /trunk/samtools/bam_index.c + M /trunk/samtools/sam_view.c + +When bam_read1() returns an error (return value <= -2), propagate that error +to bam_iter_read()'s own return value. Similarly, also propagate it up to +bam_fetch()'s return value. Previously bam_fetch() always returned 0, and +callers ignored its return value anyway. With this change, 0 continues to +indicate success, while <= -2 (which can be written as < 0, as -1 is never +returned) indicates corrupted input. + +bam_iter_read() ought also to propagate errors returned by bam_seek(). + +main_samview() can now print an error message and fail when bam_fetch() +detects that a .bai index file is corrupted or otherwise does not correspond +to the .bam file it is being used with. + +------------------------------------------------------------------------ +r737 | jmarshall | 2010-09-22 10:47:42 -0400 (Wed, 22 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_index.c + +0 is a successful return value from bam_read1(). (In practice, it never +returns 0 anyway; but all the other callers treat 0 as successful.) + +------------------------------------------------------------------------ +r736 | lh3lh3 | 2010-09-20 17:43:08 -0400 (Mon, 20 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bam_index.c + M /trunk/samtools/bam_sort.c + + * merge files region-by-region. work on small examples but more tests are needed. + +------------------------------------------------------------------------ +r735 | lh3lh3 | 2010-09-20 16:56:24 -0400 (Mon, 20 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +improve qstats by checking the alleles as well + +------------------------------------------------------------------------ +r734 | lh3lh3 | 2010-09-17 18:12:13 -0400 (Fri, 17 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +convert UCSC SNP SQL dump to VCF + +------------------------------------------------------------------------ +r733 | lh3lh3 | 2010-09-17 13:02:11 -0400 (Fri, 17 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +hapmap2vcf convertor + +------------------------------------------------------------------------ +r732 | lh3lh3 | 2010-09-17 10:11:37 -0400 (Fri, 17 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/vcf.c + + * added comments + * VCF->BCF is not possible without knowing the sequence dictionary before hand... + +------------------------------------------------------------------------ +r731 | lh3lh3 | 2010-09-17 09:15:53 -0400 (Fri, 17 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/vcf.c + + * put n_smpl to "bcf1_t" to simplify API a little + +------------------------------------------------------------------------ +r730 | lh3lh3 | 2010-09-16 21:36:01 -0400 (Thu, 16 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/index.c + +fixed a bug in indexing + +------------------------------------------------------------------------ +r729 | lh3lh3 | 2010-09-16 16:54:48 -0400 (Thu, 16 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam.c + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_pileup.c + + * fixed a bug in capQ + * valgrind identifies a use of uninitialised value, but I have not fixed it. + +------------------------------------------------------------------------ +r728 | lh3lh3 | 2010-09-16 15:03:59 -0400 (Thu, 16 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools/bgzip.c + M /trunk/samtools/razip.c + + * fixed a bug in razip: -c will delete the input file + * copy tabix/bgzip to here + +------------------------------------------------------------------------ +r727 | lh3lh3 | 2010-09-16 13:45:49 -0400 (Thu, 16 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-14 (r727) + * allow to change the capQ parameter at the command line + +------------------------------------------------------------------------ +r726 | lh3lh3 | 2010-09-16 13:38:43 -0400 (Thu, 16 Sep 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bcftools/vcfutils.pl + M /trunk/samtools/misc/samtools.pl + + * added varFilter to vcfutils.pl + * reimplement realn(). now it performs a local alignment + * added cap_mapQ() to cap mapping quality when there are many substitutions + +------------------------------------------------------------------------ +r724 | lh3lh3 | 2010-09-15 00:18:31 -0400 (Wed, 15 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + A /trunk/samtools/bcftools/bcf2qcall.c + M /trunk/samtools/bcftools/call1.c + + * convert BCF to QCALL input + +------------------------------------------------------------------------ +r723 | lh3lh3 | 2010-09-14 22:41:50 -0400 (Tue, 14 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_md.c + +dynamic band width in realignment + +------------------------------------------------------------------------ +r722 | lh3lh3 | 2010-09-14 22:05:32 -0400 (Tue, 14 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + +fixed a bug in realignment + +------------------------------------------------------------------------ +r721 | lh3lh3 | 2010-09-14 20:54:09 -0400 (Tue, 14 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/prob1.c + +fixed a minor issue + +------------------------------------------------------------------------ +r720 | lh3lh3 | 2010-09-14 19:25:10 -0400 (Tue, 14 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_md.c + +fixed a bug in realignment + +------------------------------------------------------------------------ +r719 | lh3lh3 | 2010-09-14 19:18:24 -0400 (Tue, 14 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + +minor changes. It is BUGGY now! + +------------------------------------------------------------------------ +r718 | lh3lh3 | 2010-09-14 16:32:33 -0400 (Tue, 14 Sep 2010) | 4 lines +Changed paths: + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_pileup.c + M /trunk/samtools/kaln.c + M /trunk/samtools/kaln.h + + * aggressive gapped aligner is implemented in calmd. + * distinguish gap_open and gap_end_open in banded alignment + * make tview accepts alignment with heading and tailing D + +------------------------------------------------------------------------ +r717 | jmarshall | 2010-09-14 09:04:28 -0400 (Tue, 14 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools + +Add svn:ignore properties for generated files that don't appear in "make all". + +------------------------------------------------------------------------ +r716 | jmarshall | 2010-09-13 08:37:53 -0400 (Mon, 13 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools + M /trunk/samtools/bcftools + M /trunk/samtools/misc + +Add svn:ignore properties listing the generated files. +(Except for *.o, which we'll assume is in global-ignores.) + +------------------------------------------------------------------------ +r715 | lh3lh3 | 2010-09-08 12:53:55 -0400 (Wed, 08 Sep 2010) | 5 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/sample.c + M /trunk/samtools/sample.h + + * samtools-0.1.8-13 (r715) + * fixed a bug in identifying SM across files + * bcftools: estimate heterozygosity + * bcftools: allow to skip sites without reference bases + +------------------------------------------------------------------------ +r713 | lh3lh3 | 2010-09-03 17:19:12 -0400 (Fri, 03 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + +quite a lot changes to the contrast caller, but I still feel something is missing... + +------------------------------------------------------------------------ +r711 | lh3lh3 | 2010-09-03 00:30:48 -0400 (Fri, 03 Sep 2010) | 4 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/vcfutils.pl + + * changed 3.434 to 4.343 (typo!) + * fixed a bug in the contrast caller + * calculate heterozygosity + +------------------------------------------------------------------------ +r710 | lh3lh3 | 2010-09-01 23:24:47 -0400 (Wed, 01 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/call1.c + +SNP calling from the GL field + +------------------------------------------------------------------------ +r709 | lh3lh3 | 2010-09-01 18:52:30 -0400 (Wed, 01 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcf.c + +fixed another problem + +------------------------------------------------------------------------ +r708 | lh3lh3 | 2010-09-01 18:31:17 -0400 (Wed, 01 Sep 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/vcf.c + + * fixed bugs in parsing VCF + * parser now works with GT/GQ/DP/PL/GL + +------------------------------------------------------------------------ +r707 | lh3lh3 | 2010-09-01 15:28:29 -0400 (Wed, 01 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/bcftools/prob1.c + +Do not compile _BCF_QUAD by default + +------------------------------------------------------------------------ +r706 | lh3lh3 | 2010-09-01 15:21:41 -0400 (Wed, 01 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/call1.c + +Write the correct ALT and PL in the SNP calling mode. + +------------------------------------------------------------------------ +r705 | lh3lh3 | 2010-09-01 12:50:33 -0400 (Wed, 01 Sep 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +more commands for my own uses + +------------------------------------------------------------------------ +r704 | lh3lh3 | 2010-09-01 09:26:10 -0400 (Wed, 01 Sep 2010) | 2 lines +Changed paths: + A /trunk/samtools/bcftools/vcfutils.pl + +Utilities for processing VCF + +------------------------------------------------------------------------ +r703 | lh3lh3 | 2010-08-31 16:44:57 -0400 (Tue, 31 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + +preliminary contrast variant caller + +------------------------------------------------------------------------ +r702 | lh3lh3 | 2010-08-31 12:28:39 -0400 (Tue, 31 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + +z' and z'' can be calculated + +------------------------------------------------------------------------ +r701 | lh3lh3 | 2010-08-31 10:20:57 -0400 (Tue, 31 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + A /trunk/samtools/bcftools/call1.c (from /trunk/samtools/bcftools/vcfout.c:699) + M /trunk/samtools/bcftools/prob1.c + D /trunk/samtools/bcftools/vcfout.c + + * rename vcfout.c as call1.c + * prepare to add two-sample comparison + +------------------------------------------------------------------------ +r699 | lh3lh3 | 2010-08-24 15:28:16 -0400 (Tue, 24 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfout.c + +fixed a bug in calculating the t statistics + +------------------------------------------------------------------------ +r698 | lh3lh3 | 2010-08-24 14:05:50 -0400 (Tue, 24 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/kfunc.c + M /trunk/samtools/bcftools/vcfout.c + + * samtools-0.1.8-13 (r698) + * perform one-tailed t-test for baseQ, mapQ and endDist + +------------------------------------------------------------------------ +r697 | lh3lh3 | 2010-08-24 12:30:13 -0400 (Tue, 24 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/kfunc.c + +added regularized incomplete beta function + +------------------------------------------------------------------------ +r695 | lh3lh3 | 2010-08-23 17:36:17 -0400 (Mon, 23 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_plcmd.c + +change the default correlation coefficient + +------------------------------------------------------------------------ +r694 | lh3lh3 | 2010-08-23 14:46:52 -0400 (Mon, 23 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/vcfout.c + +print QUAL as floating numbers + +------------------------------------------------------------------------ +r693 | lh3lh3 | 2010-08-23 14:06:07 -0400 (Mon, 23 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/examples/Makefile + A /trunk/samtools/sample.c + A /trunk/samtools/sample.h + + * samtools-0.1.8-12 (r692) + * group data by samples in "mpileup -g" + +------------------------------------------------------------------------ +r692 | lh3lh3 | 2010-08-23 10:58:53 -0400 (Mon, 23 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + D /trunk/samtools/bam_mcns.c + D /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + +remove VCF output in mpileup + +------------------------------------------------------------------------ +r691 | lh3lh3 | 2010-08-23 10:48:20 -0400 (Mon, 23 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + + * use the revised MAQ error model for mpileup + * prepare to remove the independent model from mpileup + +------------------------------------------------------------------------ +r690 | lh3lh3 | 2010-08-20 15:46:40 -0400 (Fri, 20 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_maqcns.h + M /trunk/samtools/bam_plcmd.c + A /trunk/samtools/errmod.c + A /trunk/samtools/errmod.h + M /trunk/samtools/ksort.h + +added revised MAQ error model + +------------------------------------------------------------------------ +r689 | lh3lh3 | 2010-08-18 09:55:20 -0400 (Wed, 18 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + M /trunk/samtools/bcftools/vcfout.c + +allow to read the prior from the error output. EM iteration is working. + +------------------------------------------------------------------------ +r688 | lh3lh3 | 2010-08-17 12:12:20 -0400 (Tue, 17 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/main.c + M /trunk/samtools/bcftools/vcf.c + + * write a little more VCF header + * concatenate BCFs + +------------------------------------------------------------------------ +r687 | lh3lh3 | 2010-08-16 20:53:16 -0400 (Mon, 16 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/bcf.tex + +use float for QUAL + +------------------------------------------------------------------------ +r686 | lh3lh3 | 2010-08-14 00:11:13 -0400 (Sat, 14 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/prob1.c + +faster for large sample size (in principle) + +------------------------------------------------------------------------ +r685 | lh3lh3 | 2010-08-13 23:28:31 -0400 (Fri, 13 Aug 2010) | 4 lines +Changed paths: + M /trunk/samtools/bcftools/prob1.c + + * a numerically stable method to calculate z_{jk} + * currently slower than the old method but will be important for large sample size + * in principle, we can speed up for large n, but have not tried + +------------------------------------------------------------------------ +r684 | lh3lh3 | 2010-08-11 21:58:31 -0400 (Wed, 11 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfout.c + +fixed an issue in parsing integer + +------------------------------------------------------------------------ +r683 | lh3lh3 | 2010-08-09 13:05:07 -0400 (Mon, 09 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + +do not print refname if file is converted from VCF + +------------------------------------------------------------------------ +r682 | lh3lh3 | 2010-08-09 12:59:47 -0400 (Mon, 09 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/vcf.c + + * parse PL + * fixed a bug in parsing VCF + +------------------------------------------------------------------------ +r681 | lh3lh3 | 2010-08-09 12:49:23 -0400 (Mon, 09 Aug 2010) | 4 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/main.c + M /trunk/samtools/bcftools/vcf.c + M /trunk/samtools/bcftools/vcfout.c + M /trunk/samtools/bgzf.c + M /trunk/samtools/kstring.c + + * fixed a bug in kstrtok@kstring.c + * preliminary VCF parser (not parse everything for now) + * improved view interface + +------------------------------------------------------------------------ +r680 | lh3lh3 | 2010-08-09 10:43:13 -0400 (Mon, 09 Aug 2010) | 4 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/vcfout.c + M /trunk/samtools/kstring.c + M /trunk/samtools/kstring.h + + * improved kstring (added kstrtok) + * removed the limit on the format string length in bcftools + * use kstrtok to parse format which fixed a bug in the old code + +------------------------------------------------------------------------ +r679 | lh3lh3 | 2010-08-09 01:12:05 -0400 (Mon, 09 Aug 2010) | 2 lines +Changed paths: + A /trunk/samtools/bcftools/README + M /trunk/samtools/bcftools/vcfout.c + +help messages + +------------------------------------------------------------------------ +r678 | lh3lh3 | 2010-08-09 00:01:52 -0400 (Mon, 09 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcfout.c + +perform single-tail test for ED4 + +------------------------------------------------------------------------ +r677 | lh3lh3 | 2010-08-08 23:48:35 -0400 (Sun, 08 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/bcftools/kfunc.c + M /trunk/samtools/bcftools/vcfout.c + + * test depth, end distance and HWE + +------------------------------------------------------------------------ +r676 | lh3lh3 | 2010-08-08 02:04:15 -0400 (Sun, 08 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/kfunc.c + +reimplement incomplete gamma functions. no copy-paste + +------------------------------------------------------------------------ +r675 | lh3lh3 | 2010-08-06 22:42:54 -0400 (Fri, 06 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bcftools/fet.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + M /trunk/samtools/bcftools/vcfout.c + + * bcftools: add HWE (no testing for now) + * record end dist in a 2x2 table, not avg, std any more + +------------------------------------------------------------------------ +r674 | lh3lh3 | 2010-08-06 17:30:16 -0400 (Fri, 06 Aug 2010) | 3 lines +Changed paths: + A /trunk/samtools/bcftools/kfunc.c + + * Special functions: log(gamma()), erfc(), P(a,x) (incomplete gamma) + * Not using Numerical Recipe due to licensing issues + +------------------------------------------------------------------------ +r673 | lh3lh3 | 2010-08-05 23:46:53 -0400 (Thu, 05 Aug 2010) | 2 lines +Changed paths: + A /trunk/samtools/bcftools/fet.c + +Fisher's exact test + +------------------------------------------------------------------------ +r672 | lh3lh3 | 2010-08-05 21:48:33 -0400 (Thu, 05 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bamtk.c + M /trunk/samtools/examples/Makefile + + * samtools-0.1.8-11 (r672) + * collect more stats for allele balance test in bcftools (not yet) + +------------------------------------------------------------------------ +r671 | lh3lh3 | 2010-08-05 16:17:58 -0400 (Thu, 05 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/main.c + + * the code base is stablized again. + * I will delay the vcf parser, which is quite complicated but with little value for now + +------------------------------------------------------------------------ +r670 | lh3lh3 | 2010-08-05 16:03:23 -0400 (Thu, 05 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/examples/Makefile + +minor + +------------------------------------------------------------------------ +r669 | lh3lh3 | 2010-08-05 16:03:08 -0400 (Thu, 05 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/vcf.c + +unfinished vcf parser + +------------------------------------------------------------------------ +r668 | lh3lh3 | 2010-08-05 15:46:40 -0400 (Thu, 05 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/Makefile + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/index.c + M /trunk/samtools/bcftools/main.c + A /trunk/samtools/bcftools/vcf.c + M /trunk/samtools/bcftools/vcfout.c + + * added prelimiary VCF parser (not finished) + * change struct a bit + +------------------------------------------------------------------------ +r667 | lh3lh3 | 2010-08-03 22:35:27 -0400 (Tue, 03 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bcftools/bcf.c + + * allow to set min base q + * fixed a bug in mpileup -u + +------------------------------------------------------------------------ +r666 | lh3lh3 | 2010-08-03 22:08:44 -0400 (Tue, 03 Aug 2010) | 2 lines +Changed paths: + A /trunk/samtools/bcftools/bcf.tex + +spec + +------------------------------------------------------------------------ +r665 | lh3lh3 | 2010-08-03 21:18:57 -0400 (Tue, 03 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/examples/Makefile + +added more examples + +------------------------------------------------------------------------ +r664 | lh3lh3 | 2010-08-03 21:13:00 -0400 (Tue, 03 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bcftools/Makefile + +fixed compilation error + +------------------------------------------------------------------------ +r662 | lh3lh3 | 2010-08-03 21:04:00 -0400 (Tue, 03 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + D /trunk/samtools/bcf.c + D /trunk/samtools/bcf.h + A /trunk/samtools/bcftools + A /trunk/samtools/bcftools/Makefile + A /trunk/samtools/bcftools/bcf.c + A /trunk/samtools/bcftools/bcf.h + A /trunk/samtools/bcftools/bcfutils.c + A /trunk/samtools/bcftools/index.c + A /trunk/samtools/bcftools/main.c + A /trunk/samtools/bcftools/prob1.c + A /trunk/samtools/bcftools/prob1.h + A /trunk/samtools/bcftools/vcfout.c + +move bcftools to samtools + +------------------------------------------------------------------------ +r660 | lh3lh3 | 2010-08-03 15:58:32 -0400 (Tue, 03 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + +fixed another minor bug + +------------------------------------------------------------------------ +r658 | lh3lh3 | 2010-08-03 15:06:45 -0400 (Tue, 03 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcf.c + + * samtools-0.1.8-10 (r658) + * fixed a bug in bam2bcf when the reference is N + +------------------------------------------------------------------------ +r657 | lh3lh3 | 2010-08-03 14:50:23 -0400 (Tue, 03 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + + * fixed a bug + * treat ambiguous ref base as the fifth base + +------------------------------------------------------------------------ +r654 | lh3lh3 | 2010-08-02 17:38:27 -0400 (Mon, 02 Aug 2010) | 2 lines +Changed paths: + M /trunk/bcftools/bcf.c + M /trunk/samtools/bcf.c + +missing a column in VCF output... + +------------------------------------------------------------------------ +r653 | lh3lh3 | 2010-08-02 17:31:33 -0400 (Mon, 02 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcf.c + +fixed a memory leak + +------------------------------------------------------------------------ +r651 | lh3lh3 | 2010-08-02 17:27:31 -0400 (Mon, 02 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bcf.c + +fixed a bug in bcf reader + +------------------------------------------------------------------------ +r650 | lh3lh3 | 2010-08-02 17:00:41 -0400 (Mon, 02 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + +fixed a bug + +------------------------------------------------------------------------ +r649 | lh3lh3 | 2010-08-02 16:49:35 -0400 (Mon, 02 Aug 2010) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam2bcf.h + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-9 (r649) + * lossless representation of PL in BCF output + +------------------------------------------------------------------------ +r648 | lh3lh3 | 2010-08-02 16:07:25 -0400 (Mon, 02 Aug 2010) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + A /trunk/samtools/bam2bcf.c + A /trunk/samtools/bam2bcf.h + M /trunk/samtools/bam_plcmd.c + A /trunk/samtools/bcf.c + A /trunk/samtools/bcf.h + +Generate binary VCF + +------------------------------------------------------------------------ +r644 | lh3lh3 | 2010-07-28 11:59:19 -0400 (Wed, 28 Jul 2010) | 5 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-8 (r644) + * mpileup becomes a little stable again + * the method is slightly different, but is more theoretically correct + * snp calling is O(n^2) instead of O(n^3) + +------------------------------------------------------------------------ +r643 | lh3lh3 | 2010-07-28 11:54:15 -0400 (Wed, 28 Jul 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + + * fixed a STUPID bug, which cost me a lot of time. + * I am going to clean up mcns a little bit + +------------------------------------------------------------------------ +r642 | lh3lh3 | 2010-07-27 23:23:07 -0400 (Tue, 27 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + +supposedly this is THE correct implementation, but more testing is needed + +------------------------------------------------------------------------ +r641 | lh3lh3 | 2010-07-27 22:43:39 -0400 (Tue, 27 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + +NOT ready yet. Going to make further changes... + +------------------------------------------------------------------------ +r639 | lh3lh3 | 2010-07-25 22:18:38 -0400 (Sun, 25 Jul 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-7 (r639) + * fixed the reference allele assignment + +------------------------------------------------------------------------ +r638 | lh3lh3 | 2010-07-25 12:01:26 -0400 (Sun, 25 Jul 2010) | 5 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-6 (r638) + * skip isnan/isinf in case of float underflow + * added the flat prior + * fixed an issue where there are no reads supporting the reference + +------------------------------------------------------------------------ +r637 | lh3lh3 | 2010-07-24 14:16:27 -0400 (Sat, 24 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + +minor changes + +------------------------------------------------------------------------ +r636 | lh3lh3 | 2010-07-24 14:07:27 -0400 (Sat, 24 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + +minor tweaks + +------------------------------------------------------------------------ +r635 | lh3lh3 | 2010-07-24 01:49:49 -0400 (Sat, 24 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + +posterior expectation FINALLY working. I am so tired... + +------------------------------------------------------------------------ +r633 | lh3lh3 | 2010-07-23 13:50:48 -0400 (Fri, 23 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + +another minor fix to mpileup + +------------------------------------------------------------------------ +r632 | lh3lh3 | 2010-07-23 13:43:31 -0400 (Fri, 23 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + +added the format column + +------------------------------------------------------------------------ +r631 | lh3lh3 | 2010-07-23 13:25:44 -0400 (Fri, 23 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + +added an alternative prior + +------------------------------------------------------------------------ +r628 | lh3lh3 | 2010-07-23 11:48:51 -0400 (Fri, 23 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + +calculate posterior allele frequency + +------------------------------------------------------------------------ +r627 | lh3lh3 | 2010-07-22 21:39:13 -0400 (Thu, 22 Jul 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-3 (r627) + * multi-sample snp calling appears to work. More tests needed. + +------------------------------------------------------------------------ +r626 | lh3lh3 | 2010-07-22 16:37:56 -0400 (Thu, 22 Jul 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_tview.c + + * preliminary multisample SNP caller. + * something looks not so right, but it largely works + +------------------------------------------------------------------------ +r617 | lh3lh3 | 2010-07-14 16:26:27 -0400 (Wed, 14 Jul 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_mcns.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + + * samtools-0.1.8-2 (r617) + * allele frequency calculation apparently works... + +------------------------------------------------------------------------ +r616 | lh3lh3 | 2010-07-14 13:33:51 -0400 (Wed, 14 Jul 2010) | 3 lines +Changed paths: + M /trunk/samtools/Makefile + A /trunk/samtools/bam_mcns.c + A /trunk/samtools/bam_mcns.h + M /trunk/samtools/bam_plcmd.c + + * added mutli-sample framework. It is not working, yet. + * improved the mpileup interface + +------------------------------------------------------------------------ +r615 | lh3lh3 | 2010-07-13 14:50:12 -0400 (Tue, 13 Jul 2010) | 3 lines +Changed paths: + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/misc/Makefile + + * samtools-0.1.8-1 (r615) + * allow to get mpileup at required sites + +------------------------------------------------------------------------ +r613 | lh3lh3 | 2010-07-11 22:40:56 -0400 (Sun, 11 Jul 2010) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/NEWS + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/samtools.1 + +Release samtools-0.1.8 + ------------------------------------------------------------------------ r612 | lh3lh3 | 2010-07-11 21:08:56 -0400 (Sun, 11 Jul 2010) | 2 lines Changed paths: diff --git a/sam/Makefile b/sam/Makefile index 35d578f..13d4a76 100644 --- a/sam/Makefile +++ b/sam/Makefile @@ -4,13 +4,13 @@ DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \ - $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o + $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o + bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o PROG= samtools -INCLUDES= -SUBDIRS= . misc +INCLUDES= -I. +SUBDIRS= . bcftools misc LIBPATH= LIBCURSES= -lcurses # -lXCurses @@ -39,8 +39,8 @@ lib:libbam.a libbam.a:$(LOBJS) $(AR) -cru $@ $(LOBJS) -samtools:$(AOBJS) libbam.a - $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz +samtools:lib-recur $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz -Lbcftools -lbcf razip:razip.o razf.o $(KNETFILE_O) $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz @@ -53,15 +53,19 @@ bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h -bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h +bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bcftools/bcf.h bam2bcf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h bam_tview.o:bam.h faidx.h bam_maqcns.h -bam_maqcns.o:bam.h ksort.h bam_maqcns.h +bam_maqcns.o:bam.h ksort.h bam_maqcns.h kaln.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h glf.o:glf.h sam_header.o:sam_header.h khash.h +bcf.o:bcftools/bcf.h +bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h +bam2bcf_indel.o:bam2bcf.h +errmod.o:errmod.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h diff --git a/sam/NEWS b/sam/NEWS index 28d6aaa..6b4d8aa 100644 --- a/sam/NEWS +++ b/sam/NEWS @@ -1,3 +1,144 @@ +Beta release 0.1.12a (2 December, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is another bug fix release: + + * Fixed a memory violation in mpileup, which causes segfault. Release + 0.1.9 and above are affected. + + * Fixed a memory violation in the indel caller, which does not causes + segfault, but may potentially affect deletion calls in an unexpected + way. Release 0.1.10 and above are affected. + + * Fixed a bug in computing r-square in bcftools. Few are using this + functionality and it only has minor effect. + + * Fixed a memory leak in bam_fetch(). + + * Fixed a bug in writing meta information to the BAM index for the last + sequence. This bug is invisible to most users, but it is a bug anyway. + + * Fixed a bug in bcftools which causes false "DP4=0,0,0,0" annotations. + +(0.1.12: 2 December 2010, r862) + + + +Beta release 0.1.11 (21 November, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This is mainly a bug fix release: + + * Fixed a bug in random retrieval (since 0.1.8). It occurs when reads + are retrieved from a small region containing no reads. + + * Fixed a bug in pileup (since 0.1.9). The bug causes an assertion + failure when the first CIGAR operation is a deletion. + + * Improved fault tolerence in remote access. + +One minor feature has been implemented in bcftools: + + * Added a reference-free variant calling mode. In this mode, a site is + regarded as a variat iff the sample(s) contains two or more alleles; + the meaning of the QUAL field in the VCF output is changed + accordingly. Effectively, the reference allele is irrelevant to the + result in the new mode, although the reference sequence has to be + used in realignment when SAMtools computes genotype likelihoods. + +In addition, since 0.1.10, the `pileup' command has been deprecated by +`mpileup' which is more powerful and more accurate. The `pileup' command +will not be removed in the next few releases, but new features will not +be added. + +(0.1.11: 21 November 2010, r851) + + + +Beta Release 0.1.10 (16 November, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release is featured as the first major improvement to the indel +caller. The method is similar to the old one implemented in the pileup +command, but the details are handled more carefully both in theory and +in practice. As a result, the new indel caller usually gives more +accurate indel calls, though at the cost of sensitivity. The caller is +implemented in the mpileup command and is invoked by default. It works +with multiple samples. + +Other notable changes: + + * With the -r option, the calmd command writes the difference between + the original base quality and the BAQ capped base quality at the BQ + tag but does not modify the base quality. Please use -Ar to overwrite + the original base quality (the 0.1.9 behavior). + + * Allow to set a maximum per-sample read depth to reduce memory. In + 0.1.9, most of memory is wasted for the ultra high read depth in some + regions (e.g. the chr1 centromere). + + * Optionally write per-sample read depth and per-sample strand bias + P-value. + + * Compute equal-tail (Bayesian) credible interval of site allele + frequency at the CI95 VCF annotation. + + * Merged the vcfutils.pl varFilter and filter4vcf for better SNP/indel + filtering. + +(0.1.10: 16 November 2010, r829) + + + +Beta Release 0.1.9 (27 October, 2010) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This release is featured as the first major improvement to the samtools' +SNP caller. It comes with a revised MAQ error model, the support of +multi-sample SNP calling and the computation of base alignment quality +(BAQ). + +The revised MAQ error model is based on the original model. It solves an +issue of miscalling SNPs in repetitive regions. Althought such SNPs can +usually be filtered at a later step, they mess up unfiltered calls. This +is a theoretical flaw in the original model. The revised MAQ model +deprecates the orginal MAQ model and the simplified SOAPsnp model. + +Multi-sample SNP calling is separated in two steps. The first is done by +samtools mpileup and the second by a new program, bcftools, which is +included in the samtools source code tree. Multi-sample SNP calling also +works for single sample and has the advantage of enabling more powerful +filtration. It is likely to deprecate pileup in future once a proper +indel calling method is implemented. + +BAQ is the Phred-scaled probability of a read base being wrongly +aligned. Capping base quality by BAQ has been shown to be very effective +in suppressing false SNPs caused by misalignments around indels or in +low-complexity regions with acceptable compromise on computation +time. This strategy is highly recommended and can be used with other SNP +callers as well. + +In addition to the three major improvements, other notable changes are: + + * Changes to the pileup format. A reference skip (the N CIGAR operator) + is shown as '<' or '>' depending on the strand. Tview is also changed + accordingly. + + * Accelerated pileup. The plain pileup is about 50% faster. + + * Regional merge. The merge command now accepts a new option to merge + files in a specified region. + + * Fixed a bug in bgzip and razip which causes source files to be + deleted even if option -c is applied. + + * In APIs, propogate errors to downstream callers and make samtools + return non-zero values once errors occur. + +(0.1.9: 27 October 2010, r783) + + + Beta Release 0.1.8 (11 July, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/sam/bam.c b/sam/bam.c index 94b0aa8..521c1dd 100644 --- a/sam/bam.c +++ b/sam/bam.c @@ -245,7 +245,11 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kputc('\t', &str); } if (c->tid < 0) kputsn("*\t", 2, &str); - else { kputs(header->target_name[c->tid], &str); kputc('\t', &str); } + else { + if (header) kputs(header->target_name[c->tid] , &str); + else kputw(c->tid, &str); + kputc('\t', &str); + } kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); if (c->n_cigar == 0) kputc('*', &str); else { @@ -257,7 +261,11 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) kputc('\t', &str); if (c->mtid < 0) kputsn("*\t", 2, &str); else if (c->mtid == c->tid) kputsn("=\t", 2, &str); - else { kputs(header->target_name[c->mtid], &str); kputc('\t', &str); } + else { + if (header) kputs(header->target_name[c->mtid], &str); + else kputw(c->mtid, &str); + kputc('\t', &str); + } kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str); if (c->l_qseq) { for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); @@ -297,6 +305,22 @@ void bam_view1(const bam_header_t *header, const bam1_t *b) free(s); } +int bam_validate1(const bam_header_t *header, const bam1_t *b) +{ + char *s; + + if (b->core.tid < -1 || b->core.mtid < -1) return 0; + if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0; + + if (b->data_len < b->core.l_qname) return 0; + s = memchr(bam1_qname(b), '\0', b->core.l_qname); + if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0; + + // FIXME: Other fields could also be checked, especially the auxiliary data + + return 1; +} + // FIXME: we should also check the LB tag associated with each alignment const char *bam_get_library(bam_header_t *h, const bam1_t *b) { diff --git a/sam/bam.h b/sam/bam.h index 8e26ea6..eef2ea9 100644 --- a/sam/bam.h +++ b/sam/bam.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008 Genome Research Ltd (GRL). + Copyright (c) 2008-2010 Genome Research Ltd (GRL). Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -230,7 +230,7 @@ typedef struct __bam_iter_t *bam_iter_t; @param b pointer to an alignment @return pointer to quality string */ -#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2) +#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) /*! @function @abstract Get a base on read @@ -455,6 +455,21 @@ extern "C" { char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of); + /*! + @abstract Check whether a BAM record is plausibly valid + @param header associated header structure, or NULL if unavailable + @param b alignment to validate + @return 0 if the alignment is invalid; non-zero otherwise + + @discussion Simple consistency check of some of the fields of the + alignment record. If the header is provided, several additional checks + are made. Not all fields are checked, so a non-zero result is not a + guarantee that the record is valid. However it is usually good enough + to detect when bam_seek() has been called with a virtual file offset + that is not the offset of an alignment record. + */ + int bam_validate1(const bam_header_t *header, const bam1_t *b); + const char *bam_get_library(bam_header_t *header, const bam1_t *b); @@ -480,7 +495,7 @@ extern "C" { bam1_t *b; int32_t qpos; int indel, level; - uint32_t is_del:1, is_head:1, is_tail:1; + uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; } bam_pileup1_t; typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); @@ -493,6 +508,7 @@ extern "C" { const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); void bam_plp_set_mask(bam_plp_t iter, int mask); + void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); void bam_plp_reset(bam_plp_t iter); void bam_plp_destroy(bam_plp_t iter); @@ -501,6 +517,7 @@ extern "C" { bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); void bam_mplp_destroy(bam_mplp_t iter); + void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); /*! @typedef @@ -693,8 +710,8 @@ static inline bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) { uint8_t *data = bdst->data; int m_data = bdst->m_data; // backup data and m_data - if (m_data < bsrc->m_data) { // double the capacity - m_data = bsrc->m_data; kroundup32(m_data); + if (m_data < bsrc->data_len) { // double the capacity + m_data = bsrc->data_len; kroundup32(m_data); data = (uint8_t*)realloc(data, m_data); } memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data diff --git a/sam/bam_index.c b/sam/bam_index.c index 4152f20..328f011 100644 --- a/sam/bam_index.c +++ b/sam/bam_index.c @@ -212,7 +212,7 @@ bam_index_t *bam_index_core(bamFile fp) } if (save_tid >= 0) { insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp)); - insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end); + insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp)); insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped); } merge_chunks(idx); @@ -608,6 +608,9 @@ bam_iter_t bam_iter_query(const bam_index_t *idx, int tid, int beg, int end) } } free(bins); + if (n_off == 0) { + free(off); return iter; + } { bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t)); int l; @@ -656,17 +659,17 @@ void bam_iter_destroy(bam_iter_t iter) int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) { - if (iter->finished) return -1; - if (iter->from_first) { - int ret = bam_read1(fp, b); - if (ret < 0) iter->finished = 1; + int ret; + if (iter && iter->finished) return -1; + if (iter == 0 || iter->from_first) { + ret = bam_read1(fp, b); + if (ret < 0 && iter) iter->finished = 1; return ret; } if (iter->off == 0) return -1; for (;;) { - int ret; if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk - if (iter->i == iter->n_off - 1) break; // no more chunks + if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET); @@ -674,23 +677,28 @@ int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b) } ++iter->i; } - if ((ret = bam_read1(fp, b)) > 0) { + if ((ret = bam_read1(fp, b)) >= 0) { iter->curr_off = bam_tell(fp); - if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed + if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed + ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error + break; + } else if (is_overlap(iter->beg, iter->end, b)) return ret; - } else break; // end of file + } else break; // end of file or error } iter->finished = 1; - return -1; + return ret; } int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) { + int ret; bam_iter_t iter; bam1_t *b; b = bam_init1(); iter = bam_iter_query(idx, tid, beg, end); - while (bam_iter_read(fp, iter, b) >= 0) func(b, data); + while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data); + bam_iter_destroy(iter); bam_destroy1(b); - return 0; + return (ret == -1)? 0 : ret; } diff --git a/sam/bam_maqcns.c b/sam/bam_maqcns.c index cad63d7..4fbc6c6 100644 --- a/sam/bam_maqcns.c +++ b/sam/bam_maqcns.c @@ -3,6 +3,7 @@ #include "bam.h" #include "bam_maqcns.h" #include "ksort.h" +#include "errmod.h" #include "kaln.h" KSORT_INIT_GENERIC(uint32_t) @@ -12,12 +13,13 @@ KSORT_INIT_GENERIC(uint32_t) typedef struct __bmc_aux_t { int max; uint32_t *info; + uint16_t *info16; + errmod_t *em; } bmc_aux_t; typedef struct { float esum[4], fsum[4]; uint32_t c[4]; - uint32_t rms_mapQ; } glf_call_aux_t; char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; @@ -41,7 +43,7 @@ static void cal_het(bam_maqcns_t *aa) for (n1 = 0; n1 < 256; ++n1) { for (n2 = 0; n2 < 256; ++n2) { long double sum = 0.0; - double lC = aa->is_soap? 0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1} + double lC = aa->errmod == BAM_ERRMOD_SOAP? 0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); for (k = 1; k <= aa->n_hap - 1; ++k) { double pk = 1.0 / k / sum_harmo; double log1 = log((double)k/aa->n_hap); @@ -62,6 +64,7 @@ static void cal_coef(bam_maqcns_t *aa) long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256]; double *lC; + if (aa->errmod == BAM_ERRMOD_MAQ2) return; // no need to do the following // aa->lhet will be allocated and initialized free(aa->fk); free(aa->coef); aa->coef = 0; @@ -71,7 +74,7 @@ static void cal_coef(bam_maqcns_t *aa) aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta; fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands } - if (aa->is_soap) return; + if (aa->errmod == BAM_ERRMOD_SOAP) return; aa->coef = (double*)calloc(256*256*64, sizeof(double)); lC = (double*)calloc(256 * 256, sizeof(double)); for (n = 1; n != 256; ++n) @@ -107,28 +110,31 @@ bam_maqcns_t *bam_maqcns_init() bm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t)); bm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t)); bm->het_rate = 0.001; - bm->theta = 0.85; + bm->theta = 0.83f; bm->n_hap = 2; bm->eta = 0.03; bm->cap_mapQ = 60; + bm->min_baseQ = 13; return bm; } void bam_maqcns_prepare(bam_maqcns_t *bm) { + if (bm->errmod == BAM_ERRMOD_MAQ2) bm->aux->em = errmod_init(1. - bm->theta); cal_coef(bm); cal_het(bm); } void bam_maqcns_destroy(bam_maqcns_t *bm) { if (bm == 0) return; - free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info); + free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info); free(bm->aux->info16); + if (bm->aux->em) errmod_destroy(bm->aux->em); free(bm->aux); free(bm); } glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm) { - glf_call_aux_t *b; + glf_call_aux_t *b = 0; int i, j, k, w[8], c, n; glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t)); float p[16], min_p = 1e30; @@ -142,28 +148,39 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam bm->aux->max = _n; kroundup32(bm->aux->max); bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max); + bm->aux->info16 = (uint16_t*)realloc(bm->aux->info16, 2 * bm->aux->max); } - for (i = n = 0; i < _n; ++i) { + for (i = n = 0, rms = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; uint32_t q, x = 0, qq; - if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; + uint16_t y = 0; + if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; q = (uint32_t)bam1_qual(p->b)[p->qpos]; + if (q < bm->min_baseQ) continue; x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; + y |= bam1_strand(p->b)<<4; if (p->b->core.qual < q) q = p->b->core.qual; + c = p->b->core.qual < bm->cap_mapQ? p->b->core.qual : bm->cap_mapQ; + rms += c * c; x |= q << 24; + y |= q << 5; qq = bam1_seqi(bam1_seq(p->b), p->qpos); q = bam_nt16_nt4_table[qq? qq : ref_base]; - if (!p->is_del && q < 4) x |= 1 << 21 | q << 16; + if (!p->is_del && !p->is_refskip && q < 4) x |= 1 << 21 | q << 16, y |= q; + bm->aux->info16[n] = y; bm->aux->info[n++] = x; } + rms = (uint8_t)(sqrt((double)rms / n) + .499); + if (bm->errmod == BAM_ERRMOD_MAQ2) { + errmod_cal(bm->aux->em, n, 4, bm->aux->info16, p); + goto goto_glf; + } ks_introsort(uint32_t, n, bm->aux->info); // generate esum and fsum b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t)); for (k = 0; k != 8; ++k) w[k] = 0; - rms = 0; for (j = n - 1; j >= 0; --j) { // calculate esum and fsum uint32_t info = bm->aux->info[j]; - int tmp; if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); k = info>>16&7; if (info>>24 > 0) { @@ -172,17 +189,14 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam if (w[k] < 0xff) ++w[k]; ++b->c[k&3]; } - tmp = (int)(info&0xff) < bm->cap_mapQ? (int)(info&0xff) : bm->cap_mapQ; - rms += tmp * tmp; } - b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499); // rescale ->c[] for (j = c = 0; j != 4; ++j) c += b->c[j]; if (c > 255) { for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5); for (j = c = 0; j != 4; ++j) c += b->c[j]; } - if (!bm->is_soap) { + if (bm->errmod == BAM_ERRMOD_MAQ) { // generate likelihood for (j = 0; j != 4; ++j) { // homozygous @@ -234,7 +248,7 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2)) p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0; } - } else { // apply the SOAP model + } else if (bm->errmod == BAM_ERRMOD_SOAP) { // apply the SOAP model // generate likelihood for (j = 0; j != 4; ++j) { float tmp; @@ -251,8 +265,9 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam } } +goto_glf: // convert necessary information to glf1_t - g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ; + g->ref_base = ref_base; g->max_mapQ = rms; g->depth = n > 16777215? 16777215 : n; for (j = 0; j != 4; ++j) for (k = j; k < 4; ++k) @@ -268,26 +283,25 @@ glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam uint32_t glf2cns(const glf1_t *g, int q_r) { - int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1; + int i, j, k, p[10], ref4; uint32_t x = 0; + ref4 = bam_nt16_nt4_table[g->ref_base]; for (i = k = 0; i < 4; ++i) for (j = i; j < 4; ++j) { - tmp[j<<2|i] = -1; - tmp[i<<2|j] = g->lk[k++] + (i == j? 0 : q_r); + int prior = (i == ref4 && j == ref4? 0 : i == ref4 || j == ref4? q_r : q_r + 3); + p[k] = (g->lk[k] + prior)<<4 | i<<2 | j; + ++k; } - for (i = 0; i < 16; ++i) { - if (tmp[i] < 0) continue; - if (tmp[i] < min) { - min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i; - } else if (tmp[i] < min2) { - min3 = min2; min2 = tmp[i]; min_g2 = i; - } else if (tmp[i] < min3) min3 = tmp[i]; - } - x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xf << 28; - x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xf << 24; - x |= (uint32_t)g->max_mapQ << 16; - x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8; - x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? min3 - min2 : 255) : 0xff; + for (i = 1; i < 10; ++i) // insertion sort + for (j = i; j > 0 && p[j] < p[j-1]; --j) + k = p[j], p[j] = p[j-1], p[j-1] = k; + x = (1u<<(p[0]&3) | 1u<<(p[0]>>2&3)) << 28; // the best genotype + x |= (uint32_t)g->max_mapQ << 16; // rms mapQ + x |= ((p[1]>>4) - (p[0]>>4) < 256? (p[1]>>4) - (p[0]>>4) : 255) << 8; // consensus Q + for (k = 0; k < 10; ++k) + if ((p[k]&0xf) == (ref4<<2|ref4)) break; + if (k == 10) k = 9; + x |= (p[k]>>4) - (p[0]>>4) < 256? (p[k]>>4) - (p[0]>>4) : 255; // snp Q return x; } @@ -297,7 +311,7 @@ uint32_t bam_maqcns_call(int n, const bam_pileup1_t *pl, bam_maqcns_t *bm) uint32_t x; if (n) { g = bam_maqcns_glfgen(n, pl, 0xf, bm); - x = glf2cns(g, (int)(bm->q_r + 0.5)); + x = g->depth == 0? (0xfU<<28 | 0xfU<<24) : glf2cns(g, (int)(bm->q_r + 0.5)); free(g); } else x = 0xfU<<28 | 0xfU<<24; return x; @@ -448,7 +462,7 @@ bam_maqindel_ret_t *bam_maqindel(int n, int pos, const bam_maqindel_opt_t *mi, c for (i = 0; i < n_types; ++i) { ka_param_t ap = ka_param_blast; ap.band_width = 2 * types[n_types - 1] + 2; - ap.gap_end = 0; + ap.gap_end_ext = 0; // write ref2 for (k = 0, j = left; j <= pos; ++j) ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]]; diff --git a/sam/bam_maqcns.h b/sam/bam_maqcns.h index 6cc5355..291ae53 100644 --- a/sam/bam_maqcns.h +++ b/sam/bam_maqcns.h @@ -3,11 +3,15 @@ #include "glf.h" +#define BAM_ERRMOD_MAQ2 0 +#define BAM_ERRMOD_MAQ 1 +#define BAM_ERRMOD_SOAP 2 + struct __bmc_aux_t; typedef struct { float het_rate, theta; - int n_hap, cap_mapQ, is_soap; + int n_hap, cap_mapQ, errmod, min_baseQ; float eta, q_r; double *fk, *coef; diff --git a/sam/bam_md.c b/sam/bam_md.c index 17b0a4a..44d46a4 100644 --- a/sam/bam_md.c +++ b/sam/bam_md.c @@ -2,9 +2,12 @@ #include #include #include +#include #include "faidx.h" #include "sam.h" #include "kstring.h" +#include "kaln.h" +#include "kprobaln.h" void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm) { @@ -108,24 +111,168 @@ void bam_fillmd1(bam1_t *b, char *ref, int is_equal) bam_fillmd1_core(b, ref, is_equal, 0); } +int bam_cap_mapQ(bam1_t *b, char *ref, int thres) +{ + uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b); + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + int i, x, y, mm, q, len, clip_l, clip_q; + double t; + if (thres < 0) thres = 40; // set the default + mm = q = len = clip_l = clip_q = 0; + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous + ++len; + if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch + ++mm; + q += qual[z] > 33? 33 : qual[z]; + } + } + } + if (j < l) break; + x += l; y += l; len += l; + } else if (op == BAM_CDEL) { + for (j = 0; j < l; ++j) + if (ref[x+j] == 0) break; + if (j < l) break; + x += l; + } else if (op == BAM_CSOFT_CLIP) { + for (j = 0; j < l; ++j) clip_q += qual[y+j]; + clip_l += l; + y += l; + } else if (op == BAM_CHARD_CLIP) { + clip_q += 13 * l; + clip_l += l; + } else if (op == BAM_CINS) y += l; + else if (op == BAM_CREF_SKIP) x += l; + } + for (i = 0, t = 1; i < mm; ++i) + t *= (double)len / (i+1); + t = q - 4.343 * log(t) + clip_q / 5.; + if (t > thres) return -1; + if (t < 0) t = 0; + t = sqrt((thres - t) / thres) * thres; +// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q); + return (int)(t + .499); +} + +int bam_prob_realn_core(bam1_t *b, const char *ref, int apply_baq) +{ + int k, i, bw, x, y, yb, ye, xb, xe; + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + kpa_par_t conf = kpa_par_def; + uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b); + if (c->flag & BAM_FUNMAP) return -1; // do nothing + // test if BQ or ZQ is present + if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq; + if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq; + if (bq && zq) { // remove the ZQ tag + bam_aux_del(b, zq-1); + zq = 0; + } + if (bq || zq) { + if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing + if (bq && apply_baq) { // then convert BQ to ZQ + for (i = 0; i < c->l_qseq; ++i) + qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64); + *(bq - 3) = 'Z'; + } else if (zq && !apply_baq) { // then convert ZQ to BQ + for (i = 0; i < c->l_qseq; ++i) + qual[i] += (int)zq[i] - 64; + *(zq - 3) = 'B'; + } + return 0; + } + // find the start and end of the alignment + x = c->pos, y = 0, yb = ye = xb = xe = -1; + for (k = 0; k < c->n_cigar; ++k) { + int op, l; + op = cigar[k]&0xf; l = cigar[k]>>4; + if (op == BAM_CMATCH) { + if (yb < 0) yb = y; + if (xb < 0) xb = x; + ye = y + l; xe = x + l; + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip + } + // set bandwidth and the start and the end + bw = 7; + if (abs((xe - xb) - (ye - yb)) > bw) + bw = abs((xe - xb) - (ye - yb)) + 3; + conf.bw = bw; + xb -= yb + bw/2; if (xb < 0) xb = 0; + xe += c->l_qseq - ye + bw/2; + if (xe - xb - c->l_qseq > bw) + xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2; + { // glocal + uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq; + int *state; + bq = calloc(c->l_qseq + 1, 1); + memcpy(bq, qual, c->l_qseq); + s = calloc(c->l_qseq, 1); + for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)]; + r = calloc(xe - xb, 1); + for (i = xb; i < xe; ++i) + r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]]; + state = calloc(c->l_qseq, sizeof(int)); + q = calloc(c->l_qseq, 1); + kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q); + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { + int op = cigar[k]&0xf, l = cigar[k]>>4; + if (op == BAM_CMATCH) { + for (i = y; i < y + l; ++i) { + if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0; + else bq[i] = bq[i] < q[i]? bq[i] : q[i]; + } + x += l; y += l; + } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l; + else if (op == BAM_CDEL) x += l; + } + for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ + if (apply_baq) { + for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual + bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq); + } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq); + free(bq); free(s); free(r); free(q); free(state); + } + return 0; +} + +int bam_prob_realn(bam1_t *b, const char *ref) +{ + return bam_prob_realn_core(b, ref, 1); +} + int bam_fillmd(int argc, char *argv[]) { - int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0; + int c, is_equal, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, apply_baq; samfile_t *fp, *fpout = 0; faidx_t *fai; char *ref = 0, mode_w[8], mode_r[8]; bam1_t *b; - is_bam_out = is_sam_in = is_uncompressed = 0; + is_equal = is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = apply_baq = capQ = 0; mode_w[0] = mode_r[0] = 0; strcpy(mode_r, "r"); strcpy(mode_w, "w"); - while ((c = getopt(argc, argv, "eubSn:")) >= 0) { + while ((c = getopt(argc, argv, "reubSC:n:A")) >= 0) { switch (c) { + case 'r': is_realn = 1; break; case 'e': is_equal = 1; break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': is_sam_in = 1; break; case 'n': max_nm = atoi(optarg); break; + case 'C': capQ = atoi(optarg); break; + case 'A': apply_baq = 1; break; default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; } } @@ -135,11 +282,13 @@ int bam_fillmd(int argc, char *argv[]) if (is_uncompressed) strcat(mode_w, "u"); if (optind + 1 >= argc) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools fillmd [-eubS] \n\n"); + fprintf(stderr, "Usage: samtools fillmd [-eubrS] \n\n"); fprintf(stderr, "Options: -e change identical bases to '='\n"); fprintf(stderr, " -u uncompressed BAM output (for piping)\n"); fprintf(stderr, " -b compressed BAM output\n"); - fprintf(stderr, " -S the input is SAM with header\n\n"); + fprintf(stderr, " -S the input is SAM with header\n"); + fprintf(stderr, " -A modify the quality string\n"); + fprintf(stderr, " -r read-independent local realignment\n\n"); return 1; } fp = samopen(argv[optind], mode_r, 0); @@ -162,6 +311,11 @@ int bam_fillmd(int argc, char *argv[]) fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", fp->header->target_name[tid]); } + if (is_realn) bam_prob_realn_core(b, ref, apply_baq); + if (capQ > 10) { + int q = bam_cap_mapQ(b, ref, capQ); + if (b->core.qual > q) b->core.qual = q; + } if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm); } samwrite(fpout, b); diff --git a/sam/bam_pileup.c b/sam/bam_pileup.c index 3c41a16..3e26f74 100644 --- a/sam/bam_pileup.c +++ b/sam/bam_pileup.c @@ -4,9 +4,16 @@ #include #include "sam.h" +typedef struct { + int k, x, y, end; +} cstate_t; + +static cstate_t g_cstate_null = { -1, 0, 0, 0 }; + typedef struct __linkbuf_t { bam1_t b; uint32_t beg, end; + cstate_t s; struct __linkbuf_t *next; } lbnode_t; @@ -53,68 +60,86 @@ static inline void mp_free(mempool_t *mp, lbnode_t *p) /* --- BEGIN: Auxiliary functions */ -static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos) +/* s->k: the index of the CIGAR operator that has just been processed. + s->x: the reference coordinate of the start of s->k + s->y: the query coordiante of the start of s->k + */ +static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s) { - unsigned k; +#define _cop(c) ((c)&BAM_CIGAR_MASK) +#define _cln(c) ((c)>>BAM_CIGAR_SHIFT) + bam1_t *b = p->b; bam1_core_t *c = &b->core; - uint32_t x = c->pos, y = 0; - int ret = 1, is_restart = 1; - - if (c->flag&BAM_FUNMAP) return 0; // unmapped read - assert(x <= pos); // otherwise a bug - p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0; - for (k = 0; k < c->n_cigar; ++k) { - int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation - int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length - if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip - if (x + l > pos) { // overlap with pos - p->indel = p->is_del = 0; - p->qpos = y + (pos - x); - if (x == pos && is_restart) p->is_head = 1; - if (x + l - 1 == pos) { // come to the end of a match - int has_next_match = 0; - unsigned i; - for (i = k + 1; i < c->n_cigar; ++i) { - uint32_t cigar = bam1_cigar(b)[i]; - int opi = cigar&BAM_CIGAR_MASK; - if (opi == BAM_CMATCH) { - has_next_match = 1; - break; - } else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break; - } - if (!has_next_match) p->is_tail = 1; - if (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s) - uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR - int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation - if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del - else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins - else if (op_next == BAM_CPAD && k + 2 < c->n_cigar) { // no working for adjacent padding - cigar = bam1_cigar(b)[k+2]; op_next = cigar&BAM_CIGAR_MASK; - if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del - else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins - } - } + uint32_t *cigar = bam1_cigar(b); + int k, is_head = 0; + // determine the current CIGAR operation +// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y); + if (s->k == -1) { // never processed + is_head = 1; + if (c->n_cigar == 1) { // just one operation, save a loop + if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0; + } else { // find the first match or deletion + for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { + int op = _cop(cigar[k]); + int l = _cln(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CDEL) break; + else if (op == BAM_CREF_SKIP) s->x += l; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; + } + assert(k < c->n_cigar); + s->k = k; + } + } else { // the read has been processed before + int op, l = _cln(cigar[s->k]); + if (pos - s->x >= l) { // jump to the next operation + assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case + op = _cop(cigar[s->k+1]); + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) { // jump to the next without a loop + if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; + s->x += l; + ++s->k; + } else { // find the next M/D/N + if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l; + s->x += l; + for (k = s->k + 1; k < c->n_cigar; ++k) { + op = _cop(cigar[k]), l = _cln(cigar[k]); + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) break; + else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; } + s->k = k; } - x += l; y += l; - } else if (op == BAM_CDEL) { // then set ->is_del - if (x + l > pos) { - p->indel = 0; p->is_del = 1; - p->qpos = y + (pos - x); + assert(s->k < c->n_cigar); // otherwise a bug + } // else, do nothing + } + { // collect pileup information + int op, l; + op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); + p->is_del = p->indel = p->is_refskip = 0; + if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation + int op2 = _cop(cigar[s->k+1]); + int l2 = _cln(cigar[s->k+1]); + if (op2 == BAM_CDEL) p->indel = -(int)l2; + else if (op2 == BAM_CINS) p->indel = l2; + else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding + int l3 = 0; + for (k = s->k + 2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CINS) l3 += l2; + else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP) break; + } + if (l3 > 0) p->indel = l3; } - x += l; - } else if (op == BAM_CREF_SKIP) x += l; - else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l; - if (is_restart) is_restart ^= (op == BAM_CMATCH); - else is_restart ^= (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP); - if (x > pos) { - if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all - break; } + if (op == BAM_CMATCH) { + p->qpos = s->y + (pos - s->x); + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { + p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! + p->is_refskip = (op == BAM_CREF_SKIP); + } // cannot be other operations; otherwise a bug + p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); } - assert(x > pos); // otherwise a bug - return ret; + return 1; } /* --- END: Auxiliary functions */ @@ -127,7 +152,7 @@ struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail, *dummy; int32_t tid, pos, max_tid, max_pos; - int is_eof, flag_mask, max_plp, error; + int is_eof, flag_mask, max_plp, error, maxcnt; bam_pileup1_t *plp; // for the "auto" interface only bam1_t *b; @@ -144,6 +169,7 @@ bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) iter->dummy = mp_alloc(iter->mp); iter->max_tid = iter->max_pos = -1; iter->flag_mask = BAM_DEF_MASK; + iter->maxcnt = 8000; if (func) { iter->func = func; iter->data = data; @@ -183,7 +209,7 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_ iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); } iter->plp[n_plp].b = &p->b; - if (resolve_cigar(iter->plp + n_plp, iter->pos)) ++n_plp; // skip the read if we are looking at ref-skip + if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... } } iter->head = iter->dummy->next; // dummy->next may be changed @@ -215,8 +241,10 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) if (b) { if (b->core.tid < 0) return 0; if (b->core.flag & iter->flag_mask) return 0; + if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0; bam_copy1(&iter->tail->b, b); iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b)); + iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t if (b->core.tid < iter->max_tid) { fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n"); iter->error = 1; @@ -241,7 +269,7 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ const bam_pileup1_t *plp; if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; - else { + else { // no pileup line can be obtained; read alignments *_n_plp = 0; if (iter->is_eof) return 0; while (iter->func(iter->data, iter->b) >= 0) { @@ -250,6 +278,7 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ return 0; } if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + // otherwise no pileup line can be returned; read the next alignment. } bam_plp_push(iter, 0); if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; @@ -276,6 +305,11 @@ void bam_plp_set_mask(bam_plp_t iter, int mask) iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask); } +void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) +{ + iter->maxcnt = maxcnt; +} + /***************** * callback APIs * *****************/ @@ -363,6 +397,13 @@ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) return iter; } +void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) +{ + int i; + for (i = 0; i < iter->n; ++i) + iter->iter[i]->maxcnt = maxcnt; +} + void bam_mplp_destroy(bam_mplp_t iter) { int i; @@ -387,7 +428,7 @@ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_p if (new_min == (uint64_t)-1) return 0; *_tid = new_min>>32; *_pos = (uint32_t)new_min; for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { + if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line" n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; ++ret; } else n_plp[i] = 0, plp[i] = 0; diff --git a/sam/bam_plcmd.c b/sam/bam_plcmd.c index 6804795..002297a 100644 --- a/sam/bam_plcmd.c +++ b/sam/bam_plcmd.c @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include "sam.h" #include "faidx.h" #include "bam_maqcns.h" @@ -22,6 +24,7 @@ KHASH_MAP_INIT_INT64(64, indel_list_t) #define BAM_PLF_1STBASE 0x80 #define BAM_PLF_ALLBASE 0x100 #define BAM_PLF_READPOS 0x200 +#define BAM_PLF_NOBAQ 0x400 typedef struct { bam_header_t *h; @@ -32,6 +35,7 @@ typedef struct { uint32_t format; int tid, len, last_pos; int mask; + int capQ_thres, min_baseQ; int max_depth; // for indel calling, ignore reads with the depth too high. 0 for unlimited char *ref; glfFile fp_glf; // for glf output only @@ -89,6 +93,21 @@ static khash_t(64) *load_pos(const char *fn, bam_header_t *h) return hash; } +static inline int printw(int c, FILE *fp) +{ + char buf[16]; + int l, x; + if (c == 0) return fputc('0', fp); + for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; + if (c < 0) buf[l++] = '-'; + buf[l] = 0; + for (x = 0; x < l/2; ++x) { + int y = buf[x]; buf[x] = buf[l-1-x]; buf[l-1-x] = y; + } + fputs(buf, fp); + return 0; +} + // an analogy to pileup_func() below static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) { @@ -158,29 +177,38 @@ static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, return 0; } -static void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) +static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref) { - if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33); + int j; + if (p->is_head) { + putchar('^'); + putchar(p->b->core.qual > 93? 126 : p->b->core.qual + 33); + } if (!p->is_del) { - int j, rb, c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; - rb = (ref && pos < ref_len)? ref[pos] : 'N'; - if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; - else c = bam1_strand(p->b)? tolower(c) : toupper(c); + int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; + if (ref) { + int rb = pos < ref_len? ref[pos] : 'N'; + if (c == '=' || bam_nt16_table[c] == bam_nt16_table[rb]) c = bam1_strand(p->b)? ',' : '.'; + else c = bam1_strand(p->b)? tolower(c) : toupper(c); + } else { + if (c == '=') c = bam1_strand(p->b)? ',' : '.'; + else c = bam1_strand(p->b)? tolower(c) : toupper(c); + } putchar(c); - if (p->indel > 0) { - printf("+%d", p->indel); - for (j = 1; j <= p->indel; ++j) { - c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; - putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); - } - } else if (p->indel < 0) { - printf("%d", p->indel); - for (j = 1; j <= -p->indel; ++j) { - c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; - putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); - } + } else putchar(p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*'); + if (p->indel > 0) { + putchar('+'); printw(p->indel, stdout); + for (j = 1; j <= p->indel; ++j) { + int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)]; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); + } + } else if (p->indel < 0) { + printw(p->indel, stdout); + for (j = 1; j <= -p->indel; ++j) { + int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; + putchar(bam1_strand(p->b)? tolower(c) : toupper(c)); } - } else putchar('*'); + } if (p->is_tail) putchar('$'); } @@ -232,7 +260,11 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p } rmsQ = (uint64_t)(sqrt((double)rmsQ / n) + .499); cns = b<<28 | 0xf<<24 | rmsQ<<16 | 60<<8; - } else cns = bam_maqcns_call(n, pu, d->c); + } else { + glf1_t *g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); + cns = g->depth == 0? (0xfu<<28 | 0xf<<24) : glf2cns(g, (int)(d->c->q_r + .499)); + free(g); + } } if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels int m = (!d->max_depth || d->max_depth>n) ? n : d->max_depth; @@ -242,7 +274,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p } // when only variant sites are asked for, test if the site is a variant if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) { - if (!(bam_nt16_table[rb] != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP + if (!(bam_nt16_table[rb] != 15 && cns>>28 != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel if (r) bam_maqindel_ret_destroy(r); return 0; @@ -250,30 +282,29 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p } } // print the first 3 columns - printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb); + fputs(d->h->target_name[tid], stdout); putchar('\t'); + printw(pos+1, stdout); putchar('\t'); putchar(rb); putchar('\t'); // print consensus information if required if (d->format & BAM_PLF_CNS) { - int ref_q, rb4 = bam_nt16_table[rb]; - ref_q = 0; - if (rb4 != 15 && cns>>28 != 15 && cns>>28 != rb4) { // a SNP - ref_q = ((cns>>24&0xf) == rb4)? cns>>8&0xff : (cns>>8&0xff) + (cns&0xff); - if (ref_q > 255) ref_q = 255; - } - rms_mapq = cns>>16&0xff; - printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[cns>>28], cns>>8&0xff, ref_q, rms_mapq); + putchar(bam_nt16_rev_table[cns>>28]); putchar('\t'); + printw(cns>>8&0xff, stdout); putchar('\t'); + printw(cns&0xff, stdout); putchar('\t'); + printw(cns>>16&0xff, stdout); putchar('\t'); } // print pileup sequences - printf("%d\t", n); - rms_aux = 0; // we need to recalculate rms_mapq when -c is not flagged on the command line - for (i = 0; i < n; ++i) { - const bam_pileup1_t *p = pu + i; - int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; - rms_aux += tmp * tmp; - pileup_seq(p, pos, d->len, d->ref); - } + printw(n, stdout); putchar('\t'); + for (i = 0; i < n; ++i) + pileup_seq(pu + i, pos, d->len, d->ref); // finalize rms_mapq - rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499); - if (rms_mapq < 0) rms_mapq = rms_aux; + if (d->format & BAM_PLF_CNS) { + for (i = rms_aux = 0; i < n; ++i) { + const bam_pileup1_t *p = pu + i; + int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ; + rms_aux += tmp * tmp; + } + rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499); + if (rms_mapq < 0) rms_mapq = rms_aux; + } putchar('\t'); // print quality for (i = 0; i < n; ++i) { @@ -312,7 +343,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p for (i = 0; i < n; ++i) { int x = pu[i].qpos; int l = pu[i].b->core.l_qseq; - printf("%d,", x < l/2? x+1 : -((l-1)-x+1)); + printw(x < l/2? x+1 : -((l-1)-x+1), stdout); putchar(','); } } putchar('\n'); @@ -338,15 +369,17 @@ int bam_pileup(int argc, char *argv[]) int c, is_SAM = 0; char *fn_list = 0, *fn_fa = 0, *fn_pos = 0; pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t)); - d->max_depth = 0; - d->tid = -1; d->mask = BAM_DEF_MASK; + d->max_depth = 1024; d->tid = -1; d->mask = BAM_DEF_MASK; d->min_baseQ = 13; d->c = bam_maqcns_init(); - d->c->is_soap = 1; // change the default model + d->c->errmod = BAM_ERRMOD_MAQ2; // change the default model d->ido = bam_maqindel_opt_init(); - while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PA")) >= 0) { + while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PAQ:C:B")) >= 0) { switch (c) { - case 'a': d->c->is_soap = 1; break; - case 'A': d->c->is_soap = 0; break; + case 'Q': d->c->min_baseQ = atoi(optarg); break; + case 'C': d->capQ_thres = atoi(optarg); break; + case 'B': d->format |= BAM_PLF_NOBAQ; break; + case 'a': d->c->errmod = BAM_ERRMOD_SOAP; break; + case 'A': d->c->errmod = BAM_ERRMOD_MAQ; break; case 's': d->format |= BAM_PLF_SIMPLE; break; case 't': fn_list = strdup(optarg); break; case 'l': fn_pos = strdup(optarg); break; @@ -375,29 +408,34 @@ int bam_pileup(int argc, char *argv[]) default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1; } } + if (d->c->errmod != BAM_ERRMOD_MAQ2) d->c->theta += 0.02; + if (d->c->theta > 1.0) d->c->theta = 1.0; if (fn_list) is_SAM = 1; if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools pileup [options] |\n\n"); fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n"); fprintf(stderr, " -S the input is in SAM\n"); - fprintf(stderr, " -A use the MAQ model for SNP calling\n"); + fprintf(stderr, " -B disable BAQ computation\n"); + fprintf(stderr, " -A use the original MAQ model for SNP calling (DEPRECATED)\n"); fprintf(stderr, " -2 output the 2nd best call and quality\n"); fprintf(stderr, " -i only show lines/consensus with indels\n"); - fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask); + fprintf(stderr, " -Q INT min base quality (possibly capped by BAQ) [%d]\n", d->c->min_baseQ); + fprintf(stderr, " -C INT coefficient for adjusting mapQ of poor mappings [%d]\n", d->capQ_thres); + fprintf(stderr, " -m INT filtering reads with bits in INT [0x%x]\n", d->mask); fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ); - fprintf(stderr, " -d INT limit maximum depth for indels [unlimited]\n"); + fprintf(stderr, " -d INT limit maximum depth for indels [%d]\n", d->max_depth); fprintf(stderr, " -t FILE list of reference sequences (force -S)\n"); fprintf(stderr, " -l FILE list of sites at which pileup is output\n"); fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n"); - fprintf(stderr, " -c output the SOAPsnp consensus sequence\n"); + fprintf(stderr, " -c compute the consensus sequence\n"); fprintf(stderr, " -v print variants only (for -c)\n"); - fprintf(stderr, " -g output in the GLFv3 format (suppressing -c/-i/-s)\n"); - fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta); - fprintf(stderr, " -N INT number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap); - fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate); - fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel); - fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel); + fprintf(stderr, " -g output in the GLFv3 format (DEPRECATED)\n"); + fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c) [%.4g]\n", d->c->theta); + fprintf(stderr, " -N INT number of haplotypes in the sample (for -c) [%d]\n", d->c->n_hap); + fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c) [%.4g]\n", d->c->het_rate); + fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c) [%.4g]\n", d->ido->r_indel); + fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c) [%d]\n", d->ido->q_indel); fprintf(stderr, "\n"); free(fn_list); free(fn_fa); free(d); return 1; @@ -425,7 +463,43 @@ int bam_pileup(int argc, char *argv[]) } d->h = fp->header; if (fn_pos) d->hash = load_pos(fn_pos, d->h); - sampileup(fp, d->mask, pileup_func, d); + { // run pileup + extern int bam_prob_realn(bam1_t *b, const char *ref); + extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres); + bam1_t *b; + int ret, tid, pos, n_plp; + bam_plp_t iter; + const bam_pileup1_t *plp; + b = bam_init1(); + iter = bam_plp_init(0, 0); + bam_plp_set_mask(iter, d->mask); + while ((ret = samread(fp, b)) >= 0) { + int skip = 0; + if ((int)b->core.tid < 0) break; + // update d->ref if necessary + if (d->fai && (int)b->core.tid != d->tid) { + free(d->ref); + d->ref = faidx_fetch_seq(d->fai, d->h->target_name[b->core.tid], 0, 0x7fffffff, &d->len); + d->tid = b->core.tid; + } + if (d->ref && (d->format&BAM_PLF_CNS) && !(d->format&BAM_PLF_NOBAQ)) bam_prob_realn(b, d->ref); + if (d->ref && (d->format&BAM_PLF_CNS) && d->capQ_thres > 10) { + int q = bam_cap_mapQ(b, d->ref, d->capQ_thres); + if (q < 0) skip = 1; + else if (b->core.qual > q) b->core.qual = q; + } else if (b->core.flag&BAM_FUNMAP) skip = 1; + else if ((d->format&BAM_PLF_CNS) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; + if (skip) continue; + bam_plp_push(iter, b); + while ((plp = bam_plp_next(iter, &tid, &pos, &n_plp)) != 0) + pileup_func(tid, pos, n_plp, plp, d); + } + bam_plp_push(iter, 0); + while ((plp = bam_plp_next(iter, &tid, &pos, &n_plp)) != 0) + pileup_func(tid, pos, n_plp, plp, d); + bam_plp_destroy(iter); + bam_destroy1(b); + } samclose(fp); // d->h will be destroyed here } @@ -448,41 +522,129 @@ int bam_pileup(int argc, char *argv[]) * mpileup * ***********/ +#include +#include "bam2bcf.h" +#include "sample.h" + +#define MPLP_GLF 0x10 +#define MPLP_NO_COMP 0x20 +#define MPLP_NO_ORPHAN 0x40 +#define MPLP_REALN 0x80 +#define MPLP_FMT_DP 0x100 +#define MPLP_FMT_SP 0x200 +#define MPLP_NO_INDEL 0x400 + typedef struct { - char *reg; + int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth; + int openQ, extQ, tandemQ; + char *reg, *fn_pos, *pl_list; faidx_t *fai; + kh_64_t *hash; } mplp_conf_t; typedef struct { bamFile fp; bam_iter_t iter; + int min_mq, flag, ref_id, capQ_thres; + char *ref; } mplp_aux_t; +typedef struct { + int n; + int *n_plp, *m_plp; + bam_pileup1_t **plp; +} mplp_pileup_t; + static int mplp_func(void *data, bam1_t *b) { + extern int bam_realn(bam1_t *b, const char *ref); + extern int bam_prob_realn_core(bam1_t *b, const char *ref, int); + extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres); mplp_aux_t *ma = (mplp_aux_t*)data; - if (ma->iter) return bam_iter_read(ma->fp, ma->iter, b); - return bam_read1(ma->fp, b); + int ret, skip = 0; + do { + int has_ref; + ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b); + if (ret < 0) break; + has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0; + skip = 0; + if (has_ref && (ma->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, 1); + if (has_ref && ma->capQ_thres > 10) { + int q = bam_cap_mapQ(b, ma->ref, ma->capQ_thres); + if (q < 0) skip = 1; + else if (b->core.qual > q) b->core.qual = q; + } else if (b->core.flag&BAM_FUNMAP) skip = 1; + else if (b->core.qual < ma->min_mq) skip = 1; + else if ((ma->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1; + } while (skip); + return ret; +} + +static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf, + int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp) +{ + int i, j; + memset(m->n_plp, 0, m->n * sizeof(int)); + for (i = 0; i < n; ++i) { + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + uint8_t *q; + int id = -1; + q = bam_aux_get(p->b, "RG"); + if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf); + if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf); + assert(id >= 0 && id < m->n); + if (m->n_plp[id] == m->m_plp[id]) { + m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8; + m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]); + } + m->plp[id][m->n_plp[id]++] = *p; + } + } } static int mpileup(mplp_conf_t *conf, int n, char **fn) { + extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); + extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; - int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid; + int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; - // allocate + khash_t(64) *hash = 0; + void *rghash = 0; + + bcf_callaux_t *bca = 0; + bcf_callret1_t *bcr = 0; + bcf_call_t bc; + bcf_t *bp = 0; + bcf_hdr_t *bh = 0; + + bam_sample_t *sm = 0; + kstring_t buf; + mplp_pileup_t gplp; + + memset(&gplp, 0, sizeof(mplp_pileup_t)); + memset(&buf, 0, sizeof(kstring_t)); + memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(void*)); plp = calloc(n, sizeof(void*)); n_plp = calloc(n, sizeof(int*)); + sm = bam_smpl_init(); + // read the header and initialize data for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); - data[i]->fp = bam_open(fn[i], "r"); + data[i]->min_mq = conf->min_mq; + data[i]->flag = conf->flag; + data[i]->capQ_thres = conf->capQ_thres; + data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); h_tmp = bam_header_read(data[i]->fp); + bam_smpl_add(sm, fn[i], h_tmp->text); + rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; bam_index_t *idx; @@ -505,35 +667,124 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bam_header_destroy(h_tmp); } } - // mpileup + gplp.n = sm->n; + gplp.n_plp = calloc(sm->n, sizeof(int)); + gplp.m_plp = calloc(sm->n, sizeof(int)); + gplp.plp = calloc(sm->n, sizeof(void*)); + + fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); + if (conf->fn_pos) hash = load_pos(conf->fn_pos, h); + // write the VCF header + if (conf->flag & MPLP_GLF) { + kstring_t s; + bh = calloc(1, sizeof(bcf_hdr_t)); + s.l = s.m = 0; s.s = 0; + bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w"); + for (i = 0; i < h->n_targets; ++i) { + kputs(h->target_name[i], &s); + kputc('\0', &s); + } + bh->l_nm = s.l; + bh->name = malloc(s.l); + memcpy(bh->name, s.s, s.l); + s.l = 0; + for (i = 0; i < sm->n; ++i) { + kputs(sm->smpl[i], &s); kputc('\0', &s); + } + bh->l_smpl = s.l; + bh->sname = malloc(s.l); + memcpy(bh->sname, s.s, s.l); + bh->l_txt = 0; + free(s.s); + bcf_hdr_sync(bh); + bcf_hdr_write(bp, bh); + bca = bcf_call_init(-1., conf->min_baseQ); + bcr = calloc(sm->n, sizeof(bcf_callret1_t)); + bca->rghash = rghash; + bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; + } ref_tid = -1; ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); + max_depth = conf->max_depth; + if (max_depth * sm->n > 1<<20) + fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); + if (max_depth * sm->n < 8000) { + max_depth = 8000 / sm->n; + fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth); + } + bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested + if (hash) { + khint_t k; + k = kh_get(64, hash, (uint64_t)tid<<32 | pos); + if (k == kh_end(hash)) continue; + } if (tid != ref_tid) { - free(ref); + free(ref); ref = 0; if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len); + for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } - printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); - for (i = 0; i < n; ++i) { - int j; - printf("\t%d\t", n_plp[i]); - if (n_plp[i] == 0) printf("*\t*"); - else { - for (j = 0; j < n_plp[i]; ++j) - pileup_seq(plp[i] + j, pos, ref_len, ref); - putchar('\t'); - for (j = 0; j < n_plp[i]; ++j) { - const bam_pileup1_t *p = plp[i] + j; - int c = bam1_qual(p->b)[p->qpos] + 33; - if (c > 126) c = 126; - putchar(c); + if (conf->flag & MPLP_GLF) { + int _ref0, ref16; + bcf1_t *b = calloc(1, sizeof(bcf1_t)); + group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp); + _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; + ref16 = bam_nt16_table[_ref0]; + for (i = 0; i < gplp.n; ++i) + bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); + bcf_call_combine(gplp.n, bcr, ref16, &bc); + bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, + (conf->flag&MPLP_FMT_SP), 0, 0); + bcf_write(bp, bh, b); + bcf_destroy(b); + // call indels + if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { + for (i = 0; i < gplp.n; ++i) + bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); + if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { + b = calloc(1, sizeof(bcf1_t)); + bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, + (conf->flag&MPLP_FMT_SP), bca, ref); + bcf_write(bp, bh, b); + bcf_destroy(b); } } + } else { + printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); + for (i = 0; i < n; ++i) { + int j; + printf("\t%d\t", n_plp[i]); + if (n_plp[i] == 0) printf("*\t*"); + else { + for (j = 0; j < n_plp[i]; ++j) + pileup_seq(plp[i] + j, pos, ref_len, ref); + putchar('\t'); + for (j = 0; j < n_plp[i]; ++j) { + const bam_pileup1_t *p = plp[i] + j; + int c = bam1_qual(p->b)[p->qpos] + 33; + if (c > 126) c = 126; + putchar(c); + } + } + } + putchar('\n'); } - putchar('\n'); } + + bcf_close(bp); + bam_smpl_destroy(sm); free(buf.s); + for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); + free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); + bcf_call_del_rghash(rghash); + if (hash) { // free the hash table + khint_t k; + for (k = kh_begin(hash); k < kh_end(hash); ++k) + if (kh_exist(hash, k)) free(kh_val(hash, k)); + kh_destroy(64, hash); + } + bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); for (i = 0; i < n; ++i) { @@ -545,26 +796,133 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) return 0; } +#define MAX_PATH_LEN 1024 +int read_file_list(const char *file_list,int *n,char **argv[]) +{ + char buf[MAX_PATH_LEN]; + int len, nfiles; + char **files; + + FILE *fh = fopen(file_list,"r"); + if ( !fh ) + { + fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); + return 1; + } + + // Speed is not an issue here, determine the number of files by reading the file twice + nfiles = 0; + while ( fgets(buf,MAX_PATH_LEN,fh) ) nfiles++; + + if ( fseek(fh, 0L, SEEK_SET) ) + { + fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); + return 1; + } + + files = calloc(nfiles,sizeof(char*)); + nfiles = 0; + while ( fgets(buf,MAX_PATH_LEN,fh) ) + { + len = strlen(buf); + while ( len>0 && isspace(buf[len-1]) ) len--; + if ( !len ) continue; + + files[nfiles] = malloc(sizeof(char)*(len+1)); + strncpy(files[nfiles],buf,len); + files[nfiles][len] = 0; + nfiles++; + } + fclose(fh); + if ( !nfiles ) + { + fprintf(stderr,"No files read from %s\n", file_list); + return 1; + } + *argv = files; + *n = nfiles; + return 0; +} +#undef MAX_PATH_LEN + int bam_mpileup(int argc, char *argv[]) { int c; + const char *file_list = NULL; + char **fn = NULL; + int nfiles = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); - while ((c = getopt(argc, argv, "f:r:")) >= 0) { + mplp.max_mq = 60; + mplp.min_baseQ = 13; + mplp.capQ_thres = 0; + mplp.max_depth = 250; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; + while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:b:P:o:e:h:I")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; break; - case 'r': mplp.reg = strdup(optarg); + case 'd': mplp.max_depth = atoi(optarg); break; + case 'r': mplp.reg = strdup(optarg); break; + case 'l': mplp.fn_pos = strdup(optarg); break; + case 'P': mplp.pl_list = strdup(optarg); break; + case 'g': mplp.flag |= MPLP_GLF; break; + case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; + case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; + case 'B': mplp.flag &= ~MPLP_REALN & ~MPLP_NO_ORPHAN; break; + case 'O': mplp.flag |= MPLP_NO_ORPHAN; break; + case 'R': mplp.flag |= MPLP_REALN; break; + case 'D': mplp.flag |= MPLP_FMT_DP; break; + case 'S': mplp.flag |= MPLP_FMT_SP; break; + case 'I': mplp.flag |= MPLP_NO_INDEL; break; + case 'C': mplp.capQ_thres = atoi(optarg); break; + case 'M': mplp.max_mq = atoi(optarg); break; + case 'q': mplp.min_mq = atoi(optarg); break; + case 'Q': mplp.min_baseQ = atoi(optarg); break; + case 'b': file_list = optarg; break; + case 'o': mplp.openQ = atoi(optarg); break; + case 'e': mplp.extQ = atoi(optarg); break; + case 'h': mplp.tandemQ = atoi(optarg); break; } } if (argc == 1) { - fprintf(stderr, "Usage: samtools mpileup [-r reg] [-f in.fa] in1.bam [in2.bam [...]]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); + fprintf(stderr, "Options: -f FILE reference sequence file [null]\n"); + fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); + fprintf(stderr, " -l FILE list of positions (format: chr pos) [null]\n"); + fprintf(stderr, " -b FILE list of input BAM files [null]\n"); + fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); + fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ); + fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq); + fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth); + fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); + fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); + fprintf(stderr, " -h INT coefficient for homopolyer errors [%d]\n", mplp.tandemQ); + fprintf(stderr, " -g generate BCF output\n"); + fprintf(stderr, " -u do not compress BCF output\n"); + fprintf(stderr, " -B disable BAQ computation\n"); + fprintf(stderr, " -D output per-sample DP\n"); + fprintf(stderr, " -S output per-sample SP (strand bias P-value, slow)\n"); + fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; } - mpileup(&mplp, argc - optind, argv + optind); - free(mplp.reg); + if ( file_list ) + { + if ( read_file_list(file_list,&nfiles,&fn) ) return 1; + mpileup(&mplp,nfiles,fn); + for (c=0; c #include #include +#include #include #include #include @@ -51,6 +52,14 @@ static inline int heap_lt(const heap1_t a, const heap1_t b) KSORT_INIT(heap, heap1_t, heap_lt) +static void swap_header_targets(bam_header_t *h1, bam_header_t *h2) +{ + bam_header_t t; + t.n_targets = h1->n_targets, h1->n_targets = h2->n_targets, h2->n_targets = t.n_targets; + t.target_name = h1->target_name, h1->target_name = h2->target_name, h2->target_name = t.target_name; + t.target_len = h1->target_len, h1->target_len = h2->target_len, h2->target_len = t.target_len; +} + static void swap_header_text(bam_header_t *h1, bam_header_t *h2) { int tempi; @@ -59,6 +68,9 @@ static void swap_header_text(bam_header_t *h1, bam_header_t *h2) temps = h1->text, h1->text = h2->text, h2->text = temps; } +#define MERGE_RG 1 +#define MERGE_UNCOMP 2 + /*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @@ -71,7 +83,8 @@ static void swap_header_text(bam_header_t *h1, bam_header_t *h2) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ -void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int add_RG) +int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, + int flag, const char *reg) { bamFile fpout, *fp; heap1_t *heap; @@ -80,22 +93,25 @@ void bam_merge_core(int by_qname, const char *out, const char *headers, int n, c int i, j, *RG_len = 0; uint64_t idx = 0; char **RG = 0; + bam_iter_t *iter = 0; if (headers) { tamFile fpheaders = sam_open(headers); if (fpheaders == 0) { - fprintf(stderr, "[bam_merge_core] Cannot open file `%s'. Continue anyway.\n", headers); - } else { - hheaders = sam_header_read(fpheaders); - sam_close(fpheaders); + const char *message = strerror(errno); + fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); + return -1; } + hheaders = sam_header_read(fpheaders); + sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (bamFile*)calloc(n, sizeof(bamFile)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); + iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t)); // prepare RG tag - if (add_RG) { + if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(void*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { @@ -111,7 +127,6 @@ void bam_merge_core(int by_qname, const char *out, const char *headers, int n, c } // read the first for (i = 0; i != n; ++i) { - heap1_t *h; bam_header_t *hin; fp[i] = bam_open(fn[i], "r"); if (fp[i] == 0) { @@ -120,61 +135,97 @@ void bam_merge_core(int by_qname, const char *out, const char *headers, int n, c for (j = 0; j < i; ++j) bam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak - return; + return -1; } hin = bam_header_read(fp[i]); - if (i == 0) { // the first SAM + if (i == 0) { // the first BAM hout = hin; - if (hheaders) { - // If the text headers to be swapped in include any @SQ headers, - // check that they are consistent with the existing binary list - // of reference information. - if (hheaders->n_targets > 0) { - if (hout->n_targets != hheaders->n_targets) - fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences", headers); - for (j = 0; j < hout->n_targets; ++j) - if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) - fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence", hheaders->target_name[j], headers); - } - swap_header_text(hout, hheaders); - bam_header_destroy(hheaders); - hheaders = NULL; - } } else { // validate multiple baf - if (hout->n_targets != hin->n_targets) { - fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]); - exit(1); - } - for (j = 0; j < hout->n_targets; ++j) { - if (strcmp(hout->target_name[j], hin->target_name[j])) { - fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Abort!\n", + int min_n_targets = hout->n_targets; + if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets; + + for (j = 0; j < min_n_targets; ++j) + if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) { + fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n", hout->target_name[j], hin->target_name[j], fn[i]); - exit(1); + return -1; } + + // If this input file has additional target reference sequences, + // add them to the headers to be output + if (hin->n_targets > hout->n_targets) { + swap_header_targets(hout, hin); + // FIXME Possibly we should also create @SQ text headers + // for the newly added reference sequences } + bam_header_destroy(hin); } - h = heap + i; + } + + if (hheaders) { + // If the text headers to be swapped in include any @SQ headers, + // check that they are consistent with the existing binary list + // of reference information. + if (hheaders->n_targets > 0) { + if (hout->n_targets != hheaders->n_targets) { + fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers); + if (!reg) return -1; + } + for (j = 0; j < hout->n_targets; ++j) + if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) { + fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers); + if (!reg) return -1; + } + } + + swap_header_text(hout, hheaders); + bam_header_destroy(hheaders); + } + + if (reg) { + int tid, beg, end; + if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) { + fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__); + return -1; + } + for (i = 0; i < n; ++i) { + bam_index_t *idx; + idx = bam_index_load(fn[i]); + iter[i] = bam_iter_query(idx, tid, beg, end); + bam_index_destroy(idx); + } + } + + for (i = 0; i < n; ++i) { + heap1_t *h = heap + i; h->i = i; h->b = (bam1_t*)calloc(1, sizeof(bam1_t)); - if (bam_read1(fp[i], h->b) >= 0) { + if (bam_iter_read(fp[i], iter[i], h->b) >= 0) { h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b); h->idx = idx++; } else h->pos = HEAP_EMPTY; } - fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); - assert(fpout); + if (flag & MERGE_UNCOMP) { + fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu"); + } else { + fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w"); + } + if (fpout == 0) { + fprintf(stderr, "[%s] fail to create the output file.\n", __func__); + return -1; + } bam_header_write(fpout, hout); bam_header_destroy(hout); ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; - if (add_RG && bam_aux_get(b, "RG") == 0) + if ((flag & MERGE_RG) && bam_aux_get(b, "RG") == 0) bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); bam_write1_core(fpout, &b->core, b->data_len, b->data); - if ((j = bam_read1(fp[heap->i], b)) >= 0) { + if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) { heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b); heap->idx = idx++; } else if (j == -1) { @@ -185,24 +236,31 @@ void bam_merge_core(int by_qname, const char *out, const char *headers, int n, c ks_heapadjust(heap, 0, n, heap); } - if (add_RG) { + if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } - for (i = 0; i != n; ++i) bam_close(fp[i]); + for (i = 0; i != n; ++i) { + bam_iter_destroy(iter[i]); + bam_close(fp[i]); + } bam_close(fpout); - free(fp); free(heap); + free(fp); free(heap); free(iter); + return 0; } + int bam_merge(int argc, char *argv[]) { - int c, is_by_qname = 0, add_RG = 0; - char *fn_headers = NULL; + int c, is_by_qname = 0, flag = 0, ret = 0; + char *fn_headers = NULL, *reg = 0; - while ((c = getopt(argc, argv, "h:nr")) >= 0) { + while ((c = getopt(argc, argv, "h:nruR:")) >= 0) { switch (c) { - case 'r': add_RG = 1; break; + case 'r': flag |= MERGE_RG; break; case 'h': fn_headers = strdup(optarg); break; case 'n': is_by_qname = 1; break; + case 'u': flag |= MERGE_UNCOMP; break; + case 'R': reg = strdup(optarg); break; } } if (optind + 2 >= argc) { @@ -210,15 +268,18 @@ int bam_merge(int argc, char *argv[]) fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] [...]\n\n"); fprintf(stderr, "Options: -n sort by read names\n"); fprintf(stderr, " -r attach RG tag (inferred from file names)\n"); + fprintf(stderr, " -u uncompressed BAM output\n"); + fprintf(stderr, " -R STR merge file in the specified region STR [all]\n"); fprintf(stderr, " -h FILE copy the header in FILE to [in1.bam]\n\n"); fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n"); fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n"); fprintf(stderr, " the header dictionary in merging.\n\n"); return 1; } - bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, add_RG); + if (bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg) < 0) ret = 1; + free(reg); free(fn_headers); - return 0; + return ret; } typedef bam1_t *bam1_p; @@ -313,7 +374,7 @@ void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size fns[i] = (char*)calloc(strlen(prefix) + 20, 1); sprintf(fns[i], "%s.%.4d.bam", prefix, i); } - bam_merge_core(is_by_qname, fnout, 0, n, fns, 0); + bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0); free(fnout); for (i = 0; i < n; ++i) { unlink(fns[i]); diff --git a/sam/bam_tview.c b/sam/bam_tview.c index 7b326fc..e48afa7 100644 --- a/sam/bam_tview.c +++ b/sam/bam_tview.c @@ -109,7 +109,7 @@ int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; } } - } else c = '*'; + } else c = p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*'; } else { // padding if (j > p->indel) c = '*'; else { // insertion @@ -292,7 +292,7 @@ static void tv_win_goto(tview_t *tv, int *tid, int *pos) } else if (c == KEY_ENTER || c == '\012' || c == '\015') { int _tid = -1, _beg, _end; if (str[0] == '=') { - _beg = strtol(str+1, &p, 10); + _beg = strtol(str+1, &p, 10) - 1; if (_beg > 0) { *pos = _beg; return; diff --git a/sam/bamtk.c b/sam/bamtk.c index 94c4d3f..79635d6 100644 --- a/sam/bamtk.c +++ b/sam/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.8 (r613)" +#define PACKAGE_VERSION "0.1.12a (r862)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/sam/bgzf.c b/sam/bgzf.c index a6923da..66d6b02 100644 --- a/sam/bgzf.c +++ b/sam/bgzf.c @@ -177,7 +177,7 @@ BGZF* bgzf_open(const char* __restrict path, const char* __restrict mode) { BGZF* fp = NULL; - if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */ + if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */ #ifdef _USE_KNETFILE knetFile *file = knet_open(path, mode); if (file == 0) return 0; @@ -194,14 +194,14 @@ bgzf_open(const char* __restrict path, const char* __restrict mode) if (fd == -1) return 0; fp = open_read(fd); #endif - } else if (mode[0] == 'w' || mode[0] == 'W') { + } else if (strchr(mode, 'w') || strchr(mode, 'W')) { int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC; #ifdef _WIN32 oflag |= O_BINARY; #endif fd = open(path, oflag, 0666); if (fd == -1) return 0; - fp = open_write(fd, strstr(mode, "u")? 1 : 0); + fp = open_write(fd, strchr(mode, 'u')? 1 : 0); } if (fp != NULL) fp->owned_file = 1; return fp; diff --git a/sam/bgzip.c b/sam/bgzip.c index ac2a98e..ebcafa2 100644 --- a/sam/bgzip.c +++ b/sam/bgzip.c @@ -27,22 +27,24 @@ #include #include #include +#include +#include #include "bgzf.h" static const int WINDOW_SIZE = 64 * 1024; static int bgzip_main_usage() { - printf("\n"); - printf("Usage: bgzip [options] [file] ...\n\n"); - printf("Options: -c write on standard output, keep original files unchanged\n"); - printf(" -d decompress\n"); - // printf(" -l list compressed file contents\n"); - printf(" -b INT decompress at virtual file pointer INT\n"); - printf(" -s INT decompress INT bytes in the uncompressed file\n"); - printf(" -h give this help\n"); - printf("\n"); - return 0; + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n"); + fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n"); + fprintf(stderr, " -d decompress\n"); + fprintf(stderr, " -f overwrite files without asking\n"); + fprintf(stderr, " -b INT decompress at virtual file pointer INT\n"); + fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n"); + fprintf(stderr, " -h give this help\n"); + fprintf(stderr, "\n"); + return 1; } static int write_open(const char *fn, int is_forced) @@ -51,129 +53,154 @@ static int write_open(const char *fn, int is_forced) char c; if (!is_forced) { if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) { - printf("bgzip: %s already exists; do you wish to overwrite (y or n)? ", fn); + fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn); scanf("%c", &c); if (c != 'Y' && c != 'y') { - printf("bgzip: not overwritten\n"); + fprintf(stderr, "[bgzip] not overwritten\n"); exit(1); } } } if (fd < 0) { if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) { - fprintf(stderr, "bgzip: %s: Fail to write\n", fn); + fprintf(stderr, "[bgzip] %s: Fail to write\n", fn); exit(1); } } return fd; } -static -void -fail(BGZF* fp) +static void fail(BGZF* fp) { - printf("Error: %s\n", fp->error); + fprintf(stderr, "Error: %s\n", fp->error); exit(1); } int main(int argc, char **argv) { int c, compress, pstdout, is_forced; - BGZF *rz; + BGZF *fp; void *buffer; long start, end, size; compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; - while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){ + while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ switch(c){ case 'h': return bgzip_main_usage(); case 'd': compress = 0; break; case 'c': pstdout = 1; break; - // case 'l': compress = 2; break; case 'b': start = atol(optarg); break; case 's': size = atol(optarg); break; case 'f': is_forced = 1; break; } } if (size >= 0) end = start + size; - if(end >= 0 && end < start){ - fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end); + if (end >= 0 && end < start) { + fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } - if(compress == 1){ - int f_src, f_dst = -1; - if(argc > optind){ - if((f_src = open(argv[optind], O_RDONLY)) < 0){ - fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]); + if (compress == 1) { + struct stat sbuf; + int f_src = fileno(stdin); + int f_dst = fileno(stdout); + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + + if ((f_src = open(argv[optind], O_RDONLY)) < 0) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } - if(pstdout){ + + if (pstdout) f_dst = fileno(stdout); - } else { - char *name = malloc(sizeof(strlen(argv[optind]) + 5)); + else + { + char *name = malloc(strlen(argv[optind]) + 5); strcpy(name, argv[optind]); strcat(name, ".gz"); f_dst = write_open(name, is_forced); if (f_dst < 0) return 1; free(name); } - } else if(pstdout){ - f_src = fileno(stdin); - f_dst = fileno(stdout); - } else return bgzip_main_usage(); - rz = bgzf_fdopen(f_dst, "w"); + } + else if (!pstdout && isatty(fileno((FILE *)stdout)) ) + return bgzip_main_usage(); + + fp = bgzf_fdopen(f_dst, "w"); buffer = malloc(WINDOW_SIZE); - while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) { - if (bgzf_write(rz, buffer, c) < 0) { - fail(rz); - } - } - // f_dst will be closed here - if (bgzf_close(rz) < 0) { - fail(rz); - } - if (argc > optind) unlink(argv[optind]); + while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) fail(fp); + // f_dst will be closed here + if (bgzf_close(fp) < 0) fail(fp); + if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; } else { - if(argc <= optind) return bgzip_main_usage(); - int f_dst; - if (argc > optind && !pstdout) { - char *name; - if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) { - printf("bgzip: %s: unknown suffix -- ignored\n", argv[optind]); - return 1; - } - name = strdup(argv[optind]); - name[strlen(name) - 3] = '\0'; - f_dst = write_open(name, is_forced); - free(name); - } else f_dst = fileno(stdout); - rz = bgzf_open(argv[optind], "r"); - if (rz == NULL) { - printf("Could not open file: %s\n", argv[optind]); - return 1; - } - buffer = malloc(WINDOW_SIZE); - if (bgzf_seek(rz, start, SEEK_SET) < 0) { - fail(rz); - } - while(1){ - if(end < 0) c = bgzf_read(rz, buffer, WINDOW_SIZE); - else c = bgzf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); - if(c == 0) break; - if (c < 0) fail(rz); - start += c; - write(f_dst, buffer, c); - if(end >= 0 && start >= end) break; - } - free(buffer); - if (bgzf_close(rz) < 0) { - fail(rz); - } - if (!pstdout) unlink(argv[optind]); + struct stat sbuf; + int f_dst; + + if ( argc>optind ) + { + if ( stat(argv[optind],&sbuf)<0 ) + { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + char *name; + int len = strlen(argv[optind]); + if ( strcmp(argv[optind]+len-3,".gz") ) + { + fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); + return 1; + } + fp = bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); + return 1; + } + + if (pstdout) { + f_dst = fileno(stdout); + } + else { + name = strdup(argv[optind]); + name[strlen(name) - 3] = '\0'; + f_dst = write_open(name, is_forced); + free(name); + } + } + else if (!pstdout && isatty(fileno((FILE *)stdin)) ) + return bgzip_main_usage(); + else + { + f_dst = fileno(stdout); + fp = bgzf_fdopen(fileno(stdin), "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); + return 1; + } + } + buffer = malloc(WINDOW_SIZE); + if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) fail(fp); + start += c; + write(f_dst, buffer, c); + if (end >= 0 && start >= end) break; + } + free(buffer); + if (bgzf_close(fp) < 0) fail(fp); + if (!pstdout) unlink(argv[optind]); return 0; } } - diff --git a/sam/examples/Makefile b/sam/examples/Makefile index 8f0386f..ec976ae 100644 --- a/sam/examples/Makefile +++ b/sam/examples/Makefile @@ -1,4 +1,5 @@ -all:../libbam.a ../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz calDepth +all:../libbam.a ../samtools ../bcftools/bcftools \ + ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz ex1.bcf calDepth @echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo; ex1.fa.fai:ex1.fa @@ -18,7 +19,7 @@ ex1a.bam:ex1.bam ex1b.bam:ex1.bam ../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"b";print}}' | ../samtools view -bS - > $@ ex1f.rg: - (echo "@RG ID:ex1 LB:ex1"; echo "@RG ID:ex1a LB:ex1"; echo "@RG ID:ex1b LB:ex1b") > $@ + (echo "@RG ID:ex1 LB:ex1 SM:ex1"; echo "@RG ID:ex1a LB:ex1 SM:ex1"; echo "@RG ID:ex1b LB:ex1b SM:ex1b") > $@ ex1f.bam:ex1.bam ex1a.bam ex1b.bam ex1f.rg ../samtools merge -rh ex1f.rg $@ ex1.bam ex1a.bam ex1b.bam ex1f-rmduppe.bam:ex1f.bam @@ -26,6 +27,12 @@ ex1f-rmduppe.bam:ex1f.bam ex1f-rmdupse.bam:ex1f.bam ../samtools rmdup -S ex1f.bam $@ +ex1.bcf:ex1.bam ex1.fa.fai + ../samtools mpileup -gf ex1.fa ex1.bam > $@ + +../bcftools/bcftools: + (cd ../bcftools; make bcftools) + ../samtools: (cd ..; make samtools) @@ -36,4 +43,8 @@ calDepth:../libbam.a calDepth.c gcc -g -Wall -O2 -I.. calDepth.c -o $@ -lm -lz -L.. -lbam clean: - rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg \ No newline at end of file + rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg ex1.bcf + +# ../samtools pileup ex1.bam|perl -ape '$_=$F[4];s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Z//,tr/a-z//);$_=join("\t",@F[0,1],@_)."\n"' + +# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"' \ No newline at end of file diff --git a/sam/examples/toy.fa b/sam/examples/toy.fa index 38312c1..afe990a 100644 --- a/sam/examples/toy.fa +++ b/sam/examples/toy.fa @@ -1,2 +1,4 @@ >ref AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT +>ref2 +aggttttataaaacaattaagtctacagagcaactacgcg diff --git a/sam/examples/toy.sam b/sam/examples/toy.sam index baf7388..1aff220 100644 --- a/sam/examples/toy.sam +++ b/sam/examples/toy.sam @@ -1,7 +1,14 @@ @SQ SN:ref LN:45 -r001 163 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG * -r002 0 ref 9 30 1S2I6M1P1I4M2I * 0 0 AAAAGATAAGGATAAA * +@SQ SN:ref2 LN:40 +r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * +r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * r003 0 ref 9 30 5H6M * 0 0 AGCTAA * r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * r003 16 ref 29 30 6H5M * 0 0 TAGGC * -r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * \ No newline at end of file +r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * +x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ???????????????????? +x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ????????????????????? +x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ?????????????????????????? +x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ????????????????????????? +x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ???????????????????????? +x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ??????????????????????? diff --git a/sam/kaln.c b/sam/kaln.c index 9fa40d0..9c0bbaa 100644 --- a/sam/kaln.c +++ b/sam/kaln.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "kaln.h" #define FROM_M 0 @@ -72,8 +73,18 @@ int aln_sm_blast[] = { -2, -2, -2, -2, -2 }; -ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 }; -ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 }; +int aln_sm_qual[] = { + 0, -23, -23, -23, 0, + -23, 0, -23, -23, 0, + -23, -23, 0, -23, 0, + -23, -23, -23, 0, 0, + 0, 0, 0, 0, 0 +}; + +ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; +ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 }; + +ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 }; static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) { @@ -141,13 +152,13 @@ static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) } #define set_end_I(II, cur, p) \ { \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->I) { \ + if (gap_end_ext >= 0) { \ + if ((p)->M - gap_end_open > (p)->I) { \ (cur)->It = FROM_M; \ - (II) = (p)->M - gap_open - gap_end; \ + (II) = (p)->M - gap_end_open - gap_end_ext; \ } else { \ (cur)->It = FROM_I; \ - (II) = (p)->I - gap_end; \ + (II) = (p)->I - gap_end_ext; \ } \ } else set_I(II, cur, p); \ } @@ -163,19 +174,19 @@ static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar) } #define set_end_D(DD, cur, p) \ { \ - if (gap_end >= 0) { \ - if ((p)->M - gap_open > (p)->D) { \ + if (gap_end_ext >= 0) { \ + if ((p)->M - gap_end_open > (p)->D) { \ (cur)->Dt = FROM_M; \ - (DD) = (p)->M - gap_open - gap_end; \ + (DD) = (p)->M - gap_end_open - gap_end_ext; \ } else { \ (cur)->Dt = FROM_D; \ - (DD) = (p)->D - gap_end; \ + (DD) = (p)->D - gap_end_ext; \ } \ } else set_D(DD, cur, p); \ } typedef struct { - uint8_t Mt:3, It:2, Dt:2; + uint8_t Mt:3, It:2, Dt:3; } dpcell_t; typedef struct { @@ -195,18 +206,19 @@ uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const uint8_t type, ctype; uint32_t *cigar = 0; - int gap_open, gap_ext, gap_end, b; + int gap_open, gap_ext, gap_end_open, gap_end_ext, b; int *score_matrix, N_MATRIX_ROW; /* initialize some align-related parameters. just for compatibility */ gap_open = ap->gap_open; gap_ext = ap->gap_ext; - gap_end = ap->gap_end; + gap_end_open = ap->gap_end_open; + gap_end_ext = ap->gap_end_ext; b = ap->band_width; score_matrix = ap->matrix; N_MATRIX_ROW = ap->row; - *n_cigar = 0; + if (n_cigar) *n_cigar = 0; if (len1 == 0 || len2 == 0) return 0; /* calculate b1 and b2 */ @@ -368,3 +380,107 @@ uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const return cigar; } + +typedef struct { + int M, I, D; +} score_aux_t; + +#define MINUS_INF -0x40000000 + +// matrix: len2 rows and len1 columns +int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap) +{ + +#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \ + int t1, t2; \ + score_aux_t *_q; \ + _q = _q0; \ + _p->M = _q->M >= _q->I? _q->M : _q->I; \ + _p->M = _p->M >= _q->D? _p->M : _q->D; \ + _p->M += (_sc); \ + ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \ + _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \ + } + + int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret; + const uint8_t *seq1, *seq2; + score_aux_t *curr, *last, *swap; + bw = abs(len1 - len2) + ap->band_width; + i = len1 > len2? len1 : len2; + if (bw > i + 1) bw = i + 1; + seq1 = _seq1 - 1; seq2 = _seq2 - 1; + curr = calloc(len1 + 2, sizeof(score_aux_t)); + last = calloc(len1 + 2, sizeof(score_aux_t)); + { // the zero-th row + int x, end = len1; + score_aux_t *p; + j = 0; + x = j + bw; end = len1 < x? len1 : x; // band end + p = curr; + p->M = 0; p->I = p->D = MINUS_INF; + for (i = 1, p = &curr[1]; i <= end; ++i, ++p) + p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i); + p->M = p->I = p->D = MINUS_INF; + swap = curr; curr = last; last = swap; + } + for (j = 1; j < len2; ++j) { + int x, beg = 0, end = len1, *scrow, col_end; + score_aux_t *p; + x = j - bw; beg = 0 > x? 0 : x; // band start + x = j + bw; end = len1 < x? len1 : x; // band end + if (beg == 0) { // from zero-th column + p = curr; + p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); + ++beg; // then beg = 1 + } + scrow = scmat + seq2[j] * scmat_size; + if (end == len1) col_end = 1, --end; + else col_end = 0; + for (i = beg, p = &curr[beg]; i <= end; ++i, ++p) + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide); + if (col_end) { + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide); + ++p; + } + p->M = p->I = p->D = MINUS_INF; +// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); + swap = curr; curr = last; last = swap; + } + { // the last row + int x, beg = 0, *scrow; + score_aux_t *p; + j = len2; + x = j - bw; beg = 0 > x? 0 : x; // band start + if (beg == 0) { // from zero-th column + p = curr; + p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j); + ++beg; // then beg = 1 + } + scrow = scmat + seq2[j] * scmat_size; + for (i = beg, p = &curr[beg]; i < len1; ++i, ++p) + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede); + __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede); +// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n'); + } + ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I; + ret = ret >= curr[len1].D? ret : curr[len1].D; + free(curr); free(last); + return ret; +} + +#ifdef _MAIN +int main(int argc, char *argv[]) +{ +// int len1 = 35, len2 = 35; +// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1"; +// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0"; + int len1 = 4, len2 = 4; + uint8_t *seq1 = (uint8_t*)"\1\0\0\1"; + uint8_t *seq2 = (uint8_t*)"\1\0\1\0"; + int sc; +// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0); + sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual); + printf("%d\n", sc); + return 0; +} +#endif diff --git a/sam/kaln.h b/sam/kaln.h index b04d8cc..1ece132 100644 --- a/sam/kaln.h +++ b/sam/kaln.h @@ -33,23 +33,35 @@ typedef struct { int gap_open; int gap_ext; - int gap_end; + int gap_end_open; + int gap_end_ext; int *matrix; int row; int band_width; } ka_param_t; +typedef struct { + int iio, iie, ido, ide; + int eio, eie, edo, ede; + int *matrix; + int row; + int band_width; +} ka_param2_t; + #ifdef __cplusplus extern "C" { #endif - uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar); - + uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, + int *_score, int *n_cigar); + int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap); #ifdef __cplusplus } #endif -extern ka_param_t ka_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */ +extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */ +extern ka_param_t ka_param_qual; // only use this for global alignment!!! +extern ka_param2_t ka_param2_qual; // only use this for global alignment!!! #endif diff --git a/sam/knetfile.c b/sam/knetfile.c index e1be4d6..1e2c042 100644 --- a/sam/knetfile.c +++ b/sam/knetfile.c @@ -517,7 +517,10 @@ off_t knet_read(knetFile *fp, void *buf, off_t len) if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX off_t rest = len, curr; while (rest) { - curr = read(fp->fd, buf + l, rest); + do { + curr = read(fp->fd, buf + l, rest); + } while (curr < 0 && EINTR == errno); + if (curr < 0) return -1; if (curr == 0) break; l += curr; rest -= curr; } diff --git a/sam/ksort.h b/sam/ksort.h index 16a03fd..fa850ab 100644 --- a/sam/ksort.h +++ b/sam/ksort.h @@ -250,6 +250,15 @@ typedef struct { if (hh <= k) low = ll; \ if (hh >= k) high = hh - 1; \ } \ + } \ + void ks_shuffle_##name(size_t n, type_t a[]) \ + { \ + int i, j; \ + for (i = n; i > 1; --i) { \ + type_t tmp; \ + j = (int)(drand48() * i); \ + tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \ + } \ } #define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) @@ -259,6 +268,7 @@ typedef struct { #define ks_heapmake(name, n, a) ks_heapmake_##name(n, a) #define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) #define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) +#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a) #define ks_lt_generic(a, b) ((a) < (b)) #define ks_lt_str(a, b) (strcmp((a), (b)) < 0) diff --git a/sam/kstring.c b/sam/kstring.c index e0203fa..43d524c 100644 --- a/sam/kstring.c +++ b/sam/kstring.c @@ -24,6 +24,24 @@ int ksprintf(kstring_t *s, const char *fmt, ...) return l; } +char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux) +{ + const char *p, *start; + if (sep) { // set up the table + if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished + aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0; + for (p = sep; *p; ++p) + aux->tab[*p/64] |= 1ull<<(*p%64); + } + if (str) aux->p = str - 1, aux->tab[0] &= ~1ull; + else if (aux->tab[0]&1) return 0; + for (p = start = aux->p + 1; *p; ++p) + if (aux->tab[*p/64]>>(*p%64)&1) break; + aux->p = p; // end of token + if (*p == 0) aux->tab[0] |= 1; // no more tokens + return (char*)start; +} + // s MUST BE a null terminated string; l = strlen(s) int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) { @@ -66,11 +84,13 @@ int ksplit_core(char *s, int delimiter, int *_max, int **_offsets) * Boyer-Moore search * **********************/ +typedef unsigned char ubyte_t; + // reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html -int *ksBM_prep(const uint8_t *pat, int m) +static int *ksBM_prep(const ubyte_t *pat, int m) { int i, *suff, *prep, *bmGs, *bmBc; - prep = calloc(m + 256, 1); + prep = calloc(m + 256, sizeof(int)); bmGs = prep; bmBc = prep + m; { // preBmBc() for (i = 0; i < 256; ++i) bmBc[i] = m; @@ -107,39 +127,49 @@ int *ksBM_prep(const uint8_t *pat, int m) return prep; } -int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches) +void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep) { - int i, j, *prep, *bmGs, *bmBc; - int *matches = 0, mm = 0, nm = 0; - prep = _prep? _prep : ksBM_prep(pat, m); + int i, j, *prep = 0, *bmGs, *bmBc; + const ubyte_t *str, *pat; + str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat; + prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep; + if (_prep && *_prep == 0) *_prep = prep; bmGs = prep; bmBc = prep + m; j = 0; while (j <= n - m) { for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i); - if (i < 0) { - if (nm == mm) { - mm = mm? mm<<1 : 1; - matches = realloc(matches, mm * sizeof(int)); - } - matches[nm++] = j; - j += bmGs[0]; - } else { + if (i >= 0) { int max = bmBc[str[i+j]] - m + 1 + i; if (max < bmGs[i]) max = bmGs[i]; j += max; - } + } else return (void*)(str + j); } - *n_matches = nm; if (_prep == 0) free(prep); - return matches; + return 0; } +char *kstrstr(const char *str, const char *pat, int **_prep) +{ + return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep); +} + +char *kstrnstr(const char *str, const char *pat, int n, int **_prep) +{ + return (char*)kmemmem(str, n, pat, strlen(pat), _prep); +} + +/*********************** + * The main() function * + ***********************/ + #ifdef KSTRING_MAIN #include int main() { kstring_t *s; int *fields, n, i; + ks_tokaux_t aux; + char *p; s = (kstring_t*)calloc(1, sizeof(kstring_t)); // test ksprintf() ksprintf(s, " abcdefg: %d ", 100); @@ -148,17 +178,26 @@ int main() fields = ksplit(s, 0, &n); for (i = 0; i < n; ++i) printf("field[%d] = '%s'\n", i, s->s + fields[i]); - free(s); + // test kstrtok() + s->l = 0; + for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) { + kputsn(p, aux.p - p, s); + kputc('\n', s); + } + printf("%s", s->s); + // free + free(s->s); free(s); free(fields); { - static char *str = "abcdefgcdg"; + static char *str = "abcdefgcdgcagtcakcdcd"; static char *pat = "cd"; - int n, *matches; - matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n); - printf("%d: \n", n); - for (i = 0; i < n; ++i) - printf("- %d\n", matches[i]); - free(matches); + char *ret, *s = str; + int *prep = 0; + while ((ret = kstrstr(s, pat, &prep)) != 0) { + printf("match: %s\n", ret); + s = ret + prep[0]; + } + free(prep); } return 0; } diff --git a/sam/kstring.h b/sam/kstring.h index 925117a..c46a62b 100644 --- a/sam/kstring.h +++ b/sam/kstring.h @@ -17,17 +17,31 @@ typedef struct __kstring_t { } kstring_t; #endif -int ksprintf(kstring_t *s, const char *fmt, ...); -int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); +typedef struct { + uint64_t tab[4]; + const char *p; // end of the current token +} ks_tokaux_t; -// calculate the auxiliary array, allocated by calloc() -int *ksBM_prep(const uint8_t *pat, int m); +#ifdef __cplusplus +extern "C" { +#endif + + int ksprintf(kstring_t *s, const char *fmt, ...); + int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); + char *kstrstr(const char *str, const char *pat, int **_prep); + char *kstrnstr(const char *str, const char *pat, int n, int **_prep); + void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep); -/* Search pat in str and returned the list of matches. The size of the - * list is returned as n_matches. _prep is the array returned by - * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */ -int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches); + /* kstrtok() is similar to strtok_r() except that str is not + * modified and both str and sep can be NULL. For efficiency, it is + * actually recommended to set both to NULL in the subsequent calls + * if sep is not changed. */ + char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); +#ifdef __cplusplus +} +#endif + static inline int kputsn(const char *p, int l, kstring_t *s) { if (s->l + l + 1 >= s->m) { @@ -35,7 +49,7 @@ static inline int kputsn(const char *p, int l, kstring_t *s) kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } - strncpy(s->s + s->l, p, l); + memcpy(s->s + s->l, p, l); s->l += l; s->s[s->l] = 0; return l; diff --git a/sam/misc/Makefile b/sam/misc/Makefile index 2d7139d..6c25c78 100644 --- a/sam/misc/Makefile +++ b/sam/misc/Makefile @@ -1,6 +1,6 @@ CC= gcc CXX= g++ -CFLAGS= -g -Wall #-O2 #-m64 #-arch ppc +CFLAGS= -g -Wall -O2 #-m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= -D_FILE_OFFSET_BITS=64 OBJS= diff --git a/sam/misc/samtools.pl b/sam/misc/samtools.pl index 9f48b8f..d03c1c7 100755 --- a/sam/misc/samtools.pl +++ b/sam/misc/samtools.pl @@ -10,7 +10,7 @@ my $version = '0.3.3'; &usage if (@ARGV < 1); my $command = shift(@ARGV); -my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter, +my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter, plp2vcf=>\&plp2vcf, unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr, sam2fq=>\&sam2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); @@ -104,7 +104,6 @@ Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] my $len=0; if ($flt == 0) { if ($t[2] eq '*') { # an indel - # If deletion, remember the length of the deletion my ($a,$b) = split(m{/},$t[3]); my $alen = length($a) - 1; @@ -474,6 +473,44 @@ sub uniqcmp_aux { close($fh); } +sub plp2vcf { + while (<>) { + my @t = split; + next if ($t[3] eq '*/*'); + if ($t[2] eq '*') { # indel + my @s = split("/", $t[3]); + my (@a, @b); + my ($ref, $alt); + for (@s) { + next if ($_ eq '*'); + if (/^-/) { + push(@a, 'N'.substr($_, 1)); + push(@b, 'N'); + } elsif (/^\+/) { + push(@a, 'N'); + push(@b, 'N'.substr($_, 1)); + } + } + if ($a[0] && $a[1]) { + if (length($a[0]) < length($a[1])) { + $ref = $a[1]; + $alt = ($b[0] . ('N' x (length($a[1]) - length($a[0])))) . ",$b[1]"; + } elsif (length($a[0]) > length($a[1])) { + $ref = $a[0]; + $alt = ($b[1] . ('N' x (length($a[0]) - length($a[1])))) . ",$b[0]"; + } else { + $ref = $a[0]; + $alt = ($b[0] eq $b[1])? $b[0] : "$b[0],$b[1]"; + } + } else { + $ref = $a[0]; $alt = $b[0]; + } + print join("\t", @t[0,1], '.', $ref, $alt, $t[5], '.', '.'), "\n"; + } else { # SNP + } + } +} + # # Usage # diff --git a/sam/razip.c b/sam/razip.c index dff9347..825e732 100644 --- a/sam/razip.c +++ b/sam/razip.c @@ -94,7 +94,7 @@ int main(int argc, char **argv) buffer = malloc(WINDOW_SIZE); while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c); razf_close(rz); // f_dst will be closed here - if (argc > optind) unlink(argv[optind]); + if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; diff --git a/sam/sam_view.c b/sam/sam_view.c index 3b10e2e..eb69449 100644 --- a/sam/sam_view.c +++ b/sam/sam_view.c @@ -9,6 +9,13 @@ #include "khash.h" KHASH_SET_INIT_STR(rg) +// When counting records instead of printing them, +// data passed to the bam_fetch callback is encapsulated in this struct. +typedef struct { + bam_header_t *header; + int *count; +} count_func_data_t; + typedef khash_t(rg) *rghash_t; rghash_t g_rghash = 0; @@ -54,7 +61,7 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) return 0; } -// callback function for bam_fetch() +// callback function for bam_fetch() that prints nonskipped records static int view_func(const bam1_t *b, void *data) { if (!__g_skip_aln(((samfile_t*)data)->header, b)) @@ -62,19 +69,30 @@ static int view_func(const bam1_t *b, void *data) return 0; } +// callback function for bam_fetch() that counts nonskipped records +static int count_func(const bam1_t *b, void *data) +{ + if (!__g_skip_aln(((count_func_data_t*)data)->header, b)) { + (*((count_func_data_t*)data)->count)++; + } + return 0; +} + static int usage(int is_long_help); int main_samview(int argc, char *argv[]) { - int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0; + int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0; int of_type = BAM_OFDEC, is_long_help = 0; + int count = 0; samfile_t *in = 0, *out = 0; char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0; /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { + while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) { switch (c) { + case 'c': is_count = 1; break; case 'C': slx2sngr = 1; break; case 'S': is_bamin = 0; break; case 'b': is_bamout = 1; break; @@ -124,15 +142,18 @@ int main_samview(int argc, char *argv[]) if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref); // open file handlers if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) { - fprintf(stderr, "[main_samview] fail to open file for reading.\n"); + fprintf(stderr, "[main_samview] fail to open \"%s\" for reading.\n", argv[optind]); + ret = 1; goto view_end; } if (in->header == 0) { - fprintf(stderr, "[main_samview] fail to read the header.\n"); + fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", argv[optind]); + ret = 1; goto view_end; } - if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { - fprintf(stderr, "[main_samview] fail to open file for writing.\n"); + if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) { + fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output"); + ret = 1; goto view_end; } if (is_header_only) goto view_end; // no need to print alignments @@ -143,10 +164,14 @@ int main_samview(int argc, char *argv[]) while ((r = samread(in, b)) >= 0) { // read one alignment from `in' if (!__g_skip_aln(in->header, b)) { if (slx2sngr) sol2sanger(b); - samwrite(out, b); // write the alignment to `out' + if (!is_count) samwrite(out, b); // write the alignment to `out' + count++; } } - if (r < -1) fprintf(stderr, "[main_samview] truncated file.\n"); + if (r < -1) { + fprintf(stderr, "[main_samview] truncated file.\n"); + ret = 1; + } bam_destroy1(b); } else { // retrieve alignments in specified regions int i; @@ -158,18 +183,31 @@ int main_samview(int argc, char *argv[]) goto view_end; } for (i = optind + 1; i < argc; ++i) { - int tid, beg, end; + int tid, beg, end, result; bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200' if (tid < 0) { // reference name is not found - fprintf(stderr, "[main_samview] fail to get the reference name. Continue anyway.\n"); + fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); continue; } - bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments + // fetch alignments + if (is_count) { + count_func_data_t count_data = { in->header, &count }; + result = bam_fetch(in->x.bam, idx, tid, beg, end, &count_data, count_func); + } else + result = bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); + if (result < 0) { + fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); + ret = 1; + break; + } } bam_index_destroy(idx); // destroy the BAM index } view_end: + if (is_count && ret == 0) { + printf("%d\n", count); + } // close files, free and return free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); if (g_rghash) { @@ -179,7 +217,8 @@ view_end: kh_destroy(rg, g_rghash); } samclose(in); - samclose(out); + if (!is_count) + samclose(out); return ret; } @@ -194,6 +233,7 @@ static int usage(int is_long_help) fprintf(stderr, " -u uncompressed BAM output (force -b)\n"); fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n"); fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n"); + fprintf(stderr, " -c print only the count of matching records\n"); fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n"); fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); diff --git a/sam/samtools.1 b/sam/samtools.1 index d79d176..57f1aff 100644 --- a/sam/samtools.1 +++ b/sam/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "11 July 2010" "samtools-0.1.8" "Bioinformatics tools" +.TH samtools 1 "2 December 2010" "samtools-0.1.12" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format @@ -18,9 +18,9 @@ samtools merge out.bam in1.bam in2.bam in3.bam .PP samtools faidx ref.fasta .PP -samtools pileup -f ref.fasta aln.sorted.bam +samtools pileup -vcf ref.fasta aln.sorted.bam .PP -samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam +samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam .PP samtools tview aln.sorted.bam ref.fasta @@ -47,7 +47,7 @@ entire alignment file unless it is asked to do so. .TP 10 .B view -samtools view [-bhuHS] [-t in.refList] [-o output] [-f reqFlag] [-F +samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] | [region1 [...]] Extract/print all or sub alignments in SAM or BAM format. If no region @@ -65,10 +65,12 @@ format: `chr2' (the whole chr2), `chr2:1000000' (region starting from .B -b Output in the BAM format. .TP -.B -u -Output uncompressed BAM. This option saves time spent on -compression/decomprssion and is thus preferred when the output is piped -to another samtools command. +.BI -f \ INT +Only output alignments with all bits in INT present in the FLAG +field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0] +.TP +.BI -F \ INT +Skip alignments with bits present in INT [0] .TP .B -h Include the header in the output. @@ -76,12 +78,38 @@ Include the header in the output. .B -H Output the header only. .TP +.BI -l \ STR +Only output reads in library STR [null] +.TP +.BI -o \ FILE +Output file [stdout] +.TP +.BI -q \ INT +Skip alignments with MAPQ smaller than INT [0] +.TP +.BI -r \ STR +Only output reads in read group STR [null] +.TP +.BI -R \ FILE +Output reads in read groups listed in +.I FILE +[null] +.TP .B -S Input is in SAM. If @SQ header lines are absent, the .B `-t' option is required. .TP -.B -t FILE +.B -c +Instead of printing the alignments, only count them and print the +total number. All filter options, such as +.B `-f', +.B `-F' +and +.B `-q' +, are taken into account. +.TP +.BI -t \ FILE This file is TAB-delimited. Each line must contain the reference name and the length of the reference, one line for each distinct reference; additional fields are ignored. This file also defines the order of the @@ -92,29 +120,10 @@ can be used as this .I file. .TP -.B -o FILE -Output file [stdout] -.TP -.B -f INT -Only output alignments with all bits in INT present in the FLAG -field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0] -.TP -.B -F INT -Skip alignments with bits present in INT [0] -.TP -.B -q INT -Skip alignments with MAPQ smaller than INT [0] -.TP -.B -l STR -Only output reads in library STR [null] -.TP -.B -r STR -Only output reads in read group STR [null] -.TP -.B -R FILE -Output reads in read groups listed in -.I FILE -[null] +.B -u +Output uncompressed BAM. This option saves time spent on +compression/decomprssion and is thus preferred when the output is piped +to another samtools command. .RE .TP @@ -126,144 +135,81 @@ press `?' for help and press `g' to check the alignment start from a region in the format like `chr10:10,000,000' or `=10,000,000' when viewing the same reference sequence. -.TP -.B pileup -samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] -[-iscgS2] [-T theta] [-N nHap] [-r pairDiffRate] | - -Print the alignment in the pileup format. In the pileup format, each -line represents a genomic position, consisting of chromosome name, -coordinate, reference base, read bases, read qualities and alignment -mapping qualities. Information on match, mismatch, indel, strand, -mapping quality and start and end of a read are all encoded at the read -base column. At this column, a dot stands for a match to the reference -base on the forward strand, a comma for a match on the reverse strand, -`ACGTN' for a mismatch on the forward strand and `acgtn' for a mismatch -on the reverse strand. A pattern `\\+[0-9]+[ACGTNacgtn]+' indicates -there is an insertion between this reference position and the next -reference position. The length of the insertion is given by the integer -in the pattern, followed by the inserted sequence. Similarly, a pattern -`-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. The -deleted bases will be presented as `*' in the following lines. Also at -the read base column, a symbol `^' marks the start of a read segment -which is a contiguous subsequence on the read separated by `N/S/H' CIGAR -operations. The ASCII of the character following `^' minus 33 gives the -mapping quality. A symbol `$' marks the end of a read segment. - -If option -.B -c -is applied, the consensus base, Phred-scaled consensus quality, SNP -quality (i.e. the Phred-scaled probability of the consensus being -identical to the reference) and root mean square (RMS) mapping quality -of the reads covering the site will be inserted between the `reference -base' and the `read bases' columns. An indel occupies an additional -line. Each indel line consists of chromosome name, coordinate, a star, -the genotype, consensus quality, SNP quality, RMS mapping quality, # -covering reads, the first alllele, the second allele, # reads supporting -the first allele, # reads supporting the second allele and # reads -containing indels different from the top two alleles. - -The position of indels is offset by -1. - -.B OPTIONS: -.RS -.TP 10 -.B -s -Print the mapping quality as the last column. This option makes the -output easier to parse, although this format is not space efficient. -.TP -.B -S -The input file is in SAM. -.TP -.B -i -Only output pileup lines containing indels. -.TP -.B -f FILE -The reference sequence in the FASTA format. Index file -.I FILE.fai -will be created if -absent. -.TP -.B -M INT -Cap mapping quality at INT [60] -.TP -.B -m INT -Filter reads with flag containing bits in -.I -INT -[1796] -.TP -.B -d INT -Use the first -.I NUM -reads in the pileup for indel calling for speed up. Zero for unlimited. [0] -.TP -.B -t FILE -List of reference names ane sequence lengths, in the format described -for the -.B import -command. If this option is present, samtools assumes the input -.I -is in SAM format; otherwise it assumes in BAM format. -.TP -.B -l FILE -List of sites at which pileup is output. This file is space -delimited. The first two columns are required to be chromosome and -1-based coordinate. Additional columns are ignored. It is -recommended to use option -.B -s -together with -.B -l -as in the default format we may not know the mapping quality. -.TP -.B -c -Call the consensus sequence using SOAPsnp consensus model. Options -.B -T, -.B -N, -.B -I -and -.B -r -are only effective when -.B -c -or -.B -g -is in use. -.TP -.B -g -Generate genotype likelihood in the binary GLFv3 format. This option -suppresses -c, -i and -s. -.TP -.B -T FLOAT -The theta parameter (error dependency coefficient) in the maq consensus -calling model [0.85] -.TP -.B -N INT -Number of haplotypes in the sample (>=2) [2] -.TP -.B -r FLOAT -Expected fraction of differences between a pair of haplotypes [0.001] -.TP -.B -I INT -Phred probability of an indel in sequencing/prep. [40] -.RE - .TP .B mpileup -samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]] +samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]] -Generate pileup for multiple BAM files. Consensus calling is not -implemented. +Generate BCF or pileup for one or multiple BAM files. Alignment records +are grouped by sample identifiers in @RG header lines. If sample +identifiers are absent, each input file is regarded as one sample. .B OPTIONS: .RS .TP 8 -.B -r STR +.B -B +Disable probabilistic realignment for the computation of base alignment +quality (BAQ). BAQ is the Phred-scaled probability of a read base being +misaligned. Applying this option greatly helps to reduce false SNPs +caused by misalignments. +.TP +.BI -C \ INT +Coefficient for downgrading mapping quality for reads containing +excessive mismatches. Given a read with a phred-scaled probability q of +being generated from the mapped position, the new mapping quality is +about sqrt((INT-q)/INT)*INT. A zero value disables this +functionality; if enabled, the recommended value for BWA is 50. [0] +.TP +.BI -e \ INT +Phred-scaled gap extension sequencing error probability. Reducing +.I INT +leads to longer indels. [20] +.TP +.BI -f \ FILE +The reference file [null] +.TP +.B -g +Compute genotype likelihoods and output them in the binary call format (BCF). +.TP +.BI -h \ INT +Coefficient for modeling homopolymer errors. Given an +.IR l -long +homopolymer +run, the sequencing error of an indel of size +.I s +is modeled as +.IR INT * s / l . +[100] +.TP +.BI -l \ FILE +File containing a list of sites where pileup or BCF is outputted [null] +.TP +.BI -o \ INT +Phred-scaled gap open sequencing error probability. Reducing +.I INT +leads to more indel calls. [40] +.TP +.BI -P \ STR +Comma dilimited list of platforms (determined by +.BR @RG-PL ) +from which indel candidates are obtained. It is recommended to collect +indel candidates from sequencing technologies that have low indel error +rate such as ILLUMINA. [all] +.TP +.BI -q \ INT +Minimum mapping quality for an alignment to be used [0] +.TP +.BI -Q \ INT +Minimum base quality for a base to be considered [13] +.TP +.BI -r \ STR Only generate pileup in region .I STR [all sites] .TP -.B -f FILE -The reference file [null] +.B -u +Similar to +.B -g +except that the output is uncompressed BCF, which is preferred for piping. .RE .TP @@ -297,13 +243,13 @@ Output the final alignment to the standard output. .B -n Sort by read names rather than by chromosomal coordinates .TP -.B -m INT +.BI -m \ INT Approximately the maximum required memory. [500000000] .RE .TP .B merge -samtools merge [-h inh.sam] [-nr] [...] +samtools merge [-nur] [-h inh.sam] [-R reg] [...] Merge multiple sorted alignments. The header reference lists of all the input BAM files, and the @SQ headers of @@ -320,7 +266,7 @@ and the headers of other files will be ignored. .B OPTIONS: .RS .TP 8 -.B -h FILE +.BI -h \ FILE Use the lines of .I FILE as `@' headers to be copied to @@ -331,12 +277,19 @@ replacing any header lines that would otherwise be copied from is actually in SAM format, though any alignment records it may contain are ignored.) .TP +.BI -R \ STR +Merge files in the specified region indicated by +.I STR +.TP .B -r Attach an RG tag to each alignment. The tag value is inferred from file names. .TP .B -n The input alignments are sorted by read names rather than by chromosomal coordinates +.TP +.B -u +Uncompressed BAM output .RE .TP @@ -402,7 +355,7 @@ Treat paired-end reads and single-end reads. .TP .B calmd -samtools calmd [-eubS] +samtools calmd [-eubSr] [-C capQcoef] Generate the MD tag. If the MD tag is already present, this command will give a warning if the MD tag generated is different from the existing @@ -411,6 +364,11 @@ tag. Output SAM by default. .B OPTIONS: .RS .TP 8 +.B -A +When used jointly with +.B -r +this option overwrites the original base quality. +.TP 8 .B -e Convert a the read base to = if it is identical to the aligned reference base. Indel caller does not support the = bases at the moment. @@ -423,6 +381,144 @@ Output compressed BAM .TP .B -S The input is SAM with header lines +.TP +.BI -C \ INT +Coefficient to cap mapping quality of poorly mapped reads. See the +.B pileup +command for details. [0] +.TP +.B -r +Compute the BQ tag without changing the base quality. +.RE + +.TP +.B pileup +samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] [-l +in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N nHap] [-r +pairDiffRate] [-m mask] [-d maxIndelDepth] [-G indelPrior] +| + +Print the alignment in the pileup format. In the pileup format, each +line represents a genomic position, consisting of chromosome name, +coordinate, reference base, read bases, read qualities and alignment +mapping qualities. Information on match, mismatch, indel, strand, +mapping quality and start and end of a read are all encoded at the read +base column. At this column, a dot stands for a match to the reference +base on the forward strand, a comma for a match on the reverse strand, +a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward +strand and `acgtn' for a mismatch on the reverse strand. A pattern +`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this +reference position and the next reference position. The length of the +insertion is given by the integer in the pattern, followed by the +inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' +represents a deletion from the reference. The deleted bases will be +presented as `*' in the following lines. Also at the read base column, a +symbol `^' marks the start of a read. The ASCII of the character +following `^' minus 33 gives the mapping quality. A symbol `$' marks the +end of a read segment. + +If option +.B -c +is applied, the consensus base, Phred-scaled consensus quality, SNP +quality (i.e. the Phred-scaled probability of the consensus being +identical to the reference) and root mean square (RMS) mapping quality +of the reads covering the site will be inserted between the `reference +base' and the `read bases' columns. An indel occupies an additional +line. Each indel line consists of chromosome name, coordinate, a star, +the genotype, consensus quality, SNP quality, RMS mapping quality, # +covering reads, the first alllele, the second allele, # reads supporting +the first allele, # reads supporting the second allele and # reads +containing indels different from the top two alleles. + +.B NOTE: +Since 0.1.10, the `pileup' command is deprecated by `mpileup'. + +.B OPTIONS: +.RS +.TP 10 +.B -B +Disable the BAQ computation. See the +.B mpileup +command for details. +.TP +.B -c +Call the consensus sequence. Options +.BR -T ", " -N ", " -I " and " -r +are only effective when +.BR -c " or " -g +is in use. +.TP +.BI -C \ INT +Coefficient for downgrading the mapping quality of poorly mapped +reads. See the +.B mpileup +command for details. [0] +.TP +.BI -d \ INT +Use the first +.I NUM +reads in the pileup for indel calling for speed up. Zero for unlimited. [1024] +.TP +.BI -f \ FILE +The reference sequence in the FASTA format. Index file +.I FILE.fai +will be created if +absent. +.TP +.B -g +Generate genotype likelihood in the binary GLFv3 format. This option +suppresses -c, -i and -s. This option is deprecated by the +.B mpileup +command. +.TP +.B -i +Only output pileup lines containing indels. +.TP +.BI -I \ INT +Phred probability of an indel in sequencing/prep. [40] +.TP +.BI -l \ FILE +List of sites at which pileup is output. This file is space +delimited. The first two columns are required to be chromosome and +1-based coordinate. Additional columns are ignored. It is +recommended to use option +.TP +.BI -m \ INT +Filter reads with flag containing bits in +.I INT +[1796] +.TP +.BI -M \ INT +Cap mapping quality at INT [60] +.TP +.BI -N \ INT +Number of haplotypes in the sample (>=2) [2] +.TP +.BI -r \ FLOAT +Expected fraction of differences between a pair of haplotypes [0.001] +.TP +.B -s +Print the mapping quality as the last column. This option makes the +output easier to parse, although this format is not space efficient. +.TP +.B -S +The input file is in SAM. +.TP +.BI -t \ FILE +List of reference names ane sequence lengths, in the format described +for the +.B import +command. If this option is present, samtools assumes the input +.I +is in SAM format; otherwise it assumes in BAM format. +.B -s +together with +.B -l +as in the default format we may not know the mapping quality. +.TP +.BI -T \ FLOAT +The theta parameter (error dependency coefficient) in the maq consensus +calling model [0.85] .RE .SH SAM FORMAT @@ -472,6 +568,124 @@ _ 0x0400 d the read is either a PCR or an optical duplicate .TE +.SH EXAMPLES +.IP o 2 +Import SAM to BAM when +.B @SQ +lines are present in the header: + + samtools view -bS aln.sam > aln.bam + +If +.B @SQ +lines are absent: + + samtools faidx ref.fa + samtools view -bt ref.fa.fai aln.sam > aln.bam + +where +.I ref.fa.fai +is generated automatically by the +.B faidx +command. + +.IP o 2 +Attach the +.B RG +tag while merging sorted alignments: + + perl -e 'print "@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:Illumina\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:454\\n"' > rg.txt + samtools merge -rh rg.txt merged.bam ga.bam 454.bam + +The value in a +.B RG +tag is determined by the file name the read is coming from. In this +example, in the +.IR merged.bam , +reads from +.I ga.bam +will be attached +.IR RG:Z:ga , +while reads from +.I 454.bam +will be attached +.IR RG:Z:454 . + +.IP o 2 +Call SNPs and short indels for one diploid individual: + + samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > var.raw.bcf + bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > var.flt.vcf + +The +.B -D +option of varFilter controls the maximum read depth, which should be +adjusted to about twice the average read depth. One may consider to add +.B -C50 +to +.B mpileup +if mapping quality is overestimated for reads containing excessive +mismatches. Applying this option usually helps +.B BWA-short +but may not other mappers. + +.IP o 2 +Call SNPs and short indels for multiple diploid individuals: + + samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view -bcvg - > var.raw.bcf + bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > var.flt.vcf + +Individuals are identified from the +.B SM +tags in the +.B @RG +header lines. Individuals can be pooled in one alignment file; one +individual can also be separated into multiple files. The +.B -P +option specifies that indel candidates should be collected only from +read groups with the +.B @RG-PL +tag set to +.IR ILLUMINA . +Collecting indel candidates from reads sequenced by an indel-prone +technology may affect the performance of indel calling. + +.IP o 2 +Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals: + + samtools mpileup -Igf ref.fa *.bam > all.bcf + bcftools view -bl sites.list all.bcf > sites.bcf + bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs + bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs + bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs + ...... + +where +.I sites.list +contains the list of sites with each line consisting of the reference +sequence name and position. The following +.B bcftools +commands estimate AFS by EM. + +.IP o 2 +Dump BAQ applied alignment for other SNP callers: + + samtools calmd -bAr aln.bam > aln.baq.bam + +It adds and corrects the +.B NM +and +.B MD +tags at the same time. The +.B calmd +command also comes with the +.B -C +option, the same as the one in +.B pileup +and +.BR mpileup . +Apply if it helps. + .SH LIMITATIONS .PP .IP o 2 @@ -492,8 +706,9 @@ although a little slower. .PP Heng Li from the Sanger Institute wrote the C version of samtools. Bob Handsaker from the Broad Institute implemented the BGZF library and Jue -Ruan from Beijing Genomics Institute wrote the RAZF library. Various -people in the 1000 Genomes Project contributed to the SAM format +Ruan from Beijing Genomics Institute wrote the RAZF library. John +Marshall and Petr Danecek contribute to the source code and various +people from the 1000 Genomes Project have contributed to the SAM format specification. .SH SEE ALSO diff --git a/sam/samtools.txt b/sam/samtools.txt index 20e6c15..4cbcef7 100644 --- a/sam/samtools.txt +++ b/sam/samtools.txt @@ -20,63 +20,79 @@ SYNOPSIS samtools faidx ref.fasta - samtools pileup -f ref.fasta aln.sorted.bam + samtools pileup -vcf ref.fasta aln.sorted.bam - samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam + samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam samtools tview aln.sorted.bam ref.fasta DESCRIPTION - Samtools is a set of utilities that manipulate alignments in the BAM + Samtools is a set of utilities that manipulate alignments in the BAM format. It imports from and exports to the SAM (Sequence Alignment/Map) - format, does sorting, merging and indexing, and allows to retrieve + format, does sorting, merging and indexing, and allows to retrieve reads in any regions swiftly. - Samtools is designed to work on a stream. It regards an input file `-' - as the standard input (stdin) and an output file `-' as the standard + Samtools is designed to work on a stream. It regards an input file `-' + as the standard input (stdin) and an output file `-' as the standard output (stdout). Several commands can thus be combined with Unix pipes. Samtools always output warning and error messages to the standard error output (stderr). - Samtools is also able to open a BAM (not SAM) file on a remote FTP or - HTTP server if the BAM file name starts with `ftp://' or `http://'. - Samtools checks the current working directory for the index file and - will download the index upon absence. Samtools does not retrieve the + Samtools is also able to open a BAM (not SAM) file on a remote FTP or + HTTP server if the BAM file name starts with `ftp://' or `http://'. + Samtools checks the current working directory for the index file and + will download the index upon absence. Samtools does not retrieve the entire alignment file unless it is asked to do so. COMMANDS AND OPTIONS - view samtools view [-bhuHS] [-t in.refList] [-o output] [-f - reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r read- + view samtools view [-bchuHS] [-t in.refList] [-o output] [-f + reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r read- Group] [-R rgFile] | [region1 [...]] - Extract/print all or sub alignments in SAM or BAM format. If - no region is specified, all the alignments will be printed; - otherwise only alignments overlapping the specified regions - will be output. An alignment may be given multiple times if + Extract/print all or sub alignments in SAM or BAM format. If + no region is specified, all the alignments will be printed; + otherwise only alignments overlapping the specified regions + will be output. An alignment may be given multiple times if it is overlapping several regions. A region can be presented, - for example, in the following format: `chr2' (the whole - chr2), `chr2:1000000' (region starting from 1,000,000bp) or - `chr2:1,000,000-2,000,000' (region between 1,000,000 and - 2,000,000bp including the end points). The coordinate is + for example, in the following format: `chr2' (the whole + chr2), `chr2:1000000' (region starting from 1,000,000bp) or + `chr2:1,000,000-2,000,000' (region between 1,000,000 and + 2,000,000bp including the end points). The coordinate is 1-based. OPTIONS: -b Output in the BAM format. - -u Output uncompressed BAM. This option saves time spent - on compression/decomprssion and is thus preferred - when the output is piped to another samtools command. + -f INT Only output alignments with all bits in INT present + in the FLAG field. INT can be in hex in the format of + /^0x[0-9A-F]+/ [0] + + -F INT Skip alignments with bits present in INT [0] -h Include the header in the output. -H Output the header only. + -l STR Only output reads in library STR [null] + + -o FILE Output file [stdout] + + -q INT Skip alignments with MAPQ smaller than INT [0] + + -r STR Only output reads in read group STR [null] + + -R FILE Output reads in read groups listed in FILE [null] + -S Input is in SAM. If @SQ header lines are absent, the `-t' option is required. + -c Instead of printing the alignments, only count them + and print the total number. All filter options, such + as `-f', `-F' and `-q' , are taken into account. + -t FILE This file is TAB-delimited. Each line must contain the reference name and the length of the reference, one line for each distinct reference; additional @@ -85,137 +101,78 @@ COMMANDS AND OPTIONS `samtools faidx ', the resultant index file .fai can be used as this file. - -o FILE Output file [stdout] - - -f INT Only output alignments with all bits in INT present - in the FLAG field. INT can be in hex in the format of - /^0x[0-9A-F]+/ [0] - - -F INT Skip alignments with bits present in INT [0] - - -q INT Skip alignments with MAPQ smaller than INT [0] - - -l STR Only output reads in library STR [null] - - -r STR Only output reads in read group STR [null] - - -R FILE Output reads in read groups listed in FILE [null] + -u Output uncompressed BAM. This option saves time spent + on compression/decomprssion and is thus preferred + when the output is piped to another samtools command. tview samtools tview [ref.fasta] - Text alignment viewer (based on the ncurses library). In the - viewer, press `?' for help and press `g' to check the align- - ment start from a region in the format like - `chr10:10,000,000' or `=10,000,000' when viewing the same + Text alignment viewer (based on the ncurses library). In the + viewer, press `?' for help and press `g' to check the align- + ment start from a region in the format like + `chr10:10,000,000' or `=10,000,000' when viewing the same reference sequence. - pileup samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l - in.site_list] [-iscgS2] [-T theta] [-N nHap] [-r - pairDiffRate] | + mpileup samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l + list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam + [...]] - Print the alignment in the pileup format. In the pileup for- - mat, each line represents a genomic position, consisting of - chromosome name, coordinate, reference base, read bases, read - qualities and alignment mapping qualities. Information on - match, mismatch, indel, strand, mapping quality and start and - end of a read are all encoded at the read base column. At - this column, a dot stands for a match to the reference base - on the forward strand, a comma for a match on the reverse - strand, `ACGTN' for a mismatch on the forward strand and - `acgtn' for a mismatch on the reverse strand. A pattern - `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion - between this reference position and the next reference posi- - tion. The length of the insertion is given by the integer in - the pattern, followed by the inserted sequence. Similarly, a - pattern `-[0-9]+[ACGTNacgtn]+' represents a deletion from the - reference. The deleted bases will be presented as `*' in the - following lines. Also at the read base column, a symbol `^' - marks the start of a read segment which is a contiguous sub- - sequence on the read separated by `N/S/H' CIGAR operations. - The ASCII of the character following `^' minus 33 gives the - mapping quality. A symbol `$' marks the end of a read seg- - ment. - - If option -c is applied, the consensus base, Phred-scaled - consensus quality, SNP quality (i.e. the Phred-scaled proba- - bility of the consensus being identical to the reference) and - root mean square (RMS) mapping quality of the reads covering - the site will be inserted between the `reference base' and - the `read bases' columns. An indel occupies an additional - line. Each indel line consists of chromosome name, coordi- - nate, a star, the genotype, consensus quality, SNP quality, - RMS mapping quality, # covering reads, the first alllele, the - second allele, # reads supporting the first allele, # reads - supporting the second allele and # reads containing indels - different from the top two alleles. - - The position of indels is offset by -1. + Generate BCF or pileup for one or multiple BAM files. Align- + ment records are grouped by sample identifiers in @RG header + lines. If sample identifiers are absent, each input file is + regarded as one sample. OPTIONS: - -s Print the mapping quality as the last column. This - option makes the output easier to parse, although - this format is not space efficient. + -B Disable probabilistic realignment for the computation + of base alignment quality (BAQ). BAQ is the Phred- + scaled probability of a read base being misaligned. + Applying this option greatly helps to reduce false + SNPs caused by misalignments. - -S The input file is in SAM. + -C INT Coefficient for downgrading mapping quality for reads + containing excessive mismatches. Given a read with a + phred-scaled probability q of being generated from + the mapped position, the new mapping quality is about + sqrt((INT-q)/INT)*INT. A zero value disables this + functionality; if enabled, the recommended value for + BWA is 50. [0] - -i Only output pileup lines containing indels. + -e INT Phred-scaled gap extension sequencing error probabil- + ity. Reducing INT leads to longer indels. [20] - -f FILE The reference sequence in the FASTA format. Index - file FILE.fai will be created if absent. - - -M INT Cap mapping quality at INT [60] - - -m INT Filter reads with flag containing bits in INT - [1796] + -f FILE The reference file [null] - -d INT Use the first NUM reads in the pileup for indel - calling for speed up. Zero for unlimited. [0] + -g Compute genotype likelihoods and output them in the + binary call format (BCF). - -t FILE List of reference names ane sequence lengths, in - the format described for the import command. If - this option is present, samtools assumes the input - is in SAM format; otherwise it - assumes in BAM format. + -h INT Coefficient for modeling homopolymer errors. Given an + l-long homopolymer run, the sequencing error of an + indel of size s is modeled as INT*s/l. [100] - -l FILE List of sites at which pileup is output. This file - is space delimited. The first two columns are - required to be chromosome and 1-based coordinate. - Additional columns are ignored. It is recommended - to use option -s together with -l as in the default - format we may not know the mapping quality. + -l FILE File containing a list of sites where pileup or BCF + is outputted [null] - -c Call the consensus sequence using SOAPsnp consensus - model. Options -T, -N, -I and -r are only effective - when -c or -g is in use. + -o INT Phred-scaled gap open sequencing error probability. + Reducing INT leads to more indel calls. [40] - -g Generate genotype likelihood in the binary GLFv3 - format. This option suppresses -c, -i and -s. + -P STR Comma dilimited list of platforms (determined by @RG- + PL) from which indel candidates are obtained. It is + recommended to collect indel candidates from sequenc- + ing technologies that have low indel error rate such + as ILLUMINA. [all] - -T FLOAT The theta parameter (error dependency coefficient) - in the maq consensus calling model [0.85] + -q INT Minimum mapping quality for an alignment to be used + [0] - -N INT Number of haplotypes in the sample (>=2) [2] - - -r FLOAT Expected fraction of differences between a pair of - haplotypes [0.001] - - -I INT Phred probability of an indel in sequencing/prep. - [40] - - - mpileup samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]] - - Generate pileup for multiple BAM files. Consensus calling is - not implemented. - - OPTIONS: + -Q INT Minimum base quality for a base to be considered [13] -r STR Only generate pileup in region STR [all sites] - -f FILE The reference file [null] + -u Similar to -g except that the output is uncompressed + BCF, which is preferred for piping. reheader samtools reheader @@ -243,8 +200,8 @@ COMMANDS AND OPTIONS [500000000] - merge samtools merge [-h inh.sam] [-nr] - [...] + merge samtools merge [-nur] [-h inh.sam] [-R reg] + [...] Merge multiple sorted alignments. The header reference lists of all the input BAM files, and the @SQ headers of inh.sam, @@ -261,12 +218,16 @@ COMMANDS AND OPTIONS SAM format, though any alignment records it may con- tain are ignored.) + -R STR Merge files in the specified region indicated by STR + -r Attach an RG tag to each alignment. The tag value is inferred from file names. -n The input alignments are sorted by read names rather than by chromosomal coordinates + -u Uncompressed BAM output + index samtools index @@ -315,7 +276,7 @@ COMMANDS AND OPTIONS -S Treat paired-end reads and single-end reads. - calmd samtools calmd [-eubS] + calmd samtools calmd [-eubSr] [-C capQcoef] Generate the MD tag. If the MD tag is already present, this command will give a warning if the MD tag generated is dif- @@ -323,8 +284,11 @@ COMMANDS AND OPTIONS OPTIONS: - -e Convert a the read base to = if it is identical to - the aligned reference base. Indel caller does not + -A When used jointly with -r this option overwrites the + original base quality. + + -e Convert a the read base to = if it is identical to + the aligned reference base. Indel caller does not support the = bases at the moment. -u Output uncompressed BAM @@ -333,9 +297,117 @@ COMMANDS AND OPTIONS -S The input is SAM with header lines + -C INT Coefficient to cap mapping quality of poorly mapped + reads. See the pileup command for details. [0] + + -r Compute the BQ tag without changing the base quality. + + + pileup samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] + [-l in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N + nHap] [-r pairDiffRate] [-m mask] [-d maxIndelDepth] [-G + indelPrior] | + + Print the alignment in the pileup format. In the pileup for- + mat, each line represents a genomic position, consisting of + chromosome name, coordinate, reference base, read bases, read + qualities and alignment mapping qualities. Information on + match, mismatch, indel, strand, mapping quality and start and + end of a read are all encoded at the read base column. At + this column, a dot stands for a match to the reference base + on the forward strand, a comma for a match on the reverse + strand, a '>' or '<' for a reference skip, `ACGTN' for a mis- + match on the forward strand and `acgtn' for a mismatch on the + reverse strand. A pattern `\+[0-9]+[ACGTNacgtn]+' indicates + there is an insertion between this reference position and the + next reference position. The length of the insertion is given + by the integer in the pattern, followed by the inserted + sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' repre- + sents a deletion from the reference. The deleted bases will + be presented as `*' in the following lines. Also at the read + base column, a symbol `^' marks the start of a read. The + ASCII of the character following `^' minus 33 gives the map- + ping quality. A symbol `$' marks the end of a read segment. + + If option -c is applied, the consensus base, Phred-scaled + consensus quality, SNP quality (i.e. the Phred-scaled proba- + bility of the consensus being identical to the reference) and + root mean square (RMS) mapping quality of the reads covering + the site will be inserted between the `reference base' and + the `read bases' columns. An indel occupies an additional + line. Each indel line consists of chromosome name, coordi- + nate, a star, the genotype, consensus quality, SNP quality, + RMS mapping quality, # covering reads, the first alllele, the + second allele, # reads supporting the first allele, # reads + supporting the second allele and # reads containing indels + different from the top two alleles. + + NOTE: Since 0.1.10, the `pileup' command is deprecated by + `mpileup'. + + OPTIONS: + + -B Disable the BAQ computation. See the mpileup com- + mand for details. + + -c Call the consensus sequence. Options -T, -N, -I and + -r are only effective when -c or -g is in use. + + -C INT Coefficient for downgrading the mapping quality of + poorly mapped reads. See the mpileup command for + details. [0] + + -d INT Use the first NUM reads in the pileup for indel + calling for speed up. Zero for unlimited. [1024] + + -f FILE The reference sequence in the FASTA format. Index + file FILE.fai will be created if absent. + + -g Generate genotype likelihood in the binary GLFv3 + format. This option suppresses -c, -i and -s. This + option is deprecated by the mpileup command. + + -i Only output pileup lines containing indels. + + -I INT Phred probability of an indel in sequencing/prep. + [40] + + -l FILE List of sites at which pileup is output. This file + is space delimited. The first two columns are + required to be chromosome and 1-based coordinate. + Additional columns are ignored. It is recommended + to use option + + -m INT Filter reads with flag containing bits in INT + [1796] + + -M INT Cap mapping quality at INT [60] + + -N INT Number of haplotypes in the sample (>=2) [2] + + -r FLOAT Expected fraction of differences between a pair of + haplotypes [0.001] + + -s Print the mapping quality as the last column. This + option makes the output easier to parse, although + this format is not space efficient. + + -S The input file is in SAM. + + -t FILE List of reference names ane sequence lengths, in + the format described for the import command. If + this option is present, samtools assumes the input + is in SAM format; otherwise it + assumes in BAM format. -s together with -l as in + the default format we may not know the mapping + quality. + + -T FLOAT The theta parameter (error dependency coefficient) + in the maq consensus calling model [0.85] + SAM FORMAT - SAM is TAB-delimited. Apart from the header lines, which are started + SAM is TAB-delimited. Apart from the header lines, which are started with the `@' symbol, each alignment line consists of: @@ -375,6 +447,85 @@ SAM FORMAT |0x0400 | d | the read is either a PCR or an optical duplicate | +-------+-----+--------------------------------------------------+ +EXAMPLES + o Import SAM to BAM when @SQ lines are present in the header: + + samtools view -bS aln.sam > aln.bam + + If @SQ lines are absent: + + samtools faidx ref.fa + samtools view -bt ref.fa.fai aln.sam > aln.bam + + where ref.fa.fai is generated automatically by the faidx command. + + + o Attach the RG tag while merging sorted alignments: + + perl -e 'print "@RG\tID:ga\tSM:hs\tLB:ga\tPL:Illu- + mina\n@RG\tID:454\tSM:hs\tLB:454\tPL:454\n"' > rg.txt + samtools merge -rh rg.txt merged.bam ga.bam 454.bam + + The value in a RG tag is determined by the file name the read is com- + ing from. In this example, in the merged.bam, reads from ga.bam will + be attached RG:Z:ga, while reads from 454.bam will be attached + RG:Z:454. + + + o Call SNPs and short indels for one diploid individual: + + samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > + var.raw.bcf + bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > + var.flt.vcf + + The -D option of varFilter controls the maximum read depth, which + should be adjusted to about twice the average read depth. One may + consider to add -C50 to mpileup if mapping quality is overestimated + for reads containing excessive mismatches. Applying this option usu- + ally helps BWA-short but may not other mappers. + + + o Call SNPs and short indels for multiple diploid individuals: + + samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view + -bcvg - > var.raw.bcf + bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > + var.flt.vcf + + Individuals are identified from the SM tags in the @RG header lines. + Individuals can be pooled in one alignment file; one individual can + also be separated into multiple files. The -P option specifies that + indel candidates should be collected only from read groups with the + @RG-PL tag set to ILLUMINA. Collecting indel candidates from reads + sequenced by an indel-prone technology may affect the performance of + indel calling. + + + o Derive the allele frequency spectrum (AFS) on a list of sites from + multiple individuals: + + samtools mpileup -Igf ref.fa *.bam > all.bcf + bcftools view -bl sites.list all.bcf > sites.bcf + bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs + bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs + bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs + ...... + + where sites.list contains the list of sites with each line consisting + of the reference sequence name and position. The following bcftools + commands estimate AFS by EM. + + + o Dump BAQ applied alignment for other SNP callers: + + samtools calmd -bAr aln.bam > aln.baq.bam + + It adds and corrects the NM and MD tags at the same time. The calmd + command also comes with the -C option, the same as the one in pileup + and mpileup. Apply if it helps. + + LIMITATIONS o Unaligned words used in bam_import.c, bam_endian.h, bam.c and bam_aux.c. @@ -394,8 +545,9 @@ LIMITATIONS AUTHOR Heng Li from the Sanger Institute wrote the C version of samtools. Bob Handsaker from the Broad Institute implemented the BGZF library and Jue - Ruan from Beijing Genomics Institute wrote the RAZF library. Various - people in the 1000 Genomes Project contributed to the SAM format speci- + Ruan from Beijing Genomics Institute wrote the RAZF library. John Mar- + shall and Petr Danecek contribute to the source code and various people + from the 1000 Genomes Project have contributed to the SAM format speci- fication. @@ -404,4 +556,4 @@ SEE ALSO -samtools-0.1.8 11 July 2010 samtools(1) +samtools-0.1.12 2 December 2010 samtools(1) -- 2.39.2