fprintf(fo, "%.15g\n", theta[M]);
fclose(fo);
-
- //calculate normalized read fraction
- double *nrf = new double[M + 1];
- memset(nrf, 0, sizeof(double) * (M + 1));
- denom = 1.0 - theta[0];
- if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); }
- for (int i = 1; i <= M; i++) nrf[i] = theta[i] / denom;
-
//calculate tau values
double *tau = new double[M + 1];
memset(tau, 0, sizeof(double) * (M + 1));
denom = 0.0;
for (int i = 1; i <= M; i++)
- if (eel[i] > EPSILON) {
+ if (eel[i] >= EPSILON) {
tau[i] = theta[i] / eel[i];
denom += tau[i];
}
}
for (int i = 1; i <= M; i++)
fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.15g%c", nrf[i], (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++) {
for (int j = b; j < e; j++) sumC += counts[j];
fprintf(fo, "%.2f%c", sumC, (i < m - 1 ? '\t' : '\n'));
}
- for (int i = 0; i < m; i++) {
- double sumN = 0.0; // sum of normalized read fraction
- int b = gi.spAt(i), e = gi.spAt(i + 1);
- for (int j = b; j < e; j++) sumN += nrf[j];
- fprintf(fo, "%.15g%c", sumN, (i < m - 1 ? '\t' : '\n'));
- }
for (int i = 0; i < m; i++) {
double sumT = 0.0; // sum of tau values
int b = gi.spAt(i), e = gi.spAt(i + 1);
}
fclose(fo);
- delete[] nrf;
delete[] tau;
if (verbose) { printf("Expression Results are written!\n"); }
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-
ROUND = 0;
do {
++ROUND;
Refs refs;
GroupInfo gi;
-vector<double> theta, pme_theta, eel;
+vector<double> theta, pme_theta, pme_c, eel;
vector<int> s, z;
vector<Item> hits;
fprintf(fo, "%d %d\n", CHAINLEN / GAP, M + 1);
//fprintf(fo, "%d %d\n", CHAINLEN, M + 1);
+ pme_c.clear(); pme_c.resize(M + 1, 0.0);
pme_theta.clear(); pme_theta.resize(M + 1, 0.0);
for (int ROUND = 1; ROUND <= BURNIN + CHAINLEN; ROUND++) {
if (ROUND > BURNIN) {
if ((ROUND - BURNIN -1) % GAP == 0) writeCountVector(fo);
writeCountVector(fo);
- for (int i = 0; i <= M; i++) pme_theta[i] += counts[i] / totc;
+ for (int i = 0; i <= M; i++) {
+ pme_c[i] += counts[i] - 1;
+ pme_theta[i] += counts[i] / totc;
+ }
}
if (verbose) { printf("ROUND %d is finished!\n", ROUND); }
}
fclose(fo);
- for (int i = 0; i <= M; i++) pme_theta[i] /= CHAINLEN;
+ for (int i = 0; i <= M; i++) {
+ pme_c[i] /= CHAINLEN;
+ pme_theta[i] /= CHAINLEN;
+ }
if (verbose) { printf("Gibbs is finished!\n"); }
}
assert(denom >= EPSILON);
for (int i = 0; i <= M; i++) pme_theta[i] /= denom;
- //calculate normalized read fraction
- double *nrf = new double[M + 1];
- memset(nrf, 0, sizeof(double) * (M + 1));
-
- denom = 1.0 - pme_theta[0];
- if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); }
- for (int i = 1; i <= M; i++) nrf[i] = pme_theta[i] / denom;
-
//calculate tau values
double *tau = new double[M + 1];
memset(tau, 0, sizeof(double) * (M + 1));
fo = fopen(outF, "a");
if (fo == NULL) { fprintf(stderr, "Cannot open %s!\n", outF); exit(-1); }
for (int i = 1; i <= M; i++)
- fprintf(fo, "%.15g%c", nrf[i], (i < M ? '\t' : '\n'));
+ fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
fclose(fo);
fo = fopen(outF, "a");
if (fo == NULL) { fprintf(stderr, "Cannot open %s!\n", outF); exit(-1); }
for (int i = 0; i < m; i++) {
- double sumN = 0.0; // sum of normalized read fraction
+ double sumC = 0.0; // sum of pme counts
int b = gi.spAt(i), e = gi.spAt(i + 1);
for (int j = b; j < e; j++) {
- sumN += nrf[j];
+ sumC += pme_c[j];
}
- fprintf(fo, "%.15g%c", sumN, (i < m - 1 ? '\t' : '\n'));
+ fprintf(fo, "%.15g%c", sumC, (i < m - 1 ? '\t' : '\n'));
}
for (int i = 0; i < m; i++) {
double sumT = 0.0; // sum of tau values
}
fclose(fo);
- delete[] nrf;
delete[] tau;
if (verbose) { printf("Gibbs based expression values are written!\n"); }
mw = NULL;
if (isMaster) {
- ori = new Orientation(params.probF);
if (!estRSPD) rspd = new RSPD(estRSPD);
mld = new LenDist(params.mate_minL, params.mate_maxL);
}
+ ori = new Orientation(params.probF);
gld = new LenDist(params.minL, params.maxL);
if (estRSPD) rspd = new RSPD(estRSPD, params.B);
pro = new Profile(params.maxL);
mw = NULL;
if (isMaster) {
- ori = new Orientation(params.probF);
if (!estRSPD) rspd = new RSPD(estRSPD);
qd = new QualDist();
mld = new LenDist(params.mate_minL, params.mate_maxL);
}
+ ori = new Orientation(params.probF);
gld = new LenDist(params.minL, params.maxL);
if (estRSPD) rspd = new RSPD(estRSPD, params.B);
qpro = new QProfile();
cdf = new double[B + 2];
//set initial parameters
- memset(pdf, 0, sizeof(double) * (B + 2));
+ memset(pdf, 0, sizeof(double) * (B + 2)); // use B + 2 for evalCDF
memset(cdf, 0, sizeof(double) * (B + 2));
for (int i = 1; i <= B; i++) {
pdf[i] = 1.0 / B;
mw = NULL;
if (isMaster) {
- ori = new Orientation(params.probF);
gld = new LenDist(params.minL, params.maxL);
if (mean >= EPSILON) {
mld = new LenDist(params.mate_minL, params.mate_maxL);
if (!estRSPD) { rspd = new RSPD(estRSPD); }
}
+ ori = new Orientation(params.probF);
if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
pro = new Profile(params.maxL);
npro = new NoiseProfile();
mw = NULL;
if (isMaster) {
- ori = new Orientation(params.probF);
gld = new LenDist(params.minL, params.maxL);
if (mean >= EPSILON) {
mld = new LenDist(params.mate_minL, params.mate_maxL);
qd = new QualDist();
}
+ ori = new Orientation(params.probF);
if (estRSPD) { rspd = new RSPD(estRSPD, params.B); }
qpro = new QProfile();
nqpro = new NoiseQProfile();
void update(const SingleReadQ& read, const SingleHit& hit, double frac) {
if (read.isLowQuality() || frac < EPSILON) return;
- RefSeq& ref = refs->getRef(hit.getSid());
+ const RefSeq& ref = refs->getRef(hit.getSid());
+
int dir = hit.getDir();
int pos = hit.getPos();
memset(mw, 0, sizeof(double) * (M + 1));
mw[0] = 1.0;
-
probF = ori->getProb(0);
probR = ori->getProb(1);
int *cvec;
double *theta;
-CIType *iso_nrf, *gene_nrf, *iso_tau, *gene_tau;
+CIType *iso_tau, *gene_tau;
engine_type engine(time(NULL));
distribution_type **gammas;
}
void generateResults(char* imdName) {
- float *izsamples, *gzsamples, *itsamples, *gtsamples;
+ float *itsamples, *gtsamples;
ifstream fin;
FILE *fo;
char outF[STRLEN];
- iso_nrf = new CIType[M + 1];
iso_tau = new CIType[M + 1];
- gene_nrf = new CIType[m];
gene_tau = new CIType[m];
- izsamples = new float[nSamples];
itsamples = new float[nSamples];
- gzsamples = new float[nSamples];
gtsamples = new float[nSamples];
fin.open(tmpF, ios::binary);
- for (int k = 0; k < nSamples; k++) fin.read((char*)(&izsamples[k]), FLOATSIZE);
- calcCI(nSamples, izsamples, iso_nrf[0].lb, iso_nrf[0].ub);
+ //read sampled theta0 values
+ for (int k = 0; k < nSamples; k++) fin.read((char*)(&itsamples[k]), FLOATSIZE);
for (int i = 0; i < m; i++) {
int b = gi.spAt(i), e = gi.spAt(i + 1);
- memset(gzsamples, 0, FLOATSIZE * nSamples);
memset(gtsamples, 0, FLOATSIZE * nSamples);
for (int j = b; j < e; j++) {
for (int k = 0; k < nSamples; k++) {
- fin.read((char*)(&izsamples[k]), FLOATSIZE);
- if (eel[j] > EPSILON && tau_denoms[k] > EPSILON) { itsamples[k] = izsamples[k] / eel[j] / tau_denoms[k]; }
+ fin.read((char*)(&itsamples[k]), FLOATSIZE);
+ if (eel[j] > EPSILON && tau_denoms[k] > EPSILON) { itsamples[k] = itsamples[k] / eel[j] / tau_denoms[k]; }
else {
- if (izsamples[k] != 0.0) { fprintf(stderr, "K=%d, IZSAMPLES_K=%lf\n", k, izsamples[k]); exit(-1); }
+ if (itsamples[k] != 0.0) { fprintf(stderr, "Not equal to 0! K=%d, Sampled Theta Value=%lf\n", k, itsamples[k]); exit(-1); }
itsamples[k] = 0.0;
}
- gzsamples[k] += izsamples[k];
gtsamples[k] += itsamples[k];
}
- calcCI(nSamples, izsamples, iso_nrf[j].lb, iso_nrf[j].ub);
calcCI(nSamples, itsamples, iso_tau[j].lb, iso_tau[j].ub);
}
- calcCI(nSamples, gzsamples, gene_nrf[i].lb, gene_nrf[i].ub);
calcCI(nSamples, gtsamples, gene_tau[i].lb, gene_tau[i].ub);
if (verbose && (i + 1) % 1000 == 0) { printf("%d genes are done!\n", i + 1); }
//isoform level results
sprintf(outF, "%s.iso_res", imdName);
fo = fopen(outF, "a");
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.6g%c", iso_nrf[i].lb, (i < M ? '\t' : '\n'));
- for (int i = 1; i <= M; i++)
- fprintf(fo, "%.6g%c", iso_nrf[i].ub, (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
fprintf(fo, "%.6g%c", iso_tau[i].lb, (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++)
//gene level results
sprintf(outF, "%s.gene_res", imdName);
fo = fopen(outF, "a");
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.6g%c", gene_nrf[i].lb, (i < m - 1 ? '\t' : '\n'));
- for (int i = 0; i < m; i++)
- fprintf(fo, "%.6g%c", gene_nrf[i].ub, (i < m - 1 ? '\t' : '\n'));
for (int i = 0; i < m; i++)
fprintf(fo, "%.6g%c", gene_tau[i].lb, (i < m - 1 ? '\t' : '\n'));
for (int i = 0; i < m; i++)
fprintf(fo, "%.6g%c", gene_tau[i].ub, (i < m - 1 ? '\t' : '\n'));
fclose(fo);
- printf("CI of noise isoform is [%.6g, %.6g]\n", iso_nrf[0].lb, iso_nrf[0].ub);
-
- delete[] izsamples;
delete[] itsamples;
- delete[] gzsamples;
delete[] gtsamples;
- delete[] iso_nrf;
delete[] iso_tau;
- delete[] gene_nrf;
delete[] gene_tau;
if (verbose) { printf("All credibility intervals are calculated!\n"); }
for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]);
fprintf(fo, "\n");
fclose(fo);
-
}
int main(int argc, char* argv[]) {
++$cnt;
chomp($line);
my @local_arr = split(/\t/, $line);
- if ($cnt == 4) { @comment = @local_arr; }
+ if ($cnt == 3) { @comment = @local_arr; }
else { push(@results, \@local_arr); }
}
+------------------------------------------------------------------------
+r844 | lh3lh3 | 2010-11-19 23:16:08 -0500 (Fri, 19 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/prob1.h
+
+ * samtools-0.1.10-9 (r844)
+ * added the "folded" or reference-free mode for variant calling
+
+------------------------------------------------------------------------
+r843 | lh3lh3 | 2010-11-19 22:26:36 -0500 (Fri, 19 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bam_sort.c
+
+In merging, if -R is specified, do not abort if the sequence dictionary is different.
+
+------------------------------------------------------------------------
+r842 | jmarshall | 2010-11-19 21:24:28 -0500 (Fri, 19 Nov 2010) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_sort.c
+
+When merging BAM headers, compare the list of target reference sequences
+strictly (and fail/abort if there is a mismatch), but allow one list to be a
+prefix of the other. (i.e., check that the lists are identical up until the
+shorter runs out, and add the excess targets from the longer to the output.)
+
+------------------------------------------------------------------------
+r841 | lh3lh3 | 2010-11-19 14:49:27 -0500 (Fri, 19 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10 (r841)
+ * fixed a bug in pileup when the first CIGAR operation is D
+ * fixed a bug in view with range query
+
+------------------------------------------------------------------------
+r840 | lh3lh3 | 2010-11-19 13:45:51 -0500 (Fri, 19 Nov 2010) | 10 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-4 (r840)
+
+ * drop the MNP caller. It is slow while does not diliver too much
+ benefit. Possibly I will work on it in future given more time.
+
+ * there is a segfault in pileup
+
+ * someone has reported segfault from view/index/sort
+
+
+------------------------------------------------------------------------
+r839 | lh3lh3 | 2010-11-18 17:30:11 -0500 (Thu, 18 Nov 2010) | 9 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-6 (r839)
+
+ * call MNPs without realignment because it seems to me that it is not
+ worthwhile to significantly slow down SNP calling.
+
+ * the result looks quite different from the previous version. I have
+ work to do...
+
+
+------------------------------------------------------------------------
+r838 | lh3lh3 | 2010-11-18 11:26:09 -0500 (Thu, 18 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/knetfile.c
+
+Apply a patch by Rob Davis, which improves fault detection.
+
+------------------------------------------------------------------------
+r836 | lh3lh3 | 2010-11-18 11:09:23 -0500 (Thu, 18 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-r836
+ * initiate MNP realignment when the MNP has at least 0.2% frequency (otherwise too slow)
+
+------------------------------------------------------------------------
+r835 | lh3lh3 | 2010-11-18 00:25:13 -0500 (Thu, 18 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * modify the filtering rule: also filter SNPs around filtered indels
+ * added MNP filter
+
+------------------------------------------------------------------------
+r834 | lh3lh3 | 2010-11-17 23:13:52 -0500 (Wed, 17 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10-4 (r834)
+ * fixed a silly bug in printing MNP
+ * restrict to at most 1 alternative allele
+
+------------------------------------------------------------------------
+r833 | lh3lh3 | 2010-11-17 21:58:58 -0500 (Wed, 17 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bamtk.c
+
+fixed a bug in printing MNPs
+
+------------------------------------------------------------------------
+r832 | lh3lh3 | 2010-11-17 21:47:20 -0500 (Wed, 17 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+minor change to how seqQ is applied
+
+------------------------------------------------------------------------
+r831 | lh3lh3 | 2010-11-17 21:41:12 -0500 (Wed, 17 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.10 (r831)
+ * initial MNP caller
+
+------------------------------------------------------------------------
+r829 | lh3lh3 | 2010-11-16 23:14:15 -0500 (Tue, 16 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bamtk.c
+
+Release samtools-0.1.10 (r829)
+
+------------------------------------------------------------------------
+r828 | lh3lh3 | 2010-11-16 20:48:49 -0500 (Tue, 16 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+
+update version information: samtools-0.1.9-20 (r828)
+
+------------------------------------------------------------------------
+r827 | lh3lh3 | 2010-11-16 15:32:50 -0500 (Tue, 16 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/call1.c
+
+bcftools: allow to skip indels
+
+------------------------------------------------------------------------
+r826 | lh3lh3 | 2010-11-16 14:11:58 -0500 (Tue, 16 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+
+remove ZQ if both BQ and ZQ are present
+
+------------------------------------------------------------------------
+r825 | lh3lh3 | 2010-11-16 13:51:33 -0500 (Tue, 16 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/samtools.1
+
+ * samtools-0.1.9-18 (r825)
+ * change the behaviour of calmd such that by default it does not change the base quality
+
+------------------------------------------------------------------------
+r824 | lh3lh3 | 2010-11-15 23:31:53 -0500 (Mon, 15 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/samtools.1
+
+ * samtools-0.1.9-17 (r824)
+ * added command line options to change the default parameters in indel calling
+ * update the manual
+
+------------------------------------------------------------------------
+r823 | lh3lh3 | 2010-11-15 12:20:13 -0500 (Mon, 15 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-r823
+ * the BQ tag is now 64 shifted, not 33 shifted
+
+------------------------------------------------------------------------
+r822 | lh3lh3 | 2010-11-15 00:30:18 -0500 (Mon, 15 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+ M /trunk/samtools/misc/samtools.pl
+
+ * samtools-0.1.9-16 (r822)
+ * keep the raw depth because in indel calling, DP4 may be way off the true depth
+
+------------------------------------------------------------------------
+r821 | lh3lh3 | 2010-11-13 01:18:31 -0500 (Sat, 13 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-15 (r821)
+ * calmd: write BQ
+ * skip realignment if BQ is present
+
+------------------------------------------------------------------------
+r820 | lh3lh3 | 2010-11-13 01:08:26 -0500 (Sat, 13 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-14 (r820)
+ * penalize reads with excessive differences in indel calling
+
+------------------------------------------------------------------------
+r819 | lh3lh3 | 2010-11-12 21:36:27 -0500 (Fri, 12 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-13 (r819)
+ * fixed a bug in pileup given refskip
+
+------------------------------------------------------------------------
+r818 | lh3lh3 | 2010-11-12 13:04:53 -0500 (Fri, 12 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-r818
+ * for indel calling, do two rounds of probabilistic realignments
+
+------------------------------------------------------------------------
+r817 | lh3lh3 | 2010-11-11 20:04:07 -0500 (Thu, 11 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * samtools-0.1.19-11 (r817)
+ * only initiate indel calling when 0.2% of reads contain a gap
+
+------------------------------------------------------------------------
+r816 | lh3lh3 | 2010-11-11 01:22:59 -0500 (Thu, 11 Nov 2010) | 7 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-10 (r816)
+
+ * I know why the forward method fails. it is because of zero base
+ qualities. when that is fixed, the forward method seems to give
+ better results than Viterbi, as it should be. I am tired...
+
+
+------------------------------------------------------------------------
+r815 | lh3lh3 | 2010-11-11 00:57:15 -0500 (Thu, 11 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam2bcf_indel.c
+
+effectively revert to the viterbi version. The forward realignment gives too many false positives.
+
+------------------------------------------------------------------------
+r814 | lh3lh3 | 2010-11-11 00:18:02 -0500 (Thu, 11 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-9 (r810)
+ * use forward, instead of viterbi, for realignment
+ * realignment is now quality aware
+
+------------------------------------------------------------------------
+r813 | lh3lh3 | 2010-11-10 22:45:24 -0500 (Wed, 10 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/kprobaln.c
+ M /trunk/samtools/kprobaln.h
+
+ * prepare to replace kaln with kprobaln in realignment
+
+------------------------------------------------------------------------
+r812 | lh3lh3 | 2010-11-10 17:28:50 -0500 (Wed, 10 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+
+fixed a typo
+
+------------------------------------------------------------------------
+r811 | lh3lh3 | 2010-11-10 16:54:46 -0500 (Wed, 10 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+
+use zlib for direct reading when BCF_LITE is in use
+
+------------------------------------------------------------------------
+r810 | lh3lh3 | 2010-11-10 16:32:13 -0500 (Wed, 10 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+
+ * do not use reads containing too many mismatches for indel calling
+ * fixed a trivial bug in case of multi-allelic indels
+
+------------------------------------------------------------------------
+r809 | lh3lh3 | 2010-11-10 13:23:02 -0500 (Wed, 10 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-8 (r809)
+ * fixed a bug in the indel caller
+
+------------------------------------------------------------------------
+r808 | lh3lh3 | 2010-11-10 12:24:10 -0500 (Wed, 10 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+
+minor change to makefile
+
+------------------------------------------------------------------------
+r807 | lh3lh3 | 2010-11-10 12:10:21 -0500 (Wed, 10 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * samtools-0.1.9-8 (r807)
+ * collect indel candidates only from specified platforms (@RG-PL)
+ * merge varFilter and filter4vcf in vcfutils.pl
+
+------------------------------------------------------------------------
+r806 | lh3lh3 | 2010-11-09 22:05:46 -0500 (Tue, 09 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/prob1.h
+
+bcftools: compute equal-tail (Bayesian) credible interval
+
+------------------------------------------------------------------------
+r805 | lh3lh3 | 2010-11-09 16:28:39 -0500 (Tue, 09 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+added a double-hit filter to avoid overestimated indel likelihood
+
+------------------------------------------------------------------------
+r804 | lh3lh3 | 2010-11-09 14:12:06 -0500 (Tue, 09 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-7 (r804)
+ * fixed a bug in the gap caller
+
+------------------------------------------------------------------------
+r803 | lh3lh3 | 2010-11-09 10:45:33 -0500 (Tue, 09 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/prob1.c
+
+ * samtools-0.1.9-6 (r803)
+ * mpileup: apply homopolymer correction when calculating GL, instead of before
+ * bcftools: apply a different prior to indels
+
+------------------------------------------------------------------------
+r802 | lh3lh3 | 2010-11-08 23:53:15 -0500 (Mon, 08 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-5 (r802)
+ * relax tandem penalty. this will be made a command-line option in future.
+
+------------------------------------------------------------------------
+r801 | lh3lh3 | 2010-11-08 23:35:52 -0500 (Mon, 08 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-4 (r801)
+ * fixed a minor issue in printing indel VCF
+
+------------------------------------------------------------------------
+r800 | lh3lh3 | 2010-11-08 15:28:14 -0500 (Mon, 08 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+fixed another silly bug in mpileup's indel caller
+
+------------------------------------------------------------------------
+r799 | lh3lh3 | 2010-11-08 14:28:27 -0500 (Mon, 08 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+
+fixed a silly bug in the indel caller
+
+------------------------------------------------------------------------
+r798 | lh3lh3 | 2010-11-08 14:07:33 -0500 (Mon, 08 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/sam_view.c
+ M /trunk/samtools/samtools.1
+
+Incorporate patches by Marcel Martin for read counting.
+
+------------------------------------------------------------------------
+r797 | lh3lh3 | 2010-11-08 13:39:52 -0500 (Mon, 08 Nov 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-2 (r797)
+ * mpileup: indel calling seems to be working
+
+------------------------------------------------------------------------
+r796 | lh3lh3 | 2010-11-08 10:54:46 -0500 (Mon, 08 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/kaln.c
+
+indel calling is apparently working, but more information needs to be collected
+
+------------------------------------------------------------------------
+r795 | lh3lh3 | 2010-11-08 00:39:18 -0500 (Mon, 08 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf_indel.c
+
+fixed a few bugs in the indel caller. Probably there are more.
+
+------------------------------------------------------------------------
+r794 | lh3lh3 | 2010-11-07 22:23:16 -0500 (Sun, 07 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ A /trunk/samtools/bam2bcf_indel.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/kaln.c
+ M /trunk/samtools/kaln.h
+
+prepare for the indel caller. It is not ready yet.
+
+------------------------------------------------------------------------
+r793 | lh3lh3 | 2010-11-05 11:28:23 -0400 (Fri, 05 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam_plcmd.c
+
+Revert to r790. The recent changes are not good...
+
+------------------------------------------------------------------------
+r792 | lh3lh3 | 2010-11-05 00:19:14 -0400 (Fri, 05 Nov 2010) | 6 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam_plcmd.c
+
+ * this revision is UNSTABLE
+
+ * indel caller seems working, but it is very insensitive and has
+ several things I do not quite understand.
+
+
+------------------------------------------------------------------------
+r791 | lh3lh3 | 2010-11-04 22:58:43 -0400 (Thu, 04 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam_plcmd.c
+
+for backup. no effective changes
+
+------------------------------------------------------------------------
+r790 | lh3lh3 | 2010-11-03 15:51:24 -0400 (Wed, 03 Nov 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+ M /trunk/samtools/kprobaln.c
+
+fixed a minor problem in the example coming with kprobaln.c
+
+------------------------------------------------------------------------
+r789 | lh3lh3 | 2010-11-02 15:41:27 -0400 (Tue, 02 Nov 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/kaln.c
+ M /trunk/samtools/kaln.h
+ A /trunk/samtools/kprobaln.c
+ A /trunk/samtools/kprobaln.h
+
+Separate kaln and kprobaln as I am preparing further changes. At
+present, the results should be identical to the previous.
+
+
+------------------------------------------------------------------------
+r788 | petulda | 2010-11-02 12:19:04 -0400 (Tue, 02 Nov 2010) | 1 line
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+
+Added -b option: read file names from a file
+------------------------------------------------------------------------
+r787 | lh3lh3 | 2010-10-29 23:17:22 -0400 (Fri, 29 Oct 2010) | 7 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.9-2 (r787)
+
+ * Allow to set a maximum per-sample depth to reduce memory. However,
+ BAQ computation is still applied to every read. The speed is not
+ improved.
+
+
+------------------------------------------------------------------------
+r786 | lh3lh3 | 2010-10-29 12:10:40 -0400 (Fri, 29 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/vcf.c
+
+ * samtools-0.1.9-1 (r786)
+ * samtools: optionally perform exact test for each sample
+
+------------------------------------------------------------------------
+r785 | lh3lh3 | 2010-10-29 09:42:25 -0400 (Fri, 29 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bcftools/bcf.c
+
+Optionally output "DP", the individual read depth
+
+------------------------------------------------------------------------
+r784 | lh3lh3 | 2010-10-27 23:10:27 -0400 (Wed, 27 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/samtools.1
+
+acknowledge Petr and John who have greatly contributed to the project.
+
+------------------------------------------------------------------------
+r783 | lh3lh3 | 2010-10-27 22:47:47 -0400 (Wed, 27 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/samtools.1
+
+Release samtools-0.1.9 (r783)
+
+------------------------------------------------------------------------
+r782 | lh3lh3 | 2010-10-27 19:58:54 -0400 (Wed, 27 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+
+fixed a silly bug in pileup
+
+------------------------------------------------------------------------
+r781 | lh3lh3 | 2010-10-27 14:39:48 -0400 (Wed, 27 Oct 2010) | 5 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/samtools.1
+
+ * samtools-0.1.8-22 (r781)
+ * made BAQ the default behavior of mpileup
+ * updated manual
+ * in merge, force to exit given inconsistent header when "-R" is not in use.
+
+------------------------------------------------------------------------
+r780 | lh3lh3 | 2010-10-27 11:01:11 -0400 (Wed, 27 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-21 (r780)
+ * minor speedup to pileup
+
+------------------------------------------------------------------------
+r779 | lh3lh3 | 2010-10-27 09:58:56 -0400 (Wed, 27 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/examples/toy.sam
+
+improve pileup a little bit
+
+------------------------------------------------------------------------
+r778 | lh3lh3 | 2010-10-27 00:14:43 -0400 (Wed, 27 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bam_tview.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-20 (r778)
+ * speed up pileup, although I do not know how much is the improvement
+
+------------------------------------------------------------------------
+r777 | lh3lh3 | 2010-10-26 17:26:04 -0400 (Tue, 26 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_maqcns.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/examples/Makefile
+
+ * samtools-0.1.8-19 (r777)
+ * integrate mpileup features to pileup: min_baseQ, capQ, prob_realn, paired-only and biased prior
+
+------------------------------------------------------------------------
+r776 | lh3lh3 | 2010-10-26 15:27:46 -0400 (Tue, 26 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+
+remove local realignment (probabilistic realignment is still there)
+
+------------------------------------------------------------------------
+r774 | jmarshall | 2010-10-21 06:52:38 -0400 (Thu, 21 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/sam_view.c
+
+Add the relevant filename or region to error messages, and cause a failure
+exit status where appropriate. Based on a patch provided by Marcel Martin.
+
+------------------------------------------------------------------------
+r773 | lh3lh3 | 2010-10-19 19:44:31 -0400 (Tue, 19 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/examples/toy.sam
+ M /trunk/samtools/kaln.c
+
+ * Minor code changes. No real effect.
+ * change quality to 30 in toy.sam
+
+------------------------------------------------------------------------
+r772 | lh3lh3 | 2010-10-18 23:40:13 -0400 (Mon, 18 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/examples/toy.fa
+ M /trunk/samtools/examples/toy.sam
+
+added another toy example
+
+------------------------------------------------------------------------
+r771 | lh3lh3 | 2010-10-13 23:32:12 -0400 (Wed, 13 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/ld.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+improve the LD statistics
+
+------------------------------------------------------------------------
+r770 | lh3lh3 | 2010-10-12 23:49:26 -0400 (Tue, 12 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * a minor fix to the -L option
+ * add ldstats to vcfutils.pl
+
+------------------------------------------------------------------------
+r769 | lh3lh3 | 2010-10-12 15:51:57 -0400 (Tue, 12 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+
+a minor change
+
+------------------------------------------------------------------------
+r768 | lh3lh3 | 2010-10-12 15:49:06 -0400 (Tue, 12 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ A /trunk/samtools/bcftools/ld.c
+
+forget to add the key file
+
+------------------------------------------------------------------------
+r767 | lh3lh3 | 2010-10-12 15:48:46 -0400 (Tue, 12 Oct 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * vcfutils.pl: fixed a typo in help message
+ * added APIs: bcf_append_info() and bcf_cpy()
+ * calculate adjacent LD
+
+------------------------------------------------------------------------
+r766 | lh3lh3 | 2010-10-11 11:06:40 -0400 (Mon, 11 Oct 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+added filter for samtools/bcftools genetated VCFs
+
+------------------------------------------------------------------------
+r765 | lh3lh3 | 2010-10-05 14:05:18 -0400 (Tue, 05 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+ M /trunk/samtools/kaln.c
+
+ * removed a comment line in kaln.c
+ * vcfutils.pl fillac works when GT is not the first field
+
+------------------------------------------------------------------------
+r764 | petulda | 2010-10-05 08:59:36 -0400 (Tue, 05 Oct 2010) | 1 line
+Changed paths:
+ A /trunk/samtools/bcftools/bcf-fix.pl
+
+Convert VCF output of "bcftools view -bgcv" to a valid VCF file
+------------------------------------------------------------------------
+r763 | lh3lh3 | 2010-10-02 22:51:03 -0400 (Sat, 02 Oct 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ A /trunk/samtools/bcftools/bcftools.1
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/samtools.1
+
+ * samtools-0.1.8-18 (r763)
+ * added bcftools manual page
+ * minor fix to mpileup and view command lines
+
+------------------------------------------------------------------------
+r762 | lh3lh3 | 2010-10-02 21:46:25 -0400 (Sat, 02 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * vcfutils.pl qstats: calculate marginal ts/tv
+ * allow to call genotypes at variant sites
+
+------------------------------------------------------------------------
+r761 | lh3lh3 | 2010-10-01 00:29:55 -0400 (Fri, 01 Oct 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+ M /trunk/samtools/misc/HmmGlocal.java
+
+I am changing the gap open probability back to 0.001. It seems that
+being conservative here is a good thing...
+
+------------------------------------------------------------------------
+r760 | lh3lh3 | 2010-10-01 00:11:27 -0400 (Fri, 01 Oct 2010) | 5 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/kaln.c
+ A /trunk/samtools/misc/HmmGlocal.java
+
+ * samtools-0.1.8-17 (r760)
+ * the default gap open penalty is too small (a typo)
+ * added comments on hmm_realn
+ * Java implementation
+
+------------------------------------------------------------------------
+r759 | lh3lh3 | 2010-09-30 10:12:54 -0400 (Thu, 30 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+
+mark samtools-0.1.8-16 (r759)
+
+------------------------------------------------------------------------
+r758 | lh3lh3 | 2010-09-30 10:12:02 -0400 (Thu, 30 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+round to the nearest integer
+
+------------------------------------------------------------------------
+r757 | lh3lh3 | 2010-09-28 17:16:43 -0400 (Tue, 28 Sep 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+I was trying to accelerate ka_prob_glocal() as this will be the
+bottleneck. After an hour, the only gain is to change division to
+multiplication. OK. I will stop.
+
+------------------------------------------------------------------------
+r756 | lh3lh3 | 2010-09-28 16:57:49 -0400 (Tue, 28 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+this is interesting. multiplication is much faster than division, at least on my Mac
+
+------------------------------------------------------------------------
+r755 | lh3lh3 | 2010-09-28 16:19:13 -0400 (Tue, 28 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+minor changes
+
+------------------------------------------------------------------------
+r754 | lh3lh3 | 2010-09-28 15:44:16 -0400 (Tue, 28 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/kaln.c
+
+prob_realn() seems working!
+
+------------------------------------------------------------------------
+r753 | lh3lh3 | 2010-09-28 12:48:23 -0400 (Tue, 28 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+minor
+
+------------------------------------------------------------------------
+r752 | lh3lh3 | 2010-09-28 12:47:41 -0400 (Tue, 28 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+ M /trunk/samtools/kaln.h
+
+Convert phredQ to probabilities
+
+------------------------------------------------------------------------
+r751 | lh3lh3 | 2010-09-28 12:32:08 -0400 (Tue, 28 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+ M /trunk/samtools/kaln.h
+
+Implement the glocal HMM; discard the extention HMM
+
+------------------------------------------------------------------------
+r750 | lh3lh3 | 2010-09-28 00:06:11 -0400 (Tue, 28 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+improve numerical stability
+
+------------------------------------------------------------------------
+r749 | lh3lh3 | 2010-09-27 23:27:54 -0400 (Mon, 27 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+more comments
+
+------------------------------------------------------------------------
+r748 | lh3lh3 | 2010-09-27 23:17:16 -0400 (Mon, 27 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+fixed a bug in banded DP
+
+------------------------------------------------------------------------
+r747 | lh3lh3 | 2010-09-27 23:05:12 -0400 (Mon, 27 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+ * fixed that weird issue.
+ * the banded version is NOT working
+
+------------------------------------------------------------------------
+r746 | lh3lh3 | 2010-09-27 22:57:05 -0400 (Mon, 27 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+More comments. This version seems working, but something is a little weird...
+
+------------------------------------------------------------------------
+r745 | lh3lh3 | 2010-09-27 17:21:40 -0400 (Mon, 27 Sep 2010) | 6 lines
+Changed paths:
+ M /trunk/samtools/kaln.c
+
+A little code cleanup. Now the forward and backback algorithms give
+nearly identical P(x), which means both are close to the correct
+forms. However, I have only tested on toy examples. Minor errors in
+the implementation may not be obvious.
+
+
+------------------------------------------------------------------------
+r744 | lh3lh3 | 2010-09-27 16:55:15 -0400 (Mon, 27 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bam_sort.c
+ M /trunk/samtools/kaln.c
+ M /trunk/samtools/kaln.h
+
+...
+
+------------------------------------------------------------------------
+r743 | jmarshall | 2010-09-27 08:19:06 -0400 (Mon, 27 Sep 2010) | 6 lines
+Changed paths:
+ M /trunk/samtools/bam_sort.c
+
+Abort if merge -h's INH.SAM cannot be opened, just as we abort
+if any of the IN#.BAM input files cannot be opened.
+
+Also propagate any error indication returned by bam_merge_core()
+to samtools merge's exit status.
+
+------------------------------------------------------------------------
+r741 | jmarshall | 2010-09-24 11:08:24 -0400 (Fri, 24 Sep 2010) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+
+Use bam_validate1() to detect garbage records in the event of a corrupt
+BAI index file that causes a bam_seek() to an invalid position. At most
+one record (namely, the bam_iter_read terminator) is tested per bam_fetch()
+call, so the cost is insignificant in the normal case.
+
+------------------------------------------------------------------------
+r740 | jmarshall | 2010-09-24 11:00:19 -0400 (Fri, 24 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam.h
+
+Add bam_validate1().
+
+------------------------------------------------------------------------
+r739 | lh3lh3 | 2010-09-22 12:07:50 -0400 (Wed, 22 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-15 (r379)
+ * allow to change capQ parameter in calmd
+
+------------------------------------------------------------------------
+r738 | jmarshall | 2010-09-22 11:15:33 -0400 (Wed, 22 Sep 2010) | 13 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/sam_view.c
+
+When bam_read1() returns an error (return value <= -2), propagate that error
+to bam_iter_read()'s own return value. Similarly, also propagate it up to
+bam_fetch()'s return value. Previously bam_fetch() always returned 0, and
+callers ignored its return value anyway. With this change, 0 continues to
+indicate success, while <= -2 (which can be written as < 0, as -1 is never
+returned) indicates corrupted input.
+
+bam_iter_read() ought also to propagate errors returned by bam_seek().
+
+main_samview() can now print an error message and fail when bam_fetch()
+detects that a .bai index file is corrupted or otherwise does not correspond
+to the .bam file it is being used with.
+
+------------------------------------------------------------------------
+r737 | jmarshall | 2010-09-22 10:47:42 -0400 (Wed, 22 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_index.c
+
+0 is a successful return value from bam_read1(). (In practice, it never
+returns 0 anyway; but all the other callers treat 0 as successful.)
+
+------------------------------------------------------------------------
+r736 | lh3lh3 | 2010-09-20 17:43:08 -0400 (Mon, 20 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam.h
+ M /trunk/samtools/bam_index.c
+ M /trunk/samtools/bam_sort.c
+
+ * merge files region-by-region. work on small examples but more tests are needed.
+
+------------------------------------------------------------------------
+r735 | lh3lh3 | 2010-09-20 16:56:24 -0400 (Mon, 20 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+improve qstats by checking the alleles as well
+
+------------------------------------------------------------------------
+r734 | lh3lh3 | 2010-09-17 18:12:13 -0400 (Fri, 17 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+convert UCSC SNP SQL dump to VCF
+
+------------------------------------------------------------------------
+r733 | lh3lh3 | 2010-09-17 13:02:11 -0400 (Fri, 17 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+hapmap2vcf convertor
+
+------------------------------------------------------------------------
+r732 | lh3lh3 | 2010-09-17 10:11:37 -0400 (Fri, 17 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/vcf.c
+
+ * added comments
+ * VCF->BCF is not possible without knowing the sequence dictionary before hand...
+
+------------------------------------------------------------------------
+r731 | lh3lh3 | 2010-09-17 09:15:53 -0400 (Fri, 17 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/bcfutils.c
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/vcf.c
+
+ * put n_smpl to "bcf1_t" to simplify API a little
+
+------------------------------------------------------------------------
+r730 | lh3lh3 | 2010-09-16 21:36:01 -0400 (Thu, 16 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/index.c
+
+fixed a bug in indexing
+
+------------------------------------------------------------------------
+r729 | lh3lh3 | 2010-09-16 16:54:48 -0400 (Thu, 16 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam.c
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_pileup.c
+
+ * fixed a bug in capQ
+ * valgrind identifies a use of uninitialised value, but I have not fixed it.
+
+------------------------------------------------------------------------
+r728 | lh3lh3 | 2010-09-16 15:03:59 -0400 (Thu, 16 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bgzip.c
+ M /trunk/samtools/razip.c
+
+ * fixed a bug in razip: -c will delete the input file
+ * copy tabix/bgzip to here
+
+------------------------------------------------------------------------
+r727 | lh3lh3 | 2010-09-16 13:45:49 -0400 (Thu, 16 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-14 (r727)
+ * allow to change the capQ parameter at the command line
+
+------------------------------------------------------------------------
+r726 | lh3lh3 | 2010-09-16 13:38:43 -0400 (Thu, 16 Sep 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+ M /trunk/samtools/misc/samtools.pl
+
+ * added varFilter to vcfutils.pl
+ * reimplement realn(). now it performs a local alignment
+ * added cap_mapQ() to cap mapping quality when there are many substitutions
+
+------------------------------------------------------------------------
+r724 | lh3lh3 | 2010-09-15 00:18:31 -0400 (Wed, 15 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ A /trunk/samtools/bcftools/bcf2qcall.c
+ M /trunk/samtools/bcftools/call1.c
+
+ * convert BCF to QCALL input
+
+------------------------------------------------------------------------
+r723 | lh3lh3 | 2010-09-14 22:41:50 -0400 (Tue, 14 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+
+dynamic band width in realignment
+
+------------------------------------------------------------------------
+r722 | lh3lh3 | 2010-09-14 22:05:32 -0400 (Tue, 14 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_plcmd.c
+
+fixed a bug in realignment
+
+------------------------------------------------------------------------
+r721 | lh3lh3 | 2010-09-14 20:54:09 -0400 (Tue, 14 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/prob1.c
+
+fixed a minor issue
+
+------------------------------------------------------------------------
+r720 | lh3lh3 | 2010-09-14 19:25:10 -0400 (Tue, 14 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_md.c
+
+fixed a bug in realignment
+
+------------------------------------------------------------------------
+r719 | lh3lh3 | 2010-09-14 19:18:24 -0400 (Tue, 14 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+
+minor changes. It is BUGGY now!
+
+------------------------------------------------------------------------
+r718 | lh3lh3 | 2010-09-14 16:32:33 -0400 (Tue, 14 Sep 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bam_md.c
+ M /trunk/samtools/bam_pileup.c
+ M /trunk/samtools/kaln.c
+ M /trunk/samtools/kaln.h
+
+ * aggressive gapped aligner is implemented in calmd.
+ * distinguish gap_open and gap_end_open in banded alignment
+ * make tview accepts alignment with heading and tailing D
+
+------------------------------------------------------------------------
+r717 | jmarshall | 2010-09-14 09:04:28 -0400 (Tue, 14 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools
+
+Add svn:ignore properties for generated files that don't appear in "make all".
+
+------------------------------------------------------------------------
+r716 | jmarshall | 2010-09-13 08:37:53 -0400 (Mon, 13 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools
+ M /trunk/samtools/bcftools
+ M /trunk/samtools/misc
+
+Add svn:ignore properties listing the generated files.
+(Except for *.o, which we'll assume is in global-ignores.)
+
+------------------------------------------------------------------------
+r715 | lh3lh3 | 2010-09-08 12:53:55 -0400 (Wed, 08 Sep 2010) | 5 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/sample.c
+ M /trunk/samtools/sample.h
+
+ * samtools-0.1.8-13 (r715)
+ * fixed a bug in identifying SM across files
+ * bcftools: estimate heterozygosity
+ * bcftools: allow to skip sites without reference bases
+
+------------------------------------------------------------------------
+r713 | lh3lh3 | 2010-09-03 17:19:12 -0400 (Fri, 03 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/prob1.h
+
+quite a lot changes to the contrast caller, but I still feel something is missing...
+
+------------------------------------------------------------------------
+r711 | lh3lh3 | 2010-09-03 00:30:48 -0400 (Fri, 03 Sep 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+ * changed 3.434 to 4.343 (typo!)
+ * fixed a bug in the contrast caller
+ * calculate heterozygosity
+
+------------------------------------------------------------------------
+r710 | lh3lh3 | 2010-09-01 23:24:47 -0400 (Wed, 01 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/bcfutils.c
+ M /trunk/samtools/bcftools/call1.c
+
+SNP calling from the GL field
+
+------------------------------------------------------------------------
+r709 | lh3lh3 | 2010-09-01 18:52:30 -0400 (Wed, 01 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcf.c
+
+fixed another problem
+
+------------------------------------------------------------------------
+r708 | lh3lh3 | 2010-09-01 18:31:17 -0400 (Wed, 01 Sep 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/vcf.c
+
+ * fixed bugs in parsing VCF
+ * parser now works with GT/GQ/DP/PL/GL
+
+------------------------------------------------------------------------
+r707 | lh3lh3 | 2010-09-01 15:28:29 -0400 (Wed, 01 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ M /trunk/samtools/bcftools/prob1.c
+
+Do not compile _BCF_QUAD by default
+
+------------------------------------------------------------------------
+r706 | lh3lh3 | 2010-09-01 15:21:41 -0400 (Wed, 01 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/bcfutils.c
+ M /trunk/samtools/bcftools/call1.c
+
+Write the correct ALT and PL in the SNP calling mode.
+
+------------------------------------------------------------------------
+r705 | lh3lh3 | 2010-09-01 12:50:33 -0400 (Wed, 01 Sep 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfutils.pl
+
+more commands for my own uses
+
+------------------------------------------------------------------------
+r704 | lh3lh3 | 2010-09-01 09:26:10 -0400 (Wed, 01 Sep 2010) | 2 lines
+Changed paths:
+ A /trunk/samtools/bcftools/vcfutils.pl
+
+Utilities for processing VCF
+
+------------------------------------------------------------------------
+r703 | lh3lh3 | 2010-08-31 16:44:57 -0400 (Tue, 31 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/prob1.h
+
+preliminary contrast variant caller
+
+------------------------------------------------------------------------
+r702 | lh3lh3 | 2010-08-31 12:28:39 -0400 (Tue, 31 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/call1.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/prob1.h
+
+z' and z'' can be calculated
+
+------------------------------------------------------------------------
+r701 | lh3lh3 | 2010-08-31 10:20:57 -0400 (Tue, 31 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ A /trunk/samtools/bcftools/call1.c (from /trunk/samtools/bcftools/vcfout.c:699)
+ M /trunk/samtools/bcftools/prob1.c
+ D /trunk/samtools/bcftools/vcfout.c
+
+ * rename vcfout.c as call1.c
+ * prepare to add two-sample comparison
+
+------------------------------------------------------------------------
+r699 | lh3lh3 | 2010-08-24 15:28:16 -0400 (Tue, 24 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfout.c
+
+fixed a bug in calculating the t statistics
+
+------------------------------------------------------------------------
+r698 | lh3lh3 | 2010-08-24 14:05:50 -0400 (Tue, 24 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcftools/kfunc.c
+ M /trunk/samtools/bcftools/vcfout.c
+
+ * samtools-0.1.8-13 (r698)
+ * perform one-tailed t-test for baseQ, mapQ and endDist
+
+------------------------------------------------------------------------
+r697 | lh3lh3 | 2010-08-24 12:30:13 -0400 (Tue, 24 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/kfunc.c
+
+added regularized incomplete beta function
+
+------------------------------------------------------------------------
+r695 | lh3lh3 | 2010-08-23 17:36:17 -0400 (Mon, 23 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_plcmd.c
+
+change the default correlation coefficient
+
+------------------------------------------------------------------------
+r694 | lh3lh3 | 2010-08-23 14:46:52 -0400 (Mon, 23 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/vcfout.c
+
+print QUAL as floating numbers
+
+------------------------------------------------------------------------
+r693 | lh3lh3 | 2010-08-23 14:06:07 -0400 (Mon, 23 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/examples/Makefile
+ A /trunk/samtools/sample.c
+ A /trunk/samtools/sample.h
+
+ * samtools-0.1.8-12 (r692)
+ * group data by samples in "mpileup -g"
+
+------------------------------------------------------------------------
+r692 | lh3lh3 | 2010-08-23 10:58:53 -0400 (Mon, 23 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ D /trunk/samtools/bam_mcns.c
+ D /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+
+remove VCF output in mpileup
+
+------------------------------------------------------------------------
+r691 | lh3lh3 | 2010-08-23 10:48:20 -0400 (Mon, 23 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+
+ * use the revised MAQ error model for mpileup
+ * prepare to remove the independent model from mpileup
+
+------------------------------------------------------------------------
+r690 | lh3lh3 | 2010-08-20 15:46:40 -0400 (Fri, 20 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam_maqcns.c
+ M /trunk/samtools/bam_maqcns.h
+ M /trunk/samtools/bam_plcmd.c
+ A /trunk/samtools/errmod.c
+ A /trunk/samtools/errmod.h
+ M /trunk/samtools/ksort.h
+
+added revised MAQ error model
+
+------------------------------------------------------------------------
+r689 | lh3lh3 | 2010-08-18 09:55:20 -0400 (Wed, 18 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/prob1.h
+ M /trunk/samtools/bcftools/vcfout.c
+
+allow to read the prior from the error output. EM iteration is working.
+
+------------------------------------------------------------------------
+r688 | lh3lh3 | 2010-08-17 12:12:20 -0400 (Tue, 17 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/main.c
+ M /trunk/samtools/bcftools/vcf.c
+
+ * write a little more VCF header
+ * concatenate BCFs
+
+------------------------------------------------------------------------
+r687 | lh3lh3 | 2010-08-16 20:53:16 -0400 (Mon, 16 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/bcf.tex
+
+use float for QUAL
+
+------------------------------------------------------------------------
+r686 | lh3lh3 | 2010-08-14 00:11:13 -0400 (Sat, 14 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/prob1.c
+
+faster for large sample size (in principle)
+
+------------------------------------------------------------------------
+r685 | lh3lh3 | 2010-08-13 23:28:31 -0400 (Fri, 13 Aug 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bcftools/prob1.c
+
+ * a numerically stable method to calculate z_{jk}
+ * currently slower than the old method but will be important for large sample size
+ * in principle, we can speed up for large n, but have not tried
+
+------------------------------------------------------------------------
+r684 | lh3lh3 | 2010-08-11 21:58:31 -0400 (Wed, 11 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfout.c
+
+fixed an issue in parsing integer
+
+------------------------------------------------------------------------
+r683 | lh3lh3 | 2010-08-09 13:05:07 -0400 (Mon, 09 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+
+do not print refname if file is converted from VCF
+
+------------------------------------------------------------------------
+r682 | lh3lh3 | 2010-08-09 12:59:47 -0400 (Mon, 09 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcf.c
+
+ * parse PL
+ * fixed a bug in parsing VCF
+
+------------------------------------------------------------------------
+r681 | lh3lh3 | 2010-08-09 12:49:23 -0400 (Mon, 09 Aug 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/bcfutils.c
+ M /trunk/samtools/bcftools/main.c
+ M /trunk/samtools/bcftools/vcf.c
+ M /trunk/samtools/bcftools/vcfout.c
+ M /trunk/samtools/bgzf.c
+ M /trunk/samtools/kstring.c
+
+ * fixed a bug in kstrtok@kstring.c
+ * preliminary VCF parser (not parse everything for now)
+ * improved view interface
+
+------------------------------------------------------------------------
+r680 | lh3lh3 | 2010-08-09 10:43:13 -0400 (Mon, 09 Aug 2010) | 4 lines
+Changed paths:
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/vcfout.c
+ M /trunk/samtools/kstring.c
+ M /trunk/samtools/kstring.h
+
+ * improved kstring (added kstrtok)
+ * removed the limit on the format string length in bcftools
+ * use kstrtok to parse format which fixed a bug in the old code
+
+------------------------------------------------------------------------
+r679 | lh3lh3 | 2010-08-09 01:12:05 -0400 (Mon, 09 Aug 2010) | 2 lines
+Changed paths:
+ A /trunk/samtools/bcftools/README
+ M /trunk/samtools/bcftools/vcfout.c
+
+help messages
+
+------------------------------------------------------------------------
+r678 | lh3lh3 | 2010-08-09 00:01:52 -0400 (Mon, 09 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcfout.c
+
+perform single-tail test for ED4
+
+------------------------------------------------------------------------
+r677 | lh3lh3 | 2010-08-08 23:48:35 -0400 (Sun, 08 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ M /trunk/samtools/bcftools/kfunc.c
+ M /trunk/samtools/bcftools/vcfout.c
+
+ * test depth, end distance and HWE
+
+------------------------------------------------------------------------
+r676 | lh3lh3 | 2010-08-08 02:04:15 -0400 (Sun, 08 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/kfunc.c
+
+reimplement incomplete gamma functions. no copy-paste
+
+------------------------------------------------------------------------
+r675 | lh3lh3 | 2010-08-06 22:42:54 -0400 (Fri, 06 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bcftools/fet.c
+ M /trunk/samtools/bcftools/prob1.c
+ M /trunk/samtools/bcftools/prob1.h
+ M /trunk/samtools/bcftools/vcfout.c
+
+ * bcftools: add HWE (no testing for now)
+ * record end dist in a 2x2 table, not avg, std any more
+
+------------------------------------------------------------------------
+r674 | lh3lh3 | 2010-08-06 17:30:16 -0400 (Fri, 06 Aug 2010) | 3 lines
+Changed paths:
+ A /trunk/samtools/bcftools/kfunc.c
+
+ * Special functions: log(gamma()), erfc(), P(a,x) (incomplete gamma)
+ * Not using Numerical Recipe due to licensing issues
+
+------------------------------------------------------------------------
+r673 | lh3lh3 | 2010-08-05 23:46:53 -0400 (Thu, 05 Aug 2010) | 2 lines
+Changed paths:
+ A /trunk/samtools/bcftools/fet.c
+
+Fisher's exact test
+
+------------------------------------------------------------------------
+r672 | lh3lh3 | 2010-08-05 21:48:33 -0400 (Thu, 05 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/examples/Makefile
+
+ * samtools-0.1.8-11 (r672)
+ * collect more stats for allele balance test in bcftools (not yet)
+
+------------------------------------------------------------------------
+r671 | lh3lh3 | 2010-08-05 16:17:58 -0400 (Thu, 05 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/main.c
+
+ * the code base is stablized again.
+ * I will delay the vcf parser, which is quite complicated but with little value for now
+
+------------------------------------------------------------------------
+r670 | lh3lh3 | 2010-08-05 16:03:23 -0400 (Thu, 05 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/examples/Makefile
+
+minor
+
+------------------------------------------------------------------------
+r669 | lh3lh3 | 2010-08-05 16:03:08 -0400 (Thu, 05 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcftools/vcf.c
+
+unfinished vcf parser
+
+------------------------------------------------------------------------
+r668 | lh3lh3 | 2010-08-05 15:46:40 -0400 (Thu, 05 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bcftools/Makefile
+ M /trunk/samtools/bcftools/bcf.c
+ M /trunk/samtools/bcftools/bcf.h
+ M /trunk/samtools/bcftools/bcfutils.c
+ M /trunk/samtools/bcftools/index.c
+ M /trunk/samtools/bcftools/main.c
+ A /trunk/samtools/bcftools/vcf.c
+ M /trunk/samtools/bcftools/vcfout.c
+
+ * added prelimiary VCF parser (not finished)
+ * change struct a bit
+
+------------------------------------------------------------------------
+r667 | lh3lh3 | 2010-08-03 22:35:27 -0400 (Tue, 03 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bcftools/bcf.c
+
+ * allow to set min base q
+ * fixed a bug in mpileup -u
+
+------------------------------------------------------------------------
+r666 | lh3lh3 | 2010-08-03 22:08:44 -0400 (Tue, 03 Aug 2010) | 2 lines
+Changed paths:
+ A /trunk/samtools/bcftools/bcf.tex
+
+spec
+
+------------------------------------------------------------------------
+r665 | lh3lh3 | 2010-08-03 21:18:57 -0400 (Tue, 03 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/examples/Makefile
+
+added more examples
+
+------------------------------------------------------------------------
+r664 | lh3lh3 | 2010-08-03 21:13:00 -0400 (Tue, 03 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bcftools/Makefile
+
+fixed compilation error
+
+------------------------------------------------------------------------
+r662 | lh3lh3 | 2010-08-03 21:04:00 -0400 (Tue, 03 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ D /trunk/samtools/bcf.c
+ D /trunk/samtools/bcf.h
+ A /trunk/samtools/bcftools
+ A /trunk/samtools/bcftools/Makefile
+ A /trunk/samtools/bcftools/bcf.c
+ A /trunk/samtools/bcftools/bcf.h
+ A /trunk/samtools/bcftools/bcfutils.c
+ A /trunk/samtools/bcftools/index.c
+ A /trunk/samtools/bcftools/main.c
+ A /trunk/samtools/bcftools/prob1.c
+ A /trunk/samtools/bcftools/prob1.h
+ A /trunk/samtools/bcftools/vcfout.c
+
+move bcftools to samtools
+
+------------------------------------------------------------------------
+r660 | lh3lh3 | 2010-08-03 15:58:32 -0400 (Tue, 03 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+
+fixed another minor bug
+
+------------------------------------------------------------------------
+r658 | lh3lh3 | 2010-08-03 15:06:45 -0400 (Tue, 03 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/bcf.c
+
+ * samtools-0.1.8-10 (r658)
+ * fixed a bug in bam2bcf when the reference is N
+
+------------------------------------------------------------------------
+r657 | lh3lh3 | 2010-08-03 14:50:23 -0400 (Tue, 03 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+
+ * fixed a bug
+ * treat ambiguous ref base as the fifth base
+
+------------------------------------------------------------------------
+r654 | lh3lh3 | 2010-08-02 17:38:27 -0400 (Mon, 02 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/bcftools/bcf.c
+ M /trunk/samtools/bcf.c
+
+missing a column in VCF output...
+
+------------------------------------------------------------------------
+r653 | lh3lh3 | 2010-08-02 17:31:33 -0400 (Mon, 02 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcf.c
+
+fixed a memory leak
+
+------------------------------------------------------------------------
+r651 | lh3lh3 | 2010-08-02 17:27:31 -0400 (Mon, 02 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bcf.c
+
+fixed a bug in bcf reader
+
+------------------------------------------------------------------------
+r650 | lh3lh3 | 2010-08-02 17:00:41 -0400 (Mon, 02 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam2bcf.c
+
+fixed a bug
+
+------------------------------------------------------------------------
+r649 | lh3lh3 | 2010-08-02 16:49:35 -0400 (Mon, 02 Aug 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ M /trunk/samtools/bam2bcf.c
+ M /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-9 (r649)
+ * lossless representation of PL in BCF output
+
+------------------------------------------------------------------------
+r648 | lh3lh3 | 2010-08-02 16:07:25 -0400 (Mon, 02 Aug 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ A /trunk/samtools/bam2bcf.c
+ A /trunk/samtools/bam2bcf.h
+ M /trunk/samtools/bam_plcmd.c
+ A /trunk/samtools/bcf.c
+ A /trunk/samtools/bcf.h
+
+Generate binary VCF
+
+------------------------------------------------------------------------
+r644 | lh3lh3 | 2010-07-28 11:59:19 -0400 (Wed, 28 Jul 2010) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-8 (r644)
+ * mpileup becomes a little stable again
+ * the method is slightly different, but is more theoretically correct
+ * snp calling is O(n^2) instead of O(n^3)
+
+------------------------------------------------------------------------
+r643 | lh3lh3 | 2010-07-28 11:54:15 -0400 (Wed, 28 Jul 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+
+ * fixed a STUPID bug, which cost me a lot of time.
+ * I am going to clean up mcns a little bit
+
+------------------------------------------------------------------------
+r642 | lh3lh3 | 2010-07-27 23:23:07 -0400 (Tue, 27 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+
+supposedly this is THE correct implementation, but more testing is needed
+
+------------------------------------------------------------------------
+r641 | lh3lh3 | 2010-07-27 22:43:39 -0400 (Tue, 27 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+
+NOT ready yet. Going to make further changes...
+
+------------------------------------------------------------------------
+r639 | lh3lh3 | 2010-07-25 22:18:38 -0400 (Sun, 25 Jul 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-7 (r639)
+ * fixed the reference allele assignment
+
+------------------------------------------------------------------------
+r638 | lh3lh3 | 2010-07-25 12:01:26 -0400 (Sun, 25 Jul 2010) | 5 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-6 (r638)
+ * skip isnan/isinf in case of float underflow
+ * added the flat prior
+ * fixed an issue where there are no reads supporting the reference
+
+------------------------------------------------------------------------
+r637 | lh3lh3 | 2010-07-24 14:16:27 -0400 (Sat, 24 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+
+minor changes
+
+------------------------------------------------------------------------
+r636 | lh3lh3 | 2010-07-24 14:07:27 -0400 (Sat, 24 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+minor tweaks
+
+------------------------------------------------------------------------
+r635 | lh3lh3 | 2010-07-24 01:49:49 -0400 (Sat, 24 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+
+posterior expectation FINALLY working. I am so tired...
+
+------------------------------------------------------------------------
+r633 | lh3lh3 | 2010-07-23 13:50:48 -0400 (Fri, 23 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+
+another minor fix to mpileup
+
+------------------------------------------------------------------------
+r632 | lh3lh3 | 2010-07-23 13:43:31 -0400 (Fri, 23 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+
+added the format column
+
+------------------------------------------------------------------------
+r631 | lh3lh3 | 2010-07-23 13:25:44 -0400 (Fri, 23 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+added an alternative prior
+
+------------------------------------------------------------------------
+r628 | lh3lh3 | 2010-07-23 11:48:51 -0400 (Fri, 23 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+
+calculate posterior allele frequency
+
+------------------------------------------------------------------------
+r627 | lh3lh3 | 2010-07-22 21:39:13 -0400 (Thu, 22 Jul 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-3 (r627)
+ * multi-sample snp calling appears to work. More tests needed.
+
+------------------------------------------------------------------------
+r626 | lh3lh3 | 2010-07-22 16:37:56 -0400 (Thu, 22 Jul 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bam_tview.c
+
+ * preliminary multisample SNP caller.
+ * something looks not so right, but it largely works
+
+------------------------------------------------------------------------
+r617 | lh3lh3 | 2010-07-14 16:26:27 -0400 (Wed, 14 Jul 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_mcns.c
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+
+ * samtools-0.1.8-2 (r617)
+ * allele frequency calculation apparently works...
+
+------------------------------------------------------------------------
+r616 | lh3lh3 | 2010-07-14 13:33:51 -0400 (Wed, 14 Jul 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/Makefile
+ A /trunk/samtools/bam_mcns.c
+ A /trunk/samtools/bam_mcns.h
+ M /trunk/samtools/bam_plcmd.c
+
+ * added mutli-sample framework. It is not working, yet.
+ * improved the mpileup interface
+
+------------------------------------------------------------------------
+r615 | lh3lh3 | 2010-07-13 14:50:12 -0400 (Tue, 13 Jul 2010) | 3 lines
+Changed paths:
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/misc/Makefile
+
+ * samtools-0.1.8-1 (r615)
+ * allow to get mpileup at required sites
+
+------------------------------------------------------------------------
+r613 | lh3lh3 | 2010-07-11 22:40:56 -0400 (Sun, 11 Jul 2010) | 2 lines
+Changed paths:
+ M /trunk/samtools/ChangeLog
+ M /trunk/samtools/NEWS
+ M /trunk/samtools/bam_plcmd.c
+ M /trunk/samtools/bamtk.c
+ M /trunk/samtools/samtools.1
+
+Release samtools-0.1.8
+
------------------------------------------------------------------------
r612 | lh3lh3 | 2010-07-11 21:08:56 -0400 (Sun, 11 Jul 2010) | 2 lines
Changed paths:
KNETFILE_O= knetfile.o
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o \
- $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o
+ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o kprobaln.o
AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
- bamtk.o kaln.o
+ bamtk.o kaln.o bam2bcf.o bam2bcf_indel.o errmod.o sample.o
PROG= samtools
-INCLUDES=
-SUBDIRS= . misc
+INCLUDES= -I.
+SUBDIRS= . bcftools misc
LIBPATH=
LIBCURSES= -lcurses # -lXCurses
libbam.a:$(LOBJS)
$(AR) -cru $@ $(LOBJS)
-samtools:$(AOBJS) libbam.a
- $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz
+samtools:lib-recur $(AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz -Lbcftools -lbcf
razip:razip.o razf.o $(KNETFILE_O)
$(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz
sam.o:sam.h bam.h
bam_import.o:bam.h kseq.h khash.h razf.h
bam_pileup.o:bam.h razf.h ksort.h
-bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h
+bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bcftools/bcf.h bam2bcf.h
bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
bam_lpileup.o:bam.h ksort.h
bam_tview.o:bam.h faidx.h bam_maqcns.h
-bam_maqcns.o:bam.h ksort.h bam_maqcns.h
+bam_maqcns.o:bam.h ksort.h bam_maqcns.h kaln.h
bam_sort.o:bam.h ksort.h razf.h
bam_md.o:bam.h faidx.h
glf.o:glf.h
sam_header.o:sam_header.h khash.h
+bcf.o:bcftools/bcf.h
+bam2bcf.o:bam2bcf.h errmod.h bcftools/bcf.h
+bam2bcf_indel.o:bam2bcf.h
+errmod.o:errmod.h
faidx.o:faidx.h razf.h khash.h
faidx_main.o:faidx.h razf.h
+Beta release 0.1.12a (2 December, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is another bug fix release:
+
+ * Fixed a memory violation in mpileup, which causes segfault. Release
+ 0.1.9 and above are affected.
+
+ * Fixed a memory violation in the indel caller, which does not causes
+ segfault, but may potentially affect deletion calls in an unexpected
+ way. Release 0.1.10 and above are affected.
+
+ * Fixed a bug in computing r-square in bcftools. Few are using this
+ functionality and it only has minor effect.
+
+ * Fixed a memory leak in bam_fetch().
+
+ * Fixed a bug in writing meta information to the BAM index for the last
+ sequence. This bug is invisible to most users, but it is a bug anyway.
+
+ * Fixed a bug in bcftools which causes false "DP4=0,0,0,0" annotations.
+
+(0.1.12: 2 December 2010, r862)
+
+
+
+Beta release 0.1.11 (21 November, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is mainly a bug fix release:
+
+ * Fixed a bug in random retrieval (since 0.1.8). It occurs when reads
+ are retrieved from a small region containing no reads.
+
+ * Fixed a bug in pileup (since 0.1.9). The bug causes an assertion
+ failure when the first CIGAR operation is a deletion.
+
+ * Improved fault tolerence in remote access.
+
+One minor feature has been implemented in bcftools:
+
+ * Added a reference-free variant calling mode. In this mode, a site is
+ regarded as a variat iff the sample(s) contains two or more alleles;
+ the meaning of the QUAL field in the VCF output is changed
+ accordingly. Effectively, the reference allele is irrelevant to the
+ result in the new mode, although the reference sequence has to be
+ used in realignment when SAMtools computes genotype likelihoods.
+
+In addition, since 0.1.10, the `pileup' command has been deprecated by
+`mpileup' which is more powerful and more accurate. The `pileup' command
+will not be removed in the next few releases, but new features will not
+be added.
+
+(0.1.11: 21 November 2010, r851)
+
+
+
+Beta Release 0.1.10 (16 November, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release is featured as the first major improvement to the indel
+caller. The method is similar to the old one implemented in the pileup
+command, but the details are handled more carefully both in theory and
+in practice. As a result, the new indel caller usually gives more
+accurate indel calls, though at the cost of sensitivity. The caller is
+implemented in the mpileup command and is invoked by default. It works
+with multiple samples.
+
+Other notable changes:
+
+ * With the -r option, the calmd command writes the difference between
+ the original base quality and the BAQ capped base quality at the BQ
+ tag but does not modify the base quality. Please use -Ar to overwrite
+ the original base quality (the 0.1.9 behavior).
+
+ * Allow to set a maximum per-sample read depth to reduce memory. In
+ 0.1.9, most of memory is wasted for the ultra high read depth in some
+ regions (e.g. the chr1 centromere).
+
+ * Optionally write per-sample read depth and per-sample strand bias
+ P-value.
+
+ * Compute equal-tail (Bayesian) credible interval of site allele
+ frequency at the CI95 VCF annotation.
+
+ * Merged the vcfutils.pl varFilter and filter4vcf for better SNP/indel
+ filtering.
+
+(0.1.10: 16 November 2010, r829)
+
+
+
+Beta Release 0.1.9 (27 October, 2010)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This release is featured as the first major improvement to the samtools'
+SNP caller. It comes with a revised MAQ error model, the support of
+multi-sample SNP calling and the computation of base alignment quality
+(BAQ).
+
+The revised MAQ error model is based on the original model. It solves an
+issue of miscalling SNPs in repetitive regions. Althought such SNPs can
+usually be filtered at a later step, they mess up unfiltered calls. This
+is a theoretical flaw in the original model. The revised MAQ model
+deprecates the orginal MAQ model and the simplified SOAPsnp model.
+
+Multi-sample SNP calling is separated in two steps. The first is done by
+samtools mpileup and the second by a new program, bcftools, which is
+included in the samtools source code tree. Multi-sample SNP calling also
+works for single sample and has the advantage of enabling more powerful
+filtration. It is likely to deprecate pileup in future once a proper
+indel calling method is implemented.
+
+BAQ is the Phred-scaled probability of a read base being wrongly
+aligned. Capping base quality by BAQ has been shown to be very effective
+in suppressing false SNPs caused by misalignments around indels or in
+low-complexity regions with acceptable compromise on computation
+time. This strategy is highly recommended and can be used with other SNP
+callers as well.
+
+In addition to the three major improvements, other notable changes are:
+
+ * Changes to the pileup format. A reference skip (the N CIGAR operator)
+ is shown as '<' or '>' depending on the strand. Tview is also changed
+ accordingly.
+
+ * Accelerated pileup. The plain pileup is about 50% faster.
+
+ * Regional merge. The merge command now accepts a new option to merge
+ files in a specified region.
+
+ * Fixed a bug in bgzip and razip which causes source files to be
+ deleted even if option -c is applied.
+
+ * In APIs, propogate errors to downstream callers and make samtools
+ return non-zero values once errors occur.
+
+(0.1.9: 27 October 2010, r783)
+
+
+
Beta Release 0.1.8 (11 July, 2010)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
kputc('\t', &str);
}
if (c->tid < 0) kputsn("*\t", 2, &str);
- else { kputs(header->target_name[c->tid], &str); kputc('\t', &str); }
+ else {
+ if (header) kputs(header->target_name[c->tid] , &str);
+ else kputw(c->tid, &str);
+ kputc('\t', &str);
+ }
kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str);
if (c->n_cigar == 0) kputc('*', &str);
else {
kputc('\t', &str);
if (c->mtid < 0) kputsn("*\t", 2, &str);
else if (c->mtid == c->tid) kputsn("=\t", 2, &str);
- else { kputs(header->target_name[c->mtid], &str); kputc('\t', &str); }
+ else {
+ if (header) kputs(header->target_name[c->mtid], &str);
+ else kputw(c->mtid, &str);
+ kputc('\t', &str);
+ }
kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str);
if (c->l_qseq) {
for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str);
free(s);
}
+int bam_validate1(const bam_header_t *header, const bam1_t *b)
+{
+ char *s;
+
+ if (b->core.tid < -1 || b->core.mtid < -1) return 0;
+ if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) return 0;
+
+ if (b->data_len < b->core.l_qname) return 0;
+ s = memchr(bam1_qname(b), '\0', b->core.l_qname);
+ if (s != &bam1_qname(b)[b->core.l_qname-1]) return 0;
+
+ // FIXME: Other fields could also be checked, especially the auxiliary data
+
+ return 1;
+}
+
// FIXME: we should also check the LB tag associated with each alignment
const char *bam_get_library(bam_header_t *h, const bam1_t *b)
{
/* The MIT License
- Copyright (c) 2008 Genome Research Ltd (GRL).
+ Copyright (c) 2008-2010 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
@param b pointer to an alignment
@return pointer to quality string
*/
-#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + ((b)->core.l_qseq + 1)/2)
+#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1))
/*! @function
@abstract Get a base on read
char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
+ /*!
+ @abstract Check whether a BAM record is plausibly valid
+ @param header associated header structure, or NULL if unavailable
+ @param b alignment to validate
+ @return 0 if the alignment is invalid; non-zero otherwise
+
+ @discussion Simple consistency check of some of the fields of the
+ alignment record. If the header is provided, several additional checks
+ are made. Not all fields are checked, so a non-zero result is not a
+ guarantee that the record is valid. However it is usually good enough
+ to detect when bam_seek() has been called with a virtual file offset
+ that is not the offset of an alignment record.
+ */
+ int bam_validate1(const bam_header_t *header, const bam1_t *b);
+
const char *bam_get_library(bam_header_t *header, const bam1_t *b);
bam1_t *b;
int32_t qpos;
int indel, level;
- uint32_t is_del:1, is_head:1, is_tail:1;
+ uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28;
} bam_pileup1_t;
typedef int (*bam_plp_auto_f)(void *data, bam1_t *b);
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp);
void bam_plp_set_mask(bam_plp_t iter, int mask);
+ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt);
void bam_plp_reset(bam_plp_t iter);
void bam_plp_destroy(bam_plp_t iter);
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data);
void bam_mplp_destroy(bam_mplp_t iter);
+ void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt);
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp);
/*! @typedef
{
uint8_t *data = bdst->data;
int m_data = bdst->m_data; // backup data and m_data
- if (m_data < bsrc->m_data) { // double the capacity
- m_data = bsrc->m_data; kroundup32(m_data);
+ if (m_data < bsrc->data_len) { // double the capacity
+ m_data = bsrc->data_len; kroundup32(m_data);
data = (uint8_t*)realloc(data, m_data);
}
memcpy(data, bsrc->data, bsrc->data_len); // copy var-len data
}
if (save_tid >= 0) {
insert_offset(idx->index[save_tid], save_bin, save_off, bam_tell(fp));
- insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, off_end);
+ insert_offset(idx->index[save_tid], BAM_MAX_BIN, off_beg, bam_tell(fp));
insert_offset(idx->index[save_tid], BAM_MAX_BIN, n_mapped, n_unmapped);
}
merge_chunks(idx);
}
}
free(bins);
+ if (n_off == 0) {
+ free(off); return iter;
+ }
{
bam1_t *b = (bam1_t*)calloc(1, sizeof(bam1_t));
int l;
int bam_iter_read(bamFile fp, bam_iter_t iter, bam1_t *b)
{
- if (iter->finished) return -1;
- if (iter->from_first) {
- int ret = bam_read1(fp, b);
- if (ret < 0) iter->finished = 1;
+ int ret;
+ if (iter && iter->finished) return -1;
+ if (iter == 0 || iter->from_first) {
+ ret = bam_read1(fp, b);
+ if (ret < 0 && iter) iter->finished = 1;
return ret;
}
if (iter->off == 0) return -1;
for (;;) {
- int ret;
if (iter->curr_off == 0 || iter->curr_off >= iter->off[iter->i].v) { // then jump to the next chunk
- if (iter->i == iter->n_off - 1) break; // no more chunks
+ if (iter->i == iter->n_off - 1) { ret = -1; break; } // no more chunks
if (iter->i >= 0) assert(iter->curr_off == iter->off[iter->i].v); // otherwise bug
if (iter->i < 0 || iter->off[iter->i].v != iter->off[iter->i+1].u) { // not adjacent chunks; then seek
bam_seek(fp, iter->off[iter->i+1].u, SEEK_SET);
}
++iter->i;
}
- if ((ret = bam_read1(fp, b)) > 0) {
+ if ((ret = bam_read1(fp, b)) >= 0) {
iter->curr_off = bam_tell(fp);
- if (b->core.tid != iter->tid || b->core.pos >= iter->end) break; // no need to proceed
+ if (b->core.tid != iter->tid || b->core.pos >= iter->end) { // no need to proceed
+ ret = bam_validate1(NULL, b)? -1 : -5; // determine whether end of region or error
+ break;
+ }
else if (is_overlap(iter->beg, iter->end, b)) return ret;
- } else break; // end of file
+ } else break; // end of file or error
}
iter->finished = 1;
- return -1;
+ return ret;
}
int bam_fetch(bamFile fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func)
{
+ int ret;
bam_iter_t iter;
bam1_t *b;
b = bam_init1();
iter = bam_iter_query(idx, tid, beg, end);
- while (bam_iter_read(fp, iter, b) >= 0) func(b, data);
+ while ((ret = bam_iter_read(fp, iter, b)) >= 0) func(b, data);
+ bam_iter_destroy(iter);
bam_destroy1(b);
- return 0;
+ return (ret == -1)? 0 : ret;
}
#include "bam.h"
#include "bam_maqcns.h"
#include "ksort.h"
+#include "errmod.h"
#include "kaln.h"
KSORT_INIT_GENERIC(uint32_t)
typedef struct __bmc_aux_t {
int max;
uint32_t *info;
+ uint16_t *info16;
+ errmod_t *em;
} bmc_aux_t;
typedef struct {
float esum[4], fsum[4];
uint32_t c[4];
- uint32_t rms_mapQ;
} glf_call_aux_t;
char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
for (n1 = 0; n1 < 256; ++n1) {
for (n2 = 0; n2 < 256; ++n2) {
long double sum = 0.0;
- double lC = aa->is_soap? 0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1); // \binom{n1+n2}{n1}
+ double lC = aa->errmod == BAM_ERRMOD_SOAP? 0 : lgamma(n1+n2+1) - lgamma(n1+1) - lgamma(n2+1);
for (k = 1; k <= aa->n_hap - 1; ++k) {
double pk = 1.0 / k / sum_harmo;
double log1 = log((double)k/aa->n_hap);
long double sum_a[257], b[256], q_c[256], tmp[256], fk2[256];
double *lC;
+ if (aa->errmod == BAM_ERRMOD_MAQ2) return; // no need to do the following
// aa->lhet will be allocated and initialized
free(aa->fk); free(aa->coef);
aa->coef = 0;
aa->fk[n] = pow(aa->theta, n) * (1.0 - aa->eta) + aa->eta;
fk2[n] = aa->fk[n>>1]; // this is an approximation, assuming reads equally likely come from both strands
}
- if (aa->is_soap) return;
+ if (aa->errmod == BAM_ERRMOD_SOAP) return;
aa->coef = (double*)calloc(256*256*64, sizeof(double));
lC = (double*)calloc(256 * 256, sizeof(double));
for (n = 1; n != 256; ++n)
bm = (bam_maqcns_t*)calloc(1, sizeof(bam_maqcns_t));
bm->aux = (bmc_aux_t*)calloc(1, sizeof(bmc_aux_t));
bm->het_rate = 0.001;
- bm->theta = 0.85;
+ bm->theta = 0.83f;
bm->n_hap = 2;
bm->eta = 0.03;
bm->cap_mapQ = 60;
+ bm->min_baseQ = 13;
return bm;
}
void bam_maqcns_prepare(bam_maqcns_t *bm)
{
+ if (bm->errmod == BAM_ERRMOD_MAQ2) bm->aux->em = errmod_init(1. - bm->theta);
cal_coef(bm); cal_het(bm);
}
void bam_maqcns_destroy(bam_maqcns_t *bm)
{
if (bm == 0) return;
- free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info);
+ free(bm->lhet); free(bm->fk); free(bm->coef); free(bm->aux->info); free(bm->aux->info16);
+ if (bm->aux->em) errmod_destroy(bm->aux->em);
free(bm->aux); free(bm);
}
glf1_t *bam_maqcns_glfgen(int _n, const bam_pileup1_t *pl, uint8_t ref_base, bam_maqcns_t *bm)
{
- glf_call_aux_t *b;
+ glf_call_aux_t *b = 0;
int i, j, k, w[8], c, n;
glf1_t *g = (glf1_t*)calloc(1, sizeof(glf1_t));
float p[16], min_p = 1e30;
bm->aux->max = _n;
kroundup32(bm->aux->max);
bm->aux->info = (uint32_t*)realloc(bm->aux->info, 4 * bm->aux->max);
+ bm->aux->info16 = (uint16_t*)realloc(bm->aux->info16, 2 * bm->aux->max);
}
- for (i = n = 0; i < _n; ++i) {
+ for (i = n = 0, rms = 0; i < _n; ++i) {
const bam_pileup1_t *p = pl + i;
uint32_t q, x = 0, qq;
- if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue;
+ uint16_t y = 0;
+ if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
q = (uint32_t)bam1_qual(p->b)[p->qpos];
+ if (q < bm->min_baseQ) continue;
x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
+ y |= bam1_strand(p->b)<<4;
if (p->b->core.qual < q) q = p->b->core.qual;
+ c = p->b->core.qual < bm->cap_mapQ? p->b->core.qual : bm->cap_mapQ;
+ rms += c * c;
x |= q << 24;
+ y |= q << 5;
qq = bam1_seqi(bam1_seq(p->b), p->qpos);
q = bam_nt16_nt4_table[qq? qq : ref_base];
- if (!p->is_del && q < 4) x |= 1 << 21 | q << 16;
+ if (!p->is_del && !p->is_refskip && q < 4) x |= 1 << 21 | q << 16, y |= q;
+ bm->aux->info16[n] = y;
bm->aux->info[n++] = x;
}
+ rms = (uint8_t)(sqrt((double)rms / n) + .499);
+ if (bm->errmod == BAM_ERRMOD_MAQ2) {
+ errmod_cal(bm->aux->em, n, 4, bm->aux->info16, p);
+ goto goto_glf;
+ }
ks_introsort(uint32_t, n, bm->aux->info);
// generate esum and fsum
b = (glf_call_aux_t*)calloc(1, sizeof(glf_call_aux_t));
for (k = 0; k != 8; ++k) w[k] = 0;
- rms = 0;
for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
uint32_t info = bm->aux->info[j];
- int tmp;
if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
k = info>>16&7;
if (info>>24 > 0) {
if (w[k] < 0xff) ++w[k];
++b->c[k&3];
}
- tmp = (int)(info&0xff) < bm->cap_mapQ? (int)(info&0xff) : bm->cap_mapQ;
- rms += tmp * tmp;
}
- b->rms_mapQ = (uint8_t)(sqrt((double)rms / n) + .499);
// rescale ->c[]
for (j = c = 0; j != 4; ++j) c += b->c[j];
if (c > 255) {
for (j = 0; j != 4; ++j) b->c[j] = (int)(254.0 * b->c[j] / c + 0.5);
for (j = c = 0; j != 4; ++j) c += b->c[j];
}
- if (!bm->is_soap) {
+ if (bm->errmod == BAM_ERRMOD_MAQ) {
// generate likelihood
for (j = 0; j != 4; ++j) {
// homozygous
if (max1 > max2 && (min_k != max_k || min1 + 1.0 > min2))
p[max_k<<2|max_k] = min1 > 1.0? min1 - 1.0 : 0.0;
}
- } else { // apply the SOAP model
+ } else if (bm->errmod == BAM_ERRMOD_SOAP) { // apply the SOAP model
// generate likelihood
for (j = 0; j != 4; ++j) {
float tmp;
}
}
+goto_glf:
// convert necessary information to glf1_t
- g->ref_base = ref_base; g->max_mapQ = b->rms_mapQ;
+ g->ref_base = ref_base; g->max_mapQ = rms;
g->depth = n > 16777215? 16777215 : n;
for (j = 0; j != 4; ++j)
for (k = j; k < 4; ++k)
uint32_t glf2cns(const glf1_t *g, int q_r)
{
- int i, j, k, tmp[16], min = 10000, min2 = 10000, min3 = 10000, min_g = -1, min_g2 = -1;
+ int i, j, k, p[10], ref4;
uint32_t x = 0;
+ ref4 = bam_nt16_nt4_table[g->ref_base];
for (i = k = 0; i < 4; ++i)
for (j = i; j < 4; ++j) {
- tmp[j<<2|i] = -1;
- tmp[i<<2|j] = g->lk[k++] + (i == j? 0 : q_r);
+ int prior = (i == ref4 && j == ref4? 0 : i == ref4 || j == ref4? q_r : q_r + 3);
+ p[k] = (g->lk[k] + prior)<<4 | i<<2 | j;
+ ++k;
}
- for (i = 0; i < 16; ++i) {
- if (tmp[i] < 0) continue;
- if (tmp[i] < min) {
- min3 = min2; min2 = min; min = tmp[i]; min_g2 = min_g; min_g = i;
- } else if (tmp[i] < min2) {
- min3 = min2; min2 = tmp[i]; min_g2 = i;
- } else if (tmp[i] < min3) min3 = tmp[i];
- }
- x = min_g >= 0? (1U<<(min_g>>2&3) | 1U<<(min_g&3)) << 28 : 0xf << 28;
- x |= min_g2 >= 0? (1U<<(min_g2>>2&3) | 1U<<(min_g2&3)) << 24 : 0xf << 24;
- x |= (uint32_t)g->max_mapQ << 16;
- x |= min2 < 10000? (min2 - min < 256? min2 - min : 255) << 8 : 0xff << 8;
- x |= min2 < 10000 && min3 < 10000? (min3 - min2 < 256? min3 - min2 : 255) : 0xff;
+ for (i = 1; i < 10; ++i) // insertion sort
+ for (j = i; j > 0 && p[j] < p[j-1]; --j)
+ k = p[j], p[j] = p[j-1], p[j-1] = k;
+ x = (1u<<(p[0]&3) | 1u<<(p[0]>>2&3)) << 28; // the best genotype
+ x |= (uint32_t)g->max_mapQ << 16; // rms mapQ
+ x |= ((p[1]>>4) - (p[0]>>4) < 256? (p[1]>>4) - (p[0]>>4) : 255) << 8; // consensus Q
+ for (k = 0; k < 10; ++k)
+ if ((p[k]&0xf) == (ref4<<2|ref4)) break;
+ if (k == 10) k = 9;
+ x |= (p[k]>>4) - (p[0]>>4) < 256? (p[k]>>4) - (p[0]>>4) : 255; // snp Q
return x;
}
uint32_t x;
if (n) {
g = bam_maqcns_glfgen(n, pl, 0xf, bm);
- x = glf2cns(g, (int)(bm->q_r + 0.5));
+ x = g->depth == 0? (0xfU<<28 | 0xfU<<24) : glf2cns(g, (int)(bm->q_r + 0.5));
free(g);
} else x = 0xfU<<28 | 0xfU<<24;
return x;
for (i = 0; i < n_types; ++i) {
ka_param_t ap = ka_param_blast;
ap.band_width = 2 * types[n_types - 1] + 2;
- ap.gap_end = 0;
+ ap.gap_end_ext = 0;
// write ref2
for (k = 0, j = left; j <= pos; ++j)
ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
#include "glf.h"
+#define BAM_ERRMOD_MAQ2 0
+#define BAM_ERRMOD_MAQ 1
+#define BAM_ERRMOD_SOAP 2
+
struct __bmc_aux_t;
typedef struct {
float het_rate, theta;
- int n_hap, cap_mapQ, is_soap;
+ int n_hap, cap_mapQ, errmod, min_baseQ;
float eta, q_r;
double *fk, *coef;
#include <assert.h>
#include <string.h>
#include <ctype.h>
+#include <math.h>
#include "faidx.h"
#include "sam.h"
#include "kstring.h"
+#include "kaln.h"
+#include "kprobaln.h"
void bam_fillmd1_core(bam1_t *b, char *ref, int is_equal, int max_nm)
{
bam_fillmd1_core(b, ref, is_equal, 0);
}
+int bam_cap_mapQ(bam1_t *b, char *ref, int thres)
+{
+ uint8_t *seq = bam1_seq(b), *qual = bam1_qual(b);
+ uint32_t *cigar = bam1_cigar(b);
+ bam1_core_t *c = &b->core;
+ int i, x, y, mm, q, len, clip_l, clip_q;
+ double t;
+ if (thres < 0) thres = 40; // set the default
+ mm = q = len = clip_l = clip_q = 0;
+ for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
+ int j, l = cigar[i]>>4, op = cigar[i]&0xf;
+ if (op == BAM_CMATCH) {
+ for (j = 0; j < l; ++j) {
+ int z = y + j;
+ int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
+ if (ref[x+j] == 0) break; // out of boundary
+ if (c2 != 15 && c1 != 15 && qual[z] >= 13) { // not ambiguous
+ ++len;
+ if (c1 && c1 != c2 && qual[z] >= 13) { // mismatch
+ ++mm;
+ q += qual[z] > 33? 33 : qual[z];
+ }
+ }
+ }
+ if (j < l) break;
+ x += l; y += l; len += l;
+ } else if (op == BAM_CDEL) {
+ for (j = 0; j < l; ++j)
+ if (ref[x+j] == 0) break;
+ if (j < l) break;
+ x += l;
+ } else if (op == BAM_CSOFT_CLIP) {
+ for (j = 0; j < l; ++j) clip_q += qual[y+j];
+ clip_l += l;
+ y += l;
+ } else if (op == BAM_CHARD_CLIP) {
+ clip_q += 13 * l;
+ clip_l += l;
+ } else if (op == BAM_CINS) y += l;
+ else if (op == BAM_CREF_SKIP) x += l;
+ }
+ for (i = 0, t = 1; i < mm; ++i)
+ t *= (double)len / (i+1);
+ t = q - 4.343 * log(t) + clip_q / 5.;
+ if (t > thres) return -1;
+ if (t < 0) t = 0;
+ t = sqrt((thres - t) / thres) * thres;
+// fprintf(stderr, "%s %lf %d\n", bam1_qname(b), t, q);
+ return (int)(t + .499);
+}
+
+int bam_prob_realn_core(bam1_t *b, const char *ref, int apply_baq)
+{
+ int k, i, bw, x, y, yb, ye, xb, xe;
+ uint32_t *cigar = bam1_cigar(b);
+ bam1_core_t *c = &b->core;
+ kpa_par_t conf = kpa_par_def;
+ uint8_t *bq = 0, *zq = 0, *qual = bam1_qual(b);
+ if (c->flag & BAM_FUNMAP) return -1; // do nothing
+ // test if BQ or ZQ is present
+ if ((bq = bam_aux_get(b, "BQ")) != 0) ++bq;
+ if ((zq = bam_aux_get(b, "ZQ")) != 0 && *zq == 'Z') ++zq;
+ if (bq && zq) { // remove the ZQ tag
+ bam_aux_del(b, zq-1);
+ zq = 0;
+ }
+ if (bq || zq) {
+ if ((apply_baq && zq) || (!apply_baq && bq)) return -3; // in both cases, do nothing
+ if (bq && apply_baq) { // then convert BQ to ZQ
+ for (i = 0; i < c->l_qseq; ++i)
+ qual[i] = qual[i] + 64 < bq[i]? 0 : qual[i] - ((int)bq[i] - 64);
+ *(bq - 3) = 'Z';
+ } else if (zq && !apply_baq) { // then convert ZQ to BQ
+ for (i = 0; i < c->l_qseq; ++i)
+ qual[i] += (int)zq[i] - 64;
+ *(zq - 3) = 'B';
+ }
+ return 0;
+ }
+ // find the start and end of the alignment
+ x = c->pos, y = 0, yb = ye = xb = xe = -1;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op, l;
+ op = cigar[k]&0xf; l = cigar[k]>>4;
+ if (op == BAM_CMATCH) {
+ if (yb < 0) yb = y;
+ if (xb < 0) xb = x;
+ ye = y + l; xe = x + l;
+ x += l; y += l;
+ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+ else if (op == BAM_CDEL) x += l;
+ else if (op == BAM_CREF_SKIP) return -1; // do nothing if there is a reference skip
+ }
+ // set bandwidth and the start and the end
+ bw = 7;
+ if (abs((xe - xb) - (ye - yb)) > bw)
+ bw = abs((xe - xb) - (ye - yb)) + 3;
+ conf.bw = bw;
+ xb -= yb + bw/2; if (xb < 0) xb = 0;
+ xe += c->l_qseq - ye + bw/2;
+ if (xe - xb - c->l_qseq > bw)
+ xb += (xe - xb - c->l_qseq - bw) / 2, xe -= (xe - xb - c->l_qseq - bw) / 2;
+ { // glocal
+ uint8_t *s, *r, *q, *seq = bam1_seq(b), *bq;
+ int *state;
+ bq = calloc(c->l_qseq + 1, 1);
+ memcpy(bq, qual, c->l_qseq);
+ s = calloc(c->l_qseq, 1);
+ for (i = 0; i < c->l_qseq; ++i) s[i] = bam_nt16_nt4_table[bam1_seqi(seq, i)];
+ r = calloc(xe - xb, 1);
+ for (i = xb; i < xe; ++i)
+ r[i-xb] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[i]]];
+ state = calloc(c->l_qseq, sizeof(int));
+ q = calloc(c->l_qseq, 1);
+ kpa_glocal(r, xe-xb, s, c->l_qseq, qual, &conf, state, q);
+ for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k]&0xf, l = cigar[k]>>4;
+ if (op == BAM_CMATCH) {
+ for (i = y; i < y + l; ++i) {
+ if ((state[i]&3) != 0 || state[i]>>2 != x - xb + (i - y)) bq[i] = 0;
+ else bq[i] = bq[i] < q[i]? bq[i] : q[i];
+ }
+ x += l; y += l;
+ } else if (op == BAM_CSOFT_CLIP || op == BAM_CINS) y += l;
+ else if (op == BAM_CDEL) x += l;
+ }
+ for (i = 0; i < c->l_qseq; ++i) bq[i] = qual[i] - bq[i] + 64; // finalize BQ
+ if (apply_baq) {
+ for (i = 0; i < c->l_qseq; ++i) qual[i] -= bq[i] - 64; // modify qual
+ bam_aux_append(b, "ZQ", 'Z', c->l_qseq + 1, bq);
+ } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq);
+ free(bq); free(s); free(r); free(q); free(state);
+ }
+ return 0;
+}
+
+int bam_prob_realn(bam1_t *b, const char *ref)
+{
+ return bam_prob_realn_core(b, ref, 1);
+}
+
int bam_fillmd(int argc, char *argv[])
{
- int c, is_equal = 0, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm = 0;
+ int c, is_equal, tid = -2, ret, len, is_bam_out, is_sam_in, is_uncompressed, max_nm, is_realn, capQ, apply_baq;
samfile_t *fp, *fpout = 0;
faidx_t *fai;
char *ref = 0, mode_w[8], mode_r[8];
bam1_t *b;
- is_bam_out = is_sam_in = is_uncompressed = 0;
+ is_equal = is_bam_out = is_sam_in = is_uncompressed = is_realn = max_nm = apply_baq = capQ = 0;
mode_w[0] = mode_r[0] = 0;
strcpy(mode_r, "r"); strcpy(mode_w, "w");
- while ((c = getopt(argc, argv, "eubSn:")) >= 0) {
+ while ((c = getopt(argc, argv, "reubSC:n:A")) >= 0) {
switch (c) {
+ case 'r': is_realn = 1; break;
case 'e': is_equal = 1; break;
case 'b': is_bam_out = 1; break;
case 'u': is_uncompressed = is_bam_out = 1; break;
case 'S': is_sam_in = 1; break;
case 'n': max_nm = atoi(optarg); break;
+ case 'C': capQ = atoi(optarg); break;
+ case 'A': apply_baq = 1; break;
default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1;
}
}
if (is_uncompressed) strcat(mode_w, "u");
if (optind + 1 >= argc) {
fprintf(stderr, "\n");
- fprintf(stderr, "Usage: samtools fillmd [-eubS] <aln.bam> <ref.fasta>\n\n");
+ fprintf(stderr, "Usage: samtools fillmd [-eubrS] <aln.bam> <ref.fasta>\n\n");
fprintf(stderr, "Options: -e change identical bases to '='\n");
fprintf(stderr, " -u uncompressed BAM output (for piping)\n");
fprintf(stderr, " -b compressed BAM output\n");
- fprintf(stderr, " -S the input is SAM with header\n\n");
+ fprintf(stderr, " -S the input is SAM with header\n");
+ fprintf(stderr, " -A modify the quality string\n");
+ fprintf(stderr, " -r read-independent local realignment\n\n");
return 1;
}
fp = samopen(argv[optind], mode_r, 0);
fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
fp->header->target_name[tid]);
}
+ if (is_realn) bam_prob_realn_core(b, ref, apply_baq);
+ if (capQ > 10) {
+ int q = bam_cap_mapQ(b, ref, capQ);
+ if (b->core.qual > q) b->core.qual = q;
+ }
if (ref) bam_fillmd1_core(b, ref, is_equal, max_nm);
}
samwrite(fpout, b);
#include <assert.h>
#include "sam.h"
+typedef struct {
+ int k, x, y, end;
+} cstate_t;
+
+static cstate_t g_cstate_null = { -1, 0, 0, 0 };
+
typedef struct __linkbuf_t {
bam1_t b;
uint32_t beg, end;
+ cstate_t s;
struct __linkbuf_t *next;
} lbnode_t;
/* --- BEGIN: Auxiliary functions */
-static inline int resolve_cigar(bam_pileup1_t *p, uint32_t pos)
+/* s->k: the index of the CIGAR operator that has just been processed.
+ s->x: the reference coordinate of the start of s->k
+ s->y: the query coordiante of the start of s->k
+ */
+static inline int resolve_cigar2(bam_pileup1_t *p, uint32_t pos, cstate_t *s)
{
- unsigned k;
+#define _cop(c) ((c)&BAM_CIGAR_MASK)
+#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
+
bam1_t *b = p->b;
bam1_core_t *c = &b->core;
- uint32_t x = c->pos, y = 0;
- int ret = 1, is_restart = 1;
-
- if (c->flag&BAM_FUNMAP) return 0; // unmapped read
- assert(x <= pos); // otherwise a bug
- p->qpos = -1; p->indel = 0; p->is_del = p->is_head = p->is_tail = 0;
- for (k = 0; k < c->n_cigar; ++k) {
- int op = bam1_cigar(b)[k] & BAM_CIGAR_MASK; // operation
- int l = bam1_cigar(b)[k] >> BAM_CIGAR_SHIFT; // length
- if (op == BAM_CMATCH) { // NOTE: this assumes the first and the last operation MUST BE a match or a clip
- if (x + l > pos) { // overlap with pos
- p->indel = p->is_del = 0;
- p->qpos = y + (pos - x);
- if (x == pos && is_restart) p->is_head = 1;
- if (x + l - 1 == pos) { // come to the end of a match
- int has_next_match = 0;
- unsigned i;
- for (i = k + 1; i < c->n_cigar; ++i) {
- uint32_t cigar = bam1_cigar(b)[i];
- int opi = cigar&BAM_CIGAR_MASK;
- if (opi == BAM_CMATCH) {
- has_next_match = 1;
- break;
- } else if (opi == BAM_CSOFT_CLIP || opi == BAM_CREF_SKIP || opi == BAM_CHARD_CLIP) break;
- }
- if (!has_next_match) p->is_tail = 1;
- if (k < c->n_cigar - 1 && has_next_match) { // there are additional operation(s)
- uint32_t cigar = bam1_cigar(b)[k+1]; // next CIGAR
- int op_next = cigar&BAM_CIGAR_MASK; // next CIGAR operation
- if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
- else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
- else if (op_next == BAM_CPAD && k + 2 < c->n_cigar) { // no working for adjacent padding
- cigar = bam1_cigar(b)[k+2]; op_next = cigar&BAM_CIGAR_MASK;
- if (op_next == BAM_CDEL) p->indel = -(int32_t)(cigar>>BAM_CIGAR_SHIFT); // del
- else if (op_next == BAM_CINS) p->indel = cigar>>BAM_CIGAR_SHIFT; // ins
- }
- }
+ uint32_t *cigar = bam1_cigar(b);
+ int k, is_head = 0;
+ // determine the current CIGAR operation
+// fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam1_qname(b), pos, s->end, s->k, s->x, s->y);
+ if (s->k == -1) { // never processed
+ is_head = 1;
+ if (c->n_cigar == 1) { // just one operation, save a loop
+ if (_cop(cigar[0]) == BAM_CMATCH) s->k = 0, s->x = c->pos, s->y = 0;
+ } else { // find the first match or deletion
+ for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
+ int op = _cop(cigar[k]);
+ int l = _cln(cigar[k]);
+ if (op == BAM_CMATCH || op == BAM_CDEL) break;
+ else if (op == BAM_CREF_SKIP) s->x += l;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
+ }
+ assert(k < c->n_cigar);
+ s->k = k;
+ }
+ } else { // the read has been processed before
+ int op, l = _cln(cigar[s->k]);
+ if (pos - s->x >= l) { // jump to the next operation
+ assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
+ op = _cop(cigar[s->k+1]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) { // jump to the next without a loop
+ if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l;
+ s->x += l;
+ ++s->k;
+ } else { // find the next M/D/N
+ if (_cop(cigar[s->k]) == BAM_CMATCH) s->y += l;
+ s->x += l;
+ for (k = s->k + 1; k < c->n_cigar; ++k) {
+ op = _cop(cigar[k]), l = _cln(cigar[k]);
+ if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP) break;
+ else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
}
+ s->k = k;
}
- x += l; y += l;
- } else if (op == BAM_CDEL) { // then set ->is_del
- if (x + l > pos) {
- p->indel = 0; p->is_del = 1;
- p->qpos = y + (pos - x);
+ assert(s->k < c->n_cigar); // otherwise a bug
+ } // else, do nothing
+ }
+ { // collect pileup information
+ int op, l;
+ op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
+ p->is_del = p->indel = p->is_refskip = 0;
+ if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
+ int op2 = _cop(cigar[s->k+1]);
+ int l2 = _cln(cigar[s->k+1]);
+ if (op2 == BAM_CDEL) p->indel = -(int)l2;
+ else if (op2 == BAM_CINS) p->indel = l2;
+ else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
+ int l3 = 0;
+ for (k = s->k + 2; k < c->n_cigar; ++k) {
+ op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
+ if (op2 == BAM_CINS) l3 += l2;
+ else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP) break;
+ }
+ if (l3 > 0) p->indel = l3;
}
- x += l;
- } else if (op == BAM_CREF_SKIP) x += l;
- else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
- if (is_restart) is_restart ^= (op == BAM_CMATCH);
- else is_restart ^= (op == BAM_CREF_SKIP || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP);
- if (x > pos) {
- if (op == BAM_CREF_SKIP) ret = 0; // then do not put it into pileup at all
- break;
}
+ if (op == BAM_CMATCH) {
+ p->qpos = s->y + (pos - s->x);
+ } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+ p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
+ p->is_refskip = (op == BAM_CREF_SKIP);
+ } // cannot be other operations; otherwise a bug
+ p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
}
- assert(x > pos); // otherwise a bug
- return ret;
+ return 1;
}
/* --- END: Auxiliary functions */
mempool_t *mp;
lbnode_t *head, *tail, *dummy;
int32_t tid, pos, max_tid, max_pos;
- int is_eof, flag_mask, max_plp, error;
+ int is_eof, flag_mask, max_plp, error, maxcnt;
bam_pileup1_t *plp;
// for the "auto" interface only
bam1_t *b;
iter->dummy = mp_alloc(iter->mp);
iter->max_tid = iter->max_pos = -1;
iter->flag_mask = BAM_DEF_MASK;
+ iter->maxcnt = 8000;
if (func) {
iter->func = func;
iter->data = data;
iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
}
iter->plp[n_plp].b = &p->b;
- if (resolve_cigar(iter->plp + n_plp, iter->pos)) ++n_plp; // skip the read if we are looking at ref-skip
+ if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
}
}
iter->head = iter->dummy->next; // dummy->next may be changed
if (b) {
if (b->core.tid < 0) return 0;
if (b->core.flag & iter->flag_mask) return 0;
+ if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) return 0;
bam_copy1(&iter->tail->b, b);
iter->tail->beg = b->core.pos; iter->tail->end = bam_calend(&b->core, bam1_cigar(b));
+ iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
if (b->core.tid < iter->max_tid) {
fprintf(stderr, "[bam_pileup_core] the input is not sorted (chromosomes out of order)\n");
iter->error = 1;
const bam_pileup1_t *plp;
if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
- else {
+ else { // no pileup line can be obtained; read alignments
*_n_plp = 0;
if (iter->is_eof) return 0;
while (iter->func(iter->data, iter->b) >= 0) {
return 0;
}
if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
+ // otherwise no pileup line can be returned; read the next alignment.
}
bam_plp_push(iter, 0);
if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
iter->flag_mask = mask < 0? BAM_DEF_MASK : (BAM_FUNMAP | mask);
}
+void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
+{
+ iter->maxcnt = maxcnt;
+}
+
/*****************
* callback APIs *
*****************/
return iter;
}
+void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
+{
+ int i;
+ for (i = 0; i < iter->n; ++i)
+ iter->iter[i]->maxcnt = maxcnt;
+}
+
void bam_mplp_destroy(bam_mplp_t iter)
{
int i;
if (new_min == (uint64_t)-1) return 0;
*_tid = new_min>>32; *_pos = (uint32_t)new_min;
for (i = 0; i < iter->n; ++i) {
- if (iter->pos[i] == iter->min) {
+ if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line"
n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
++ret;
} else n_plp[i] = 0, plp[i] = 0;
#include <stdio.h>
#include <unistd.h>
#include <ctype.h>
+#include <string.h>
+#include <errno.h>
#include "sam.h"
#include "faidx.h"
#include "bam_maqcns.h"
#define BAM_PLF_1STBASE 0x80
#define BAM_PLF_ALLBASE 0x100
#define BAM_PLF_READPOS 0x200
+#define BAM_PLF_NOBAQ 0x400
typedef struct {
bam_header_t *h;
uint32_t format;
int tid, len, last_pos;
int mask;
+ int capQ_thres, min_baseQ;
int max_depth; // for indel calling, ignore reads with the depth too high. 0 for unlimited
char *ref;
glfFile fp_glf; // for glf output only
return hash;
}
+static inline int printw(int c, FILE *fp)
+{
+ char buf[16];
+ int l, x;
+ if (c == 0) return fputc('0', fp);
+ for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0';
+ if (c < 0) buf[l++] = '-';
+ buf[l] = 0;
+ for (x = 0; x < l/2; ++x) {
+ int y = buf[x]; buf[x] = buf[l-1-x]; buf[l-1-x] = y;
+ }
+ fputs(buf, fp);
+ return 0;
+}
+
// an analogy to pileup_func() below
static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
{
return 0;
}
-static void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
+static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, const char *ref)
{
- if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33);
+ int j;
+ if (p->is_head) {
+ putchar('^');
+ putchar(p->b->core.qual > 93? 126 : p->b->core.qual + 33);
+ }
if (!p->is_del) {
- int j, rb, c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
- rb = (ref && pos < ref_len)? ref[pos] : 'N';
- if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
- else c = bam1_strand(p->b)? tolower(c) : toupper(c);
+ int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+ if (ref) {
+ int rb = pos < ref_len? ref[pos] : 'N';
+ if (c == '=' || bam_nt16_table[c] == bam_nt16_table[rb]) c = bam1_strand(p->b)? ',' : '.';
+ else c = bam1_strand(p->b)? tolower(c) : toupper(c);
+ } else {
+ if (c == '=') c = bam1_strand(p->b)? ',' : '.';
+ else c = bam1_strand(p->b)? tolower(c) : toupper(c);
+ }
putchar(c);
- if (p->indel > 0) {
- printf("+%d", p->indel);
- for (j = 1; j <= p->indel; ++j) {
- c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
- putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
- }
- } else if (p->indel < 0) {
- printf("%d", p->indel);
- for (j = 1; j <= -p->indel; ++j) {
- c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
- putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
- }
+ } else putchar(p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*');
+ if (p->indel > 0) {
+ putchar('+'); printw(p->indel, stdout);
+ for (j = 1; j <= p->indel; ++j) {
+ int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+ putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
+ }
+ } else if (p->indel < 0) {
+ printw(p->indel, stdout);
+ for (j = 1; j <= -p->indel; ++j) {
+ int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N';
+ putchar(bam1_strand(p->b)? tolower(c) : toupper(c));
}
- } else putchar('*');
+ }
if (p->is_tail) putchar('$');
}
}
rmsQ = (uint64_t)(sqrt((double)rmsQ / n) + .499);
cns = b<<28 | 0xf<<24 | rmsQ<<16 | 60<<8;
- } else cns = bam_maqcns_call(n, pu, d->c);
+ } else {
+ glf1_t *g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c);
+ cns = g->depth == 0? (0xfu<<28 | 0xf<<24) : glf2cns(g, (int)(d->c->q_r + .499));
+ free(g);
+ }
}
if ((d->format & (BAM_PLF_CNS|BAM_PLF_INDEL_ONLY)) && d->ref && pos < d->len) { // call indels
int m = (!d->max_depth || d->max_depth>n) ? n : d->max_depth;
}
// when only variant sites are asked for, test if the site is a variant
if ((d->format & BAM_PLF_CNS) && (d->format & BAM_PLF_VAR_ONLY)) {
- if (!(bam_nt16_table[rb] != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP
+ if (!(bam_nt16_table[rb] != 15 && cns>>28 != 15 && cns>>28 != bam_nt16_table[rb])) { // not a SNP
if (!(r && (r->gt == 2 || strcmp(r->s[r->gt], "*")))) { // not an indel
if (r) bam_maqindel_ret_destroy(r);
return 0;
}
}
// print the first 3 columns
- printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
+ fputs(d->h->target_name[tid], stdout); putchar('\t');
+ printw(pos+1, stdout); putchar('\t'); putchar(rb); putchar('\t');
// print consensus information if required
if (d->format & BAM_PLF_CNS) {
- int ref_q, rb4 = bam_nt16_table[rb];
- ref_q = 0;
- if (rb4 != 15 && cns>>28 != 15 && cns>>28 != rb4) { // a SNP
- ref_q = ((cns>>24&0xf) == rb4)? cns>>8&0xff : (cns>>8&0xff) + (cns&0xff);
- if (ref_q > 255) ref_q = 255;
- }
- rms_mapq = cns>>16&0xff;
- printf("%c\t%d\t%d\t%d\t", bam_nt16_rev_table[cns>>28], cns>>8&0xff, ref_q, rms_mapq);
+ putchar(bam_nt16_rev_table[cns>>28]); putchar('\t');
+ printw(cns>>8&0xff, stdout); putchar('\t');
+ printw(cns&0xff, stdout); putchar('\t');
+ printw(cns>>16&0xff, stdout); putchar('\t');
}
// print pileup sequences
- printf("%d\t", n);
- rms_aux = 0; // we need to recalculate rms_mapq when -c is not flagged on the command line
- for (i = 0; i < n; ++i) {
- const bam_pileup1_t *p = pu + i;
- int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ;
- rms_aux += tmp * tmp;
- pileup_seq(p, pos, d->len, d->ref);
- }
+ printw(n, stdout); putchar('\t');
+ for (i = 0; i < n; ++i)
+ pileup_seq(pu + i, pos, d->len, d->ref);
// finalize rms_mapq
- rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499);
- if (rms_mapq < 0) rms_mapq = rms_aux;
+ if (d->format & BAM_PLF_CNS) {
+ for (i = rms_aux = 0; i < n; ++i) {
+ const bam_pileup1_t *p = pu + i;
+ int tmp = p->b->core.qual < d->c->cap_mapQ? p->b->core.qual : d->c->cap_mapQ;
+ rms_aux += tmp * tmp;
+ }
+ rms_aux = (uint64_t)(sqrt((double)rms_aux / n) + .499);
+ if (rms_mapq < 0) rms_mapq = rms_aux;
+ }
putchar('\t');
// print quality
for (i = 0; i < n; ++i) {
for (i = 0; i < n; ++i) {
int x = pu[i].qpos;
int l = pu[i].b->core.l_qseq;
- printf("%d,", x < l/2? x+1 : -((l-1)-x+1));
+ printw(x < l/2? x+1 : -((l-1)-x+1), stdout); putchar(',');
}
}
putchar('\n');
int c, is_SAM = 0;
char *fn_list = 0, *fn_fa = 0, *fn_pos = 0;
pu_data_t *d = (pu_data_t*)calloc(1, sizeof(pu_data_t));
- d->max_depth = 0;
- d->tid = -1; d->mask = BAM_DEF_MASK;
+ d->max_depth = 1024; d->tid = -1; d->mask = BAM_DEF_MASK; d->min_baseQ = 13;
d->c = bam_maqcns_init();
- d->c->is_soap = 1; // change the default model
+ d->c->errmod = BAM_ERRMOD_MAQ2; // change the default model
d->ido = bam_maqindel_opt_init();
- while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PA")) >= 0) {
+ while ((c = getopt(argc, argv, "st:f:cT:N:r:l:d:im:gI:G:vM:S2aR:PAQ:C:B")) >= 0) {
switch (c) {
- case 'a': d->c->is_soap = 1; break;
- case 'A': d->c->is_soap = 0; break;
+ case 'Q': d->c->min_baseQ = atoi(optarg); break;
+ case 'C': d->capQ_thres = atoi(optarg); break;
+ case 'B': d->format |= BAM_PLF_NOBAQ; break;
+ case 'a': d->c->errmod = BAM_ERRMOD_SOAP; break;
+ case 'A': d->c->errmod = BAM_ERRMOD_MAQ; break;
case 's': d->format |= BAM_PLF_SIMPLE; break;
case 't': fn_list = strdup(optarg); break;
case 'l': fn_pos = strdup(optarg); break;
default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1;
}
}
+ if (d->c->errmod != BAM_ERRMOD_MAQ2) d->c->theta += 0.02;
+ if (d->c->theta > 1.0) d->c->theta = 1.0;
if (fn_list) is_SAM = 1;
if (optind == argc) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools pileup [options] <in.bam>|<in.sam>\n\n");
fprintf(stderr, "Option: -s simple (yet incomplete) pileup format\n");
fprintf(stderr, " -S the input is in SAM\n");
- fprintf(stderr, " -A use the MAQ model for SNP calling\n");
+ fprintf(stderr, " -B disable BAQ computation\n");
+ fprintf(stderr, " -A use the original MAQ model for SNP calling (DEPRECATED)\n");
fprintf(stderr, " -2 output the 2nd best call and quality\n");
fprintf(stderr, " -i only show lines/consensus with indels\n");
- fprintf(stderr, " -m INT filtering reads with bits in INT [%d]\n", d->mask);
+ fprintf(stderr, " -Q INT min base quality (possibly capped by BAQ) [%d]\n", d->c->min_baseQ);
+ fprintf(stderr, " -C INT coefficient for adjusting mapQ of poor mappings [%d]\n", d->capQ_thres);
+ fprintf(stderr, " -m INT filtering reads with bits in INT [0x%x]\n", d->mask);
fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", d->c->cap_mapQ);
- fprintf(stderr, " -d INT limit maximum depth for indels [unlimited]\n");
+ fprintf(stderr, " -d INT limit maximum depth for indels [%d]\n", d->max_depth);
fprintf(stderr, " -t FILE list of reference sequences (force -S)\n");
fprintf(stderr, " -l FILE list of sites at which pileup is output\n");
fprintf(stderr, " -f FILE reference sequence in the FASTA format\n\n");
- fprintf(stderr, " -c output the SOAPsnp consensus sequence\n");
+ fprintf(stderr, " -c compute the consensus sequence\n");
fprintf(stderr, " -v print variants only (for -c)\n");
- fprintf(stderr, " -g output in the GLFv3 format (suppressing -c/-i/-s)\n");
- fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c/-g) [%f]\n", d->c->theta);
- fprintf(stderr, " -N INT number of haplotypes in the sample (for -c/-g) [%d]\n", d->c->n_hap);
- fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c/-g) [%f]\n", d->c->het_rate);
- fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c/-g) [%f]\n", d->ido->r_indel);
- fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c/-g) [%d]\n", d->ido->q_indel);
+ fprintf(stderr, " -g output in the GLFv3 format (DEPRECATED)\n");
+ fprintf(stderr, " -T FLOAT theta in maq consensus calling model (for -c) [%.4g]\n", d->c->theta);
+ fprintf(stderr, " -N INT number of haplotypes in the sample (for -c) [%d]\n", d->c->n_hap);
+ fprintf(stderr, " -r FLOAT prior of a difference between two haplotypes (for -c) [%.4g]\n", d->c->het_rate);
+ fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c) [%.4g]\n", d->ido->r_indel);
+ fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c) [%d]\n", d->ido->q_indel);
fprintf(stderr, "\n");
free(fn_list); free(fn_fa); free(d);
return 1;
}
d->h = fp->header;
if (fn_pos) d->hash = load_pos(fn_pos, d->h);
- sampileup(fp, d->mask, pileup_func, d);
+ { // run pileup
+ extern int bam_prob_realn(bam1_t *b, const char *ref);
+ extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
+ bam1_t *b;
+ int ret, tid, pos, n_plp;
+ bam_plp_t iter;
+ const bam_pileup1_t *plp;
+ b = bam_init1();
+ iter = bam_plp_init(0, 0);
+ bam_plp_set_mask(iter, d->mask);
+ while ((ret = samread(fp, b)) >= 0) {
+ int skip = 0;
+ if ((int)b->core.tid < 0) break;
+ // update d->ref if necessary
+ if (d->fai && (int)b->core.tid != d->tid) {
+ free(d->ref);
+ d->ref = faidx_fetch_seq(d->fai, d->h->target_name[b->core.tid], 0, 0x7fffffff, &d->len);
+ d->tid = b->core.tid;
+ }
+ if (d->ref && (d->format&BAM_PLF_CNS) && !(d->format&BAM_PLF_NOBAQ)) bam_prob_realn(b, d->ref);
+ if (d->ref && (d->format&BAM_PLF_CNS) && d->capQ_thres > 10) {
+ int q = bam_cap_mapQ(b, d->ref, d->capQ_thres);
+ if (q < 0) skip = 1;
+ else if (b->core.qual > q) b->core.qual = q;
+ } else if (b->core.flag&BAM_FUNMAP) skip = 1;
+ else if ((d->format&BAM_PLF_CNS) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1;
+ if (skip) continue;
+ bam_plp_push(iter, b);
+ while ((plp = bam_plp_next(iter, &tid, &pos, &n_plp)) != 0)
+ pileup_func(tid, pos, n_plp, plp, d);
+ }
+ bam_plp_push(iter, 0);
+ while ((plp = bam_plp_next(iter, &tid, &pos, &n_plp)) != 0)
+ pileup_func(tid, pos, n_plp, plp, d);
+ bam_plp_destroy(iter);
+ bam_destroy1(b);
+ }
samclose(fp); // d->h will be destroyed here
}
* mpileup *
***********/
+#include <assert.h>
+#include "bam2bcf.h"
+#include "sample.h"
+
+#define MPLP_GLF 0x10
+#define MPLP_NO_COMP 0x20
+#define MPLP_NO_ORPHAN 0x40
+#define MPLP_REALN 0x80
+#define MPLP_FMT_DP 0x100
+#define MPLP_FMT_SP 0x200
+#define MPLP_NO_INDEL 0x400
+
typedef struct {
- char *reg;
+ int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth;
+ int openQ, extQ, tandemQ;
+ char *reg, *fn_pos, *pl_list;
faidx_t *fai;
+ kh_64_t *hash;
} mplp_conf_t;
typedef struct {
bamFile fp;
bam_iter_t iter;
+ int min_mq, flag, ref_id, capQ_thres;
+ char *ref;
} mplp_aux_t;
+typedef struct {
+ int n;
+ int *n_plp, *m_plp;
+ bam_pileup1_t **plp;
+} mplp_pileup_t;
+
static int mplp_func(void *data, bam1_t *b)
{
+ extern int bam_realn(bam1_t *b, const char *ref);
+ extern int bam_prob_realn_core(bam1_t *b, const char *ref, int);
+ extern int bam_cap_mapQ(bam1_t *b, char *ref, int thres);
mplp_aux_t *ma = (mplp_aux_t*)data;
- if (ma->iter) return bam_iter_read(ma->fp, ma->iter, b);
- return bam_read1(ma->fp, b);
+ int ret, skip = 0;
+ do {
+ int has_ref;
+ ret = ma->iter? bam_iter_read(ma->fp, ma->iter, b) : bam_read1(ma->fp, b);
+ if (ret < 0) break;
+ has_ref = (ma->ref && ma->ref_id == b->core.tid)? 1 : 0;
+ skip = 0;
+ if (has_ref && (ma->flag&MPLP_REALN)) bam_prob_realn_core(b, ma->ref, 1);
+ if (has_ref && ma->capQ_thres > 10) {
+ int q = bam_cap_mapQ(b, ma->ref, ma->capQ_thres);
+ if (q < 0) skip = 1;
+ else if (b->core.qual > q) b->core.qual = q;
+ } else if (b->core.flag&BAM_FUNMAP) skip = 1;
+ else if (b->core.qual < ma->min_mq) skip = 1;
+ else if ((ma->flag&MPLP_NO_ORPHAN) && (b->core.flag&1) && !(b->core.flag&2)) skip = 1;
+ } while (skip);
+ return ret;
+}
+
+static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf,
+ int n, char *const*fn, int *n_plp, const bam_pileup1_t **plp)
+{
+ int i, j;
+ memset(m->n_plp, 0, m->n * sizeof(int));
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n_plp[i]; ++j) {
+ const bam_pileup1_t *p = plp[i] + j;
+ uint8_t *q;
+ int id = -1;
+ q = bam_aux_get(p->b, "RG");
+ if (q) id = bam_smpl_rg2smid(sm, fn[i], (char*)q+1, buf);
+ if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf);
+ assert(id >= 0 && id < m->n);
+ if (m->n_plp[id] == m->m_plp[id]) {
+ m->m_plp[id] = m->m_plp[id]? m->m_plp[id]<<1 : 8;
+ m->plp[id] = realloc(m->plp[id], sizeof(bam_pileup1_t) * m->m_plp[id]);
+ }
+ m->plp[id][m->n_plp[id]++] = *p;
+ }
+ }
}
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
+ extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
+ extern void bcf_call_del_rghash(void *rghash);
mplp_aux_t **data;
- int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid;
+ int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid, max_depth;
const bam_pileup1_t **plp;
bam_mplp_t iter;
bam_header_t *h = 0;
char *ref;
- // allocate
+ khash_t(64) *hash = 0;
+ void *rghash = 0;
+
+ bcf_callaux_t *bca = 0;
+ bcf_callret1_t *bcr = 0;
+ bcf_call_t bc;
+ bcf_t *bp = 0;
+ bcf_hdr_t *bh = 0;
+
+ bam_sample_t *sm = 0;
+ kstring_t buf;
+ mplp_pileup_t gplp;
+
+ memset(&gplp, 0, sizeof(mplp_pileup_t));
+ memset(&buf, 0, sizeof(kstring_t));
+ memset(&bc, 0, sizeof(bcf_call_t));
data = calloc(n, sizeof(void*));
plp = calloc(n, sizeof(void*));
n_plp = calloc(n, sizeof(int*));
+ sm = bam_smpl_init();
+
// read the header and initialize data
for (i = 0; i < n; ++i) {
bam_header_t *h_tmp;
data[i] = calloc(1, sizeof(mplp_aux_t));
- data[i]->fp = bam_open(fn[i], "r");
+ data[i]->min_mq = conf->min_mq;
+ data[i]->flag = conf->flag;
+ data[i]->capQ_thres = conf->capQ_thres;
+ data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
h_tmp = bam_header_read(data[i]->fp);
+ bam_smpl_add(sm, fn[i], h_tmp->text);
+ rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
if (conf->reg) {
int beg, end;
bam_index_t *idx;
bam_header_destroy(h_tmp);
}
}
- // mpileup
+ gplp.n = sm->n;
+ gplp.n_plp = calloc(sm->n, sizeof(int));
+ gplp.m_plp = calloc(sm->n, sizeof(int));
+ gplp.plp = calloc(sm->n, sizeof(void*));
+
+ fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
+ if (conf->fn_pos) hash = load_pos(conf->fn_pos, h);
+ // write the VCF header
+ if (conf->flag & MPLP_GLF) {
+ kstring_t s;
+ bh = calloc(1, sizeof(bcf_hdr_t));
+ s.l = s.m = 0; s.s = 0;
+ bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
+ for (i = 0; i < h->n_targets; ++i) {
+ kputs(h->target_name[i], &s);
+ kputc('\0', &s);
+ }
+ bh->l_nm = s.l;
+ bh->name = malloc(s.l);
+ memcpy(bh->name, s.s, s.l);
+ s.l = 0;
+ for (i = 0; i < sm->n; ++i) {
+ kputs(sm->smpl[i], &s); kputc('\0', &s);
+ }
+ bh->l_smpl = s.l;
+ bh->sname = malloc(s.l);
+ memcpy(bh->sname, s.s, s.l);
+ bh->l_txt = 0;
+ free(s.s);
+ bcf_hdr_sync(bh);
+ bcf_hdr_write(bp, bh);
+ bca = bcf_call_init(-1., conf->min_baseQ);
+ bcr = calloc(sm->n, sizeof(bcf_callret1_t));
+ bca->rghash = rghash;
+ bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
+ }
ref_tid = -1; ref = 0;
iter = bam_mplp_init(n, mplp_func, (void**)data);
+ max_depth = conf->max_depth;
+ if (max_depth * sm->n > 1<<20)
+ fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
+ if (max_depth * sm->n < 8000) {
+ max_depth = 8000 / sm->n;
+ fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth);
+ }
+ bam_mplp_set_maxcnt(iter, max_depth);
while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
+ if (hash) {
+ khint_t k;
+ k = kh_get(64, hash, (uint64_t)tid<<32 | pos);
+ if (k == kh_end(hash)) continue;
+ }
if (tid != ref_tid) {
- free(ref);
+ free(ref); ref = 0;
if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len);
+ for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
ref_tid = tid;
}
- printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
- for (i = 0; i < n; ++i) {
- int j;
- printf("\t%d\t", n_plp[i]);
- if (n_plp[i] == 0) printf("*\t*");
- else {
- for (j = 0; j < n_plp[i]; ++j)
- pileup_seq(plp[i] + j, pos, ref_len, ref);
- putchar('\t');
- for (j = 0; j < n_plp[i]; ++j) {
- const bam_pileup1_t *p = plp[i] + j;
- int c = bam1_qual(p->b)[p->qpos] + 33;
- if (c > 126) c = 126;
- putchar(c);
+ if (conf->flag & MPLP_GLF) {
+ int _ref0, ref16;
+ bcf1_t *b = calloc(1, sizeof(bcf1_t));
+ group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp);
+ _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
+ ref16 = bam_nt16_table[_ref0];
+ for (i = 0; i < gplp.n; ++i)
+ bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
+ bcf_call_combine(gplp.n, bcr, ref16, &bc);
+ bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
+ (conf->flag&MPLP_FMT_SP), 0, 0);
+ bcf_write(bp, bh, b);
+ bcf_destroy(b);
+ // call indels
+ if (!(conf->flag&MPLP_NO_INDEL) && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
+ for (i = 0; i < gplp.n; ++i)
+ bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
+ if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
+ b = calloc(1, sizeof(bcf1_t));
+ bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
+ (conf->flag&MPLP_FMT_SP), bca, ref);
+ bcf_write(bp, bh, b);
+ bcf_destroy(b);
}
}
+ } else {
+ printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
+ for (i = 0; i < n; ++i) {
+ int j;
+ printf("\t%d\t", n_plp[i]);
+ if (n_plp[i] == 0) printf("*\t*");
+ else {
+ for (j = 0; j < n_plp[i]; ++j)
+ pileup_seq(plp[i] + j, pos, ref_len, ref);
+ putchar('\t');
+ for (j = 0; j < n_plp[i]; ++j) {
+ const bam_pileup1_t *p = plp[i] + j;
+ int c = bam1_qual(p->b)[p->qpos] + 33;
+ if (c > 126) c = 126;
+ putchar(c);
+ }
+ }
+ }
+ putchar('\n');
}
- putchar('\n');
}
+
+ bcf_close(bp);
+ bam_smpl_destroy(sm); free(buf.s);
+ for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
+ free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
+ bcf_call_del_rghash(rghash);
+ if (hash) { // free the hash table
+ khint_t k;
+ for (k = kh_begin(hash); k < kh_end(hash); ++k)
+ if (kh_exist(hash, k)) free(kh_val(hash, k));
+ kh_destroy(64, hash);
+ }
+ bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
bam_mplp_destroy(iter);
bam_header_destroy(h);
for (i = 0; i < n; ++i) {
return 0;
}
+#define MAX_PATH_LEN 1024
+int read_file_list(const char *file_list,int *n,char **argv[])
+{
+ char buf[MAX_PATH_LEN];
+ int len, nfiles;
+ char **files;
+
+ FILE *fh = fopen(file_list,"r");
+ if ( !fh )
+ {
+ fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
+ return 1;
+ }
+
+ // Speed is not an issue here, determine the number of files by reading the file twice
+ nfiles = 0;
+ while ( fgets(buf,MAX_PATH_LEN,fh) ) nfiles++;
+
+ if ( fseek(fh, 0L, SEEK_SET) )
+ {
+ fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
+ return 1;
+ }
+
+ files = calloc(nfiles,sizeof(char*));
+ nfiles = 0;
+ while ( fgets(buf,MAX_PATH_LEN,fh) )
+ {
+ len = strlen(buf);
+ while ( len>0 && isspace(buf[len-1]) ) len--;
+ if ( !len ) continue;
+
+ files[nfiles] = malloc(sizeof(char)*(len+1));
+ strncpy(files[nfiles],buf,len);
+ files[nfiles][len] = 0;
+ nfiles++;
+ }
+ fclose(fh);
+ if ( !nfiles )
+ {
+ fprintf(stderr,"No files read from %s\n", file_list);
+ return 1;
+ }
+ *argv = files;
+ *n = nfiles;
+ return 0;
+}
+#undef MAX_PATH_LEN
+
int bam_mpileup(int argc, char *argv[])
{
int c;
+ const char *file_list = NULL;
+ char **fn = NULL;
+ int nfiles = 0;
mplp_conf_t mplp;
memset(&mplp, 0, sizeof(mplp_conf_t));
- while ((c = getopt(argc, argv, "f:r:")) >= 0) {
+ mplp.max_mq = 60;
+ mplp.min_baseQ = 13;
+ mplp.capQ_thres = 0;
+ mplp.max_depth = 250;
+ mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
+ mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
+ while ((c = getopt(argc, argv, "gf:r:l:M:q:Q:uaORC:BDSd:b:P:o:e:h:I")) >= 0) {
switch (c) {
case 'f':
mplp.fai = fai_load(optarg);
if (mplp.fai == 0) return 1;
break;
- case 'r': mplp.reg = strdup(optarg);
+ case 'd': mplp.max_depth = atoi(optarg); break;
+ case 'r': mplp.reg = strdup(optarg); break;
+ case 'l': mplp.fn_pos = strdup(optarg); break;
+ case 'P': mplp.pl_list = strdup(optarg); break;
+ case 'g': mplp.flag |= MPLP_GLF; break;
+ case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break;
+ case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break;
+ case 'B': mplp.flag &= ~MPLP_REALN & ~MPLP_NO_ORPHAN; break;
+ case 'O': mplp.flag |= MPLP_NO_ORPHAN; break;
+ case 'R': mplp.flag |= MPLP_REALN; break;
+ case 'D': mplp.flag |= MPLP_FMT_DP; break;
+ case 'S': mplp.flag |= MPLP_FMT_SP; break;
+ case 'I': mplp.flag |= MPLP_NO_INDEL; break;
+ case 'C': mplp.capQ_thres = atoi(optarg); break;
+ case 'M': mplp.max_mq = atoi(optarg); break;
+ case 'q': mplp.min_mq = atoi(optarg); break;
+ case 'Q': mplp.min_baseQ = atoi(optarg); break;
+ case 'b': file_list = optarg; break;
+ case 'o': mplp.openQ = atoi(optarg); break;
+ case 'e': mplp.extQ = atoi(optarg); break;
+ case 'h': mplp.tandemQ = atoi(optarg); break;
}
}
if (argc == 1) {
- fprintf(stderr, "Usage: samtools mpileup [-r reg] [-f in.fa] in1.bam [in2.bam [...]]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n");
+ fprintf(stderr, "Options: -f FILE reference sequence file [null]\n");
+ fprintf(stderr, " -r STR region in which pileup is generated [null]\n");
+ fprintf(stderr, " -l FILE list of positions (format: chr pos) [null]\n");
+ fprintf(stderr, " -b FILE list of input BAM files [null]\n");
+ fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq);
+ fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ);
+ fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq);
+ fprintf(stderr, " -d INT max per-sample depth [%d]\n", mplp.max_depth);
+ fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
+ fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
+ fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ);
+ fprintf(stderr, " -h INT coefficient for homopolyer errors [%d]\n", mplp.tandemQ);
+ fprintf(stderr, " -g generate BCF output\n");
+ fprintf(stderr, " -u do not compress BCF output\n");
+ fprintf(stderr, " -B disable BAQ computation\n");
+ fprintf(stderr, " -D output per-sample DP\n");
+ fprintf(stderr, " -S output per-sample SP (strand bias P-value, slow)\n");
+ fprintf(stderr, " -I do not perform indel calling\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
return 1;
}
- mpileup(&mplp, argc - optind, argv + optind);
- free(mplp.reg);
+ if ( file_list )
+ {
+ if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
+ mpileup(&mplp,nfiles,fn);
+ for (c=0; c<nfiles; c++) free(fn[c]);
+ free(fn);
+ }
+ else
+ mpileup(&mplp, argc - optind, argv + optind);
+ free(mplp.reg); free(mplp.pl_list);
if (mplp.fai) fai_destroy(mplp.fai);
return 0;
}
#include <stdlib.h>
#include <ctype.h>
#include <assert.h>
+#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
KSORT_INIT(heap, heap1_t, heap_lt)
+static void swap_header_targets(bam_header_t *h1, bam_header_t *h2)
+{
+ bam_header_t t;
+ t.n_targets = h1->n_targets, h1->n_targets = h2->n_targets, h2->n_targets = t.n_targets;
+ t.target_name = h1->target_name, h1->target_name = h2->target_name, h2->target_name = t.target_name;
+ t.target_len = h1->target_len, h1->target_len = h2->target_len, h2->target_len = t.target_len;
+}
+
static void swap_header_text(bam_header_t *h1, bam_header_t *h2)
{
int tempi;
temps = h1->text, h1->text = h2->text, h2->text = temps;
}
+#define MERGE_RG 1
+#define MERGE_UNCOMP 2
+
/*!
@abstract Merge multiple sorted BAM.
@param is_by_qname whether to sort by query name
@discussion Padding information may NOT correctly maintained. This
function is NOT thread safe.
*/
-void bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn, int add_RG)
+int bam_merge_core(int by_qname, const char *out, const char *headers, int n, char * const *fn,
+ int flag, const char *reg)
{
bamFile fpout, *fp;
heap1_t *heap;
int i, j, *RG_len = 0;
uint64_t idx = 0;
char **RG = 0;
+ bam_iter_t *iter = 0;
if (headers) {
tamFile fpheaders = sam_open(headers);
if (fpheaders == 0) {
- fprintf(stderr, "[bam_merge_core] Cannot open file `%s'. Continue anyway.\n", headers);
- } else {
- hheaders = sam_header_read(fpheaders);
- sam_close(fpheaders);
+ const char *message = strerror(errno);
+ fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
+ return -1;
}
+ hheaders = sam_header_read(fpheaders);
+ sam_close(fpheaders);
}
g_is_by_qname = by_qname;
fp = (bamFile*)calloc(n, sizeof(bamFile));
heap = (heap1_t*)calloc(n, sizeof(heap1_t));
+ iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
// prepare RG tag
- if (add_RG) {
+ if (flag & MERGE_RG) {
RG = (char**)calloc(n, sizeof(void*));
RG_len = (int*)calloc(n, sizeof(int));
for (i = 0; i != n; ++i) {
}
// read the first
for (i = 0; i != n; ++i) {
- heap1_t *h;
bam_header_t *hin;
fp[i] = bam_open(fn[i], "r");
if (fp[i] == 0) {
for (j = 0; j < i; ++j) bam_close(fp[j]);
free(fp); free(heap);
// FIXME: possible memory leak
- return;
+ return -1;
}
hin = bam_header_read(fp[i]);
- if (i == 0) { // the first SAM
+ if (i == 0) { // the first BAM
hout = hin;
- if (hheaders) {
- // If the text headers to be swapped in include any @SQ headers,
- // check that they are consistent with the existing binary list
- // of reference information.
- if (hheaders->n_targets > 0) {
- if (hout->n_targets != hheaders->n_targets)
- fprintf(stderr, "[bam_merge_core] number of @SQ headers in `%s' differs from number of target sequences", headers);
- for (j = 0; j < hout->n_targets; ++j)
- if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0)
- fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence", hheaders->target_name[j], headers);
- }
- swap_header_text(hout, hheaders);
- bam_header_destroy(hheaders);
- hheaders = NULL;
- }
} else { // validate multiple baf
- if (hout->n_targets != hin->n_targets) {
- fprintf(stderr, "[bam_merge_core] file '%s' has different number of target sequences. Abort!\n", fn[i]);
- exit(1);
- }
- for (j = 0; j < hout->n_targets; ++j) {
- if (strcmp(hout->target_name[j], hin->target_name[j])) {
- fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'. Abort!\n",
+ int min_n_targets = hout->n_targets;
+ if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;
+
+ for (j = 0; j < min_n_targets; ++j)
+ if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
+ fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
hout->target_name[j], hin->target_name[j], fn[i]);
- exit(1);
+ return -1;
}
+
+ // If this input file has additional target reference sequences,
+ // add them to the headers to be output
+ if (hin->n_targets > hout->n_targets) {
+ swap_header_targets(hout, hin);
+ // FIXME Possibly we should also create @SQ text headers
+ // for the newly added reference sequences
}
+
bam_header_destroy(hin);
}
- h = heap + i;
+ }
+
+ if (hheaders) {
+ // If the text headers to be swapped in include any @SQ headers,
+ // check that they are consistent with the existing binary list
+ // of reference information.
+ if (hheaders->n_targets > 0) {
+ if (hout->n_targets != hheaders->n_targets) {
+ fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
+ if (!reg) return -1;
+ }
+ for (j = 0; j < hout->n_targets; ++j)
+ if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
+ fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
+ if (!reg) return -1;
+ }
+ }
+
+ swap_header_text(hout, hheaders);
+ bam_header_destroy(hheaders);
+ }
+
+ if (reg) {
+ int tid, beg, end;
+ if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
+ fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
+ return -1;
+ }
+ for (i = 0; i < n; ++i) {
+ bam_index_t *idx;
+ idx = bam_index_load(fn[i]);
+ iter[i] = bam_iter_query(idx, tid, beg, end);
+ bam_index_destroy(idx);
+ }
+ }
+
+ for (i = 0; i < n; ++i) {
+ heap1_t *h = heap + i;
h->i = i;
h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
- if (bam_read1(fp[i], h->b) >= 0) {
+ if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)h->b->core.pos<<1 | bam1_strand(h->b);
h->idx = idx++;
}
else h->pos = HEAP_EMPTY;
}
- fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w");
- assert(fpout);
+ if (flag & MERGE_UNCOMP) {
+ fpout = strcmp(out, "-")? bam_open(out, "wu") : bam_dopen(fileno(stdout), "wu");
+ } else {
+ fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w");
+ }
+ if (fpout == 0) {
+ fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
+ return -1;
+ }
bam_header_write(fpout, hout);
bam_header_destroy(hout);
ks_heapmake(heap, n, heap);
while (heap->pos != HEAP_EMPTY) {
bam1_t *b = heap->b;
- if (add_RG && bam_aux_get(b, "RG") == 0)
+ if ((flag & MERGE_RG) && bam_aux_get(b, "RG") == 0)
bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
bam_write1_core(fpout, &b->core, b->data_len, b->data);
- if ((j = bam_read1(fp[heap->i], b)) >= 0) {
+ if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)b->core.pos<<1 | bam1_strand(b);
heap->idx = idx++;
} else if (j == -1) {
ks_heapadjust(heap, 0, n, heap);
}
- if (add_RG) {
+ if (flag & MERGE_RG) {
for (i = 0; i != n; ++i) free(RG[i]);
free(RG); free(RG_len);
}
- for (i = 0; i != n; ++i) bam_close(fp[i]);
+ for (i = 0; i != n; ++i) {
+ bam_iter_destroy(iter[i]);
+ bam_close(fp[i]);
+ }
bam_close(fpout);
- free(fp); free(heap);
+ free(fp); free(heap); free(iter);
+ return 0;
}
+
int bam_merge(int argc, char *argv[])
{
- int c, is_by_qname = 0, add_RG = 0;
- char *fn_headers = NULL;
+ int c, is_by_qname = 0, flag = 0, ret = 0;
+ char *fn_headers = NULL, *reg = 0;
- while ((c = getopt(argc, argv, "h:nr")) >= 0) {
+ while ((c = getopt(argc, argv, "h:nruR:")) >= 0) {
switch (c) {
- case 'r': add_RG = 1; break;
+ case 'r': flag |= MERGE_RG; break;
case 'h': fn_headers = strdup(optarg); break;
case 'n': is_by_qname = 1; break;
+ case 'u': flag |= MERGE_UNCOMP; break;
+ case 'R': reg = strdup(optarg); break;
}
}
if (optind + 2 >= argc) {
fprintf(stderr, "Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]\n\n");
fprintf(stderr, "Options: -n sort by read names\n");
fprintf(stderr, " -r attach RG tag (inferred from file names)\n");
+ fprintf(stderr, " -u uncompressed BAM output\n");
+ fprintf(stderr, " -R STR merge file in the specified region STR [all]\n");
fprintf(stderr, " -h FILE copy the header in FILE to <out.bam> [in1.bam]\n\n");
fprintf(stderr, "Note: Samtools' merge does not reconstruct the @RG dictionary in the header. Users\n");
fprintf(stderr, " must provide the correct header with -h, or uses Picard which properly maintains\n");
fprintf(stderr, " the header dictionary in merging.\n\n");
return 1;
}
- bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, add_RG);
+ if (bam_merge_core(is_by_qname, argv[optind], fn_headers, argc - optind - 1, argv + optind + 1, flag, reg) < 0) ret = 1;
+ free(reg);
free(fn_headers);
- return 0;
+ return ret;
}
typedef bam1_t *bam1_p;
fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
sprintf(fns[i], "%s.%.4d.bam", prefix, i);
}
- bam_merge_core(is_by_qname, fnout, 0, n, fns, 0);
+ bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0);
free(fnout);
for (i = 0; i < n; ++i) {
unlink(fns[i]);
if (tv->is_dot && toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.';
}
}
- } else c = '*';
+ } else c = p->is_refskip? (bam1_strand(p->b)? '<' : '>') : '*';
} else { // padding
if (j > p->indel) c = '*';
else { // insertion
} else if (c == KEY_ENTER || c == '\012' || c == '\015') {
int _tid = -1, _beg, _end;
if (str[0] == '=') {
- _beg = strtol(str+1, &p, 10);
+ _beg = strtol(str+1, &p, 10) - 1;
if (_beg > 0) {
*pos = _beg;
return;
#endif
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.8 (r613)"
+#define PACKAGE_VERSION "0.1.12a (r862)"
#endif
int bam_taf2baf(int argc, char *argv[]);
bgzf_open(const char* __restrict path, const char* __restrict mode)
{
BGZF* fp = NULL;
- if (mode[0] == 'r' || mode[0] == 'R') { /* The reading mode is preferred. */
+ if (strchr(mode, 'r') || strchr(mode, 'R')) { /* The reading mode is preferred. */
#ifdef _USE_KNETFILE
knetFile *file = knet_open(path, mode);
if (file == 0) return 0;
if (fd == -1) return 0;
fp = open_read(fd);
#endif
- } else if (mode[0] == 'w' || mode[0] == 'W') {
+ } else if (strchr(mode, 'w') || strchr(mode, 'W')) {
int fd, oflag = O_WRONLY | O_CREAT | O_TRUNC;
#ifdef _WIN32
oflag |= O_BINARY;
#endif
fd = open(path, oflag, 0666);
if (fd == -1) return 0;
- fp = open_write(fd, strstr(mode, "u")? 1 : 0);
+ fp = open_write(fd, strchr(mode, 'u')? 1 : 0);
}
if (fp != NULL) fp->owned_file = 1;
return fp;
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
+#include <sys/select.h>
+#include <sys/stat.h>
#include "bgzf.h"
static const int WINDOW_SIZE = 64 * 1024;
static int bgzip_main_usage()
{
- printf("\n");
- printf("Usage: bgzip [options] [file] ...\n\n");
- printf("Options: -c write on standard output, keep original files unchanged\n");
- printf(" -d decompress\n");
- // printf(" -l list compressed file contents\n");
- printf(" -b INT decompress at virtual file pointer INT\n");
- printf(" -s INT decompress INT bytes in the uncompressed file\n");
- printf(" -h give this help\n");
- printf("\n");
- return 0;
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bgzip [options] [file] ...\n\n");
+ fprintf(stderr, "Options: -c write on standard output, keep original files unchanged\n");
+ fprintf(stderr, " -d decompress\n");
+ fprintf(stderr, " -f overwrite files without asking\n");
+ fprintf(stderr, " -b INT decompress at virtual file pointer INT\n");
+ fprintf(stderr, " -s INT decompress INT bytes in the uncompressed file\n");
+ fprintf(stderr, " -h give this help\n");
+ fprintf(stderr, "\n");
+ return 1;
}
static int write_open(const char *fn, int is_forced)
char c;
if (!is_forced) {
if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | O_EXCL, 0666)) < 0 && errno == EEXIST) {
- printf("bgzip: %s already exists; do you wish to overwrite (y or n)? ", fn);
+ fprintf(stderr, "[bgzip] %s already exists; do you wish to overwrite (y or n)? ", fn);
scanf("%c", &c);
if (c != 'Y' && c != 'y') {
- printf("bgzip: not overwritten\n");
+ fprintf(stderr, "[bgzip] not overwritten\n");
exit(1);
}
}
}
if (fd < 0) {
if ((fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0) {
- fprintf(stderr, "bgzip: %s: Fail to write\n", fn);
+ fprintf(stderr, "[bgzip] %s: Fail to write\n", fn);
exit(1);
}
}
return fd;
}
-static
-void
-fail(BGZF* fp)
+static void fail(BGZF* fp)
{
- printf("Error: %s\n", fp->error);
+ fprintf(stderr, "Error: %s\n", fp->error);
exit(1);
}
int main(int argc, char **argv)
{
int c, compress, pstdout, is_forced;
- BGZF *rz;
+ BGZF *fp;
void *buffer;
long start, end, size;
compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
- while((c = getopt(argc, argv, "cdlhfb:s:")) >= 0){
+ while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){
switch(c){
case 'h': return bgzip_main_usage();
case 'd': compress = 0; break;
case 'c': pstdout = 1; break;
- // case 'l': compress = 2; break;
case 'b': start = atol(optarg); break;
case 's': size = atol(optarg); break;
case 'f': is_forced = 1; break;
}
}
if (size >= 0) end = start + size;
- if(end >= 0 && end < start){
- fprintf(stderr, " -- Illegal region: [%ld, %ld] --\n", start, end);
+ if (end >= 0 && end < start) {
+ fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
return 1;
}
- if(compress == 1){
- int f_src, f_dst = -1;
- if(argc > optind){
- if((f_src = open(argv[optind], O_RDONLY)) < 0){
- fprintf(stderr, " -- Cannot open file: %s --\n", argv[optind]);
+ if (compress == 1) {
+ struct stat sbuf;
+ int f_src = fileno(stdin);
+ int f_dst = fileno(stdout);
+
+ if ( argc>optind )
+ {
+ if ( stat(argv[optind],&sbuf)<0 )
+ {
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+ return 1;
+ }
+
+ if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
return 1;
}
- if(pstdout){
+
+ if (pstdout)
f_dst = fileno(stdout);
- } else {
- char *name = malloc(sizeof(strlen(argv[optind]) + 5));
+ else
+ {
+ char *name = malloc(strlen(argv[optind]) + 5);
strcpy(name, argv[optind]);
strcat(name, ".gz");
f_dst = write_open(name, is_forced);
if (f_dst < 0) return 1;
free(name);
}
- } else if(pstdout){
- f_src = fileno(stdin);
- f_dst = fileno(stdout);
- } else return bgzip_main_usage();
- rz = bgzf_fdopen(f_dst, "w");
+ }
+ else if (!pstdout && isatty(fileno((FILE *)stdout)) )
+ return bgzip_main_usage();
+
+ fp = bgzf_fdopen(f_dst, "w");
buffer = malloc(WINDOW_SIZE);
- while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) {
- if (bgzf_write(rz, buffer, c) < 0) {
- fail(rz);
- }
- }
- // f_dst will be closed here
- if (bgzf_close(rz) < 0) {
- fail(rz);
- }
- if (argc > optind) unlink(argv[optind]);
+ while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
+ if (bgzf_write(fp, buffer, c) < 0) fail(fp);
+ // f_dst will be closed here
+ if (bgzf_close(fp) < 0) fail(fp);
+ if (argc > optind && !pstdout) unlink(argv[optind]);
free(buffer);
close(f_src);
return 0;
} else {
- if(argc <= optind) return bgzip_main_usage();
- int f_dst;
- if (argc > optind && !pstdout) {
- char *name;
- if (strstr(argv[optind], ".gz") - argv[optind] != strlen(argv[optind]) - 3) {
- printf("bgzip: %s: unknown suffix -- ignored\n", argv[optind]);
- return 1;
- }
- name = strdup(argv[optind]);
- name[strlen(name) - 3] = '\0';
- f_dst = write_open(name, is_forced);
- free(name);
- } else f_dst = fileno(stdout);
- rz = bgzf_open(argv[optind], "r");
- if (rz == NULL) {
- printf("Could not open file: %s\n", argv[optind]);
- return 1;
- }
- buffer = malloc(WINDOW_SIZE);
- if (bgzf_seek(rz, start, SEEK_SET) < 0) {
- fail(rz);
- }
- while(1){
- if(end < 0) c = bgzf_read(rz, buffer, WINDOW_SIZE);
- else c = bgzf_read(rz, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
- if(c == 0) break;
- if (c < 0) fail(rz);
- start += c;
- write(f_dst, buffer, c);
- if(end >= 0 && start >= end) break;
- }
- free(buffer);
- if (bgzf_close(rz) < 0) {
- fail(rz);
- }
- if (!pstdout) unlink(argv[optind]);
+ struct stat sbuf;
+ int f_dst;
+
+ if ( argc>optind )
+ {
+ if ( stat(argv[optind],&sbuf)<0 )
+ {
+ fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
+ return 1;
+ }
+ char *name;
+ int len = strlen(argv[optind]);
+ if ( strcmp(argv[optind]+len-3,".gz") )
+ {
+ fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
+ return 1;
+ }
+ fp = bgzf_open(argv[optind], "r");
+ if (fp == NULL) {
+ fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
+ return 1;
+ }
+
+ if (pstdout) {
+ f_dst = fileno(stdout);
+ }
+ else {
+ name = strdup(argv[optind]);
+ name[strlen(name) - 3] = '\0';
+ f_dst = write_open(name, is_forced);
+ free(name);
+ }
+ }
+ else if (!pstdout && isatty(fileno((FILE *)stdin)) )
+ return bgzip_main_usage();
+ else
+ {
+ f_dst = fileno(stdout);
+ fp = bgzf_fdopen(fileno(stdin), "r");
+ if (fp == NULL) {
+ fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
+ return 1;
+ }
+ }
+ buffer = malloc(WINDOW_SIZE);
+ if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
+ while (1) {
+ if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
+ else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
+ if (c == 0) break;
+ if (c < 0) fail(fp);
+ start += c;
+ write(f_dst, buffer, c);
+ if (end >= 0 && start >= end) break;
+ }
+ free(buffer);
+ if (bgzf_close(fp) < 0) fail(fp);
+ if (!pstdout) unlink(argv[optind]);
return 0;
}
}
-
-all:../libbam.a ../samtools ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz calDepth
+all:../libbam.a ../samtools ../bcftools/bcftools \
+ ex1.glf ex1.pileup.gz ex1.bam.bai ex1f-rmduppe.bam ex1f-rmdupse.bam ex1.glfview.gz ex1.bcf calDepth
@echo; echo \# You can now launch the viewer with: \'samtools tview ex1.bam ex1.fa\'; echo;
ex1.fa.fai:ex1.fa
ex1b.bam:ex1.bam
../samtools view -h ex1.bam | awk 'BEGIN{FS=OFS="\t"}{if(/^@/)print;else{$$1=$$1"b";print}}' | ../samtools view -bS - > $@
ex1f.rg:
- (echo "@RG ID:ex1 LB:ex1"; echo "@RG ID:ex1a LB:ex1"; echo "@RG ID:ex1b LB:ex1b") > $@
+ (echo "@RG ID:ex1 LB:ex1 SM:ex1"; echo "@RG ID:ex1a LB:ex1 SM:ex1"; echo "@RG ID:ex1b LB:ex1b SM:ex1b") > $@
ex1f.bam:ex1.bam ex1a.bam ex1b.bam ex1f.rg
../samtools merge -rh ex1f.rg $@ ex1.bam ex1a.bam ex1b.bam
ex1f-rmduppe.bam:ex1f.bam
ex1f-rmdupse.bam:ex1f.bam
../samtools rmdup -S ex1f.bam $@
+ex1.bcf:ex1.bam ex1.fa.fai
+ ../samtools mpileup -gf ex1.fa ex1.bam > $@
+
+../bcftools/bcftools:
+ (cd ../bcftools; make bcftools)
+
../samtools:
(cd ..; make samtools)
gcc -g -Wall -O2 -I.. calDepth.c -o $@ -lm -lz -L.. -lbam
clean:
- rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg
\ No newline at end of file
+ rm -fr *.bam *.bai *.glf* *.fai *.pileup* *~ calDepth *.dSYM ex1*.rg ex1.bcf
+
+# ../samtools pileup ex1.bam|perl -ape '$_=$F[4];s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Z//,tr/a-z//);$_=join("\t",@F[0,1],@_)."\n"'
+
+# ../samtools pileup -cf ex1.fa ex1.bam|perl -ape '$_=$F[8];s/\^.//g;s/(\d+)(??{".{$1}"})|\^.//g;@_=(tr/A-Za-z//,tr/,.//);$_=join("\t",@F[0,1],@_)."\n"'
\ No newline at end of file
>ref
AGCATGTTAGATAAGATAGCTGTGCTAGTAGGCAGTCAGCGCCAT
+>ref2
+aggttttataaaacaattaagtctacagagcaactacgcg
@SQ SN:ref LN:45
-r001 163 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
-r002 0 ref 9 30 1S2I6M1P1I4M2I * 0 0 AAAAGATAAGGATAAA *
+@SQ SN:ref2 LN:40
+r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG *
+r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA *
r003 0 ref 9 30 5H6M * 0 0 AGCTAA *
r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC *
r003 16 ref 29 30 6H5M * 0 0 TAGGC *
-r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT *
\ No newline at end of file
+r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT *
+x1 0 ref2 1 30 20M * 0 0 aggttttataaaacaaataa ????????????????????
+x2 0 ref2 2 30 21M * 0 0 ggttttataaaacaaataatt ?????????????????????
+x3 0 ref2 6 30 9M4I13M * 0 0 ttataaaacAAATaattaagtctaca ??????????????????????????
+x4 0 ref2 10 30 25M * 0 0 CaaaTaattaagtctacagagcaac ?????????????????????????
+x5 0 ref2 12 30 24M * 0 0 aaTaattaagtctacagagcaact ????????????????????????
+x6 0 ref2 14 30 23M * 0 0 Taattaagtctacagagcaacta ???????????????????????
#include <stdio.h>
#include <string.h>
#include <stdint.h>
+#include <math.h>
#include "kaln.h"
#define FROM_M 0
-2, -2, -2, -2, -2
};
-ka_param_t ka_param_blast = { 5, 2, 2, aln_sm_blast, 5, 50 };
-ka_param_t ka_param_aa2aa = { 10, 2, 2, aln_sm_blosum62, 22, 50 };
+int aln_sm_qual[] = {
+ 0, -23, -23, -23, 0,
+ -23, 0, -23, -23, 0,
+ -23, -23, 0, -23, 0,
+ -23, -23, -23, 0, 0,
+ 0, 0, 0, 0, 0
+};
+
+ka_param_t ka_param_blast = { 5, 2, 5, 2, aln_sm_blast, 5, 50 };
+ka_param_t ka_param_aa2aa = { 10, 2, 10, 2, aln_sm_blosum62, 22, 50 };
+
+ka_param2_t ka_param2_qual = { 37, 11, 37, 11, 37, 11, 0, 0, aln_sm_qual, 5, 50 };
static uint32_t *ka_path2cigar32(const path_t *path, int path_len, int *n_cigar)
{
}
#define set_end_I(II, cur, p) \
{ \
- if (gap_end >= 0) { \
- if ((p)->M - gap_open > (p)->I) { \
+ if (gap_end_ext >= 0) { \
+ if ((p)->M - gap_end_open > (p)->I) { \
(cur)->It = FROM_M; \
- (II) = (p)->M - gap_open - gap_end; \
+ (II) = (p)->M - gap_end_open - gap_end_ext; \
} else { \
(cur)->It = FROM_I; \
- (II) = (p)->I - gap_end; \
+ (II) = (p)->I - gap_end_ext; \
} \
} else set_I(II, cur, p); \
}
}
#define set_end_D(DD, cur, p) \
{ \
- if (gap_end >= 0) { \
- if ((p)->M - gap_open > (p)->D) { \
+ if (gap_end_ext >= 0) { \
+ if ((p)->M - gap_end_open > (p)->D) { \
(cur)->Dt = FROM_M; \
- (DD) = (p)->M - gap_open - gap_end; \
+ (DD) = (p)->M - gap_end_open - gap_end_ext; \
} else { \
(cur)->Dt = FROM_D; \
- (DD) = (p)->D - gap_end; \
+ (DD) = (p)->D - gap_end_ext; \
} \
} else set_D(DD, cur, p); \
}
typedef struct {
- uint8_t Mt:3, It:2, Dt:2;
+ uint8_t Mt:3, It:2, Dt:3;
} dpcell_t;
typedef struct {
uint8_t type, ctype;
uint32_t *cigar = 0;
- int gap_open, gap_ext, gap_end, b;
+ int gap_open, gap_ext, gap_end_open, gap_end_ext, b;
int *score_matrix, N_MATRIX_ROW;
/* initialize some align-related parameters. just for compatibility */
gap_open = ap->gap_open;
gap_ext = ap->gap_ext;
- gap_end = ap->gap_end;
+ gap_end_open = ap->gap_end_open;
+ gap_end_ext = ap->gap_end_ext;
b = ap->band_width;
score_matrix = ap->matrix;
N_MATRIX_ROW = ap->row;
- *n_cigar = 0;
+ if (n_cigar) *n_cigar = 0;
if (len1 == 0 || len2 == 0) return 0;
/* calculate b1 and b2 */
return cigar;
}
+
+typedef struct {
+ int M, I, D;
+} score_aux_t;
+
+#define MINUS_INF -0x40000000
+
+// matrix: len2 rows and len1 columns
+int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap)
+{
+
+#define __score_aux(_p, _q0, _sc, _io, _ie, _do, _de) { \
+ int t1, t2; \
+ score_aux_t *_q; \
+ _q = _q0; \
+ _p->M = _q->M >= _q->I? _q->M : _q->I; \
+ _p->M = _p->M >= _q->D? _p->M : _q->D; \
+ _p->M += (_sc); \
+ ++_q; t1 = _q->M - _io - _ie; t2 = _q->I - _ie; _p->I = t1 >= t2? t1 : t2; \
+ _q = _p-1; t1 = _q->M - _do - _de; t2 = _q->D - _de; _p->D = t1 >= t2? t1 : t2; \
+ }
+
+ int i, j, bw, scmat_size = ap->row, *scmat = ap->matrix, ret;
+ const uint8_t *seq1, *seq2;
+ score_aux_t *curr, *last, *swap;
+ bw = abs(len1 - len2) + ap->band_width;
+ i = len1 > len2? len1 : len2;
+ if (bw > i + 1) bw = i + 1;
+ seq1 = _seq1 - 1; seq2 = _seq2 - 1;
+ curr = calloc(len1 + 2, sizeof(score_aux_t));
+ last = calloc(len1 + 2, sizeof(score_aux_t));
+ { // the zero-th row
+ int x, end = len1;
+ score_aux_t *p;
+ j = 0;
+ x = j + bw; end = len1 < x? len1 : x; // band end
+ p = curr;
+ p->M = 0; p->I = p->D = MINUS_INF;
+ for (i = 1, p = &curr[1]; i <= end; ++i, ++p)
+ p->M = p->I = MINUS_INF, p->D = -(ap->edo + ap->ede * i);
+ p->M = p->I = p->D = MINUS_INF;
+ swap = curr; curr = last; last = swap;
+ }
+ for (j = 1; j < len2; ++j) {
+ int x, beg = 0, end = len1, *scrow, col_end;
+ score_aux_t *p;
+ x = j - bw; beg = 0 > x? 0 : x; // band start
+ x = j + bw; end = len1 < x? len1 : x; // band end
+ if (beg == 0) { // from zero-th column
+ p = curr;
+ p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
+ ++beg; // then beg = 1
+ }
+ scrow = scmat + seq2[j] * scmat_size;
+ if (end == len1) col_end = 1, --end;
+ else col_end = 0;
+ for (i = beg, p = &curr[beg]; i <= end; ++i, ++p)
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->ido, ap->ide);
+ if (col_end) {
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->ido, ap->ide);
+ ++p;
+ }
+ p->M = p->I = p->D = MINUS_INF;
+// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
+ swap = curr; curr = last; last = swap;
+ }
+ { // the last row
+ int x, beg = 0, *scrow;
+ score_aux_t *p;
+ j = len2;
+ x = j - bw; beg = 0 > x? 0 : x; // band start
+ if (beg == 0) { // from zero-th column
+ p = curr;
+ p->M = p->D = MINUS_INF; p->I = -(ap->eio + ap->eie * j);
+ ++beg; // then beg = 1
+ }
+ scrow = scmat + seq2[j] * scmat_size;
+ for (i = beg, p = &curr[beg]; i < len1; ++i, ++p)
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->iio, ap->iie, ap->edo, ap->ede);
+ __score_aux(p, &last[i-1], scrow[(int)seq1[i]], ap->eio, ap->eie, ap->edo, ap->ede);
+// for (i = 0; i <= len1; ++i) printf("(%d,%d,%d) ", curr[i].M, curr[i].I, curr[i].D); putchar('\n');
+ }
+ ret = curr[len1].M >= curr[len1].I? curr[len1].M : curr[len1].I;
+ ret = ret >= curr[len1].D? ret : curr[len1].D;
+ free(curr); free(last);
+ return ret;
+}
+
+#ifdef _MAIN
+int main(int argc, char *argv[])
+{
+// int len1 = 35, len2 = 35;
+// uint8_t *seq1 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\0\1";
+// uint8_t *seq2 = (uint8_t*)"\0\0\3\3\2\0\0\0\1\0\2\1\2\1\3\2\3\3\3\0\2\3\2\1\1\3\3\3\2\3\3\1\0\1\0";
+ int len1 = 4, len2 = 4;
+ uint8_t *seq1 = (uint8_t*)"\1\0\0\1";
+ uint8_t *seq2 = (uint8_t*)"\1\0\1\0";
+ int sc;
+// ka_global_core(seq1, 2, seq2, 1, &ka_param_qual, &sc, 0);
+ sc = ka_global_score(seq1, len1, seq2, len2, &ka_param2_qual);
+ printf("%d\n", sc);
+ return 0;
+}
+#endif
typedef struct {
int gap_open;
int gap_ext;
- int gap_end;
+ int gap_end_open;
+ int gap_end_ext;
int *matrix;
int row;
int band_width;
} ka_param_t;
+typedef struct {
+ int iio, iie, ido, ide;
+ int eio, eie, edo, ede;
+ int *matrix;
+ int row;
+ int band_width;
+} ka_param2_t;
+
#ifdef __cplusplus
extern "C" {
#endif
- uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap, int *_score, int *n_cigar);
-
+ uint32_t *ka_global_core(uint8_t *seq1, int len1, uint8_t *seq2, int len2, const ka_param_t *ap,
+ int *_score, int *n_cigar);
+ int ka_global_score(const uint8_t *_seq1, int len1, const uint8_t *_seq2, int len2, const ka_param2_t *ap);
#ifdef __cplusplus
}
#endif
-extern ka_param_t ka_param_blast; /* = { 5, 2, 2, aln_sm_blast, 5, 50 }; */
+extern ka_param_t ka_param_blast; /* = { 5, 2, 5, 2, aln_sm_blast, 5, 50 }; */
+extern ka_param_t ka_param_qual; // only use this for global alignment!!!
+extern ka_param2_t ka_param2_qual; // only use this for global alignment!!!
#endif
if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
off_t rest = len, curr;
while (rest) {
- curr = read(fp->fd, buf + l, rest);
+ do {
+ curr = read(fp->fd, buf + l, rest);
+ } while (curr < 0 && EINTR == errno);
+ if (curr < 0) return -1;
if (curr == 0) break;
l += curr; rest -= curr;
}
if (hh <= k) low = ll; \
if (hh >= k) high = hh - 1; \
} \
+ } \
+ void ks_shuffle_##name(size_t n, type_t a[]) \
+ { \
+ int i, j; \
+ for (i = n; i > 1; --i) { \
+ type_t tmp; \
+ j = (int)(drand48() * i); \
+ tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp; \
+ } \
}
#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t)
#define ks_heapmake(name, n, a) ks_heapmake_##name(n, a)
#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a)
#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k)
+#define ks_shuffle(name, n, a) ks_shuffle_##name(n, a)
#define ks_lt_generic(a, b) ((a) < (b))
#define ks_lt_str(a, b) (strcmp((a), (b)) < 0)
return l;
}
+char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux)
+{
+ const char *p, *start;
+ if (sep) { // set up the table
+ if (str == 0 && (aux->tab[0]&1)) return 0; // no need to set up if we have finished
+ aux->tab[0] = aux->tab[1] = aux->tab[2] = aux->tab[3] = 0;
+ for (p = sep; *p; ++p)
+ aux->tab[*p/64] |= 1ull<<(*p%64);
+ }
+ if (str) aux->p = str - 1, aux->tab[0] &= ~1ull;
+ else if (aux->tab[0]&1) return 0;
+ for (p = start = aux->p + 1; *p; ++p)
+ if (aux->tab[*p/64]>>(*p%64)&1) break;
+ aux->p = p; // end of token
+ if (*p == 0) aux->tab[0] |= 1; // no more tokens
+ return (char*)start;
+}
+
// s MUST BE a null terminated string; l = strlen(s)
int ksplit_core(char *s, int delimiter, int *_max, int **_offsets)
{
* Boyer-Moore search *
**********************/
+typedef unsigned char ubyte_t;
+
// reference: http://www-igm.univ-mlv.fr/~lecroq/string/node14.html
-int *ksBM_prep(const uint8_t *pat, int m)
+static int *ksBM_prep(const ubyte_t *pat, int m)
{
int i, *suff, *prep, *bmGs, *bmBc;
- prep = calloc(m + 256, 1);
+ prep = calloc(m + 256, sizeof(int));
bmGs = prep; bmBc = prep + m;
{ // preBmBc()
for (i = 0; i < 256; ++i) bmBc[i] = m;
return prep;
}
-int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches)
+void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep)
{
- int i, j, *prep, *bmGs, *bmBc;
- int *matches = 0, mm = 0, nm = 0;
- prep = _prep? _prep : ksBM_prep(pat, m);
+ int i, j, *prep = 0, *bmGs, *bmBc;
+ const ubyte_t *str, *pat;
+ str = (const ubyte_t*)_str; pat = (const ubyte_t*)_pat;
+ prep = (_prep == 0 || *_prep == 0)? ksBM_prep(pat, m) : *_prep;
+ if (_prep && *_prep == 0) *_prep = prep;
bmGs = prep; bmBc = prep + m;
j = 0;
while (j <= n - m) {
for (i = m - 1; i >= 0 && pat[i] == str[i+j]; --i);
- if (i < 0) {
- if (nm == mm) {
- mm = mm? mm<<1 : 1;
- matches = realloc(matches, mm * sizeof(int));
- }
- matches[nm++] = j;
- j += bmGs[0];
- } else {
+ if (i >= 0) {
int max = bmBc[str[i+j]] - m + 1 + i;
if (max < bmGs[i]) max = bmGs[i];
j += max;
- }
+ } else return (void*)(str + j);
}
- *n_matches = nm;
if (_prep == 0) free(prep);
- return matches;
+ return 0;
}
+char *kstrstr(const char *str, const char *pat, int **_prep)
+{
+ return (char*)kmemmem(str, strlen(str), pat, strlen(pat), _prep);
+}
+
+char *kstrnstr(const char *str, const char *pat, int n, int **_prep)
+{
+ return (char*)kmemmem(str, n, pat, strlen(pat), _prep);
+}
+
+/***********************
+ * The main() function *
+ ***********************/
+
#ifdef KSTRING_MAIN
#include <stdio.h>
int main()
{
kstring_t *s;
int *fields, n, i;
+ ks_tokaux_t aux;
+ char *p;
s = (kstring_t*)calloc(1, sizeof(kstring_t));
// test ksprintf()
ksprintf(s, " abcdefg: %d ", 100);
fields = ksplit(s, 0, &n);
for (i = 0; i < n; ++i)
printf("field[%d] = '%s'\n", i, s->s + fields[i]);
- free(s);
+ // test kstrtok()
+ s->l = 0;
+ for (p = kstrtok("ab:cde:fg/hij::k", ":/", &aux); p; p = kstrtok(0, 0, &aux)) {
+ kputsn(p, aux.p - p, s);
+ kputc('\n', s);
+ }
+ printf("%s", s->s);
+ // free
+ free(s->s); free(s); free(fields);
{
- static char *str = "abcdefgcdg";
+ static char *str = "abcdefgcdgcagtcakcdcd";
static char *pat = "cd";
- int n, *matches;
- matches = ksBM_search(str, strlen(str), pat, strlen(pat), 0, &n);
- printf("%d: \n", n);
- for (i = 0; i < n; ++i)
- printf("- %d\n", matches[i]);
- free(matches);
+ char *ret, *s = str;
+ int *prep = 0;
+ while ((ret = kstrstr(s, pat, &prep)) != 0) {
+ printf("match: %s\n", ret);
+ s = ret + prep[0];
+ }
+ free(prep);
}
return 0;
}
} kstring_t;
#endif
-int ksprintf(kstring_t *s, const char *fmt, ...);
-int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+typedef struct {
+ uint64_t tab[4];
+ const char *p; // end of the current token
+} ks_tokaux_t;
-// calculate the auxiliary array, allocated by calloc()
-int *ksBM_prep(const uint8_t *pat, int m);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ int ksprintf(kstring_t *s, const char *fmt, ...);
+ int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
+ char *kstrstr(const char *str, const char *pat, int **_prep);
+ char *kstrnstr(const char *str, const char *pat, int n, int **_prep);
+ void *kmemmem(const void *_str, int n, const void *_pat, int m, int **_prep);
-/* Search pat in str and returned the list of matches. The size of the
- * list is returned as n_matches. _prep is the array returned by
- * ksBM_prep(). If it is a NULL pointer, ksBM_prep() will be called. */
-int *ksBM_search(const uint8_t *str, int n, const uint8_t *pat, int m, int *_prep, int *n_matches);
+ /* kstrtok() is similar to strtok_r() except that str is not
+ * modified and both str and sep can be NULL. For efficiency, it is
+ * actually recommended to set both to NULL in the subsequent calls
+ * if sep is not changed. */
+ char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux);
+#ifdef __cplusplus
+}
+#endif
+
static inline int kputsn(const char *p, int l, kstring_t *s)
{
if (s->l + l + 1 >= s->m) {
kroundup32(s->m);
s->s = (char*)realloc(s->s, s->m);
}
- strncpy(s->s + s->l, p, l);
+ memcpy(s->s + s->l, p, l);
s->l += l;
s->s[s->l] = 0;
return l;
CC= gcc
CXX= g++
-CFLAGS= -g -Wall #-O2 #-m64 #-arch ppc
+CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
CXXFLAGS= $(CFLAGS)
DFLAGS= -D_FILE_OFFSET_BITS=64
OBJS=
&usage if (@ARGV < 1);
my $command = shift(@ARGV);
-my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter,
+my %func = (showALEN=>\&showALEN, pileup2fq=>\&pileup2fq, varFilter=>\&varFilter, plp2vcf=>\&plp2vcf,
unique=>\&unique, uniqcmp=>\&uniqcmp, sra2hdr=>\&sra2hdr, sam2fq=>\&sam2fq);
die("Unknown command \"$command\".\n") if (!defined($func{$command}));
my $len=0;
if ($flt == 0) {
if ($t[2] eq '*') { # an indel
-
# If deletion, remember the length of the deletion
my ($a,$b) = split(m{/},$t[3]);
my $alen = length($a) - 1;
close($fh);
}
+sub plp2vcf {
+ while (<>) {
+ my @t = split;
+ next if ($t[3] eq '*/*');
+ if ($t[2] eq '*') { # indel
+ my @s = split("/", $t[3]);
+ my (@a, @b);
+ my ($ref, $alt);
+ for (@s) {
+ next if ($_ eq '*');
+ if (/^-/) {
+ push(@a, 'N'.substr($_, 1));
+ push(@b, 'N');
+ } elsif (/^\+/) {
+ push(@a, 'N');
+ push(@b, 'N'.substr($_, 1));
+ }
+ }
+ if ($a[0] && $a[1]) {
+ if (length($a[0]) < length($a[1])) {
+ $ref = $a[1];
+ $alt = ($b[0] . ('N' x (length($a[1]) - length($a[0])))) . ",$b[1]";
+ } elsif (length($a[0]) > length($a[1])) {
+ $ref = $a[0];
+ $alt = ($b[1] . ('N' x (length($a[0]) - length($a[1])))) . ",$b[0]";
+ } else {
+ $ref = $a[0];
+ $alt = ($b[0] eq $b[1])? $b[0] : "$b[0],$b[1]";
+ }
+ } else {
+ $ref = $a[0]; $alt = $b[0];
+ }
+ print join("\t", @t[0,1], '.', $ref, $alt, $t[5], '.', '.'), "\n";
+ } else { # SNP
+ }
+ }
+}
+
#
# Usage
#
buffer = malloc(WINDOW_SIZE);
while((c = read(f_src, buffer, WINDOW_SIZE)) > 0) razf_write(rz, buffer, c);
razf_close(rz); // f_dst will be closed here
- if (argc > optind) unlink(argv[optind]);
+ if (argc > optind && !pstdout) unlink(argv[optind]);
free(buffer);
close(f_src);
return 0;
#include "khash.h"
KHASH_SET_INIT_STR(rg)
+// When counting records instead of printing them,
+// data passed to the bam_fetch callback is encapsulated in this struct.
+typedef struct {
+ bam_header_t *header;
+ int *count;
+} count_func_data_t;
+
typedef khash_t(rg) *rghash_t;
rghash_t g_rghash = 0;
return 0;
}
-// callback function for bam_fetch()
+// callback function for bam_fetch() that prints nonskipped records
static int view_func(const bam1_t *b, void *data)
{
if (!__g_skip_aln(((samfile_t*)data)->header, b))
return 0;
}
+// callback function for bam_fetch() that counts nonskipped records
+static int count_func(const bam1_t *b, void *data)
+{
+ if (!__g_skip_aln(((count_func_data_t*)data)->header, b)) {
+ (*((count_func_data_t*)data)->count)++;
+ }
+ return 0;
+}
+
static int usage(int is_long_help);
int main_samview(int argc, char *argv[])
{
- int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0;
+ int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, is_uncompressed = 0, is_bamout = 0, slx2sngr = 0, is_count = 0;
int of_type = BAM_OFDEC, is_long_help = 0;
+ int count = 0;
samfile_t *in = 0, *out = 0;
char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0;
/* parse command-line options */
strcpy(in_mode, "r"); strcpy(out_mode, "w");
- while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) {
+ while ((c = getopt(argc, argv, "Sbct:hHo:q:f:F:ul:r:xX?T:CR:")) >= 0) {
switch (c) {
+ case 'c': is_count = 1; break;
case 'C': slx2sngr = 1; break;
case 'S': is_bamin = 0; break;
case 'b': is_bamout = 1; break;
if (fn_list == 0 && fn_ref) fn_list = samfaipath(fn_ref);
// open file handlers
if ((in = samopen(argv[optind], in_mode, fn_list)) == 0) {
- fprintf(stderr, "[main_samview] fail to open file for reading.\n");
+ fprintf(stderr, "[main_samview] fail to open \"%s\" for reading.\n", argv[optind]);
+ ret = 1;
goto view_end;
}
if (in->header == 0) {
- fprintf(stderr, "[main_samview] fail to read the header.\n");
+ fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", argv[optind]);
+ ret = 1;
goto view_end;
}
- if ((out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
- fprintf(stderr, "[main_samview] fail to open file for writing.\n");
+ if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
+ fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
+ ret = 1;
goto view_end;
}
if (is_header_only) goto view_end; // no need to print alignments
while ((r = samread(in, b)) >= 0) { // read one alignment from `in'
if (!__g_skip_aln(in->header, b)) {
if (slx2sngr) sol2sanger(b);
- samwrite(out, b); // write the alignment to `out'
+ if (!is_count) samwrite(out, b); // write the alignment to `out'
+ count++;
}
}
- if (r < -1) fprintf(stderr, "[main_samview] truncated file.\n");
+ if (r < -1) {
+ fprintf(stderr, "[main_samview] truncated file.\n");
+ ret = 1;
+ }
bam_destroy1(b);
} else { // retrieve alignments in specified regions
int i;
goto view_end;
}
for (i = optind + 1; i < argc; ++i) {
- int tid, beg, end;
+ int tid, beg, end, result;
bam_parse_region(in->header, argv[i], &tid, &beg, &end); // parse a region in the format like `chr2:100-200'
if (tid < 0) { // reference name is not found
- fprintf(stderr, "[main_samview] fail to get the reference name. Continue anyway.\n");
+ fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]);
continue;
}
- bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func); // fetch alignments
+ // fetch alignments
+ if (is_count) {
+ count_func_data_t count_data = { in->header, &count };
+ result = bam_fetch(in->x.bam, idx, tid, beg, end, &count_data, count_func);
+ } else
+ result = bam_fetch(in->x.bam, idx, tid, beg, end, out, view_func);
+ if (result < 0) {
+ fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]);
+ ret = 1;
+ break;
+ }
}
bam_index_destroy(idx); // destroy the BAM index
}
view_end:
+ if (is_count && ret == 0) {
+ printf("%d\n", count);
+ }
// close files, free and return
free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg);
if (g_rghash) {
kh_destroy(rg, g_rghash);
}
samclose(in);
- samclose(out);
+ if (!is_count)
+ samclose(out);
return ret;
}
fprintf(stderr, " -u uncompressed BAM output (force -b)\n");
fprintf(stderr, " -x output FLAG in HEX (samtools-C specific)\n");
fprintf(stderr, " -X output FLAG in string (samtools-C specific)\n");
+ fprintf(stderr, " -c print only the count of matching records\n");
fprintf(stderr, " -t FILE list of reference names and lengths (force -S) [null]\n");
fprintf(stderr, " -T FILE reference sequence file (force -S) [null]\n");
fprintf(stderr, " -o FILE output file name [stdout]\n");
-.TH samtools 1 "11 July 2010" "samtools-0.1.8" "Bioinformatics tools"
+.TH samtools 1 "2 December 2010" "samtools-0.1.12" "Bioinformatics tools"
.SH NAME
.PP
samtools - Utilities for the Sequence Alignment/Map (SAM) format
.PP
samtools faidx ref.fasta
.PP
-samtools pileup -f ref.fasta aln.sorted.bam
+samtools pileup -vcf ref.fasta aln.sorted.bam
.PP
-samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
+samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
.PP
samtools tview aln.sorted.bam ref.fasta
.TP 10
.B view
-samtools view [-bhuHS] [-t in.refList] [-o output] [-f reqFlag] [-F
+samtools view [-bchuHS] [-t in.refList] [-o output] [-f reqFlag] [-F
skipFlag] [-q minMapQ] [-l library] [-r readGroup] [-R rgFile] <in.bam>|<in.sam> [region1 [...]]
Extract/print all or sub alignments in SAM or BAM format. If no region
.B -b
Output in the BAM format.
.TP
-.B -u
-Output uncompressed BAM. This option saves time spent on
-compression/decomprssion and is thus preferred when the output is piped
-to another samtools command.
+.BI -f \ INT
+Only output alignments with all bits in INT present in the FLAG
+field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
+.TP
+.BI -F \ INT
+Skip alignments with bits present in INT [0]
.TP
.B -h
Include the header in the output.
.B -H
Output the header only.
.TP
+.BI -l \ STR
+Only output reads in library STR [null]
+.TP
+.BI -o \ FILE
+Output file [stdout]
+.TP
+.BI -q \ INT
+Skip alignments with MAPQ smaller than INT [0]
+.TP
+.BI -r \ STR
+Only output reads in read group STR [null]
+.TP
+.BI -R \ FILE
+Output reads in read groups listed in
+.I FILE
+[null]
+.TP
.B -S
Input is in SAM. If @SQ header lines are absent, the
.B `-t'
option is required.
.TP
-.B -t FILE
+.B -c
+Instead of printing the alignments, only count them and print the
+total number. All filter options, such as
+.B `-f',
+.B `-F'
+and
+.B `-q'
+, are taken into account.
+.TP
+.BI -t \ FILE
This file is TAB-delimited. Each line must contain the reference name
and the length of the reference, one line for each distinct reference;
additional fields are ignored. This file also defines the order of the
.I <in.ref_list>
file.
.TP
-.B -o FILE
-Output file [stdout]
-.TP
-.B -f INT
-Only output alignments with all bits in INT present in the FLAG
-field. INT can be in hex in the format of /^0x[0-9A-F]+/ [0]
-.TP
-.B -F INT
-Skip alignments with bits present in INT [0]
-.TP
-.B -q INT
-Skip alignments with MAPQ smaller than INT [0]
-.TP
-.B -l STR
-Only output reads in library STR [null]
-.TP
-.B -r STR
-Only output reads in read group STR [null]
-.TP
-.B -R FILE
-Output reads in read groups listed in
-.I FILE
-[null]
+.B -u
+Output uncompressed BAM. This option saves time spent on
+compression/decomprssion and is thus preferred when the output is piped
+to another samtools command.
.RE
.TP
region in the format like `chr10:10,000,000' or `=10,000,000' when
viewing the same reference sequence.
-.TP
-.B pileup
-samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list]
-[-iscgS2] [-T theta] [-N nHap] [-r pairDiffRate] <in.bam>|<in.sam>
-
-Print the alignment in the pileup format. In the pileup format, each
-line represents a genomic position, consisting of chromosome name,
-coordinate, reference base, read bases, read qualities and alignment
-mapping qualities. Information on match, mismatch, indel, strand,
-mapping quality and start and end of a read are all encoded at the read
-base column. At this column, a dot stands for a match to the reference
-base on the forward strand, a comma for a match on the reverse strand,
-`ACGTN' for a mismatch on the forward strand and `acgtn' for a mismatch
-on the reverse strand. A pattern `\\+[0-9]+[ACGTNacgtn]+' indicates
-there is an insertion between this reference position and the next
-reference position. The length of the insertion is given by the integer
-in the pattern, followed by the inserted sequence. Similarly, a pattern
-`-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. The
-deleted bases will be presented as `*' in the following lines. Also at
-the read base column, a symbol `^' marks the start of a read segment
-which is a contiguous subsequence on the read separated by `N/S/H' CIGAR
-operations. The ASCII of the character following `^' minus 33 gives the
-mapping quality. A symbol `$' marks the end of a read segment.
-
-If option
-.B -c
-is applied, the consensus base, Phred-scaled consensus quality, SNP
-quality (i.e. the Phred-scaled probability of the consensus being
-identical to the reference) and root mean square (RMS) mapping quality
-of the reads covering the site will be inserted between the `reference
-base' and the `read bases' columns. An indel occupies an additional
-line. Each indel line consists of chromosome name, coordinate, a star,
-the genotype, consensus quality, SNP quality, RMS mapping quality, #
-covering reads, the first alllele, the second allele, # reads supporting
-the first allele, # reads supporting the second allele and # reads
-containing indels different from the top two alleles.
-
-The position of indels is offset by -1.
-
-.B OPTIONS:
-.RS
-.TP 10
-.B -s
-Print the mapping quality as the last column. This option makes the
-output easier to parse, although this format is not space efficient.
-.TP
-.B -S
-The input file is in SAM.
-.TP
-.B -i
-Only output pileup lines containing indels.
-.TP
-.B -f FILE
-The reference sequence in the FASTA format. Index file
-.I FILE.fai
-will be created if
-absent.
-.TP
-.B -M INT
-Cap mapping quality at INT [60]
-.TP
-.B -m INT
-Filter reads with flag containing bits in
-.I
-INT
-[1796]
-.TP
-.B -d INT
-Use the first
-.I NUM
-reads in the pileup for indel calling for speed up. Zero for unlimited. [0]
-.TP
-.B -t FILE
-List of reference names ane sequence lengths, in the format described
-for the
-.B import
-command. If this option is present, samtools assumes the input
-.I <in.alignment>
-is in SAM format; otherwise it assumes in BAM format.
-.TP
-.B -l FILE
-List of sites at which pileup is output. This file is space
-delimited. The first two columns are required to be chromosome and
-1-based coordinate. Additional columns are ignored. It is
-recommended to use option
-.B -s
-together with
-.B -l
-as in the default format we may not know the mapping quality.
-.TP
-.B -c
-Call the consensus sequence using SOAPsnp consensus model. Options
-.B -T,
-.B -N,
-.B -I
-and
-.B -r
-are only effective when
-.B -c
-or
-.B -g
-is in use.
-.TP
-.B -g
-Generate genotype likelihood in the binary GLFv3 format. This option
-suppresses -c, -i and -s.
-.TP
-.B -T FLOAT
-The theta parameter (error dependency coefficient) in the maq consensus
-calling model [0.85]
-.TP
-.B -N INT
-Number of haplotypes in the sample (>=2) [2]
-.TP
-.B -r FLOAT
-Expected fraction of differences between a pair of haplotypes [0.001]
-.TP
-.B -I INT
-Phred probability of an indel in sequencing/prep. [40]
-.RE
-
.TP
.B mpileup
-samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]]
+samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam [...]]
-Generate pileup for multiple BAM files. Consensus calling is not
-implemented.
+Generate BCF or pileup for one or multiple BAM files. Alignment records
+are grouped by sample identifiers in @RG header lines. If sample
+identifiers are absent, each input file is regarded as one sample.
.B OPTIONS:
.RS
.TP 8
-.B -r STR
+.B -B
+Disable probabilistic realignment for the computation of base alignment
+quality (BAQ). BAQ is the Phred-scaled probability of a read base being
+misaligned. Applying this option greatly helps to reduce false SNPs
+caused by misalignments.
+.TP
+.BI -C \ INT
+Coefficient for downgrading mapping quality for reads containing
+excessive mismatches. Given a read with a phred-scaled probability q of
+being generated from the mapped position, the new mapping quality is
+about sqrt((INT-q)/INT)*INT. A zero value disables this
+functionality; if enabled, the recommended value for BWA is 50. [0]
+.TP
+.BI -e \ INT
+Phred-scaled gap extension sequencing error probability. Reducing
+.I INT
+leads to longer indels. [20]
+.TP
+.BI -f \ FILE
+The reference file [null]
+.TP
+.B -g
+Compute genotype likelihoods and output them in the binary call format (BCF).
+.TP
+.BI -h \ INT
+Coefficient for modeling homopolymer errors. Given an
+.IR l -long
+homopolymer
+run, the sequencing error of an indel of size
+.I s
+is modeled as
+.IR INT * s / l .
+[100]
+.TP
+.BI -l \ FILE
+File containing a list of sites where pileup or BCF is outputted [null]
+.TP
+.BI -o \ INT
+Phred-scaled gap open sequencing error probability. Reducing
+.I INT
+leads to more indel calls. [40]
+.TP
+.BI -P \ STR
+Comma dilimited list of platforms (determined by
+.BR @RG-PL )
+from which indel candidates are obtained. It is recommended to collect
+indel candidates from sequencing technologies that have low indel error
+rate such as ILLUMINA. [all]
+.TP
+.BI -q \ INT
+Minimum mapping quality for an alignment to be used [0]
+.TP
+.BI -Q \ INT
+Minimum base quality for a base to be considered [13]
+.TP
+.BI -r \ STR
Only generate pileup in region
.I STR
[all sites]
.TP
-.B -f FILE
-The reference file [null]
+.B -u
+Similar to
+.B -g
+except that the output is uncompressed BCF, which is preferred for piping.
.RE
.TP
.B -n
Sort by read names rather than by chromosomal coordinates
.TP
-.B -m INT
+.BI -m \ INT
Approximately the maximum required memory. [500000000]
.RE
.TP
.B merge
-samtools merge [-h inh.sam] [-nr] <out.bam> <in1.bam> <in2.bam> [...]
+samtools merge [-nur] [-h inh.sam] [-R reg] <out.bam> <in1.bam> <in2.bam> [...]
Merge multiple sorted alignments.
The header reference lists of all the input BAM files, and the @SQ headers of
.B OPTIONS:
.RS
.TP 8
-.B -h FILE
+.BI -h \ FILE
Use the lines of
.I FILE
as `@' headers to be copied to
is actually in SAM format, though any alignment records it may contain
are ignored.)
.TP
+.BI -R \ STR
+Merge files in the specified region indicated by
+.I STR
+.TP
.B -r
Attach an RG tag to each alignment. The tag value is inferred from file names.
.TP
.B -n
The input alignments are sorted by read names rather than by chromosomal
coordinates
+.TP
+.B -u
+Uncompressed BAM output
.RE
.TP
.TP
.B calmd
-samtools calmd [-eubS] <aln.bam> <ref.fasta>
+samtools calmd [-eubSr] [-C capQcoef] <aln.bam> <ref.fasta>
Generate the MD tag. If the MD tag is already present, this command will
give a warning if the MD tag generated is different from the existing
.B OPTIONS:
.RS
.TP 8
+.B -A
+When used jointly with
+.B -r
+this option overwrites the original base quality.
+.TP 8
.B -e
Convert a the read base to = if it is identical to the aligned reference
base. Indel caller does not support the = bases at the moment.
.TP
.B -S
The input is SAM with header lines
+.TP
+.BI -C \ INT
+Coefficient to cap mapping quality of poorly mapped reads. See the
+.B pileup
+command for details. [0]
+.TP
+.B -r
+Compute the BQ tag without changing the base quality.
+.RE
+
+.TP
+.B pileup
+samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list] [-l
+in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N nHap] [-r
+pairDiffRate] [-m mask] [-d maxIndelDepth] [-G indelPrior]
+<in.bam>|<in.sam>
+
+Print the alignment in the pileup format. In the pileup format, each
+line represents a genomic position, consisting of chromosome name,
+coordinate, reference base, read bases, read qualities and alignment
+mapping qualities. Information on match, mismatch, indel, strand,
+mapping quality and start and end of a read are all encoded at the read
+base column. At this column, a dot stands for a match to the reference
+base on the forward strand, a comma for a match on the reverse strand,
+a '>' or '<' for a reference skip, `ACGTN' for a mismatch on the forward
+strand and `acgtn' for a mismatch on the reverse strand. A pattern
+`\\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this
+reference position and the next reference position. The length of the
+insertion is given by the integer in the pattern, followed by the
+inserted sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+'
+represents a deletion from the reference. The deleted bases will be
+presented as `*' in the following lines. Also at the read base column, a
+symbol `^' marks the start of a read. The ASCII of the character
+following `^' minus 33 gives the mapping quality. A symbol `$' marks the
+end of a read segment.
+
+If option
+.B -c
+is applied, the consensus base, Phred-scaled consensus quality, SNP
+quality (i.e. the Phred-scaled probability of the consensus being
+identical to the reference) and root mean square (RMS) mapping quality
+of the reads covering the site will be inserted between the `reference
+base' and the `read bases' columns. An indel occupies an additional
+line. Each indel line consists of chromosome name, coordinate, a star,
+the genotype, consensus quality, SNP quality, RMS mapping quality, #
+covering reads, the first alllele, the second allele, # reads supporting
+the first allele, # reads supporting the second allele and # reads
+containing indels different from the top two alleles.
+
+.B NOTE:
+Since 0.1.10, the `pileup' command is deprecated by `mpileup'.
+
+.B OPTIONS:
+.RS
+.TP 10
+.B -B
+Disable the BAQ computation. See the
+.B mpileup
+command for details.
+.TP
+.B -c
+Call the consensus sequence. Options
+.BR -T ", " -N ", " -I " and " -r
+are only effective when
+.BR -c " or " -g
+is in use.
+.TP
+.BI -C \ INT
+Coefficient for downgrading the mapping quality of poorly mapped
+reads. See the
+.B mpileup
+command for details. [0]
+.TP
+.BI -d \ INT
+Use the first
+.I NUM
+reads in the pileup for indel calling for speed up. Zero for unlimited. [1024]
+.TP
+.BI -f \ FILE
+The reference sequence in the FASTA format. Index file
+.I FILE.fai
+will be created if
+absent.
+.TP
+.B -g
+Generate genotype likelihood in the binary GLFv3 format. This option
+suppresses -c, -i and -s. This option is deprecated by the
+.B mpileup
+command.
+.TP
+.B -i
+Only output pileup lines containing indels.
+.TP
+.BI -I \ INT
+Phred probability of an indel in sequencing/prep. [40]
+.TP
+.BI -l \ FILE
+List of sites at which pileup is output. This file is space
+delimited. The first two columns are required to be chromosome and
+1-based coordinate. Additional columns are ignored. It is
+recommended to use option
+.TP
+.BI -m \ INT
+Filter reads with flag containing bits in
+.I INT
+[1796]
+.TP
+.BI -M \ INT
+Cap mapping quality at INT [60]
+.TP
+.BI -N \ INT
+Number of haplotypes in the sample (>=2) [2]
+.TP
+.BI -r \ FLOAT
+Expected fraction of differences between a pair of haplotypes [0.001]
+.TP
+.B -s
+Print the mapping quality as the last column. This option makes the
+output easier to parse, although this format is not space efficient.
+.TP
+.B -S
+The input file is in SAM.
+.TP
+.BI -t \ FILE
+List of reference names ane sequence lengths, in the format described
+for the
+.B import
+command. If this option is present, samtools assumes the input
+.I <in.alignment>
+is in SAM format; otherwise it assumes in BAM format.
+.B -s
+together with
+.B -l
+as in the default format we may not know the mapping quality.
+.TP
+.BI -T \ FLOAT
+The theta parameter (error dependency coefficient) in the maq consensus
+calling model [0.85]
.RE
.SH SAM FORMAT
0x0400 d the read is either a PCR or an optical duplicate
.TE
+.SH EXAMPLES
+.IP o 2
+Import SAM to BAM when
+.B @SQ
+lines are present in the header:
+
+ samtools view -bS aln.sam > aln.bam
+
+If
+.B @SQ
+lines are absent:
+
+ samtools faidx ref.fa
+ samtools view -bt ref.fa.fai aln.sam > aln.bam
+
+where
+.I ref.fa.fai
+is generated automatically by the
+.B faidx
+command.
+
+.IP o 2
+Attach the
+.B RG
+tag while merging sorted alignments:
+
+ perl -e 'print "@RG\\tID:ga\\tSM:hs\\tLB:ga\\tPL:Illumina\\n@RG\\tID:454\\tSM:hs\\tLB:454\\tPL:454\\n"' > rg.txt
+ samtools merge -rh rg.txt merged.bam ga.bam 454.bam
+
+The value in a
+.B RG
+tag is determined by the file name the read is coming from. In this
+example, in the
+.IR merged.bam ,
+reads from
+.I ga.bam
+will be attached
+.IR RG:Z:ga ,
+while reads from
+.I 454.bam
+will be attached
+.IR RG:Z:454 .
+
+.IP o 2
+Call SNPs and short indels for one diploid individual:
+
+ samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - > var.raw.bcf
+ bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 > var.flt.vcf
+
+The
+.B -D
+option of varFilter controls the maximum read depth, which should be
+adjusted to about twice the average read depth. One may consider to add
+.B -C50
+to
+.B mpileup
+if mapping quality is overestimated for reads containing excessive
+mismatches. Applying this option usually helps
+.B BWA-short
+but may not other mappers.
+
+.IP o 2
+Call SNPs and short indels for multiple diploid individuals:
+
+ samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view -bcvg - > var.raw.bcf
+ bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 > var.flt.vcf
+
+Individuals are identified from the
+.B SM
+tags in the
+.B @RG
+header lines. Individuals can be pooled in one alignment file; one
+individual can also be separated into multiple files. The
+.B -P
+option specifies that indel candidates should be collected only from
+read groups with the
+.B @RG-PL
+tag set to
+.IR ILLUMINA .
+Collecting indel candidates from reads sequenced by an indel-prone
+technology may affect the performance of indel calling.
+
+.IP o 2
+Derive the allele frequency spectrum (AFS) on a list of sites from multiple individuals:
+
+ samtools mpileup -Igf ref.fa *.bam > all.bcf
+ bcftools view -bl sites.list all.bcf > sites.bcf
+ bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs
+ bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs
+ bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs
+ ......
+
+where
+.I sites.list
+contains the list of sites with each line consisting of the reference
+sequence name and position. The following
+.B bcftools
+commands estimate AFS by EM.
+
+.IP o 2
+Dump BAQ applied alignment for other SNP callers:
+
+ samtools calmd -bAr aln.bam > aln.baq.bam
+
+It adds and corrects the
+.B NM
+and
+.B MD
+tags at the same time. The
+.B calmd
+command also comes with the
+.B -C
+option, the same as the one in
+.B pileup
+and
+.BR mpileup .
+Apply if it helps.
+
.SH LIMITATIONS
.PP
.IP o 2
.PP
Heng Li from the Sanger Institute wrote the C version of samtools. Bob
Handsaker from the Broad Institute implemented the BGZF library and Jue
-Ruan from Beijing Genomics Institute wrote the RAZF library. Various
-people in the 1000 Genomes Project contributed to the SAM format
+Ruan from Beijing Genomics Institute wrote the RAZF library. John
+Marshall and Petr Danecek contribute to the source code and various
+people from the 1000 Genomes Project have contributed to the SAM format
specification.
.SH SEE ALSO
samtools faidx ref.fasta
- samtools pileup -f ref.fasta aln.sorted.bam
+ samtools pileup -vcf ref.fasta aln.sorted.bam
- samtools mpileup -f ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
+ samtools mpileup -C50 -gf ref.fasta -r chr3:1,000-2,000 in1.bam in2.bam
samtools tview aln.sorted.bam ref.fasta
DESCRIPTION
- Samtools is a set of utilities that manipulate alignments in the BAM
+ Samtools is a set of utilities that manipulate alignments in the BAM
format. It imports from and exports to the SAM (Sequence Alignment/Map)
- format, does sorting, merging and indexing, and allows to retrieve
+ format, does sorting, merging and indexing, and allows to retrieve
reads in any regions swiftly.
- Samtools is designed to work on a stream. It regards an input file `-'
- as the standard input (stdin) and an output file `-' as the standard
+ Samtools is designed to work on a stream. It regards an input file `-'
+ as the standard input (stdin) and an output file `-' as the standard
output (stdout). Several commands can thus be combined with Unix pipes.
Samtools always output warning and error messages to the standard error
output (stderr).
- Samtools is also able to open a BAM (not SAM) file on a remote FTP or
- HTTP server if the BAM file name starts with `ftp://' or `http://'.
- Samtools checks the current working directory for the index file and
- will download the index upon absence. Samtools does not retrieve the
+ Samtools is also able to open a BAM (not SAM) file on a remote FTP or
+ HTTP server if the BAM file name starts with `ftp://' or `http://'.
+ Samtools checks the current working directory for the index file and
+ will download the index upon absence. Samtools does not retrieve the
entire alignment file unless it is asked to do so.
COMMANDS AND OPTIONS
- view samtools view [-bhuHS] [-t in.refList] [-o output] [-f
- reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r read-
+ view samtools view [-bchuHS] [-t in.refList] [-o output] [-f
+ reqFlag] [-F skipFlag] [-q minMapQ] [-l library] [-r read-
Group] [-R rgFile] <in.bam>|<in.sam> [region1 [...]]
- Extract/print all or sub alignments in SAM or BAM format. If
- no region is specified, all the alignments will be printed;
- otherwise only alignments overlapping the specified regions
- will be output. An alignment may be given multiple times if
+ Extract/print all or sub alignments in SAM or BAM format. If
+ no region is specified, all the alignments will be printed;
+ otherwise only alignments overlapping the specified regions
+ will be output. An alignment may be given multiple times if
it is overlapping several regions. A region can be presented,
- for example, in the following format: `chr2' (the whole
- chr2), `chr2:1000000' (region starting from 1,000,000bp) or
- `chr2:1,000,000-2,000,000' (region between 1,000,000 and
- 2,000,000bp including the end points). The coordinate is
+ for example, in the following format: `chr2' (the whole
+ chr2), `chr2:1000000' (region starting from 1,000,000bp) or
+ `chr2:1,000,000-2,000,000' (region between 1,000,000 and
+ 2,000,000bp including the end points). The coordinate is
1-based.
OPTIONS:
-b Output in the BAM format.
- -u Output uncompressed BAM. This option saves time spent
- on compression/decomprssion and is thus preferred
- when the output is piped to another samtools command.
+ -f INT Only output alignments with all bits in INT present
+ in the FLAG field. INT can be in hex in the format of
+ /^0x[0-9A-F]+/ [0]
+
+ -F INT Skip alignments with bits present in INT [0]
-h Include the header in the output.
-H Output the header only.
+ -l STR Only output reads in library STR [null]
+
+ -o FILE Output file [stdout]
+
+ -q INT Skip alignments with MAPQ smaller than INT [0]
+
+ -r STR Only output reads in read group STR [null]
+
+ -R FILE Output reads in read groups listed in FILE [null]
+
-S Input is in SAM. If @SQ header lines are absent, the
`-t' option is required.
+ -c Instead of printing the alignments, only count them
+ and print the total number. All filter options, such
+ as `-f', `-F' and `-q' , are taken into account.
+
-t FILE This file is TAB-delimited. Each line must contain
the reference name and the length of the reference,
one line for each distinct reference; additional
`samtools faidx <ref.fa>', the resultant index file
<ref.fa>.fai can be used as this <in.ref_list> file.
- -o FILE Output file [stdout]
-
- -f INT Only output alignments with all bits in INT present
- in the FLAG field. INT can be in hex in the format of
- /^0x[0-9A-F]+/ [0]
-
- -F INT Skip alignments with bits present in INT [0]
-
- -q INT Skip alignments with MAPQ smaller than INT [0]
-
- -l STR Only output reads in library STR [null]
-
- -r STR Only output reads in read group STR [null]
-
- -R FILE Output reads in read groups listed in FILE [null]
+ -u Output uncompressed BAM. This option saves time spent
+ on compression/decomprssion and is thus preferred
+ when the output is piped to another samtools command.
tview samtools tview <in.sorted.bam> [ref.fasta]
- Text alignment viewer (based on the ncurses library). In the
- viewer, press `?' for help and press `g' to check the align-
- ment start from a region in the format like
- `chr10:10,000,000' or `=10,000,000' when viewing the same
+ Text alignment viewer (based on the ncurses library). In the
+ viewer, press `?' for help and press `g' to check the align-
+ ment start from a region in the format like
+ `chr10:10,000,000' or `=10,000,000' when viewing the same
reference sequence.
- pileup samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l
- in.site_list] [-iscgS2] [-T theta] [-N nHap] [-r
- pairDiffRate] <in.bam>|<in.sam>
+ mpileup samtools mpileup [-Bug] [-C capQcoef] [-r reg] [-f in.fa] [-l
+ list] [-M capMapQ] [-Q minBaseQ] [-q minMapQ] in.bam [in2.bam
+ [...]]
- Print the alignment in the pileup format. In the pileup for-
- mat, each line represents a genomic position, consisting of
- chromosome name, coordinate, reference base, read bases, read
- qualities and alignment mapping qualities. Information on
- match, mismatch, indel, strand, mapping quality and start and
- end of a read are all encoded at the read base column. At
- this column, a dot stands for a match to the reference base
- on the forward strand, a comma for a match on the reverse
- strand, `ACGTN' for a mismatch on the forward strand and
- `acgtn' for a mismatch on the reverse strand. A pattern
- `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion
- between this reference position and the next reference posi-
- tion. The length of the insertion is given by the integer in
- the pattern, followed by the inserted sequence. Similarly, a
- pattern `-[0-9]+[ACGTNacgtn]+' represents a deletion from the
- reference. The deleted bases will be presented as `*' in the
- following lines. Also at the read base column, a symbol `^'
- marks the start of a read segment which is a contiguous sub-
- sequence on the read separated by `N/S/H' CIGAR operations.
- The ASCII of the character following `^' minus 33 gives the
- mapping quality. A symbol `$' marks the end of a read seg-
- ment.
-
- If option -c is applied, the consensus base, Phred-scaled
- consensus quality, SNP quality (i.e. the Phred-scaled proba-
- bility of the consensus being identical to the reference) and
- root mean square (RMS) mapping quality of the reads covering
- the site will be inserted between the `reference base' and
- the `read bases' columns. An indel occupies an additional
- line. Each indel line consists of chromosome name, coordi-
- nate, a star, the genotype, consensus quality, SNP quality,
- RMS mapping quality, # covering reads, the first alllele, the
- second allele, # reads supporting the first allele, # reads
- supporting the second allele and # reads containing indels
- different from the top two alleles.
-
- The position of indels is offset by -1.
+ Generate BCF or pileup for one or multiple BAM files. Align-
+ ment records are grouped by sample identifiers in @RG header
+ lines. If sample identifiers are absent, each input file is
+ regarded as one sample.
OPTIONS:
- -s Print the mapping quality as the last column. This
- option makes the output easier to parse, although
- this format is not space efficient.
+ -B Disable probabilistic realignment for the computation
+ of base alignment quality (BAQ). BAQ is the Phred-
+ scaled probability of a read base being misaligned.
+ Applying this option greatly helps to reduce false
+ SNPs caused by misalignments.
- -S The input file is in SAM.
+ -C INT Coefficient for downgrading mapping quality for reads
+ containing excessive mismatches. Given a read with a
+ phred-scaled probability q of being generated from
+ the mapped position, the new mapping quality is about
+ sqrt((INT-q)/INT)*INT. A zero value disables this
+ functionality; if enabled, the recommended value for
+ BWA is 50. [0]
- -i Only output pileup lines containing indels.
+ -e INT Phred-scaled gap extension sequencing error probabil-
+ ity. Reducing INT leads to longer indels. [20]
- -f FILE The reference sequence in the FASTA format. Index
- file FILE.fai will be created if absent.
-
- -M INT Cap mapping quality at INT [60]
-
- -m INT Filter reads with flag containing bits in INT
- [1796]
+ -f FILE The reference file [null]
- -d INT Use the first NUM reads in the pileup for indel
- calling for speed up. Zero for unlimited. [0]
+ -g Compute genotype likelihoods and output them in the
+ binary call format (BCF).
- -t FILE List of reference names ane sequence lengths, in
- the format described for the import command. If
- this option is present, samtools assumes the input
- <in.alignment> is in SAM format; otherwise it
- assumes in BAM format.
+ -h INT Coefficient for modeling homopolymer errors. Given an
+ l-long homopolymer run, the sequencing error of an
+ indel of size s is modeled as INT*s/l. [100]
- -l FILE List of sites at which pileup is output. This file
- is space delimited. The first two columns are
- required to be chromosome and 1-based coordinate.
- Additional columns are ignored. It is recommended
- to use option -s together with -l as in the default
- format we may not know the mapping quality.
+ -l FILE File containing a list of sites where pileup or BCF
+ is outputted [null]
- -c Call the consensus sequence using SOAPsnp consensus
- model. Options -T, -N, -I and -r are only effective
- when -c or -g is in use.
+ -o INT Phred-scaled gap open sequencing error probability.
+ Reducing INT leads to more indel calls. [40]
- -g Generate genotype likelihood in the binary GLFv3
- format. This option suppresses -c, -i and -s.
+ -P STR Comma dilimited list of platforms (determined by @RG-
+ PL) from which indel candidates are obtained. It is
+ recommended to collect indel candidates from sequenc-
+ ing technologies that have low indel error rate such
+ as ILLUMINA. [all]
- -T FLOAT The theta parameter (error dependency coefficient)
- in the maq consensus calling model [0.85]
+ -q INT Minimum mapping quality for an alignment to be used
+ [0]
- -N INT Number of haplotypes in the sample (>=2) [2]
-
- -r FLOAT Expected fraction of differences between a pair of
- haplotypes [0.001]
-
- -I INT Phred probability of an indel in sequencing/prep.
- [40]
-
-
- mpileup samtools mpileup [-r reg] [-f in.fa] in.bam [in2.bam [...]]
-
- Generate pileup for multiple BAM files. Consensus calling is
- not implemented.
-
- OPTIONS:
+ -Q INT Minimum base quality for a base to be considered [13]
-r STR Only generate pileup in region STR [all sites]
- -f FILE The reference file [null]
+ -u Similar to -g except that the output is uncompressed
+ BCF, which is preferred for piping.
reheader samtools reheader <in.header.sam> <in.bam>
[500000000]
- merge samtools merge [-h inh.sam] [-nr] <out.bam> <in1.bam>
- <in2.bam> [...]
+ merge samtools merge [-nur] [-h inh.sam] [-R reg] <out.bam>
+ <in1.bam> <in2.bam> [...]
Merge multiple sorted alignments. The header reference lists
of all the input BAM files, and the @SQ headers of inh.sam,
SAM format, though any alignment records it may con-
tain are ignored.)
+ -R STR Merge files in the specified region indicated by STR
+
-r Attach an RG tag to each alignment. The tag value is
inferred from file names.
-n The input alignments are sorted by read names rather
than by chromosomal coordinates
+ -u Uncompressed BAM output
+
index samtools index <aln.bam>
-S Treat paired-end reads and single-end reads.
- calmd samtools calmd [-eubS] <aln.bam> <ref.fasta>
+ calmd samtools calmd [-eubSr] [-C capQcoef] <aln.bam> <ref.fasta>
Generate the MD tag. If the MD tag is already present, this
command will give a warning if the MD tag generated is dif-
OPTIONS:
- -e Convert a the read base to = if it is identical to
- the aligned reference base. Indel caller does not
+ -A When used jointly with -r this option overwrites the
+ original base quality.
+
+ -e Convert a the read base to = if it is identical to
+ the aligned reference base. Indel caller does not
support the = bases at the moment.
-u Output uncompressed BAM
-S The input is SAM with header lines
+ -C INT Coefficient to cap mapping quality of poorly mapped
+ reads. See the pileup command for details. [0]
+
+ -r Compute the BQ tag without changing the base quality.
+
+
+ pileup samtools pileup [-2sSBicv] [-f in.ref.fasta] [-t in.ref_list]
+ [-l in.site_list] [-C capMapQ] [-M maxMapQ] [-T theta] [-N
+ nHap] [-r pairDiffRate] [-m mask] [-d maxIndelDepth] [-G
+ indelPrior] <in.bam>|<in.sam>
+
+ Print the alignment in the pileup format. In the pileup for-
+ mat, each line represents a genomic position, consisting of
+ chromosome name, coordinate, reference base, read bases, read
+ qualities and alignment mapping qualities. Information on
+ match, mismatch, indel, strand, mapping quality and start and
+ end of a read are all encoded at the read base column. At
+ this column, a dot stands for a match to the reference base
+ on the forward strand, a comma for a match on the reverse
+ strand, a '>' or '<' for a reference skip, `ACGTN' for a mis-
+ match on the forward strand and `acgtn' for a mismatch on the
+ reverse strand. A pattern `\+[0-9]+[ACGTNacgtn]+' indicates
+ there is an insertion between this reference position and the
+ next reference position. The length of the insertion is given
+ by the integer in the pattern, followed by the inserted
+ sequence. Similarly, a pattern `-[0-9]+[ACGTNacgtn]+' repre-
+ sents a deletion from the reference. The deleted bases will
+ be presented as `*' in the following lines. Also at the read
+ base column, a symbol `^' marks the start of a read. The
+ ASCII of the character following `^' minus 33 gives the map-
+ ping quality. A symbol `$' marks the end of a read segment.
+
+ If option -c is applied, the consensus base, Phred-scaled
+ consensus quality, SNP quality (i.e. the Phred-scaled proba-
+ bility of the consensus being identical to the reference) and
+ root mean square (RMS) mapping quality of the reads covering
+ the site will be inserted between the `reference base' and
+ the `read bases' columns. An indel occupies an additional
+ line. Each indel line consists of chromosome name, coordi-
+ nate, a star, the genotype, consensus quality, SNP quality,
+ RMS mapping quality, # covering reads, the first alllele, the
+ second allele, # reads supporting the first allele, # reads
+ supporting the second allele and # reads containing indels
+ different from the top two alleles.
+
+ NOTE: Since 0.1.10, the `pileup' command is deprecated by
+ `mpileup'.
+
+ OPTIONS:
+
+ -B Disable the BAQ computation. See the mpileup com-
+ mand for details.
+
+ -c Call the consensus sequence. Options -T, -N, -I and
+ -r are only effective when -c or -g is in use.
+
+ -C INT Coefficient for downgrading the mapping quality of
+ poorly mapped reads. See the mpileup command for
+ details. [0]
+
+ -d INT Use the first NUM reads in the pileup for indel
+ calling for speed up. Zero for unlimited. [1024]
+
+ -f FILE The reference sequence in the FASTA format. Index
+ file FILE.fai will be created if absent.
+
+ -g Generate genotype likelihood in the binary GLFv3
+ format. This option suppresses -c, -i and -s. This
+ option is deprecated by the mpileup command.
+
+ -i Only output pileup lines containing indels.
+
+ -I INT Phred probability of an indel in sequencing/prep.
+ [40]
+
+ -l FILE List of sites at which pileup is output. This file
+ is space delimited. The first two columns are
+ required to be chromosome and 1-based coordinate.
+ Additional columns are ignored. It is recommended
+ to use option
+
+ -m INT Filter reads with flag containing bits in INT
+ [1796]
+
+ -M INT Cap mapping quality at INT [60]
+
+ -N INT Number of haplotypes in the sample (>=2) [2]
+
+ -r FLOAT Expected fraction of differences between a pair of
+ haplotypes [0.001]
+
+ -s Print the mapping quality as the last column. This
+ option makes the output easier to parse, although
+ this format is not space efficient.
+
+ -S The input file is in SAM.
+
+ -t FILE List of reference names ane sequence lengths, in
+ the format described for the import command. If
+ this option is present, samtools assumes the input
+ <in.alignment> is in SAM format; otherwise it
+ assumes in BAM format. -s together with -l as in
+ the default format we may not know the mapping
+ quality.
+
+ -T FLOAT The theta parameter (error dependency coefficient)
+ in the maq consensus calling model [0.85]
+
SAM FORMAT
- SAM is TAB-delimited. Apart from the header lines, which are started
+ SAM is TAB-delimited. Apart from the header lines, which are started
with the `@' symbol, each alignment line consists of:
|0x0400 | d | the read is either a PCR or an optical duplicate |
+-------+-----+--------------------------------------------------+
+EXAMPLES
+ o Import SAM to BAM when @SQ lines are present in the header:
+
+ samtools view -bS aln.sam > aln.bam
+
+ If @SQ lines are absent:
+
+ samtools faidx ref.fa
+ samtools view -bt ref.fa.fai aln.sam > aln.bam
+
+ where ref.fa.fai is generated automatically by the faidx command.
+
+
+ o Attach the RG tag while merging sorted alignments:
+
+ perl -e 'print "@RG\tID:ga\tSM:hs\tLB:ga\tPL:Illu-
+ mina\n@RG\tID:454\tSM:hs\tLB:454\tPL:454\n"' > rg.txt
+ samtools merge -rh rg.txt merged.bam ga.bam 454.bam
+
+ The value in a RG tag is determined by the file name the read is com-
+ ing from. In this example, in the merged.bam, reads from ga.bam will
+ be attached RG:Z:ga, while reads from 454.bam will be attached
+ RG:Z:454.
+
+
+ o Call SNPs and short indels for one diploid individual:
+
+ samtools mpileup -ugf ref.fa aln.bam | bcftools view -bvcg - >
+ var.raw.bcf
+ bcftools view var.raw.bcf | vcfutils.pl varFilter -D 100 >
+ var.flt.vcf
+
+ The -D option of varFilter controls the maximum read depth, which
+ should be adjusted to about twice the average read depth. One may
+ consider to add -C50 to mpileup if mapping quality is overestimated
+ for reads containing excessive mismatches. Applying this option usu-
+ ally helps BWA-short but may not other mappers.
+
+
+ o Call SNPs and short indels for multiple diploid individuals:
+
+ samtools mpileup -P ILLUMINA -ugf ref.fa *.bam | bcftools view
+ -bcvg - > var.raw.bcf
+ bcftools view var.raw.bcf | vcfutils.pl varFilter -D 2000 >
+ var.flt.vcf
+
+ Individuals are identified from the SM tags in the @RG header lines.
+ Individuals can be pooled in one alignment file; one individual can
+ also be separated into multiple files. The -P option specifies that
+ indel candidates should be collected only from read groups with the
+ @RG-PL tag set to ILLUMINA. Collecting indel candidates from reads
+ sequenced by an indel-prone technology may affect the performance of
+ indel calling.
+
+
+ o Derive the allele frequency spectrum (AFS) on a list of sites from
+ multiple individuals:
+
+ samtools mpileup -Igf ref.fa *.bam > all.bcf
+ bcftools view -bl sites.list all.bcf > sites.bcf
+ bcftools view -cGP cond2 sites.bcf > /dev/null 2> sites.1.afs
+ bcftools view -cGP sites.1.afs sites.bcf > /dev/null 2> sites.2.afs
+ bcftools view -cGP sites.2.afs sites.bcf > /dev/null 2> sites.3.afs
+ ......
+
+ where sites.list contains the list of sites with each line consisting
+ of the reference sequence name and position. The following bcftools
+ commands estimate AFS by EM.
+
+
+ o Dump BAQ applied alignment for other SNP callers:
+
+ samtools calmd -bAr aln.bam > aln.baq.bam
+
+ It adds and corrects the NM and MD tags at the same time. The calmd
+ command also comes with the -C option, the same as the one in pileup
+ and mpileup. Apply if it helps.
+
+
LIMITATIONS
o Unaligned words used in bam_import.c, bam_endian.h, bam.c and
bam_aux.c.
AUTHOR
Heng Li from the Sanger Institute wrote the C version of samtools. Bob
Handsaker from the Broad Institute implemented the BGZF library and Jue
- Ruan from Beijing Genomics Institute wrote the RAZF library. Various
- people in the 1000 Genomes Project contributed to the SAM format speci-
+ Ruan from Beijing Genomics Institute wrote the RAZF library. John Mar-
+ shall and Petr Danecek contribute to the source code and various people
+ from the 1000 Genomes Project have contributed to the SAM format speci-
fication.
-samtools-0.1.8 11 July 2010 samtools(1)
+samtools-0.1.12 2 December 2010 samtools(1)