bamcheck: max quality value increased to 255 to handle missing qual field; pairs...

[samtools.git] / misc / bamcheck.c
diff --git a/misc/bamcheck.c b/misc/bamcheck.c

index a438adac3453e73a86f65017bb72ca54ba3ac4c4..d9d54c5856e5ba6832486c81bf460dfdf19574d2 100644 (file)
--- a/misc/bamcheck.c
+++ b/misc/bamcheck.c
@@ -395,7 +395,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
              {
                  uint8_t qual = quals[iread] + 1;
                  if ( qual>=stats->nquals )
-                    error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
+                    error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
  
                  int idx = is_fwd ? icycle : read_len-icycle-1;
                  if ( idx>stats->max_len )
@@ -684,7 +684,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
      {
          uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
          if ( qual>=stats->nquals )
-            error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
+            error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
          if ( qual>stats->max_qual )
              stats->max_qual = qual;
  
@@ -708,7 +708,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
          // reads. Therefore, do the insert size stats for all mapped reads.
          int32_t isize = bam_line->core.isize;
          if ( isize<0 ) isize = -isize;
-        if ( IS_PAIRED(bam_line) && isize!=0 )
+        if ( IS_PAIRED(bam_line) )
          {
              stats->nreads_paired++;
              if ( isize >= stats->nisize )
@@ -889,7 +889,7 @@ void output_stats(stats_t *stats)
      // Calculate average insert size and standard deviation (from the main bulk data only)
      int isize, ibulk=0;
      uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
-    for (isize=1; isize<stats->nisize; isize++)
+    for (isize=0; isize<stats->nisize; isize++)
      {
          // Each pair was counted twice
          stats->isize_inward[isize] *= 0.5;
@@ -903,7 +903,7 @@ void output_stats(stats_t *stats)
      }
  
      double bulk=0, avg_isize=0, sd_isize=0;
-    for (isize=1; isize<stats->nisize; isize++)
+    for (isize=0; isize<stats->nisize; isize++)
      {
          bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
          avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
@@ -1021,7 +1021,7 @@ void output_stats(stats_t *stats)
          printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
      }
      printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
-    for (isize=1; isize<ibulk; isize++)
+    for (isize=0; isize<ibulk; isize++)
          printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
              (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
  
@@ -1050,12 +1050,14 @@ void output_stats(stats_t *stats)
      }
  
      printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
-    printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
+    if  ( stats->cov[0] )
+        printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
      int icov;
      for (icov=1; icov<stats->ncov-1; icov++)
-        printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
-    printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
-
+        if ( stats->cov[icov] )
+            printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
+    if ( stats->cov[stats->ncov-1] )
+        printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
  
      // Calculate average GC content, then sort by GC and depth
      printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
@@ -1311,7 +1313,7 @@ int main(int argc, char *argv[])
  
      stats_t *stats = calloc(1,sizeof(stats_t));
      stats->ngc    = 200;
-    stats->nquals = 95;
+    stats->nquals = 256;
      stats->nbases = 300;
      stats->nisize = 8000;
      stats->max_len   = 30;
@@ -1320,7 +1322,8 @@ int main(int argc, char *argv[])
      stats->gcd_bin_size = 20e3;
      stats->gcd_ref_size = 4.2e9;
      stats->rseq_pos     = -1;
-    stats->tid = stats->gcd_pos = stats->igcd = -1;
+    stats->tid = stats->gcd_pos = -1;
+    stats->igcd = 0;
      stats->is_sorted = 1;
      stats->cov_min  = 1;
      stats->cov_max  = 1000;