bamcheck: New stats, number of pairs mapped to different chromosomes

[samtools.git] / misc / bamcheck.c
diff --git a/misc/bamcheck.c b/misc/bamcheck.c

index 3db9cfb55f5d3ed7ba10138384048c94b6dabbc0..2a3642dc174f8da1b63804920f3430420337d749 100644 (file)
--- a/misc/bamcheck.c
+++ b/misc/bamcheck.c
@@ -26,6 +26,7 @@
  #include <ctype.h>
  #include <getopt.h>
  #include <errno.h>
+#include <assert.h>
  #include "faidx.h"
  #include "khash.h"
  #include "sam.h"
@@ -119,6 +120,7 @@ typedef struct
      uint64_t nreads_unmapped;
      uint64_t nreads_unpaired;
      uint64_t nreads_paired;
+    uint64_t nreads_anomalous;
      uint64_t nreads_mq0;
      uint64_t nbases_mapped;
      uint64_t nbases_mapped_cigar;
@@ -129,6 +131,7 @@ typedef struct
      uint32_t ngcd, igcd;        // The maximum number of GC depth bins and index of the current bin
      gc_depth_t *gcd;            // The GC-depth bins holder
      int gcd_bin_size;           // The size of GC-depth bin
+    uint32_t gcd_ref_size;      // The approximate size of the genome
      int32_t tid, gcd_pos;       // Position of the current bin
      int32_t pos;                // Position of the last read
  
@@ -139,11 +142,11 @@ typedef struct
      round_buffer_t cov_rbuf;        // Pileup round buffer
  
      // Mismatches by read cycle 
-    uint8_t *rseq_buf;             // A buffer for reference sequence to check the mismatches against
-    int nref_seq;               // The size of the buffer
-    int32_t rseq_pos;           // The coordinate of the first base in the buffer
-    int32_t rseq_len;           // The used part of the buffer
-    uint64_t *mpc_buf;          // Mismatches per cycle
+    uint8_t *rseq_buf;              // A buffer for reference sequence to check the mismatches against
+    int mrseq_buf;                  // The size of the buffer
+    int32_t rseq_pos;               // The coordinate of the first base in the buffer
+    int32_t nrseq_buf;              // The used part of the buffer
+    uint64_t *mpc_buf;              // Mismatches per cycle
  
      // Filters
      int filter_readlen;
@@ -291,7 +294,7 @@ void count_indels(stats_t *stats,bam1_t *bam_line)
  
          if ( cig==1 )
          {
-            int idx = is_fwd ? icycle : read_len-icycle;
+            int idx = is_fwd ? icycle : read_len-icycle-ncig;
              if ( idx<0 ) 
                  error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
              if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
@@ -362,11 +365,14 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
              icycle += ncig;
              continue;
          }
+        // Ignore H and N CIGARs. The letter are inserted e.g. by TopHat and often require very large
+        //  chunk of refseq in memory. Not very frequent and not noticable in the stats.
+        if ( cig==3 || cig==5 ) continue;
          if ( cig!=0 )
-            error("TODO: cigar %d, %s\n", cig,bam1_qname(bam_line));
+            error("TODO: cigar %d, %s:%d %s\n", cig,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
         
-        if ( ncig+iref > stats->rseq_len )
-            error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->rseq_len, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
+        if ( ncig+iref > stats->nrseq_buf )
+            error("FIXME: %d+%d > %d, %s, %s:%d\n",ncig,iref,stats->nrseq_buf, bam1_qname(bam_line),stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1);
  
          int im;
          for (im=0; im<ncig; im++)
@@ -390,7 +396,7 @@ void count_mismatches_per_cycle(stats_t *stats,bam1_t *bam_line)
              {
                  uint8_t qual = quals[iread] + 1;
                  if ( qual>=stats->nquals )
-                    error("TODO: quality too high %d>=%d\n", quals[iread],stats->nquals);
+                    error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
  
                  int idx = is_fwd ? icycle : read_len-icycle-1;
                  if ( idx>stats->max_len )
@@ -430,7 +436,7 @@ void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
      if (pos >= val.len)
          error("Was the bam file mapped with the reference sequence supplied?"
                " A read mapped beyond the end of the chromosome (%s:%d, chromosome length %d).\n", chr,pos,val.len);
-    int size = stats->nref_seq;
+    int size = stats->mrseq_buf;
      // The buffer extends beyond the chromosome end. Later the rest will be filled with N's.
      if (size+pos > val.len) size = val.len-pos;
  
@@ -460,24 +466,25 @@ void read_ref_seq(stats_t *stats,int32_t tid,int32_t pos)
          ptr++;
          nread++;
      }
-    if ( nread < stats->nref_seq )
+    if ( nread < stats->mrseq_buf )
      {
-        memset(ptr,0, stats->nref_seq - nread);
-        nread = stats->nref_seq;
+        memset(ptr,0, stats->mrseq_buf - nread);
+        nread = stats->mrseq_buf;
      }
-    stats->rseq_len = nread;
-    stats->rseq_pos = pos;
-    stats->tid      = tid;
+    stats->nrseq_buf = nread;
+    stats->rseq_pos  = pos;
+    stats->tid       = tid;
  }
  
-float fai_gc_content(stats_t *stats)
+float fai_gc_content(stats_t *stats, int pos, int len)
  {
      uint32_t gc,count,c;
-    int i,size = stats->rseq_len;
+    int i = pos - stats->rseq_pos, ito = i + len;
+    assert( i>=0 && ito<=stats->nrseq_buf );
  
      // Count GC content
      gc = count = 0;
-    for (i=0; i<size; i++)
+    for (; i<ito; i++)
      {
          c = stats->rseq_buf[i];
          if ( c==2 || c==4 )
@@ -491,6 +498,37 @@ float fai_gc_content(stats_t *stats)
      return count ? (float)gc/count : 0;
  }
  
+void realloc_rseq_buffer(stats_t *stats)
+{
+    int n = stats->nbases*10;
+    if ( stats->gcd_bin_size > n ) n = stats->gcd_bin_size;
+    if ( stats->mrseq_buf<n )
+    {
+        stats->rseq_buf = realloc(stats->rseq_buf,sizeof(uint8_t)*n);
+        stats->mrseq_buf = n;
+    }
+}
+
+void realloc_gcd_buffer(stats_t *stats, int seq_len)
+{
+    if ( seq_len >= stats->gcd_bin_size )
+        error("The --GC-depth bin size (%d) is set too low for the read length %d\n", stats->gcd_bin_size, seq_len);
+
+    int n = 1 + stats->gcd_ref_size / (stats->gcd_bin_size - seq_len);
+    if ( n <= stats->igcd )
+        error("The --GC-depth bin size is too small or reference genome too big; please decrease the bin size or increase the reference length\n");
+        
+    if ( n > stats->ngcd )
+    {
+        stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
+        if ( !stats->gcd )
+            error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
+        memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
+        stats->ngcd = n;
+    }
+
+    realloc_rseq_buffer(stats);
+}
  
  void realloc_buffers(stats_t *stats, int seq_len)
  {
@@ -566,6 +604,8 @@ void realloc_buffers(stats_t *stats, int seq_len)
      free(stats->cov_rbuf.buffer);
      stats->cov_rbuf.buffer = rbuffer;
      stats->cov_rbuf.size = seq_len*5;
+
+    realloc_rseq_buffer(stats);
  }
  
  void collect_stats(bam1_t *bam_line, stats_t *stats)
@@ -645,7 +685,7 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
      {
          uint8_t qual = bam_quals[ reverse ? seq_len-i-1 : i];
          if ( qual>=stats->nquals )
-            error("TODO: quality too high %d>=%d\n", quals[i],stats->nquals);
+            error("TODO: quality too high %d>=%d (%s %d %s)\n", qual,stats->nquals,stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
          if ( qual>stats->max_qual )
              stats->max_qual = qual;
  
@@ -663,42 +703,48 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
  
          count_indels(stats,bam_line);
  
-        // The insert size is tricky, because for long inserts the libraries are
-        // prepared differently and the pairs point in other direction. BWA does
-        // not set the paired flag for them.  Similar thing is true also for 454
-        // reads. Therefore, do the insert size stats for all mapped reads.
-        int32_t isize = bam_line->core.isize;
-        if ( isize<0 ) isize = -isize;
-        if ( IS_PAIRED(bam_line) && isize!=0 )
+        if ( !IS_PAIRED(bam_line) )
+            stats->nreads_unpaired++;
+        else 
          {
              stats->nreads_paired++;
-            if ( isize >= stats->nisize )
-                isize=stats->nisize-1;
  
-            int pos_fst = bam_line->core.mpos - bam_line->core.pos;
-            int is_fst  = IS_READ1(bam_line) ? 1 : -1;
-            int is_fwd  = IS_REVERSE(bam_line) ? -1 : 1;
-            int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+            if ( bam_line->core.tid!=bam_line->core.mtid )
+                stats->nreads_anomalous++;
  
-            if ( is_fwd*is_mfwd>0 )
-                stats->isize_other[isize]++;
-            else if ( is_fst*pos_fst>0 )
-            {
-                if ( is_fst*is_fwd>0 )
-                    stats->isize_inward[isize]++;
-                else
-                    stats->isize_outward[isize]++;
-            }
-            else if ( is_fst*pos_fst<0 )
+            // The insert size is tricky, because for long inserts the libraries are
+            // prepared differently and the pairs point in other direction. BWA does
+            // not set the paired flag for them.  Similar thing is true also for 454
+            // reads. Mates mapped to different chromosomes have isize==0.
+            int32_t isize = bam_line->core.isize;
+            if ( isize<0 ) isize = -isize;
+            if ( isize >= stats->nisize )
+                isize = stats->nisize-1;
+            if ( isize>0 || bam_line->core.tid==bam_line->core.mtid )
              {
-                if ( is_fst*is_fwd>0 )
-                    stats->isize_outward[isize]++;
-                else
-                    stats->isize_inward[isize]++;
+                int pos_fst = bam_line->core.mpos - bam_line->core.pos;
+                int is_fst  = IS_READ1(bam_line) ? 1 : -1;
+                int is_fwd  = IS_REVERSE(bam_line) ? -1 : 1;
+                int is_mfwd = IS_MATE_REVERSE(bam_line) ? -1 : 1;
+
+                if ( is_fwd*is_mfwd>0 )
+                    stats->isize_other[isize]++;
+                else if ( is_fst*pos_fst>0 )
+                {
+                    if ( is_fst*is_fwd>0 )
+                        stats->isize_inward[isize]++;
+                    else
+                        stats->isize_outward[isize]++;
+                }
+                else if ( is_fst*pos_fst<0 )
+                {
+                    if ( is_fst*is_fwd>0 )
+                        stats->isize_outward[isize]++;
+                    else
+                        stats->isize_inward[isize]++;
+                }
              }
          }
-        else
-            stats->nreads_unpaired++;
  
          // Number of mismatches 
          uint8_t *nm = bam_aux_get(bam_line,"NM");
@@ -759,44 +805,45 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
              if ( stats->tid==-1 || stats->tid!=bam_line->core.tid )
                  round_buffer_flush(stats,-1);
  
-            // Mismatches per cycle and GC-depth graph
+            // Mismatches per cycle and GC-depth graph. For simplicity, reads overlapping GCD bins
+            //  are not splitted which results in up to seq_len-1 overlaps. The default bin size is
+            //  20kbp, so the effect is negligible.
              if ( stats->fai )
              {
-                // First pass, new chromosome or sequence spanning beyond the end of the buffer 
-                if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid || stats->rseq_pos+stats->rseq_len < bam_line->core.pos+readlen )
+                int inc_ref = 0, inc_gcd = 0;
+                // First pass or new chromosome
+                if ( stats->rseq_pos==-1 || stats->tid != bam_line->core.tid ) { inc_ref=1; inc_gcd=1; }
+                // Read goes beyond the end of the rseq buffer
+                else if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+readlen ) { inc_ref=1; inc_gcd=1; }
+                // Read overlaps the next gcd bin
+                else if ( stats->gcd_pos+stats->gcd_bin_size < bam_line->core.pos+readlen ) 
+                { 
+                    inc_gcd = 1;
+                    if ( stats->rseq_pos+stats->nrseq_buf < bam_line->core.pos+stats->gcd_bin_size ) inc_ref = 1;
+                }
+                if ( inc_gcd )
                  {
-                    read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
-
-                    // Get the reference GC content for this bin. Note that in the current implementation
-                    //  the GCD bins overlap by seq_len-1. Because the bin size is by default 20k and the 
-                    //  expected read length only ~100bp, it shouldn't really matter.
-                    stats->gcd_pos = bam_line->core.pos;
                      stats->igcd++;
-
                      if ( stats->igcd >= stats->ngcd )
-                        error("The genome too long or the GCD bin overlaps too big [%ud]\n", stats->igcd);
-
-                    stats->gcd[ stats->igcd ].gc = fai_gc_content(stats);
+                        realloc_gcd_buffer(stats, readlen);
+                    if ( inc_ref )
+                        read_ref_seq(stats,bam_line->core.tid,bam_line->core.pos);
+                    stats->gcd_pos = bam_line->core.pos;
+                    stats->gcd[ stats->igcd ].gc = fai_gc_content(stats, stats->gcd_pos, stats->gcd_bin_size);
                  }
+
                  count_mismatches_per_cycle(stats,bam_line);
              }
+            // No reference and first pass, new chromosome or sequence going beyond the end of the gcd bin
              else if ( stats->gcd_pos==-1 || stats->tid != bam_line->core.tid || bam_line->core.pos - stats->gcd_pos > stats->gcd_bin_size )
              {
                  // First pass or a new chromosome
                  stats->tid     = bam_line->core.tid;
                  stats->gcd_pos = bam_line->core.pos;
                  stats->igcd++;
-
                  if ( stats->igcd >= stats->ngcd )
-                {
-                    uint32_t n = 2*(1 + stats->ngcd);
-                    stats->gcd = realloc(stats->gcd, n*sizeof(gc_depth_t));
-                    if ( !stats->gcd )
-                        error("Could not realloc GCD buffer, too many chromosomes or the genome too long?? [%u %u]\n", stats->ngcd,n);
-                    memset(&(stats->gcd[stats->ngcd]),0,(n-stats->ngcd)*sizeof(gc_depth_t));
-                }
+                    realloc_gcd_buffer(stats, readlen);
              }
-
              stats->gcd[ stats->igcd ].depth++;
              // When no reference sequence is given, approximate the GC from the read (much shorter window, but otherwise OK)
              if ( !stats->fai )
@@ -849,7 +896,7 @@ void output_stats(stats_t *stats)
      // Calculate average insert size and standard deviation (from the main bulk data only)
      int isize, ibulk=0;
      uint64_t nisize=0, nisize_inward=0, nisize_outward=0, nisize_other=0;
-    for (isize=1; isize<stats->nisize; isize++)
+    for (isize=0; isize<stats->nisize; isize++)
      {
          // Each pair was counted twice
          stats->isize_inward[isize] *= 0.5;
@@ -863,7 +910,7 @@ void output_stats(stats_t *stats)
      }
  
      double bulk=0, avg_isize=0, sd_isize=0;
-    for (isize=1; isize<stats->nisize; isize++)
+    for (isize=0; isize<stats->nisize; isize++)
      {
          bulk += stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize];
          avg_isize += isize * (stats->isize_inward[isize] + stats->isize_outward[isize] + stats->isize_other[isize]);
@@ -915,6 +962,7 @@ void output_stats(stats_t *stats)
      printf("SN\tinward oriented pairs:\t%ld\n", (long)nisize_inward);
      printf("SN\toutward oriented pairs:\t%ld\n", (long)nisize_outward);
      printf("SN\tpairs with other orientation:\t%ld\n", (long)nisize_other);
+    printf("SN\tpairs on different chromosomes:\t%ld\n", (long)stats->nreads_anomalous/2);
  
      int ibase,iqual;
      if ( stats->max_len<stats->nbases ) stats->max_len++;
@@ -981,7 +1029,7 @@ void output_stats(stats_t *stats)
          printf("GCC\t%d\t%.2f\t%.2f\t%.2f\t%.2f\n", ibase,100.*ptr[0]/sum,100.*ptr[1]/sum,100.*ptr[2]/sum,100.*ptr[3]/sum);
      }
      printf("# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: pairs total, inward oriented pairs, outward oriented pairs, other pairs\n");
-    for (isize=1; isize<ibulk; isize++)
+    for (isize=0; isize<ibulk; isize++)
          printf("IS\t%d\t%ld\t%ld\t%ld\t%ld\n", isize, (long)(stats->isize_inward[isize]+stats->isize_outward[isize]+stats->isize_other[isize]),
              (long)stats->isize_inward[isize], (long)stats->isize_outward[isize], (long)stats->isize_other[isize]);
  
@@ -1003,17 +1051,21 @@ void output_stats(stats_t *stats)
      printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
      for (ilen=0; ilen<=stats->nbases; ilen++)
      {
+        // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
+        //  the index of the cycle of the first inserted base (also 1-based)
          if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
              printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
      }
  
      printf("# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n");
-    printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
+    if  ( stats->cov[0] )
+        printf("COV\t[<%d]\t%d\t%ld\n",stats->cov_min,stats->cov_min-1, (long)stats->cov[0]);
      int icov;
      for (icov=1; icov<stats->ncov-1; icov++)
-        printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
-    printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
-
+        if ( stats->cov[icov] )
+            printf("COV\t[%d-%d]\t%d\t%ld\n",stats->cov_min + (icov-1)*stats->cov_step, stats->cov_min + icov*stats->cov_step-1,stats->cov_min + icov*stats->cov_step-1, (long)stats->cov[icov]);
+    if ( stats->cov[stats->ncov-1] )
+        printf("COV\t[%d<]\t%d\t%ld\n",stats->cov_min + (stats->ncov-2)*stats->cov_step-1,stats->cov_min + (stats->ncov-2)*stats->cov_step-1, (long)stats->cov[stats->ncov-1]);
  
      // Calculate average GC content, then sort by GC and depth
      printf("# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n");
@@ -1237,7 +1289,7 @@ void error(const char *format, ...)
          printf("    -d, --remove-dups                   Exlude from statistics reads marked as duplicates\n");
          printf("    -f, --required-flag <int>           Required flag, 0 for unset [0]\n");
          printf("    -F, --filtering-flag <int>          Filtering flag, 0 for unset [0]\n");
-        printf("        --GC-depth <float,float>        Bin size for GC-depth graph and the maximum reference length [2e4,6e9]\n");
+        printf("        --GC-depth <float,float>        Bin size for GC-depth graph and the maximum reference length [2e4,4.2e9]\n");
          printf("    -h, --help                          This help message\n");
          printf("    -i, --insert-size <int>             Maximum insert size [8000]\n");
          printf("    -I, --id <string>                   Include only listed read group or sample name\n");
@@ -1269,16 +1321,17 @@ int main(int argc, char *argv[])
  
      stats_t *stats = calloc(1,sizeof(stats_t));
      stats->ngc    = 200;
-    stats->nquals = 95;
+    stats->nquals = 256;
      stats->nbases = 300;
      stats->nisize = 8000;
      stats->max_len   = 30;
      stats->max_qual  = 40;
      stats->isize_main_bulk = 0.99;   // There are always outliers at the far end
-    stats->gcd_bin_size = 20000;
-    stats->ngcd         = 3e5;     // 300k of 20k bins is enough to hold a genome 6Gbp big
+    stats->gcd_bin_size = 20e3;
+    stats->gcd_ref_size = 4.2e9;
      stats->rseq_pos     = -1;
      stats->tid = stats->gcd_pos = -1;
+    stats->igcd = 0;
      stats->is_sorted = 1;
      stats->cov_min  = 1;
      stats->cov_max  = 1000;
@@ -1326,7 +1379,7 @@ int main(int argc, char *argv[])
                          if ( sscanf(optarg,"%f,%f",&fbin,&flen)!= 2 ) 
                              error("Unable to parse --GC-depth %s\n", optarg); 
                          stats->gcd_bin_size = fbin;
-                        stats->ngcd = flen/fbin;
+                        stats->gcd_ref_size = flen;
                        }
                        break;
              case 'c': if ( sscanf(optarg,"%d,%d,%d",&stats->cov_min,&stats->cov_max,&stats->cov_step)!= 3 ) 
@@ -1381,8 +1434,6 @@ int main(int argc, char *argv[])
      stats->isize_outward  = calloc(stats->nisize,sizeof(uint64_t));
      stats->isize_other    = calloc(stats->nisize,sizeof(uint64_t));
      stats->gcd            = calloc(stats->ngcd,sizeof(gc_depth_t));
-    stats->nref_seq       = stats->gcd_bin_size;
-    stats->rseq_buf       = calloc(stats->nref_seq,sizeof(uint8_t));
      stats->mpc_buf        = stats->fai ? calloc(stats->nquals*stats->nbases,sizeof(uint64_t)) : NULL;
      stats->acgt_cycles    = calloc(4*stats->nbases,sizeof(uint64_t));
      stats->read_lengths   = calloc(stats->nbases,sizeof(uint64_t));
@@ -1392,6 +1443,7 @@ int main(int argc, char *argv[])
      stats->ins_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
      stats->del_cycles_1st = calloc(stats->nbases+1,sizeof(uint64_t));
      stats->del_cycles_2nd = calloc(stats->nbases+1,sizeof(uint64_t));
+    realloc_rseq_buffer(stats);
      if ( targets )
          init_regions(stats, targets);
  
@@ -1446,7 +1498,7 @@ int main(int argc, char *argv[])
      free(stats);
      if ( stats->rg_hash ) kh_destroy(kh_rg, stats->rg_hash);
  
-       return 0;
+    return 0;
  }