]> git.donarmstrong.com Git - samtools.git/blobdiff - misc/bamcheck.c
bamcheck: new stats, number of filtered vs raw sequences
[samtools.git] / misc / bamcheck.c
index 2a3642dc174f8da1b63804920f3430420337d749..352db21b12dbaf0edaa1a5d90f2e872aa258874f 100644 (file)
@@ -116,6 +116,7 @@ typedef struct
     uint64_t total_len_dup;
     uint64_t nreads_1st;
     uint64_t nreads_2nd;
+    uint64_t nreads_filtered;
     uint64_t nreads_dup;
     uint64_t nreads_unmapped;
     uint64_t nreads_unpaired;
@@ -126,6 +127,7 @@ typedef struct
     uint64_t nbases_mapped_cigar;
     uint64_t nbases_trimmed;  // bwa trimmed bases
     uint64_t nmismatches;
+    uint64_t nreads_QCfailed, nreads_secondary;
 
     // GC-depth related data
     uint32_t ngcd, igcd;        // The maximum number of GC depth bins and index of the current bin
@@ -618,16 +620,26 @@ void collect_stats(bam1_t *bam_line, stats_t *stats)
         if ( k == kh_end(stats->rg_hash) ) return;
     }
     if ( stats->flag_require && (bam_line->core.flag & stats->flag_require)!=stats->flag_require )
+    {
+        stats->nreads_filtered++;
         return;
+    }
     if ( stats->flag_filter && (bam_line->core.flag & stats->flag_filter) )
+    {
+        stats->nreads_filtered++;
         return;
-
+    }
     if ( !is_in_regions(bam_line,stats) )
         return;
+    if ( stats->filter_readlen!=-1 && bam_line->core.l_qseq!=stats->filter_readlen ) 
+        return;
+
+    if ( bam_line->core.flag & BAM_FQCFAIL ) stats->nreads_QCfailed++;
+    if ( bam_line->core.flag & BAM_FSECONDARY ) stats->nreads_secondary++;
 
     int seq_len = bam_line->core.l_qseq;
     if ( !seq_len ) return;
-    if ( stats->filter_readlen!=-1 && seq_len!=stats->filter_readlen ) return;
+
     if ( seq_len >= stats->nbases )
         realloc_buffers(stats,seq_len);
     if ( stats->max_len<seq_len )
@@ -935,6 +947,8 @@ void output_stats(stats_t *stats)
         printf(" %s",stats->argv[i]);
     printf("\n");
     printf("# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n");
+    printf("SN\traw total sequences:\t%ld\n", (long)(stats->nreads_filtered+stats->nreads_1st+stats->nreads_2nd));
+    printf("SN\tfiltered sequences:\t%ld\n", (long)stats->nreads_filtered);
     printf("SN\tsequences:\t%ld\n", (long)(stats->nreads_1st+stats->nreads_2nd));
     printf("SN\tis paired:\t%d\n", stats->nreads_1st&&stats->nreads_2nd ? 1 : 0);
     printf("SN\tis sorted:\t%d\n", stats->is_sorted ? 1 : 0);
@@ -946,6 +960,8 @@ void output_stats(stats_t *stats)
     printf("SN\treads paired:\t%ld\n", (long)stats->nreads_paired);
     printf("SN\treads duplicated:\t%ld\n", (long)stats->nreads_dup);
     printf("SN\treads MQ0:\t%ld\n", (long)stats->nreads_mq0);
+    printf("SN\treads QC failed:\t%ld\n", (long)stats->nreads_QCfailed);
+    printf("SN\tnon-primary alignments:\t%ld\n", (long)stats->nreads_secondary);
     printf("SN\ttotal length:\t%ld\n", (long)stats->total_len);
     printf("SN\tbases mapped:\t%ld\n", (long)stats->nbases_mapped);
     printf("SN\tbases mapped (cigar):\t%ld\n", (long)stats->nbases_mapped_cigar);