]> git.donarmstrong.com Git - samtools.git/blob - examples/chk_indel.c
Revert one of my earlier changes - Heng was right, CIGAR P not sensible in a padded...
[samtools.git] / examples / chk_indel.c
1 /* To compile, copy this file to the samtools source code directory and compile with:
2      gcc -g -O2 -Wall chk_indel_rg.c -o chk_indel_rg -Wall -I. -L. -lbam -lz
3 */
4
5 #include <string.h>
6 #include "bam.h"
7
8 typedef struct {
9         long cnt[4]; // short:ins, short:del, long:ins, long:del
10 } rgcnt_t;
11
12 #include "khash.h"
13 KHASH_MAP_INIT_STR(rgcnt, rgcnt_t)
14
15 #define MAX_LEN 127
16 #define Q_THRES 10
17 #define L_THRES 6 // short: <=L_THRES; otherwise long
18
19 int main(int argc, char *argv[])
20 {
21         bamFile fp;
22         bam1_t *b;
23         int i, x;
24         khash_t(rgcnt) *h;
25         khint_t k;
26
27         if (argc == 1) {
28                 fprintf(stderr, "Usage: chk_indel_rg <in.bam>\n\n");
29                 fprintf(stderr, "Output: filename, RG, #ins-in-short-homopolymer, #del-in-short, #ins-in-long, #del-in-long\n");
30                 return 1;
31         }
32
33         h = kh_init(rgcnt);
34         fp = bam_open(argv[1], "r");
35         bam_header_destroy(bam_header_read(fp)); // we do not need the header
36         b = bam_init1();
37
38         while (bam_read1(fp, b) >= 0) {
39                 if (b->core.n_cigar >= 3 && b->core.qual >= Q_THRES) {
40                         const uint8_t *seq;
41                         const uint32_t *cigar = bam1_cigar(b);
42                         char *rg;
43                         for (i = 0; i < b->core.n_cigar; ++i) // check if there are 1bp indels
44                                 if (bam_cigar_oplen(cigar[i]) == 1 && (bam_cigar_op(cigar[i]) == BAM_CDEL || bam_cigar_op(cigar[i]) == BAM_CINS))
45                                         break;
46                         if (i == b->core.n_cigar) continue; // no 1bp ins or del
47                         if ((rg = (char*)bam_aux_get(b, "RG")) == 0) continue; // no RG tag
48                         seq = bam1_seq(b);
49                         for (i = x = 0; i < b->core.n_cigar; ++i) {
50                                 int op = bam_cigar_op(cigar[i]);
51                                 if (bam_cigar_oplen(cigar[i]) == 1 && (op == BAM_CDEL || op == BAM_CINS)) {
52                                         int c, j, hrun, which;
53                                         c = bam1_seqi(seq, x);
54                                         for (j = x + 1, hrun = 0; j < b->core.l_qseq; ++j, ++hrun) // calculate the hompolymer run length
55                                                 if (bam1_seqi(seq, j) != c) break;
56                                         k = kh_get(rgcnt, h, rg + 1);
57                                         if (k == kh_end(h)) { // absent
58                                                 char *key = strdup(rg + 1);
59                                                 k = kh_put(rgcnt, h, key, &c);
60                                                 memset(&kh_val(h, k), 0, sizeof(rgcnt_t));
61                                         }
62                                         which = (hrun <= L_THRES? 0 : 1)<<1 | (op == BAM_CINS? 0 : 1);
63                                         ++kh_val(h, k).cnt[which];
64                                 }
65                                 if (bam_cigar_type(op)&1) ++x;
66                         }
67                 }
68         }
69
70         for (k = 0; k != kh_end(h); ++k) {
71                 if (!kh_exist(h, k)) continue;
72                 printf("%s\t%s", argv[1], kh_key(h, k));
73                 for (i = 0; i < 4; ++i)
74                         printf("\t%ld", kh_val(h, k).cnt[i]);
75                 putchar('\n');
76                 free((char*)kh_key(h, k));
77         }
78
79         bam_destroy1(b);
80         bam_close(fp);
81         kh_destroy(rgcnt, h);
82         return 0;
83 }