]> git.donarmstrong.com Git - rsem.git/blob - scanForPairedEndReads.cpp
Added .gitignore to ignore built files
[rsem.git] / scanForPairedEndReads.cpp
1 #include<cstdio>
2 #include<cstring>
3 #include<cstdlib>
4 #include<cassert>
5 #include<string>
6 #include<vector>
7 #include<algorithm>
8
9 #include <stdint.h>
10 #include "sam/bam.h"
11 #include "sam/sam.h"
12
13 #include "utils.h"
14 #include "my_assert.h"
15
16 using namespace std;
17
18 samfile_t *in, *out;
19 bam1_t *b, *b2;
20 vector<bam1_t*> arr_both, arr_partial_1, arr_partial_2, arr_partial_unknown;
21
22 inline void add_to_appropriate_arr(bam1_t *b) {
23         if (!(b->core.flag & 0x0004) && (b->core.flag & 0x0002)) {
24                 arr_both.push_back(bam_dup1(b)); return;
25         }
26
27         if (b->core.flag & 0x0040) arr_partial_1.push_back(bam_dup1(b));
28         else if (b->core.flag & 0x0080) arr_partial_2.push_back(bam_dup1(b));
29         else arr_partial_unknown.push_back(bam_dup1(b));
30 }
31
32 char get_pattern_code(uint32_t flag) {
33   if (flag & 0x0040) return (flag & 0x0010 ? 1 : 0);
34   else return (flag & 0x0010 ? 0 : 1);
35 }
36
37 bool less_than(bam1_t *a, bam1_t *b) {
38         int32_t ap1 = min(a->core.pos, a->core.mpos);
39         int32_t ap2 = max(a->core.pos, a->core.mpos);
40         int32_t bp1 = min(b->core.pos, b->core.mpos);
41         int32_t bp2 = max(b->core.pos, b->core.mpos);
42         char apat = get_pattern_code(a->core.flag); // apt: a's pattern of strand and mate information
43         char bpat = get_pattern_code(b->core.flag);
44
45         if (a->core.tid != b->core.tid) return a->core.tid < b->core.tid;
46         if (ap1 != bp1) return ap1 < bp1;
47         if (ap2 != bp2) return ap2 < bp2;
48         return apat < bpat;
49 }
50
51 int main(int argc, char* argv[]) {
52         if (argc != 3) {
53                 printf("Usage: rsem-scan-for-paired-end-reads input.sam output.bam\n");
54                 exit(-1);
55         }
56
57         in = samopen(argv[1], "r", NULL);
58         general_assert(in != 0, "Cannot open " + cstrtos(argv[1]) + " !");
59         out = samopen(argv[2], "wb", in->header);
60         general_assert(out != 0, "Cannot open " + cstrtos(argv[2]) + " !");
61
62         b = bam_init1(); b2 = bam_init1();
63
64         string qname;
65         bool go_on = (samread(in, b) >= 0);
66         bool isPaired;
67         HIT_INT_TYPE cnt = 0;
68
69         printf("."); fflush(stdout);
70
71         while (go_on) {
72                 qname.assign(bam1_qname(b));
73                 isPaired = (b->core.flag & 0x0001);
74
75                 if (isPaired) {
76                         add_to_appropriate_arr(b);
77                         while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
78                                 general_assert_1(b->core.flag & 0x0001, "Read " + qname + " is detected as both single-end and paired-end read!");
79                                 add_to_appropriate_arr(b);
80                         }
81
82                         general_assert_1(arr_both.size() % 2 == 0, "Number of first and second mates in read " + qname + "'s full alignments (both mates are aligned) are not matched!");
83                         general_assert_1((arr_partial_1.size() + arr_partial_2.size() + arr_partial_unknown.size()) % 2 == 0, "Number of first and second mates in read " + qname + "'s partial alignments (at most one mate is aligned) are not matched!");
84
85                         if (!arr_both.empty()) {
86                                 sort(arr_both.begin(), arr_both.end(), less_than);
87                                 for (size_t i = 0; i < arr_both.size(); i++) { samwrite(out, arr_both[i]); bam_destroy1(arr_both[i]); }
88                                 arr_both.clear();
89                         }
90
91                         while (!arr_partial_1.empty() || !arr_partial_2.empty()) {
92                                 if (!arr_partial_1.empty() && !arr_partial_2.empty()) {
93                                         samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
94                                         samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
95                                 }
96                                 else if (!arr_partial_1.empty()) {
97                                         samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
98                                         samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
99                                 }
100                                 else {
101                                         samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
102                                         samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
103                                 }
104                         }
105
106                         while (!arr_partial_unknown.empty()) {
107                                 samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
108                         }
109                 }
110                 else {
111                         samwrite(out, b);
112                         while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
113                                 samwrite(out, b);
114                         }
115                 }
116
117                 ++cnt;
118                 if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
119         }
120
121         printf("\nFinished!\n");
122
123         bam_destroy1(b); bam_destroy1(b2);
124
125         samclose(in);
126         samclose(out);
127
128         return 0;
129 }