]> git.donarmstrong.com Git - rsem.git/blob - scanForPairedEndReads.cpp
751fa558833aabce8bac0ae4d02013aae0ac1685
[rsem.git] / scanForPairedEndReads.cpp
1 #include<cstdio>
2 #include<cstring>
3 #include<cstdlib>
4 #include<cassert>
5 #include<string>
6 #include<vector>
7 #include<algorithm>
8
9 #include <stdint.h>
10 #include "sam/bam.h"
11 #include "sam/sam.h"
12
13 #include "utils.h"
14 #include "my_assert.h"
15
16 using namespace std;
17
18 samfile_t *in, *out;
19 bam1_t *b, *b2;
20 vector<bam1_t*> arr_both, arr_partial_1, arr_partial_2, arr_partial_unknown;
21
22 inline void add_to_appropriate_arr(bam1_t *b) {
23         if (!(b->core.flag & 0x0004) && (b->core.flag & 0x0002)) {
24                 arr_both.push_back(bam_dup1(b)); return;
25         }
26
27         if (b->core.flag & 0x0040) arr_partial_1.push_back(bam_dup1(b));
28         else if (b->core.flag & 0x0080) arr_partial_2.push_back(bam_dup1(b));
29         else arr_partial_unknown.push_back(bam_dup1(b));
30 }
31
32 bool less_than(bam1_t *a, bam1_t *b) {
33         int32_t ap1 = min(a->core.pos, a->core.mpos);
34         int32_t ap2 = max(a->core.pos, a->core.mpos);
35         int32_t bp1 = min(b->core.pos, b->core.mpos);
36         int32_t bp2 = max(b->core.pos, b->core.mpos);
37
38         if (a->core.tid != b->core.tid) return a->core.tid < b->core.tid;
39         if (ap1 != bp1) return ap1 < bp1;
40         return ap2 < bp2;
41 }
42
43 int main(int argc, char* argv[]) {
44         if (argc != 3) {
45                 printf("Usage: rsem-scan-for-paired-end-reads input.sam output.bam\n");
46                 exit(-1);
47         }
48
49         in = samopen(argv[1], "r", NULL);
50         general_assert(in != 0, "Cannot open " + cstrtos(argv[1]) + " !");
51         out = samopen(argv[2], "wb", in->header);
52         general_assert(out != 0, "Cannot open " + cstrtos(argv[2]) + " !");
53
54         b = bam_init1(); b2 = bam_init1();
55
56         string qname;
57         bool go_on = (samread(in, b) >= 0);
58         bool isPaired;
59         HIT_INT_TYPE cnt = 0;
60
61         printf("."); fflush(stdout);
62
63         while (go_on) {
64                 qname.assign(bam1_qname(b));
65                 isPaired = (b->core.flag & 0x0001);
66
67                 if (isPaired) {
68                         add_to_appropriate_arr(b);
69                         while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
70                                 general_assert(b->core.flag & 0x0001, "Read " + qname + " is detected as both single-end and paired-end read!", true);
71                                 add_to_appropriate_arr(b);
72                         }
73
74                         general_assert(arr_both.size() % 2 == 0, "Number of first and second mates in read " + qname + "'s full alignments (both mates are aligned) are not matched!", true);
75                         general_assert((arr_partial_1.size() + arr_partial_2.size() + arr_partial_unknown.size()) % 2 == 0, "Number of first and second mates in read " + qname + "'s partial alignments (at most one mate is aligned) are not matched!", true);
76
77                         if (!arr_both.empty()) {
78                                 sort(arr_both.begin(), arr_both.end(), less_than);
79                                 for (size_t i = 0; i < arr_both.size(); i++) { samwrite(out, arr_both[i]); bam_destroy1(arr_both[i]); }
80                                 arr_both.clear();
81                         }
82
83                         while (!arr_partial_1.empty() || !arr_partial_2.empty()) {
84                                 if (!arr_partial_1.empty() && !arr_partial_2.empty()) {
85                                         samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
86                                         samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
87                                 }
88                                 else if (!arr_partial_1.empty()) {
89                                         samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
90                                         samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
91                                 }
92                                 else {
93                                         samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
94                                         samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
95                                 }
96                         }
97
98                         while (!arr_partial_unknown.empty()) {
99                                 samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
100                         }
101                 }
102                 else {
103                         samwrite(out, b);
104                         while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
105                                 samwrite(out, b);
106                         }
107                 }
108
109                 ++cnt;
110                 if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
111         }
112
113         printf("\nFinished!\n");
114
115         bam_destroy1(b); bam_destroy1(b2);
116
117         samclose(in);
118         samclose(out);
119
120         return 0;
121 }