]> git.donarmstrong.com Git - rsem.git/blob - scanForPairedEndReads.cpp
Modified 'convert-sam-for-rsem' so that this program will convert users' SAM/BAM...
[rsem.git] / scanForPairedEndReads.cpp
1 #include<cstdio>
2 #include<cstring>
3 #include<cstdlib>
4 #include<cassert>
5 #include<string>
6 #include<vector>
7 #include<algorithm>
8
9 #include <stdint.h>
10 #include "sam/bam.h"
11 #include "sam/sam.h"
12
13 #include "my_assert.h"
14
15 using namespace std;
16
17 samfile_t *in, *out;
18 bam1_t *b, *b2;
19 vector<bam1_t*> arr_both, arr_partial_1, arr_partial_2, arr_partial_unknown;
20
21 inline void add_to_appropriate_arr(bam1_t *b) {
22         if (!(b->core.flag & 0x0004) && (b->core.flag & 0x0002)) {
23                 arr_both.push_back(bam_dup1(b)); return;
24         }
25
26         if (b->core.flag & 0x0040) arr_partial_1.push_back(bam_dup1(b));
27         else if (b->core.flag & 0x0080) arr_partial_2.push_back(bam_dup1(b));
28         else arr_partial_unknown.push_back(bam_dup1(b));
29 }
30
31 bool less_than(bam1_t *a, bam1_t *b) {
32         int32_t ap1 = min(a->core.pos, a->core.mpos);
33         int32_t ap2 = max(a->core.pos, a->core.mpos);
34         int32_t bp1 = min(b->core.pos, b->core.mpos);
35         int32_t bp2 = max(b->core.pos, b->core.mpos);
36
37         if (a->core.tid != b->core.tid) return a->core.tid < b->core.tid;
38         if (ap1 != bp1) return ap1 < bp1;
39         return ap2 < bp2;
40 }
41
42 int main(int argc, char* argv[]) {
43         if (argc != 3) {
44                 printf("Usage: rsem-scan-for-paired-end-reads input.sam output.bam\n");
45                 exit(-1);
46         }
47
48         in = samopen(argv[1], "r", NULL);
49         general_assert(in != 0, "Cannot open " + cstrtos(argv[1]) + " !");
50         out = samopen(argv[2], "wb", in->header);
51         general_assert(out != 0, "Cannot open " + cstrtos(argv[2]) + " !");
52
53         b = bam_init1(); b2 = bam_init1();
54
55         string qname;
56         bool go_on = (samread(in, b) >= 0);
57         bool isPaired;
58         long cnt = 0;
59
60         printf("."); fflush(stdout);
61
62         while (go_on) {
63                 qname.assign(bam1_qname(b));
64                 isPaired = (b->core.flag & 0x0001);
65
66                 if (isPaired) {
67                         add_to_appropriate_arr(b);
68                         while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
69                                 general_assert(b->core.flag & 0x0001, "Read " + qname + " is detected as both single-end and paired-end read!", true);
70                                 add_to_appropriate_arr(b);
71                         }
72
73                         general_assert(arr_both.size() % 2 == 0, "Number of first and second mates in read " + qname + "'s full alignments (both mates are aligned) are not matched!", true);
74                         general_assert((arr_partial_1.size() + arr_partial_2.size() + arr_partial_unknown.size()) % 2 == 0, "Number of first and second mates in read " + qname + "'s partial alignments (at most one mate is aligned) are not matched!", true);
75
76                         if (!arr_both.empty()) {
77                                 sort(arr_both.begin(), arr_both.end(), less_than);
78                                 for (size_t i = 0; i < arr_both.size(); i++) { samwrite(out, arr_both[i]); bam_destroy1(arr_both[i]); }
79                                 arr_both.clear();
80                         }
81
82                         while (!arr_partial_1.empty() || !arr_partial_2.empty()) {
83                                 if (!arr_partial_1.empty() && !arr_partial_2.empty()) {
84                                         samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
85                                         samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
86                                 }
87                                 else if (!arr_partial_1.empty()) {
88                                         samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
89                                         samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
90                                 }
91                                 else {
92                                         samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
93                                         samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
94                                 }
95                         }
96
97                         while (!arr_partial_unknown.empty()) {
98                                 samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
99                         }
100                 }
101                 else {
102                         samwrite(out, b);
103                         while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
104                                 samwrite(out, b);
105                         }
106                 }
107
108                 ++cnt;
109                 if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
110         }
111
112         printf("\nFinished!\n");
113
114         bam_destroy1(b); bam_destroy1(b2);
115
116         samclose(in);
117         samclose(out);
118
119         return 0;
120 }