+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<vector>
+#include<algorithm>
+
+#include <stdint.h>
+#include "sam/bam.h"
+#include "sam/sam.h"
+
+#include "my_assert.h"
+
+using namespace std;
+
+samfile_t *in, *out;
+bam1_t *b, *b2;
+vector<bam1_t*> arr_both, arr_partial_1, arr_partial_2, arr_partial_unknown;
+
+inline void add_to_appropriate_arr(bam1_t *b) {
+ if (!(b->core.flag & 0x0004) && (b->core.flag & 0x0002)) {
+ arr_both.push_back(bam_dup1(b)); return;
+ }
+
+ if (b->core.flag & 0x0040) arr_partial_1.push_back(bam_dup1(b));
+ else if (b->core.flag & 0x0080) arr_partial_2.push_back(bam_dup1(b));
+ else arr_partial_unknown.push_back(bam_dup1(b));
+}
+
+bool less_than(bam1_t *a, bam1_t *b) {
+ int32_t ap1 = min(a->core.pos, a->core.mpos);
+ int32_t ap2 = max(a->core.pos, a->core.mpos);
+ int32_t bp1 = min(b->core.pos, b->core.mpos);
+ int32_t bp2 = max(b->core.pos, b->core.mpos);
+
+ if (a->core.tid != b->core.tid) return a->core.tid < b->core.tid;
+ if (ap1 != bp1) return ap1 < bp1;
+ return ap2 < bp2;
+}
+
+int main(int argc, char* argv[]) {
+ if (argc != 3) {
+ printf("Usage: rsem-scan-for-paired-end-reads input.sam output.bam\n");
+ exit(-1);
+ }
+
+ in = samopen(argv[1], "r", NULL);
+ general_assert(in != 0, "Cannot open " + cstrtos(argv[1]) + " !");
+ out = samopen(argv[2], "wb", in->header);
+ general_assert(out != 0, "Cannot open " + cstrtos(argv[2]) + " !");
+
+ b = bam_init1(); b2 = bam_init1();
+
+ string qname;
+ bool go_on = (samread(in, b) >= 0);
+ bool isPaired;
+ long cnt = 0;
+
+ printf("."); fflush(stdout);
+
+ while (go_on) {
+ qname.assign(bam1_qname(b));
+ isPaired = (b->core.flag & 0x0001);
+
+ if (isPaired) {
+ add_to_appropriate_arr(b);
+ while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
+ general_assert(b->core.flag & 0x0001, "Read " + qname + " is detected as both single-end and paired-end read!", true);
+ add_to_appropriate_arr(b);
+ }
+
+ general_assert(arr_both.size() % 2 == 0, "Number of first and second mates in read " + qname + "'s full alignments (both mates are aligned) are not matched!", true);
+ general_assert((arr_partial_1.size() + arr_partial_2.size() + arr_partial_unknown.size()) % 2 == 0, "Number of first and second mates in read " + qname + "'s partial alignments (at most one mate is aligned) are not matched!", true);
+
+ if (!arr_both.empty()) {
+ sort(arr_both.begin(), arr_both.end(), less_than);
+ for (size_t i = 0; i < arr_both.size(); i++) { samwrite(out, arr_both[i]); bam_destroy1(arr_both[i]); }
+ arr_both.clear();
+ }
+
+ while (!arr_partial_1.empty() || !arr_partial_2.empty()) {
+ if (!arr_partial_1.empty() && !arr_partial_2.empty()) {
+ samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
+ samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
+ }
+ else if (!arr_partial_1.empty()) {
+ samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back();
+ samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
+ }
+ else {
+ samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back();
+ samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
+ }
+ }
+
+ while (!arr_partial_unknown.empty()) {
+ samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back();
+ }
+ }
+ else {
+ samwrite(out, b);
+ while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) {
+ samwrite(out, b);
+ }
+ }
+
+ ++cnt;
+ if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
+ }
+
+ printf("\nFinished!\n");
+
+ bam_destroy1(b); bam_destroy1(b2);
+
+ samclose(in);
+ samclose(out);
+
+ return 0;
+}