X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=scanForPairedEndReads.cpp;fp=scanForPairedEndReads.cpp;h=4d1fbfb79ea16d52a0e1b903bd0e40cfc11f66cd;hb=d5639baddcd4bdf09dd31237e914afb987954472;hp=0000000000000000000000000000000000000000;hpb=fb2aa1ca9d00710943155ef3abbbdd87df116e4a;p=rsem.git diff --git a/scanForPairedEndReads.cpp b/scanForPairedEndReads.cpp new file mode 100644 index 0000000..4d1fbfb --- /dev/null +++ b/scanForPairedEndReads.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include "sam/bam.h" +#include "sam/sam.h" + +#include "my_assert.h" + +using namespace std; + +samfile_t *in, *out; +bam1_t *b, *b2; +vector arr_both, arr_partial_1, arr_partial_2, arr_partial_unknown; + +inline void add_to_appropriate_arr(bam1_t *b) { + if (!(b->core.flag & 0x0004) && (b->core.flag & 0x0002)) { + arr_both.push_back(bam_dup1(b)); return; + } + + if (b->core.flag & 0x0040) arr_partial_1.push_back(bam_dup1(b)); + else if (b->core.flag & 0x0080) arr_partial_2.push_back(bam_dup1(b)); + else arr_partial_unknown.push_back(bam_dup1(b)); +} + +bool less_than(bam1_t *a, bam1_t *b) { + int32_t ap1 = min(a->core.pos, a->core.mpos); + int32_t ap2 = max(a->core.pos, a->core.mpos); + int32_t bp1 = min(b->core.pos, b->core.mpos); + int32_t bp2 = max(b->core.pos, b->core.mpos); + + if (a->core.tid != b->core.tid) return a->core.tid < b->core.tid; + if (ap1 != bp1) return ap1 < bp1; + return ap2 < bp2; +} + +int main(int argc, char* argv[]) { + if (argc != 3) { + printf("Usage: rsem-scan-for-paired-end-reads input.sam output.bam\n"); + exit(-1); + } + + in = samopen(argv[1], "r", NULL); + general_assert(in != 0, "Cannot open " + cstrtos(argv[1]) + " !"); + out = samopen(argv[2], "wb", in->header); + general_assert(out != 0, "Cannot open " + cstrtos(argv[2]) + " !"); + + b = bam_init1(); b2 = bam_init1(); + + string qname; + bool go_on = (samread(in, b) >= 0); + bool isPaired; + long cnt = 0; + + printf("."); fflush(stdout); + + while (go_on) { + qname.assign(bam1_qname(b)); + isPaired = (b->core.flag & 0x0001); + + if (isPaired) { + add_to_appropriate_arr(b); + while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) { + general_assert(b->core.flag & 0x0001, "Read " + qname + " is detected as both single-end and paired-end read!", true); + add_to_appropriate_arr(b); + } + + general_assert(arr_both.size() % 2 == 0, "Number of first and second mates in read " + qname + "'s full alignments (both mates are aligned) are not matched!", true); + general_assert((arr_partial_1.size() + arr_partial_2.size() + arr_partial_unknown.size()) % 2 == 0, "Number of first and second mates in read " + qname + "'s partial alignments (at most one mate is aligned) are not matched!", true); + + if (!arr_both.empty()) { + sort(arr_both.begin(), arr_both.end(), less_than); + for (size_t i = 0; i < arr_both.size(); i++) { samwrite(out, arr_both[i]); bam_destroy1(arr_both[i]); } + arr_both.clear(); + } + + while (!arr_partial_1.empty() || !arr_partial_2.empty()) { + if (!arr_partial_1.empty() && !arr_partial_2.empty()) { + samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back(); + samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back(); + } + else if (!arr_partial_1.empty()) { + samwrite(out, arr_partial_1.back()); bam_destroy1(arr_partial_1.back()); arr_partial_1.pop_back(); + samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back(); + } + else { + samwrite(out, arr_partial_2.back()); bam_destroy1(arr_partial_2.back()); arr_partial_2.pop_back(); + samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back(); + } + } + + while (!arr_partial_unknown.empty()) { + samwrite(out, arr_partial_unknown.back()); bam_destroy1(arr_partial_unknown.back()); arr_partial_unknown.pop_back(); + } + } + else { + samwrite(out, b); + while ((go_on = (samread(in, b) >= 0)) && (qname == bam1_qname(b))) { + samwrite(out, b); + } + } + + ++cnt; + if (cnt % 1000000 == 0) { printf("."); fflush(stdout); } + } + + printf("\nFinished!\n"); + + bam_destroy1(b); bam_destroy1(b2); + + samclose(in); + samclose(out); + + return 0; +}