From 53e8bbb15e0bfed6a0caae7b5ba6777a9a942266 Mon Sep 17 00:00:00 2001 From: Colin Dewey Date: Wed, 16 Nov 2011 15:34:30 -0600 Subject: [PATCH] Refactored wiggle code and added rsem-bam2readdepth program --- bam2readdepth.cpp | 13 ++++++ bam2wig.cpp | 98 +++-------------------------------------- makefile | 12 ++++-- wiggle.cpp | 108 ++++++++++++++++++++++++++++++++++++++++++++++ wiggle.h | 41 ++++++++++++++++++ 5 files changed, 176 insertions(+), 96 deletions(-) create mode 100644 bam2readdepth.cpp create mode 100644 wiggle.cpp create mode 100644 wiggle.h diff --git a/bam2readdepth.cpp b/bam2readdepth.cpp new file mode 100644 index 0000000..c7f0adb --- /dev/null +++ b/bam2readdepth.cpp @@ -0,0 +1,13 @@ +#include +#include "wiggle.h" + +int main(int argc, char* argv[]) { + if (argc != 2) { + printf("Usage: rsem-bam2readdepth sorted_bam_input\n"); + std::exit(1); + } + ReadDepthWriter depth_writer(std::cout); + build_wiggles(argv[1], depth_writer); + + return 0; +} diff --git a/bam2wig.cpp b/bam2wig.cpp index fcb86b3..cb02bc2 100644 --- a/bam2wig.cpp +++ b/bam2wig.cpp @@ -1,103 +1,15 @@ -#include -#include -#include -#include -#include - -#include "sam/bam.h" -#include "sam/sam.h" +#include +#include "wiggle.h" using namespace std; -samfile_t *bam_in; -bam1_t *b; - -int cur_tid; //current tid; -float *wig_arr; // wiggle array -FILE *fo; - -void generateWiggle(int tid) { - int chr_len = bam_in->header->target_len[tid]; - char *chr_name = bam_in->header->target_name[tid]; - int sp, ep; - - sp = ep = -1; - for (int i = 0; i < chr_len; i++) { - if (wig_arr[i] > 0) { - ep = i; - } - else { - if (sp < ep) { - ++sp; - fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", chr_name, sp + 1); - for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wig_arr[j]); - } - sp = i; - } - } - if (sp < ep) { - ++sp; - fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", chr_name, sp + 1); - for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wig_arr[j]); - } -} - int main(int argc, char* argv[]) { - int cnt = 0; - if (argc != 4) { - printf("Usage : rsem-bam2wig sorted_bam_input wig_output wiggle_name\n"); + printf("Usage: rsem-bam2wig sorted_bam_input wig_output wiggle_name\n"); exit(-1); } - - bam_in = samopen(argv[1], "rb", NULL); - if (bam_in == 0) { fprintf(stderr, "Cannot open %s!\n", argv[1]); exit(-1); } - //assert(bam_in != 0); - b = bam_init1(); - - fo = fopen(argv[2], "w"); - fprintf(fo, "track type=wiggle_0 name=\"%s\" description=\"%s\" visibility=full\n", argv[3], argv[3]); - - cur_tid = -1; - wig_arr = NULL; - while (samread(bam_in, b) >= 0) { - if (b->core.tid != cur_tid) { - if (cur_tid >= 0) generateWiggle(cur_tid); - cur_tid = b->core.tid; - size_t len = sizeof(float) * bam_in->header->target_len[cur_tid]; - wig_arr = (float*)realloc(wig_arr, len); - memset(wig_arr, 0, len); - } - - float w = bam_aux2f(bam_aux_get(b, "ZW")); - int pos = b->core.pos; - uint32_t *p = bam1_cigar(b); - - for (int i = 0; i < (int)b->core.n_cigar; i++, ++p) { - int op = *p & BAM_CIGAR_MASK; - int op_len = *p >> BAM_CIGAR_SHIFT; - - switch (op) { - //case BAM_CSOFT_CLIP : pos += op_len; break; - case BAM_CINS : pos += op_len; break; - case BAM_CMATCH : - for (int j = 0; j < op_len; j++, ++pos) wig_arr[pos] += w; - break; - case BAM_CREF_SKIP : pos += op_len; break; - default : assert(false); - } - } - - ++cnt; - if (cnt % 1000000 == 0) printf("%d FIN\n", cnt); - } - if (cur_tid >= 0) generateWiggle(cur_tid); - free(wig_arr); - - samclose(bam_in); - bam_destroy1(b); - - fclose(fo); + UCSCWiggleTrackWriter track_writer(argv[2], argv[3]); + build_wiggles(argv[1], track_writer); return 0; } diff --git a/makefile b/makefile index 97cba95..c5c5271 100644 --- a/makefile +++ b/makefile @@ -2,7 +2,7 @@ CC = g++ #LFLAGS = -Wall -O3 -ffast-math CFLAGS = -Wall -c -I. COFLAGS = -Wall -O3 -ffast-math -c -I. -PROGRAMS = rsem-bam2wig rsem-build-read-index rsem-run-em rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-parse-alignments rsem-preref rsem-simulate-reads rsem-run-gibbs rsem-calculate-credibility-intervals +PROGRAMS = rsem-bam2wig rsem-bam2readdepth rsem-build-read-index rsem-run-em rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-parse-alignments rsem-preref rsem-simulate-reads rsem-run-gibbs rsem-calculate-credibility-intervals all : build-sam $(PROGRAMS) @@ -82,8 +82,14 @@ rsem-run-em : EM.o sam/libbam.a EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h GroupInfo.h HitContainer.h ReadIndex.h ReadReader.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h RefSeqPolicy.h PolyARules.h Profile.h NoiseProfile.h Transcript.h Transcripts.h HitWrapper.h BamWriter.h sam/bam.h sam/sam.h EM.cpp simul.h $(CC) $(COFLAGS) EM.cpp -rsem-bam2wig : sam/bam.h sam/sam.h sam/libbam.a bam2wig.cpp - $(CC) -O3 -Wall bam2wig.cpp sam/libbam.a -lz -o rsem-bam2wig +rsem-bam2wig : wiggle.h wiggle.o sam/libbam.a bam2wig.cpp + $(CC) -O3 -Wall bam2wig.cpp wiggle.o sam/libbam.a -lz -o $@ + +rsem-bam2readdepth : wiggle.h wiggle.o sam/libbam.a bam2readdepth.cpp + $(CC) -O3 -Wall bam2readdepth.cpp wiggle.o sam/libbam.a -lz -o $@ + +wiggle.o: sam/bam.h sam/sam.h wiggle.cpp wiggle.h + $(CC) $(COFLAGS) wiggle.cpp rsem-simulate-reads : simulation.o $(CC) -o rsem-simulate-reads simulation.o diff --git a/wiggle.cpp b/wiggle.cpp new file mode 100644 index 0000000..90b6f8b --- /dev/null +++ b/wiggle.cpp @@ -0,0 +1,108 @@ +#include +#include +#include + +#include "wiggle.h" + +#include "sam/bam.h" +#include "sam/sam.h" + +void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) { + float w = bam_aux2f(bam_aux_get(b, "ZW")); + int pos = b->core.pos; + uint32_t *p = bam1_cigar(b); + + for (int i = 0; i < (int)b->core.n_cigar; i++, ++p) { + int op = *p & BAM_CIGAR_MASK; + int op_len = *p >> BAM_CIGAR_SHIFT; + + switch (op) { + //case BAM_CSOFT_CLIP : pos += op_len; break; + case BAM_CINS : pos += op_len; break; + case BAM_CMATCH : + for (int j = 0; j < op_len; j++, ++pos) { + wiggle.read_depth[pos] += w; + } + break; + case BAM_CREF_SKIP : pos += op_len; break; + default : assert(false); + } + } +} + +void build_wiggles(const std::string& bam_filename, + WiggleProcessor& processor) { + samfile_t *bam_in = samopen(bam_filename.c_str(), "rb", NULL); + if (bam_in == 0) { fprintf(stderr, "Cannot open %s!\n", bam_filename.c_str()); exit(-1); } + //assert(bam_in != 0); + + int cur_tid = -1; //current tid; + int cnt = 0; + bam1_t *b = bam_init1(); + Wiggle wiggle; + while (samread(bam_in, b) >= 0) { + if (b->core.tid != cur_tid) { + if (cur_tid >= 0) processor.process(wiggle); + cur_tid = b->core.tid; + wiggle.name = bam_in->header->target_name[cur_tid]; + wiggle.read_depth.assign(bam_in->header->target_len[cur_tid], 0.0); + } + add_bam_record_to_wiggle(b, wiggle); + ++cnt; + if (cnt % 1000000 == 0) fprintf(stderr, "%d FIN\n", cnt); + } + if (cur_tid >= 0) processor.process(wiggle); + + samclose(bam_in); + bam_destroy1(b); +} + +UCSCWiggleTrackWriter::UCSCWiggleTrackWriter(const std::string& output_filename, + const std::string& track_name) { + fo = fopen(output_filename.c_str(), "w"); + fprintf(fo, "track type=wiggle_0 name=\"%s\" description=\"%s\" visibility=full\n", + track_name.c_str(), + track_name.c_str()); +} + +UCSCWiggleTrackWriter::~UCSCWiggleTrackWriter() { + fclose(fo); +} + +void UCSCWiggleTrackWriter::process(const Wiggle& wiggle) { + int sp, ep; + + sp = ep = -1; + for (size_t i = 0; i < wiggle.read_depth.size(); i++) { + if (wiggle.read_depth[i] > 0) { + ep = i; + } + else { + if (sp < ep) { + ++sp; + fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", wiggle.name.c_str(), sp + 1); + for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wiggle.read_depth[j]); + } + sp = i; + } + } + if (sp < ep) { + ++sp; + fprintf(fo, "fixedStep chrom=%s start=%d step=1\n", wiggle.name.c_str(), sp + 1); + for (int j = sp; j <= ep; j++) fprintf(fo, "%.7g\n", wiggle.read_depth[j]); + } +} + +ReadDepthWriter::ReadDepthWriter(std::ostream& stream) + : stream_(stream) { +} + +void ReadDepthWriter::process(const Wiggle& wiggle) { + stream_ << wiggle.name << '\t' + << wiggle.read_depth.size() << '\t'; + for (size_t i = 0; i < wiggle.read_depth.size(); ++i) { + if (i > 0) stream_ << ' '; + stream_ << wiggle.read_depth[i]; + } + stream_ << '\n'; +} diff --git a/wiggle.h b/wiggle.h new file mode 100644 index 0000000..09cc4f9 --- /dev/null +++ b/wiggle.h @@ -0,0 +1,41 @@ +#include +#include +#include +#include + +struct Wiggle { + std::string name; + std::vector read_depth; +}; + +class WiggleProcessor { +public: + virtual ~WiggleProcessor() {} + virtual void process(const Wiggle& wiggle) = 0; +}; + +class UCSCWiggleTrackWriter : public WiggleProcessor { +public: + UCSCWiggleTrackWriter(const std::string& output_filename, + const std::string& track_name); + + ~UCSCWiggleTrackWriter(); + + void process(const Wiggle& wiggle); + +private: + FILE *fo; +}; + +class ReadDepthWriter : public WiggleProcessor { +public: + ReadDepthWriter(std::ostream& stream); + + void process(const Wiggle& wiggle); + +private: + std::ostream& stream_; +}; + +void build_wiggles(const std::string& bam_filename, + WiggleProcessor& processor); -- 2.39.2