From 5fd66ada6c610981dcecd50e5e41436e0458b110 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Wed, 30 Nov 2011 21:14:06 -0600 Subject: [PATCH] add new files --- BamConverter.h | 230 +++++++++++++++++++++++++++++++++++ BamWriter.h | 2 +- EM.cpp | 2 +- bc_aux.h | 120 ++++++++++++++++++ getUnique.cpp | 72 +++++++++++ makefile | 8 +- rsem-calculate-expression | 26 ++-- rsem-gen-transcript-plots | 124 +++++++++++++++++++ rsem-plot-transcript-wiggles | 119 ++++++++++++++++++ rsem-prepare-reference | 2 +- sam_rsem_aux.h | 48 ++++++++ sam_rsem_cvt.h | 92 ++++++++++++++ tbam2gbam.cpp | 31 +++++ wiggle.cpp | 35 ++++-- wiggle.h | 1 + 15 files changed, 886 insertions(+), 26 deletions(-) create mode 100644 BamConverter.h create mode 100644 bc_aux.h create mode 100644 getUnique.cpp create mode 100755 rsem-gen-transcript-plots create mode 100755 rsem-plot-transcript-wiggles create mode 100644 sam_rsem_aux.h create mode 100644 sam_rsem_cvt.h create mode 100644 tbam2gbam.cpp diff --git a/BamConverter.h b/BamConverter.h new file mode 100644 index 0000000..9b54308 --- /dev/null +++ b/BamConverter.h @@ -0,0 +1,230 @@ +#ifndef BAMCONVERTER_H_ +#define BAMCONVERTER_H_ + +#include +#include +#include +#include +#include + +#include +#include "sam/bam.h" +#include "sam/sam.h" +#include "sam_rsem_aux.h" +#include "sam_rsem_cvt.h" + +#include "utils.h" +#include "bc_aux.h" +#include "Transcript.h" +#include "Transcripts.h" + +class BamConverter { +public: + BamConverter(const char*, const char*, const char*, Transcripts&); + ~BamConverter(); + + void process(); +private: + samfile_t *in, *out; + Transcripts& transcripts; + + std::map refmap; + std::map::iterator iter; + + CollapseMap collapseMap; + + void convert(bam1_t*, const Transcript&); + + void writeCollapsedLines(); + void flipSeq(uint8_t*, int); + void flipQual(uint8_t*, int); + void addXSTag(bam1_t*, const Transcript&); +}; + +BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_list, Transcripts& transcripts) + : transcripts(transcripts) +{ + if (transcripts.getType() != 0) + exitWithError("Genome information is not provided! RSEM cannot convert the transcript bam file!"); + + in = samopen(inpF, "rb", NULL); + assert(in != 0); + + bam_header_t *out_header = sam_header_read2(chr_list); + refmap.clear(); + for (int i = 0; i < out_header->n_targets; i++) { + refmap[out_header->target_name[i]] = i; + } + + append_header_text(out_header, in->header->text, in->header->l_text); + + out = samopen(outF, "wb", out_header); + assert(out != 0); + + bam_header_destroy(out_header); +} + +BamConverter::~BamConverter() { + samclose(in); + samclose(out); +} + +void BamConverter::process() { + bam1_t *b, *b2; + std::string cqname; + bool isPaired = false; + + int cnt = 0; + + cqname = ""; + b = bam_init1(); b2 = bam_init1(); + + while (samread(in, b) >= 0) { + ++cnt; + isPaired = (b->core.flag & 0x0001) > 0; + if (isPaired) { + assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001) && b->core.tid == b2->core.tid); + assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing + ++cnt; + } + + if (cnt % 1000000 == 0) { printf("."); fflush(stdout); } + + // at least one segment is not properly mapped + if ((b->core.flag & 0x0004) || isPaired && (b2->core.flag & 0x0004)) continue; + + const Transcript& transcript = transcripts.getTranscriptAt(b->core.tid + 1); + + convert(b, transcript); + if (isPaired) { + convert(b2, transcript); + b->core.mpos = b2->core.pos; + b2->core.mpos = b->core.pos; + } + + if (cqname != bam1_qname(b)) { + writeCollapsedLines(); + cqname = bam1_qname(b); + collapseMap.init(isPaired); + } + + collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW"))); + } + + writeCollapsedLines(); + + bam_destroy1(b); + bam_destroy1(b2); + + if (cnt >= 1000000) printf("\n"); +} + +void BamConverter::convert(bam1_t* b, const Transcript& transcript) { + int pos = b->core.pos; + int readlen = b->core.l_qseq; + + if (readlen == 0) exitWithError("One alignment line has SEQ field as *. RSEM does not support this currently!"); + + iter = refmap.find(transcript.getSeqName()); + assert(iter != refmap.end()); + b->core.tid = iter->second; + if (b->core.flag & 0x0001) { b->core.mtid = b->core.tid; } + b->core.qual = 255; // set to not available temporarily + + if (transcript.getStrand() == '-') { + b->core.flag ^= 0x0010; + if (b->core.flag & 0x0001) { + b->core.flag ^= 0x0020; + b->core.isize = -b->core.isize; + } + flipSeq(bam1_seq(b), readlen); + flipQual(bam1_qual(b), readlen); + } + + std::vector data; + data.clear(); + + int core_pos, core_n_cigar; + tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data); + assert(core_pos >= 0); + + int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4; + b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len; + expand_data_size(b); + uint8_t* pt = b->data + b->core.l_qname; + memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len); + for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; } + + b->core.pos = core_pos; + b->core.n_cigar = core_n_cigar; + b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b))); + + addXSTag(b, transcript); // check if need to add XS tag, if need, add it +} + +inline void BamConverter::writeCollapsedLines() { + bam1_t *tmp_b = NULL,*tmp_b2 = NULL; + float prb; + bool isPaired; + + if (!collapseMap.empty(isPaired)) { + while (collapseMap.next(tmp_b, tmp_b2, prb)) { + memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); + tmp_b->core.qual = getMAPQ(prb); + if (tmp_b->core.qual > 0) { + samwrite(out, tmp_b); + if (isPaired) { + memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f')); + tmp_b2->core.qual = tmp_b->core.qual; + samwrite(out, tmp_b2); + } + } + bam_destroy1(tmp_b); + if (isPaired) bam_destroy1(tmp_b2); + + } + } +} + +inline void BamConverter::flipSeq(uint8_t* s, int readlen) { + uint8_t code, base; + std::vector seq; + + code = 0; base = 0; + seq.clear(); + for (int i = 0; i < readlen; i++) { + switch (bam1_seqi(s, readlen - i - 1)) { + case 1: base = 8; break; + case 2: base = 4; break; + case 4: base = 2; break; + case 8: base = 1; break; + case 15: base = 15; break; + default: assert(false); + } + code |= base << (4 * (1 - i % 2)); + if (i % 2 == 1) { seq.push_back(code); code = 0; } + } + if (readlen % 2 == 1) { seq.push_back(code); } + + for (int i = 0; i < (int)seq.size(); i++) s[i] = seq[i]; +} + +inline void BamConverter::flipQual(uint8_t* q, int readlen) { + int32_t mid = readlen / 2; + uint8_t tmp; + for (int i = 0; i < mid; i++) { + tmp = q[i]; q[i] = q[readlen - i - 1]; q[readlen -i -1] = tmp; + } +} + +inline void BamConverter::addXSTag(bam1_t* b, const Transcript& transcript) { + uint32_t* p = bam1_cigar(b); + bool hasN = false; + for (int i = 0; i < (int)b->core.n_cigar; i++) + if ((*(p + i) & BAM_CIGAR_MASK) == BAM_CREF_SKIP) { hasN = true; break; } + if (!hasN) return; + char strand = transcript.getStrand(); + bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand); +} + +#endif /* BAMCONVERTER_H_ */ diff --git a/BamWriter.h b/BamWriter.h index b0aeee0..ff11397 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -123,7 +123,7 @@ void BamWriter::work(HitWrapper wrapper) { cnt += 2; if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } - if (!((b->core.flag & 0x0002) && (b2->core.flag & 0x0002))) continue; + if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue; //swap if b is mate 2 if (b->core.flag & 0x0080) { diff --git a/EM.cpp b/EM.cpp index 2921ad3..8362f7d 100644 --- a/EM.cpp +++ b/EM.cpp @@ -340,7 +340,7 @@ void writeResults(ModelType& model, double* counts) { fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) { const Transcript& transcript = transcripts.getTranscriptAt(i); - fprintf(fo, "%s%c", transcript.getLeft().c_str(), (i < M ? '\t' : '\n')); + fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n')); } fclose(fo); diff --git a/bc_aux.h b/bc_aux.h new file mode 100644 index 0000000..0527e7e --- /dev/null +++ b/bc_aux.h @@ -0,0 +1,120 @@ +#ifndef BC_AUX_H_ +#define BC_AUX_H_ + +#include + +#include +#include "sam/bam.h" + +struct SingleEndT { + bam1_t *b; + + SingleEndT(bam1_t *b) { + this->b = b; + } + + int getSign(bool value) const { return value ? -1 : 1; } + + int compare(const SingleEndT& o) const { + int strand1, strand2; + uint32_t *p1, *p2; + + if (b->core.tid != o.b->core.tid) return getSign(b->core.tid < o.b->core.tid); + if (b->core.pos != o.b->core.pos) return getSign(b->core.pos < o.b->core.pos); + strand1 = b->core.flag & 0x0010; strand2 = o.b->core.flag & 0x0010; + if (strand1 != strand2) return getSign(strand1 < strand2); + if (b->core.n_cigar != o.b->core.n_cigar) return getSign(b->core.n_cigar < o.b->core.n_cigar); + p1 = bam1_cigar(b); p2 = bam1_cigar(o.b); + for (int i = 0; i < (int)b->core.n_cigar; i++) { + if (*p1 != *p2) return getSign(*p1 < *p2); + ++p1; ++p2; + } + + return 0; + } + + bool operator< (const SingleEndT& o) const { + return compare(o) < 0; + } +}; + +struct PairedEndT { + SingleEndT mate1, mate2; + + PairedEndT(const SingleEndT& mate1, const SingleEndT& mate2) : mate1(mate1), mate2(mate2) { + } + + bool operator< (const PairedEndT& o) const { + int value = mate1.compare(o.mate1); + return value < 0 || value == 0 && mate2 < o.mate2; + } +}; + +class CollapseMap { +public: + CollapseMap() { isPaired = false; smap.clear(); pmap.clear(); } + + void init(bool isPaired) { + this->isPaired = isPaired; + isPaired ? pmap.clear() : smap.clear(); + } + + void insert(bam1_t *b, bam1_t *b2, float prb) { + if (!isPaired) { + smapIter = smap.find(SingleEndT(b)); + if (smapIter == smap.end()) { smap[SingleEndT(bam_dup1(b))] = prb; } + else smapIter->second += prb; + } + else { + pmapIter = pmap.find(PairedEndT(SingleEndT(b), SingleEndT(b2))); + if (pmapIter == pmap.end()) { pmap[PairedEndT(SingleEndT(bam_dup1(b)), SingleEndT(bam_dup1(b2)))] = prb; } + else pmapIter->second += prb; + } + } + + //once this function is called, "insert" cannot be called anymore + bool empty(bool& par) { + bool value; + + par = isPaired; + if (!isPaired) { value = smap.empty(); smapIter = smap.begin(); } + else { value = pmap.empty(); pmapIter = pmap.begin(); } + + return value; + } + + bool next(bam1_t*& b, bam1_t*& b2, float& prb) { + bool value; + + if (!isPaired) { + value = smapIter != smap.end(); + if (value) { + b = smapIter->first.b; + prb = smapIter->second; + smapIter++; + } + } + else { + value = pmapIter != pmap.end(); + if (value) { + b = pmapIter->first.mate1.b; + b2 = pmapIter->first.mate2.b; + prb = pmapIter->second; + pmapIter++; + } + } + + return value; + } + +private: + bool isPaired; + + std::map smap; + std::map::iterator smapIter; + + std::map pmap; + std::map::iterator pmapIter; +}; + +#endif /* BC_AUX_H_ */ diff --git a/getUnique.cpp b/getUnique.cpp new file mode 100644 index 0000000..8e2ba4e --- /dev/null +++ b/getUnique.cpp @@ -0,0 +1,72 @@ +#include +#include +#include +#include +#include +#include + +#include +#include "sam/bam.h" +#include "sam/sam.h" + +using namespace std; + +string cqname; +samfile_t *in, *out; +bam1_t *b; +vector arr; +bool unaligned; + +void output() { + if (unaligned || arr.size() == 0) return; + bool isPaired = (arr[0]->core.flag & 0x0001); + if (isPaired && arr.size() != 2 || !isPaired && arr.size() != 1) return; + for (int i = 0; i < (int)arr.size(); i++) samwrite(out, arr[i]); +} + +int main(int argc, char* argv[]) { + if (argc != 3) { + printf("Usage: rsem-get-unique unsorted_transcript_bam_input bam_output\n"); + exit(-1); + } + + in = samopen(argv[1], "rb", NULL); + assert(in != 0); + out = samopen(argv[2], "wb", in->header); + assert(out != 0); + + int cnt = 0; + + cqname = ""; + arr.clear(); + b = bam_init1(); + unaligned = false; + + while (samread(in, b) >= 0) { + if (cqname != bam1_qname(b)) { + output(); + cqname = bam1_qname(b); + for (int i = 0; i < (int)arr.size(); i++) bam_destroy1(arr[i]); + arr.clear(); + unaligned = false; + } + + unaligned = unaligned || (b->core.flag & 0x0004); + arr.push_back(bam_dup1(b)); + + ++cnt; + if (cnt % 1000000 == 0) { printf("."); fflush(stdout); } + } + + if (cnt >= 1000000) printf("\n"); + + output(); + + bam_destroy1(b); + samclose(in); + samclose(out); + + printf("done!\n"); + + return 0; +} diff --git a/makefile b/makefile index e8b1318..6b11536 100644 --- a/makefile +++ b/makefile @@ -1,8 +1,7 @@ CC = g++ -#LFLAGS = -Wall -O3 -ffast-math CFLAGS = -Wall -c -I. COFLAGS = -Wall -O3 -ffast-math -c -I. -PROGRAMS = rsem-tbam2gbam rsem-bam2wig rsem-bam2readdepth rsem-build-read-index rsem-run-em rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-parse-alignments rsem-preref rsem-simulate-reads rsem-run-gibbs rsem-calculate-credibility-intervals +PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth all : build-sam $(PROGRAMS) @@ -94,7 +93,7 @@ bc_aux.h : sam/bam.h BamConverter.h : utils.h sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h bc_aux.h Transcript.h Transcripts.h -rsem-tbam2gbam : utils.h Transcripts.h Transcript.h bc_aux.h BamConverter.h sam/sam.h sam/bam.h sam/libbam.a sam_rsem_aux.h sam_rsem_cvt.h tbam2gbam.cpp +rsem-tbam2gbam : utils.h Transcripts.h Transcript.h bc_aux.h BamConverter.h sam/sam.h sam/bam.h sam/libbam.a sam_rsem_aux.h sam_rsem_cvt.h tbam2gbam.cpp sam/libbam.a $(CC) -O3 -Wall tbam2gbam.cpp sam/libbam.a -lz -o $@ rsem-bam2wig : wiggle.h wiggle.o sam/libbam.a bam2wig.cpp @@ -126,6 +125,9 @@ rsem-calculate-credibility-intervals : calcCI.o calcCI.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h calcCI.cpp boost/random.hpp $(CC) $(COFLAGS) calcCI.cpp +rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a + $(CC) -O3 -Wall getUnique.cpp sam/libbam.a -lz -o $@ + clean: rm -f *.o *~ $(PROGRAMS) cd sam ; ${MAKE} clean diff --git a/rsem-calculate-expression b/rsem-calculate-expression index 6f9706e..774065a 100755 --- a/rsem-calculate-expression +++ b/rsem-calculate-expression @@ -334,7 +334,7 @@ sub runCommand { if ($status != 0) { my $errmsg; if (scalar(@_) > 1) { $errmsg = $_[1]; } - else { $errmsg = "$command failed! Plase check if you provide correct parameters/options for the pipeline!"; } + else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; } print $errmsg."\n"; exit(-1); } @@ -345,7 +345,7 @@ sub runCommand { sub collectResults { my $local_status; my ($inpF, $outF); - my (@results, @comment) = (); + my (@results, @ids) = (); my $line; my $cnt; @@ -362,11 +362,11 @@ sub collectResults { ++$cnt; chomp($line); my @local_arr = split(/\t/, $line); - if ($cnt == 4) { @comment = @local_arr; } + if ($cnt == 4) { @ids = @local_arr; } else { push(@results, \@local_arr); } } - push(@results, \@comment); + push(@results, \@ids); close(INPUT); $local_status = open(OUTPUT, ">$outF"); @@ -589,20 +589,20 @@ estimation. pmc stands for posterior mean counts. ci_lower_bound(l) means the lower bound of the credibility intervals, ci_upper_bound(u) means the upper bound of the credibility intervals. So the credibility interval is [l, u]. 'transcript_id_list' is a space-separated list of -transcript_ids belonging to the gene. +transcript_ids belonging to the gene. If no gene information is +provided, this file has the same content as +'sample_name.isoforms.results'. =item B File containing isoform level expression values. The format of each line in this file is: -transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] other_attributes +transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id -Fields are separated by the tab character. 'other_attributes' are all -other attributes after attribute 'transcript_id' field in the GTF -file. If no other attributes are given or no GTF file is provided in -'rsem-prepare-reference', there will be no tab after the -tau_value field. +Fields are separated by the tab character. 'gene_id' is the gene_id of +the gene which this transcript belongs to. If no gene information is +provided, 'gene_id' and 'transcript_id' are the same. =item B @@ -630,7 +630,9 @@ of each alignment is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the posterior probability of that alignment being the true mapping of a read. In addition, RSEM pads a new tag ZW:f:value, where value is a single precision floating number -representing the posterior probability. +representing the posterior probability. If an alignment is spliced, a +XS:A:value tag is also added, where value is either '+' or '-' +indicating the strand of the transcript it aligns to. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the sorted BAM file and indices generated by samtools (included in RSEM package). diff --git a/rsem-gen-transcript-plots b/rsem-gen-transcript-plots new file mode 100755 index 0000000..6f7ae10 --- /dev/null +++ b/rsem-gen-transcript-plots @@ -0,0 +1,124 @@ +#!/usr/bin/env Rscript + +nrow_per_page <- 3 # if input_list is composed of transcript ids +ncol_per_page <- 2 # if input_list is composed of transcript ids +num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript ids + + +exit_with_error <- function(errmsg) { + cat(errmsg, "\n", sep = "", file = stderr()) + quit(save = "no", status = 1) +} + + +args <- commandArgs(TRUE) +if (length(args) != 5) + exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_gene show_uniq output_plot_file") + +sample_name <- args[1] +input_list <- args[2] +is_gene <- args[3] +show_uniq <- args[4] +output_plot_file <- args[5] + + + +load_readdepth_file <- function(filename) { + data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE) + nrow <- dim(data)[1] + readdepth <- list() + for (i in 1:nrow) { + readdepth[[data[i, 1]]] <- data[i, c(2, 3)] + } + readdepth +} + +build_t2gmap <- function(filename) { + data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE) + t2gmap <- list() + + nrow <- dim(data)[1] + ncol <- dim(data)[2] + + gene_id <- "" + tids <- c() + for (i in 1:nrow) { + if (gene_id != data[i, ncol]) { + if (gene_id != "") { + t2gmap[[gene_id]] <- tids + } + gene_id <- data[i, ncol] + tids <- c() + } + tids <- c(tids, data[i, 1]) + } + if (gene_id != "") t2gmap[[gene_id]] <- tids + + t2gmap +} + +generate_a_page <- function(tids, gene_id = NULL) { + n <- length(tids) + ncol <- ifelse(is_gene, floor(sqrt(n)), ncol_per_page) + nrow <- ifelse(is_gene, ceiling(n / ncol), nrow_per_page) + + par(mfrow = c(nrow, ncol), mar = c(1, 1, 2, 1)) + if (is_gene) par(oma = c(0, 0, 3, 0)) + + for (i in 1:n) { + vec <- readdepth[[tids[i]]] + if (is.null(vec)) exit_with_error(paste("Cannot find transcript", tids[i], sep = "")) + wiggle <- ifelse(vec[2] == "NA", rep(0, vec[1]), as.numeric(unlist(strsplit(vec[2], split = " ")))) + len <- length(wiggle) + if (!show_uniq) { + plot(wiggle, type = 'h') + } + else { + vec <- readdepth_uniq[[tids[i]]] + stopifnot(!is.null(vec)) + wiggle_uniq <- ifelse(vec[2] == "NA", rep(0, vec[1]), as.numeric(unlist(strsplit(vec[2], split = " ")))) + stopifnot(len == length(wiggle_uniq), len == sum(wiggle >= wiggle_uniq)) + heights <- rbind(wiggle_uniq, wiggle - wiggle_uniq) + barplot(heights, space = 0, border = NA, names.arg = 1:len) + } + title(main = tids[i], xlab = "Position in transcript", ylab = "Read depth") + } + + if (is_gene) mtext(gene_id, outer = TRUE, line = 1) +} + +readdepth <- load_readdepth_file(paste(sample_name, ".transcript.readdepth", sep = "")) + +if (show_uniq) { + readdepth_uniq <- load_readdepth_file(paste(sample_name, ".uniq.transcript.readdepth", sep = "")) +} + +ids <- scan(file = input_list, what = "", sep = "\n") + +if (is_gene) { + t2gmap <- build_t2gmap(paste(sample_name, ".isoforms.results", sep = "")) +} + +pdf(output_plot_file) + +if (!is_gene) { + n <- length(ids) + ub <- (n - 1) %/% num_plots_per_page + 1 + for (i in 1:ub) { + fr <- (i - 1) * num_plots_per_page + 1 + to <- min(i * num_plots_per_page, n) + generate_a_page(ids[fr:to]) + } +} +else { + for (gene_id in ids) { + if (is.null(t2gmap[[gene_id]])) exit_with_error(paste("Cannot find gene", gene_id, sep = "")) + generate_a_page(t2gmap[[gene_id]], gene_id) + } +} + +dev.off.output <- dev.off() + + + + diff --git a/rsem-plot-transcript-wiggles b/rsem-plot-transcript-wiggles new file mode 100755 index 0000000..9180cb7 --- /dev/null +++ b/rsem-plot-transcript-wiggles @@ -0,0 +1,119 @@ +#!/usr/bin/perl + +use Getopt::Long; +use Pod::Usage; +use File::Basename; +use strict; + +my $gene_list = 0; # default is 0, means input is a transcript list; 1 means input is a gene list +my $show_unique = 0; # 0, default value, means do not show unique transcript wiggles; 1 means show unique transcript wiggles +my $help = 0; + +GetOptions("gene-list" => \$gene_list, + "show-unique" => \$show_unique, + "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2); + +pod2usage(-verbose => 2) if ($help == 1); +pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3); + +my ($fn, $dir, $suf) = fileparse($0); +my $command = ""; + +unless (-e "$ARGV[0].transcript.readdepth") { + $command = $dir."rsem-bam2readdepth $ARGV[0].transcript.sorted.bam > $ARGV[0].transcript.readdepth"; + &runCommand($command); +} + +if ($show_unique) { + unless (-e "$ARGV[0].uniq.transcript.bam") { + $command = $dir."rsem-get-unique $ARGV[0].transcript.bam $ARGV[0].uniq.transcript.bam"; + &runCommand($command); + } + unless (-e "$ARGV[0].uniq.transcript.sorted.bam") { + $command = $dir."sam/samtools sort $ARGV[0].uniq.transcript.bam $ARGV[0].uniq.transcript.sorted"; + &runCommand($command); + } + unless (-e "$ARGV[0].uniq.transcript.readdepth") { + $command = $dir."rsem-bam2readdepth $ARGV[0].uniq.transcript.sorted.bam > $ARGV[0].uniq.transcript.readdepth"; + &runCommand($command); + } +} + +$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $gene_list $show_unique $ARGV[2]"; +&runCommand($command); + +# command, {err_msg} +sub runCommand { + print $_[0]."\n"; + my $status = system($_[0]); + if ($status != 0) { + my $errmsg; + if (scalar(@_) > 1) { $errmsg = $_[1]; } + else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; } + print $errmsg."\n"; + exit(-1); + } + print "\n"; +} + +__END__ + +=head1 NAME + +rsem-plot-transcript-wiggles + +=head1 SYNOPSIS + +=over + + rsem-plot-transcript-wiggles [options] sample_name input_list output_plot_file + +=back + +=head1 ARGUMENTS + +=over + +=item B + +blablabla + +=item B + +blablabla + +=item B + +blablabla + +=back + +=head1 OPTIONS + +=over + +=item B<--gene-list> + +blablabla + +=item B<--show-unique> + +blablabla + +=item B<-h/--help> + +Show help information. + +=back + +=head1 DESCRIPTION + +blablabla + +=head1 OUTPUT + +blablabla + +=head1 EXAMPLES + +=cut diff --git a/rsem-prepare-reference b/rsem-prepare-reference index 6eb6b60..c4527be 100755 --- a/rsem-prepare-reference +++ b/rsem-prepare-reference @@ -100,7 +100,7 @@ sub runCommand { if ($status != 0) { my $errmsg; if (scalar(@_) > 1) { $errmsg = $_[1]; } - else { $errmsg = "$command failed! Plase check if you provide correct parameters/options for the pipeline!"; } + else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; } print $errmsg."\n"; exit(-1); } diff --git a/sam_rsem_aux.h b/sam_rsem_aux.h new file mode 100644 index 0000000..5b4f9cd --- /dev/null +++ b/sam_rsem_aux.h @@ -0,0 +1,48 @@ +#ifndef SAM_RSEM_AUX_H_ +#define SAM_RSEM_AUX_H_ + +#include +#include +#include + +#include "sam/bam.h" + +// dwt: duplicate without text +bam_header_t *bam_header_dwt(const bam_header_t *ori_h) +{ + bam_header_t *h; + + h = bam_header_init(); + h->n_targets = ori_h->n_targets; + h->target_len = (uint32_t*)calloc(h->n_targets, 4); + h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); + for (int i = 0; i < h->n_targets; i++) { + h->target_len[i] = ori_h->target_len[i]; + h->target_name[i] = strdup(ori_h->target_name[i]); + } + + return h; +} + +void append_header_text(bam_header_t *header, const char* text, int len) +{ + int x = header->l_text + 1; + int y = header->l_text + len + 1; // 1 byte null + if (text == 0) return; + kroundup32(x); + kroundup32(y); + if (x < y) header->text = (char*)realloc(header->text, y); + strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here. + header->l_text += len; + header->text[header->l_text] = 0; +} + +void expand_data_size(bam1_t *b) { + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } +} + +#endif /* SAM_RSEM_AUX_H_ */ diff --git a/sam_rsem_cvt.h b/sam_rsem_cvt.h new file mode 100644 index 0000000..4347ef8 --- /dev/null +++ b/sam_rsem_cvt.h @@ -0,0 +1,92 @@ +#ifndef SAM_RSEM_CVT_H_ +#define SAM_RSEM_CVT_H_ + +#include + +#include "stdint.h" +#include "sam/bam.h" + +#include "Transcript.h" +#include "Transcripts.h" + +uint8_t getMAPQ(double val) { + double err = 1.0 - val; + if (err <= 1e-10) return 100; + return (uint8_t)(-10 * log10(err) + .5); // round it +} + +//convert transcript coordinate to chromosome coordinate and generate CIGAR string +void tr2chr(const Transcript& transcript, int sp, int ep, int& pos, int& n_cigar, std::vector& data) { + int length = transcript.getLength(); + char strand = transcript.getStrand(); + const std::vector& structure = transcript.getStructure(); + + int s, i; + int oldlen, curlen; + + uint32_t operation; + + n_cigar = 0; + s = structure.size(); + + if (strand == '-') { + int tmp = sp; + sp = length - ep + 1; + ep = length - tmp + 1; + } + + if (ep < 1 || sp > length) { // a read which align to polyA tails totally! + pos = (sp > length ? structure[s - 1].end : structure[0].start - 1); // 0 based + + n_cigar = 1; + operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; + data.push_back(operation); + + return; + } + + if (sp < 1) { + n_cigar++; + operation = (1 - sp) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; + data.push_back(operation); + sp = 1; + } + + oldlen = curlen = 0; + + for (i = 0; i < s; i++) { + oldlen = curlen; + curlen += structure[i].end - structure[i].start + 1; + if (curlen >= sp) break; + } + assert(i < s); + pos = structure[i].start + (sp - oldlen - 1) - 1; // 0 based + + while (curlen < ep && i < s) { + n_cigar++; + operation = (curlen - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH; + data.push_back(operation); + ++i; + if (i >= s) continue; + n_cigar++; + operation = (structure[i].start - structure[i - 1].end - 1) << BAM_CIGAR_SHIFT | BAM_CREF_SKIP; + data.push_back(operation); + + oldlen = curlen; + sp = oldlen + 1; + curlen += structure[i].end - structure[i].start + 1; + } + + if (i >= s) { + n_cigar++; + operation = (ep - length) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP; + data.push_back(operation); + } + else { + n_cigar++; + operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH; + data.push_back(operation); + } +} + +#endif /* SAM_RSEM_CVT_H_ */ diff --git a/tbam2gbam.cpp b/tbam2gbam.cpp new file mode 100644 index 0000000..18688d0 --- /dev/null +++ b/tbam2gbam.cpp @@ -0,0 +1,31 @@ +#include +#include +#include + +#include "utils.h" +#include "Transcripts.h" +#include "BamConverter.h" + +using namespace std; + +char tiF[STRLEN], chr_list[STRLEN]; +Transcripts transcripts; + +int main(int argc, char* argv[]) { + if (argc != 4) { + printf("Usage: rsem-tbam2gbam reference_name unsorted_transcript_bam_input genome_bam_output\n"); + exit(-1); + } + + sprintf(tiF, "%s.ti", argv[1]); + sprintf(chr_list, "%s.chrlist", argv[1]); + transcripts.readFrom(tiF); + + printf("Start converting:\n"); + BamConverter bc(argv[2], argv[3], chr_list, transcripts); + bc.process(); + printf("Genome bam file is generated!\n"); + + return 0; +} + diff --git a/wiggle.cpp b/wiggle.cpp index 19f52b4..65a3d4a 100644 --- a/wiggle.cpp +++ b/wiggle.cpp @@ -36,7 +36,10 @@ void build_wiggles(const std::string& bam_filename, WiggleProcessor& processor) { samfile_t *bam_in = samopen(bam_filename.c_str(), "rb", NULL); if (bam_in == 0) { fprintf(stderr, "Cannot open %s!\n", bam_filename.c_str()); exit(-1); } - //assert(bam_in != 0); + + bam_header_t *header = bam_in->header; + bool *used = new bool[header->n_targets]; + memset(used, 0, sizeof(bool) * header->n_targets); int cur_tid = -1; //current tid; int cnt = 0; @@ -46,19 +49,29 @@ void build_wiggles(const std::string& bam_filename, if (b->core.flag & 0x0004) continue; if (b->core.tid != cur_tid) { - if (cur_tid >= 0) processor.process(wiggle); + if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); } cur_tid = b->core.tid; - wiggle.name = bam_in->header->target_name[cur_tid]; - wiggle.read_depth.assign(bam_in->header->target_len[cur_tid], 0.0); + wiggle.name = header->target_name[cur_tid]; + wiggle.length = header->target_len[cur_tid]; + wiggle.read_depth.assign(wiggle.length, 0.0); } add_bam_record_to_wiggle(b, wiggle); ++cnt; if (cnt % 1000000 == 0) fprintf(stderr, "%d FIN\n", cnt); } - if (cur_tid >= 0) processor.process(wiggle); + if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); } + + for (int32_t i = 0; i < header->n_targets; i++) + if (!used[i]) { + wiggle.name = header->target_name[i]; + wiggle.length = header->target_len[i]; + wiggle.read_depth.clear(); + processor.process(wiggle); + } samclose(bam_in); bam_destroy1(b); + delete[] used; } UCSCWiggleTrackWriter::UCSCWiggleTrackWriter(const std::string& output_filename, @@ -75,9 +88,11 @@ UCSCWiggleTrackWriter::~UCSCWiggleTrackWriter() { void UCSCWiggleTrackWriter::process(const Wiggle& wiggle) { int sp, ep; + + if (wiggle.read_depth.empty()) return; sp = ep = -1; - for (size_t i = 0; i < wiggle.read_depth.size(); i++) { + for (size_t i = 0; i < wiggle.length; i++) { if (wiggle.read_depth[i] > 0) { ep = i; } @@ -102,9 +117,13 @@ ReadDepthWriter::ReadDepthWriter(std::ostream& stream) } void ReadDepthWriter::process(const Wiggle& wiggle) { + stream_ << wiggle.name << '\t' - << wiggle.read_depth.size() << '\t'; - for (size_t i = 0; i < wiggle.read_depth.size(); ++i) { + << wiggle.length << '\t'; + + if (wiggle.read_depth.empty()) { stream_ << "NA\n"; return; } + + for (size_t i = 0; i < wiggle.length; ++i) { if (i > 0) stream_ << ' '; stream_ << wiggle.read_depth[i]; } diff --git a/wiggle.h b/wiggle.h index 09cc4f9..1f37592 100644 --- a/wiggle.h +++ b/wiggle.h @@ -6,6 +6,7 @@ struct Wiggle { std::string name; std::vector read_depth; + size_t length; }; class WiggleProcessor { -- 2.39.2