--- /dev/null
+#ifndef BAMCONVERTER_H_
+#define BAMCONVERTER_H_
+
+#include<cstdio>
+#include<cstring>
+#include<cassert>
+#include<string>
+#include<map>
+
+#include <stdint.h>
+#include "sam/bam.h"
+#include "sam/sam.h"
+#include "sam_rsem_aux.h"
+#include "sam_rsem_cvt.h"
+
+#include "utils.h"
+#include "bc_aux.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+
+class BamConverter {
+public:
+ BamConverter(const char*, const char*, const char*, Transcripts&);
+ ~BamConverter();
+
+ void process();
+private:
+ samfile_t *in, *out;
+ Transcripts& transcripts;
+
+ std::map<std::string, int> refmap;
+ std::map<std::string, int>::iterator iter;
+
+ CollapseMap collapseMap;
+
+ void convert(bam1_t*, const Transcript&);
+
+ void writeCollapsedLines();
+ void flipSeq(uint8_t*, int);
+ void flipQual(uint8_t*, int);
+ void addXSTag(bam1_t*, const Transcript&);
+};
+
+BamConverter::BamConverter(const char* inpF, const char* outF, const char* chr_list, Transcripts& transcripts)
+ : transcripts(transcripts)
+{
+ if (transcripts.getType() != 0)
+ exitWithError("Genome information is not provided! RSEM cannot convert the transcript bam file!");
+
+ in = samopen(inpF, "rb", NULL);
+ assert(in != 0);
+
+ bam_header_t *out_header = sam_header_read2(chr_list);
+ refmap.clear();
+ for (int i = 0; i < out_header->n_targets; i++) {
+ refmap[out_header->target_name[i]] = i;
+ }
+
+ append_header_text(out_header, in->header->text, in->header->l_text);
+
+ out = samopen(outF, "wb", out_header);
+ assert(out != 0);
+
+ bam_header_destroy(out_header);
+}
+
+BamConverter::~BamConverter() {
+ samclose(in);
+ samclose(out);
+}
+
+void BamConverter::process() {
+ bam1_t *b, *b2;
+ std::string cqname;
+ bool isPaired = false;
+
+ int cnt = 0;
+
+ cqname = "";
+ b = bam_init1(); b2 = bam_init1();
+
+ while (samread(in, b) >= 0) {
+ ++cnt;
+ isPaired = (b->core.flag & 0x0001) > 0;
+ if (isPaired) {
+ assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001) && b->core.tid == b2->core.tid);
+ assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
+ ++cnt;
+ }
+
+ if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
+
+ // at least one segment is not properly mapped
+ if ((b->core.flag & 0x0004) || isPaired && (b2->core.flag & 0x0004)) continue;
+
+ const Transcript& transcript = transcripts.getTranscriptAt(b->core.tid + 1);
+
+ convert(b, transcript);
+ if (isPaired) {
+ convert(b2, transcript);
+ b->core.mpos = b2->core.pos;
+ b2->core.mpos = b->core.pos;
+ }
+
+ if (cqname != bam1_qname(b)) {
+ writeCollapsedLines();
+ cqname = bam1_qname(b);
+ collapseMap.init(isPaired);
+ }
+
+ collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
+ }
+
+ writeCollapsedLines();
+
+ bam_destroy1(b);
+ bam_destroy1(b2);
+
+ if (cnt >= 1000000) printf("\n");
+}
+
+void BamConverter::convert(bam1_t* b, const Transcript& transcript) {
+ int pos = b->core.pos;
+ int readlen = b->core.l_qseq;
+
+ if (readlen == 0) exitWithError("One alignment line has SEQ field as *. RSEM does not support this currently!");
+
+ iter = refmap.find(transcript.getSeqName());
+ assert(iter != refmap.end());
+ b->core.tid = iter->second;
+ if (b->core.flag & 0x0001) { b->core.mtid = b->core.tid; }
+ b->core.qual = 255; // set to not available temporarily
+
+ if (transcript.getStrand() == '-') {
+ b->core.flag ^= 0x0010;
+ if (b->core.flag & 0x0001) {
+ b->core.flag ^= 0x0020;
+ b->core.isize = -b->core.isize;
+ }
+ flipSeq(bam1_seq(b), readlen);
+ flipQual(bam1_qual(b), readlen);
+ }
+
+ std::vector<uint32_t> data;
+ data.clear();
+
+ int core_pos, core_n_cigar;
+ tr2chr(transcript, pos + 1, pos + readlen, core_pos, core_n_cigar, data);
+ assert(core_pos >= 0);
+
+ int rest_len = b->data_len - b->core.l_qname - b->core.n_cigar * 4;
+ b->data_len = b->core.l_qname + core_n_cigar * 4 + rest_len;
+ expand_data_size(b);
+ uint8_t* pt = b->data + b->core.l_qname;
+ memmove(pt + core_n_cigar * 4, pt + b->core.n_cigar * 4, rest_len);
+ for (int i = 0; i < core_n_cigar; i++) { memmove(pt, &data[i], 4); pt += 4; }
+
+ b->core.pos = core_pos;
+ b->core.n_cigar = core_n_cigar;
+ b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&(b->core), bam1_cigar(b)));
+
+ addXSTag(b, transcript); // check if need to add XS tag, if need, add it
+}
+
+inline void BamConverter::writeCollapsedLines() {
+ bam1_t *tmp_b = NULL,*tmp_b2 = NULL;
+ float prb;
+ bool isPaired;
+
+ if (!collapseMap.empty(isPaired)) {
+ while (collapseMap.next(tmp_b, tmp_b2, prb)) {
+ memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b->core.qual = getMAPQ(prb);
+ if (tmp_b->core.qual > 0) {
+ samwrite(out, tmp_b);
+ if (isPaired) {
+ memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+ tmp_b2->core.qual = tmp_b->core.qual;
+ samwrite(out, tmp_b2);
+ }
+ }
+ bam_destroy1(tmp_b);
+ if (isPaired) bam_destroy1(tmp_b2);
+
+ }
+ }
+}
+
+inline void BamConverter::flipSeq(uint8_t* s, int readlen) {
+ uint8_t code, base;
+ std::vector<uint8_t> seq;
+
+ code = 0; base = 0;
+ seq.clear();
+ for (int i = 0; i < readlen; i++) {
+ switch (bam1_seqi(s, readlen - i - 1)) {
+ case 1: base = 8; break;
+ case 2: base = 4; break;
+ case 4: base = 2; break;
+ case 8: base = 1; break;
+ case 15: base = 15; break;
+ default: assert(false);
+ }
+ code |= base << (4 * (1 - i % 2));
+ if (i % 2 == 1) { seq.push_back(code); code = 0; }
+ }
+ if (readlen % 2 == 1) { seq.push_back(code); }
+
+ for (int i = 0; i < (int)seq.size(); i++) s[i] = seq[i];
+}
+
+inline void BamConverter::flipQual(uint8_t* q, int readlen) {
+ int32_t mid = readlen / 2;
+ uint8_t tmp;
+ for (int i = 0; i < mid; i++) {
+ tmp = q[i]; q[i] = q[readlen - i - 1]; q[readlen -i -1] = tmp;
+ }
+}
+
+inline void BamConverter::addXSTag(bam1_t* b, const Transcript& transcript) {
+ uint32_t* p = bam1_cigar(b);
+ bool hasN = false;
+ for (int i = 0; i < (int)b->core.n_cigar; i++)
+ if ((*(p + i) & BAM_CIGAR_MASK) == BAM_CREF_SKIP) { hasN = true; break; }
+ if (!hasN) return;
+ char strand = transcript.getStrand();
+ bam_aux_append(b, "XS", 'A', 1, (uint8_t*)&strand);
+}
+
+#endif /* BAMCONVERTER_H_ */
cnt += 2;
if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
- if (!((b->core.flag & 0x0002) && (b2->core.flag & 0x0002))) continue;
+ if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue;
//swap if b is mate 2
if (b->core.flag & 0x0080) {
fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
for (int i = 1; i <= M; i++) {
const Transcript& transcript = transcripts.getTranscriptAt(i);
- fprintf(fo, "%s%c", transcript.getLeft().c_str(), (i < M ? '\t' : '\n'));
+ fprintf(fo, "%s%c", transcript.getGeneID().c_str(), (i < M ? '\t' : '\n'));
}
fclose(fo);
--- /dev/null
+#ifndef BC_AUX_H_
+#define BC_AUX_H_
+
+#include<map>
+
+#include <stdint.h>
+#include "sam/bam.h"
+
+struct SingleEndT {
+ bam1_t *b;
+
+ SingleEndT(bam1_t *b) {
+ this->b = b;
+ }
+
+ int getSign(bool value) const { return value ? -1 : 1; }
+
+ int compare(const SingleEndT& o) const {
+ int strand1, strand2;
+ uint32_t *p1, *p2;
+
+ if (b->core.tid != o.b->core.tid) return getSign(b->core.tid < o.b->core.tid);
+ if (b->core.pos != o.b->core.pos) return getSign(b->core.pos < o.b->core.pos);
+ strand1 = b->core.flag & 0x0010; strand2 = o.b->core.flag & 0x0010;
+ if (strand1 != strand2) return getSign(strand1 < strand2);
+ if (b->core.n_cigar != o.b->core.n_cigar) return getSign(b->core.n_cigar < o.b->core.n_cigar);
+ p1 = bam1_cigar(b); p2 = bam1_cigar(o.b);
+ for (int i = 0; i < (int)b->core.n_cigar; i++) {
+ if (*p1 != *p2) return getSign(*p1 < *p2);
+ ++p1; ++p2;
+ }
+
+ return 0;
+ }
+
+ bool operator< (const SingleEndT& o) const {
+ return compare(o) < 0;
+ }
+};
+
+struct PairedEndT {
+ SingleEndT mate1, mate2;
+
+ PairedEndT(const SingleEndT& mate1, const SingleEndT& mate2) : mate1(mate1), mate2(mate2) {
+ }
+
+ bool operator< (const PairedEndT& o) const {
+ int value = mate1.compare(o.mate1);
+ return value < 0 || value == 0 && mate2 < o.mate2;
+ }
+};
+
+class CollapseMap {
+public:
+ CollapseMap() { isPaired = false; smap.clear(); pmap.clear(); }
+
+ void init(bool isPaired) {
+ this->isPaired = isPaired;
+ isPaired ? pmap.clear() : smap.clear();
+ }
+
+ void insert(bam1_t *b, bam1_t *b2, float prb) {
+ if (!isPaired) {
+ smapIter = smap.find(SingleEndT(b));
+ if (smapIter == smap.end()) { smap[SingleEndT(bam_dup1(b))] = prb; }
+ else smapIter->second += prb;
+ }
+ else {
+ pmapIter = pmap.find(PairedEndT(SingleEndT(b), SingleEndT(b2)));
+ if (pmapIter == pmap.end()) { pmap[PairedEndT(SingleEndT(bam_dup1(b)), SingleEndT(bam_dup1(b2)))] = prb; }
+ else pmapIter->second += prb;
+ }
+ }
+
+ //once this function is called, "insert" cannot be called anymore
+ bool empty(bool& par) {
+ bool value;
+
+ par = isPaired;
+ if (!isPaired) { value = smap.empty(); smapIter = smap.begin(); }
+ else { value = pmap.empty(); pmapIter = pmap.begin(); }
+
+ return value;
+ }
+
+ bool next(bam1_t*& b, bam1_t*& b2, float& prb) {
+ bool value;
+
+ if (!isPaired) {
+ value = smapIter != smap.end();
+ if (value) {
+ b = smapIter->first.b;
+ prb = smapIter->second;
+ smapIter++;
+ }
+ }
+ else {
+ value = pmapIter != pmap.end();
+ if (value) {
+ b = pmapIter->first.mate1.b;
+ b2 = pmapIter->first.mate2.b;
+ prb = pmapIter->second;
+ pmapIter++;
+ }
+ }
+
+ return value;
+ }
+
+private:
+ bool isPaired;
+
+ std::map<SingleEndT, float> smap;
+ std::map<SingleEndT, float>::iterator smapIter;
+
+ std::map<PairedEndT, float> pmap;
+ std::map<PairedEndT, float>::iterator pmapIter;
+};
+
+#endif /* BC_AUX_H_ */
--- /dev/null
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<vector>
+
+#include <stdint.h>
+#include "sam/bam.h"
+#include "sam/sam.h"
+
+using namespace std;
+
+string cqname;
+samfile_t *in, *out;
+bam1_t *b;
+vector<bam1_t*> arr;
+bool unaligned;
+
+void output() {
+ if (unaligned || arr.size() == 0) return;
+ bool isPaired = (arr[0]->core.flag & 0x0001);
+ if (isPaired && arr.size() != 2 || !isPaired && arr.size() != 1) return;
+ for (int i = 0; i < (int)arr.size(); i++) samwrite(out, arr[i]);
+}
+
+int main(int argc, char* argv[]) {
+ if (argc != 3) {
+ printf("Usage: rsem-get-unique unsorted_transcript_bam_input bam_output\n");
+ exit(-1);
+ }
+
+ in = samopen(argv[1], "rb", NULL);
+ assert(in != 0);
+ out = samopen(argv[2], "wb", in->header);
+ assert(out != 0);
+
+ int cnt = 0;
+
+ cqname = "";
+ arr.clear();
+ b = bam_init1();
+ unaligned = false;
+
+ while (samread(in, b) >= 0) {
+ if (cqname != bam1_qname(b)) {
+ output();
+ cqname = bam1_qname(b);
+ for (int i = 0; i < (int)arr.size(); i++) bam_destroy1(arr[i]);
+ arr.clear();
+ unaligned = false;
+ }
+
+ unaligned = unaligned || (b->core.flag & 0x0004);
+ arr.push_back(bam_dup1(b));
+
+ ++cnt;
+ if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
+ }
+
+ if (cnt >= 1000000) printf("\n");
+
+ output();
+
+ bam_destroy1(b);
+ samclose(in);
+ samclose(out);
+
+ printf("done!\n");
+
+ return 0;
+}
CC = g++
-#LFLAGS = -Wall -O3 -ffast-math
CFLAGS = -Wall -c -I.
COFLAGS = -Wall -O3 -ffast-math -c -I.
-PROGRAMS = rsem-tbam2gbam rsem-bam2wig rsem-bam2readdepth rsem-build-read-index rsem-run-em rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-parse-alignments rsem-preref rsem-simulate-reads rsem-run-gibbs rsem-calculate-credibility-intervals
+PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth
all : build-sam $(PROGRAMS)
BamConverter.h : utils.h sam/sam.h sam/bam.h sam_rsem_aux.h sam_rsem_cvt.h bc_aux.h Transcript.h Transcripts.h
-rsem-tbam2gbam : utils.h Transcripts.h Transcript.h bc_aux.h BamConverter.h sam/sam.h sam/bam.h sam/libbam.a sam_rsem_aux.h sam_rsem_cvt.h tbam2gbam.cpp
+rsem-tbam2gbam : utils.h Transcripts.h Transcript.h bc_aux.h BamConverter.h sam/sam.h sam/bam.h sam/libbam.a sam_rsem_aux.h sam_rsem_cvt.h tbam2gbam.cpp sam/libbam.a
$(CC) -O3 -Wall tbam2gbam.cpp sam/libbam.a -lz -o $@
rsem-bam2wig : wiggle.h wiggle.o sam/libbam.a bam2wig.cpp
calcCI.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h calcCI.cpp boost/random.hpp
$(CC) $(COFLAGS) calcCI.cpp
+rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a
+ $(CC) -O3 -Wall getUnique.cpp sam/libbam.a -lz -o $@
+
clean:
rm -f *.o *~ $(PROGRAMS)
cd sam ; ${MAKE} clean
if ($status != 0) {
my $errmsg;
if (scalar(@_) > 1) { $errmsg = $_[1]; }
- else { $errmsg = "$command failed! Plase check if you provide correct parameters/options for the pipeline!"; }
+ else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
print $errmsg."\n";
exit(-1);
}
sub collectResults {
my $local_status;
my ($inpF, $outF);
- my (@results, @comment) = ();
+ my (@results, @ids) = ();
my $line;
my $cnt;
++$cnt;
chomp($line);
my @local_arr = split(/\t/, $line);
- if ($cnt == 4) { @comment = @local_arr; }
+ if ($cnt == 4) { @ids = @local_arr; }
else { push(@results, \@local_arr); }
}
- push(@results, \@comment);
+ push(@results, \@ids);
close(INPUT);
$local_status = open(OUTPUT, ">$outF");
means the lower bound of the credibility intervals, ci_upper_bound(u)
means the upper bound of the credibility intervals. So the credibility
interval is [l, u]. 'transcript_id_list' is a space-separated list of
-transcript_ids belonging to the gene.
+transcript_ids belonging to the gene. If no gene information is
+provided, this file has the same content as
+'sample_name.isoforms.results'.
=item B<sample_name.isoforms.results>
File containing isoform level expression values. The format of each
line in this file is:
-transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] other_attributes
+transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id
-Fields are separated by the tab character. 'other_attributes' are all
-other attributes after attribute 'transcript_id' field in the GTF
-file. If no other attributes are given or no GTF file is provided in
-'rsem-prepare-reference', there will be no tab after the
-tau_value field.
+Fields are separated by the tab character. 'gene_id' is the gene_id of
+the gene which this transcript belongs to. If no gene information is
+provided, 'gene_id' and 'transcript_id' are the same.
=item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
0.5)), where w is the posterior probability of that alignment being
the true mapping of a read. In addition, RSEM pads a new tag
ZW:f:value, where value is a single precision floating number
-representing the posterior probability.
+representing the posterior probability. If an alignment is spliced, a
+XS:A:value tag is also added, where value is either '+' or '-'
+indicating the strand of the transcript it aligns to.
'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
sorted BAM file and indices generated by samtools (included in RSEM package).
--- /dev/null
+#!/usr/bin/env Rscript
+
+nrow_per_page <- 3 # if input_list is composed of transcript ids
+ncol_per_page <- 2 # if input_list is composed of transcript ids
+num_plots_per_page <- nrow_per_page * ncol_per_page # if input_list is composed of transcript ids
+
+
+exit_with_error <- function(errmsg) {
+ cat(errmsg, "\n", sep = "", file = stderr())
+ quit(save = "no", status = 1)
+}
+
+
+args <- commandArgs(TRUE)
+if (length(args) != 5)
+ exit_with_error("Usage: rsem-gen-transcript-plots sample_name input_list is_gene show_uniq output_plot_file")
+
+sample_name <- args[1]
+input_list <- args[2]
+is_gene <- args[3]
+show_uniq <- args[4]
+output_plot_file <- args[5]
+
+
+
+load_readdepth_file <- function(filename) {
+ data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE)
+ nrow <- dim(data)[1]
+ readdepth <- list()
+ for (i in 1:nrow) {
+ readdepth[[data[i, 1]]] <- data[i, c(2, 3)]
+ }
+ readdepth
+}
+
+build_t2gmap <- function(filename) {
+ data <- read.table(file = filename, sep = "\t", stringsAsFactors = FALSE)
+ t2gmap <- list()
+
+ nrow <- dim(data)[1]
+ ncol <- dim(data)[2]
+
+ gene_id <- ""
+ tids <- c()
+ for (i in 1:nrow) {
+ if (gene_id != data[i, ncol]) {
+ if (gene_id != "") {
+ t2gmap[[gene_id]] <- tids
+ }
+ gene_id <- data[i, ncol]
+ tids <- c()
+ }
+ tids <- c(tids, data[i, 1])
+ }
+ if (gene_id != "") t2gmap[[gene_id]] <- tids
+
+ t2gmap
+}
+
+generate_a_page <- function(tids, gene_id = NULL) {
+ n <- length(tids)
+ ncol <- ifelse(is_gene, floor(sqrt(n)), ncol_per_page)
+ nrow <- ifelse(is_gene, ceiling(n / ncol), nrow_per_page)
+
+ par(mfrow = c(nrow, ncol), mar = c(1, 1, 2, 1))
+ if (is_gene) par(oma = c(0, 0, 3, 0))
+
+ for (i in 1:n) {
+ vec <- readdepth[[tids[i]]]
+ if (is.null(vec)) exit_with_error(paste("Cannot find transcript", tids[i], sep = ""))
+ wiggle <- ifelse(vec[2] == "NA", rep(0, vec[1]), as.numeric(unlist(strsplit(vec[2], split = " "))))
+ len <- length(wiggle)
+ if (!show_uniq) {
+ plot(wiggle, type = 'h')
+ }
+ else {
+ vec <- readdepth_uniq[[tids[i]]]
+ stopifnot(!is.null(vec))
+ wiggle_uniq <- ifelse(vec[2] == "NA", rep(0, vec[1]), as.numeric(unlist(strsplit(vec[2], split = " "))))
+ stopifnot(len == length(wiggle_uniq), len == sum(wiggle >= wiggle_uniq))
+ heights <- rbind(wiggle_uniq, wiggle - wiggle_uniq)
+ barplot(heights, space = 0, border = NA, names.arg = 1:len)
+ }
+ title(main = tids[i], xlab = "Position in transcript", ylab = "Read depth")
+ }
+
+ if (is_gene) mtext(gene_id, outer = TRUE, line = 1)
+}
+
+readdepth <- load_readdepth_file(paste(sample_name, ".transcript.readdepth", sep = ""))
+
+if (show_uniq) {
+ readdepth_uniq <- load_readdepth_file(paste(sample_name, ".uniq.transcript.readdepth", sep = ""))
+}
+
+ids <- scan(file = input_list, what = "", sep = "\n")
+
+if (is_gene) {
+ t2gmap <- build_t2gmap(paste(sample_name, ".isoforms.results", sep = ""))
+}
+
+pdf(output_plot_file)
+
+if (!is_gene) {
+ n <- length(ids)
+ ub <- (n - 1) %/% num_plots_per_page + 1
+ for (i in 1:ub) {
+ fr <- (i - 1) * num_plots_per_page + 1
+ to <- min(i * num_plots_per_page, n)
+ generate_a_page(ids[fr:to])
+ }
+}
+else {
+ for (gene_id in ids) {
+ if (is.null(t2gmap[[gene_id]])) exit_with_error(paste("Cannot find gene", gene_id, sep = ""))
+ generate_a_page(t2gmap[[gene_id]], gene_id)
+ }
+}
+
+dev.off.output <- dev.off()
+
+
+
+
--- /dev/null
+#!/usr/bin/perl
+
+use Getopt::Long;
+use Pod::Usage;
+use File::Basename;
+use strict;
+
+my $gene_list = 0; # default is 0, means input is a transcript list; 1 means input is a gene list
+my $show_unique = 0; # 0, default value, means do not show unique transcript wiggles; 1 means show unique transcript wiggles
+my $help = 0;
+
+GetOptions("gene-list" => \$gene_list,
+ "show-unique" => \$show_unique,
+ "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+pod2usage(-verbose => 2) if ($help == 1);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
+
+my ($fn, $dir, $suf) = fileparse($0);
+my $command = "";
+
+unless (-e "$ARGV[0].transcript.readdepth") {
+ $command = $dir."rsem-bam2readdepth $ARGV[0].transcript.sorted.bam > $ARGV[0].transcript.readdepth";
+ &runCommand($command);
+}
+
+if ($show_unique) {
+ unless (-e "$ARGV[0].uniq.transcript.bam") {
+ $command = $dir."rsem-get-unique $ARGV[0].transcript.bam $ARGV[0].uniq.transcript.bam";
+ &runCommand($command);
+ }
+ unless (-e "$ARGV[0].uniq.transcript.sorted.bam") {
+ $command = $dir."sam/samtools sort $ARGV[0].uniq.transcript.bam $ARGV[0].uniq.transcript.sorted";
+ &runCommand($command);
+ }
+ unless (-e "$ARGV[0].uniq.transcript.readdepth") {
+ $command = $dir."rsem-bam2readdepth $ARGV[0].uniq.transcript.sorted.bam > $ARGV[0].uniq.transcript.readdepth";
+ &runCommand($command);
+ }
+}
+
+$command = $dir."rsem-gen-transcript-plots $ARGV[0] $ARGV[1] $gene_list $show_unique $ARGV[2]";
+&runCommand($command);
+
+# command, {err_msg}
+sub runCommand {
+ print $_[0]."\n";
+ my $status = system($_[0]);
+ if ($status != 0) {
+ my $errmsg;
+ if (scalar(@_) > 1) { $errmsg = $_[1]; }
+ else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
+ print $errmsg."\n";
+ exit(-1);
+ }
+ print "\n";
+}
+
+__END__
+
+=head1 NAME
+
+rsem-plot-transcript-wiggles
+
+=head1 SYNOPSIS
+
+=over
+
+ rsem-plot-transcript-wiggles [options] sample_name input_list output_plot_file
+
+=back
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<sample_name>
+
+blablabla
+
+=item B<input_list>
+
+blablabla
+
+=item B<output_plot_file>
+
+blablabla
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<--gene-list>
+
+blablabla
+
+=item B<--show-unique>
+
+blablabla
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+blablabla
+
+=head1 OUTPUT
+
+blablabla
+
+=head1 EXAMPLES
+
+=cut
if ($status != 0) {
my $errmsg;
if (scalar(@_) > 1) { $errmsg = $_[1]; }
- else { $errmsg = "$command failed! Plase check if you provide correct parameters/options for the pipeline!"; }
+ else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
print $errmsg."\n";
exit(-1);
}
--- /dev/null
+#ifndef SAM_RSEM_AUX_H_
+#define SAM_RSEM_AUX_H_
+
+#include<cstdio>
+#include<cstring>
+#include<stdint.h>
+
+#include "sam/bam.h"
+
+// dwt: duplicate without text
+bam_header_t *bam_header_dwt(const bam_header_t *ori_h)
+{
+ bam_header_t *h;
+
+ h = bam_header_init();
+ h->n_targets = ori_h->n_targets;
+ h->target_len = (uint32_t*)calloc(h->n_targets, 4);
+ h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
+ for (int i = 0; i < h->n_targets; i++) {
+ h->target_len[i] = ori_h->target_len[i];
+ h->target_name[i] = strdup(ori_h->target_name[i]);
+ }
+
+ return h;
+}
+
+void append_header_text(bam_header_t *header, const char* text, int len)
+{
+ int x = header->l_text + 1;
+ int y = header->l_text + len + 1; // 1 byte null
+ if (text == 0) return;
+ kroundup32(x);
+ kroundup32(y);
+ if (x < y) header->text = (char*)realloc(header->text, y);
+ strncpy(header->text + header->l_text, text, len); // we cannot use strcpy() here.
+ header->l_text += len;
+ header->text[header->l_text] = 0;
+}
+
+void expand_data_size(bam1_t *b) {
+ if (b->m_data < b->data_len) {
+ b->m_data = b->data_len;
+ kroundup32(b->m_data);
+ b->data = (uint8_t*)realloc(b->data, b->m_data);
+ }
+}
+
+#endif /* SAM_RSEM_AUX_H_ */
--- /dev/null
+#ifndef SAM_RSEM_CVT_H_
+#define SAM_RSEM_CVT_H_
+
+#include<vector>
+
+#include "stdint.h"
+#include "sam/bam.h"
+
+#include "Transcript.h"
+#include "Transcripts.h"
+
+uint8_t getMAPQ(double val) {
+ double err = 1.0 - val;
+ if (err <= 1e-10) return 100;
+ return (uint8_t)(-10 * log10(err) + .5); // round it
+}
+
+//convert transcript coordinate to chromosome coordinate and generate CIGAR string
+void tr2chr(const Transcript& transcript, int sp, int ep, int& pos, int& n_cigar, std::vector<uint32_t>& data) {
+ int length = transcript.getLength();
+ char strand = transcript.getStrand();
+ const std::vector<Interval>& structure = transcript.getStructure();
+
+ int s, i;
+ int oldlen, curlen;
+
+ uint32_t operation;
+
+ n_cigar = 0;
+ s = structure.size();
+
+ if (strand == '-') {
+ int tmp = sp;
+ sp = length - ep + 1;
+ ep = length - tmp + 1;
+ }
+
+ if (ep < 1 || sp > length) { // a read which align to polyA tails totally!
+ pos = (sp > length ? structure[s - 1].end : structure[0].start - 1); // 0 based
+
+ n_cigar = 1;
+ operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
+ data.push_back(operation);
+
+ return;
+ }
+
+ if (sp < 1) {
+ n_cigar++;
+ operation = (1 - sp) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
+ data.push_back(operation);
+ sp = 1;
+ }
+
+ oldlen = curlen = 0;
+
+ for (i = 0; i < s; i++) {
+ oldlen = curlen;
+ curlen += structure[i].end - structure[i].start + 1;
+ if (curlen >= sp) break;
+ }
+ assert(i < s);
+ pos = structure[i].start + (sp - oldlen - 1) - 1; // 0 based
+
+ while (curlen < ep && i < s) {
+ n_cigar++;
+ operation = (curlen - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH;
+ data.push_back(operation);
+ ++i;
+ if (i >= s) continue;
+ n_cigar++;
+ operation = (structure[i].start - structure[i - 1].end - 1) << BAM_CIGAR_SHIFT | BAM_CREF_SKIP;
+ data.push_back(operation);
+
+ oldlen = curlen;
+ sp = oldlen + 1;
+ curlen += structure[i].end - structure[i].start + 1;
+ }
+
+ if (i >= s) {
+ n_cigar++;
+ operation = (ep - length) << BAM_CIGAR_SHIFT | BAM_CINS; //BAM_CSOFT_CLIP;
+ data.push_back(operation);
+ }
+ else {
+ n_cigar++;
+ operation = (ep - sp + 1) << BAM_CIGAR_SHIFT | BAM_CMATCH;
+ data.push_back(operation);
+ }
+}
+
+#endif /* SAM_RSEM_CVT_H_ */
--- /dev/null
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+
+#include "utils.h"
+#include "Transcripts.h"
+#include "BamConverter.h"
+
+using namespace std;
+
+char tiF[STRLEN], chr_list[STRLEN];
+Transcripts transcripts;
+
+int main(int argc, char* argv[]) {
+ if (argc != 4) {
+ printf("Usage: rsem-tbam2gbam reference_name unsorted_transcript_bam_input genome_bam_output\n");
+ exit(-1);
+ }
+
+ sprintf(tiF, "%s.ti", argv[1]);
+ sprintf(chr_list, "%s.chrlist", argv[1]);
+ transcripts.readFrom(tiF);
+
+ printf("Start converting:\n");
+ BamConverter bc(argv[2], argv[3], chr_list, transcripts);
+ bc.process();
+ printf("Genome bam file is generated!\n");
+
+ return 0;
+}
+
WiggleProcessor& processor) {
samfile_t *bam_in = samopen(bam_filename.c_str(), "rb", NULL);
if (bam_in == 0) { fprintf(stderr, "Cannot open %s!\n", bam_filename.c_str()); exit(-1); }
- //assert(bam_in != 0);
+
+ bam_header_t *header = bam_in->header;
+ bool *used = new bool[header->n_targets];
+ memset(used, 0, sizeof(bool) * header->n_targets);
int cur_tid = -1; //current tid;
int cnt = 0;
if (b->core.flag & 0x0004) continue;
if (b->core.tid != cur_tid) {
- if (cur_tid >= 0) processor.process(wiggle);
+ if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); }
cur_tid = b->core.tid;
- wiggle.name = bam_in->header->target_name[cur_tid];
- wiggle.read_depth.assign(bam_in->header->target_len[cur_tid], 0.0);
+ wiggle.name = header->target_name[cur_tid];
+ wiggle.length = header->target_len[cur_tid];
+ wiggle.read_depth.assign(wiggle.length, 0.0);
}
add_bam_record_to_wiggle(b, wiggle);
++cnt;
if (cnt % 1000000 == 0) fprintf(stderr, "%d FIN\n", cnt);
}
- if (cur_tid >= 0) processor.process(wiggle);
+ if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); }
+
+ for (int32_t i = 0; i < header->n_targets; i++)
+ if (!used[i]) {
+ wiggle.name = header->target_name[i];
+ wiggle.length = header->target_len[i];
+ wiggle.read_depth.clear();
+ processor.process(wiggle);
+ }
samclose(bam_in);
bam_destroy1(b);
+ delete[] used;
}
UCSCWiggleTrackWriter::UCSCWiggleTrackWriter(const std::string& output_filename,
void UCSCWiggleTrackWriter::process(const Wiggle& wiggle) {
int sp, ep;
+
+ if (wiggle.read_depth.empty()) return;
sp = ep = -1;
- for (size_t i = 0; i < wiggle.read_depth.size(); i++) {
+ for (size_t i = 0; i < wiggle.length; i++) {
if (wiggle.read_depth[i] > 0) {
ep = i;
}
}
void ReadDepthWriter::process(const Wiggle& wiggle) {
+
stream_ << wiggle.name << '\t'
- << wiggle.read_depth.size() << '\t';
- for (size_t i = 0; i < wiggle.read_depth.size(); ++i) {
+ << wiggle.length << '\t';
+
+ if (wiggle.read_depth.empty()) { stream_ << "NA\n"; return; }
+
+ for (size_t i = 0; i < wiggle.length; ++i) {
if (i > 0) stream_ << ' ';
stream_ << wiggle.read_depth[i];
}
struct Wiggle {
std::string name;
std::vector<float> read_depth;
+ size_t length;
};
class WiggleProcessor {