From 8ca5b7c2fb57bc523431c1e37d5ab9337eccbc37 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Tue, 11 Sep 2012 04:07:43 -0500 Subject: [PATCH] Added a script, 'rsem-find-DE', to run EBSeq automatically --- EBSeq/makefile | 2 +- README.md | 50 ++++++++++++++++++++++----- WHAT_IS_NEW | 8 +++++ makefile | 5 ++- rsem-find-DE | 69 +++++++++++++++++++++++++++++++++++++ rsem-form-counts-matrix | 42 ----------------------- rsem-generate-data-matrix | 72 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 196 insertions(+), 52 deletions(-) create mode 100755 rsem-find-DE delete mode 100755 rsem-form-counts-matrix create mode 100755 rsem-generate-data-matrix diff --git a/EBSeq/makefile b/EBSeq/makefile index aae42c1..f97c12d 100644 --- a/EBSeq/makefile +++ b/EBSeq/makefile @@ -13,4 +13,4 @@ rsem-for-ebseq-calculate-clustering-info : calcClusteringInfo.cpp $(CC) -O3 -Wall calcClusteringInfo.cpp -o $@ clean : - rm -rf $(PROGRAMS) + rm -rf $(PROGRAMS) *~ diff --git a/README.md b/README.md index 8d3c15f..459616f 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Table of Contents * [Simulation](#simulation) * [Generate Transcript-to-Gene-Map from Trinity Output](#gen_trinity) * [Differential Expression Analysis](#de) +* [Authors](#authors) * [Acknowledgements](#acknowledgements) * [License](#license) @@ -300,13 +301,12 @@ named 'EBSeq'. For more information about EBSeq (including the paper describing their method), please visit EBSeq -website. You can also find a local version of vignette under -'EBSeq/inst/doc/EBSeq_Vignette.pdf'. +website. EBSeq requires gene-isoform relationship for its isoform DE detection. However, for de novo assembled transcriptome, it is hard to obtain an accurate gene-isoform relationship. Instead, RSEM provides a -script 'rsem-generate-ngvector', which clusters isoforms based on +script 'rsem-generate-ngvector', which clusters transcripts based on measures directly relating to read mappaing ambiguity. First, it calcualtes the 'unmappability' of each transcript. The 'unmappability' of a transcript is the ratio between the number of k mers with at @@ -336,20 +336,54 @@ section 3.2.5 (Page 10) of EBSeq's vignette: IsoEBres=EBTest(Data=IsoMat, NgVector=NgVec, ...) For users' convenience, RSEM also provides a script -'rsem-form-counts-matrix' to extract input matrix from expression +'rsem-generate-data-matrix' to extract input matrix from expression results: - rsem-form-counts-matrix sampleA.[genes/isoforms].results sampleB.[genes/isoforms].results ... > output_name.counts.matrix + rsem-generate-data-matrix sampleA.[genes/isoforms].results sampleB.[genes/isoforms].results ... > output_name.counts.matrix The results files are required to be either all gene level results or all isoform level results. You can load the matrix into R by - IsoMat <- read.table(file="output_name.counts.matrix") + IsoMat <- data.matrix(read.table(file="output_name.counts.matrix")) before running function 'EBTest'. -Questions related to EBSeq should be sent to Ning Leng. - +At last, RSEM provides a R script, 'rsem-find-DE', which run EBSeq for +you. + +Usage: + + rsem-find-DE data_matrix_file [--ngvector ngvector_file] number_sample_condition1 FDR_rate output_file + +This script calls EBSeq to find differentially expressed genes/transcripts in two conditions. + +data_matrix_file: m by n matrix containing expected counts, m is the number of transcripts/genes, n is the number of total samples. +[--ngvector ngvector_file]: optional field. 'ngvector_file' is calculated by 'rsem-generate-ngvector'. Having this field is recommended for transcript data. +number_sample_condition1: the number of samples in condition 1. A condition's samples must be adjacent. The left group of samples are defined as condition 1. +FDR_rate: false discovery rate. +output_file: the output file. + +The results are written as a matrix with row and column names. The row names are the differentially expressed transcripts'/genes' ids. The column names are 'PPEE', 'PPDE', 'PostFC' and 'RealFC'. + +PPEE: posterior probability of being equally expressed. +PPDE: posterior probability of being differentially expressed. +PostFC: posterior fold change (condition 1 over condition2). +RealFC: real fold change (condition 1 over condition2). + +To get the above usage information, type + + rsem-find-DE + +Note: any wrong parameter setting will lead 'rsem-find-DE' to output +usage information and halt. + +Questions related to EBSeq should +be sent to Ning Leng. + +## Authors + +RSEM is developed by Bo Li, with substaintial technical input from Colin Dewey. + ## Acknowledgements RSEM uses the [Boost C++](http://www.boost.org) and diff --git a/WHAT_IS_NEW b/WHAT_IS_NEW index 244066c..073de14 100644 --- a/WHAT_IS_NEW +++ b/WHAT_IS_NEW @@ -1,3 +1,11 @@ +RSEM v1.2.0 + +- Changed output formats, added FPKM field etc. +- Fixed a bug related to paired-end reads data +- Added a script to run EBSeq automatically and updated EBSeq to v1.1.3 + +-------------------------------------------------------------------------------------------- + RSEM v1.1.21 - Removed optional field "Z0:A:!" in the BAM outputs diff --git a/makefile b/makefile index 8d969e7..3ea1b58 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,7 @@ CC = g++ CFLAGS = -Wall -c -I. COFLAGS = -Wall -O3 -ffast-math -c -I. -PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth rsem-sam-validator rsem-scan-for-paired-end-reads +PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth rsem-sam-validator rsem-scan-for-paired-end-reads ebseq all : $(PROGRAMS) @@ -135,6 +135,9 @@ rsem-sam-validator : sam/bam.h sam/sam.h my_assert.h samValidator.cpp sam/libbam rsem-scan-for-paired-end-reads : sam/bam.h sam/sam.h my_assert.h scanForPairedEndReads.cpp sam/libbam.a $(CC) -O3 -Wall scanForPairedEndReads.cpp sam/libbam.a -lz -o $@ +ebseq : + cd EBSeq ; ${MAKE} all + clean : rm -f *.o *~ $(PROGRAMS) cd sam ; ${MAKE} clean diff --git a/rsem-find-DE b/rsem-find-DE new file mode 100755 index 0000000..e9d25f7 --- /dev/null +++ b/rsem-find-DE @@ -0,0 +1,69 @@ +#!/usr/bin/env Rscript + +printUsage <- function() { + cat("Usage: rsem-find-DE data_matrix_file [--ngvector ngvector_file] number_sample_condition1 FDR_rate output_file\n\n") + cat("This script calls EBSeq to find differentially expressed genes/transcripts in two conditions.\n\n") + cat("data_matrix_file: m by n matrix containing expected counts, m is the number of transcripts/genes, n is the number of total samples.\n") + cat("[--ngvector ngvector_file]: optional field. 'ngvector_file' is calculated by 'rsem-generate-ngvector'. Having this field is recommended for transcript data.\n") + cat("number_sample_condition1: the number of samples in condition 1. A condition's samples must be adjacent. The left group of samples are defined as condition 1.\n") + cat("FDR_rate: false discovery rate.\n") + cat("output_file: the output file.\n\n") + cat("The results are written as a matrix with row and column names. The row names are the differentially expressed transcripts'/genes' ids. The column names are 'PPEE', 'PPDE', 'PostFC' and 'RealFC'.\n\n") + cat("PPEE: posterior probability of being equally expressed.\n") + cat("PPDE: posterior probability of being differentially expressed.\n") + cat("PostFC: posterior fold change (condition 1 over condition2).\n") + cat("RealFC: real fold change (condition 1 over condition2).\n") + q(status = 1) +} + +argv <- commandArgs(FALSE) +path <- dirname(sub("--file=", "", argv[grep("--file", argv)])) +path <- paste(path, "EBSeq", sep = "/") +start_pos <- grep("--args", argv) + 1 +end_pos <- length(argv) +if (start_pos <= end_pos) argv <- argv[start_pos : end_pos] else argv <- NULL + +ng_pos <- grep("--ngvector", argv) +if (length(ng_pos) > 1 || length(ng_pos) == 0 && length(argv) != 4 || length(ng_pos) == 1 && length(argv) != 6) printUsage() +ngvector_file <- NULL +if (length(ng_pos) == 1) { + if (ng_pos == length(argv)) printUsage() + ngvector_file <- argv[ng_pos + 1] + argv <- argv[-c(ng_pos, ng_pos + 1)] +} + +data_matrix_file <- argv[1] +num_con1 <- as.numeric(argv[2]) +fdr <- as.numeric(argv[3]) +output_file <- argv[4] + +library(blockmodeling, lib.loc = path) +library(EBSeq, lib.loc = path) + +DataMat <- data.matrix(read.table(data_matrix_file)) +n <- dim(DataMat)[2] +conditions <- as.factor(rep(c("C1", "C2"), times = c(num_con1, n - num_con1))) +Sizes <- MedianNorm(DataMat) +ngvector <- NULL +if (!is.null(ngvector_file)) { + ngvector <- as.vector(data.matrix(read.table(ngvector_file))) +} + + +EBOut <- NULL +if (is.null(ngvector)) { + EBOut <- EBTest(Data = DataMat, Conditions = conditions, sizeFactors = Sizes, maxround = 5) +} else { + EBOut <- EBTest(Data = DataMat, NgVector = ngvector, Conditions = conditions, sizeFactors = Sizes, maxround = 5) +} +stopifnot(!is.null(EBOut)) + +PP <- GetPPMat(EBOut) +thre <- crit_fun(PP[, "PPEE"], fdr) +DEfound <- rownames(PP)[which(PP[, "PPDE"] >= thre)] + +fc_res <- PostFC(EBOut) + +results <- cbind(PP[DEfound, ], fc_res$GenePostFC[DEfound], fc_res$GeneRealFC[DEfound]) +colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC") +write.table(results, file = output_file) diff --git a/rsem-form-counts-matrix b/rsem-form-counts-matrix deleted file mode 100755 index d275d6e..0000000 --- a/rsem-form-counts-matrix +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -use strict; - -if (scalar(@ARGV) == 0) { - print "Usage: rsem-form-counts-matrix sampleA.[genes/isoforms].results sampleB.[genes/isoforms].results ... > output_name.counts.matrix\n"; - print "Results files should be either all .genes.results or all .isoforms.results.\n"; - exit(-1); -} - -my $offsite = 4; # for new file formats - -my $line; -my $n = scalar(@ARGV); -my $M = -1; -my @matrix = (); - -for (my $i = 0; $i < $n; $i++) { - my @sample = (); - open(INPUT, $ARGV[$i]); - while ($line = ) { - chomp($line); - my @fields = split(/\t/, $line); - push(@sample, $fields[$offsite]); - } - close(INPUT); - if (scalar(@sample) == 0) { - print STDERR "No transcript is detected! Please check if $ARGV[$i] exists.\n"; - exit(-1); - } - if ($M < 0) { $M = scalar(@sample); } - elsif ($M != scalar(@sample)) { - print STDERR "Number of transcripts among samples are not equal!\n"; - exit(-1); - } - push(@matrix, \@sample); -} - -for (my $i = 0; $i < $M; $i++) { - for (my $j = 0; $j < $n - 1; $j++) { print "$matrix[$j][$i]\t"; } - print "$matrix[$n - 1][$i]\n"; -} diff --git a/rsem-generate-data-matrix b/rsem-generate-data-matrix new file mode 100755 index 0000000..951f415 --- /dev/null +++ b/rsem-generate-data-matrix @@ -0,0 +1,72 @@ +#!/usr/bin/perl + +use strict; + +if (scalar(@ARGV) == 0) { + print "Usage: rsem-generate-data-matrix sampleA.[genes/isoforms].results sampleB.[genes/isoforms].results ... > output_name.matrix\n"; + print "Results files should be either all .genes.results or all .isoforms.results.\n"; + exit(-1); +} + +my $offsite = 4; # for new file formats + +my $line; +my $n = scalar(@ARGV); +my $M = -1; +my @matrix = (); + +# 0, file_name; 1, reference of expected count array; 2, reference of transcript_id/gene_id array +sub loadData { + open(INPUT, $_[0]); + my $line = ; # The first line contains only column names + while ($line = ) { + chomp($line); + my @fields = split(/\t/, $line); + push(@{$_[2]}, $fields[0]); + push(@{$_[1]}, $fields[$offsite]); + } + close(INPUT); + + if (scalar(@{$_[1]}) == 0) { + print STDERR "Nothing is detected! $_[0] may not exist or is empty.\n"; + exit(-1); + } +} + +#0, M; 1, reference of @ids_arr; 2, reference of @ids +sub check { + my $size = $_[0]; + for (my $i = 0; $i < $size; $i++) { + if ($_[1]->[$i] ne $_[2]->[$i]) { + return 0; + } + } + return 1; +} + +my @ids_arr = (); + +for (my $i = 0; $i < $n; $i++) { + my (@ids, @ecs) = (); + &loadData($ARGV[$i], \@ecs, \@ids); + + if ($M < 0) { + $M = scalar(@ids); + @ids_arr = @ids; + } + elsif (!&check($M, \@ids_arr, \@ids)) { + print STDERR "Number of lines among samples are not equal!\n"; + exit(-1); + } + + @ecs = ($ARGV[$i], @ecs); + push(@matrix, \@ecs); +} + +@ids_arr = ("", @ids_arr); +@matrix = (\@ids_arr, @matrix); + +for (my $i = 0; $i <= $M; $i++) { + for (my $j = 0; $j < $n; $j++) { print "$matrix[$j][$i]\t"; } + print "$matrix[$n][$i]\n"; +} -- 2.39.2