Added support for DE analysis on multiple conditions via running EBSeq

author Bo Li <bli@cs.wisc.edu>

Mon, 2 Sep 2013 08:25:23 +0000 (03:25 -0500)

committer Bo Li <bli@cs.wisc.edu>

Mon, 2 Sep 2013 08:25:23 +0000 (03:25 -0500)
author Bo Li <bli@cs.wisc.edu>
Mon, 2 Sep 2013 08:25:23 +0000 (03:25 -0500)
committer Bo Li <bli@cs.wisc.edu>
Mon, 2 Sep 2013 08:25:23 +0000 (03:25 -0500)
diff --git a/EBSeq/install b/EBSeq/install

new file mode 100755 (executable)

index 0000000..2438adb
--- /dev/null
+++ b/EBSeq/install
@@ -0,0 +1,19 @@
+#!/usr/bin/env Rscript
+
+result <- suppressWarnings(tryCatch({
+                 library("EBSeq", lib.loc = ".")
+       }, error = function(err) {
+            tryCatch({
+               source("http://www.bioconductor.org/biocLite.R") 
+               biocLite("EBSeq", lib = ".")
+               library("EBSeq", lib.loc = ".")
+               }, error = function(err) {
+                    tryCatch({
+                        cat("Failed to install the latest version of EBSeq from Bioconductor! Try to install EBSeq v1.1.5 locally instead.\n")
+                        install.packages(c("blockmodeling_0.1.8.tar.gz", "EBSeq_1.1.5.tar.gz"), lib = ".", repos = NULL)
+                        library("EBSeq", lib.loc = ".") 
+                        }, error = function(err) { cat("Failed to install EBSeq v1.1.5 locally!\n") })
+                   })
+       }))
+
+
diff --git a/EBSeq/install.R b/EBSeq/install.R

deleted file mode 100755 (executable)

index 2438adb..0000000
--- a/EBSeq/install.R
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env Rscript
-
-result <- suppressWarnings(tryCatch({
-                 library("EBSeq", lib.loc = ".")
-       }, error = function(err) {
-            tryCatch({
-               source("http://www.bioconductor.org/biocLite.R") 
-               biocLite("EBSeq", lib = ".")
-               library("EBSeq", lib.loc = ".")
-               }, error = function(err) {
-                    tryCatch({
-                        cat("Failed to install the latest version of EBSeq from Bioconductor! Try to install EBSeq v1.1.5 locally instead.\n")
-                        install.packages(c("blockmodeling_0.1.8.tar.gz", "EBSeq_1.1.5.tar.gz"), lib = ".", repos = NULL)
-                        library("EBSeq", lib.loc = ".") 
-                        }, error = function(err) { cat("Failed to install EBSeq v1.1.5 locally!\n") })
-                   })
-       }))
-
-
diff --git a/EBSeq/makefile b/EBSeq/makefile

index 5bd86ffe94b28e8dd6af3ece46c0fe55333a0b10..bf25454731aeb30ec5d71eadead83eceb922023b 100644 (file)
--- a/EBSeq/makefile
+++ b/EBSeq/makefile
@@ -4,7 +4,7 @@ PROGRAMS = EBSeq rsem-for-ebseq-calculate-clustering-info
  all : $(PROGRAMS)
  
  EBSeq : 
-       ./install.R
+       ./install
  
  rsem-for-ebseq-calculate-clustering-info : calcClusteringInfo.cpp
         $(CC) -O3 -Wall calcClusteringInfo.cpp -o $@
diff --git a/EBSeq/rsem-for-ebseq-find-DE b/EBSeq/rsem-for-ebseq-find-DE

new file mode 100755 (executable)

index 0000000..07b9899
--- /dev/null
+++ b/EBSeq/rsem-for-ebseq-find-DE
@@ -0,0 +1,65 @@
+#!/usr/bin/env Rscript
+
+argv <- commandArgs(TRUE)
+if (length(argv) < 6) {
+  cat("Usage: rsem-for-ebseq-find-DE path ngvector_file data_matrix_file output_file number_of_replicate_for_condition_1 number_of_replicate_for_condition_2 ...\n")
+  q(status = 1)
+}
+
+path <- argv[1]
+ngvector_file <- argv[2]
+data_matrix_file <- argv[3]
+output_file <- argv[4]
+
+nc <- length(argv) - 4;
+num_reps <- as.numeric(argv[5:(5+nc-1)])
+
+library(EBSeq, lib.loc = path)
+
+DataMat <- data.matrix(read.table(data_matrix_file))
+n <- dim(DataMat)[2]
+if (sum(num_reps) != n) stop("Total number of replicates given does not match the number of columns from the data matrix!")
+
+conditions <- as.factor(rep(paste("C", 1:nc, sep=""), times = num_reps))
+Sizes <- MedianNorm(DataMat)
+ngvector <- NULL
+if (ngvector_file != "#") {
+  ngvector <- as.vector(data.matrix(read.table(ngvector_file)))
+  stopifnot(!is.null(ngvector))
+}
+
+if (nc == 2) {
+  EBOut <- NULL
+  EBOut <- EBTest(Data = DataMat, NgVector = ngvector, Conditions = conditions, sizeFactors = Sizes, maxround = 5)
+  stopifnot(!is.null(EBOut))
+
+  PP <- as.data.frame(GetPPMat(EBOut))
+  fc_res <- PostFC(EBOut)
+
+  results <- cbind(PP, fc_res$PostFC, fc_res$RealFC)
+  colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC")
+  results <- results[order(results[,"PPDE"], decreasing = TRUE),]
+  write.table(results, file = output_file, sep = "\t")
+  
+} else {
+  patterns <- GetPatterns(conditions)
+  eename <- rownames(patterns)[which(rowSums(patterns) == nc)]
+  stopifnot(length(eename) == 1)
+
+  MultiOut <- NULL
+  MultiOut <- EBMultiTest(Data = DataMat, NgVector = ngvector, Conditions = conditions, AllParti = patterns, sizeFactors = Sizes, maxround = 5)
+  stopifnot(!is.null(MultiOut))
+
+  MultiPP <- GetMultiPP(MultiOut)
+
+  PP <- as.data.frame(MultiPP$PP)
+  pos <- which(names(PP) == eename)
+  probs <- rowSums(PP[,-pos])
+
+  results <- cbind(PP, MultiPP$MAP[rownames(PP)], probs)
+  colnames(results) <- c(colnames(PP), "MAP", "PPDE")  
+  results <- results[order(results[,"PPDE"], decreasing = TRUE),]
+  write.table(results, file = output_file, sep = "\t")
+
+  write.table(MultiPP$Patterns, file = paste(output_file, ".pattern", sep = ""), sep = "\t")
+}
diff --git a/README.md b/README.md

index 7b9a4b83ca83a7ec87926af31cab8bf292a29e8c..b0fa97cd27c9831e9fb8876f28a5c3d725d6b1fc 100644 (file)
--- a/README.md
+++ b/README.md
@@ -307,19 +307,17 @@ Popular differential expression (DE) analysis tools such as edgeR and
  DESeq do not take variance due to read mapping uncertainty into
  consideration. Because read mapping ambiguity is prevalent among
  isoforms and de novo assembled transcripts, these tools are not ideal
-for DE detection in such conditions. 
-
-**EBSeq**, an empirical Bayesian DE analysis tool developed in
-UW-Madison, can take variance due to read mapping ambiguity into
-consideration by grouping isoforms with parent gene's number of
-isoforms. In addition, it is more robust to outliers. For more
-information about EBSeq (including the paper describing their method),
-please visit <a
+for DE detection in such conditions.
+
+EBSeq, an empirical Bayesian DE analysis tool developed in UW-Madison,
+can take variance due to read mapping ambiguity into consideration by
+grouping isoforms with parent gene's number of isoforms. In addition,
+it is more robust to outliers. For more information about EBSeq
+(including the paper describing their method), please visit <a
  href="http://www.biostat.wisc.edu/~ningleng/EBSeq_Package">EBSeq
  website</a>.
  
-RSEM includes the newest version of EBSeq in its folder
-named 'EBSeq'. To use it, first type
+RSEM includes EBSeq in its folder named 'EBSeq'. To use it, first type
  
      make ebseq
  
@@ -352,10 +350,9 @@ run 'rsem-generate-ngvector' first. Then load the resulting
  
      NgVec <- scan(file="output_name.ngvec", what=0, sep="\n")
  
-. After that, replace 'IsoNgTrun' with 'NgVec' in the second line of
-section 3.2.5 (Page 10) of EBSeq's vignette:
+. After that, set "NgVector = NgVec" for your differential expression
+test (either 'EBTest' or 'EBMultiTest').
  
-    IsoEBres=EBTest(Data=IsoMat, NgVector=NgVec, ...)
  
  For users' convenience, RSEM also provides a script
  'rsem-generate-data-matrix' to extract input matrix from expression
@@ -368,36 +365,27 @@ all isoform level results. You can load the matrix into R by
  
      IsoMat <- data.matrix(read.table(file="output_name.counts.matrix"))
  
-before running function 'EBTest'.
-
-At last, RSEM provides a R script, 'rsem-find-DE', which run EBSeq for
-you. 
-
-Usage: 
-
-    rsem-find-DE data_matrix_file [--ngvector ngvector_file] number_of_samples_in_condition_1 FDR_rate output_file
-
-This script calls EBSeq to find differentially expressed genes/transcripts in two conditions.
-
-data_matrix_file: m by n matrix containing expected counts, m is the number of transcripts/genes, n is the number of total samples.   
-[--ngvector ngvector_file]: optional field. 'ngvector_file' is calculated by 'rsem-generate-ngvector'. Having this field is recommended for transcript data.   
-number_of_samples_in_condition_1: the number of samples in condition 1. A condition's samples must be adjacent. The left group of samples are defined as condition 1.   
-FDR_rate: false discovery rate.   
-output_file: the output file. Three files will be generated: 'output_file', 'output_file.hard_threshold' and 'output_file.all'. The first file reports all DE genes/transcripts using a soft threshold (calculated by crit_func in EBSeq). The second file reports all DE genes/transcripts using a hard threshold (only report if PPEE <= fdr). The third file reports all genes/transcripts. The first file is recommended to be used as DE results because it generally contains more called genes/transcripts.   
+before running either 'EBTest' or 'EBMultiTest'.
  
-The results are written as a matrix with row and column names. The row names are the differentially expressed transcripts'/genes' ids. The column names are 'PPEE', 'PPDE', 'PostFC' and 'RealFC'.
+Lastly, RSEM provides two scripts, 'rsem-run-ebseq' and
+'rsem-control-fdr', to help users find differential expressed
+genes. First, 'rsem-run-ebseq' calls EBSeq to calculate related statistics
+for all genes/transcripts. Run 
  
-PPEE: posterior probability of being equally expressed.   
-PPDE: posterior probability of being differentially expressed.   
-PostFC: posterior fold change (condition 1 over condition2).   
-RealFC: real fold change (condition 1 over condition2).   
+    rsem-run-ebseq --help
  
-To get the above usage information, type 
+to get usage information or visit the [rsem-run-ebseq documentation
+page](http://deweylab.biostat.wisc.edu/rsem/rsem-run-ebseq.html). Second,
+'rsem-control-fdr' takes 'rsem-run-ebseq' 's result and reports called
+differentially expressed genes/transcripts by controlling the false
+discovery rate. Run
  
-    rsem-find-DE
+    rsem-control-fdr --help
  
-Note: any wrong parameter setting will lead 'rsem-find-DE' to output
-usage information and halt.
+to get usage information or visit the [rsem-control-fdr documentation
+page](http://deweylab.biostat.wisc.edu/rsem/rsem-control-fdr.html). These
+two scripts can perform DE analysis on either 2 conditions or multiple
+conditions.
  
  Questions related to EBSeq should
  be sent to <a href="mailto:nleng@wisc.edu">Ning Leng</a>.
diff --git a/convert-sam-for-rsem b/convert-sam-for-rsem

index eb34c543009049c80dd0f4526a10d0a3a3a86eee..ba75db8b6fd23ca22886dc07dc1db1ed94aada5c 100755 (executable)
--- a/convert-sam-for-rsem
+++ b/convert-sam-for-rsem
@@ -91,11 +91,7 @@ convert-sam-for-rsem
  
  =head1 SYNOPSIS
  
-=over
-
- convert-sam-for-rsem [options] <input.sam/input.bam> output_file_name
-
-=back
+convert-sam-for-rsem [options] <input.sam/input.bam> output_file_name
  
  =head1 ARGUMENTS
  
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index edb6fddf5458d3d80da38dc9996d586e902a8b9d..62339cb4aa41b1e43a5da611a8bb110f018cf399 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -354,14 +354,10 @@ rsem-calculate-expression
  
  =head1 SYNOPSIS
  
-=over
-
- rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name
- rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name
+ rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name 
+ rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name 
   rsem-calculate-expression [options] --sam/--bam [--paired-end] input reference_name sample_name
  
-=back
-
  =head1 ARGUMENTS
  
  =over
diff --git a/rsem-control-fdr b/rsem-control-fdr

new file mode 100755 (executable)

index 0000000..86cfb93
--- /dev/null
+++ b/rsem-control-fdr
@@ -0,0 +1,121 @@
+#!/usr/bin/perl
+
+use Getopt::Long;
+use Pod::Usage;
+use strict;
+
+my $hard = 0;
+my $soft = 0;
+my $help = 0;
+
+GetOptions("hard-threshold" => \$hard,
+          "soft-threshold" => \$soft,
+          "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+pod2usage(-verbose => 2) if ($help == 1);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
+pod2usage(-msg => "--hard-threshold and --soft-threshold cannot be set at the same time!", -exitval => 2, -verbose => 2) if ($hard && $soft);
+
+if ($hard == 0 && $soft == 0) { $hard = 1; }
+
+my $fdr = $ARGV[1];
+
+open(INPUT, "$ARGV[0]");
+open(OUTPUT, ">$ARGV[2]");
+
+my $header = <INPUT>;
+chomp($header);
+my @columns = split(/\t/, $header);
+
+my $pos = 0;
+while ($pos <= $#columns && $columns[$pos] ne "\"PPDE\"") { ++$pos; }
+if ($pos > $#columns) { print "Error: Cannot find column PPDE!\n"; exit(-1); }
+++$pos;
+
+print OUTPUT "$header\n";
+
+my ($n, $sum) = (0, 0);
+my $line = "";
+while($line = <INPUT>) {
+    chomp($line);
+    my @fields = split(/\t/, $line);
+    my $ppee = 1.0 - $fields[$pos];
+    if ($hard) {
+       if ($ppee > $fdr) { last; }
+       ++$n;
+       print OUTPUT "$line\n";
+    }
+    else {
+       if ($sum + $ppee > $fdr * ($n + 1)) { last; }
+       ++$n;
+       $sum += $ppee;
+       print OUTPUT "$line\n";
+    }
+}
+
+print "There are $n genes/transcripts reported at FDR = $fdr.\n";
+
+close(INPUT);
+close(OUTPUT);
+
+__END__
+
+=head1 NAME
+
+rsem-control-fdr
+
+=head1 SYNOPSIS
+
+rsem-control-fdr [options] input_file fdr_rate output_file
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<input_file>
+
+This should be the result file generated by 'rsem-run-ebseq', which contains all genes/transcripts and their associated statistics.
+
+=item B<fdr_rate>
+
+The desire false discovery rate (FDR). 
+
+=item B<output_file>
+
+This file is a subset of the 'input_file'. It only contains the genes/transcripts called as differentially expressed (DE). When more than 2 conditions exist, DE is defined as not all conditions are equally expressed.
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<--hard-threshold> 
+
+Use hard threshold method to control FDR. If this option is set, only those genes/transcripts with their PPDE >= 1 - fdr_rate are called as DE. (Default: on)
+
+=item B<--soft-threshold>
+
+Use soft threshold method to control FDR. If this option is set, this program will try to report as many genes/transcripts as possible, as long as their average PPDE >= 1 - fdr_rate. (Default: off)
+
+=item B<-h/--help> 
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+This program controls the false discovery rate and reports differentially expressed genes/transcripts. 
+
+=head1 EXAMPLES
+
+We assume that we have 'GeneMat.results' as input. We want to control FDR at 0.05 using hard threshold method and name the output file as 'GeneMat.de.txt':
+
+ rsem-control-fdr GeneMat.results 0.05 GeneMat.de.txt
+
+If we instead want to use soft threshold method to obtain more called genes/transcripts, we can use:
+
+ rsem-control-fdr --soft-threshold GeneMat.results 0.05 GeneMat.de.txt
+
+=cut
diff --git a/rsem-find-DE b/rsem-find-DE

deleted file mode 100755 (executable)

index 98db6d8..0000000
--- a/rsem-find-DE
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env Rscript
-
-printUsage <- function() {
-  cat("Usage: rsem-find-DE data_matrix_file [--ngvector ngvector_file] number_of_samples_in_condition_1 FDR_rate output_file\n\n")
-  cat("This script calls EBSeq to find differentially expressed genes/transcripts in two conditions.\n\n") 
-  cat("data_matrix_file: m by n matrix containing expected counts, m is the number of transcripts/genes, n is the number of total samples.\n")
-  cat("[--ngvector ngvector_file]: optional field. 'ngvector_file' is calculated by 'rsem-generate-ngvector'. Having this field is recommended for transcript data.\n")
-  cat("number_of_samples_in_condition_1: the number of samples in condition 1. A condition's samples must be adjacent. The left group of samples are defined as condition 1.\n")
-  cat("FDR_rate: false discovery rate.\n")
-  cat("output_file: the output file. Three files will be generated: 'output_file', 'output_file.hard_threshold' and 'output_file.all'. The first file reports all DE genes/transcripts using a soft threshold (calculated by crit_func in EBSeq). The second file reports all DE genes/transcripts using a hard threshold (only report if PPEE <= fdr). The third file reports all genes/transcripts. The first file is recommended to be used as DE results because it generally contains more called genes/transcripts.\n\n")
-  cat("The results are written as a matrix with row and column names. The row names are the genes'/transcripts' ids. The column names are 'PPEE', 'PPDE', 'PostFC' and 'RealFC'.\n\n")
-  cat("PPEE: posterior probability of being equally expressed.\n")
-  cat("PPDE: posterior probability of being differentially expressed.\n")
-  cat("PostFC: posterior fold change (condition 1 over condition2).\n")
-  cat("RealFC: real fold change (condition 1 over condition2).\n")
-  q(status = 1)
-}
-
-argv <- commandArgs(FALSE)
-path <- dirname(sub("--file=", "", argv[grep("--file", argv)]))
-path <- paste(path, "EBSeq", sep = "/")
-start_pos <- grep("--args", argv) + 1
-end_pos <- length(argv)
-if (start_pos <= end_pos) argv <- argv[start_pos : end_pos] else argv <- NULL
-
-ng_pos <- grep("--ngvector", argv) 
-if (length(ng_pos) > 1 || length(ng_pos) == 0 && length(argv) != 4 || length(ng_pos) == 1 && length(argv) != 6) printUsage()
-ngvector_file <- NULL
-if (length(ng_pos) == 1) {
-  if (ng_pos == length(argv)) printUsage()
-  ngvector_file <- argv[ng_pos + 1]
-  argv <- argv[-c(ng_pos, ng_pos + 1)]
-}      
-
-data_matrix_file <- argv[1]
-num_con1 <- as.numeric(argv[2])
-fdr <- as.numeric(argv[3])
-output_file <- argv[4]
-
-library(blockmodeling, lib.loc = path)
-library(EBSeq, lib.loc = path)
-
-DataMat <- data.matrix(read.table(data_matrix_file))
-n <- dim(DataMat)[2]
-conditions <- as.factor(rep(c("C1", "C2"), times = c(num_con1, n - num_con1)))
-Sizes <- MedianNorm(DataMat)
-ngvector <- NULL
-if (!is.null(ngvector_file)) {
-  ngvector <- as.vector(data.matrix(read.table(ngvector_file)))
-}
-
-
-EBOut <- NULL
-if (is.null(ngvector)) {
-  EBOut <- EBTest(Data = DataMat, Conditions = conditions, sizeFactors = Sizes, maxround = 5)
-} else {
-  EBOut <- EBTest(Data = DataMat, NgVector = ngvector, Conditions = conditions, sizeFactors = Sizes, maxround = 5) 
-}
-stopifnot(!is.null(EBOut))
-
-PP <- as.data.frame(GetPPMat(EBOut))
-fc_res <- PostFC(EBOut)
-
-# soft threshold, default output
-thre <- crit_fun(PP[, "PPEE"], fdr)
-DEfound <- rownames(PP)[which(PP[, "PPDE"] >= thre)]
-
-results <- cbind(PP[DEfound, ], fc_res$PostFC[DEfound], fc_res$RealFC[DEfound])
-colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC")
-write.table(results, file = output_file)
-
-# hard threshold
-thre <- 1.0 - fdr
-DEfound <- rownames(PP)[which(PP[, "PPDE"] >= thre)]
-
-results <- cbind(PP[DEfound, ], fc_res$PostFC[DEfound], fc_res$RealFC[DEfound])
-colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC")
-write.table(results, file = paste(output_file, ".hard_threshold", sep = ""))
-
-# all
-results <- cbind(PP, fc_res$PostFC, fc_res$RealFC)
-colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC")
-write.table(results, file = paste(output_file, ".all", sep = ""))
diff --git a/rsem-generate-ngvector b/rsem-generate-ngvector

index 137fa03db354e8e7078a8dd16b535549034f949e..68fe664e5dd69a9dae2ca8a20e91e9bf0bf8430e 100755 (executable)
--- a/rsem-generate-ngvector
+++ b/rsem-generate-ngvector
@@ -34,11 +34,7 @@ rsem-generate-ngvector
  
  =head1 SYNOPSIS
  
-=over
-
- rsem-generate-ngvector [options] input_fasta_file output_name
-
-=back
+rsem-generate-ngvector [options] input_fasta_file output_name
  
  =head1 ARGUMENTS
  
diff --git a/rsem-plot-transcript-wiggles b/rsem-plot-transcript-wiggles

index 11027a1ea30fa91b880d5e776dfdd5b032a2d71b..efe00ba91863d7125268d0e492d602ae6b9cbeb7 100755 (executable)
--- a/rsem-plot-transcript-wiggles
+++ b/rsem-plot-transcript-wiggles
@@ -57,11 +57,7 @@ rsem-plot-transcript-wiggles
  
  =head1 SYNOPSIS
  
-=over
-
- rsem-plot-transcript-wiggles [options] sample_name input_list output_plot_file
-
-=back
+rsem-plot-transcript-wiggles [options] sample_name input_list output_plot_file
  
  =head1 ARGUMENTS
  
diff --git a/rsem-prepare-reference b/rsem-prepare-reference

index 223d79db8a2d3852d0fa601decdef89651b7aae5..8ff37c7ffc79dcc894cbbd2e3fb0b402fbb89570 100755 (executable)
--- a/rsem-prepare-reference
+++ b/rsem-prepare-reference
@@ -104,11 +104,7 @@ rsem-prepare-reference
  
  =head1 SYNOPSIS
  
-=over
-
- rsem-prepare-reference [options] reference_fasta_file(s) reference_name
-
-=back
+rsem-prepare-reference [options] reference_fasta_file(s) reference_name
  
  =head1 ARGUMENTS
  
diff --git a/rsem-run-ebseq b/rsem-run-ebseq

new file mode 100755 (executable)

index 0000000..c4140c5
--- /dev/null
+++ b/rsem-run-ebseq
@@ -0,0 +1,117 @@
+#!/usr/bin/perl
+
+use Getopt::Long;
+use Pod::Usage;
+use FindBin;
+use lib $FindBin::Bin;
+use strict;
+
+use rsem_perl_utils;
+
+my $ngvF = "";
+my $help = 0;
+
+GetOptions("ngvector=s" => \$ngvF,
+          "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
+
+pod2usage(-verbose => 2) if ($help == 1);
+pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 3);
+pod2usage(-msg => "ngvector file cannot be named as #! # is reserved for other purpose!", -exitval => 2, -verbose => 2) if ($ngvF eq "#");
+
+my $dir = "$FindBin::Bin/";
+my $command = "";
+
+my @conditions = split(/,/, $ARGV[1]);
+
+pod2usage(-msg => "At least 2 conditions are required for differential expression analysis!", -exitval => 2, -verbose => 2) if (scalar(@conditions) < 2);
+
+if ($ngvF eq "") { $ngvF = "#"; }
+
+$" = " ";
+$command = $dir."EBSeq/rsem-for-ebseq-find-DE ".$dir."EBSeq $ngvF $ARGV[0] $ARGV[2] @conditions";
+&runCommand($command)
+
+__END__
+
+=head1 NAME
+
+rsem-run-ebseq
+
+=head1 SYNOPSIS
+
+rsem-run-ebseq [options] data_matrix_file conditions output_file
+
+=head1 ARGUMENTS
+
+=over
+
+=item B<data_matrix_file>
+
+This file is a m by n matrix. m is the number of genes/transcripts and n is the number of total samples. Each element in the matrix represents the expected count for a particular gene/transcript in a particular sample. Users can use 'rsem-generate-data-matrix' to generate this file from expression result files. 
+
+=item B<conditions>
+
+Comma-separated list of values representing the number of replicates for each condition. For example, "3,3" means the data set contains 2 conditions and each condition has 3 replicates. "2,3,3" means the data set contains 3 conditions, with 2, 3, and 3 replicates for each condition respectively.
+
+=item B<output_file>
+
+Output file name.
+
+=back
+
+=head1 OPTIONS
+
+=over
+
+=item B<--ngvector> <file>
+
+This option provides the grouping information required by EBSeq for isoform-level differential expression analysis. The file can be generated by 'rsem-generate-ngvector'. Turning this option on is highly recommended for isoform-level differential expression analysis. (Default: off)
+
+=item B<-h/--help>
+
+Show help information.
+
+=back
+
+=head1 DESCRIPTION
+
+This program is a wrapper over EBSeq. It performs differential expression analysis and can work on two or more conditions. All genes/transcripts and their associated statistcs are reported in one output file. This program does not control false discovery rate and call differential expressed genes/transcripts. Please use 'rsem-control-fdr' to control false discovery rate after this program is finished.
+
+=head1 OUTPUT
+
+=over
+
+=item B<output_file>
+
+This file reports the calculated statistics for all genes/transcripts. It is written as a matrix with row and column names. The row names are the genes'/transcripts' names. The column names are for the reported statistics.
+
+If there are only 2 different conditions among the samples, four statistics (columns) will be reported for each gene/transcript. They are "PPEE", "PPDE", "PostFC" and "RealFC". "PPEE" is the posterior probability (estimated by EBSeq) that a gene/transcript is equally expressed. "PPDE" is the posterior probability that a gene/transcript is differentially expressed. "PostFC" is the posterior fold change (condition 1 over condition2) for a gene/transcript. It is defined as the ratio between posterior mean expression estimates of the gene/transcript for each condition. "RealFC" is the real fold change (condition 1 over condition2) for a gene/transcript.  It is the ratio of the normalized within condition 1 mean count over normalized within condition 2 mean count for the gene/transcript. Fold changes are calculated using EBSeq's 'PostFC' function. The genes/transcripts are reported in descending order of their "PPDE" values. 
+  
+If there are more than 2 different conditions among the samples, the output format is different. For differential expression analysis with more than 2 conditions, EBSeq will enumerate all possible expression patterns (on which conditions are equally expressed and which conditions are not). Suppose there are k different patterns, the first k columns of the output file give the posterior probability of each expression pattern is true. Patterns are defined in a separate file, 'output_file.pattern'. The k+1 column gives the maximum a posteriori (MAP) expression pattern for each gene/transcript. The k+2 column gives the posterior probability that not all conditions are equally expressed (column name "PPDE"). The genes/transcripts are reported in descending order of their "PPDE" column values. For details on how EBSeq works for more than 2 conditions, please refer to EBSeq's manual.
+
+=item B<output_file.pattern>
+
+This file is only generated when there are more than 2 conditions. It defines all possible expression patterns over the conditions using a matrix with names. Each row of the matrix refers to a different expression pattern and each column gives the expression status of a different condition. Two conditions are equally expressed if and only if their statuses are the same.
+
+=back
+
+=head1 EXAMPLES
+
+1) We're interested in isoform-level differential expression analysis and there are two conditions. Each condition has 5 replicates. We have already collected the data matrix as 'IsoMat' and generated ngvector as 'ngvector.ngvec':
+
+ rsem-run-ebseq --ngvector ngvector.ngvec IsoMat 5,5 IsoMat.results
+
+The results will be in 'IsoMat.results'.
+
+2) We're interested in gene-level analysis and there are 3 conditions. The first condition has 3 replicates and the other two has 4 replicates each. The data matrix is named as 'GeneMat':
+
+ rsem-run-ebseq GeneMat 3,4,4 GeneMat.results
+
+Two files, 'GeneMat.results' and 'GeneMat.results.pattern', will be generated. 
+
+=cut
+
+
+ 
+
+
author	Bo Li <bli@cs.wisc.edu>
	Mon, 2 Sep 2013 08:25:23 +0000 (03:25 -0500)
committer	Bo Li <bli@cs.wisc.edu>
	Mon, 2 Sep 2013 08:25:23 +0000 (03:25 -0500)
EBSeq/install	[new file with mode: 0755]	patch \| blob
EBSeq/install.R	[deleted file]	patch \| blob \| history
EBSeq/makefile		patch \| blob \| history
EBSeq/rsem-for-ebseq-find-DE	[new file with mode: 0755]	patch \| blob
README.md		patch \| blob \| history
convert-sam-for-rsem		patch \| blob \| history
rsem-calculate-expression		patch \| blob \| history
rsem-control-fdr	[new file with mode: 0755]	patch \| blob
rsem-find-DE	[deleted file]	patch \| blob \| history
rsem-generate-ngvector		patch \| blob \| history
rsem-plot-transcript-wiggles		patch \| blob \| history
rsem-prepare-reference		patch \| blob \| history
rsem-run-ebseq	[new file with mode: 0755]	patch \| blob