From e02c423a1de88224f13c33bdbf23461e41edb233 Mon Sep 17 00:00:00 2001
From: Bo Li <bli@cs.wisc.edu>
Date: Fri, 6 Sep 2013 23:00:12 -0500
Subject: [PATCH] 'rsem-run-ebseq': Added 'output_file.condmeans' for tests
 with more than 2 conditions

---
 EBSeq/rsem-for-ebseq-find-DE |  6 +++++-
 README.md                    | 13 +++++++++----
 rsem-control-fdr             | 10 +++-------
 rsem-run-ebseq               |  6 +++++-
 4 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/EBSeq/rsem-for-ebseq-find-DE b/EBSeq/rsem-for-ebseq-find-DE
index 07b9899..3c5d304 100755
--- a/EBSeq/rsem-for-ebseq-find-DE
+++ b/EBSeq/rsem-for-ebseq-find-DE
@@ -58,8 +58,12 @@ if (nc == 2) {
 
   results <- cbind(PP, MultiPP$MAP[rownames(PP)], probs)
   colnames(results) <- c(colnames(PP), "MAP", "PPDE")  
-  results <- results[order(results[,"PPDE"], decreasing = TRUE),]
+  ord <- order(results[,"PPDE"], decreasing = TRUE)
+  results <- results[ord,]
   write.table(results, file = output_file, sep = "\t")
 
   write.table(MultiPP$Patterns, file = paste(output_file, ".pattern", sep = ""), sep = "\t")
+
+  MultiFC <- GetMultiFC(MultiOut)
+  write.table(MultiFC$CondMeans[ord,], file = paste(output_file, ".condmeans", sep = ""), sep = "\t")
 }
diff --git a/README.md b/README.md
index b0fa97c..f729ef8 100644
--- a/README.md
+++ b/README.md
@@ -313,9 +313,9 @@ EBSeq, an empirical Bayesian DE analysis tool developed in UW-Madison,
 can take variance due to read mapping ambiguity into consideration by
 grouping isoforms with parent gene's number of isoforms. In addition,
 it is more robust to outliers. For more information about EBSeq
-(including the paper describing their method), please visit <a
-href="http://www.biostat.wisc.edu/~ningleng/EBSeq_Package">EBSeq
-website</a>.
+(including the paper describing their method), please visit [EBSeq's
+website](http://www.biostat.wisc.edu/~ningleng/EBSeq_Package).
+
 
 RSEM includes EBSeq in its folder named 'EBSeq'. To use it, first type
 
@@ -369,7 +369,7 @@ before running either 'EBTest' or 'EBMultiTest'.
 
 Lastly, RSEM provides two scripts, 'rsem-run-ebseq' and
 'rsem-control-fdr', to help users find differential expressed
-genes. First, 'rsem-run-ebseq' calls EBSeq to calculate related statistics
+genes/transcripts. First, 'rsem-run-ebseq' calls EBSeq to calculate related statistics
 for all genes/transcripts. Run 
 
     rsem-run-ebseq --help
@@ -387,6 +387,11 @@ page](http://deweylab.biostat.wisc.edu/rsem/rsem-control-fdr.html). These
 two scripts can perform DE analysis on either 2 conditions or multiple
 conditions.
 
+Please note that 'rsem-run-ebseq' and 'rsem-control-fdr' use EBSeq's
+default parameters. For advanced use of EBSeq or information about how
+EBSeq works, please refer to [EBSeq's
+manual](http://www.bioconductor.org/packages/devel/bioc/vignettes/EBSeq/inst/doc/EBSeq_Vignette.pdf).
+
 Questions related to EBSeq should
 be sent to <a href="mailto:nleng@wisc.edu">Ning Leng</a>.
 
diff --git a/rsem-control-fdr b/rsem-control-fdr
index 86cfb93..65c57cb 100755
--- a/rsem-control-fdr
+++ b/rsem-control-fdr
@@ -74,7 +74,7 @@ rsem-control-fdr [options] input_file fdr_rate output_file
 
 =item B<input_file>
 
-This should be the result file generated by 'rsem-run-ebseq', which contains all genes/transcripts and their associated statistics.
+This should be the main result file generated by 'rsem-run-ebseq', which contains all genes/transcripts and their associated statistics.
 
 =item B<fdr_rate>
 
@@ -82,7 +82,7 @@ The desire false discovery rate (FDR).
 
 =item B<output_file>
 
-This file is a subset of the 'input_file'. It only contains the genes/transcripts called as differentially expressed (DE). When more than 2 conditions exist, DE is defined as not all conditions are equally expressed.
+This file is a subset of the 'input_file'. It only contains the genes/transcripts called as differentially expressed (DE). When more than 2 conditions exist, DE is defined as not all conditions are equally expressed.  Because statistical significance does not necessarily mean biological significance, users should also refer to the fold changes to decide which genes/transcripts are biologically significant. When more than two conditions exist, this file will not contain fold change information and users need to calculate it from 'input_file.condmeans' by themselves.
 
 =back
 
@@ -96,7 +96,7 @@ Use hard threshold method to control FDR. If this option is set, only those gene
 
 =item B<--soft-threshold>
 
-Use soft threshold method to control FDR. If this option is set, this program will try to report as many genes/transcripts as possible, as long as their average PPDE >= 1 - fdr_rate. (Default: off)
+Use soft threshold method to control FDR. If this option is set, this program will try to report as many genes/transcripts as possible, as long as their average PPDE >= 1 - fdr_rate. This option is equivalent to use EBSeq's 'crit_fun' for FDR control. (Default: off)
 
 =item B<-h/--help> 
 
@@ -114,8 +114,4 @@ We assume that we have 'GeneMat.results' as input. We want to control FDR at 0.0
 
  rsem-control-fdr GeneMat.results 0.05 GeneMat.de.txt
 
-If we instead want to use soft threshold method to obtain more called genes/transcripts, we can use:
-
- rsem-control-fdr --soft-threshold GeneMat.results 0.05 GeneMat.de.txt
-
 =cut
diff --git a/rsem-run-ebseq b/rsem-run-ebseq
index c4140c5..cd4750f 100755
--- a/rsem-run-ebseq
+++ b/rsem-run-ebseq
@@ -93,6 +93,10 @@ If there are more than 2 different conditions among the samples, the output form
 
 This file is only generated when there are more than 2 conditions. It defines all possible expression patterns over the conditions using a matrix with names. Each row of the matrix refers to a different expression pattern and each column gives the expression status of a different condition. Two conditions are equally expressed if and only if their statuses are the same.
 
+=item B<output_file.condmeans>
+
+This file is only generated when there are more than 2 conditions. It gives the normalized mean count value for each gene/transcript at each condition. It is formatted as a matrix with names. Each row represents a gene/transcript and each column represent a condition. The order of genes/transcripts is the same as 'output_file'. This file can be used to calculate fold changes between conditions which users are interested in.  
+
 =back
 
 =head1 EXAMPLES
@@ -107,7 +111,7 @@ The results will be in 'IsoMat.results'.
 
  rsem-run-ebseq GeneMat 3,4,4 GeneMat.results
 
-Two files, 'GeneMat.results' and 'GeneMat.results.pattern', will be generated. 
+Three files, 'GeneMat.results', 'GeneMat.results.pattern', and 'GeneMat.results.condmeans', will be generated. 
 
 =cut
 
-- 
2.39.5