X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=rsem-find-DE;h=98db6d8b91e141f148f3274fd305da107e182bb5;hb=1d786dca8b4c9c1e2176a28a669d2d16cd93e395;hp=e9d25f7d38d957f2dbfc2e9a92762d16468d200f;hpb=8ca5b7c2fb57bc523431c1e37d5ab9337eccbc37;p=rsem.git diff --git a/rsem-find-DE b/rsem-find-DE index e9d25f7..98db6d8 100755 --- a/rsem-find-DE +++ b/rsem-find-DE @@ -1,14 +1,14 @@ #!/usr/bin/env Rscript printUsage <- function() { - cat("Usage: rsem-find-DE data_matrix_file [--ngvector ngvector_file] number_sample_condition1 FDR_rate output_file\n\n") + cat("Usage: rsem-find-DE data_matrix_file [--ngvector ngvector_file] number_of_samples_in_condition_1 FDR_rate output_file\n\n") cat("This script calls EBSeq to find differentially expressed genes/transcripts in two conditions.\n\n") cat("data_matrix_file: m by n matrix containing expected counts, m is the number of transcripts/genes, n is the number of total samples.\n") cat("[--ngvector ngvector_file]: optional field. 'ngvector_file' is calculated by 'rsem-generate-ngvector'. Having this field is recommended for transcript data.\n") - cat("number_sample_condition1: the number of samples in condition 1. A condition's samples must be adjacent. The left group of samples are defined as condition 1.\n") + cat("number_of_samples_in_condition_1: the number of samples in condition 1. A condition's samples must be adjacent. The left group of samples are defined as condition 1.\n") cat("FDR_rate: false discovery rate.\n") - cat("output_file: the output file.\n\n") - cat("The results are written as a matrix with row and column names. The row names are the differentially expressed transcripts'/genes' ids. The column names are 'PPEE', 'PPDE', 'PostFC' and 'RealFC'.\n\n") + cat("output_file: the output file. Three files will be generated: 'output_file', 'output_file.hard_threshold' and 'output_file.all'. The first file reports all DE genes/transcripts using a soft threshold (calculated by crit_func in EBSeq). The second file reports all DE genes/transcripts using a hard threshold (only report if PPEE <= fdr). The third file reports all genes/transcripts. The first file is recommended to be used as DE results because it generally contains more called genes/transcripts.\n\n") + cat("The results are written as a matrix with row and column names. The row names are the genes'/transcripts' ids. The column names are 'PPEE', 'PPDE', 'PostFC' and 'RealFC'.\n\n") cat("PPEE: posterior probability of being equally expressed.\n") cat("PPDE: posterior probability of being differentially expressed.\n") cat("PostFC: posterior fold change (condition 1 over condition2).\n") @@ -58,12 +58,26 @@ if (is.null(ngvector)) { } stopifnot(!is.null(EBOut)) -PP <- GetPPMat(EBOut) +PP <- as.data.frame(GetPPMat(EBOut)) +fc_res <- PostFC(EBOut) + +# soft threshold, default output thre <- crit_fun(PP[, "PPEE"], fdr) DEfound <- rownames(PP)[which(PP[, "PPDE"] >= thre)] -fc_res <- PostFC(EBOut) - -results <- cbind(PP[DEfound, ], fc_res$GenePostFC[DEfound], fc_res$GeneRealFC[DEfound]) +results <- cbind(PP[DEfound, ], fc_res$PostFC[DEfound], fc_res$RealFC[DEfound]) colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC") write.table(results, file = output_file) + +# hard threshold +thre <- 1.0 - fdr +DEfound <- rownames(PP)[which(PP[, "PPDE"] >= thre)] + +results <- cbind(PP[DEfound, ], fc_res$PostFC[DEfound], fc_res$RealFC[DEfound]) +colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC") +write.table(results, file = paste(output_file, ".hard_threshold", sep = "")) + +# all +results <- cbind(PP, fc_res$PostFC, fc_res$RealFC) +colnames(results) <- c("PPEE", "PPDE", "PostFC", "RealFC") +write.table(results, file = paste(output_file, ".all", sep = ""))