From: Bo Li <bli@cs.wisc.edu>
Date: Fri, 11 Mar 2011 03:25:54 +0000 (-0600)
Subject: Enable RSEM to visualize the model learned from data
X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=18c158c73b8f7f0afe4db97ac51b520149d33c12

Enable RSEM to visualize the model learned from data
---

diff --git a/README.md b/README.md
index 0f4d982..0c900b7 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,23 @@ wiggle_name: the name the user wants to use for this wiggle plot
 
 Refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html).
 
+#### c) Visualize the model learned by RSEM
+
+RSEM provides an R script, plotModel.R, for visulazing the model learned.
+
+Usage:
+    
+    plotModel.R modelF outF
+
+modelF: the sample_name.model file generated by RSEM
+outF: the file name for plots generated from the model. It is a pdf file
+
+The plots generated depends on read type and user configuration. It
+may include fragment length distribution, mate length distribution,
+read start position distribution (RSPD), quality score vs percentage
+of sequecing error given the reference base, position vs percentage of
+sequencing errro given the reference base.
+
 ## <a name="example"></a> Example
 
 Suppose we download the mouse genome from UCSC Genome Browser.  We will
diff --git a/plotModel.R b/plotModel.R
new file mode 100755
index 0000000..ce0a8dc
--- /dev/null
+++ b/plotModel.R
@@ -0,0 +1,109 @@
+#!/usr/bin/env Rscript
+
+argv <- commandArgs(TRUE)
+if (length(argv) != 2) {
+  cat("Usage: plotModel.R modelF outF\n")
+  q(status = 1)
+}
+
+con <- file(argv[1], open = "r")	
+pdf(argv[2])
+
+# model type and forward probability
+model_type <- as.numeric(readLines(con, n = 4)[1])  
+
+# fragment length distribution
+strvec <- readLines(con, n = 3)
+vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]])
+maxL <- vec[2] # maxL used for Profile
+x <- (vec[1] + 1) : vec[2]
+y <- as.numeric(strsplit(strvec[2], split = " ")[[1]])
+mean <- weighted.mean(x, y)
+std <- sqrt(weighted.mean((x - mean)^2, y))
+plot(x, y, type = "h", main = "Fragment Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Fragment Length", ylab = "Probability")  
+
+# mate length distribution
+bval <- as.numeric(readLines(con, n = 1)[1])
+if (bval == 1) {
+  list <- strsplit(readLines(con, n = 2), split = " ")
+  vec <- as.numeric(list[[1]])
+  maxL <- vec[2]
+  x <- (vec[1] + 1) : vec[2]
+  y <- as.numeric(list[[2]])
+  mean <- weighted.mean(x, y)
+  std <- sqrt(weighted.mean((x - mean)^2, y))
+  plot(x, y, type = "h", main = "Mate Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Mate Length", ylab = "Probability")  
+}
+strvec <- readLines(con, n = 1)
+
+# RSPD
+bval <- as.numeric(readLines(con, n = 1)[1])
+if (bval == 1) {
+  bin_size <- as.numeric(readLines(con, n = 1)[1])
+  y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]])
+  barplot(y, space = 0, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability")
+}
+strvec <- readLines(con, n = 1)
+
+# plot sequencing errors
+if (model_type == 1 || model_type == 3) {
+  # skip QD
+  N <- as.numeric(readLines(con, n = 1)[1])
+  readLines(con, n = N + 1)
+  readLines(con, n = 1) # for the blank line
+  
+  # QProfile
+  readLines(con, n = 1)
+
+  peA <- c() # probability of sequencing error given reference base is A
+  peC <- c()
+  peG <- c()
+  peT <- c()
+
+  for (i in 1 : N) {
+    strvec <- readLines(con, n = 6)
+    list <- strsplit(strvec[1:4], split = " ")
+    vecA <- as.numeric(list[[1]])
+    vecC <- as.numeric(list[[2]])
+    vecG <- as.numeric(list[[3]])
+    vecT <- as.numeric(list[[4]])
+    if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break
+    peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecA[1]))
+    peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecC[2]))
+    peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecG[3]))
+    peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecT[4]))
+  }
+
+  x <- 0 : (length(peA) - 1)
+  matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, main = "Quality Score v.s. Percentage Sequence Error", xlab = "Quality Score", ylab = "Percentage of Sequencing Error")
+  legend("topright", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3)
+} else {
+  # Profile
+  readLines(con, n = 1)
+  
+  peA <- c() # probability of sequencing error given reference base is A
+  peC <- c()
+  peG <- c()
+  peT <- c()
+
+  for (i in 1: maxL) {
+    strvec <- readLines(con, n = 6)
+    list <- strsplit(strvec[1:4], split = " ")
+    vecA <- as.numeric(list[[1]])
+    vecC <- as.numeric(list[[2]])
+    vecG <- as.numeric(list[[3]])
+    vecT <- as.numeric(list[[4]])
+    if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break
+    peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecA[1]))
+    peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecC[2]))
+    peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecG[3]))
+    peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecT[4]))
+  }
+
+  x <- 1 : length(peA)
+  matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, main = "Position v.s. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error")
+  legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3)       
+}
+
+dev.off()
+close(con)