rsem-plot-model

   1 #!/usr/bin/env Rscript
   2
   3 argv <- commandArgs(TRUE)
   4 if (length(argv) != 2) {
   5   cat("Usage: rsem-plot-model modelF outF\n")
   6   q(status = 1)
   7 }
   8
   9 con <- file(argv[1], open = "r")
  10 pdf(argv[2])
  11
  12 # model type and forward probability
  13 model_type <- as.numeric(readLines(con, n = 4)[1])
  14
  15 # fragment length distribution
  16 strvec <- readLines(con, n = 3)
  17 vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]])
  18 maxL <- vec[2] # maxL used for Profile
  19 x <- (vec[1] + 1) : vec[2]
  20 y <- as.numeric(strsplit(strvec[2], split = " ")[[1]])
  21 mean <- weighted.mean(x, y)
  22 std <- sqrt(weighted.mean((x - mean)^2, y))
  23 plot(x, y, type = "h", main = "Fragment Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Fragment Length", ylab = "Probability")
  24
  25 # mate length distribution
  26 if (model_type == 0 || model_type == 1) bval <- as.numeric(readLines(con, n = 1)[1]) else bval <- 1
  27
  28 if (bval == 1) {
  29   list <- strsplit(readLines(con, n = 2), split = " ")
  30   vec <- as.numeric(list[[1]])
  31   maxL <- vec[2]
  32   x <- (vec[1] + 1) : vec[2]
  33   y <- as.numeric(list[[2]])
  34   mean <- weighted.mean(x, y)
  35   std <- sqrt(weighted.mean((x - mean)^2, y))
  36   plot(x, y, type = "h", main = "Mate Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Mate Length", ylab = "Probability")
  37 }
  38 strvec <- readLines(con, n = 1)
  39
  40 # RSPD
  41 bval <- as.numeric(readLines(con, n = 1)[1])
  42 if (bval == 1) {
  43   bin_size <- as.numeric(readLines(con, n = 1)[1])
  44   y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]])
  45   par(cex.axis = 0.7)
  46   barplot(y, space = 0, names.arg = 1:bin_size, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability")
  47   par(cex.axis = 1.0)
  48 }
  49 strvec <- readLines(con, n = 1)
  50
  51 # plot sequencing errors
  52 if (model_type == 1 || model_type == 3) {
  53   # skip QD
  54   N <- as.numeric(readLines(con, n = 1)[1])
  55   readLines(con, n = N + 1)
  56   readLines(con, n = 1) # for the blank line
  57
  58   # QProfile
  59   readLines(con, n = 1)
  60
  61   peA <- c() # probability of sequencing error given reference base is A
  62   peC <- c()
  63   peG <- c()
  64   peT <- c()
  65
  66   for (i in 1 : N) {
  67     strvec <- readLines(con, n = 6)
  68     list <- strsplit(strvec[1:4], split = " ")
  69     vecA <- as.numeric(list[[1]])
  70     vecC <- as.numeric(list[[2]])
  71     vecG <- as.numeric(list[[3]])
  72     vecT <- as.numeric(list[[4]])
  73     if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break
  74     peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecA[1])))
  75     peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecC[2])))
  76     peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecG[3])))
  77     peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecT[4])))
  78   }
  79
  80   x <- 0 : (length(peA) - 1)
  81   matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Quality Score vs. Observed Quality", xlab = "Quality Score", ylab = "Observed Quality")
  82   legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
  83 } else {
  84   # Profile
  85   readLines(con, n = 1)
  86
  87   peA <- c() # probability of sequencing error given reference base is A
  88   peC <- c()
  89   peG <- c()
  90   peT <- c()
  91
  92   for (i in 1: maxL) {
  93     strvec <- readLines(con, n = 6)
  94     list <- strsplit(strvec[1:4], split = " ")
  95     vecA <- as.numeric(list[[1]])
  96     vecC <- as.numeric(list[[2]])
  97     vecG <- as.numeric(list[[3]])
  98     vecT <- as.numeric(list[[4]])
  99     if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break
 100     peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecA[1]) * 100))
 101     peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecC[2]) * 100))
 102     peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecG[3]) * 100))
 103     peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecT[4]) * 100))
 104   }
 105
 106   x <- 1 : length(peA)
 107   matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Position vs. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error")
 108   legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
 109 }
 110
 111 dev.off()
 112 close(con)