rsem-plot-model

   1 #!/usr/bin/env Rscript
   2
   3 argv <- commandArgs(TRUE)
   4 if (length(argv) != 2) {
   5   cat("Usage: rsem-plot-model sample_name outF\n")
   6   q(status = 1)
   7 }
   8
   9 strvec <- strsplit(argv[1], split = "/")[[1]]
  10 token <- strvec[length(strvec)]
  11
  12 modelF <- paste(argv[1], ".stat/", token, ".model")
  13 cntF <- paste(argv[1], ".stat/", token, ".cnt")
  14
  15 pdf(argv[2])
  16
  17 con <- file(modelF, open = "r")
  18
  19 # model type and forward probability
  20 model_type <- as.numeric(readLines(con, n = 4)[1])
  21
  22 # fragment length distribution
  23 strvec <- readLines(con, n = 3)
  24 vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]])
  25 maxL <- vec[2] # maxL used for Profile
  26 x <- (vec[1] + 1) : vec[2]
  27 y <- as.numeric(strsplit(strvec[2], split = " ")[[1]])
  28 mean <- weighted.mean(x, y)
  29 std <- sqrt(weighted.mean((x - mean)^2, y))
  30 plot(x, y, type = "h", main = "Fragment Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Fragment Length", ylab = "Probability")
  31
  32 # mate length distribution
  33 if (model_type == 0 || model_type == 1) bval <- as.numeric(readLines(con, n = 1)[1]) else bval <- 1
  34
  35 if (bval == 1) {
  36   list <- strsplit(readLines(con, n = 2), split = " ")
  37   vec <- as.numeric(list[[1]])
  38   maxL <- vec[2]
  39   x <- (vec[1] + 1) : vec[2]
  40   y <- as.numeric(list[[2]])
  41   mean <- weighted.mean(x, y)
  42   std <- sqrt(weighted.mean((x - mean)^2, y))
  43   plot(x, y, type = "h", main = "Mate Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Mate Length", ylab = "Probability")
  44 }
  45 strvec <- readLines(con, n = 1)
  46
  47 # RSPD
  48 bval <- as.numeric(readLines(con, n = 1)[1])
  49 if (bval == 1) {
  50   bin_size <- as.numeric(readLines(con, n = 1)[1])
  51   y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]])
  52   par(cex.axis = 0.7)
  53   barplot(y, space = 0, names.arg = 1:bin_size, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability")
  54   par(cex.axis = 1.0)
  55 }
  56 strvec <- readLines(con, n = 1)
  57
  58 # plot sequencing errors
  59 if (model_type == 1 || model_type == 3) {
  60   # skip QD
  61   N <- as.numeric(readLines(con, n = 1)[1])
  62   readLines(con, n = N + 1)
  63   readLines(con, n = 1) # for the blank line
  64
  65   # QProfile
  66   readLines(con, n = 1)
  67
  68   peA <- c() # probability of sequencing error given reference base is A
  69   peC <- c()
  70   peG <- c()
  71   peT <- c()
  72
  73   for (i in 1 : N) {
  74     strvec <- readLines(con, n = 6)
  75     list <- strsplit(strvec[1:4], split = " ")
  76     vecA <- as.numeric(list[[1]])
  77     vecC <- as.numeric(list[[2]])
  78     vecG <- as.numeric(list[[3]])
  79     vecT <- as.numeric(list[[4]])
  80     if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break
  81     peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecA[1])))
  82     peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecC[2])))
  83     peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecG[3])))
  84     peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, -10 * log(1.0 - vecT[4])))
  85   }
  86
  87   x <- 0 : (length(peA) - 1)
  88   matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Quality Score vs. Observed Quality", xlab = "Quality Score", ylab = "Observed Quality")
  89   legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
  90 } else {
  91   # Profile
  92   readLines(con, n = 1)
  93
  94   peA <- c() # probability of sequencing error given reference base is A
  95   peC <- c()
  96   peG <- c()
  97   peT <- c()
  98
  99   for (i in 1: maxL) {
 100     strvec <- readLines(con, n = 6)
 101     list <- strsplit(strvec[1:4], split = " ")
 102     vecA <- as.numeric(list[[1]])
 103     vecC <- as.numeric(list[[2]])
 104     vecG <- as.numeric(list[[3]])
 105     vecT <- as.numeric(list[[4]])
 106     if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break
 107     peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecA[1]) * 100))
 108     peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecC[2]) * 100))
 109     peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecG[3]) * 100))
 110     peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, (1.0 - vecT[4]) * 100))
 111   }
 112
 113   x <- 1 : length(peA)
 114   matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Position vs. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error")
 115   legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
 116 }
 117
 118 close(con)
 119
 120 pair <- read.table(file = cntF, skip = 3, sep = "\t")
 121 plot(pair[,1], pair[,2], xlab = "Number of Alignments", ylab = "Number of Reads", main = "Among alignable reads, distribution of # of alignments")
 122
 123 dev.off()