rsem-plot-model

   1 #!/usr/bin/env Rscript
   2
   3 argv <- commandArgs(TRUE)
   4 if (length(argv) != 2) {
   5   cat("Usage: rsem-plot-model sample_name output_plot_file\n")
   6   q(status = 1)
   7 }
   8
   9 strvec <- strsplit(argv[1], split = "/")[[1]]
  10 token <- strvec[length(strvec)]
  11
  12 stat.dir <- paste(argv[1], ".stat", sep = "")
  13 if (!file.exists(stat.dir)) {
  14   cat("Error: directory does not exist: ", stat.dir, "\n", sep = "")
  15   cat(strwrap("This version of rsem-plot-model only works with the output of RSEM versions >= 1.1.8"), sep="\n")
  16   q(status = 1)
  17 }
  18 modelF <- paste(stat.dir, "/", token, ".model", sep = "")
  19 cntF <- paste(stat.dir, "/", token, ".cnt", sep = "")
  20
  21 pdf(argv[2])
  22
  23 con <- file(modelF, open = "r")
  24
  25 # model type and forward probability
  26 model_type <- as.numeric(readLines(con, n = 4)[1])
  27
  28 # fragment length distribution
  29 strvec <- readLines(con, n = 3)
  30 vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]])
  31 maxL <- vec[2] # maxL used for Profile
  32 x <- (vec[1] + 1) : vec[2]
  33 y <- as.numeric(strsplit(strvec[2], split = " ")[[1]])
  34 mean <- weighted.mean(x, y)
  35 std <- sqrt(weighted.mean((x - mean)^2, y))
  36 plot(x, y, type = "h",
  37      main = "Fragment Length Distribution",
  38      sub = paste("Mean = ", round(mean, 1), ", Std = ", round(std, 1)),
  39      xlab = "Fragment Length",
  40      ylab = "Probability")
  41
  42 # mate length distribution
  43 if (model_type == 0 || model_type == 1) bval <- as.numeric(readLines(con, n = 1)[1]) else bval <- 1
  44
  45 if (bval == 1) {
  46   list <- strsplit(readLines(con, n = 2), split = " ")
  47   vec <- as.numeric(list[[1]])
  48   maxL <- vec[2]
  49   x <- (vec[1] + 1) : vec[2]
  50   y <- as.numeric(list[[2]])
  51   mean <- weighted.mean(x, y)
  52   std <- sqrt(weighted.mean((x - mean)^2, y))
  53   plot(x, y, type = "h",
  54        main = "Read Length Distribution",
  55        sub=paste("Mean = ", round(mean, 1), ", Std = ", round(std, 1)),
  56        xlab = "Read Length",
  57        ylab = "Probability")
  58 }
  59 strvec <- readLines(con, n = 1)
  60
  61 # RSPD
  62 bval <- as.numeric(readLines(con, n = 1)[1])
  63 if (bval == 1) {
  64   bin_size <- as.numeric(readLines(con, n = 1)[1])
  65   y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]])
  66   par(cex.axis = 0.7)
  67   barplot(y, space = 0, names.arg = 1:bin_size, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability")
  68   par(cex.axis = 1.0)
  69 }
  70 strvec <- readLines(con, n = 1)
  71
  72 # plot sequencing errors
  73 if (model_type == 1 || model_type == 3) {
  74   # skip QD
  75   N <- as.numeric(readLines(con, n = 1)[1])
  76   readLines(con, n = N + 1)
  77   readLines(con, n = 1) # for the blank line
  78
  79   # QProfile
  80   readLines(con, n = 1)
  81
  82   x <- c()
  83   peA <- c() # probability of sequencing error given reference base is A
  84   peC <- c()
  85   peG <- c()
  86   peT <- c()
  87
  88   for (i in 1 : N) {
  89     strvec <- readLines(con, n = 6)
  90     list <- strsplit(strvec[1:4], split = " ")
  91
  92     vecA <- as.numeric(list[[1]])
  93     vecC <- as.numeric(list[[2]])
  94     vecG <- as.numeric(list[[3]])
  95     vecT <- as.numeric(list[[4]])
  96
  97     if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) next
  98     x <- c(x, (i - 1))
  99     peA <- c(peA, ifelse(sum(vecA) < 1e-8, NA, -10 * log10(1.0 - vecA[1])))
 100     peC <- c(peC, ifelse(sum(vecC) < 1e-8, NA, -10 * log10(1.0 - vecC[2])))
 101     peG <- c(peG, ifelse(sum(vecG) < 1e-8, NA, -10 * log10(1.0 - vecG[3])))
 102     peT <- c(peT, ifelse(sum(vecT) < 1e-8, NA, -10 * log10(1.0 - vecT[4])))
 103   }
 104
 105   matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4,
 106           main = "Observed Quality vs. Phred Quality Score",
 107           xlab = "Phred Quality Score",
 108           ylab = "Observed Quality")
 109   legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
 110 } else {
 111   # Profile
 112   readLines(con, n = 1)
 113
 114   x <- c()
 115   peA <- c() # probability of sequencing error given reference base is A
 116   peC <- c()
 117   peG <- c()
 118   peT <- c()
 119
 120   for (i in 1: maxL) {
 121     strvec <- readLines(con, n = 6)
 122     list <- strsplit(strvec[1:4], split = " ")
 123
 124     vecA <- as.numeric(list[[1]])
 125     vecC <- as.numeric(list[[2]])
 126     vecG <- as.numeric(list[[3]])
 127     vecT <- as.numeric(list[[4]])
 128
 129     if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) next
 130     x <- c(x, i)
 131     peA <- c(peA, ifelse(sum(vecA) < 1e-8, NA, (1.0 - vecA[1]) * 100))
 132     peC <- c(peC, ifelse(sum(vecC) < 1e-8, NA, (1.0 - vecC[2]) * 100))
 133     peG <- c(peG, ifelse(sum(vecG) < 1e-8, NA, (1.0 - vecG[3]) * 100))
 134     peT <- c(peT, ifelse(sum(vecT) < 1e-8, NA, (1.0 - vecT[4]) * 100))
 135   }
 136
 137   matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, col = 1:4, main = "Position vs. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error")
 138   legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
 139 }
 140
 141 close(con)
 142
 143 pair <- read.table(file = cntF, skip = 3, sep = "\t")
 144 barplot(pair[,2], names.arg = pair[,1],
 145         xlab = "Alignments per fragment",
 146         ylab = "Number of fragments",
 147         main = "Histogram of Alignments per Fragment")
 148
 149 dev.off()