From: Bo Li Date: Fri, 11 Mar 2011 03:25:54 +0000 (-0600) Subject: Enable RSEM to visualize the model learned from data X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=18c158c73b8f7f0afe4db97ac51b520149d33c12 Enable RSEM to visualize the model learned from data --- diff --git a/README.md b/README.md index 0f4d982..0c900b7 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,23 @@ wiggle_name: the name the user wants to use for this wiggle plot Refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html). +#### c) Visualize the model learned by RSEM + +RSEM provides an R script, plotModel.R, for visulazing the model learned. + +Usage: + + plotModel.R modelF outF + +modelF: the sample_name.model file generated by RSEM +outF: the file name for plots generated from the model. It is a pdf file + +The plots generated depends on read type and user configuration. It +may include fragment length distribution, mate length distribution, +read start position distribution (RSPD), quality score vs percentage +of sequecing error given the reference base, position vs percentage of +sequencing errro given the reference base. + ## Example Suppose we download the mouse genome from UCSC Genome Browser. We will diff --git a/plotModel.R b/plotModel.R new file mode 100755 index 0000000..ce0a8dc --- /dev/null +++ b/plotModel.R @@ -0,0 +1,109 @@ +#!/usr/bin/env Rscript + +argv <- commandArgs(TRUE) +if (length(argv) != 2) { + cat("Usage: plotModel.R modelF outF\n") + q(status = 1) +} + +con <- file(argv[1], open = "r") +pdf(argv[2]) + +# model type and forward probability +model_type <- as.numeric(readLines(con, n = 4)[1]) + +# fragment length distribution +strvec <- readLines(con, n = 3) +vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]]) +maxL <- vec[2] # maxL used for Profile +x <- (vec[1] + 1) : vec[2] +y <- as.numeric(strsplit(strvec[2], split = " ")[[1]]) +mean <- weighted.mean(x, y) +std <- sqrt(weighted.mean((x - mean)^2, y)) +plot(x, y, type = "h", main = "Fragment Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Fragment Length", ylab = "Probability") + +# mate length distribution +bval <- as.numeric(readLines(con, n = 1)[1]) +if (bval == 1) { + list <- strsplit(readLines(con, n = 2), split = " ") + vec <- as.numeric(list[[1]]) + maxL <- vec[2] + x <- (vec[1] + 1) : vec[2] + y <- as.numeric(list[[2]]) + mean <- weighted.mean(x, y) + std <- sqrt(weighted.mean((x - mean)^2, y)) + plot(x, y, type = "h", main = "Mate Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Mate Length", ylab = "Probability") +} +strvec <- readLines(con, n = 1) + +# RSPD +bval <- as.numeric(readLines(con, n = 1)[1]) +if (bval == 1) { + bin_size <- as.numeric(readLines(con, n = 1)[1]) + y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]]) + barplot(y, space = 0, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability") +} +strvec <- readLines(con, n = 1) + +# plot sequencing errors +if (model_type == 1 || model_type == 3) { + # skip QD + N <- as.numeric(readLines(con, n = 1)[1]) + readLines(con, n = N + 1) + readLines(con, n = 1) # for the blank line + + # QProfile + readLines(con, n = 1) + + peA <- c() # probability of sequencing error given reference base is A + peC <- c() + peG <- c() + peT <- c() + + for (i in 1 : N) { + strvec <- readLines(con, n = 6) + list <- strsplit(strvec[1:4], split = " ") + vecA <- as.numeric(list[[1]]) + vecC <- as.numeric(list[[2]]) + vecG <- as.numeric(list[[3]]) + vecT <- as.numeric(list[[4]]) + if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break + peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecA[1])) + peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecC[2])) + peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecG[3])) + peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecT[4])) + } + + x <- 0 : (length(peA) - 1) + matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, main = "Quality Score v.s. Percentage Sequence Error", xlab = "Quality Score", ylab = "Percentage of Sequencing Error") + legend("topright", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3) +} else { + # Profile + readLines(con, n = 1) + + peA <- c() # probability of sequencing error given reference base is A + peC <- c() + peG <- c() + peT <- c() + + for (i in 1: maxL) { + strvec <- readLines(con, n = 6) + list <- strsplit(strvec[1:4], split = " ") + vecA <- as.numeric(list[[1]]) + vecC <- as.numeric(list[[2]]) + vecG <- as.numeric(list[[3]]) + vecT <- as.numeric(list[[4]]) + if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break + peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecA[1])) + peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecC[2])) + peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecG[3])) + peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecT[4])) + } + + x <- 1 : length(peA) + matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, main = "Position v.s. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error") + legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3) +} + +dev.off() +close(con)