From: Bo Li Date: Fri, 11 Mar 2011 04:00:40 +0000 (-0600) Subject: rename plotModel.R to rsem-plot-model X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=commitdiff_plain;h=029284bf74305b5540970f858d7194d05aac501b rename plotModel.R to rsem-plot-model --- diff --git a/README.md b/README.md index 0c74b6e..b95c4ea 100644 --- a/README.md +++ b/README.md @@ -125,11 +125,11 @@ Refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/hel #### c) Visualize the model learned by RSEM -RSEM provides an R script, plotModel.R, for visulazing the model learned. +RSEM provides an R script, 'rsem-plot-model', for visulazing the model learned. Usage: - plotModel.R modelF outF + rsem-plot-model modelF outF modelF: the sample_name.model file generated by RSEM outF: the file name for plots generated from the model. It is a pdf file @@ -138,7 +138,7 @@ The plots generated depends on read type and user configuration. It may include fragment length distribution, mate length distribution, read start position distribution (RSPD), quality score vs percentage of sequecing error given the reference base, position vs percentage of -sequencing errro given the reference base. +sequencing error given the reference base. ## Example diff --git a/plotModel.R b/plotModel.R deleted file mode 100755 index ce0a8dc..0000000 --- a/plotModel.R +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env Rscript - -argv <- commandArgs(TRUE) -if (length(argv) != 2) { - cat("Usage: plotModel.R modelF outF\n") - q(status = 1) -} - -con <- file(argv[1], open = "r") -pdf(argv[2]) - -# model type and forward probability -model_type <- as.numeric(readLines(con, n = 4)[1]) - -# fragment length distribution -strvec <- readLines(con, n = 3) -vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]]) -maxL <- vec[2] # maxL used for Profile -x <- (vec[1] + 1) : vec[2] -y <- as.numeric(strsplit(strvec[2], split = " ")[[1]]) -mean <- weighted.mean(x, y) -std <- sqrt(weighted.mean((x - mean)^2, y)) -plot(x, y, type = "h", main = "Fragment Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Fragment Length", ylab = "Probability") - -# mate length distribution -bval <- as.numeric(readLines(con, n = 1)[1]) -if (bval == 1) { - list <- strsplit(readLines(con, n = 2), split = " ") - vec <- as.numeric(list[[1]]) - maxL <- vec[2] - x <- (vec[1] + 1) : vec[2] - y <- as.numeric(list[[2]]) - mean <- weighted.mean(x, y) - std <- sqrt(weighted.mean((x - mean)^2, y)) - plot(x, y, type = "h", main = "Mate Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Mate Length", ylab = "Probability") -} -strvec <- readLines(con, n = 1) - -# RSPD -bval <- as.numeric(readLines(con, n = 1)[1]) -if (bval == 1) { - bin_size <- as.numeric(readLines(con, n = 1)[1]) - y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]]) - barplot(y, space = 0, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability") -} -strvec <- readLines(con, n = 1) - -# plot sequencing errors -if (model_type == 1 || model_type == 3) { - # skip QD - N <- as.numeric(readLines(con, n = 1)[1]) - readLines(con, n = N + 1) - readLines(con, n = 1) # for the blank line - - # QProfile - readLines(con, n = 1) - - peA <- c() # probability of sequencing error given reference base is A - peC <- c() - peG <- c() - peT <- c() - - for (i in 1 : N) { - strvec <- readLines(con, n = 6) - list <- strsplit(strvec[1:4], split = " ") - vecA <- as.numeric(list[[1]]) - vecC <- as.numeric(list[[2]]) - vecG <- as.numeric(list[[3]]) - vecT <- as.numeric(list[[4]]) - if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break - peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecA[1])) - peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecC[2])) - peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecG[3])) - peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecT[4])) - } - - x <- 0 : (length(peA) - 1) - matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, main = "Quality Score v.s. Percentage Sequence Error", xlab = "Quality Score", ylab = "Percentage of Sequencing Error") - legend("topright", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3) -} else { - # Profile - readLines(con, n = 1) - - peA <- c() # probability of sequencing error given reference base is A - peC <- c() - peG <- c() - peT <- c() - - for (i in 1: maxL) { - strvec <- readLines(con, n = 6) - list <- strsplit(strvec[1:4], split = " ") - vecA <- as.numeric(list[[1]]) - vecC <- as.numeric(list[[2]]) - vecG <- as.numeric(list[[3]]) - vecT <- as.numeric(list[[4]]) - if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break - peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecA[1])) - peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecC[2])) - peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecG[3])) - peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecT[4])) - } - - x <- 1 : length(peA) - matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, main = "Position v.s. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error") - legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3) -} - -dev.off() -close(con) diff --git a/rsem-plot-model b/rsem-plot-model new file mode 100755 index 0000000..aa2c724 --- /dev/null +++ b/rsem-plot-model @@ -0,0 +1,109 @@ +#!/usr/bin/env Rscript + +argv <- commandArgs(TRUE) +if (length(argv) != 2) { + cat("Usage: rsem-plot-model modelF outF\n") + q(status = 1) +} + +con <- file(argv[1], open = "r") +pdf(argv[2]) + +# model type and forward probability +model_type <- as.numeric(readLines(con, n = 4)[1]) + +# fragment length distribution +strvec <- readLines(con, n = 3) +vec <- as.numeric(strsplit(strvec[1], split = " ")[[1]]) +maxL <- vec[2] # maxL used for Profile +x <- (vec[1] + 1) : vec[2] +y <- as.numeric(strsplit(strvec[2], split = " ")[[1]]) +mean <- weighted.mean(x, y) +std <- sqrt(weighted.mean((x - mean)^2, y)) +plot(x, y, type = "h", main = "Fragment Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Fragment Length", ylab = "Probability") + +# mate length distribution +bval <- as.numeric(readLines(con, n = 1)[1]) +if (bval == 1) { + list <- strsplit(readLines(con, n = 2), split = " ") + vec <- as.numeric(list[[1]]) + maxL <- vec[2] + x <- (vec[1] + 1) : vec[2] + y <- as.numeric(list[[2]]) + mean <- weighted.mean(x, y) + std <- sqrt(weighted.mean((x - mean)^2, y)) + plot(x, y, type = "h", main = "Mate Length Distribution", sub = paste("Mean = ", mean, ", Std = ", std), xlab = "Mate Length", ylab = "Probability") +} +strvec <- readLines(con, n = 1) + +# RSPD +bval <- as.numeric(readLines(con, n = 1)[1]) +if (bval == 1) { + bin_size <- as.numeric(readLines(con, n = 1)[1]) + y <- as.numeric(strsplit(readLines(con, n = 1), split = " ")[[1]]) + barplot(y, space = 0, main = "Read Start Position Distribution", xlab = "Bin #", ylab = "Probability") +} +strvec <- readLines(con, n = 1) + +# plot sequencing errors +if (model_type == 1 || model_type == 3) { + # skip QD + N <- as.numeric(readLines(con, n = 1)[1]) + readLines(con, n = N + 1) + readLines(con, n = 1) # for the blank line + + # QProfile + readLines(con, n = 1) + + peA <- c() # probability of sequencing error given reference base is A + peC <- c() + peG <- c() + peT <- c() + + for (i in 1 : N) { + strvec <- readLines(con, n = 6) + list <- strsplit(strvec[1:4], split = " ") + vecA <- as.numeric(list[[1]]) + vecC <- as.numeric(list[[2]]) + vecG <- as.numeric(list[[3]]) + vecT <- as.numeric(list[[4]]) + if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break + peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecA[1])) + peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecC[2])) + peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecG[3])) + peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecT[4])) + } + + x <- 0 : (length(peA) - 1) + matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, main = "Quality Score v.s. Percentage Sequence Error", xlab = "Quality Score", ylab = "Percentage of Sequencing Error") + legend("topright", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3) +} else { + # Profile + readLines(con, n = 1) + + peA <- c() # probability of sequencing error given reference base is A + peC <- c() + peG <- c() + peT <- c() + + for (i in 1: maxL) { + strvec <- readLines(con, n = 6) + list <- strsplit(strvec[1:4], split = " ") + vecA <- as.numeric(list[[1]]) + vecC <- as.numeric(list[[2]]) + vecG <- as.numeric(list[[3]]) + vecT <- as.numeric(list[[4]]) + if (sum(c(vecA, vecC, vecG, vecT)) < 1e-8) break + peA <- c(peA, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecA[1])) + peC <- c(peC, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecC[2])) + peG <- c(peG, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecG[3])) + peT <- c(peT, ifelse(sum(vec) < 1e-8, NA, 1.0 - vecT[4])) + } + + x <- 1 : length(peA) + matplot(x, cbind(peA, peC, peG, peT), type = "b", lty = 1:4, pch = 0:3, main = "Position v.s. Percentage Sequence Error", xlab = "Position", ylab = "Percentage of Sequencing Error") + legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3) +} + +dev.off() +close(con)