From b2a4b4bf4ab67eda00742e53dda28a9b77239a26 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Sun, 2 Mar 2014 21:12:55 -0800 Subject: [PATCH] add functions --- DESCRIPTION | 5 +- R/bayesian_analysis.R | 453 +++++++++++++++++++++++++++++++++++++ R/bayesian_analysis_plot.R | 370 ++++++++++++++++++++++++++++++ R/ma_loess_norm.R | 86 +++++++ R/mva_plotting.R | 112 +++++++++ R/power_analysis.R | 76 +++++++ R/significant_cut_util.R | 10 + 7 files changed, 1111 insertions(+), 1 deletion(-) create mode 100644 R/bayesian_analysis.R create mode 100644 R/bayesian_analysis_plot.R create mode 100644 R/ma_loess_norm.R create mode 100644 R/mva_plotting.R create mode 100644 R/power_analysis.R create mode 100644 R/significant_cut_util.R diff --git a/DESCRIPTION b/DESCRIPTION index f170ab8..0c87e25 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,8 +3,11 @@ Type: Package Title: Don's Microarray Analysis Routines Version: 1.0 Date: 2014-02-28 +Depends: qvalue, hexbin, stats Author: Don Armstrong Maintainer: Don Armstrong Description: Routines which handle analyzing Affymetrix-style microarrays -License: GPL3+ +License: GPL (>= 3) +URL: http://www.donarmstrong.com/projects/don_microarray + diff --git a/R/bayesian_analysis.R b/R/bayesian_analysis.R new file mode 100644 index 0000000..438ee59 --- /dev/null +++ b/R/bayesian_analysis.R @@ -0,0 +1,453 @@ +# Copyright (C) 1999 Anthony D. Long, Ecology & Evolutionary Biology +# University of California, Irvine (tdlong@uci.edu) + +# Portions Copyright 2002,2003 Don Armstrong + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# $Id: bayesian_analysis.R,v 1.5 2006/07/12 03:27:03 don Exp $ + +pierre.pair.from.unpaired <- function(h,cs,ce,es,ee,end, + experror=0.05,winsize=101,conf=10,minrep=3,file="pierre.pair", + file.save=TRUE,returnh=FALSE,qvalues=TRUE) +{ + # Use the difference between experimental and control as the test statistic. + new.h <- h[,es:ee]-h[,cs:ce] + if (cs > 1){ + new.h <- cbind(h[,1:cs-1],new.h) + } + totalexpresscol <- apply(h[,c(cs:ce,es:ee)],1,function(x) {mean(log(x[cs:ce]),log(x[es:ee]))}); + new.h <- cbind(new.h,totalexpresscol) + if (end > ee){ + new.h <- cbind(new.h,h[,(ee+1):end]) + } + newer.h <- pierre.pair(new.h,cs,ce,(end-(ee-es)),ce+1,file=file,experror=experror, + winsize=winsize,conf=conf,minrep=minrep,file.save=file.save,returnh=T) + if (returnh){ + return(newer.h) + } +} + +pierre.pair <- function (h, cs, ce, end, totalexpresscol, + experror=0.05, winsize=101, conf=10, minrep=3, file="pierre.pair", + file.save=TRUE,returnh=FALSE,qvalues=TRUE) +{ +# note 'totalexpresscol' added April 18th 2001: +# this should be the column number of the column in "h[1:N]" that represents +# total expression for that gene. +# A good value for 'totalexpresscol' would be the mean of the log of the +# "total expression level" over both treatments (i.e., control and +# experimentals) and replicates, where the total expression level for each +# gene/treatment/replicate is given as a fraction of total expression over +# all genes for that treatment/replicate. + +# number of non-zero samples, their mean and st.dev., index_col, running average sd. + + S.N <- apply(h[, cs:ce], 1, function(x) sum(x != 0 & !is.na(x))) + S.mean <- apply(h[, cs:ce], 1, function(x) if (sum(x[x != + 0 & !is.na(x)] != 0)) + mean(x[x != 0 & !is.na(x)]) + else NA) + S.sd <- apply(h[, cs:ce], 1, function(x) if (sum(x[x != + 0 & !is.na(x)] != 0) >= minrep) + sqrt(var(x[x != 0 & !is.na(x)])) + else NA) + index.col <- h[,totalexpresscol] + +# running average standard deviation + xxx <- S.sd[!is.na(S.sd)][order(index.col[!is.na(S.sd)])] + xxx <- runavg(xxx, winsize) + xxx <- xxx[rank(index.col[!is.na(S.sd)])] + S.rasd <- rep(NA, nrow(h)) + S.rasd[!is.na(S.sd)] <- xxx + + h <- cbind(h, S.N, S.mean, S.sd, S.rasd) + colnames(h)[(end + 1):(end + 4)] <- c("N", "x", "sd", "rasd") + rm(S.N, S.mean, S.sd, S.rasd) + +# regular t-test, Bayesian variance, Bayesian t-test, p-reg, p-Bayes + # Regular t + temp1 <- sqrt(h[end + 1]) * (h[end + 2]/h[end + 3]) + # Baysian variance + temp2 <- sqrt((conf * (h[end + 4])^2 + (h[end + 1] - 1) * (h[end + 3])^2)/(conf + h[end + 1] - 2)) + # Baysian t-test + temp3 <- sqrt(h[end + 1]) * (h[end + 2]/temp2) +# modify d.f. to reflect effect of conf Jan. 10 / 2001 + # calc regular p (temp4 is the same paired t-test as in doitall.pair) + temp4 <- 1 - pf((temp1)^2, 1, h[, end + 1] - 1) + # and Bayesian-p. temp5 is a _paired_ t-test with a Bayesian correction. + # The degrees of freedom are h[,end+1] - 1 + conf - 1. Since it is a paired + # t-test the degrees of freedom are fewer than non-paired approaches!! + temp5 <- 1 - pf((temp3)^2, 1, h[, end + 1] + conf - 2) + h <- cbind(h, temp1, temp2, temp3, temp4, temp5) + rm(temp1, temp2, temp3, temp4, temp5) + colnames(h)[(end + 5):(end + 9)] <- c("reg-t", "Bay-sd", "Bay-t", "reg-p", "Bay-p") + + if (returnh & ! file.save){ + return(h) + } + write.table(h, file = paste(file,"_from_R.txt",sep=""), sep = "\t") + png(file = paste(file,"_%03d.png",sep=""), width = 1600, height = 1200) + plot(h[, end + 2], h[, end + 9], xlab = "difference in expression", + ylab = "p-value", main = "Does difference predict significance?", + abline(lm(h[, end + 9] ~ h[, end + 2]))) + graphics.off() + if (returnh){ + return(h) + } +} + +tstat.general <- function (n1, n2, x1, x2, sd1, sd2, + small=0, minrep=2) +{ + ## calculates and returns values associated with a t-test of the + ## difference between two means + ## x is the dataframe + ## nn1 and nn2 are the number of control and experimental observations respectively + ## xx1 and xx2 are the means of the C's and E's + ## vv1 and vv2 are the standard deviations + ## small is the smallest detectable "signal" + ## minrep defines the smallest number of replicates required to do a two sample t-test + ## this must be AT LEAST TWO otherwise the variance will not be claculated + ## depending on the number of repllicate observations different t-tests are carried out + + ## do the two sample t-test + if (n1 >= minrep & n2 >= minrep) { + tt <- -(x1 - x2)/ + sqrt((((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2)/ + (n1 + n2 - 2)) + * ((n1 + n2)/(n1 * n2))) + dft <- n1 + n2 - 2 + rvar <- max((sd1^2)/(sd2^2), (sd2^2)/(sd1^2)) + } + + ## do not do a t-test + if (n1 < minrep & n2 < minrep) { + tt <- NA + dft <- n1 + n2 - 2 + rvar <- NA + } + + ## do the one sample versus a constant t-test -- use the mean for + ## the constant if available, otherwise use "small" + if (n1 < minrep & n2 >= minrep) { + if(is.na(x1)){ + tt <- (x2 - small)/(sd2/sqrt(n2)) + } + else{ + tt <- (x2 - x1)/(sd2/sqrt(n2)) + } + dft <- n2 - 1 + rvar <- NA + } + if (n1 >= minrep & n2 < minrep) { + if(is.na(x2)){ + tt <- -(x1 - small)/(sd1/sqrt(n1)) + } + else{ + tt <- -(x1 - x2)/(sd1/sqrt(n1)) + } + dft <- n1 - 1 + rvar <- NA + } + return(list(t=tt, + df=dft, + rvar=rvar) + ) +} + + +microarray.bayesian <- + function (data, + winsize=101, conf=10, + minrep=3) { + # experror is the experiment wide probability of a false positive + # winsize control the degree of local averaging + # conf is the weighting of the Baysian prior relative to the observed within gene variance + # temp1 and 2 are the number of control and experimental observations + cont <- attr(data,"controls") + exp <- attr(data,"experiments") + data[data == 0] <- NA + ## temp1 and 2 + cont.nsamples <- apply(data[,cont],1,function(x){NROW(which(!is.na(x)))}) + exp.nsamples <- apply(data[,exp],1,function(x){NROW(which(!is.na(x)))}) + ## temp3 and 4 + cont.mean <- apply(data[,cont],1,function(x){mean(x,na.rm=TRUE)}) + exp.mean <- apply(data[, exp],1,function(x){mean(x,na.rm=TRUE)}) + ## temp5 and 6 + cont.sd <- apply(data[,cont],1,function(x){sd(x,na.rm=TRUE)}) + exp.sd <- apply(data[, exp],1,function(x){sd(x,na.rm=TRUE)}) + ## temp7 and 8 + cont.mean.log <- apply(data[,cont],1,function(x){mean(log(x),na.rm=TRUE)}) + exp.mean.log <- apply(data[, exp],1,function(x){mean(log(x),na.rm=TRUE)}) + # temp9 and 10 + cont.sd.log <- apply(data[,cont],1,function(x){sd(log(x),na.rm=TRUE)}) + exp.sd.log <- apply(data[, exp],1,function(x){sd(log(x),na.rm=TRUE)}) + ## temp11 and 12 are the running average standard deviations where + ## the running averages are based on "winsize" and the function + ## runavg. In short all the data within a treatment are sorted by + ## average expression level and the running average calculated on + ## the estimated st. dev. for each locus. All the wierd code is a + ## work around of missing values + cont.sd.runavg <- runavg(cont.sd[order(cont.mean)])[order(order(cont.mean))] + exp.sd.runavg <- runavg( exp.sd[order( exp.mean)])[order(order( exp.mean))] + ## xxx <- temp5[!is.na(temp5)][order(temp3[!is.na(temp5)])] + ## xxx <- runavg(xxx, winsize) + ## xxx <- xxx[rank(temp3[!is.na(temp5)])] + ## temp11 <- rep(NA, nrow(h)) + ## temp11[!is.na(temp5)] <- xxx + ## xxx <- temp6[!is.na(temp6)][order(temp4[!is.na(temp6)])] + ## xxx <- runavg(xxx, winsize) + ## xxx <- xxx[rank(temp4[!is.na(temp6)])] + ## temp12 <- rep(NA, nrow(h)) + ## temp12[!is.na(temp6)] <- xxx + cont.sd.log.runavg <- runavg(cont.sd.log[order(cont.mean.log)])[order(order(cont.mean.log))] + exp.sd.log.runavg <- runavg( exp.sd.log[order( exp.mean.log)])[order(order( exp.mean.log))] + ## xxx <- temp9[!is.na(temp9)][order(temp7[!is.na(temp9)])] + ## xxx <- runavg(xxx, winsize) + ## xxx <- xxx[rank(temp7[!is.na(temp9)])] + ## ## like 11 and 12 but for the log transofrmed data + ## temp13 <- rep(NA, nrow(h)) + ## temp13[!is.na(temp9)] <- xxx + ## xxx <- temp10[!is.na(temp10)][order(temp8[!is.na(temp10)])] + ## xxx <- runavg(xxx, winsize) + ## xxx <- xxx[rank(temp8[!is.na(temp10)])] + ## temp14 <- rep(NA, nrow(h)) + ## temp14[!is.na(temp10)] <- xxx + + ## now calculate the weighted average (Baysian) estimate of the + ## standard deviation + cont.sd.bayes <- sqrt((conf * cont.sd.runavg^2 + + (cont.nsamples - 1) * cont.sd^2)/ + (conf + cont.nsamples - 1)) + exp.sd.bayes <- sqrt((conf * exp.sd.runavg^2 + + (exp.nsamples - 1) * exp.sd^2)/ + (conf + exp.nsamples - 1)) + cont.sd.log.bayes <- sqrt((conf * cont.sd.log.runavg^2 + + (cont.nsamples - 1) * cont.sd.log^2)/ + (conf + cont.nsamples - 1)) + exp.sd.log.bayes <- sqrt((conf * exp.sd.log.runavg^2 + + (exp.nsamples - 1) * exp.sd.log^2)/ + (conf + exp.nsamples - 1)) + ## temp11 <- sqrt((conf * temp11^2 + (temp1 - 1) * temp5^2)/ + ## (conf + temp1 - 2)) + ## temp12 <- sqrt((conf * temp12^2 + (temp2 - 1) * temp6^2)/ + ## (conf + temp2 - 2)) + ## temp13 <- sqrt((conf * temp13^2 + (temp1 - 1) * temp9^2)/(conf + + ## temp1 - 2)) + ## temp14 <- sqrt((conf * temp14^2 + (temp2 - 1) * temp10^2)/(conf + + ## temp2 - 2) + cont.cols <- 1:NCOL(data[,cont]) + exp.cols <- cont.cols[length(cont.cols)]+1:NCOL(data[,exp]) + result <- cbind(data[,cont],data[,exp], + cont.nsamples, exp.nsamples, + cont.mean, exp.mean, + cont.sd, exp.sd, + cont.mean.log, exp.mean.log, + cont.sd.log, exp.sd.log, + cont.sd.bayes,exp.sd.bayes, + cont.sd.log.bayes,exp.sd.log.bayes + ) + colnames(result)[cont.cols] <- + sapply(colnames(result)[cont.cols], + function(x){ + paste(sep="",collapse="", + "cont.",x)}) + colnames(result)[exp.cols] <- + sapply(colnames(result)[exp.cols], + function(x){ + paste(sep="",collapse="", + "exp.",x)}) + colnames(result)[(NCOL(data[,cont|exp])+1):NCOL(result)] <- + c("controls.num","experiments.num", + "controls.mean","experiments.mean", + "controls.sd","experiments.sd", + "controls.mean.log","experiments.mean.log", + "controls.sd.log","experiments.sd.log", + "controls.sd.bayesian","experiments.sd.bayesian", + "controls.sd.log.bayesian","experiments.sd.log.bayesian" + ) + ## calculate the "lower bound" on the limit to detection the lower + ## bound is simply the 0.0025 quantile of the means + threshold <- c(quantile(cont.mean, 0.0025,na.rm=TRUE), + quantile(exp.mean, 0.0025,na.rm=TRUE) + ) + ## calculate statistics associated with the t-test for the raw data + ## (temp1) and log transformed data (temp2). The call to pierre uses + ## the st. dev.'s which incorporate the running average + temp1 <- apply(result, 1, function(x) + { + temp <- tstat.general(n1=x["controls.num"], n2=x["experiments.num"], + x1=x["controls.mean"], x2=x["experiments.mean"], + sd1=x["controls.sd.bayesian"], sd2=x["experiments.sd.bayesian"], + small=(exp(threshold[1]) + exp(threshold[2]))/2, + minrep=minrep) + return(c(temp$t,temp$df,temp$rvar)) + } + ) + temp2 <- apply(result, 1, function(x) + { + temp <- tstat.general(n1=x["controls.num"], n2=x["experiments.num"], + x1=x["controls.mean.log"], x2=x["experiments.mean.log"], + sd1=x["controls.sd.log.bayesian"], sd2=x["experiments.sd.log.bayesian"], + small=(exp(threshold[1]) + exp(threshold[2]))/2, + minrep=minrep) + return(c(temp$t,temp$df,temp$rvar)) + } + ) + result <- cbind(result, t(temp1), t(temp2)) + colnames(result)[(NCOL(result)-5):NCOL(result)] <- + c("t.bayes", "df.bayes", "vr.bayes", + "t.log.bayes", "df.log.bayes", "vr.log.bayes") + rm(temp1, temp2) + + ## p-values associated with the two t-stats change d.f. to + ## incorporate Bayesian Jan 10 / 2001 the calculations below are + ## both regular t-tests with a Bayesian correction. The last term + ## 'h[,end+16|19]' are degrees of freedom associated with the + ## regular t-test plus the '2*conf - 2' which is the d.f.'s + ## associated with the Bayes estimate + + ## p-raw + p.bayes <- 2*pt(-abs(result[,"t.bayes"]), result[,"df.bayes"] + 2*conf - 2) + ## p-log + p.log.bayes <- 2*pt(-abs(result[,"t.log.bayes"]), result[,"df.log.bayes"] + 2*conf - 2) + qvalue.p.bayes <- rep(NA,times=length(p.log.bayes)) + temp.qvalue <- qvalue(p=na.omit(p.bayes)) + if (class(temp.qvalue)=="qvalue") { + qvalue.p.bayes[!is.na(p.bayes)] <- temp.qvalue$qvalues + } + qvalue.p.log.bayes <- rep(NA,times=length(p.log.bayes)) + temp.qvalue <- qvalue(p=na.omit(p.log.bayes)) + if (class(temp.qvalue)=="qvalue") { + qvalue.p.log.bayes[!is.na(p.log.bayes)] <- temp.qvalue$qvalues + } + result <- cbind(result, + p.bayes, + p.log.bayes, + qvalue.p.bayes, + qvalue.p.log.bayes, + p.adjust(p.bayes,method="BH"), + p.adjust(p.log.bayes,method="BH") + ) + colnames(result)[(NCOL(result)-5):NCOL(result)] <- + c("p.bayes","p.log.bayes", + "fdr.q.bayes","fdr.q.log.bayes", + "fdr.bh.bayes","fdr.bh.log.bayes" + ) + + ## calculate Bonferroni threshold, also calculate fold expression + ## (signed) print a file of all genes that pass the Bonferroni + ## and print all file that gives the results for all genes + + ## we no longer bother to calculate a bonferoni correction; that's + ## the job of things following to do + + ## Bonf <- 1 - exp(log(1 - experror)/length(h[!is.na(h[, end + + ## 22]), end + 22])) + + fold.change <- exp.mean/cont.mean + fold.change[fold.change<1] <- - 1 / fold.change[fold.change<1] + ## temp1 <- -(h[, end + 3]/h[, end + 4]) * + ## ((h[, end + 3]/h[,end + 4]) >= 1) + + ## (h[, end + 4]/h[, end + 3]) * + ## ((h[,end + 3]/h[, end + 4]) < 1) + + ## calculate fold change stuff + ## thres <- (exp(threshold[1]) + exp(threshold[2]))/2 + ## I'm not sure if this correction by threshold is actually + ## necessary or a good idea. + ## temp2 <- is.na(temp1) * is.na(h[,end + 3]) * (h[,end+4]/thres) + ## temp3 <- is.na(temp1) * is.na(h[,end + 4]) * -(h[,end+3]/thres) + ## ttemp <- cbind(temp1,temp2,temp3) + ## rm(temp1,temp2,temp3) + ## temp4 <- apply(ttemp,1,function(x) sum(x,na.rm=TRUE)) + ## rm(ttemp) + result <- cbind(result, fold.change) + ## rm(temp4) + colnames(result)[NCOL(result)] <- "fold.change" + + ## calculate the t-test statistic, p-value and degrees of freedom + ## for raw data and log transformed data change the na.action to + ## na.omit to allow to continue calculating past bad values + + temp.t <- t(apply(data, 1, + function(x) { + if ((NROW(na.omit(x[cont])) < 2) || (NROW(na.omit(x[exp])) < 2)){ + return(c(NA,NA,NA)) + } + temp <- t.test(na.omit(x[cont]), + na.omit(x[exp]) + ) + c(temp$p.value,temp$statistic,temp$parameter) + })) + temp.logt <- t(apply(data, 1, + function(x) { + if ((NROW(na.omit(x[cont])) < 2) || (NROW(na.omit(x[exp]))<2)){ + return(c(NA,NA,NA)) + } + temp <- t.test(na.omit(log(x[cont])), + na.omit(log(x[exp])) + ) + c(temp$p.value,temp$statistic,temp$parameter) + })) + result <- cbind(result,temp.t,temp.logt) + colnames(result)[(NCOL(result)-5):NCOL(result)] <- c('t.test.p','t.test.t','t.test.df', + 't.test.p.log','t.test.t.log','t.test.df.log') + + ## calculate q values for the normal t test results + temp.t.test.q <- result[,"t.test.p"] + temp.t.test.q[!is.finite(temp.t.test.q)] <- NA + temp.qvalue <- qvalue(p=na.omit(temp.t.test.q)) + if (class(temp.qvalue)=="qvalue") + temp.t.test.q[!is.na(temp.t.test.q)] <- temp.qvalue$qvalues + temp.t.test.q.log <- result[,"t.test.p.log"] + temp.t.test.q.log[!is.finite(temp.t.test.q.log)] <- NA + temp.qvalue <- qvalue(p=na.omit(temp.t.test.q.log)) + if (class(temp.qvalue)=="qvalue") + temp.t.test.q.log[!is.na(temp.t.test.q.log)] <- temp.qvalue$qvalues + result <- cbind(result,temp.t.test.q, + temp.t.test.q.log, + p.adjust(result[,"t.test.p"],method="BH"), + p.adjust(result[,"t.test.p.log"],method="BH")) + colnames(result)[(NCOL(result)-3):NCOL(result)] <- + c('t.test.fdr.q','t.test.fdr.q.log', + 't.test.fdr.bh','t.test.fdr.bh.log') + attr(result,"controls") <- cont.cols + attr(result,"experiments") <- exp.cols + + return(result) +} + + +runavg <- function(x,k=1){ + if (k < 0) + stop("k must be greater than or equal to 0"); + if (k==0) + return(x) + n <- length(x) + r <- numeric(n) + # one side of the window + for (i in 1:n) { + if(is.na(x[i])) { + r[i] <- NA + } else { + j <- i - k + if (j < 1) + j <- 1 + l <- i + k + if (l > n) + l <- n + r[i] <- mean(x[j:l],na.rm=TRUE) + } + } + return(r) +} diff --git a/R/bayesian_analysis_plot.R b/R/bayesian_analysis_plot.R new file mode 100644 index 0000000..47dfd6a --- /dev/null +++ b/R/bayesian_analysis_plot.R @@ -0,0 +1,370 @@ +# Copyright (C) 1999 Anthony D. Long, Ecology & Evolutionary Biology +# University of California, Irvine (tdlong@uci.edu) + +# Portions Copyright 2002,2003 Don Armstrong + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# $Id: bayesian_analysis.R,v 1.5 2006/07/12 03:27:03 don Exp $ + + +microarray.bayesian.plot <- + function(data) { + + ## png(file = paste(file,"_%03d.png",sep=""), width = 1600, height = 1200) + plotting.function <- getFunction("plot") + text.function <- getFunction("text") + use.grid <- 0 + if (NROW(data)>1e4) + use.grid <- 1 + if (use.grid) { + nbins <- 80 + plotting.function <- + function(x,y, + xbins = nbins, shape = 1, + xbnds = range(x), ybnds = range(y), + xlab = NULL, ylab = NULL, IDs = FALSE,...) { + return(plot(hexbin(x,y, + xbins=xbins, + shape=shape, + xbnds=xbnds, + xlab=xlab, + ylab=ylab, + IDs=IDs), + colramp=BTY, + ...)) + } + text.function <- function(x,y,text) { + grid.text(text,x=unit(x,"native"),y=unit(y,"native")) + } + } + plotting.function(data[, "controls.mean"]/1e3, + data[, "experiments.mean"]/1e3, + xlab = expression(paste("Mean of Control Expression (",10**3,")")), + ylab = expression(paste("Mean of Experimental Expression (",10**3,")")), + main = "Mean expression level comparison" + ) + plotting.function(data[, "controls.mean.log"], + data[, "experiments.mean.log"], + xlab = "Mean of Controls", + ylab = "Mean of Experimentals", + main = "log transformed expression level comparison" + ) + ## controls mean v controls sd + temp.y <- data[,"controls.sd"]/1000 + temp.x <- data[, "controls.mean"]/1000 + lmline <- lm(temp.y~temp.x) + temp <- + plotting.function(data[, "controls.mean"]/1e3, + data[, "controls.sd"]/1e3, + xlab = expression(paste("Mean of Control Expression (",10**3,")")), + ylab = expression(paste("SD of Control Expression (",10**3,")")), + main = "Control SD as a function of Control Mean" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + } + else { + abline(lmline) + } + xlim <- range(data[,"controls.mean"]/1e3) + ylim <- range(data[,"controls.sd"]/1e3) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + if (use.grid) + popViewport(1) + + ## experiments mean v experiments sd + temp.y <- data[,"experiments.sd"]/1000 + temp.x <- data[,"experiments.mean"]/1000 + temp <- + plotting.function(data[, "experiments.mean"]/1e3, + data[, "experiments.sd"]/1e3, + xlab = expression(paste("Mean of Experimental Expression (",10**3,")")), + ylab = expression(paste( "SD of Experimental Expression (",10**3,")")), + main = "Experiment SD as a function of Experiment Mean" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + } + else { + abline(lmline) + } + xlim <- range(data[,"experiments.mean"]) + ylim <- range(data[,"experiments.sd"]) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + if (use.grid) + popViewport(1) + + ## controls log mean v controls log sd + lmline <- lm(data[,"controls.sd.log"]~data[, "controls.mean.log"]) + temp <- plotting.function(data[, "controls.mean.log"], + data[, "controls.sd.log"], + xlab = "Mean of log(Control Expression)", + ylab = "SD of log(Control Expression)", + main = "log(Control SD) as a function of log(Control Mean)" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + } + else { + abline(lmline) + } + xlim <- range(data[,"controls.mean.log"]) + ylim <- range(data[,"controls.sd.log"]) + print(xlim) + print(ylim) + print(summary(lmline)) + print(0.5*(xlim[2]-xlim[1])+xlim[1]) + print(0.9*(ylim[2]-ylim[1])+ylim[1]) + if (use.grid) + print(current.vpTree()) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + if (use.grid) + popViewport(1) + + ## experiments mean v experiments sd + lmline <- lm(data[,"experiments.sd.log"]~data[, "experiments.mean.log"]) + temp <- plotting.function(data[, "experiments.mean.log"], + data[, "experiments.sd.log"], + xlab = "Mean of log(Experiment Expression)", + ylab = "SD of log(Experiment Expression)", + main = "log(Experiment SD) as a function of log(Experiment Mean)" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + } + else { + abline(lmline) + } + xlim <- range(data[,"experiments.mean.log"]) + ylim <- range(data[,"experiments.sd.log"]) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + if (use.grid) + popViewport(1) + + ## mean expression vs significance + mean.expression <- apply(data[,c("experiments.mean","controls.mean")],1,mean) + lmline <- lm(data[,"t.test.p"] ~ + mean.expression) + temp <- plotting.function(mean.expression, + data[, "t.test.p"], + xlab = "Mean Expression", + ylab = "p value of T Test", + main = "Significance as a function of Mean Expression" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + panel.abline(h=0.05) + } + else { + abline(lmline) + abline(h=0.05) + } + xlim <- range(mean.expression) + ylim <- range(data[, "t.test.p"]) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + + if (use.grid) + popViewport(1) + + ## log transformed mean expression vs significance + mean.expression <- apply(data[,c("experiments.mean.log","controls.mean.log")],1,mean) + lmline <- lm(data[, "t.test.p.log"] ~ + mean.expression) + temp <- + plotting.function(mean.expression, + data[, "t.test.p.log"], + xlab = "Mean Expression", + ylab = "p value of T Test log(Expression)", + main = "Significance of log(Expression) as a function of Mean Expression" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + panel.abline(h=0.05) + } + else { + abline(lmline) + abline(h=0.05) + } + xlim <- range(mean.expression) + ylim <- range(data[, "t.test.p.log"]) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + if (use.grid) + popViewport(1) + + + + ## Fold change vs significance + lmline <- lm(data[abs(data[, "fold.change"]) < 20, "t.test.p"] ~ + abs(data[abs(data[, "fold.change"]) < 20, "fold.change"])) + temp <- plotting.function(abs(data[abs(data[,"fold.change"]) < 20, "fold.change"]), + data[abs(data[,"fold.change"]) < 20, "t.test.p"], + xlab = "Fold Change of Expression", + ylab = "p value of T Test", + main = "Significance as a function of Fold Change" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + panel.abline(h=0.05) + } + else { + abline(lmline) + abline(h=0.05) + } + xlim <- range(abs(data[abs(data[,"fold.change"]) < 20, "fold.change"])) + ylim <- range(data[abs(data[,"fold.change"]) < 20, "t.test.p"]) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + + if (use.grid) + popViewport(1) + + ## log transformed fold change vs significance + print("log transformed fold change vs significance") + temp.data <- na.omit(cbind(data[abs(data[, "fold.change"]) < 20, "t.test.p.log"], + abs(data[abs(data[, "fold.change"]) < 20, "fold.change"]))) + lmline <- lm(temp.data[,1]~temp.data[,2]) + temp <- + plotting.function(temp.data[,2],temp.data[,1], + xlab = "Fold Change of Expression", + ylab = "p value of T Test log(Expression)", + main = "Significance of log(Expression) as a function of Fold Change" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + panel.abline(h=0.05) + } + else { + abline(lmline) + abline(h=0.05) + } + xlim <- range(temp.data[,2]) + ylim <- range(temp.data[,1]) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + if (use.grid) + popViewport(1) + + ## bayesian vs t test + temp.data <- log(na.omit(cbind(data[,"t.test.p"], + data[,"p.bayes"]))) + lmline <- lm(temp.data[,1]~temp.data[,2]) + temp <- + plotting.function(temp.data[,2],temp.data[,1], + xlab = "log(p value) of Bayesian", + ylab = "log(p value) of t test", + main = "Correspondance of t-test and bayesian p values" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + } + else { + abline(lmline) + } + xlim <- range(temp.data[,2]) + ylim <- range(temp.data[,1]) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + + if (use.grid) + popViewport(1) + + ## log(bayesian) vs t test + print("log(bayesian) vs t test") + temp.data <- log(na.omit(cbind(data[,"t.test.p.log"], + data[,"p.log.bayes"]))) + lmline <- lm(temp.data[,1]~temp.data[,2]) + temp <- + plotting.function(temp.data[,2],temp.data[,1], + xlab = "log(p value) of Bayesian (log transformed)", + ylab = "log(p value) of t test (log transformed)", + main = "Correspondance of t-test and bayesian p values of log transformed values" + ) + if (use.grid) { + pushHexport(temp$plot.vp) + panel.abline(lmline) + } + else { + abline(lmline) + } + xlim <- range(temp.data[,2]) + ylim <- range(temp.data[,1]) + text.function(0.5*(xlim[2]-xlim[1])+xlim[1], + 0.9*(ylim[2]-ylim[1])+ylim[1], + substitute(r**2 == y, + list(y=format(summary(lmline)$r.squared, + digits=4))) + ) + + if (use.grid) + popViewport(1) + + + ## histograms + print("histograms") + plot(hist(data[,"t.test.p"],breaks=40,plot=FALSE),main="Histogram of T Test p values") + plot(hist(data[,"t.test.p.log"],breaks=40,plot=FALSE),main="Histogram of T Test p values for log transformed data") + plot(hist(data[,"p.bayes"],breaks=40,plot=FALSE),main="Histogram of Bayesian p values") + plot(hist(data[,"p.log.bayes"],breaks=40,plot=FALSE),main="Histogram of Bayesian p values for log transformed data") + + } + diff --git a/R/ma_loess_norm.R b/R/ma_loess_norm.R new file mode 100644 index 0000000..250fb81 --- /dev/null +++ b/R/ma_loess_norm.R @@ -0,0 +1,86 @@ +library(stats) + + +# the appropriate method is to average over the \hat{M}'s to find the +# average fit, and then use that to correct the data. + +# ma.loessnorm <- function(array,iterations=2){ +# n <- ncol(array) +# for (q in 1:iterations){ +# bar.m.hat.array <- array(0,dim=dim(array)) +# for (i in 1:(n-1)){ +# for (j in (i+1):n){ +# print(paste(i,j)) +# # fit lowess +# # [we have to use loess so we can predict using predict.loess +# mva <- cbind(log2(array[,i]/array[,j]),0.5*log2(array[,i]*array[,j])) +# colnames(mva) <- c("m","a") +# mva <- data.frame(na.omit(mva)) +# temp.loess <- loess(mva[,"m"]~mva[,"a"],control = loess.control(surface = "direct")) +# # predict for i, predict for j (\hat{M}) +# # subtract out \hat{M} from M +# m.hat <- predict(temp.loess,mva[,"a"]) +# # weight correction by the inverse of 2(n-1) . [so that there will be n adjustments] +# bar.m.hat.array[,i] <- bar.m.hat.array[,i]+m.hat/(n-1) +# bar.m.hat.array[,j] <- bar.m.hat.array[,j]+m.hat/(n-1) +# print(paste(i,j)) +# } +# } +# # correct arrays +# result.array <- array(0,dim=dim(array)) +# i <- 1 +# for (j in (i+1):n){ +# mva <- cbind(log2(array[,i]/array[,j]),0.5*log2(array[,i]*array[,j])) +# colnames(mva) <- c("m","a") +# if (j == 2){ +# # correct Mi the first time through +# result.array[,i] <- 2^(mva[,"a"]+0.5*(mva[,"m"]-bar.m.hat.array[,i])) +# } +# result.array[,j] <- 2^(mva[,"a"]-0.5*(mva[,"m"]-bar.m.hat.array[,i])) +# } +# rownames(result.array) <- rownames(array) +# colnames(result.array) <- colnames(array) +# array <- result.array +# } +# return(array) +# } + + +ma.loessnorm2 <- function(array,iterations=2,small.positive=0.1,print.status=F){ + n <- ncol(array) + # strip na values. + array <- na.omit(array) + for (q in 1:iterations){ + if (print.status){ + print(q) + } + result.array <- array(0,dim=dim(array)) + for (i in 1:(n-1)){ + for (j in (i+1):n){ + #print(paste(i,j)) + # replace 0's or negative values with a small positive value + array[array[,i]<=0,i] <- small.positive + array[array[,j]<=0,j] <- small.positive + # fit lowess + # [we have to use loess so we can predict using predict.loess + mva <- cbind(log2(array[,i]/array[,j]),0.5*log2(array[,i]*array[,j])) + colnames(mva) <- c("m","a") + mva <- data.frame(mva) + #temp.loess <- loess(m~a,mva,control = loess.control(surface = "direct")) + temp.loess <- loess(m~a,mva,control=loess.control(surface="interpolate",statistics="approximate", + trace.hat="approximate",iterations=2)) + # predict for i, predict for j (\hat{M}) + # subtract out \hat{M} from M + m.hat <- predict(temp.loess,mva[,"a"]) + # weight correction by the inverse of 2(n-1) . [so that there will be n adjustments] + result.array[,i] <- result.array[,i]+2^(mva[,"a"]+0.5*(mva[,"m"]-m.hat))/(n-1) + result.array[,j] <- result.array[,j]+2^(mva[,"a"]-0.5*(mva[,"m"]-m.hat))/(n-1) + } + } + # correct arrays + rownames(result.array) <- rownames(array) + colnames(result.array) <- colnames(array) + array <- result.array + } + return(array) +} diff --git a/R/mva_plotting.R b/R/mva_plotting.R new file mode 100644 index 0000000..f40be20 --- /dev/null +++ b/R/mva_plotting.R @@ -0,0 +1,112 @@ +# MvA plotting for microarray data + + +# Copyright 2003 By Don Armstrong + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. + +# $Id: mva_plotting.R,v 1.5 2006/06/10 04:14:45 don Exp $ + + +# mva.plot +# Plots a pairwise M versus A plot with a loess fit line +mva.plot.pair <- function(array,title=F,small.positive=0.1){ + size <- ncol(array) + npair <- size*(size-1)/2 + height <- min(round((npair*3/4)^0.5),3) + width <- min(ceiling(npair/height),3) + ## print(c("width",width,"height",height)) + def.par <- par(no.readonly = TRUE) + use.grid <- 0 + if (NROW(array[,1]) > 1e4) { + use.grid <- 1 + } + if (use.grid) { + mva.layout <- grid.layout(width,height) + pushViewport(viewport(layout=mva.layout)) + } + else { + par(mfcol=c(width,height),ps=9,cex.main=1) + } + # we could also use combn here instead + num.plot <- 0 + for(a in 1:(size-1)){ + for(b in (a+1):size){ +# par(mfg=c(a,b-1)) + num.plot <- num.plot + 1 + array[array[,a]<=0,a] <- small.positive + array[array[,b]<=0,b] <- small.positive + A <- 0.5*log2(array[,a]*array[,b]) + M <- log2(array[,a]/array[,b]) + temp.frame <- data.frame(cbind(A,M)) + temp.frame <- temp.frame[abs(temp.frame$M) < 7.8 & temp.frame$A < 15.8 & temp.frame$A > 0,] + if (use.grid) { + ## print(c("num.plot",num.plot)) + ## print(c("row",floor((num.plot -1)/ width) %% height)) + ## print(c("col",(num.plot-1) %% width)) + if (num.plot > 1 && floor((num.plot -1)/ width) %% height == 0 && (num.plot-1) %% width == 0) { + popViewport(1) + grid.text(title,x=unit(0.0,"npc")+unit(1,"lines"),y=unit(0.5,"npc"),just="center",rot=90) + grid.newpage() + pushViewport(viewport(layout=mva.layout)) + } + ## print(floor((num.plot -1)/ width) %% height+1) + ## print((num.plot - 1)%% width+1) + pushViewport(viewport(layout.pos.row=(floor((num.plot -1)/ width) %% height)+1, layout.pos.col=((num.plot - 1)%% width)+1)) + hb <- hexbin(temp.frame, + ##xlab="A",ylab="M", + ##ybnds = c(-8,8), + ##xbnds = c(0,16) + ); + # P <- plot(hb,type="n", + # ## main=paste(colnames(array)[a],colnames(array)[b],sep=" vs. "), + # xlab="",ylab="",newpage=FALSE,legend=FALSE) + hb.viewport <- hexViewport(hb,mar=unit(0.1,"npc")) + pushHexport(hb.viewport) + grid.hexagons(hb,style="colorscale",border=gray(.1),pen=gray(.6),minarea=.1,maxarea=1.5,colramp=BTY) + grid.xaxis() + grid.yaxis() + temp.loess <- loess.smooth(A,M,family="gaussian", + control=loess.control(surface="interpolate",statistics="approximate", + trace.hat="approximate",iterations=2,cell=0.6)) + panel.loess(A,M, + col="red",lwd=2) + popViewport(1) + ## grid.rect() + grid.text(paste(colnames(array)[a],colnames(array)[b],sep=" vs. "),y=unit(0.9,"npc"),x=unit(0.5,"npc"),just="center") + popViewport(1) +# grid.lines() + } + else { + plot(A,M,pch=".",main=paste(colnames(array)[a],colnames(array)[b],sep=" vs. "),xlab="A",ylab="M",ylim=c(-8,8),xlim=c(0,16)) + lines(loess.smooth(A,M,family="gaussian", + control=loess.control(surface="interpolate",statistics="approximate", + trace.hat="approximate",iterations=2,cell=0.6)), + col="blue",lwd=2) + } + ## abline(h=0) + } + } + if (use.grid) { + popViewport(1) + grid.text(title,x=unit(0.0,"npc")+unit(1,"lines"),y=unit(0.5,"npc"),just="center",rot=90) + ## grid.newpage() + } + else { + title(title,outer=TRUE) + par(def.par) + } +} diff --git a/R/power_analysis.R b/R/power_analysis.R new file mode 100644 index 0000000..3598910 --- /dev/null +++ b/R/power_analysis.R @@ -0,0 +1,76 @@ +# Power analysis for microarray data + + +# Copyright 2003 By Don Armstrong + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. + + + +# power.ma.partition +# calcualate the power for data split into paritions of 200 genes. +power.ma.partition <- function(dataset,partition=200,n=7,delta.mul=1,sig.level=0.05){ + power.ma.temp <- dataset[names(sort(apply(dataset,1,mean))),] + if (partition>NROW(power.ma.temp)){ + partition <- NROW(power.ma.temp) + } + power.ma.p.high <- NROW(power.ma.temp) + power.ma.return <- c() + for (power.ma.p.low in ceiling(seq(NROW(power.ma.temp),0,length=partition))){ + if(power.ma.p.low == power.ma.p.high){ + next; + } + power.ma.p.sd <- mean(apply(power.ma.temp[power.ma.p.low:power.ma.p.high,],1,sd,na.rm=T)) + power.ma.p.mean <- mean(apply(power.ma.temp[power.ma.p.low:power.ma.p.high,],1,mean,na.rm=T)) + power.ma.p.power <- power.t.test(n=n,sd=power.ma.p.sd,delta=power.ma.p.mean*delta.mul,sig.level=0.05) + power.ma.p.power <- power.ma.p.power$power + power.ma.return <- rbind(cbind(power.ma.p.mean,power.ma.p.power,power.ma.p.sd),power.ma.return) + power.ma.p.high <- power.ma.p.low -1; + } + + return(power.ma.return) +} + +# power.ma +# calculate the power of individual genes +power.ma <- function(dataset,n=7,delta.mul=1,sig.level=0.05){ + power.ma.p.sd <- apply(dataset,1,sd,na.rm=T) + power.ma.p.mean <- apply(dataset,1,mean,na.rm=T) + power.ma.p.return <- cbind(power.ma.p.mean,power.ma.p.sd) + power.ma.p.power.calc <- function(sdmeanarray,n=7,delta.mul=1,sig.level=0.05){ + power.ma.p.power.calc.power <- power.t.test(n=n,sd=sdmeanarray[2],delta=sdmeanarray[1]*delta.mul,sig.level=sig.level) + return(power.ma.p.power.calc.power$power) + } + power.ma.p.power <- apply(power.ma.p.return,1,power.ma.p.power.calc,delta.mul=delta.mul,sig.level=sig.level,n=n) + power.ma.p.return <- cbind(power.ma.p.return,power.ma.p.power) + + return(power.ma.p.return) +} + +power.ma.sdmean <- function(dataset,n=7,delta.mul=1,sig.level=0.05){ + power.ma.p.mean <- dataset[,1] + power.ma.p.sd <- dataset[,2] + power.ma.p.return <- cbind(power.ma.p.mean,power.ma.p.sd) + power.ma.p.power.calc <- function(sdmeanarray,n=7,delta.mul=1,sig.level=0.05){ + power.ma.p.power.calc.power <- power.t.test(n=n,sd=sdmeanarray[2],delta=sdmeanarray[1]*delta.mul,sig.level=sig.level) + return(power.ma.p.power.calc.power$power) + } + power.ma.p.power <- apply(power.ma.p.return,1,power.ma.p.power.calc,delta.mul=delta.mul,sig.level=sig.level,n=n) + power.ma.p.return <- cbind(power.ma.p.return,power.ma.p.power) + + return(power.ma.p.return) +} + diff --git a/R/significant_cut_util.R b/R/significant_cut_util.R new file mode 100644 index 0000000..523d1c3 --- /dev/null +++ b/R/significant_cut_util.R @@ -0,0 +1,10 @@ +significant.cut.util <- + function(significant.cuts,experiments,cut="complete.cut") { + result <- NULL + for (experiment in experiments) { + result <- cbind(result, + significant.cuts[[experiment]][[cut]]) + } + colnames(result) <- experiments + return(result) + } -- 2.39.2