# Copyright (C) 1999 Anthony D. Long, Ecology & Evolutionary Biology # University of California, Irvine (tdlong@uci.edu) # Portions Copyright 2002,2003 Don Armstrong # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # $Id: bayesian_analysis.R,v 1.5 2006/07/12 03:27:03 don Exp $ pierre.pair.from.unpaired <- function(h,cs,ce,es,ee,end, experror=0.05,winsize=101,conf=10,minrep=3,file="pierre.pair", file.save=TRUE,returnh=FALSE,qvalues=TRUE) { # Use the difference between experimental and control as the test statistic. new.h <- h[,es:ee]-h[,cs:ce] if (cs > 1){ new.h <- cbind(h[,1:cs-1],new.h) } totalexpresscol <- apply(h[,c(cs:ce,es:ee)],1,function(x) {mean(log(x[cs:ce]),log(x[es:ee]))}); new.h <- cbind(new.h,totalexpresscol) if (end > ee){ new.h <- cbind(new.h,h[,(ee+1):end]) } newer.h <- pierre.pair(new.h,cs,ce,(end-(ee-es)),ce+1,file=file,experror=experror, winsize=winsize,conf=conf,minrep=minrep,file.save=file.save,returnh=T) if (returnh){ return(newer.h) } } pierre.pair <- function (h, cs, ce, end, totalexpresscol, experror=0.05, winsize=101, conf=10, minrep=3, file="pierre.pair", file.save=TRUE,returnh=FALSE,qvalues=TRUE) { # note 'totalexpresscol' added April 18th 2001: # this should be the column number of the column in "h[1:N]" that represents # total expression for that gene. # A good value for 'totalexpresscol' would be the mean of the log of the # "total expression level" over both treatments (i.e., control and # experimentals) and replicates, where the total expression level for each # gene/treatment/replicate is given as a fraction of total expression over # all genes for that treatment/replicate. # number of non-zero samples, their mean and st.dev., index_col, running average sd. S.N <- apply(h[, cs:ce], 1, function(x) sum(x != 0 & !is.na(x))) S.mean <- apply(h[, cs:ce], 1, function(x) if (sum(x[x != 0 & !is.na(x)] != 0)) mean(x[x != 0 & !is.na(x)]) else NA) S.sd <- apply(h[, cs:ce], 1, function(x) if (sum(x[x != 0 & !is.na(x)] != 0) >= minrep) sqrt(var(x[x != 0 & !is.na(x)])) else NA) index.col <- h[,totalexpresscol] # running average standard deviation xxx <- S.sd[!is.na(S.sd)][order(index.col[!is.na(S.sd)])] xxx <- runavg(xxx, winsize) xxx <- xxx[rank(index.col[!is.na(S.sd)])] S.rasd <- rep(NA, nrow(h)) S.rasd[!is.na(S.sd)] <- xxx h <- cbind(h, S.N, S.mean, S.sd, S.rasd) colnames(h)[(end + 1):(end + 4)] <- c("N", "x", "sd", "rasd") rm(S.N, S.mean, S.sd, S.rasd) # regular t-test, Bayesian variance, Bayesian t-test, p-reg, p-Bayes # Regular t temp1 <- sqrt(h[end + 1]) * (h[end + 2]/h[end + 3]) # Baysian variance temp2 <- sqrt((conf * (h[end + 4])^2 + (h[end + 1] - 1) * (h[end + 3])^2)/(conf + h[end + 1] - 2)) # Baysian t-test temp3 <- sqrt(h[end + 1]) * (h[end + 2]/temp2) # modify d.f. to reflect effect of conf Jan. 10 / 2001 # calc regular p (temp4 is the same paired t-test as in doitall.pair) temp4 <- 1 - pf((temp1)^2, 1, h[, end + 1] - 1) # and Bayesian-p. temp5 is a _paired_ t-test with a Bayesian correction. # The degrees of freedom are h[,end+1] - 1 + conf - 1. Since it is a paired # t-test the degrees of freedom are fewer than non-paired approaches!! temp5 <- 1 - pf((temp3)^2, 1, h[, end + 1] + conf - 2) h <- cbind(h, temp1, temp2, temp3, temp4, temp5) rm(temp1, temp2, temp3, temp4, temp5) colnames(h)[(end + 5):(end + 9)] <- c("reg-t", "Bay-sd", "Bay-t", "reg-p", "Bay-p") if (returnh & ! file.save){ return(h) } write.table(h, file = paste(file,"_from_R.txt",sep=""), sep = "\t") png(file = paste(file,"_%03d.png",sep=""), width = 1600, height = 1200) plot(h[, end + 2], h[, end + 9], xlab = "difference in expression", ylab = "p-value", main = "Does difference predict significance?", abline(lm(h[, end + 9] ~ h[, end + 2]))) graphics.off() if (returnh){ return(h) } } tstat.general <- function (n1, n2, x1, x2, sd1, sd2, small=0, minrep=2) { ## calculates and returns values associated with a t-test of the ## difference between two means ## x is the dataframe ## nn1 and nn2 are the number of control and experimental observations respectively ## xx1 and xx2 are the means of the C's and E's ## vv1 and vv2 are the standard deviations ## small is the smallest detectable "signal" ## minrep defines the smallest number of replicates required to do a two sample t-test ## this must be AT LEAST TWO otherwise the variance will not be claculated ## depending on the number of repllicate observations different t-tests are carried out ## do the two sample t-test if (n1 >= minrep & n2 >= minrep) { tt <- -(x1 - x2)/ sqrt((((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2)/ (n1 + n2 - 2)) * ((n1 + n2)/(n1 * n2))) dft <- n1 + n2 - 2 rvar <- max((sd1^2)/(sd2^2), (sd2^2)/(sd1^2)) } ## do not do a t-test if (n1 < minrep & n2 < minrep) { tt <- NA dft <- n1 + n2 - 2 rvar <- NA } ## do the one sample versus a constant t-test -- use the mean for ## the constant if available, otherwise use "small" if (n1 < minrep & n2 >= minrep) { if(is.na(x1)){ tt <- (x2 - small)/(sd2/sqrt(n2)) } else{ tt <- (x2 - x1)/(sd2/sqrt(n2)) } dft <- n2 - 1 rvar <- NA } if (n1 >= minrep & n2 < minrep) { if(is.na(x2)){ tt <- -(x1 - small)/(sd1/sqrt(n1)) } else{ tt <- -(x1 - x2)/(sd1/sqrt(n1)) } dft <- n1 - 1 rvar <- NA } return(list(t=tt, df=dft, rvar=rvar) ) } microarray.bayesian <- function (data, winsize=101, conf=10, minrep=3) { # experror is the experiment wide probability of a false positive # winsize control the degree of local averaging # conf is the weighting of the Baysian prior relative to the observed within gene variance # temp1 and 2 are the number of control and experimental observations cont <- attr(data,"controls") exp <- attr(data,"experiments") data[data == 0] <- NA ## temp1 and 2 cont.nsamples <- apply(data[,cont],1,function(x){NROW(which(!is.na(x)))}) exp.nsamples <- apply(data[,exp],1,function(x){NROW(which(!is.na(x)))}) ## temp3 and 4 cont.mean <- apply(data[,cont],1,function(x){mean(x,na.rm=TRUE)}) exp.mean <- apply(data[, exp],1,function(x){mean(x,na.rm=TRUE)}) ## temp5 and 6 cont.sd <- apply(data[,cont],1,function(x){sd(x,na.rm=TRUE)}) exp.sd <- apply(data[, exp],1,function(x){sd(x,na.rm=TRUE)}) ## temp7 and 8 cont.mean.log <- apply(data[,cont],1,function(x){mean(log(x),na.rm=TRUE)}) exp.mean.log <- apply(data[, exp],1,function(x){mean(log(x),na.rm=TRUE)}) # temp9 and 10 cont.sd.log <- apply(data[,cont],1,function(x){sd(log(x),na.rm=TRUE)}) exp.sd.log <- apply(data[, exp],1,function(x){sd(log(x),na.rm=TRUE)}) ## temp11 and 12 are the running average standard deviations where ## the running averages are based on "winsize" and the function ## runavg. In short all the data within a treatment are sorted by ## average expression level and the running average calculated on ## the estimated st. dev. for each locus. All the wierd code is a ## work around of missing values cont.sd.runavg <- runavg(cont.sd[order(cont.mean)])[order(order(cont.mean))] exp.sd.runavg <- runavg( exp.sd[order( exp.mean)])[order(order( exp.mean))] ## xxx <- temp5[!is.na(temp5)][order(temp3[!is.na(temp5)])] ## xxx <- runavg(xxx, winsize) ## xxx <- xxx[rank(temp3[!is.na(temp5)])] ## temp11 <- rep(NA, nrow(h)) ## temp11[!is.na(temp5)] <- xxx ## xxx <- temp6[!is.na(temp6)][order(temp4[!is.na(temp6)])] ## xxx <- runavg(xxx, winsize) ## xxx <- xxx[rank(temp4[!is.na(temp6)])] ## temp12 <- rep(NA, nrow(h)) ## temp12[!is.na(temp6)] <- xxx cont.sd.log.runavg <- runavg(cont.sd.log[order(cont.mean.log)])[order(order(cont.mean.log))] exp.sd.log.runavg <- runavg( exp.sd.log[order( exp.mean.log)])[order(order( exp.mean.log))] ## xxx <- temp9[!is.na(temp9)][order(temp7[!is.na(temp9)])] ## xxx <- runavg(xxx, winsize) ## xxx <- xxx[rank(temp7[!is.na(temp9)])] ## ## like 11 and 12 but for the log transofrmed data ## temp13 <- rep(NA, nrow(h)) ## temp13[!is.na(temp9)] <- xxx ## xxx <- temp10[!is.na(temp10)][order(temp8[!is.na(temp10)])] ## xxx <- runavg(xxx, winsize) ## xxx <- xxx[rank(temp8[!is.na(temp10)])] ## temp14 <- rep(NA, nrow(h)) ## temp14[!is.na(temp10)] <- xxx ## now calculate the weighted average (Baysian) estimate of the ## standard deviation cont.sd.bayes <- sqrt((conf * cont.sd.runavg^2 + (cont.nsamples - 1) * cont.sd^2)/ (conf + cont.nsamples - 1)) exp.sd.bayes <- sqrt((conf * exp.sd.runavg^2 + (exp.nsamples - 1) * exp.sd^2)/ (conf + exp.nsamples - 1)) cont.sd.log.bayes <- sqrt((conf * cont.sd.log.runavg^2 + (cont.nsamples - 1) * cont.sd.log^2)/ (conf + cont.nsamples - 1)) exp.sd.log.bayes <- sqrt((conf * exp.sd.log.runavg^2 + (exp.nsamples - 1) * exp.sd.log^2)/ (conf + exp.nsamples - 1)) ## temp11 <- sqrt((conf * temp11^2 + (temp1 - 1) * temp5^2)/ ## (conf + temp1 - 2)) ## temp12 <- sqrt((conf * temp12^2 + (temp2 - 1) * temp6^2)/ ## (conf + temp2 - 2)) ## temp13 <- sqrt((conf * temp13^2 + (temp1 - 1) * temp9^2)/(conf + ## temp1 - 2)) ## temp14 <- sqrt((conf * temp14^2 + (temp2 - 1) * temp10^2)/(conf + ## temp2 - 2) cont.cols <- 1:NCOL(data[,cont]) exp.cols <- cont.cols[length(cont.cols)]+1:NCOL(data[,exp]) result <- cbind(data[,cont],data[,exp], cont.nsamples, exp.nsamples, cont.mean, exp.mean, cont.sd, exp.sd, cont.mean.log, exp.mean.log, cont.sd.log, exp.sd.log, cont.sd.bayes,exp.sd.bayes, cont.sd.log.bayes,exp.sd.log.bayes ) colnames(result)[cont.cols] <- sapply(colnames(result)[cont.cols], function(x){ paste(sep="",collapse="", "cont.",x)}) colnames(result)[exp.cols] <- sapply(colnames(result)[exp.cols], function(x){ paste(sep="",collapse="", "exp.",x)}) colnames(result)[(NCOL(data[,cont|exp])+1):NCOL(result)] <- c("controls.num","experiments.num", "controls.mean","experiments.mean", "controls.sd","experiments.sd", "controls.mean.log","experiments.mean.log", "controls.sd.log","experiments.sd.log", "controls.sd.bayesian","experiments.sd.bayesian", "controls.sd.log.bayesian","experiments.sd.log.bayesian" ) ## calculate the "lower bound" on the limit to detection the lower ## bound is simply the 0.0025 quantile of the means threshold <- c(quantile(cont.mean, 0.0025,na.rm=TRUE), quantile(exp.mean, 0.0025,na.rm=TRUE) ) ## calculate statistics associated with the t-test for the raw data ## (temp1) and log transformed data (temp2). The call to pierre uses ## the st. dev.'s which incorporate the running average temp1 <- apply(result, 1, function(x) { temp <- tstat.general(n1=x["controls.num"], n2=x["experiments.num"], x1=x["controls.mean"], x2=x["experiments.mean"], sd1=x["controls.sd.bayesian"], sd2=x["experiments.sd.bayesian"], small=(exp(threshold[1]) + exp(threshold[2]))/2, minrep=minrep) return(c(temp$t,temp$df,temp$rvar)) } ) temp2 <- apply(result, 1, function(x) { temp <- tstat.general(n1=x["controls.num"], n2=x["experiments.num"], x1=x["controls.mean.log"], x2=x["experiments.mean.log"], sd1=x["controls.sd.log.bayesian"], sd2=x["experiments.sd.log.bayesian"], small=(exp(threshold[1]) + exp(threshold[2]))/2, minrep=minrep) return(c(temp$t,temp$df,temp$rvar)) } ) result <- cbind(result, t(temp1), t(temp2)) colnames(result)[(NCOL(result)-5):NCOL(result)] <- c("t.bayes", "df.bayes", "vr.bayes", "t.log.bayes", "df.log.bayes", "vr.log.bayes") rm(temp1, temp2) ## p-values associated with the two t-stats change d.f. to ## incorporate Bayesian Jan 10 / 2001 the calculations below are ## both regular t-tests with a Bayesian correction. The last term ## 'h[,end+16|19]' are degrees of freedom associated with the ## regular t-test plus the '2*conf - 2' which is the d.f.'s ## associated with the Bayes estimate ## p-raw p.bayes <- 2*pt(-abs(result[,"t.bayes"]), result[,"df.bayes"] + 2*conf - 2) ## p-log p.log.bayes <- 2*pt(-abs(result[,"t.log.bayes"]), result[,"df.log.bayes"] + 2*conf - 2) qvalue.p.bayes <- rep(NA,times=length(p.log.bayes)) temp.qvalue <- qvalue(p=na.omit(p.bayes)) if (class(temp.qvalue)=="qvalue") { qvalue.p.bayes[!is.na(p.bayes)] <- temp.qvalue$qvalues } qvalue.p.log.bayes <- rep(NA,times=length(p.log.bayes)) temp.qvalue <- qvalue(p=na.omit(p.log.bayes)) if (class(temp.qvalue)=="qvalue") { qvalue.p.log.bayes[!is.na(p.log.bayes)] <- temp.qvalue$qvalues } result <- cbind(result, p.bayes, p.log.bayes, qvalue.p.bayes, qvalue.p.log.bayes, p.adjust(p.bayes,method="BH"), p.adjust(p.log.bayes,method="BH") ) colnames(result)[(NCOL(result)-5):NCOL(result)] <- c("p.bayes","p.log.bayes", "fdr.q.bayes","fdr.q.log.bayes", "fdr.bh.bayes","fdr.bh.log.bayes" ) ## calculate Bonferroni threshold, also calculate fold expression ## (signed) print a file of all genes that pass the Bonferroni ## and print all file that gives the results for all genes ## we no longer bother to calculate a bonferoni correction; that's ## the job of things following to do ## Bonf <- 1 - exp(log(1 - experror)/length(h[!is.na(h[, end + ## 22]), end + 22])) fold.change <- exp.mean/cont.mean fold.change[fold.change<1] <- - 1 / fold.change[fold.change<1] ## temp1 <- -(h[, end + 3]/h[, end + 4]) * ## ((h[, end + 3]/h[,end + 4]) >= 1) + ## (h[, end + 4]/h[, end + 3]) * ## ((h[,end + 3]/h[, end + 4]) < 1) ## calculate fold change stuff ## thres <- (exp(threshold[1]) + exp(threshold[2]))/2 ## I'm not sure if this correction by threshold is actually ## necessary or a good idea. ## temp2 <- is.na(temp1) * is.na(h[,end + 3]) * (h[,end+4]/thres) ## temp3 <- is.na(temp1) * is.na(h[,end + 4]) * -(h[,end+3]/thres) ## ttemp <- cbind(temp1,temp2,temp3) ## rm(temp1,temp2,temp3) ## temp4 <- apply(ttemp,1,function(x) sum(x,na.rm=TRUE)) ## rm(ttemp) result <- cbind(result, fold.change) ## rm(temp4) colnames(result)[NCOL(result)] <- "fold.change" ## calculate the t-test statistic, p-value and degrees of freedom ## for raw data and log transformed data change the na.action to ## na.omit to allow to continue calculating past bad values temp.t <- t(apply(data, 1, function(x) { if ((NROW(na.omit(x[cont])) < 2) || (NROW(na.omit(x[exp])) < 2)){ return(c(NA,NA,NA)) } temp <- t.test(na.omit(x[cont]), na.omit(x[exp]) ) c(temp$p.value,temp$statistic,temp$parameter) })) temp.logt <- t(apply(data, 1, function(x) { if ((NROW(na.omit(x[cont])) < 2) || (NROW(na.omit(x[exp]))<2)){ return(c(NA,NA,NA)) } temp <- t.test(na.omit(log(x[cont])), na.omit(log(x[exp])) ) c(temp$p.value,temp$statistic,temp$parameter) })) result <- cbind(result,temp.t,temp.logt) colnames(result)[(NCOL(result)-5):NCOL(result)] <- c('t.test.p','t.test.t','t.test.df', 't.test.p.log','t.test.t.log','t.test.df.log') ## calculate q values for the normal t test results temp.t.test.q <- result[,"t.test.p"] temp.t.test.q[!is.finite(temp.t.test.q)] <- NA temp.qvalue <- qvalue(p=na.omit(temp.t.test.q)) if (class(temp.qvalue)=="qvalue") temp.t.test.q[!is.na(temp.t.test.q)] <- temp.qvalue$qvalues temp.t.test.q.log <- result[,"t.test.p.log"] temp.t.test.q.log[!is.finite(temp.t.test.q.log)] <- NA temp.qvalue <- qvalue(p=na.omit(temp.t.test.q.log)) if (class(temp.qvalue)=="qvalue") temp.t.test.q.log[!is.na(temp.t.test.q.log)] <- temp.qvalue$qvalues result <- cbind(result,temp.t.test.q, temp.t.test.q.log, p.adjust(result[,"t.test.p"],method="BH"), p.adjust(result[,"t.test.p.log"],method="BH")) colnames(result)[(NCOL(result)-3):NCOL(result)] <- c('t.test.fdr.q','t.test.fdr.q.log', 't.test.fdr.bh','t.test.fdr.bh.log') attr(result,"controls") <- cont.cols attr(result,"experiments") <- exp.cols return(result) } runavg <- function(x,k=1){ if (k < 0) stop("k must be greater than or equal to 0"); if (k==0) return(x) n <- length(x) r <- numeric(n) # one side of the window for (i in 1:n) { if(is.na(x[i])) { r[i] <- NA } else { j <- i - k if (j < 1) j <- 1 l <- i + k if (l > n) l <- n r[i] <- mean(x[j:l],na.rm=TRUE) } } return(r) }