R/bayesian_analysis.R

   1 #   Copyright (C) 1999 Anthony D. Long, Ecology & Evolutionary Biology
   2 #      University of California, Irvine (tdlong@uci.edu)
   3
   4 # Portions Copyright 2002,2003 Don Armstrong <don@donarmstrong.com>
   5
   6 # This program is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU General Public License
   8 # as published by the Free Software Foundation; either version 2
   9 # of the License, or (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15
  16 # $Id: bayesian_analysis.R,v 1.5 2006/07/12 03:27:03 don Exp $
  17
  18 pierre.pair.from.unpaired <- function(h,cs,ce,es,ee,end,
  19                                       experror=0.05,winsize=101,conf=10,minrep=3,file="pierre.pair",
  20                                       file.save=TRUE,returnh=FALSE,qvalues=TRUE)
  21 {
  22   # Use the difference between experimental and control as the test statistic.
  23   new.h <- h[,es:ee]-h[,cs:ce]
  24   if (cs > 1){
  25     new.h <- cbind(h[,1:cs-1],new.h)
  26   }
  27   totalexpresscol <- apply(h[,c(cs:ce,es:ee)],1,function(x) {mean(log(x[cs:ce]),log(x[es:ee]))});
  28   new.h <- cbind(new.h,totalexpresscol)
  29   if (end > ee){
  30     new.h <- cbind(new.h,h[,(ee+1):end])
  31   }
  32   newer.h <- pierre.pair(new.h,cs,ce,(end-(ee-es)),ce+1,file=file,experror=experror,
  33                          winsize=winsize,conf=conf,minrep=minrep,file.save=file.save,returnh=T)
  34   if (returnh){
  35     return(newer.h)
  36   }
  37 }
  38
  39 pierre.pair <-  function (h, cs, ce, end, totalexpresscol,
  40                           experror=0.05, winsize=101, conf=10, minrep=3, file="pierre.pair",
  41                           file.save=TRUE,returnh=FALSE,qvalues=TRUE)
  42 {
  43 # note 'totalexpresscol' added April 18th 2001:
  44 # this should be the column number of the column in "h[1:N]" that represents
  45 # total expression for that gene.
  46 # A good value for 'totalexpresscol' would be the mean of the log of the
  47 # "total expression level" over both treatments (i.e., control and
  48 # experimentals) and replicates, where the total expression level for each
  49 # gene/treatment/replicate is given as a fraction of total expression over
  50 # all genes for that treatment/replicate.
  51
  52 # number of non-zero samples, their mean and st.dev., index_col, running average sd.
  53
  54      S.N <- apply(h[, cs:ce], 1, function(x) sum(x !=  0 & !is.na(x)))
  55      S.mean <- apply(h[, cs:ce], 1, function(x) if (sum(x[x !=
  56          0 & !is.na(x)] != 0))
  57          mean(x[x != 0 & !is.na(x)])
  58      else NA)
  59      S.sd <- apply(h[, cs:ce], 1, function(x) if (sum(x[x !=
  60          0 & !is.na(x)]  != 0) >= minrep)
  61          sqrt(var(x[x != 0 & !is.na(x)]))
  62      else NA)
  63      index.col <- h[,totalexpresscol]
  64
  65 #   running average standard deviation
  66      xxx <- S.sd[!is.na(S.sd)][order(index.col[!is.na(S.sd)])]
  67      xxx <- runavg(xxx, winsize)
  68      xxx <- xxx[rank(index.col[!is.na(S.sd)])]
  69      S.rasd <- rep(NA, nrow(h))
  70      S.rasd[!is.na(S.sd)] <- xxx
  71
  72      h <- cbind(h, S.N, S.mean, S.sd, S.rasd)
  73      colnames(h)[(end + 1):(end + 4)] <- c("N", "x", "sd", "rasd")
  74      rm(S.N, S.mean, S.sd, S.rasd)
  75
  76 #   regular t-test, Bayesian variance, Bayesian t-test, p-reg, p-Bayes
  77      # Regular t
  78      temp1 <- sqrt(h[end + 1]) * (h[end + 2]/h[end + 3])
  79      # Baysian variance
  80      temp2 <- sqrt((conf * (h[end + 4])^2 + (h[end + 1] - 1) * (h[end + 3])^2)/(conf + h[end + 1] - 2))
  81      # Baysian t-test
  82      temp3 <-  sqrt(h[end + 1]) * (h[end + 2]/temp2)
  83 #  modify d.f. to reflect effect of conf  Jan. 10 / 2001
  84      # calc regular p  (temp4 is the same paired t-test as in doitall.pair)
  85      temp4 <- 1 - pf((temp1)^2, 1, h[, end + 1] - 1)
  86      # and Bayesian-p.  temp5 is a _paired_ t-test with a Bayesian correction.
  87      # The degrees of freedom are  h[,end+1] - 1 + conf - 1.  Since it is a paired
  88      # t-test the degrees of  freedom are fewer than non-paired approaches!!
  89      temp5 <- 1 - pf((temp3)^2, 1, h[, end + 1] + conf - 2)
  90      h <- cbind(h, temp1, temp2, temp3, temp4, temp5)
  91      rm(temp1, temp2, temp3, temp4, temp5)
  92      colnames(h)[(end + 5):(end + 9)] <- c("reg-t", "Bay-sd", "Bay-t", "reg-p", "Bay-p")
  93
  94      if (returnh & ! file.save){
  95        return(h)
  96      }
  97      write.table(h, file = paste(file,"_from_R.txt",sep=""), sep = "\t")
  98      png(file = paste(file,"_%03d.png",sep=""), width = 1600, height = 1200)
  99      plot(h[, end + 2], h[, end + 9], xlab = "difference in expression",
 100           ylab = "p-value", main = "Does difference predict significance?",
 101           abline(lm(h[, end + 9] ~ h[, end + 2])))
 102      graphics.off()
 103      if (returnh){
 104        return(h)
 105      }
 106 }
 107
 108 tstat.general <- function (n1, n2, x1, x2, sd1, sd2,
 109                            small=0, minrep=2)
 110 {
 111      ##  calculates and returns values associated with a t-test of the
 112      ## difference between two means
 113      ##  x is the dataframe
 114      ##  nn1 and nn2 are the number of control and experimental observations respectively
 115      ##  xx1 and xx2 are the means of the C's and E's
 116      ##  vv1 and vv2 are the standard deviations
 117      ##  small is the smallest detectable "signal"
 118      ##  minrep defines the smallest number of replicates required to do a two sample t-test
 119      ##   this must be AT LEAST TWO otherwise the variance will not be claculated
 120      ##  depending on the number of repllicate observations different t-tests are carried out
 121
 122   ##  do the two sample t-test
 123   if (n1 >= minrep & n2 >= minrep) {
 124     tt <- -(x1 - x2)/
 125       sqrt((((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2)/
 126             (n1 + n2 - 2))
 127            * ((n1 + n2)/(n1 * n2)))
 128     dft <- n1 + n2 - 2
 129     rvar <- max((sd1^2)/(sd2^2), (sd2^2)/(sd1^2))
 130   }
 131
 132   ##  do not do a t-test
 133   if (n1 < minrep & n2 < minrep) {
 134     tt <- NA
 135     dft <- n1 + n2 - 2
 136     rvar <- NA
 137   }
 138
 139   ##  do the one sample versus a constant t-test -- use the mean for
 140   ##  the constant if available, otherwise use "small"
 141   if (n1 < minrep & n2 >= minrep) {
 142     if(is.na(x1)){
 143       tt <- (x2 - small)/(sd2/sqrt(n2))
 144     }
 145     else{
 146       tt <- (x2 - x1)/(sd2/sqrt(n2))
 147     }
 148     dft <- n2 - 1
 149     rvar <- NA
 150   }
 151   if (n1 >= minrep & n2 < minrep) {
 152     if(is.na(x2)){
 153       tt <- -(x1 - small)/(sd1/sqrt(n1))
 154     }
 155     else{
 156       tt <- -(x1 - x2)/(sd1/sqrt(n1))
 157     }
 158     dft <- n1 - 1
 159     rvar <- NA
 160   }
 161   return(list(t=tt,
 162               df=dft,
 163               rvar=rvar)
 164          )
 165 }
 166
 167
 168 microarray.bayesian <-
 169   function (data,
 170             winsize=101, conf=10,
 171             minrep=3) {
 172      # experror is the experiment wide probability of a false positive
 173      # winsize control the degree of local averaging
 174      # conf is the weighting of the Baysian prior relative to the observed within gene variance
 175      # temp1 and 2 are the number of control and experimental observations
 176   cont <- attr(data,"controls")
 177   exp  <- attr(data,"experiments")
 178   data[data == 0] <- NA
 179   ## temp1 and 2
 180   cont.nsamples <- apply(data[,cont],1,function(x){NROW(which(!is.na(x)))})
 181   exp.nsamples <- apply(data[,exp],1,function(x){NROW(which(!is.na(x)))})
 182   ## temp3 and 4
 183   cont.mean <- apply(data[,cont],1,function(x){mean(x,na.rm=TRUE)})
 184   exp.mean <-  apply(data[, exp],1,function(x){mean(x,na.rm=TRUE)})
 185   ## temp5 and 6
 186   cont.sd <- apply(data[,cont],1,function(x){sd(x,na.rm=TRUE)})
 187   exp.sd  <- apply(data[, exp],1,function(x){sd(x,na.rm=TRUE)})
 188   ## temp7 and 8
 189   cont.mean.log <- apply(data[,cont],1,function(x){mean(log(x),na.rm=TRUE)})
 190   exp.mean.log  <- apply(data[, exp],1,function(x){mean(log(x),na.rm=TRUE)})
 191   # temp9 and 10
 192   cont.sd.log <- apply(data[,cont],1,function(x){sd(log(x),na.rm=TRUE)})
 193   exp.sd.log  <- apply(data[, exp],1,function(x){sd(log(x),na.rm=TRUE)})
 194   ## temp11 and 12 are the running average standard deviations where
 195   ## the running averages are based on "winsize" and the function
 196   ## runavg. In short all the data within a treatment are sorted by
 197   ## average expression level and the running average calculated on
 198   ## the estimated st. dev. for each locus. All the wierd code is a
 199   ## work around of missing values
 200   cont.sd.runavg <- runavg(cont.sd[order(cont.mean)])[order(order(cont.mean))]
 201    exp.sd.runavg <- runavg( exp.sd[order( exp.mean)])[order(order( exp.mean))]
 202   ## xxx <- temp5[!is.na(temp5)][order(temp3[!is.na(temp5)])]
 203   ## xxx <- runavg(xxx, winsize)
 204   ## xxx <- xxx[rank(temp3[!is.na(temp5)])]
 205   ## temp11 <- rep(NA, nrow(h))
 206   ## temp11[!is.na(temp5)] <- xxx
 207   ## xxx <- temp6[!is.na(temp6)][order(temp4[!is.na(temp6)])]
 208   ## xxx <- runavg(xxx, winsize)
 209   ## xxx <- xxx[rank(temp4[!is.na(temp6)])]
 210   ## temp12 <- rep(NA, nrow(h))
 211   ## temp12[!is.na(temp6)] <- xxx
 212   cont.sd.log.runavg <- runavg(cont.sd.log[order(cont.mean.log)])[order(order(cont.mean.log))]
 213    exp.sd.log.runavg <- runavg( exp.sd.log[order( exp.mean.log)])[order(order( exp.mean.log))]
 214   ## xxx <- temp9[!is.na(temp9)][order(temp7[!is.na(temp9)])]
 215   ## xxx <- runavg(xxx, winsize)
 216   ## xxx <- xxx[rank(temp7[!is.na(temp9)])]
 217   ## ##   like 11 and 12 but for the log transofrmed data
 218   ## temp13 <- rep(NA, nrow(h))
 219   ## temp13[!is.na(temp9)] <- xxx
 220   ## xxx <- temp10[!is.na(temp10)][order(temp8[!is.na(temp10)])]
 221   ## xxx <- runavg(xxx, winsize)
 222   ## xxx <- xxx[rank(temp8[!is.na(temp10)])]
 223   ## temp14 <- rep(NA, nrow(h))
 224   ## temp14[!is.na(temp10)] <- xxx
 225
 226   ## now calculate the weighted average (Baysian) estimate of the
 227   ## standard deviation
 228   cont.sd.bayes <- sqrt((conf * cont.sd.runavg^2 +
 229                         (cont.nsamples - 1) * cont.sd^2)/
 230                           (conf + cont.nsamples - 1))
 231   exp.sd.bayes <- sqrt((conf * exp.sd.runavg^2 +
 232                        (exp.nsamples - 1) * exp.sd^2)/
 233                          (conf + exp.nsamples - 1))
 234   cont.sd.log.bayes <- sqrt((conf * cont.sd.log.runavg^2 +
 235                         (cont.nsamples - 1) * cont.sd.log^2)/
 236                           (conf + cont.nsamples - 1))
 237   exp.sd.log.bayes <- sqrt((conf * exp.sd.log.runavg^2 +
 238                        (exp.nsamples - 1) * exp.sd.log^2)/
 239                          (conf + exp.nsamples - 1))
 240   ## temp11 <- sqrt((conf * temp11^2 + (temp1 - 1) * temp5^2)/
 241   ##                (conf + temp1 - 2))
 242   ## temp12 <- sqrt((conf * temp12^2 + (temp2 - 1) * temp6^2)/
 243   ##                (conf + temp2 - 2))
 244   ## temp13 <- sqrt((conf * temp13^2 + (temp1 - 1) * temp9^2)/(conf +
 245   ##                                                          temp1 - 2))
 246   ## temp14 <- sqrt((conf * temp14^2 + (temp2 - 1) * temp10^2)/(conf +
 247   ##                                                            temp2 - 2)
 248   cont.cols <- 1:NCOL(data[,cont])
 249   exp.cols <- cont.cols[length(cont.cols)]+1:NCOL(data[,exp])
 250   result <- cbind(data[,cont],data[,exp],
 251                   cont.nsamples, exp.nsamples,
 252                   cont.mean, exp.mean,
 253                   cont.sd, exp.sd,
 254                   cont.mean.log, exp.mean.log,
 255                   cont.sd.log, exp.sd.log,
 256                   cont.sd.bayes,exp.sd.bayes,
 257                   cont.sd.log.bayes,exp.sd.log.bayes
 258                   )
 259   colnames(result)[cont.cols] <-
 260     sapply(colnames(result)[cont.cols],
 261            function(x){
 262              paste(sep="",collapse="",
 263                    "cont.",x)})
 264   colnames(result)[exp.cols] <-
 265     sapply(colnames(result)[exp.cols],
 266            function(x){
 267              paste(sep="",collapse="",
 268                    "exp.",x)})
 269   colnames(result)[(NCOL(data[,cont|exp])+1):NCOL(result)] <-
 270     c("controls.num","experiments.num",
 271       "controls.mean","experiments.mean",
 272       "controls.sd","experiments.sd",
 273       "controls.mean.log","experiments.mean.log",
 274       "controls.sd.log","experiments.sd.log",
 275       "controls.sd.bayesian","experiments.sd.bayesian",
 276       "controls.sd.log.bayesian","experiments.sd.log.bayesian"
 277       )
 278   ## calculate the "lower bound" on the limit to detection the lower
 279   ## bound is simply the 0.0025 quantile of the means
 280   threshold <- c(quantile(cont.mean, 0.0025,na.rm=TRUE),
 281                  quantile(exp.mean, 0.0025,na.rm=TRUE)
 282                  )
 283   ## calculate statistics associated with the t-test for the raw data
 284   ## (temp1) and log transformed data (temp2). The call to pierre uses
 285   ## the st. dev.'s which incorporate the running average
 286   temp1 <- apply(result, 1, function(x)
 287                  {
 288                    temp <- tstat.general(n1=x["controls.num"],  n2=x["experiments.num"],
 289                                          x1=x["controls.mean"], x2=x["experiments.mean"],
 290                                          sd1=x["controls.sd.bayesian"], sd2=x["experiments.sd.bayesian"],
 291                                          small=(exp(threshold[1]) + exp(threshold[2]))/2,
 292                                          minrep=minrep)
 293                    return(c(temp$t,temp$df,temp$rvar))
 294                  }
 295                  )
 296   temp2 <- apply(result, 1, function(x)
 297                  {
 298                    temp <- tstat.general(n1=x["controls.num"],  n2=x["experiments.num"],
 299                                          x1=x["controls.mean.log"], x2=x["experiments.mean.log"],
 300                                          sd1=x["controls.sd.log.bayesian"], sd2=x["experiments.sd.log.bayesian"],
 301                                          small=(exp(threshold[1]) + exp(threshold[2]))/2,
 302                                          minrep=minrep)
 303                    return(c(temp$t,temp$df,temp$rvar))
 304                  }
 305                  )
 306   result <- cbind(result, t(temp1), t(temp2))
 307   colnames(result)[(NCOL(result)-5):NCOL(result)] <-
 308     c("t.bayes", "df.bayes", "vr.bayes",
 309       "t.log.bayes", "df.log.bayes", "vr.log.bayes")
 310   rm(temp1, temp2)
 311
 312   ##   p-values associated with the two t-stats change d.f. to
 313   ##   incorporate Bayesian Jan 10 / 2001 the calculations below are
 314   ##   both regular t-tests with a Bayesian correction. The last term
 315   ##   'h[,end+16|19]' are degrees of freedom associated with the
 316   ##   regular t-test plus the '2*conf - 2' which is the d.f.'s
 317   ##   associated with the Bayes estimate
 318
 319   ## p-raw
 320   p.bayes <- 2*pt(-abs(result[,"t.bayes"]), result[,"df.bayes"] + 2*conf - 2)
 321   ## p-log
 322   p.log.bayes <- 2*pt(-abs(result[,"t.log.bayes"]), result[,"df.log.bayes"] + 2*conf - 2)
 323   qvalue.p.bayes <- rep(NA,times=length(p.log.bayes))
 324   temp.qvalue <- qvalue(p=na.omit(p.bayes))
 325   if (class(temp.qvalue)=="qvalue") {
 326     qvalue.p.bayes[!is.na(p.bayes)] <- temp.qvalue$qvalues
 327   }
 328   qvalue.p.log.bayes <- rep(NA,times=length(p.log.bayes))
 329   temp.qvalue <- qvalue(p=na.omit(p.log.bayes))
 330   if (class(temp.qvalue)=="qvalue") {
 331     qvalue.p.log.bayes[!is.na(p.log.bayes)] <- temp.qvalue$qvalues
 332   }
 333   result <- cbind(result,
 334                   p.bayes,
 335                   p.log.bayes,
 336                   qvalue.p.bayes,
 337                   qvalue.p.log.bayes,
 338                   p.adjust(p.bayes,method="BH"),
 339                   p.adjust(p.log.bayes,method="BH")
 340                   )
 341   colnames(result)[(NCOL(result)-5):NCOL(result)] <-
 342     c("p.bayes","p.log.bayes",
 343       "fdr.q.bayes","fdr.q.log.bayes",
 344       "fdr.bh.bayes","fdr.bh.log.bayes"
 345       )
 346
 347   ## calculate Bonferroni threshold, also calculate fold expression
 348   ## (signed) print a file of all genes that pass the Bonferroni
 349   ## and print all file that gives the results for all genes
 350
 351   ## we no longer bother to calculate a bonferoni correction; that's
 352   ## the job of things following to do
 353
 354   ## Bonf <- 1 - exp(log(1 - experror)/length(h[!is.na(h[, end +
 355   ##                                                     22]), end + 22]))
 356
 357   fold.change <- exp.mean/cont.mean
 358   fold.change[fold.change<1] <- - 1 / fold.change[fold.change<1]
 359   ## temp1 <- -(h[, end + 3]/h[, end + 4]) *
 360   ##   ((h[, end + 3]/h[,end + 4]) >= 1) +
 361   ##     (h[, end + 4]/h[, end + 3]) *
 362   ##       ((h[,end + 3]/h[, end + 4]) < 1)
 363
 364   ##  calculate fold change stuff
 365   ## thres <- (exp(threshold[1]) + exp(threshold[2]))/2
 366   ## I'm not sure if this correction by threshold is actually
 367   ## necessary or a good idea.
 368   ## temp2 <- is.na(temp1) *  is.na(h[,end + 3]) * (h[,end+4]/thres)
 369   ## temp3 <- is.na(temp1) *  is.na(h[,end + 4]) * -(h[,end+3]/thres)
 370   ## ttemp <- cbind(temp1,temp2,temp3)
 371   ## rm(temp1,temp2,temp3)
 372   ## temp4 <- apply(ttemp,1,function(x) sum(x,na.rm=TRUE))
 373   ## rm(ttemp)
 374   result <- cbind(result, fold.change)
 375   ## rm(temp4)
 376   colnames(result)[NCOL(result)] <- "fold.change"
 377
 378   ## calculate the t-test statistic, p-value and degrees of freedom
 379   ## for raw data and log transformed data change the na.action to
 380   ## na.omit to allow to continue calculating past bad values
 381
 382   temp.t <- t(apply(data, 1,
 383                     function(x) {
 384                       if ((NROW(na.omit(x[cont])) < 2) ||  (NROW(na.omit(x[exp])) < 2)){
 385                         return(c(NA,NA,NA))
 386                       }
 387                       temp <- t.test(na.omit(x[cont]),
 388                                      na.omit(x[exp])
 389                                      )
 390                       c(temp$p.value,temp$statistic,temp$parameter)
 391                     }))
 392   temp.logt <- t(apply(data, 1,
 393                        function(x) {
 394                          if ((NROW(na.omit(x[cont])) < 2) ||  (NROW(na.omit(x[exp]))<2)){
 395                            return(c(NA,NA,NA))
 396                          }
 397                          temp <- t.test(na.omit(log(x[cont])),
 398                                         na.omit(log(x[exp]))
 399                                         )
 400                          c(temp$p.value,temp$statistic,temp$parameter)
 401                        }))
 402   result <- cbind(result,temp.t,temp.logt)
 403   colnames(result)[(NCOL(result)-5):NCOL(result)] <- c('t.test.p','t.test.t','t.test.df',
 404                                                        't.test.p.log','t.test.t.log','t.test.df.log')
 405
 406   ## calculate q values for the normal t test results
 407   temp.t.test.q <- result[,"t.test.p"]
 408   temp.t.test.q[!is.finite(temp.t.test.q)] <- NA
 409   temp.qvalue <- qvalue(p=na.omit(temp.t.test.q))
 410   if (class(temp.qvalue)=="qvalue")
 411     temp.t.test.q[!is.na(temp.t.test.q)] <-  temp.qvalue$qvalues
 412   temp.t.test.q.log <- result[,"t.test.p.log"]
 413   temp.t.test.q.log[!is.finite(temp.t.test.q.log)] <- NA
 414   temp.qvalue <- qvalue(p=na.omit(temp.t.test.q.log))
 415   if (class(temp.qvalue)=="qvalue")
 416     temp.t.test.q.log[!is.na(temp.t.test.q.log)] <- temp.qvalue$qvalues
 417   result <- cbind(result,temp.t.test.q,
 418                   temp.t.test.q.log,
 419                   p.adjust(result[,"t.test.p"],method="BH"),
 420                   p.adjust(result[,"t.test.p.log"],method="BH"))
 421   colnames(result)[(NCOL(result)-3):NCOL(result)] <-
 422     c('t.test.fdr.q','t.test.fdr.q.log',
 423       't.test.fdr.bh','t.test.fdr.bh.log')
 424   attr(result,"controls") <- cont.cols
 425   attr(result,"experiments") <- exp.cols
 426
 427   return(result)
 428 }
 429
 430
 431 runavg <- function(x,k=1){
 432   if (k < 0)
 433     stop("k must be greater than or equal to 0");
 434   if (k==0)
 435     return(x)
 436   n <- length(x)
 437   r <- numeric(n)
 438   # one side of the window
 439   for (i in 1:n) {
 440     if(is.na(x[i])) {
 441       r[i] <- NA
 442     } else {
 443       j <- i - k
 444       if (j < 1)
 445         j <- 1
 446       l <- i + k
 447       if (l > n)
 448         l <- n
 449       r[i] <- mean(x[j:l],na.rm=TRUE)
 450     }
 451   }
 452   return(r)
 453 }