R/loess_normalization_reference.R

   1 library("stats")
   2 library("parallel")
   3 library("impute")
   4
   5 ##' Normalize using MvA loess using multiple processors
   6 ##'
   7 ##' For analyses where the number of samples makes all-against-all
   8 ##' normalization prohibitive, this variant subselects a set of
   9 ##' comparisons and runs them, iterating a specific number of times
  10 ##' using as many cores as possible.
  11 ##' @title loess_normalization_reference
  12 ##' @param expressions Gene expression values with probes in rows and samples in columns
  13 ##' @param reference Names of rownames of reference genes
  14 ##' @param iterations Number of iterations to run (Default: 2)
  15 ##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1).
  16 ##' @param sample.size Number of combinations of samples to run (Default: 200).
  17 ##' @param num.cores Number of cores to use (Default: half of them)
  18 ##' @param imputation.method Method to impute missing values. String of "mean" (missing values are the mean of the row) or "knn" (use impute.knn to impute missing values) (Default: "mean").
  19 ##' @param offset Offset for expression (Default: 0; set to 1 if sample contains zeros)
  20 ##' @return data.frame of normalized expression values
  21 ##' @author Don Armstrong
  22 loess_normalization_reference <- function(expressions,reference=NA,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(parallel::detectCores()/2),1),imputation.method="mean",offset=0){
  23   n <- ncol(expressions)
  24   ## replace NA with the mean
  25   expressions.na <- is.na(expressions)
  26   expressions.na.which <- which(expressions.na,arr.ind=TRUE)
  27   expression.zero <- sum(expressions==0,na.rm=TRUE)
  28   if (expression.zero && offset == 0) {
  29       message("Sample contains zeros, setting offset to 1")
  30       offset <- 1
  31   }
  32   expressions <- expressions+offset
  33   pb <- txtProgressBar(0,iterations,style=3)
  34   if (is.na(reference)) {
  35       reference <- rownames(expressions)
  36   }
  37   reference.rows <- rownames(expressions) %in% reference
  38   for (q in 1:iterations) {
  39       if (nrow(expressions.na.which)>0) {
  40           expressions[expressions.na.which] <- NA
  41           if (imputation.method=="mean") {
  42               expressions[unique(expressions.na.which[,"row"]),] <-
  43                   apply(expressions[unique(expressions.na.which[,"row"]),],
  44                         1,
  45                         function(x){
  46                             if (all(is.na(x))) {
  47                                 x[] <- small.positive;
  48                             } else {
  49                                 x[is.na(x)] <- mean(x,na.rm=TRUE);
  50                             }
  51                             x})
  52           } else if (imputation.method=="knn") {
  53               expressions <- impute.knn(expressions)$data
  54           } else {
  55               stop("Unknown imputation.method; currently supported methods are 'mean' and 'knn'")
  56           }
  57
  58       }
  59       expressions.comb <-
  60           combn(ncol(expressions),2)
  61       expressions.comb.sample <- NULL
  62       if (sample.size >= ncol(expressions.comb)) {
  63           expressions.comb.sample <-
  64               expressions.comb
  65       } else {
  66           expressions.comb.sample <-
  67               expressions.comb[,sample.int(n=ncol(expressions.comb),
  68                                           size=sample.size,
  69                                            replace=FALSE)]
  70       }
  71       ### calculate the n for each sample
  72       expressions.comb.sample.n <-
  73           apply(sapply(1:ncol(expressions),
  74                        `==`,expressions.comb.sample),
  75                 2,sum)
  76       result.expressions <- array(1,dim=dim(expressions))
  77       m <- 1
  78       ## do this in batches of no more than 200 to avoid making a list
  79       ## which is way too large to fit in memory
  80       while (m <= ncol(expressions.comb.sample)) {
  81           e.c.subsample <-
  82               expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))]
  83           bar.m.hat.list <-
  84               parallel::mclapply(1:ncol(e.c.subsample),
  85                        function(k) {
  86                            i <- e.c.subsample[1,k]
  87                            j <- e.c.subsample[2,k]
  88                            mva <- data.frame(m=log2((expressions[,i])/
  89                                                     (expressions[,j])),
  90                                              a=0.5*log2((expressions[,i])
  91                                                         *(expressions[,j])),
  92                                              reference=reference.rows
  93                                              )
  94                            ## log2((i+1)/(j+1))~0.5*log2((i+1)*(j+1))
  95                            temp.loess <-
  96                                loess(m~a,mva[mva$reference,],
  97                                      control=loess.control(surface="interpolate",
  98                                                            statistics="approximate",
  99                                                            trace.hat="approximate",
 100                                                            iterations=2))
 101                            ## m should be zero, so we want to adjust
 102                            ## the ratio of i/j by the inverse of m.hat
 103
 104                            ## log2(i/j) - 0.5*m.hat = 0
 105
 106                            ## log2(i/j)=0.5*m.hat
 107
 108                            ## i/j=2^(0.5*m.hat)
 109
 110                            ## i = 2^(0.5*m.hat)*j
 111                            m.hat <- predict(temp.loess,mva[,"a"])
 112                            ## if for some reason we cannot predict, assume 0.
 113                            m.hat[is.na(m.hat)] <- 0
 114                            return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]),
 115                                              j=2^(0.5*m.hat/expressions.comb.sample.n[j])))
 116                        },
 117                        mc.cores=num.cores
 118                        )
 119           for (k in 1:ncol(e.c.subsample)) {
 120               result.expressions[,e.c.subsample[1,k]] <-
 121                   result.expressions[,e.c.subsample[1,k]] *
 122                       bar.m.hat.list[[k]][,"i"]
 123               result.expressions[,e.c.subsample[2,k]] <-
 124                   result.expressions[,e.c.subsample[2,k]] *
 125                       bar.m.hat.list[[k]][,"j"]
 126           }
 127           m <- min(m+200,ncol(expressions.comb.sample))+1
 128           setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1))
 129       }
 130       setTxtProgressBar(pb,q)
 131       ## correct expressionss
 132       expressions <- expressions*result.expressions
 133   }
 134   if (nrow(expressions.na.which)>0) {
 135       expressions[expressions.na.which] <- NA
 136   }
 137   expressions <- expressions-offset
 138   close(pb)
 139   return(expressions)
 140 }