add reference-based loess normalization

author Don Armstrong <don@donarmstrong.com>

Fri, 7 Apr 2017 19:04:47 +0000 (12:04 -0700)

committer Don Armstrong <don@donarmstrong.com>

Fri, 7 Apr 2017 19:04:47 +0000 (12:04 -0700)
author Don Armstrong <don@donarmstrong.com>
Fri, 7 Apr 2017 19:04:47 +0000 (12:04 -0700)
committer Don Armstrong <don@donarmstrong.com>
Fri, 7 Apr 2017 19:04:47 +0000 (12:04 -0700)
diff --git a/R/loess_normalization_reference.R b/R/loess_normalization_reference.R

new file mode 100644 (file)

index 0000000..c2ac448
--- /dev/null
+++ b/R/loess_normalization_reference.R
@@ -0,0 +1,140 @@
+library("stats")
+library("parallel")
+library("impute")
+
+##' Normalize using MvA loess using multiple processors
+##'
+##' For analyses where the number of samples makes all-against-all
+##' normalization prohibitive, this variant subselects a set of
+##' comparisons and runs them, iterating a specific number of times
+##' using as many cores as possible.
+##' @title loess_normalization_reference
+##' @param expressions Gene expression values with probes in rows and samples in columns
+##' @param reference Names of rownames of reference genes
+##' @param iterations Number of iterations to run (Default: 2)
+##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1).
+##' @param sample.size Number of combinations of samples to run (Default: 200).
+##' @param num.cores Number of cores to use (Default: half of them)
+##' @param imputation.method Method to impute missing values. String of "mean" (missing values are the mean of the row) or "knn" (use impute.knn to impute missing values) (Default: "mean").
+##' @param offset Offset for expression (Default: 0; set to 1 if sample contains zeros)
+##' @return data.frame of normalized expression values
+##' @author Don Armstrong
+loess_normalization_reference <- function(expressions,reference=NA,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(parallel::detectCores()/2),1),imputation.method="mean",offset=0){
+  n <- ncol(expressions)
+  ## replace NA with the mean
+  expressions.na <- is.na(expressions)
+  expressions.na.which <- which(expressions.na,arr.ind=TRUE)
+  expression.zero <- sum(expressions==0,na.rm=TRUE)
+  if (expression.zero && offset == 0) {
+      message("Sample contains zeros, setting offset to 1")
+      offset <- 1
+  }
+  expressions <- expressions+offset
+  pb <- txtProgressBar(0,iterations,style=3)
+  if (is.na(reference)) {
+      reference <- rownames(expressions)
+  }
+  reference.rows <- rownames(expressions) %in% reference
+  for (q in 1:iterations) {
+      if (nrow(expressions.na.which)>0) {
+          expressions[expressions.na.which] <- NA
+          if (imputation.method=="mean") {
+              expressions[unique(expressions.na.which[,"row"]),] <-
+                  apply(expressions[unique(expressions.na.which[,"row"]),],
+                        1,
+                        function(x){
+                            if (all(is.na(x))) {
+                                x[] <- small.positive;
+                            } else {
+                                x[is.na(x)] <- mean(x,na.rm=TRUE);
+                            }
+                            x})
+          } else if (imputation.method=="knn") {
+              expressions <- impute.knn(expressions)$data
+          } else {
+              stop("Unknown imputation.method; currently supported methods are 'mean' and 'knn'")
+          }
+          
+      }
+      expressions.comb <-
+          combn(ncol(expressions),2)
+      expressions.comb.sample <- NULL
+      if (sample.size >= ncol(expressions.comb)) {
+          expressions.comb.sample <-
+              expressions.comb
+      } else {
+          expressions.comb.sample <-
+              expressions.comb[,sample.int(n=ncol(expressions.comb),
+                                          size=sample.size,
+                                           replace=FALSE)]
+      }
+      ### calculate the n for each sample
+      expressions.comb.sample.n <-
+          apply(sapply(1:ncol(expressions),
+                       `==`,expressions.comb.sample),
+                2,sum)
+      result.expressions <- array(1,dim=dim(expressions))
+      m <- 1
+      ## do this in batches of no more than 200 to avoid making a list
+      ## which is way too large to fit in memory
+      while (m <= ncol(expressions.comb.sample)) {
+          e.c.subsample <-
+              expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))]
+          bar.m.hat.list <-
+              parallel::mclapply(1:ncol(e.c.subsample),
+                       function(k) {
+                           i <- e.c.subsample[1,k]
+                           j <- e.c.subsample[2,k]
+                           mva <- data.frame(m=log2((expressions[,i])/
+                                                    (expressions[,j])),
+                                             a=0.5*log2((expressions[,i])
+                                                        *(expressions[,j])),
+                                             reference=reference.rows
+                                             )
+                           ## log2((i+1)/(j+1))~0.5*log2((i+1)*(j+1))
+                           temp.loess <-
+                               loess(m~a,mva[mva$reference,],
+                                     control=loess.control(surface="interpolate",
+                                                           statistics="approximate",
+                                                           trace.hat="approximate",
+                                                           iterations=2))
+                           ## m should be zero, so we want to adjust
+                           ## the ratio of i/j by the inverse of m.hat
+
+                           ## log2(i/j) - 0.5*m.hat = 0
+
+                           ## log2(i/j)=0.5*m.hat
+
+                           ## i/j=2^(0.5*m.hat)
+
+                           ## i = 2^(0.5*m.hat)*j
+                           m.hat <- predict(temp.loess,mva[,"a"])
+                           ## if for some reason we cannot predict, assume 0.
+                           m.hat[is.na(m.hat)] <- 0
+                           return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]),
+                                             j=2^(0.5*m.hat/expressions.comb.sample.n[j])))
+                       },
+                       mc.cores=num.cores
+                       )
+          for (k in 1:ncol(e.c.subsample)) {
+              result.expressions[,e.c.subsample[1,k]] <-
+                  result.expressions[,e.c.subsample[1,k]] *
+                      bar.m.hat.list[[k]][,"i"]
+              result.expressions[,e.c.subsample[2,k]] <-
+                  result.expressions[,e.c.subsample[2,k]] *
+                      bar.m.hat.list[[k]][,"j"]
+          }
+          m <- min(m+200,ncol(expressions.comb.sample))+1
+          setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1))
+      }
+      setTxtProgressBar(pb,q)
+      ## correct expressionss
+      expressions <- expressions*result.expressions
+  }
+  if (nrow(expressions.na.which)>0) {
+      expressions[expressions.na.which] <- NA
+  }
+  expressions <- expressions-offset
+  close(pb)
+  return(expressions)
+}
author	Don Armstrong <don@donarmstrong.com>
	Fri, 7 Apr 2017 19:04:47 +0000 (12:04 -0700)
committer	Don Armstrong <don@donarmstrong.com>
	Fri, 7 Apr 2017 19:04:47 +0000 (12:04 -0700)