+library("stats")
+library("parallel")
+library("impute")
+
+##' Normalize using MvA loess using multiple processors
+##'
+##' For analyses where the number of samples makes all-against-all
+##' normalization prohibitive, this variant subselects a set of
+##' comparisons and runs them, iterating a specific number of times
+##' using as many cores as possible.
+##' @title loess_normalization
+##' @param expressions Gene expression values with probes in rows and samples in columns
+##' @param iterations Number of iterations to run (Default: 2)
+##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1).
+##' @param sample.size Number of combinations of samples to run (Default: 200).
+##' @param num.cores Number of cores to use (Default: half of them)
+##' @param noramlization.method Normalization method (Default: mean).
+##' @return data.frame of normalized expression values
+##' @author Don Armstrong
+loess_normalization <- function(expressions,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(detectCores()/2),1),normalization.method="mean"){
+ n <- ncol(expressions)
+ ## replace NA with the mean
+ expressions.na <- is.na(expressions)
+ expressions.na.which <- which(expressions.na,arr.ind=TRUE)
+ if (nrow(expressions.na.which)>0) {
+ if (normalization.method=="mean") {
+ expressions[unique(expressions.na.which[,"row"]),] <-
+ apply(expressions[unique(expressions.na.which[,"row"]),],
+ 1,
+ function(x){
+ if (all(is.na(x))) {
+ x[] <- small.positive;
+ } else {
+ x[is.na(x)] <- mean(x,na.rm=TRUE);
+ }
+ x})
+ } else if (normalization.method=="knn") {
+ expressions <- impute.knn(expressions)$data
+ } else {
+ stop("Unknown normalization.method; currently supported methods are 'mean' and 'knn'")
+ }
+
+ }
+ pb <- txtProgressBar(0,iterations,style=3)
+ for (q in 1:iterations) {
+ expressions.comb <-
+ combn(ncol(expressions),2)
+ expressions.comb.sample <- NULL
+ if (sample.size >= ncol(expressions.comb)) {
+ expressions.comb.sample <-
+ expressions.comb
+ } else {
+ expressions.comb.sample <-
+ expressions.comb[,sample.int(n=ncol(expressions.comb),
+ size=sample.size,
+ replace=FALSE)]
+ }
+ ### calculate the n for each sample
+ expressions.comb.sample.n <-
+ apply(sapply(1:ncol(expressions),
+ `==`,expressions.comb.sample),
+ 2,sum)
+ result.expressions <- array(1,dim=dim(expressions))
+ m <- 1
+ ## do this in batches of no more than 200 to avoid making a list
+ ## which is way too large to fit in memory
+ while (m <= ncol(expressions.comb.sample)) {
+ e.c.subsample <-
+ expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))]
+ bar.m.hat.list <-
+ mclapply(1:ncol(e.c.subsample),
+ function(k) {
+ i <- e.c.subsample[1,k]
+ j <- e.c.subsample[2,k]
+ mva <- data.frame(m=log2(expressions[,i]/expressions[,j]),
+ a=0.5*log2(expressions[,i]*expressions[,j]))
+ temp.loess <-
+ loess(m~a,mva,control=loess.control(surface="interpolate",
+ statistics="approximate",
+ trace.hat="approximate",
+ iterations=2))
+ ## predict for i, predict for j (\hat{M})
+ ## subtract out \hat{M} from M
+ m.hat <- predict(temp.loess,mva[,"a"])
+ return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]),
+ j=2^(0.5*m.hat/expressions.comb.sample.n[j])))
+ },
+ mc.cores=num.cores
+ )
+ for (k in 1:ncol(e.c.subsample)) {
+ result.expressions[,e.c.subsample[1,k]] <-
+ result.expressions[,e.c.subsample[1,k]] *
+ bar.m.hat.list[[k]][,"i"]
+ result.expressions[,e.c.subsample[2,k]] <-
+ result.expressions[,e.c.subsample[2,k]] *
+ bar.m.hat.list[[k]][,"j"]
+ }
+ m <- min(m+200,ncol(expressions.comb.sample))+1
+ setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1))
+ }
+ setTxtProgressBar(pb,q)
+ ## correct expressionss
+ expressions <- expressions*result.expressions
+ }
+ close(pb)
+ return(expressions)
+}