library("stats") library("parallel") library("impute") ##' Normalize using MvA loess using multiple processors ##' ##' For analyses where the number of samples makes all-against-all ##' normalization prohibitive, this variant subselects a set of ##' comparisons and runs them, iterating a specific number of times ##' using as many cores as possible. ##' @title loess_normalization ##' @param expressions Gene expression values with probes in rows and samples in columns ##' @param iterations Number of iterations to run (Default: 2) ##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1). ##' @param sample.size Number of combinations of samples to run (Default: 200). ##' @param num.cores Number of cores to use (Default: half of them) ##' @param noramlization.method Normalization method (Default: mean). ##' @return data.frame of normalized expression values ##' @author Don Armstrong loess_normalization <- function(expressions,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(detectCores()/2),1),normalization.method="mean"){ n <- ncol(expressions) ## replace NA with the mean expressions.na <- is.na(expressions) expressions.na.which <- which(expressions.na,arr.ind=TRUE) if (nrow(expressions.na.which)>0) { if (normalization.method=="mean") { expressions[unique(expressions.na.which[,"row"]),] <- apply(expressions[unique(expressions.na.which[,"row"]),], 1, function(x){ if (all(is.na(x))) { x[] <- small.positive; } else { x[is.na(x)] <- mean(x,na.rm=TRUE); } x}) } else if (normalization.method=="knn") { expressions <- impute.knn(expressions)$data } else { stop("Unknown normalization.method; currently supported methods are 'mean' and 'knn'") } } pb <- txtProgressBar(0,iterations,style=3) for (q in 1:iterations) { expressions.comb <- combn(ncol(expressions),2) expressions.comb.sample <- NULL if (sample.size >= ncol(expressions.comb)) { expressions.comb.sample <- expressions.comb } else { expressions.comb.sample <- expressions.comb[,sample.int(n=ncol(expressions.comb), size=sample.size, replace=FALSE)] } ### calculate the n for each sample expressions.comb.sample.n <- apply(sapply(1:ncol(expressions), `==`,expressions.comb.sample), 2,sum) result.expressions <- array(1,dim=dim(expressions)) m <- 1 ## do this in batches of no more than 200 to avoid making a list ## which is way too large to fit in memory while (m <= ncol(expressions.comb.sample)) { e.c.subsample <- expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))] bar.m.hat.list <- mclapply(1:ncol(e.c.subsample), function(k) { i <- e.c.subsample[1,k] j <- e.c.subsample[2,k] mva <- data.frame(m=log2(expressions[,i]/expressions[,j]), a=0.5*log2(expressions[,i]*expressions[,j])) temp.loess <- loess(m~a,mva,control=loess.control(surface="interpolate", statistics="approximate", trace.hat="approximate", iterations=2)) ## predict for i, predict for j (\hat{M}) ## subtract out \hat{M} from M m.hat <- predict(temp.loess,mva[,"a"]) return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]), j=2^(0.5*m.hat/expressions.comb.sample.n[j]))) }, mc.cores=num.cores ) for (k in 1:ncol(e.c.subsample)) { result.expressions[,e.c.subsample[1,k]] <- result.expressions[,e.c.subsample[1,k]] * bar.m.hat.list[[k]][,"i"] result.expressions[,e.c.subsample[2,k]] <- result.expressions[,e.c.subsample[2,k]] * bar.m.hat.list[[k]][,"j"] } m <- min(m+200,ncol(expressions.comb.sample))+1 setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1)) } setTxtProgressBar(pb,q) ## correct expressionss expressions <- expressions*result.expressions } close(pb) return(expressions) }