From 98c6ee89e10662524b2849d220ad783c8e4889ce Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Fri, 7 Apr 2017 12:04:47 -0700 Subject: [PATCH] add reference-based loess normalization --- R/loess_normalization_reference.R | 140 ++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 R/loess_normalization_reference.R diff --git a/R/loess_normalization_reference.R b/R/loess_normalization_reference.R new file mode 100644 index 0000000..c2ac448 --- /dev/null +++ b/R/loess_normalization_reference.R @@ -0,0 +1,140 @@ +library("stats") +library("parallel") +library("impute") + +##' Normalize using MvA loess using multiple processors +##' +##' For analyses where the number of samples makes all-against-all +##' normalization prohibitive, this variant subselects a set of +##' comparisons and runs them, iterating a specific number of times +##' using as many cores as possible. +##' @title loess_normalization_reference +##' @param expressions Gene expression values with probes in rows and samples in columns +##' @param reference Names of rownames of reference genes +##' @param iterations Number of iterations to run (Default: 2) +##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1). +##' @param sample.size Number of combinations of samples to run (Default: 200). +##' @param num.cores Number of cores to use (Default: half of them) +##' @param imputation.method Method to impute missing values. String of "mean" (missing values are the mean of the row) or "knn" (use impute.knn to impute missing values) (Default: "mean"). +##' @param offset Offset for expression (Default: 0; set to 1 if sample contains zeros) +##' @return data.frame of normalized expression values +##' @author Don Armstrong +loess_normalization_reference <- function(expressions,reference=NA,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(parallel::detectCores()/2),1),imputation.method="mean",offset=0){ + n <- ncol(expressions) + ## replace NA with the mean + expressions.na <- is.na(expressions) + expressions.na.which <- which(expressions.na,arr.ind=TRUE) + expression.zero <- sum(expressions==0,na.rm=TRUE) + if (expression.zero && offset == 0) { + message("Sample contains zeros, setting offset to 1") + offset <- 1 + } + expressions <- expressions+offset + pb <- txtProgressBar(0,iterations,style=3) + if (is.na(reference)) { + reference <- rownames(expressions) + } + reference.rows <- rownames(expressions) %in% reference + for (q in 1:iterations) { + if (nrow(expressions.na.which)>0) { + expressions[expressions.na.which] <- NA + if (imputation.method=="mean") { + expressions[unique(expressions.na.which[,"row"]),] <- + apply(expressions[unique(expressions.na.which[,"row"]),], + 1, + function(x){ + if (all(is.na(x))) { + x[] <- small.positive; + } else { + x[is.na(x)] <- mean(x,na.rm=TRUE); + } + x}) + } else if (imputation.method=="knn") { + expressions <- impute.knn(expressions)$data + } else { + stop("Unknown imputation.method; currently supported methods are 'mean' and 'knn'") + } + + } + expressions.comb <- + combn(ncol(expressions),2) + expressions.comb.sample <- NULL + if (sample.size >= ncol(expressions.comb)) { + expressions.comb.sample <- + expressions.comb + } else { + expressions.comb.sample <- + expressions.comb[,sample.int(n=ncol(expressions.comb), + size=sample.size, + replace=FALSE)] + } + ### calculate the n for each sample + expressions.comb.sample.n <- + apply(sapply(1:ncol(expressions), + `==`,expressions.comb.sample), + 2,sum) + result.expressions <- array(1,dim=dim(expressions)) + m <- 1 + ## do this in batches of no more than 200 to avoid making a list + ## which is way too large to fit in memory + while (m <= ncol(expressions.comb.sample)) { + e.c.subsample <- + expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))] + bar.m.hat.list <- + parallel::mclapply(1:ncol(e.c.subsample), + function(k) { + i <- e.c.subsample[1,k] + j <- e.c.subsample[2,k] + mva <- data.frame(m=log2((expressions[,i])/ + (expressions[,j])), + a=0.5*log2((expressions[,i]) + *(expressions[,j])), + reference=reference.rows + ) + ## log2((i+1)/(j+1))~0.5*log2((i+1)*(j+1)) + temp.loess <- + loess(m~a,mva[mva$reference,], + control=loess.control(surface="interpolate", + statistics="approximate", + trace.hat="approximate", + iterations=2)) + ## m should be zero, so we want to adjust + ## the ratio of i/j by the inverse of m.hat + + ## log2(i/j) - 0.5*m.hat = 0 + + ## log2(i/j)=0.5*m.hat + + ## i/j=2^(0.5*m.hat) + + ## i = 2^(0.5*m.hat)*j + m.hat <- predict(temp.loess,mva[,"a"]) + ## if for some reason we cannot predict, assume 0. + m.hat[is.na(m.hat)] <- 0 + return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]), + j=2^(0.5*m.hat/expressions.comb.sample.n[j]))) + }, + mc.cores=num.cores + ) + for (k in 1:ncol(e.c.subsample)) { + result.expressions[,e.c.subsample[1,k]] <- + result.expressions[,e.c.subsample[1,k]] * + bar.m.hat.list[[k]][,"i"] + result.expressions[,e.c.subsample[2,k]] <- + result.expressions[,e.c.subsample[2,k]] * + bar.m.hat.list[[k]][,"j"] + } + m <- min(m+200,ncol(expressions.comb.sample))+1 + setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1)) + } + setTxtProgressBar(pb,q) + ## correct expressionss + expressions <- expressions*result.expressions + } + if (nrow(expressions.na.which)>0) { + expressions[expressions.na.which] <- NA + } + expressions <- expressions-offset + close(pb) + return(expressions) +} -- 2.39.2