5 ##' Normalize using MvA loess using multiple processors
7 ##' For analyses where the number of samples makes all-against-all
8 ##' normalization prohibitive, this variant subselects a set of
9 ##' comparisons and runs them, iterating a specific number of times
10 ##' using as many cores as possible.
11 ##' @title loess_normalization_reference
12 ##' @param expressions Gene expression values with probes in rows and samples in columns
13 ##' @param reference Row names of the reference genes used to fit the loess (Default: NA, which uses all rows)
14 ##' @param iterations Number of iterations to run (Default: 2)
15 ##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1).
16 ##' @param sample.size Number of combinations of samples to run (Default: 200).
17 ##' @param num.cores Number of cores to use (Default: half of them)
18 ##' @param imputation.method Method to impute missing values. String of "mean" (missing values are the mean of the row) or "knn" (use impute.knn to impute missing values) (Default: "mean").
19 ##' @param offset Offset for expression (Default: 0; set to 1 if sample contains zeros)
20 ##' @return data.frame of normalized expression values
21 ##' @author Don Armstrong
22 loess_normalization_reference <- function(expressions,reference=NA,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(parallel::detectCores()/2),1),imputation.method="mean",offset=0){
23 n <- ncol(expressions)
24 ## replace NA with the mean
25 expressions.na <- is.na(expressions)
26 expressions.na.which <- which(expressions.na,arr.ind=TRUE)
27 expression.zero <- sum(expressions==0,na.rm=TRUE)
28 if (expression.zero && offset == 0) {
29 message("Sample contains zeros, setting offset to 1")
32 expressions <- expressions+offset
33 pb <- txtProgressBar(0,iterations,style=3)
34 if (is.na(reference)) {
35 reference <- rownames(expressions)
37 reference.rows <- rownames(expressions) %in% reference
38 for (q in 1:iterations) {
39 if (nrow(expressions.na.which)>0) {
40 expressions[expressions.na.which] <- NA
41 if (imputation.method=="mean") {
42 expressions[unique(expressions.na.which[,"row"]),] <-
43 apply(expressions[unique(expressions.na.which[,"row"]),],
47 x[] <- small.positive;
49 x[is.na(x)] <- mean(x,na.rm=TRUE);
52 } else if (imputation.method=="knn") {
53 expressions <- impute.knn(expressions)$data
55 stop("Unknown imputation.method; currently supported methods are 'mean' and 'knn'")
60 combn(ncol(expressions),2)
61 expressions.comb.sample <- NULL
62 if (sample.size >= ncol(expressions.comb)) {
63 expressions.comb.sample <-
66 expressions.comb.sample <-
67 expressions.comb[,sample.int(n=ncol(expressions.comb),
71 ### calculate the n for each sample
72 expressions.comb.sample.n <-
73 apply(sapply(1:ncol(expressions),
74 `==`,expressions.comb.sample),
76 result.expressions <- array(1,dim=dim(expressions))
78 ## do this in batches of no more than 200 to avoid making a list
79 ## which is way too large to fit in memory
80 while (m <= ncol(expressions.comb.sample)) {
82 expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))]
84 parallel::mclapply(1:ncol(e.c.subsample),
86 i <- e.c.subsample[1,k]
87 j <- e.c.subsample[2,k]
88 mva <- data.frame(m=log2((expressions[,i])/
90 a=0.5*log2((expressions[,i])
92 reference=reference.rows
94 ## log2((i+1)/(j+1))~0.5*log2((i+1)*(j+1))
96 loess(m~a,mva[mva$reference,],
97 control=loess.control(surface="interpolate",
98 statistics="approximate",
99 trace.hat="approximate",
101 ## m should be zero, so we want to adjust
102 ## the ratio of i/j by the inverse of m.hat
104 ## log2(i/j) - 0.5*m.hat = 0
106 ## log2(i/j)=0.5*m.hat
110 ## i = 2^(0.5*m.hat)*j
111 m.hat <- predict(temp.loess,mva[,"a"])
112 ## if for some reason we cannot predict, assume 0.
113 m.hat[is.na(m.hat)] <- 0
114 return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]),
115 j=2^(0.5*m.hat/expressions.comb.sample.n[j])))
119 for (k in 1:ncol(e.c.subsample)) {
120 result.expressions[,e.c.subsample[1,k]] <-
121 result.expressions[,e.c.subsample[1,k]] *
122 bar.m.hat.list[[k]][,"i"]
123 result.expressions[,e.c.subsample[2,k]] <-
124 result.expressions[,e.c.subsample[2,k]] *
125 bar.m.hat.list[[k]][,"j"]
127 m <- min(m+200,ncol(expressions.comb.sample))+1
128 setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1))
130 setTxtProgressBar(pb,q)
131 ## correct expressions
132 expressions <- expressions*result.expressions
134 if (nrow(expressions.na.which)>0) {
135 expressions[expressions.na.which] <- NA
137 expressions <- expressions-offset