]> git.donarmstrong.com Git - loess_normalization.git/blob - R/ma_loess_norm.R
add initial version of loessnorm
[loess_normalization.git] / R / ma_loess_norm.R
1 library("stats")
2 library("parallel")
3 library("impute")
4
5 ##' Normalize using MvA loess using multiple processors
6 ##'
7 ##' For analyses where the number of samples makes all-against-all
8 ##' normalization prohibitive, this variant subselects a set of
9 ##' comparisons and runs them, iterating a specific number of times
10 ##' using as many cores as possible.
11 ##' @title loess_normalization
12 ##' @param expressions Gene expression values with probes in rows and samples in columns
13 ##' @param iterations Number of iterations to run (Default: 2)
14 ##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1).
15 ##' @param sample.size Number of combinations of samples to run (Default: 200).
16 ##' @param num.cores Number of cores to use (Default: half of them)
17 ##' @param noramlization.method Normalization method (Default: mean).
18 ##' @return data.frame of normalized expression values
19 ##' @author Don Armstrong
20 loess_normalization <- function(expressions,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(detectCores()/2),1),normalization.method="mean"){
21   n <- ncol(expressions)
22   ## replace NA with the mean
23   expressions.na <- is.na(expressions)
24   expressions.na.which <- which(expressions.na,arr.ind=TRUE)
25   if (nrow(expressions.na.which)>0) {
26       if (normalization.method=="mean") {
27           expressions[unique(expressions.na.which[,"row"]),] <-
28               apply(expressions[unique(expressions.na.which[,"row"]),],
29                     1,
30                     function(x){
31                         if (all(is.na(x))) {
32                             x[] <- small.positive;
33                         } else {
34                             x[is.na(x)] <- mean(x,na.rm=TRUE);
35                         }
36                         x})
37       } else if (normalization.method=="knn") {
38           expressions <- impute.knn(expressions)$data
39       } else {
40           stop("Unknown normalization.method; currently supported methods are 'mean' and 'knn'")
41       }
42           
43   }
44   pb <- txtProgressBar(0,iterations,style=3)
45   for (q in 1:iterations) {
46       expressions.comb <-
47           combn(ncol(expressions),2)
48       expressions.comb.sample <- NULL
49       if (sample.size >= ncol(expressions.comb)) {
50           expressions.comb.sample <-
51               expressions.comb
52       } else {
53           expressions.comb.sample <-
54               expressions.comb[,sample.int(n=ncol(expressions.comb),
55                                           size=sample.size,
56                                            replace=FALSE)]
57       }
58       ### calculate the n for each sample
59       expressions.comb.sample.n <-
60           apply(sapply(1:ncol(expressions),
61                        `==`,expressions.comb.sample),
62                 2,sum)
63       result.expressions <- array(1,dim=dim(expressions))
64       m <- 1
65       ## do this in batches of no more than 200 to avoid making a list
66       ## which is way too large to fit in memory
67       while (m <= ncol(expressions.comb.sample)) {
68           e.c.subsample <-
69               expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))]
70           bar.m.hat.list <-
71               mclapply(1:ncol(e.c.subsample),
72                        function(k) {
73                            i <- e.c.subsample[1,k]
74                            j <- e.c.subsample[2,k]
75                            mva <- data.frame(m=log2(expressions[,i]/expressions[,j]),
76                                              a=0.5*log2(expressions[,i]*expressions[,j]))
77                            temp.loess <-
78                                loess(m~a,mva,control=loess.control(surface="interpolate",
79                                                  statistics="approximate",
80                                                  trace.hat="approximate",
81                                                  iterations=2))
82                            ## predict for i, predict for j (\hat{M})
83                            ## subtract out \hat{M} from M
84                            m.hat <- predict(temp.loess,mva[,"a"])
85                            return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]),
86                                              j=2^(0.5*m.hat/expressions.comb.sample.n[j])))
87                        },
88                        mc.cores=num.cores
89                        )
90           for (k in 1:ncol(e.c.subsample)) {
91               result.expressions[,e.c.subsample[1,k]] <-
92                   result.expressions[,e.c.subsample[1,k]] *
93                       bar.m.hat.list[[k]][,"i"]
94               result.expressions[,e.c.subsample[2,k]] <-
95                   result.expressions[,e.c.subsample[2,k]] *
96                       bar.m.hat.list[[k]][,"j"]
97           }
98           m <- min(m+200,ncol(expressions.comb.sample))+1
99           setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1))
100       }
101       setTxtProgressBar(pb,q)
102       ## correct expressionss
103       expressions <- expressions*result.expressions
104   }
105   close(pb)
106   return(expressions)
107 }