]> git.donarmstrong.com Git - loess_normalization.git/commitdiff
add initial version of loessnorm
authorDon Armstrong <don@donarmstrong.com>
Tue, 6 Dec 2016 23:01:15 +0000 (17:01 -0600)
committerDon Armstrong <don@donarmstrong.com>
Tue, 6 Dec 2016 23:01:15 +0000 (17:01 -0600)
DESCRIPTION [new file with mode: 0644]
NAMESPACE [new file with mode: 0644]
R/ma_loess_norm.R [new file with mode: 0644]
R/mva_plotting.R [new file with mode: 0644]
R/zzz.R [new file with mode: 0644]
man/loessNormalization-package.Rd [new file with mode: 0644]
man/loess_normalization.Rd [new file with mode: 0644]
man/mva.plot.pair.Rd [new file with mode: 0644]

diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644 (file)
index 0000000..9fc803e
--- /dev/null
@@ -0,0 +1,9 @@
+Package: loessNormalization
+Type: Package
+Title: Loess normalization and plotting routines
+Version: 0.1
+Date: 2016-06-14
+Author: Don Armstrong
+Maintainer: Don Armstrong <don@donarmstrong.com>
+Description: Loess normalization and plotting routines
+License: GPL 3+
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644 (file)
index 0000000..d75f824
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1 @@
+exportPattern("^[[:alpha:]]+")
diff --git a/R/ma_loess_norm.R b/R/ma_loess_norm.R
new file mode 100644 (file)
index 0000000..1b5494c
--- /dev/null
@@ -0,0 +1,107 @@
+library("stats")
+library("parallel")
+library("impute")
+
+##' Normalize using MvA loess using multiple processors
+##'
+##' For analyses where the number of samples makes all-against-all
+##' normalization prohibitive, this variant subselects a set of
+##' comparisons and runs them, iterating a specific number of times
+##' using as many cores as possible.
+##' @title loess_normalization
+##' @param expressions Gene expression values with probes in rows and samples in columns
+##' @param iterations Number of iterations to run (Default: 2)
+##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1).
+##' @param sample.size Number of combinations of samples to run (Default: 200).
+##' @param num.cores Number of cores to use (Default: half of them)
+##' @param noramlization.method Normalization method (Default: mean).
+##' @return data.frame of normalized expression values
+##' @author Don Armstrong
+loess_normalization <- function(expressions,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(detectCores()/2),1),normalization.method="mean"){
+  n <- ncol(expressions)
+  ## replace NA with the mean
+  expressions.na <- is.na(expressions)
+  expressions.na.which <- which(expressions.na,arr.ind=TRUE)
+  if (nrow(expressions.na.which)>0) {
+      if (normalization.method=="mean") {
+          expressions[unique(expressions.na.which[,"row"]),] <-
+              apply(expressions[unique(expressions.na.which[,"row"]),],
+                    1,
+                    function(x){
+                        if (all(is.na(x))) {
+                            x[] <- small.positive;
+                        } else {
+                            x[is.na(x)] <- mean(x,na.rm=TRUE);
+                        }
+                        x})
+      } else if (normalization.method=="knn") {
+          expressions <- impute.knn(expressions)$data
+      } else {
+          stop("Unknown normalization.method; currently supported methods are 'mean' and 'knn'")
+      }
+          
+  }
+  pb <- txtProgressBar(0,iterations,style=3)
+  for (q in 1:iterations) {
+      expressions.comb <-
+          combn(ncol(expressions),2)
+      expressions.comb.sample <- NULL
+      if (sample.size >= ncol(expressions.comb)) {
+          expressions.comb.sample <-
+              expressions.comb
+      } else {
+          expressions.comb.sample <-
+              expressions.comb[,sample.int(n=ncol(expressions.comb),
+                                          size=sample.size,
+                                           replace=FALSE)]
+      }
+      ### calculate the n for each sample
+      expressions.comb.sample.n <-
+          apply(sapply(1:ncol(expressions),
+                       `==`,expressions.comb.sample),
+                2,sum)
+      result.expressions <- array(1,dim=dim(expressions))
+      m <- 1
+      ## do this in batches of no more than 200 to avoid making a list
+      ## which is way too large to fit in memory
+      while (m <= ncol(expressions.comb.sample)) {
+          e.c.subsample <-
+              expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))]
+          bar.m.hat.list <-
+              mclapply(1:ncol(e.c.subsample),
+                       function(k) {
+                           i <- e.c.subsample[1,k]
+                           j <- e.c.subsample[2,k]
+                           mva <- data.frame(m=log2(expressions[,i]/expressions[,j]),
+                                             a=0.5*log2(expressions[,i]*expressions[,j]))
+                           temp.loess <-
+                               loess(m~a,mva,control=loess.control(surface="interpolate",
+                                                 statistics="approximate",
+                                                 trace.hat="approximate",
+                                                 iterations=2))
+                           ## predict for i, predict for j (\hat{M})
+                           ## subtract out \hat{M} from M
+                           m.hat <- predict(temp.loess,mva[,"a"])
+                           return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]),
+                                             j=2^(0.5*m.hat/expressions.comb.sample.n[j])))
+                       },
+                       mc.cores=num.cores
+                       )
+          for (k in 1:ncol(e.c.subsample)) {
+              result.expressions[,e.c.subsample[1,k]] <-
+                  result.expressions[,e.c.subsample[1,k]] *
+                      bar.m.hat.list[[k]][,"i"]
+              result.expressions[,e.c.subsample[2,k]] <-
+                  result.expressions[,e.c.subsample[2,k]] *
+                      bar.m.hat.list[[k]][,"j"]
+          }
+          m <- min(m+200,ncol(expressions.comb.sample))+1
+          setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1))
+      }
+      setTxtProgressBar(pb,q)
+      ## correct expressionss
+      expressions <- expressions*result.expressions
+  }
+  close(pb)
+  return(expressions)
+}
diff --git a/R/mva_plotting.R b/R/mva_plotting.R
new file mode 100644 (file)
index 0000000..39f68d6
--- /dev/null
@@ -0,0 +1,83 @@
+# MvA plotting for microarray data
+
+
+# Copyright 2003 By Don Armstrong
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+# 02111-1307, USA.
+
+# $Id: mva_plotting.R,v 1.5 2006/06/10 04:14:45 don Exp $
+
+
+library("grid")
+library("ggplot2")
+
+# mva.plot
+# Plots a pairwise M versus A plot with a loess fit line
+##' Plots a pairwise M versus A plot with a loess fit line
+##'
+##' .. content for \details{} ..
+##' @title mva.plot.pair
+##' @param array Array of data to plot
+##' @param title Title of plot (Default: FALSE; no title)
+##' @param small.positive replace numbers less than 0 with this
+##' @author Don Armstrong \email{don@@donarmstrong.com}
+mva.plot.pair <- function(array,title=FALSE,small.positive=0.1){
+  size <- ncol(array)
+  npair <- size*(size-1)/2
+  height <- min(round((npair*3/4)^0.5),3)
+  width <- min(ceiling(npair/height),3)
+  def.par <- par(no.readonly = TRUE)
+  mva.layout <- grid.layout(ncol=width,nrow=height)
+  pushViewport(viewport(layout=mva.layout))
+  ## we could also use combn here instead
+  num.plot <- 0
+  for(a in 1:(size-1)){
+      for(b in (a+1):size){
+          num.plot <- num.plot + 1
+          group.1 <- array[,a]
+          group.2 <- array[,b]
+          group.keep <- is.finite(group.1) & is.finite(group.2)
+          group.1 <- group.1[group.keep]
+          group.2 <- group.2[group.keep]
+          group.min <- min(c(group.1,group.2))
+          if (group.min <= 0) {
+              group.1 <- group.1 - group.min + small.positive
+              group.2 <- group.2 - group.min + small.positive
+          }
+          A <- 0.5*log2(group.1*group.2)
+          M <- log2(group.1/group.2)
+          temp.frame <- na.omit(data.frame(cbind(A,M)))
+          if (num.plot > 1 &&
+              floor((num.plot -1)/ width) %% height == 0 &&
+              (num.plot-1) %% width == 0) {
+              popViewport(1)
+              grid.text(title,x=unit(0.0,"npc")+unit(1,"lines"),y=unit(0.5,"npc"),just="center",rot=90)
+              grid.newpage()
+              pushViewport(viewport(layout=mva.layout))
+          }
+          ## print(floor((num.plot -1)/ width) %% height+1)
+          ## print((num.plot - 1)%% width+1)
+          print(ggplot(temp.frame,aes(y=M,x=A))+
+                geom_hex()+
+                stat_smooth(method="gam")+
+                ggtitle(paste0(colnames(array)[a],"\nvs\n",colnames(array)[b]))+
+                theme(legend.position="none"),
+                vp=viewport(layout.pos.row=floor((num.plot -1)/ width) %% height+1, layout.pos.col=((num.plot - 1)%% width+1)))
+                
+      }
+  }
+  popViewport()
+}
diff --git a/R/zzz.R b/R/zzz.R
new file mode 100644 (file)
index 0000000..f361b37
--- /dev/null
+++ b/R/zzz.R
@@ -0,0 +1,3 @@
+.onAttach <- function(...) {
+    ## do nothing currently.
+}
diff --git a/man/loessNormalization-package.Rd b/man/loessNormalization-package.Rd
new file mode 100644 (file)
index 0000000..19233d9
--- /dev/null
@@ -0,0 +1,36 @@
+\name{loessNormalization-package}
+\alias{loessNormalization-package}
+\alias{loessNormalization}
+\docType{package}
+\title{
+\packageTitle{loessNormalization}
+}
+\description{
+\packageDescription{loessNormalization}
+}
+\details{
+
+The DESCRIPTION file:
+\packageDESCRIPTION{loessNormalization}
+\packageIndices{loessNormalization}
+~~ An overview of how to use the package, including the most important ~~
+~~ functions ~~
+}
+\author{
+\packageAuthor{loessNormalization}
+
+Maintainer: \packageMaintainer{loessNormalization}
+}
+\references{
+~~ Literature or other references for background information ~~
+}
+~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~
+~~ the R documentation directory ~~
+\keyword{ package }
+\seealso{
+~~ Optional links to other man pages, e.g. ~~
+~~ \code{\link[<pkg>:<pkg>-package]{<pkg>}} ~~
+}
+\examples{
+~~ simple examples of the most important functions ~~
+}
diff --git a/man/loess_normalization.Rd b/man/loess_normalization.Rd
new file mode 100644 (file)
index 0000000..20a93ca
--- /dev/null
@@ -0,0 +1,39 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/ma_loess_norm.R
+\name{loess_normalization}
+\alias{loess_normalization}
+\title{loess_normalization}
+\usage{
+loess_normalization(expressions, iterations = 2, small.positive = 0.1,
+  sample.size = 200, num.cores = max(floor(detectCores()/2), 1),
+  normalization.method = "mean")
+}
+\arguments{
+\item{expressions}{Gene expression values with probes in rows and samples in columns}
+
+\item{iterations}{Number of iterations to run (Default: 2)}
+
+\item{small.positive}{Number to replace negative and zero expressions with (Default: 0.1).}
+
+\item{sample.size}{Number of combinations of samples to run (Default: 200).}
+
+\item{num.cores}{Number of cores to use (Default: half of them)}
+
+\item{noramlization.method}{Normalization method (Default: mean).}
+}
+\value{
+data.frame of normalized expression values
+}
+\description{
+Normalize using MvA loess using multiple processors
+}
+\details{
+For analyses where the number of samples makes all-against-all
+normalization prohibitive, this variant subselects a set of
+comparisons and runs them, iterating a specific number of times
+using as many cores as possible.
+}
+\author{
+Don Armstrong
+}
+
diff --git a/man/mva.plot.pair.Rd b/man/mva.plot.pair.Rd
new file mode 100644 (file)
index 0000000..3a47edc
--- /dev/null
@@ -0,0 +1,25 @@
+% Generated by roxygen2 (4.1.1): do not edit by hand
+% Please edit documentation in R/mva_plotting.R
+\name{mva.plot.pair}
+\alias{mva.plot.pair}
+\title{mva.plot.pair}
+\usage{
+mva.plot.pair(array, title = F, small.positive = 0.1)
+}
+\arguments{
+\item{array}{Array of data to plot}
+
+\item{title}{Title of plot (Default: FALSE; no title)}
+
+\item{small.positive}{replace numbers less than 0 with this}
+}
+\description{
+Plots a pairwise M versus A plot with a loess fit line
+}
+\details{
+.. content for \details{} ..
+}
+\author{
+Don Armstrong \email{don@donarmstrong.com}
+}
+