From: Don Armstrong Date: Tue, 6 Dec 2016 23:01:15 +0000 (-0600) Subject: add initial version of loessnorm X-Git-Url: https://git.donarmstrong.com/?p=loess_normalization.git;a=commitdiff_plain;h=1a12d21c14bb6c9a2403f1d9913eb9c3f73fd35c add initial version of loessnorm --- 1a12d21c14bb6c9a2403f1d9913eb9c3f73fd35c diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..9fc803e --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,9 @@ +Package: loessNormalization +Type: Package +Title: Loess normalization and plotting routines +Version: 0.1 +Date: 2016-06-14 +Author: Don Armstrong +Maintainer: Don Armstrong +Description: Loess normalization and plotting routines +License: GPL 3+ diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..d75f824 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1 @@ +exportPattern("^[[:alpha:]]+") diff --git a/R/ma_loess_norm.R b/R/ma_loess_norm.R new file mode 100644 index 0000000..1b5494c --- /dev/null +++ b/R/ma_loess_norm.R @@ -0,0 +1,107 @@ +library("stats") +library("parallel") +library("impute") + +##' Normalize using MvA loess using multiple processors +##' +##' For analyses where the number of samples makes all-against-all +##' normalization prohibitive, this variant subselects a set of +##' comparisons and runs them, iterating a specific number of times +##' using as many cores as possible. +##' @title loess_normalization +##' @param expressions Gene expression values with probes in rows and samples in columns +##' @param iterations Number of iterations to run (Default: 2) +##' @param small.positive Number to replace negative and zero expressions with (Default: 0.1). +##' @param sample.size Number of combinations of samples to run (Default: 200). +##' @param num.cores Number of cores to use (Default: half of them) +##' @param noramlization.method Normalization method (Default: mean). +##' @return data.frame of normalized expression values +##' @author Don Armstrong +loess_normalization <- function(expressions,iterations=2,small.positive=0.1,sample.size=200,num.cores=max(floor(detectCores()/2),1),normalization.method="mean"){ + n <- ncol(expressions) + ## replace NA with the mean + expressions.na <- is.na(expressions) + expressions.na.which <- which(expressions.na,arr.ind=TRUE) + if (nrow(expressions.na.which)>0) { + if (normalization.method=="mean") { + expressions[unique(expressions.na.which[,"row"]),] <- + apply(expressions[unique(expressions.na.which[,"row"]),], + 1, + function(x){ + if (all(is.na(x))) { + x[] <- small.positive; + } else { + x[is.na(x)] <- mean(x,na.rm=TRUE); + } + x}) + } else if (normalization.method=="knn") { + expressions <- impute.knn(expressions)$data + } else { + stop("Unknown normalization.method; currently supported methods are 'mean' and 'knn'") + } + + } + pb <- txtProgressBar(0,iterations,style=3) + for (q in 1:iterations) { + expressions.comb <- + combn(ncol(expressions),2) + expressions.comb.sample <- NULL + if (sample.size >= ncol(expressions.comb)) { + expressions.comb.sample <- + expressions.comb + } else { + expressions.comb.sample <- + expressions.comb[,sample.int(n=ncol(expressions.comb), + size=sample.size, + replace=FALSE)] + } + ### calculate the n for each sample + expressions.comb.sample.n <- + apply(sapply(1:ncol(expressions), + `==`,expressions.comb.sample), + 2,sum) + result.expressions <- array(1,dim=dim(expressions)) + m <- 1 + ## do this in batches of no more than 200 to avoid making a list + ## which is way too large to fit in memory + while (m <= ncol(expressions.comb.sample)) { + e.c.subsample <- + expressions.comb.sample[,m:min(m+200,ncol(expressions.comb.sample))] + bar.m.hat.list <- + mclapply(1:ncol(e.c.subsample), + function(k) { + i <- e.c.subsample[1,k] + j <- e.c.subsample[2,k] + mva <- data.frame(m=log2(expressions[,i]/expressions[,j]), + a=0.5*log2(expressions[,i]*expressions[,j])) + temp.loess <- + loess(m~a,mva,control=loess.control(surface="interpolate", + statistics="approximate", + trace.hat="approximate", + iterations=2)) + ## predict for i, predict for j (\hat{M}) + ## subtract out \hat{M} from M + m.hat <- predict(temp.loess,mva[,"a"]) + return(data.frame(i=2^(-0.5*m.hat/expressions.comb.sample.n[i]), + j=2^(0.5*m.hat/expressions.comb.sample.n[j]))) + }, + mc.cores=num.cores + ) + for (k in 1:ncol(e.c.subsample)) { + result.expressions[,e.c.subsample[1,k]] <- + result.expressions[,e.c.subsample[1,k]] * + bar.m.hat.list[[k]][,"i"] + result.expressions[,e.c.subsample[2,k]] <- + result.expressions[,e.c.subsample[2,k]] * + bar.m.hat.list[[k]][,"j"] + } + m <- min(m+200,ncol(expressions.comb.sample))+1 + setTxtProgressBar(pb,(q-1)+m/(ncol(expressions.comb.sample)+1)) + } + setTxtProgressBar(pb,q) + ## correct expressionss + expressions <- expressions*result.expressions + } + close(pb) + return(expressions) +} diff --git a/R/mva_plotting.R b/R/mva_plotting.R new file mode 100644 index 0000000..39f68d6 --- /dev/null +++ b/R/mva_plotting.R @@ -0,0 +1,83 @@ +# MvA plotting for microarray data + + +# Copyright 2003 By Don Armstrong + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. + +# $Id: mva_plotting.R,v 1.5 2006/06/10 04:14:45 don Exp $ + + +library("grid") +library("ggplot2") + +# mva.plot +# Plots a pairwise M versus A plot with a loess fit line +##' Plots a pairwise M versus A plot with a loess fit line +##' +##' .. content for \details{} .. +##' @title mva.plot.pair +##' @param array Array of data to plot +##' @param title Title of plot (Default: FALSE; no title) +##' @param small.positive replace numbers less than 0 with this +##' @author Don Armstrong \email{don@@donarmstrong.com} +mva.plot.pair <- function(array,title=FALSE,small.positive=0.1){ + size <- ncol(array) + npair <- size*(size-1)/2 + height <- min(round((npair*3/4)^0.5),3) + width <- min(ceiling(npair/height),3) + def.par <- par(no.readonly = TRUE) + mva.layout <- grid.layout(ncol=width,nrow=height) + pushViewport(viewport(layout=mva.layout)) + ## we could also use combn here instead + num.plot <- 0 + for(a in 1:(size-1)){ + for(b in (a+1):size){ + num.plot <- num.plot + 1 + group.1 <- array[,a] + group.2 <- array[,b] + group.keep <- is.finite(group.1) & is.finite(group.2) + group.1 <- group.1[group.keep] + group.2 <- group.2[group.keep] + group.min <- min(c(group.1,group.2)) + if (group.min <= 0) { + group.1 <- group.1 - group.min + small.positive + group.2 <- group.2 - group.min + small.positive + } + A <- 0.5*log2(group.1*group.2) + M <- log2(group.1/group.2) + temp.frame <- na.omit(data.frame(cbind(A,M))) + if (num.plot > 1 && + floor((num.plot -1)/ width) %% height == 0 && + (num.plot-1) %% width == 0) { + popViewport(1) + grid.text(title,x=unit(0.0,"npc")+unit(1,"lines"),y=unit(0.5,"npc"),just="center",rot=90) + grid.newpage() + pushViewport(viewport(layout=mva.layout)) + } + ## print(floor((num.plot -1)/ width) %% height+1) + ## print((num.plot - 1)%% width+1) + print(ggplot(temp.frame,aes(y=M,x=A))+ + geom_hex()+ + stat_smooth(method="gam")+ + ggtitle(paste0(colnames(array)[a],"\nvs\n",colnames(array)[b]))+ + theme(legend.position="none"), + vp=viewport(layout.pos.row=floor((num.plot -1)/ width) %% height+1, layout.pos.col=((num.plot - 1)%% width+1))) + + } + } + popViewport() +} diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..f361b37 --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,3 @@ +.onAttach <- function(...) { + ## do nothing currently. +} diff --git a/man/loessNormalization-package.Rd b/man/loessNormalization-package.Rd new file mode 100644 index 0000000..19233d9 --- /dev/null +++ b/man/loessNormalization-package.Rd @@ -0,0 +1,36 @@ +\name{loessNormalization-package} +\alias{loessNormalization-package} +\alias{loessNormalization} +\docType{package} +\title{ +\packageTitle{loessNormalization} +} +\description{ +\packageDescription{loessNormalization} +} +\details{ + +The DESCRIPTION file: +\packageDESCRIPTION{loessNormalization} +\packageIndices{loessNormalization} +~~ An overview of how to use the package, including the most important ~~ +~~ functions ~~ +} +\author{ +\packageAuthor{loessNormalization} + +Maintainer: \packageMaintainer{loessNormalization} +} +\references{ +~~ Literature or other references for background information ~~ +} +~~ Optionally other standard keywords, one per line, from file KEYWORDS in ~~ +~~ the R documentation directory ~~ +\keyword{ package } +\seealso{ +~~ Optional links to other man pages, e.g. ~~ +~~ \code{\link[:-package]{}} ~~ +} +\examples{ +~~ simple examples of the most important functions ~~ +} diff --git a/man/loess_normalization.Rd b/man/loess_normalization.Rd new file mode 100644 index 0000000..20a93ca --- /dev/null +++ b/man/loess_normalization.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/ma_loess_norm.R +\name{loess_normalization} +\alias{loess_normalization} +\title{loess_normalization} +\usage{ +loess_normalization(expressions, iterations = 2, small.positive = 0.1, + sample.size = 200, num.cores = max(floor(detectCores()/2), 1), + normalization.method = "mean") +} +\arguments{ +\item{expressions}{Gene expression values with probes in rows and samples in columns} + +\item{iterations}{Number of iterations to run (Default: 2)} + +\item{small.positive}{Number to replace negative and zero expressions with (Default: 0.1).} + +\item{sample.size}{Number of combinations of samples to run (Default: 200).} + +\item{num.cores}{Number of cores to use (Default: half of them)} + +\item{noramlization.method}{Normalization method (Default: mean).} +} +\value{ +data.frame of normalized expression values +} +\description{ +Normalize using MvA loess using multiple processors +} +\details{ +For analyses where the number of samples makes all-against-all +normalization prohibitive, this variant subselects a set of +comparisons and runs them, iterating a specific number of times +using as many cores as possible. +} +\author{ +Don Armstrong +} + diff --git a/man/mva.plot.pair.Rd b/man/mva.plot.pair.Rd new file mode 100644 index 0000000..3a47edc --- /dev/null +++ b/man/mva.plot.pair.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2 (4.1.1): do not edit by hand +% Please edit documentation in R/mva_plotting.R +\name{mva.plot.pair} +\alias{mva.plot.pair} +\title{mva.plot.pair} +\usage{ +mva.plot.pair(array, title = F, small.positive = 0.1) +} +\arguments{ +\item{array}{Array of data to plot} + +\item{title}{Title of plot (Default: FALSE; no title)} + +\item{small.positive}{replace numbers less than 0 with this} +} +\description{ +Plots a pairwise M versus A plot with a loess fit line +} +\details{ +.. content for \details{} .. +} +\author{ +Don Armstrong \email{don@donarmstrong.com} +} +