From: paradis Date: Thu, 20 Oct 2011 08:13:38 +0000 (+0000) Subject: big update with files from Andrei X-Git-Url: https://git.donarmstrong.com/?p=ape.git;a=commitdiff_plain;h=ef34642fe85694df0a80ef534137b8f6b427b67e big update with files from Andrei git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@171 6e262413-ae40-0410-9e79-b911bd7a66b7 --- diff --git a/DESCRIPTION b/DESCRIPTION index 6b51928..5882a71 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: ape Version: 2.8 -Date: 2011-09-24 +Date: 2011-10-20 Title: Analyses of Phylogenetics and Evolution -Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Klaus Schliep, Korbinian Strimmer, Damien de Vienne +Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne Maintainer: Emmanuel Paradis Depends: R (>= 2.6.0) Suggests: gee @@ -17,6 +17,8 @@ Description: ape provides functions for reading, writing, plotting, skyline plots, estimation of absolute evolutionary rates and clock-like trees using mean path lengths, non-parametric rate smoothing and penalized likelihood. Phylogeny estimation can be - done with the NJ, BIONJ, and ME methods. + done with the NJ, BIONJ, ME, MVR, SDM, and triangle methods, and + several methods handling incomplete distance matrices (NJ*, BIONJ*, + MVR*, and the corresponding triangle method). License: GPL (>= 2) URL: http://ape.mpl.ird.fr/ diff --git a/NEWS b/NEWS index 776fd4a..17f28e2 100644 --- a/NEWS +++ b/NEWS @@ -3,14 +3,24 @@ NEW FEATURES + o Twelve new functions have been written by Andrei-Alin Popescu: + additive, ultrametric, is.compatible, arecompatible, mvr, mvrs, + njs, bionjs, SDM, treePop, triangMtd, triangMtd*. + + o A new class "bitsplits" has been created by Andrei-Alin Popescu + to code splits (aka, bipartition). + + o There is a new generic function as.bitsplits with a method to + convert from the class "prop.part" to the class "bitsplits". + + o The new function ltt.coplot plots on the same scales a tree and + the derived LTT plot. + o ltt.plot() has two new options: backward and tol. It can now handle non-ultrametic trees and its internal coding has been improved. The coordinates of the plot can now be computed with the new function ltt.plot.coords. - o The new function ltt.coplot plots on the same scales a tree and - its LTT plot. - CHANGES IN APE VERSION 2.7-3 diff --git a/R/SDM.R b/R/SDM.R new file mode 100644 index 0000000..94a8f31 --- /dev/null +++ b/R/SDM.R @@ -0,0 +1,183 @@ +## SDM.R (2011-10-11) + +## Construction of Consensus Distance Matrix With SDM + +## Copyright 2011 Andrei-Alin Popescu + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +SDM <- function(...) +{ + st <- list(...) # first half contains matrices, second half s_p + k <- length(st)/2 + labels <- attr(as.dist(st[[1]]), "Labels") + tot <- length(rownames(st[[1]])) + for (i in 2:k) { + labels <- union(labels, attr(as.dist(st[[i]]), "Labels")) + tot <- tot + length(rownames(st[[i]])) + } + sp <- mat.or.vec(1,k) + for (i in c(k+1:k)) + sp[i - k] <- st[[i]] + + astart <- mat.or.vec(1, tot) # start of aip, astart[p] is start of aip + astart[1] <- k + for (i in 2:k) + astart[i] <- astart[i - 1] + length(rownames(st[[i - 1]])) + + a <- mat.or.vec(1, k + tot + k + length(labels)) + ## first k are alphas, subseqeunt ones aip + ## each matrix p starting at astart[p], next are + ## Lagrange multipliers, miu, niu, lambda in that order + n <- length(labels) + miustart <- k + tot + niustart <- miustart + n + lambstart <- niustart + k - 1 + + X <- mat.or.vec(n, n) + V <- mat.or.vec(n, n) + w <- mat.or.vec(n, n) + + col <- mat.or.vec(k + tot + k + length(labels), 1) # free terms of system + + dimnames(X) <- list(labels, labels) + dimnames(V) <- list(labels, labels) + dimnames(W) <- list(labels, labels) + + for (i in 1:(n - 1)) { + for (j in (i + 1):n) { + for (p in 1:k) { + d <- st[[p]] + if (is.element(rownames(X)[i], rownames(d)) & is.element(colnames(X)[j], colnames(d))) { + w[i, j] <- w[j, i] <- w[i, j] + sp[p] + } + } + } + } + + Q <- mat.or.vec(length(labels) + k + k + tot, length(labels) + k + k + tot) + ## first decompose first sum in paper + for (p in 1:k) { + d_p <- st[[p]] + for (l in 1:k) { # first compute coeficients of alphas + sum <- 0 + dijp <- -1 + if (l == p) { # calculate alpha_p + for (i in 1:n) { + for (j in 1:n) { #check if {i,j}\subset L_l + d <- st[[l]] + if (i != j & is.element(labels[i],rownames(d)) & is.element(labels[j],colnames(d))) { + dij <- d[rownames(d) == labels[i], colnames(d) == labels[j]] + sum <- sum + ((dij*dij) - sp[l]*dij*dij/w[i,j]) + ipos <- which(rownames(d) == labels[i]) + jpos <- which(rownames(d) == labels[j]) + Q[p, astart[l] + ipos] <- Q[p, astart[l] + ipos] + (dij - (sp[l]*dij/w[i,j])) + Q[p, astart[l] + jpos] <- Q[p, astart[l] + jpos] + (dij - (sp[l]*dij/w[i,j])) + } + } + } + } else { + for (i in 1:n) { + for (j in 1:n) { #check if {i,j}\subset L_l + d <- st[[l]] + if (i != j & is.element(labels[i], rownames(d)) & is.element(labels[j], colnames(d)) & is.element(labels[i], rownames(d_p)) & is.element(labels[j], colnames(d_p))) { + dij <- d[rownames(d) == labels[i], colnames(d) == labels[j]] + dijp <- d_p[rownames(d_p) == labels[i], colnames(d_p) == labels[j]] + sum <- sum - sp[l]*dij*dijp/w[i, j] + ipos <- which(rownames(d) == labels[i]) + jpos <- which(rownames(d) == labels[j]) + Q[p,astart[l] + ipos] <- Q[p, astart[l] + ipos] - sp[l]*dijp/w[i, j] + Q[p,astart[l] + jpos] <- Q[p, astart[l] + jpos] - sp[l]*dijp/w[i, j] + } + } + } + } + Q[p, l] <- sum + } + Q[p, lambstart + 1] <- 1 + } + r <- k + for (p in 1:k) { + dp <- st[[p]] + for (i in 1:n) { + if (is.element(labels[i], rownames(dp))) { + r <- r + 1 + for (l in 1:k) { + d <- st[[l]] + if (l == p) { + for (j in 1:n) { + if (i != j & is.element(labels[j], rownames(dp))) { + dij <- d[rownames(d) == labels[i], colnames(d) == labels[j]] + Q[r, l] <- Q[r, l] + (dij - sp[l]*dij/w[i, j]) + ipos <- which(rownames(d) == labels[i]) + jpos <- which(rownames(d) == labels[j]) + Q[r, astart[l] + ipos] <- Q[r, astart[l] + ipos] + (1 - sp[l]/w[i, j]) + Q[r, astart[l] + jpos] <- Q[r, astart[l] + jpos] + (1 - sp[l]/w[i, j]) + } + } + } else { + for (j in 1:n) { + if (i != j & is.element(labels[j], rownames(dp)) & is.element(labels[i], rownames(d)) & is.element(labels[j], colnames(d))) { + dij <- d[rownames(d) == labels[i], colnames(d) == labels[j]] + Q[r,l] <- Q[r,l] - sp[l]*dij/w[i, j] + ipos <- which(rownames(d) == labels[i]) + jpos <- which(rownames(d) == labels[j]) + Q[r, astart[l] + ipos] <- Q[r, astart[l] + ipos] - sp[l]/w[i, j] + Q[r, astart[l] + jpos] <- Q[r, astart[l] + jpos] - sp[l]/w[i, j] + } + } + } + } + if (p < k) Q[r, ] <- Q[r, ] * sp[p] + Q[r, miustart + i] <- 1 + if (p < k) Q[r, niustart + p] <- 1 + } + } + } + r <- r + 1 + col[r] <- k + for (i in 1:k) Q[r,i] <- 1 + + for (i in 1:n) { + r <- r + 1 + for (p in 1:k) { + d <- st[[p]] + if (is.element(labels[i], rownames(d))) { + ipos <- which(rownames(d) == labels[i]) + Q[r, astart[p] + ipos] <- 1 + } + } + } + for (p in 1:(k - 1)) { + r <- r + 1 + for (i in 1:n) { + d <- st[[p]] + if (is.element(labels[i], rownames(d))) { + ipos <- which(rownames(d) == labels[i]) + Q[r, astart[p] + ipos] <- 1 + } + } + } + a <- solve(Q, col, 1e-19) + for(i in 1:n) { + for(j in 1:n) { + sum <- 0 + sumv <- 0 + for(p in 1:k) { + d <- st[[p]] + if (is.element(labels[i], rownames(d)) & is.element(labels[j], rownames(d))) { + ipos <- which(rownames(d) == labels[i]) + jpos <- which(rownames(d) == labels[j]) + sum <- sum + sp[p]*(a[p]*d[ipos, jpos] + a[astart[p] + ipos] + a[astart[p] + jpos]) + sumv <- sumv + sp[p]*(a[p]*d[ipos, jpos])*(a[p]*d[ipos, jpos]) + } + } + X[i, j] <- sum/w[i, j] + V[i, j] <- sumv/(w[i, j] * w[i, j]) + if (i == j) + X[i, j] <- V[i, j] <- 0 + } + } + list(X, V) +} diff --git a/R/additive.R b/R/additive.R new file mode 100644 index 0000000..efd72f9 --- /dev/null +++ b/R/additive.R @@ -0,0 +1,27 @@ +## additive.R (2011-10-11) + +## Incomplete Distance Matrix Filling + +## Copyright 2011 Andrei-Alin Popescu + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +.addit_ultra <- function(libs, X) +{ + if (is.matrix(X)) X <- as.dist(X) + X[is.na(X)] <- -1 + X[X < 0] <- -1 + X[is.nan(X)] <- -1 + N <- attr(X, "Size") + labels <- attr(X, "Labels") + if (is.null(labels)) labels <- as.character(1:N) + m <- sum(X == -1) + ans <- .C(libs, as.double(X), as.integer(N), + as.integer(m), double(N*N), PACKAGE = "ape") + matrix(ans[[4]], N, N) +} + +additive <- function(X) .addit_ultra("additive", X) + +ultrametric <- function(X) .addit_ultra("ultrametric", X) diff --git a/R/as.bitsplits.R b/R/as.bitsplits.R new file mode 100644 index 0000000..ace9af4 --- /dev/null +++ b/R/as.bitsplits.R @@ -0,0 +1,58 @@ +## as.bitsplits.R (2011-10-19) + +## Conversion Among Split Classes + +## Copyright 2011 Emmanuel Paradis + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +as.bitsplits <- function(x) UseMethod("as.bitsplits") + +as.bitsplits.prop.part <- function(x) +{ + foo <- function(vect, RAWVECT) { + res <- RAWVECT + for (y in vect) { + i <- ceiling(y/8) + res[i] <- res[i] | as.raw(2^(8 - ((y - 1) %% 8) - 1)) + } + res + } + + N <- length(x) # number of splits + n <- length(x[[1]]) # number of tips + + nr <- ceiling(n/8) + mat <- raw(N * nr) + dim(mat) <- c(nr, N) + + RAWVECT <- raw(nr) + + for (i in 1:N) mat[, i] <- foo(x[[i]], RAWVECT) + + ## add the n trivial splits of size 1... : + mat.bis <- raw(n * nr) + dim(mat.bis) <- c(nr, n) + for (i in 1:n) mat.bis[, i] <- foo(i, RAWVECT) + + ## ... drop the trivial split of size n... : + mat <- cbind(mat.bis, mat[, -1, drop = FALSE]) + + ## ... update the split frequencies... : + freq <- attr(x, "number") + freq <- c(rep(freq[1L], n), freq[-1L]) + + ## ... and numbers: + N <- N + n - 1L + + structure(list(matsplit = mat, labels = attr(x, "labels"), + freq = freq), class = "bitsplits") +} + +print.bitsplits <- function(x, ...) +{ + cat('Object of class "bitsplits"\n') + cat(' ', length(x$labels), 'tips\n') + cat(' ', length(x$freq), 'partitions\n\n') +} diff --git a/R/is.compatible.R b/R/is.compatible.R new file mode 100644 index 0000000..522e4f7 --- /dev/null +++ b/R/is.compatible.R @@ -0,0 +1,36 @@ +## is.compatible.R (2011-10-11) + +## Check Compatibility of Splits + +## Copyright 2011 Andrei-Alin Popescu + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +is.compatible <- function(obj) UseMethod("is.compatible") + +is.compatible.bitsplits <- function(obj) +{ + m <- obj$matsplit + n <- ncol(m) + ntaxa <- length(obj$labels) + for (i in 1:(n - 1)) + for (j in (i + 1):n) + if (!arecompatible(m[, i], m[, j], ntaxa)) + return(FALSE) + TRUE +} + +arecompatible <-function(x, y, n) +{ + msk <- !as.raw(2^(8 - (n %% 8)) - 1) + + foo <- function(v) { + lv <- length(v) + v[lv] <- v[lv] & msk + as.integer(all(v == as.raw(0))) + } + + nE <- foo(x & y) + foo(x & !y) + foo(!x & y) + foo(!x & !y) + if (nE == 1) TRUE else FALSE +} diff --git a/R/mvr.R b/R/mvr.R new file mode 100644 index 0000000..a2218fc --- /dev/null +++ b/R/mvr.R @@ -0,0 +1,49 @@ +## mvr.R (2011-10-11) + +## Minimum Variance Reduction + +## Copyright 2011 Andrei-Alin Popescu + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +mvr <- function(X, V) +{ + if (is.matrix(X)) X <- as.dist(X) + if (is.matrix(V)) V <- as.dist(V) + if (any(is.na(X))) + stop("missing values are not allowed in the distance matrix") + if (any(is.na(V))) + stop("missing values are not allowed in the variance matrix") + N <- attr(X, "Size") + labels <- attr(X, "Labels") + if (is.null(labels)) labels <- as.character(1:N) + ans <- .C("mvr", as.double(X), as.double(V), as.integer(N), + integer(2*N - 3), integer(2*N - 3), double(2*N - 3), + DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") + obj <- list(edge = cbind(ans[[4]], ans[[5]]), edge.length = ans[[6]], + tip.label = labels, Nnode = N - 2L) + class(obj) <- "phylo" + reorder(obj) +} + +mvrs <- function(X, V, fs = 15) +{ + if (is.matrix(X)) X <- as.dist(X) + if (is.matrix(V)) V <- as.dist(V) + + X[is.na(X)] <- -1 + X[X < 0] <- -1 + X[is.nan(X)] <- -1 + + N <- attr(X, "Size") + labels <- attr(X, "Labels") + if (is.null(labels)) labels <- as.character(1:N) + ans <- .C("mvrs", as.double(X), as.double(V), as.integer(N), + integer(2*N - 3), integer(2*N - 3), double(2*N - 3), + as.integer(fs), DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") + obj <- list(edge = cbind(ans[[4]], ans[[5]]), edge.length = ans[[6]], + tip.label = labels, Nnode = N - 2L) + class(obj) <- "phylo" + reorder(obj) +} diff --git a/R/njs.R b/R/njs.R new file mode 100644 index 0000000..1f7ee8a --- /dev/null +++ b/R/njs.R @@ -0,0 +1,31 @@ +## njs.R (2011-10-11) + +## Tree Reconstruction from Incomplete Distances With NJ* or bio-NJ* + +## Copyright 2011 Andrei-Alin Popescu + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +.NJS_BIONJS <- function(libs, X, fs = 15) +{ + if (is.matrix(X)) X <- as.dist(X) + X[is.na(X)] <- -1 + X[X < 0] <- -1 + X[is.nan(X)] <- -1 + N <- attr(X, "Size") + labels <- attr(X, "Labels") + if (is.null(labels)) labels <- as.character(1:N) + ans <- .C(libs, as.double(X), as.integer(N), integer(2*N - 3), + integer(2*N - 3), double(2*N - 3), as.integer(fs), + DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") + obj <- list(edge = cbind(ans[[3]], ans[[4]]), edge.length = ans[[5]], + tip.label = labels, Nnode = N - 2L) + class(obj) <- "phylo" + reorder(obj) +} + + +njs <- function(X, fs = 15) .NJS_BIONJS("njs", X = X, fs = fs) + +bionjs <- function(X, fs = 15) .NJS_BIONJS("bionjs", X = X, fs = fs) diff --git a/R/treePop.R b/R/treePop.R new file mode 100644 index 0000000..673af88 --- /dev/null +++ b/R/treePop.R @@ -0,0 +1,25 @@ +## treePop.R (2011-10-11) + +## Tree Popping + +## Copyright 2011 Andrei-Alin Popescu + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +treePop <- function(obj) +{ + mf <- obj$matsplit + labels <- obj$labels + n <- length(labels) + imf <- as.integer(mf) + freq <- obj$freq + mimf <- matrix(imf, nrow(mf), ncol(mf)) + ans <- .C("treePop", mimf, as.double(freq), as.integer(ncol(mf)), + as.integer(n), integer(2*n - 3), integer(2*n - 3), + double(2*n - 3), DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") + obj <- list(edge = cbind(ans[[5]], ans[[6]]), edge.length = ans[[7]], + tip.label = labels, Nnode = n - 2L) + class(obj) <- "phylo" + reorder(obj) +} diff --git a/R/triangMtd.R b/R/triangMtd.R new file mode 100644 index 0000000..3f2d891 --- /dev/null +++ b/R/triangMtd.R @@ -0,0 +1,42 @@ +## treePop.R (2011-10-11) + +## Tree Reconstruction With the Triangles Method + +## Copyright 2011 Andrei-Alin Popescu + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +triangMtd <- function(X) +{ + if (is.matrix(X)) X <- as.dist(X) + if (any(is.na(X))) + stop("missing values are not allowed in the distance matrix") + N <- attr(X, "Size") + labels <- attr(X, "Labels") + if (is.null(labels)) labels <- as.character(1:N) + ans <- .C("triangMtd", as.double(X), as.integer(N), integer(2*N - 3), + integer(2*N - 3), double(2*N - 3), + DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") + obj <- list(edge = cbind(ans[[3]], ans[[4]]), edge.length = ans[[5]], + tip.label = labels, Nnode = N - 2L) + class(obj) <- "phylo" + reorder(obj) +} + +triangMtds <- function(X) +{ + if (is.matrix(X)) X <- as.dist(X) + X[is.na(X)] <- -1 + X[X < 0] <- -1 + N <- attr(X, "Size") + labels <- attr(X, "Labels") + if (is.null(labels)) labels <- as.character(1:N) + ans <- .C("triangMtds", as.double(X), as.integer(N), integer(2*N - 3), + integer(2*N - 3), double(2*N - 3), + DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") + obj <- list(edge = cbind(ans[[3]], ans[[4]]), edge.length = ans[[5]], + tip.label = labels, Nnode = N - 2L) + class(obj) <- "phylo" + reorder(obj) +} diff --git a/Thanks b/Thanks index b979f72..0d9b473 100644 --- a/Thanks +++ b/Thanks @@ -19,3 +19,6 @@ the French "Programme inter-EPST Bioinformatique" (2001-2003). Financial support was provided by the Department of Information Systems of IRD in form a "SPIRALES" project (2006). + +Financial support was provided by Google to Andrei-Alin Popescu during +the 2011 Google Summer of Code (GSoC 2011). \ No newline at end of file diff --git a/man/SDM.Rd b/man/SDM.Rd new file mode 100644 index 0000000..bceedf4 --- /dev/null +++ b/man/SDM.Rd @@ -0,0 +1,34 @@ +\name{SDM} +\alias{SDM} +\title{Construction of Consensus Distance Matrix With SDM} +\description{ + This function implements the SDM method of Criscuolo et al. (2006) for + a set of n distance matrices. +} +\usage{ +SDM(...) +} +\arguments{ + \item{\dots}{2n elements (with n > 1), the first n elements are the + distance matrices, the next n elements are the sequence length from + which the matrices have been estimated (can be seen as a degree of + confidence in matrices).} +} +\details{ + Reconstructs a consensus distance matrix from a set of input distance + matrices on overlapping sets of taxa. Potentially missing values in + the supermatrix are represented by \code{NA}. An error is returned if + the input distance matrices can not resolve to a consensus matrix. +} +\value{ + a 2-element list containing a distance matrix labelled by the union of + the set of taxa of the input distance matrices, and a variance matrix + associated to the returned distance matrix. +} +\references{ + Criscuolo, A., Berry, V., Douzery, E. J. P. , and Gascuel, O. (2006) + SDM: A fast distance-based approach for (super)tree building in + phylogenomics. \emph{Systematic Biology}, \bold{55}, 740--755. +} +\author{Andrei Popescu \email{niteloserpopescu@gmail.com}} +\keyword{models} diff --git a/man/additive.Rd b/man/additive.Rd new file mode 100644 index 0000000..df8c649 --- /dev/null +++ b/man/additive.Rd @@ -0,0 +1,26 @@ +\name{additive} +\alias{additive} +\alias{ultrametric} +\title{Incomplete Distance Matrix Filling} +\description{ + Fills missing entries from incomplete distance matrix using the + additive or the ultrametric procedure (see reference for details). +} +\usage{ +additive(X) +ultrametric(X) +} +\arguments{ + \item{X}{a distance matrix or an object of class \code{"dist"}.} +} +\value{ + a distance matrix. +} +\references{ + Makarenkov, V. and Lapointe, F.-J. (2004) A weighted least-squares + approach for inferring phylogenies from incomplete distance + matrices. \emph{Bioinformatics}, \bold{20}, 2113--2121. + \url{http://dx.doi.org/10.1093/bioinformatics/bth211} +} +\author{Andrei Popescu \email{niteloserpopescu@gmail.com}} +\keyword{manip} diff --git a/man/all.equal.phylo.Rd b/man/all.equal.phylo.Rd index 41351da..9242c9f 100644 --- a/man/all.equal.phylo.Rd +++ b/man/all.equal.phylo.Rd @@ -4,9 +4,9 @@ \title{Global Comparison of two Phylogenies} \usage{ \method{all.equal}{phylo}(target, current, use.edge.length = TRUE, - use.tip.label = TRUE, index.return = FALSE, - tolerance = .Machine$double.eps ^ 0.5, - scale = NULL, \dots) + use.tip.label = TRUE, index.return = FALSE, + tolerance = .Machine$double.eps ^ 0.5, + scale = NULL, \dots) } \arguments{ \item{target}{an object of class \code{"phylo"}.} diff --git a/man/as.bitsplits.Rd b/man/as.bitsplits.Rd new file mode 100644 index 0000000..fed91dd --- /dev/null +++ b/man/as.bitsplits.Rd @@ -0,0 +1,28 @@ +\name{as.bitsplits} +\alias{as.bitsplits} +\alias{as.bitsplits.prop.part} +\alias{print.bitsplits} +\title{Conversion Among Split Classes} +\description{ + \code{as.bitsplits} is a generic function with a method for objects of + class \code{"prop.part"}. +} +\usage{ +as.bitsplits(x) +\method{as.bitsplits}{prop.part}(x) +\method{print}{bitsplits}(x, ...) +} +\arguments{ + \item{x}{an object of the appropriate class.} + \item{\dots}{further arguments passed to or from other methods.} +} +\value{ + an object of class \code{"bitsplits"} or NULL for \code{print}. +} +\author{Emmanuel Paradis} +\examples{ +tr <- rtree(20) +pp <- prop.part(tr) +as.bitsplits(pp) +} +\keyword{manip} diff --git a/man/bd.time.Rd b/man/bd.time.Rd index 6e0975e..11ab1d2 100644 --- a/man/bd.time.Rd +++ b/man/bd.time.Rd @@ -35,7 +35,7 @@ bd.time(phy, birth, death, BIRTH = NULL, DEATH = NULL, primitives can be found in the help page of \code{\link{yule.time}}. The model is fitted by minimizing the least squares deviation between - the observed and predicted distributions of branching times. These + the observed and the predicted distributions of branching times. These computations rely heavily on numerical integrations. If \code{fast = FALSE}, integrations are done with R's \code{\link[stats]{integrate}} function. If \code{fast = TRUE}, a faster but less accurate function diff --git a/man/is.compatible.Rd b/man/is.compatible.Rd new file mode 100644 index 0000000..e8f2af7 --- /dev/null +++ b/man/is.compatible.Rd @@ -0,0 +1,25 @@ +\name{is.compatible} +\alias{is.compatible} +\alias{is.compatible.bitsplits} +\alias{arecompatible} +\title{Check Compatibility of Splits} +\description{ + \code{is.compatible} is a generic function with a method for the class + \code{"bitsplits"}. It checks whether a set of splits is compatible + using the \code{arecompatible} function. +} +\usage{ +is.compatible(obj) +\method{is.compatible}{bitsplits}(obj) +arecompatible(x, y, n) +} +\arguments{ + \item{obj}{an object of class \code{"bitsplits"}.} + \item{x, y}{a vector of mode raw\code{}.} + \item{n}{the number of taxa in the splits.} +} +\value{ + \code{TRUE} if the splits are compatible, \code{FALSE} otherwise. +} +\author{Andrei Popescu \email{niteloserpopescu@gmail.com}} +\keyword{manip} diff --git a/man/mvr.Rd b/man/mvr.Rd new file mode 100644 index 0000000..68f0bc9 --- /dev/null +++ b/man/mvr.Rd @@ -0,0 +1,33 @@ +\name{mvr} +\alias{mvr} +\alias{mvrs} +\title{Minimum Variance Reduction} +\description{ + Phylogenetic tree construction based on the minimum variance reduction. +} +\usage{ +mvr(X, V) +mvrs(X, V, fs = 15) +} +\arguments{ + \item{X}{a distance matrix.} + \item{V}{a variance matrix.} + \item{fs}{agglomeration criterion parameter.} +} +\details{ + The MVR method can be seen as a version of BIONJ which is not + restricted to the Poison model of variance (Gascuel 2000). +} +\value{ + an object of class \code{"phylo"}. +} +\references{ + Criscuolo, A. and Gascuel, O. (2008). Fast NJ-like algorithms to deal + with incomplete distance matrices. \emph{BMC Bioinformatics}, 9. + + Gascuel, O. (2000). Data model and classification by trees: the + minimum variance reduction (MVR) method. \emph{Journal of + Classification}, \bold{17}, 67--99. +} +\author{Andrei Popescu \email{niteloserpopescu@gmail.com}} +\keyword{models} diff --git a/man/njs.Rd b/man/njs.Rd new file mode 100644 index 0000000..90a3444 --- /dev/null +++ b/man/njs.Rd @@ -0,0 +1,32 @@ +\name{njs} +\alias{njs} +\alias{bionjs} +\title{Tree Reconstruction from Incomplete Distances With NJ* or bio-NJ*} +\description{ + Reconstructs a phylogenetic tree from a distance matrix with possibly + missing values. +} +\usage{ +njs(X, fs = 15) +bionjs(X, fs = 15) +} +\arguments{ + \item{X}{a distance matrix.} + \item{fs}{argument \emph{s} of the agglomerative criterion.} +} +\details{ + Missing values represented by either \code{NA} or any negative number. + + Basically, the Q* criterion is applied to all the pairs of leaves, and + the \emph{s} highest scoring ones are chosen for further analysis by + the agglomeration criteria that better handle handle missing + distances (see references for details). +} +\value{ + an object of class \code{"phylo"}. +} +\references{ + \url{http://www.biomedcentral.com/1471-2105/9/166} +} +\author{Andrei Popescu \email{niteloserpopescu@gmail.com}} +\keyword{models} diff --git a/man/treePop.Rd b/man/treePop.Rd new file mode 100644 index 0000000..1262414 --- /dev/null +++ b/man/treePop.Rd @@ -0,0 +1,19 @@ +\name{treePop} +\alias{treePop} +\title{Tree Popping} +\description{ + Method for reconstructing phylogenetic trees from an object of class + splits using tree popping. +} +\usage{ +treePop(obj) +} +\arguments{ + \item{obj}{an object of class \code{"bitsplit"}.} +} +\value{ + an object of class "phylo" which displays all the splits + in the input object. +} +\author{Andrei Popescu \email{niteloserpopescu@gmail.com}} +\keyword{models} diff --git a/man/triangMtd.Rd b/man/triangMtd.Rd new file mode 100644 index 0000000..d90ff1e --- /dev/null +++ b/man/triangMtd.Rd @@ -0,0 +1,23 @@ +\name{triangMtd} +\alias{triangMtd} +\alias{triangMtds} +\title{Tree Reconstruction Based on the Triangles Method} +\usage{ +triangMtd(X) +triangMtds(X) +} +\arguments{ + \item{X}{a distance matrix}. +} +\description{ + Fast distance-based costruction method. Should only be used when + distance measures are fairly reliable. +} +\value{ + an object of class \code{"phylo"}. +} +\references{ + \url{http://archive.numdam.org/ARCHIVE/RO/RO_2001__35_2/RO_2001__35_2_283_0/RO_2001__35_2_283_0.pdf} +} +\author{Andrei Popescu \email{niteloserpopescu@gmail.com}} +\keyword{models} diff --git a/src/additive.c b/src/additive.c new file mode 100644 index 0000000..d2fcfaf --- /dev/null +++ b/src/additive.c @@ -0,0 +1,66 @@ +/* additive.c 2011-10-11 */ + +/* Copyright 2011 Andrei-Alin Popescu */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include "ape.h" + +void additive(double *dd, int* np,int* mp,double *ret)//d received as dist object, -1 for missing entries +{ + int n=*np; + int m=*mp; + int i=0,j=0; + double max=dd[0]; + double d[n][n]; + for(i=1;imax) + { + max=dd[give_index(i,j,n)]; + } + } + } + d[n-1][n-1]=0; + + int entrCh=0; + do{ + entrCh=0; + for(i=0;i(d[i][l]+d[j][k]))?(d[i][k]+d[j][l]):(d[i][l]+d[j][k])); + mx-=d[k][l]; + if(mx j) return(DINDEX(j, i)); + else return(DINDEX(i, j)); +} diff --git a/src/ape.h b/src/ape.h new file mode 100644 index 0000000..a0d0720 --- /dev/null +++ b/src/ape.h @@ -0,0 +1,26 @@ +/* ape.h 2011-10-11 */ + +/* Copyright 2011 Emmanuel Paradis */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include + +#define DINDEX(i, j) n*(i - 1) - i*(i - 1)/2 + j - i - 1 + +/* in ape.c */ +int give_index(int i, int j, int n); + +/* in njs.c */ +void choosePair(double* D, int n, double* R, int* s, int* sw, int* x, int* y, int fS); +double cnxy(int x, int y, int n, double* D); +int mxy(int x,int y, int n, double* D); +double nxy(int x, int y, int n, double* D); +int cxy(int x, int y, int n, double* D); + +/* in triangMtd.c */ +void triangMtd(double* d, int* np, int* ed1, int* ed2, double* edLen); +int * getPathBetween(int x, int y, int n, int* ed1, int* ed2, int numEdges); +int give_indexx(int i, int j, int n); /* a variant of the above */ + diff --git a/src/bionjs.c b/src/bionjs.c new file mode 100644 index 0000000..b3f599b --- /dev/null +++ b/src/bionjs.c @@ -0,0 +1,394 @@ +/* bionjs.c 2011-10-11 */ + +/* Copyright 2011 Andrei-Alin Popescu */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include "ape.h" + +void bionjs(double *D, int *N, int *edge1, int *edge2, double *edge_length,int* fsS) +{ //assume missing values are denoted by -1 + double *S,*R , *v,*new_v, Sdist, Ndist, *new_dist, A, B, smallest_S, x, y; + int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label; + /*for(i=0;i 3) { + + ij = 0; + for(i=1;i smallest_S) { + OTU1 = i; + OTU2 = j; + smallest_S = A; + smallest = ij; + } + ij++; + } + } + } + + /*Rprintf("agglomerating %i and %i, Q=%f \n",OTU1,OTU2,smallest_S); + + for(i=1;i 1; i--) + otu_label[i] = otu_label[i - 1]; + if (OTU2 != n) + for (i = OTU2; i < n; i++) + otu_label[i] = otu_label[i + 1]; + otu_label[1] = cur_nod; + + + + n--; + for (i = 0; i < n*(n - 1)/2; i++) + { + D[i] = new_dist[i]; + v[i] = new_v[i]; + if(sw==1) + { + R[i] = newR[i]; + s[i] = newS[i]; + } + } + cur_nod--; + k = k + 2; + } + int dK=0;//number of known distances in final distance matrix + int iUK=-1;//index of unkown distance, if we have one missing distance + int iK=-1;//index of only known distance, only needed if dK==1 + for (i = 0; i < 3; i++) { + edge1[*N*2 - 4 - i] = cur_nod; + edge2[*N*2 - 4 - i] = otu_label[i + 1]; + if(D[i]!=-1){dK++;iK=i;}else{iUK=i;} + } + if(dK==2) + {//if two distances are known: assume our leaves are x,y,z, d(x,z) unknown + //and edge weights of three edges are a,b,c, then any b,c>0 that + //satisfy c-b=d(y,z)-d(x,y) a+c=d(y,z) are good edge weights, but for + //simplicity we assume a=c if d(yz)max)max=D[i]; + } + D[iUK]=max; + } + if(dK==1) + {//through similar motivation as above, if we have just one known distance + //we set the other two distances equal to it + for(i=0;i<3;i++) + {if(i==iK)continue; + D[i]=D[iK]; + } + } + if(dK==0) + {//no distances are known, we just set them to 1 + for(i=0;i<3;i++) + {D[i]=1; + } + } + edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2; + edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2; + edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2; +} + + diff --git a/src/mvr.c b/src/mvr.c new file mode 100644 index 0000000..5f9fc0c --- /dev/null +++ b/src/mvr.c @@ -0,0 +1,189 @@ +/* mvr.c 2011-10-11 */ + +/* Copyright 2011 Andrei-Alin Popescu */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include "ape.h" + +void mvr(double *D, double* v,int *N, int *edge1, int *edge2, double *edge_length) +{ + double *S, Sdist, *new_v, Ndist, *new_dist, A, B, smallest_S, x, y; + int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label; + + S = &Sdist; + new_dist = &Ndist; + otu_label = &o_l; + n = *N; + cur_nod = 2*n - 2; + + S = (double*)R_alloc(n + 1, sizeof(double)); + new_dist = (double*)R_alloc(n*(n - 1)/2, sizeof(double)); + new_v = (double*)R_alloc(n*(n - 1)/2, sizeof(double)); + otu_label = (int*)R_alloc(n + 1, sizeof(int)); + + for (i = 1; i <= n; i++) otu_label[i] = i; /* otu_label[0] is not used */ + + k = 0; + + while (n > 3) { + + /*for(i=1;i 1; i--) + otu_label[i] = otu_label[i - 1]; + if (OTU2 != n) + for (i = OTU2; i < n; i++) + otu_label[i] = otu_label[i + 1]; + otu_label[1] = cur_nod; + + for (i = 1; i < n; i++) { + if (i == OTU1 || i == OTU2) continue; + for (j = i + 1; j <= n; j++) { + if (j == OTU1 || j == OTU2) continue; + new_dist[ij] = D[DINDEX(i, j)]; + new_v[ij]=v[give_index(i,j,n)]; + ij++; + } + } + + n--; + for (i = 0; i < n*(n - 1)/2; i++) + {D[i] = new_dist[i]; + v[i] = new_v[i]; + } + cur_nod--; + k = k + 2; + } + + for (i = 0; i < 3; i++) { + edge1[*N*2 - 4 - i] = cur_nod; + edge2[*N*2 - 4 - i] = otu_label[i + 1]; + } + + edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2; + edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2; + edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2; +} + diff --git a/src/mvrs.c b/src/mvrs.c new file mode 100644 index 0000000..d2aa342 --- /dev/null +++ b/src/mvrs.c @@ -0,0 +1,397 @@ +/* mvrs.c 2011-10-11 */ + +/* Copyright 2011 Andrei-Alin Popescu */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include "ape.h" + +void mvrs(double *D, double* v,int *N, int *edge1, int *edge2, double *edge_length,int* fsS) +{ //assume missing values are denoted by -1 + + double *S,*R ,*new_v, Sdist, Ndist, *new_dist, A, B, smallest_S, x, y; + int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label; + /*for(i=0;i 3) { + + ij = 0; + for(i=1;i smallest_S) { + OTU1 = i; + OTU2 = j; + smallest_S = A; + smallest = ij; + } + ij++; + } + } + } + if(s[give_index(OTU1,OTU2,n)]<=2) + {error("distance information insufficient to construct a tree, leaves %i and %i isolated from tree",OTU1,OTU2); + } + //Rprintf("agglomerating %i and %i, Q=%f \n",OTU1,OTU2,smallest_S); + + /*for(i=1;i 1; i--) + otu_label[i] = otu_label[i - 1]; + if (OTU2 != n) + for (i = OTU2; i < n; i++) + otu_label[i] = otu_label[i + 1]; + otu_label[1] = cur_nod; + + + + n--; + for (i = 0; i < n*(n - 1)/2; i++) + { + D[i] = new_dist[i]; + v[i] = new_v[i]; + if(sw==1) + { + R[i] = newR[i]; + s[i] = newS[i]; + } + } + cur_nod--; + k = k + 2; + } + int dK=0;//number of known distances in final distance matrix + int iUK=-1;//index of unkown distance, if we have one missing distance + int iK=-1;//index of only known distance, only needed if dK==1 + for (i = 0; i < 3; i++) { + edge1[*N*2 - 4 - i] = cur_nod; + edge2[*N*2 - 4 - i] = otu_label[i + 1]; + if(D[i]!=-1){dK++;iK=i;}else{iUK=i;} + } + if(dK==2) + {//if two distances are known: assume our leaves are x,y,z, d(x,z) unknown + //and edge weights of three edges are a,b,c, then any b,c>0 that + //satisfy c-b=d(y,z)-d(x,y) a+c=d(y,z) are good edge weights, but for + //simplicity we assume a=c if d(yz)max)max=D[i]; + } + D[iUK]=max; + } + if(dK==1) + {//through similar motivation as above, if we have just one known distance + //we set the other two distances equal to it + for(i=0;i<3;i++) + {if(i==iK)continue; + D[i]=D[iK]; + } + } + if(dK==0) + {//no distances are known, we just set them to 1 + for(i=0;i<3;i++) + {D[i]=1; + } + } + edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2; + edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2; + edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2; +} + + + diff --git a/src/nj.c b/src/nj.c index 9d0b8f6..21b4c1e 100644 --- a/src/nj.c +++ b/src/nj.c @@ -1,21 +1,11 @@ -/* nj.c 2010-04-21 */ +/* nj.c 2011-10-20 */ -/* Copyright 2006-2010 Emmanuel Paradis +/* Copyright 2006-2011 Emmanuel Paradis /* This file is part of the R-package `ape'. */ /* See the file ../COPYING for licensing issues. */ -#include - -#define DINDEX(i, j) n*(i - 1) - i*(i - 1)/2 + j - i - 1 -/* works if i < j strictly, and i = 1, ...; - see give_index() below */ - -int give_index(int i, int j, int n) -{ - if (i > j) return(DINDEX(j, i)); - else return(DINDEX(i, j)); -} +#include "ape.h" double sum_dist_to_i(int n, double *D, int i) /* returns the sum of all distances D_ij between i and j diff --git a/src/njs.c b/src/njs.c new file mode 100644 index 0000000..2ccfdef --- /dev/null +++ b/src/njs.c @@ -0,0 +1,667 @@ +/* njs.c 2011-10-11 */ + +/* Copyright 2011 Andrei-Alin Popescu */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include "ape.h" + +int H(double t) +{ + if (t >= 0) return 1; + return 0; +} + +void choosePair(double* D,int n,double* R,int* s, int* sw, int* x, int* y,int fS) +{ + int i=0,j=0,k=0; + int sww=0; + double cFS[fS]; + int iFS[fS]; + int jFS[fS]; + for(k=0;knumb;k++); + //Rprintf("k=%i ",k); + for(tr=fS-1;tr>k;tr--) + {cFS[tr]=cFS[tr-1]; + iFS[tr]=iFS[tr-1]; + jFS[tr]=jFS[tr-1]; + } + + if(kmax){max=nb;} + cFS[i]=nb; + } + /*Rprintf("all N*(x,y)\n"); + for(k=0;kmax){max=nb;} + cFS[i]=nb; + } + /*Rprintf("all c*xy values\n"); + for(k=0;kmax){max=nb;} + cFS[i]=nb; + } + /*Rprintf("all m*xy values\n"); + for(k=0;kmax){max=nb;iPos=i;} + cFS[i]=nb; + } + /*Rprintf("cN*xy\n"); + Rprintf("value[%i]=%f ",0,cFS[0]); + Rprintf("i=%i ",iFS[0]); + Rprintf("j=%i ",jFS[0]); + Rprintf("\n");*/ + *x=iFS[iPos];*y=jFS[iPos]; +} + +double cnxy(int x, int y, int n,double* D) +{ + int i=0; + int j=0; + double nMeanXY=0; + //Rprintf("cN[%i,%i]\n",x,y); + for(i=1;i<=n;i++) + {if(i==x || i==y)continue; + for(j=1;j<=n;j++) + {if(i==j)continue; + if(j==y || j==x)continue; + double n1=0; + double n2=0; + n1=D[give_index(i,x,n)]; + n2=D[give_index(j,y,n)]; + if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue; + nMeanXY+=(n1+n2-D[give_index(x,y,n)]-D[give_index(i,j,n)]); + //Rprintf("cnMeanXY after (%i,%i)=%f\n",i,j,nMeanXY); + } + } + return nMeanXY; +} + +int mxy(int x,int y,int n,double* D) +{ + int i=0; + int mx[n+1]; + int my[n+1]; + for(i=1;i<=n;i++) + { + mx[i]=0;my[i]=0; + } + for(i=1;i<=n;i++) + { + if(i!=x && D[give_index(x,i,n)]==-1) + { + mx[i]=1; + } + if(i!=y && D[give_index(y,i,n)]==-1) + { + my[i]=1; + } + } + /*for(i=1;i<=n;i++) + { + Rprintf("mx[%i]=%i ",i,mx[i]); + } + Rprintf("\n"); + + for(i=1;i<=n;i++) + { + Rprintf("my[%i]=%i ",i,my[i]); + } + Rprintf("\n");*/ + + int xmy=0; + int ymx=0; + for(i=1;i<=n;i++) + { + if(mx[i]==1 && my[i]==0) + { + xmy++; + } + if(my[i]==1 && mx[i]==0) + { + ymx++; + } + } + //Rprintf("xmy=%i, ymx=%i, xmy+ymx=%i\n",xmy,ymx,xmy+ymx); + return xmy+ymx; +} +double nxy(int x, int y, int n,double* D) +{ + int sCXY=0; + int i=0; + int j=0; + double nMeanXY=0; + //Rprintf("N[%i,%i]\n",x,y); + for(i=1;i<=n;i++) + {if(i==x || i==y)continue; + for(j=1;j<=n;j++) + {if(i==j)continue; + if(j==x || j==y)continue; + double n1=0; + double n2=0; + n1=D[give_index(i,x,n)]; + n2=D[give_index(j,y,n)]; + if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue; + sCXY++; + //Rprintf("considered pair (%i,%i)\n",i,j); + nMeanXY+=H(n1+n2-D[give_index(x,y,n)]-D[give_index(i,j,n)]); + //Rprintf("nMeanXY after (%i,%i)=%f\n",i,j,nMeanXY); + } + } + //Rprintf("sCXY=%i",sCXY); + return nMeanXY/sCXY; +} + +int cxy(int x, int y, int n,double* D) +{ + int sCXY=0; + int i=0; + int j=0; + + for(i=1;i<=n;i++) + {if(i==x || i==y)continue; + for(j=1;j<=n;j++) + {if(i==j)continue; + if(j==x || j==y)continue; + double n1=0; + double n2=0; + if(i!=x)n1=D[give_index(i,x,n)]; + if(j!=y)n2=D[give_index(j,y,n)]; + if(n1==-1 || n2==-1 || D[give_index(i,j,n)]==-1)continue; + sCXY++; + } + } + return sCXY; +} + +void njs(double *D, int *N, int *edge1, int *edge2, double *edge_length,int *fsS) +{ //assume missing values are denoted by -1 + double *S,*R, Sdist, Ndist, *new_dist, A, B, smallest_S, x, y; + int n, i, j, k, ij, smallest, OTU1, OTU2, cur_nod, o_l, *otu_label; + /*for(i=0;i 3) { + ij = 0; + for(i=1;i smallest_S) { + OTU1 = i; + OTU2 = j; + smallest_S = A; + smallest = ij; + } + ij++; + } + } + } + + /*Rprintf("agglomerating %i and %i, Q=%f \n",OTU1,OTU2,smallest_S); + + for(i=1;i 1; i--) + otu_label[i] = otu_label[i - 1]; + if (OTU2 != n) + for (i = OTU2; i < n; i++) + otu_label[i] = otu_label[i + 1]; + otu_label[1] = cur_nod; + + n--; + for (i = 0; i < n*(n - 1)/2; i++) + { + D[i] = new_dist[i]; + if(sw==1) + { + R[i] = newR[i]; + s[i] = newS[i]; + } + } + cur_nod--; + k = k + 2; + } + int dK=0;//number of known distances in final distance matrix + int iUK=-1;//index of unkown distance, if we have one missing distance + int iK=-1;//index of only known distance, only needed if dK==1 + for (i = 0; i < 3; i++) { + edge1[*N*2 - 4 - i] = cur_nod; + edge2[*N*2 - 4 - i] = otu_label[i + 1]; + if(D[i]!=-1){dK++;iK=i;}else{iUK=i;} + } + if(dK==2) + {//if two distances are known: assume our leaves are x,y,z, d(x,z) unknown + //and edge weights of three edges are a,b,c, then any b,c>0 that + //satisfy c-b=d(y,z)-d(x,y) a+c=d(y,z) are good edge weights, but for + //simplicity we assume a=c if d(yz)max)max=D[i]; + } + D[iUK]=max; + } + if(dK==1) + {//through similar motivation as above, if we have just one known distance + //we set the other two distances equal to it + for(i=0;i<3;i++) + {if(i==iK)continue; + D[i]=D[iK]; + } + } + if(dK==0) + {//no distances are known, we just set them to 1 + for(i=0;i<3;i++) + {D[i]=1; + } + } + edge_length[*N*2 - 4] = (D[0] + D[1] - D[2])/2; + edge_length[*N*2 - 5] = (D[0] + D[2] - D[1])/2; + edge_length[*N*2 - 6] = (D[2] + D[1] - D[0])/2; +} + diff --git a/src/treePop.c b/src/treePop.c new file mode 100644 index 0000000..5dfaad4 --- /dev/null +++ b/src/treePop.c @@ -0,0 +1,242 @@ +/* treePop.c 2011-10-11 */ + +/* Copyright 2011 Andrei-Alin Popescu */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include +#include +#include + +int lsb(char* a) +{ + int i = 0; + while (a[i] == 0) i++; /* count number of elements = 0 */ + + int b = 7; + while ((a[i] & (1 << b)) == 0) b--; + + return i*8 + (8 - b); +} + +short count_bits(uint8_t n) +{ + short c; /* c accumulates the total bits set in v */ + for (c = 0; n; c++) + n &= n - 1; /* clear the least significant bit set */ + return c; +} + +/* Print n as a binary number */ +/*void printbits(uint8_t n) +{ + uint8_t i; + i = 1 << (sizeof(n) * 8 - 1); + + while (i > 0) { + if (n & i) Rprintf("1"); + else Rprintf("0"); + i >>= 1; + } +}*/ + +uint8_t* setdiff(uint8_t* x, uint8_t *y, int nrow) //x-y +{ + int i = 0; + uint8_t* ret = (uint8_t*)R_alloc(nrow, sizeof(uint8_t)); + for (i = 0; i < nrow; i++) { + uint8_t tmp = (~y[i]); + +/* Rprintf("x[%i]=",i); + printbits(x[i]); + Rprintf("\n"); + Rprintf("y[%i]=",i); + printbits(y[i]); + Rprintf("\n"); + Rprintf("tmp=\n"); + printbits(tmp); + Rprintf("\n"); */ + + ret[i] = (x[i] & tmp); + } + return ret; +} + +void treePop(int* splits, double* w,int* ncolp,int* np, int* ed1, int* ed2, double* edLen) + { + int n=*np; + int ncol=*ncolp; + int nrow=ceil(n/(double)8); + uint8_t split[nrow][ncol]; + int i=0,j=0; + + /*Rprintf("n=%i nrow=%i ncol=%i\n",n,nrow,ncol); + Rprintf("got\n"); + for(i=0;in/2) + { + for(j=0;j=2)//if not trivial split + {nNodes++; + gn=nNodes; + } + else + { + gn=lsb(sp); + } + // Rprintf("gn=%i\n",gn); + ed2[numEdges]=gn; + edLen[numEdges]=w[ind[i]]; + numEdges++; + uint8_t* sdd=setdiff(vl,sp,nrow); + for(ii=0;ii + +int give_indexx(int i, int j, int n) +{ if (i == j) return 0; + if (i > j) return(n*(j - 1) - j*(j - 1)/2 + i - j - 1); + else return(n*(i - 1) - i*(i - 1)/2 + j - i - 1); +} + +int pred(int k, int* ed1, int* ed2, int numEdges) +/* find the predecesor of vertex k */ +{ + int i = 0; + for (i = 0; i <= numEdges; i++) + if (ed2[i] == k) return ed1[i]; + return -1; +} + +int* getPathBetween(int x, int y, int n, int* ed1, int* ed2, int numEdges) +//get the path between vertices x and y in an array ord. +//ord[i]=j means that we go between i and j on the path between x and y + { int i=0; + int k=x; + int ch[2*n-1];//ch[i]==1 implies {k,pred(k)} on path between x and y + for(i=1;i<=2*n-2;i++) + {ch[i]=0; + } + + while(k!=n+1) + { + ch[k]=1; + k=pred(k,ed1,ed2,numEdges); + } + k=y; + + while(k!=n+1) + { + ch[k]++; + k=pred(k,ed1,ed2,numEdges); + } + int *ord=(int*)malloc(sizeof(int)*(2*n-1)); + //starting from x, fill ord + + + int p=x; + + while(ch[p]==1) + { + int aux=p; + p=pred(p,ed1,ed2,numEdges); + ord[aux]=p; + } + p=y; + + while(ch[p]==1) + { + int aux=p; + p=pred(p,ed1,ed2,numEdges); + ord[p]=aux;//other way + } + + return ord; + } + +int getLength(int x, int y, int* ed1, int* ed2, int numEdges, int* edLen) +/* get length of edge {x,y}, -1 if edge does not exist */ +{ + int i = 0; + for (i = 0; i <= numEdges; i++) + if ((ed1[i] == x && ed2[i] == y) || (ed1[i] == y && ed2[i] == x)) + return edLen[i]; + return -1; +} + +void triangMtd(double* d, int* np, int* ed1,int* ed2, double* edLen) +{ + int n=*np; + int i=0; + int j=0; + int ij=-1; + + for(i=0;i%i of length:%f \n",ed1[i],ed2[i],edLen[i]); + } + Rprintf("end new tree\n");*/ + + //calculate distance of leaves not yet added to the star tree + int s; + for(s=1;s<=n;s++) + {if(w[s])continue; + for(i=1;i<=n;i++) + { if(i==x3)continue; + if(!w[i])continue; + double newL=0.5*(d[give_indexx(i,s,n)]+d[give_indexx(s,x3,n)]-d[give_indexx(i,x3,n)]); + if(newL %i ",p,ord[p]); + p=ord[p]; + prevSum=sum; + for(i=0;i<=numEdges;i++) + { + if((ed1[i]==aux && ed2[i]==p)||(ed2[i]==aux && ed1[i]==p)) + { + if(ed2[i]==aux && ed1[i]==p){sw=1;} + subdiv=i; + sum+=edLen[i]; + } + } + } + + nv++; + //subdivide subdiv with a node labelled nv + //length calculation + //multifurcating vertices + + int edd=ed2[subdiv]; + ed2[subdiv]=nv; + edLen[subdiv]= (sw==1)?(lx-prevSum):(sum-lx);//check which 'half' of the + //path the leaf belongs to + //and updates accordingly + //error("sum=%f, prevsum=%f\n",sum,prevSum); + //error("lx-prevSum=%f, sum-lx=%f, minDist=%f",lx-prevSum,sum-lx,minDist); + numEdges++; + ed1[numEdges]=nv; + ed2[numEdges]=edd; + edLen[numEdges]= (sw==1)?(sum-lx):(lx-prevSum); + numEdges++; + edLen[numEdges]=minDist; + ed1[numEdges]=nv; + ed2[numEdges]=z; + + wSize++; + w[z]=1; + + /*update distance matrix, only needed for incomplete distances + int ii; + for(ii=0;ii%i of length:%f \n",ed1[i],ed2[i],edLen[i]); + } + Rprintf("end new tree\n");*/ + } + //for(i=0;i<=numEdges;i++){ed1[i]++;ed2[i]++;} + } diff --git a/src/triangMtds.c b/src/triangMtds.c new file mode 100644 index 0000000..7ec0bd7 --- /dev/null +++ b/src/triangMtds.c @@ -0,0 +1,223 @@ +/* triangMtds.c 2011-10-11 */ + +/* Copyright 2011 Andrei-Alin Popescu */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include "ape.h" + +void triangMtds(double* d, int* np, int* ed1,int* ed2, double* edLen) + { + int n=*np; + int k=0; + int i=0; + int j=0; + int ij=-1; + int m[n+1]; + int c[n+1]; + int s[n+1]; + int o[n+1]; + for(i=1;i<=n;i++) + {m[i]=0; + c[i]=i; + s[i]=0; + for(j=1;j<=n;j++) + { + if(i==j){m[i]++;continue;} + if(d[give_index(i,j,n)]==-1)continue; + m[i]++; + } + } + + for(i=1;ik) + { + ed1[i]+=(n-k); + } + if(ed2[i]>k) + { + ed2[i]+=(n-k); + } + } + for(i=0;i<2*k-3;i++) + { + if(ed2[i]<=k) + { + ed2[i]=o[ed2[i]]; + } + } + for(i=1;i<=n;i++) + { + if(s[i]==0)continue;//take only leaves not in Y + m[i]=0; + for(j=1;j<=n;j++) + { + if(s[j]==1)continue;//take only leaves already in Y + if(d[give_index(i,j,n)]==-1)continue;//igonore if distance unknown + m[i]++; + } + } + int numEdges=2*k-4;//0-based, so subtract 1 + //Rprintf("numEdge=%i",numEdges); + int nv=(k-2)+n; + while(k i not added to tree + int max=-1; + int maxPos=-1; + for(i=1;i<=n;i++) + { + if(s[i]==0)continue; + if(m[i]>max) + { + max=m[i]; + maxPos=i; + } + } + s[maxPos]=0;//mark maxPos as added + //calculate new m values for leaves not added, i.e we just increment any + //already present value by 1 if we know the distance between i and maxPos + for(i=1;i<=n;i++) + { + if(s[i]==0)continue; + if(d[give_index(i,maxPos,n)]==-1)continue; + m[i]++; + } + + //find path to attach maxPos to, grow tree + double minDist=1e50; + int z=maxPos; + int x=-1,y=-1; + for(i=1;i %i ",p,ord[p]); + p=ord[p]; + prevSum=sum; + for(i=0;i<=numEdges;i++) + { + if((ed1[i]==aux && ed2[i]==p)||(ed2[i]==aux && ed1[i]==p)) + { + if(ed2[i]==aux && ed1[i]==p){sw=1;} + subdiv=i; + sum+=edLen[i]; + } + } + //if(cc>1000)error("failed to follow path between x=%i y=%i\n",x,y); + } + + + nv++; + //subdivide subdiv with a node labelled nv + //length calculation + + int edd=ed2[subdiv]; + ed2[subdiv]=nv; + edLen[subdiv]= (sw==1)?(lx-prevSum):(sum-lx);//check which 'half' of the + //path the leaf belongs to + //and updates accordingly + //error("sum=%f, prevsum=%f\n",sum,prevSum); + //error("lx-prevSum=%f, sum-lx=%f, minDist=%f",lx-prevSum,sum-lx,minDist); + //Rprintf("adding %i on path %i %i, at distance %f from %i, and %f from tree\n",z,x,y,lx,x,minDist); + // Rprintf("subdividing edge %i\n",subdiv); + numEdges++; + ed1[numEdges]=nv; + ed2[numEdges]=edd; + edLen[numEdges]= (sw==1)?(sum-lx):(lx-prevSum); + numEdges++; + edLen[numEdges]=minDist; + ed1[numEdges]=nv; + ed2[numEdges]=z; + k++; + } + } diff --git a/src/ultrametric.c b/src/ultrametric.c new file mode 100644 index 0000000..1c1d57a --- /dev/null +++ b/src/ultrametric.c @@ -0,0 +1,61 @@ +/* ultrametric.c 2011-10-11 */ + +/* Copyright 2011 Andrei-Alin Popescu */ + +/* This file is part of the R-package `ape'. */ +/* See the file ../COPYING for licensing issues. */ + +#include "ape.h" + +void ultrametric(double *dd, int* np,int* mp,double *ret)//d received as dist object, -1 for missing entries +{ + int n=*np; + int m=*mp; + int i=0,j=0; + double max=dd[0]; + double d[n][n]; + for(i=1;imax) + { + max=dd[give_index(i,j,n)]; + } + } + } + d[n-1][n-1]=0; + + int entrCh=0; + do{ + entrCh=0; + for(i=0;i d[j][k] ? d[i][k] : d[j][k]; + if(mx