R/read.nexus.R

   1 ## read.nexus.R (2011-02-28)
   2
   3 ##   Read Tree File in Nexus Format
   4
   5 ## Copyright 2003-2011 Emmanuel Paradis and 2010 Klaus Schliep
   6
   7 ## This file is part of the R-package `ape'.
   8 ## See the file ../COPYING for licensing issues.
   9
  10 .treeBuildWithTokens <- function(x)
  11 {
  12     phy <- .Call("treeBuildWithTokens", x, PACKAGE = "apex")
  13     dim(phy[[1]]) <- c(length(phy[[1]])/2, 2)
  14     nms <- c("edge", "edge.length", "Nnode", "node.label", "root.edge")
  15     if (length(phy) == 4) nms <- nms[-5]
  16     names(phy) <- nms
  17     if (all(phy$node.label == "")) phy$node.label <- NULL
  18     class(phy) <- "phylo"
  19     phy
  20 }
  21
  22 clado.build <- function(tp)
  23 {
  24     add.internal <- function() {
  25         edge[j, 1] <<- current.node
  26         node <<- node + 1
  27         edge[j, 2] <<- current.node <<- node
  28         index[node] <<- j # set index
  29         j <<- j + 1
  30     }
  31     add.terminal <- function() {
  32         edge[j, 1] <<- current.node
  33         edge[j, 2] <<- tip
  34         index[tip] <<- j # set index
  35         tip.label[tip] <<- tpc[k]
  36         k <<- k + 1
  37         tip <<- tip + 1
  38         j <<- j + 1
  39     }
  40     go.down <- function() {
  41         l <- index[current.node]
  42         node.label[current.node - nb.tip] <<- tpc[k]
  43         k <<- k + 1
  44         current.node <<- edge[l, 1]
  45     }
  46     if (!length(grep(",", tp))) {
  47         obj <- list(edge = matrix(c(2, 1), 1, 2), Nnode = 1)
  48         tp <- unlist(strsplit(tp, "[\\(\\);]"))
  49         obj$tip.label <- tp[2]
  50         if (tp[3] != "") obj$node.label <- tp[3]
  51         class(obj) <- "phylo"
  52         return(obj)
  53     }
  54     tsp <- unlist(strsplit(tp, NULL))
  55     tp <- gsub(")", ")NA", tp)
  56     tp <- gsub(" ", "", tp)
  57     tpc <- unlist(strsplit(tp, "[\\(\\),;]"))
  58     tpc <- tpc[tpc != ""]
  59     skeleton <- tsp[tsp == "(" | tsp == ")" | tsp == "," | tsp == ";"]
  60     nsk <- length(skeleton)
  61     nb.node <- length(skeleton[skeleton == ")"])
  62     nb.tip <- length(skeleton[skeleton == ","]) + 1
  63     ## We will assume there is an edge at the root;
  64     ## if so, it will be removed and put in a vector
  65     nb.edge <- nb.node + nb.tip
  66     node.label <- character(nb.node)
  67     tip.label <- character(nb.tip)
  68
  69     edge <- matrix(NA, nb.edge, 2)
  70     current.node <- node <- nb.tip + 1 # node number
  71     edge[nb.edge, 1] <- 0    # see comment above
  72     edge[nb.edge, 2] <- node #
  73
  74     index <- numeric(nb.edge + 1)
  75     index[node] <- nb.edge
  76     ## j: index of the line number of edge
  77     ## k: index of the line number of tpc
  78     ## tip: tip number
  79     j <- k <- tip <- 1
  80     for (i in 2:nsk) {
  81         if (skeleton[i] == "(") add.internal()      # add an internal branch (on top)
  82         if (skeleton[i] == ",") {
  83             if (skeleton[i - 1] != ")") add.terminal()   # add a terminal branch
  84         }
  85         if (skeleton[i] == ")") {
  86             if (skeleton[i - 1] == ",") {   # add a terminal branch and go down one level
  87                 add.terminal()
  88                 go.down()
  89             }
  90             if (skeleton[i - 1] == ")") go.down()   # go down one level
  91         }
  92     }
  93     edge <- edge[-nb.edge, ]
  94     obj <- list(edge = edge, tip.label = tip.label,
  95                 Nnode = nb.node, node.label = node.label)
  96     obj$node.label <-
  97         if (all(obj$node.label == "NA")) NULL
  98         else gsub("^NA", "", obj$node.label)
  99     class(obj) <- "phylo"
 100     obj
 101 }
 102
 103 read.nexus <- function(file, tree.names = NULL)
 104 {
 105     X <- scan(file = file, what = "", sep = "\n", quiet = TRUE)
 106     ## remove all comments
 107     ## (this might not work if there are square brackets within the comments)
 108     LEFT <- grep("\\[", X)
 109     RIGHT <- grep("\\]", X)
 110     if (length(LEFT)) { # in case there are no comments at all
 111         w <- LEFT == RIGHT
 112         if (any(w)) { # in case all comments use at least 2 lines
 113             s <- LEFT[w]
 114             X[s] <- gsub("\\[[^]]*\\]", "", X[s])
 115             ## The above regexp was quite tough to find: it makes
 116             ## possible to delete series of comments on the same line:
 117             ##       ...[...]xxx[...]...
 118             ## without deleting the "xxx". This regexp is in three parts:
 119             ##       \\[      [^]]*       \\]
 120             ## where [^]]* means "any character, except "]", repeated zero
 121             ## or more times" (note that the ']' is not escaped here).
 122             ## The previous version was:
 123             ##       X[s] <- gsub("\\[.*\\]", "", X[s])
 124             ## which deleted the "xxx". (EP  2008-06-24)
 125         }
 126         w <- !w
 127         if (any(w)) {
 128             s <- LEFT[w]
 129             X[s] <- gsub("\\[.*", "", X[s])
 130             sb <- RIGHT[w]
 131             X[sb] <- gsub(".*\\]", "", X[sb])
 132             if (any(s < sb - 1))
 133                 X <- X[-unlist(mapply(":", (s + 1), (sb - 1)))]
 134         }
 135     }
 136     endblock <- grep("END;|ENDBLOCK;", X, ignore.case = TRUE)
 137     semico <- grep(";", X)
 138     i1 <- grep("BEGIN TREES;", X, ignore.case = TRUE)
 139     i2 <- grep("TRANSLATE", X, ignore.case = TRUE)
 140     translation <- if (length(i2) == 1 && i2 > i1) TRUE else FALSE
 141     if (translation) {
 142         end <- semico[semico > i2][1]
 143         x <- X[(i2 + 1):end] # assumes there's a 'new line' after "TRANSLATE"
 144         ## x <- gsub("TRANSLATE", "", x, ignore.case = TRUE)
 145         x <- unlist(strsplit(x, "[,; \t]"))
 146         x <- x[nzchar(x)]
 147         TRANS <- matrix(x, ncol = 2, byrow = TRUE)
 148         TRANS[, 2] <- gsub("['\"]", "", TRANS[, 2])
 149         n <- dim(TRANS)[1]
 150     }
 151     start <-
 152         if (translation) semico[semico > i2][1] + 1
 153         else semico[semico > i1][1]
 154     end <- endblock[endblock > i1][1] - 1
 155     tree <- X[start:end]
 156     rm(X)
 157     tree <- gsub("^.*= *", "", tree)
 158     ## check whether there are empty lines from the above manips:
 159     tree <- tree[tree != ""]
 160     semico <- grep(";", tree)
 161     Ntree <- length(semico)
 162     ## are some trees on several lines?
 163     if (Ntree == 1 && length(tree) > 1) STRING <- paste(tree, collapse = "") else {
 164         if (any(diff(semico) != 1)) {
 165             STRING <- character(Ntree)
 166             s <- c(1, semico[-Ntree] + 1)
 167             j <- mapply(":", s, semico)
 168             if (is.list(j)) {
 169                 for (i in 1:Ntree)
 170                     STRING[i] <- paste(tree[j[[i]]], collapse = "")
 171             } else {
 172                 for (i in 1:Ntree)
 173                     STRING[i] <- paste(tree[j[, i]], collapse = "")
 174             }
 175         } else STRING <- tree
 176     }
 177     rm(tree)
 178     STRING <- gsub(" ", "", STRING)
 179     colon <- grep(":", STRING)
 180     if (!length(colon)) {
 181         trees <- lapply(STRING, clado.build)
 182     } else if (length(colon) == Ntree) {
 183         trees <-
 184             if (translation) lapply(STRING, .treeBuildWithTokens)
 185             else lapply(STRING, tree.build)
 186     } else {
 187         trees <- vector("list", Ntree)
 188         trees[colon] <- lapply(STRING[colon], tree.build)
 189         nocolon <- (1:Ntree)[!1:Ntree %in% colon]
 190         trees[nocolon] <- lapply(STRING[nocolon], clado.build)
 191         if (translation) {
 192             for (i in 1:Ntree) {
 193                 tr <- trees[[i]]
 194                 for (j in 1:n) {
 195                     ind <- which(tr$tip.label[j] == TRANS[, 1])
 196                     tr$tip.label[j] <- TRANS[ind, 2]
 197                 }
 198                 if (!is.null(tr$node.label)) {
 199                     for (j in 1:length(tr$node.label)) {
 200                         ind <- which(tr$node.label[j] == TRANS[, 1])
 201                         tr$node.label[j] <- TRANS[ind, 2]
 202                     }
 203                 }
 204                 trees[[i]] <- tr
 205             }
 206             translation <- FALSE
 207         }
 208     }
 209     for (i in 1:Ntree) {
 210         tr <- trees[[i]]
 211         ## Check here that the root edge is not incorrectly represented
 212         ## in the object of class "phylo" by simply checking that there
 213         ## is a bifurcation at the root
 214         if (!translation) n <- length(tr$tip.label)
 215         ROOT <- n + 1
 216         if (sum(tr$edge[, 1] == ROOT) == 1 && dim(tr$edge)[1] > 1) {
 217             stop(paste("There is apparently two root edges in your file: cannot read tree file.\n  Reading NEXUS file aborted at tree no.", i, sep = ""))
 218         }
 219     }
 220     if (Ntree == 1) {
 221         trees <- trees[[1]]
 222         if (translation) {
 223             trees$tip.label <-
 224                 if (length(colon)) TRANS[, 2] else
 225                 TRANS[, 2][as.numeric(trees$tip.label)]
 226         }
 227     } else {
 228         if (!is.null(tree.names)) names(trees) <- tree.names
 229         if (translation) {
 230             if (length(colon) == Ntree) # .treeBuildWithTokens() was used
 231                 attr(trees, "TipLabel") <- TRANS[, 2]
 232             else { # reassign the tip labels then compress
 233                 for (i in 1:Ntree)
 234                     trees[[i]]$tip.label <-
 235                         TRANS[, 2][as.numeric(trees[[i]]$tip.label)]
 236                 trees <- .compressTipLabel(trees)
 237             }
 238         }
 239         class(trees) <- "multiPhylo"
 240     }
 241     if (length(grep("[\\/]", file)) == 1)
 242         if (!file.exists(file)) # suggestion by Francois Michonneau
 243             file <- paste(getwd(), file, sep = "/")
 244     attr(trees, "origin") <- file
 245     trees
 246 }