1 ## read.nexus.R (2009-11-21)
3 ## Read Tree File in Nexus Format
5 ## Copyright 2003-2009 Emmanuel Paradis
7 ## This file is part of the R-package `ape'.
8 ## See the file ../COPYING for licensing issues.
10 .treeBuildWithTokens <- function(x)
12 ## remove potential node labels; see ?read.nexus for justification
13 node.label <- gsub("[:;].*$", "", strsplit(x, ")")[[1]][-1])
14 has.node.labels <- FALSE
15 if (any(node.label != "")) {
16 x <- gsub(")[^:]*:", "):", x)
17 x <- gsub(")[^:]*;", ");", x) # if there's no root edge
18 has.node.labels <- TRUE
20 phy <- .Call("treeBuildWithTokens", x, PACKAGE = "ape")
21 dim(phy[[1]]) <- c(length(phy[[1]])/2, 2)
22 nms <- c("edge", "edge.length", "Nnode", "root.edge")
23 if (length(phy) == 3) nms <- nms[-4]
25 if (has.node.labels) phy$node.label <- node.label
30 clado.build <- function(tp)
32 add.internal <- function() {
33 edge[j, 1] <<- current.node
35 edge[j, 2] <<- current.node <<- node
38 add.terminal <- function() {
39 edge[j, 1] <<- current.node
41 tip.label[tip] <<- tpc[k]
46 go.down <- function() {
47 l <- which(edge[, 2] == current.node)
48 node.label[current.node - nb.tip] <<- tpc[k]
50 current.node <<- edge[l, 1]
52 if (!length(grep(",", tp))) {
53 obj <- list(edge = matrix(c(2, 1), 1, 2), Nnode = 1)
54 tp <- unlist(strsplit(tp, "[\\(\\);]"))
55 obj$tip.label <- tp[2]
56 if (length(tp) == 3) obj$node.label <- tp[3]
60 tsp <- unlist(strsplit(tp, NULL))
61 tp <- gsub(")", ")NA", tp)
62 tp <- gsub(" ", "", tp)
63 tpc <- unlist(strsplit(tp, "[\\(\\),;]"))
65 skeleton <- tsp[tsp == "(" | tsp == ")" | tsp == "," | tsp == ";"]
66 nsk <- length(skeleton)
67 nb.node <- length(skeleton[skeleton == ")"])
68 nb.tip <- length(skeleton[skeleton == ","]) + 1
69 ## We will assume there is an edge at the root;
70 ## if so, it will be removed and put in a vector
71 nb.edge <- nb.node + nb.tip
72 node.label <- character(nb.node)
73 tip.label <- character(nb.tip)
75 edge <- matrix(NA, nb.edge, 2)
76 current.node <- node <- nb.tip + 1 # node number
77 edge[nb.edge, 1] <- 0 # see comment above
78 edge[nb.edge, 2] <- node #
80 ## j: index of the line number of edge
81 ## k: index of the line number of tpc
86 if (skeleton[i] == "(") add.internal() # add an internal branch (on top)
87 if (skeleton[i] == ",") {
88 if (skeleton[i - 1] != ")") add.terminal() # add a terminal branch
90 if (skeleton[i] == ")") {
91 if (skeleton[i - 1] == ",") { # add a terminal branch and go down one level
95 if (skeleton[i - 1] == ")") go.down() # go down one level
98 # if(node.label[1] == "NA") node.label[1] <- ""
99 edge <- edge[-nb.edge, ]
100 obj <- list(edge = edge, tip.label = tip.label,
101 Nnode = nb.node, node.label = node.label)
103 if (all(obj$node.label == "NA")) NULL
104 else gsub("^NA", "", obj$node.label)
105 class(obj) <- "phylo"
109 read.nexus <- function(file, tree.names = NULL)
111 X <- scan(file = file, what = "", sep = "\n", quiet = TRUE)
112 ## remove all comments
113 ## (this might not work if there are square brackets within the comments)
114 LEFT <- grep("\\[", X)
115 RIGHT <- grep("\\]", X)
116 if (length(LEFT)) { # in case there are no comments at all
118 if (any(w)) { # in case all comments use at least 2 lines
120 X[s] <- gsub("\\[[^]]*\\]", "", X[s])
121 ## The above regexp was quite tough to find: it makes
122 ## possible to delete series of comments on the same line:
123 ## ...[...]xxx[...]...
124 ## without deleting the "xxx". This regexp is in three parts:
126 ## where [^]]* means "any character, except "]", repeated zero
127 ## or more times" (note that the ']' is not escaped here).
128 ## The previous version was:
129 ## X[s] <- gsub("\\[.*\\]", "", X[s])
130 ## which deleted the "xxx". (EP 2008-06-24)
135 X[s] <- gsub("\\[.*", "", X[s])
137 X[sb] <- gsub(".*\\]", "", X[sb])
139 X <- X[-unlist(mapply(":", (s + 1), (sb - 1)))]
142 endblock <- grep("END;|ENDBLOCK;", X, ignore.case = TRUE)
143 semico <- grep(";", X)
144 i1 <- grep("BEGIN TREES;", X, ignore.case = TRUE)
145 i2 <- grep("TRANSLATE", X, ignore.case = TRUE)
146 translation <- if (length(i2) == 1 && i2 > i1) TRUE else FALSE
148 end <- semico[semico > i2][1]
149 x <- X[(i2 + 1):end] # assumes there's a 'new line' after "TRANSLATE"
150 ## x <- gsub("TRANSLATE", "", x, ignore.case = TRUE)
151 x <- unlist(strsplit(x, "[,; \t]"))
153 TRANS <- matrix(x, ncol = 2, byrow = TRUE)
154 TRANS[, 2] <- gsub("['\"]", "", TRANS[, 2])
158 if (translation) semico[semico > i2][1] + 1
159 else semico[semico > i1][1]
160 end <- endblock[endblock > i1][1] - 1
163 tree <- gsub("^.*= *", "", tree)
164 ## check whether there are empty lines from the above manips:
165 tree <- tree[tree != ""]
166 semico <- grep(";", tree)
167 Ntree <- length(semico)
168 ## are some trees on several lines?
169 if (Ntree == 1 && length(tree) > 1) STRING <- paste(tree, collapse = "") else {
170 if (any(diff(semico) != 1)) {
171 STRING <- character(Ntree)
172 s <- c(1, semico[-Ntree] + 1)
173 j <- mapply(":", s, semico)
176 STRING[i] <- paste(tree[j[[i]]], collapse = "")
179 STRING[i] <- paste(tree[j[, i]], collapse = "")
181 } else STRING <- tree
184 STRING <- gsub(" ", "", STRING)
185 colon <- grep(":", STRING)
186 if (!length(colon)) {
187 trees <- lapply(STRING, clado.build)
188 } else if (length(colon) == Ntree) {
190 if (translation) lapply(STRING, .treeBuildWithTokens)
191 else lapply(STRING, tree.build)
193 trees <- vector("list", Ntree)
194 trees[colon] <- lapply(STRING[colon], tree.build)
195 nocolon <- (1:Ntree)[!1:Ntree %in% colon]
196 trees[nocolon] <- lapply(STRING[nocolon], clado.build)
201 ind <- which(tr$tip.label[j] == TRANS[, 1])
202 tr$tip.label[j] <- TRANS[ind, 2]
204 if (!is.null(tr$node.label)) {
205 for (j in 1:length(tr$node.label)) {
206 ind <- which(tr$node.label[j] == TRANS[, 1])
207 tr$node.label[j] <- TRANS[ind, 2]
217 ## Check here that the root edge is not incorrectly represented
218 ## in the object of class "phylo" by simply checking that there
219 ## is a bifurcation at the root
220 if (!translation) n <- length(tr$tip.label)
222 if (sum(tr$edge[, 1] == ROOT) == 1 && dim(tr$edge)[1] > 1) {
223 stop(paste("There is apparently two root edges in your file: cannot read tree file.\n Reading NEXUS file aborted at tree no.", i, sep = ""))
230 if (length(colon)) TRANS[, 2] else
231 TRANS[, 2][as.numeric(trees$tip.label)]
234 if (!is.null(tree.names)) names(trees) <- tree.names
236 if (length(colon) == Ntree) # .treeBuildWithTokens() was used
237 attr(trees, "TipLabel") <- TRANS[, 2]
238 else { # reassign the tip labels then compress
240 trees[[i]]$tip.label <-
241 TRANS[, 2][as.numeric(trees[[i]]$tip.label)]
242 trees <- .compressTipLabel(trees)
245 class(trees) <- "multiPhylo"
247 if (length(grep("[\\/]", file)) == 1)
248 if (!file.exists(file)) # suggestion by Francois Michonneau
249 file <- paste(getwd(), file, sep = "/")
250 attr(trees, "origin") <- file