1 ## read.dna.R (2008-03-28)
3 ## Read DNA Sequences in a File
5 ## Copyright 2003-2008 Emmanuel Paradis
7 ## This file is part of the R-package `ape'.
8 ## See the file ../COPYING for licensing issues.
10 read.dna <- function(file, format = "interleaved", skip = 0,
11 nlines = 0, comment.char = "#", seq.names = NULL,
14 getTaxaNames <- function(x) {
15 x <- sub("^ +", "", x) # remove the leading spaces
16 x <- sub(" +$", "", x) # remove the trailing spaces
17 x <- sub("^['\"]", "", x) # remove the leading quotes
18 x <- sub("['\"]$", "", x) # remove the trailing quotes
21 format <- match.arg(format, c("interleaved", "sequential", "fasta"))
22 phylip <- if (format %in% c("interleaved", "sequential")) TRUE else FALSE
23 X <- scan(file = file, what = character(), sep = "\n", quiet = TRUE,
24 skip = skip, nlines = nlines, comment.char = comment.char)
27 oop <- options(warn = -1)
28 ## need to remove the possible leading spaces in the first line
29 fl.num <- as.numeric(unlist(strsplit(gsub("^ +", "", fl), " +")))
31 if (all(is.na(fl.num)))
32 stop("the first line of the file must contain the dimensions of the data")
33 if (length(fl.num) != 2)
34 stop("the first line of the file must contain TWO numbers")
40 obj <- vector("character", n*s)
43 if (format == "interleaved") {
45 fl <- unlist(strsplit(fl, NULL))
46 bases <- grep("[-AaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn?]", fl)
48 for (i in 1:length(z)) if (all(z[i:(i + 8)] == 1)) break
50 if (is.null(seq.names))
51 seq.names <- getTaxaNames(substr(X[1:n], 1, start.seq - 1))
52 X[1:n] <- substr(X[1:n], start.seq, nchar(X[1:n]))
56 obj[i, ] <- unlist(strsplit(X[seq(i, nl, n)], NULL))
58 if (format == "sequential") {
63 bases <- grep("[-AaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn?]",
64 unlist(strsplit(X[j], NULL)))
66 for (k in 1:length(z)) if (all(z[k:(k + 8)] == 1)) break
68 taxa[i] <- substr(X[j], 1, start.seq - 1)
69 sequ <- substr(X[j], start.seq, nchar(X[j]))
70 sequ <- gsub(" ", "", sequ)
72 while (nchar(sequ) < s) {
73 sequ <- paste(sequ, gsub(" " , "", X[j]), sep = "")
76 obj[i, ] <- unlist(strsplit(sequ, NULL))
78 if (is.null(seq.names)) seq.names <- getTaxaNames(taxa)
80 if (format == "fasta") {
81 start <- grep("^ {0,}>", X)
84 obj <- vector("list", n)
85 if (is.null(seq.names)) {
86 taxa <- sub("^ {0,}>", "", taxa) # remove the hook and the spaces before
87 seq.names <- getTaxaNames(taxa)
89 start <- c(start, length(X) + 1) # this avoids the following to crash when `i = n'
91 obj[[i]] <- unlist(strsplit(gsub(" ", "",
92 X[(start[i] + 1):(start[i + 1] - 1)]),
96 rownames(obj) <- seq.names
99 names(obj) <- seq.names
100 obj <- lapply(obj, tolower)
102 if (!as.character) obj <- as.DNAbin(obj)