1 ## read.dna.R (2008-07-03)
3 ## Read DNA Sequences in a File
5 ## Copyright 2003-2008 Emmanuel Paradis
7 ## This file is part of the R-package `ape'.
8 ## See the file ../COPYING for licensing issues.
10 read.dna <- function(file, format = "interleaved", skip = 0,
11 nlines = 0, comment.char = "#", seq.names = NULL,
14 getTaxaNames <- function(x) {
15 x <- sub("^['\" ]+", "", x) # remove the leading quotes and spaces
16 x <- sub("['\" ]+$", "", x) # " " trailing " " "
19 getNucleotide <- function(x) {
21 x <- strsplit(x, NULL)
24 formats <- c("interleaved", "sequential", "fasta", "clustal")
25 format <- match.arg(format, formats)
26 phylip <- if (format %in% formats[1:2]) TRUE else FALSE
27 X <- scan(file = file, what = "", sep = "\n", quiet = TRUE,
28 skip = skip, nlines = nlines, comment.char = comment.char)
29 pat.base <- "[-AaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn?]{10}"
31 ## need to remove the possible leading spaces in the first line
32 fl <- gsub("^ +", "", X[1])
33 fl <- as.numeric(unlist(strsplit(fl, " +")))
34 if (length(fl) != 2 || any(is.na(fl)))
35 stop("the first line of the file must contain the dimensions of the data")
38 obj <- matrix("", n, s)
41 if (format == "interleaved") {
42 start.seq <- regexpr(pat.base, X[1])[1]
43 if (is.null(seq.names))
44 seq.names <- getTaxaNames(substr(X[1:n], 1, start.seq - 1))
45 X[1:n] <- substr(X[1:n], start.seq, nchar(X[1:n]))
48 obj[i, ] <- getNucleotide(X[seq(i, nl, n)])
50 if (format == "sequential") {
54 start.seq <- regexpr(pat.base, X[j])[1]
55 taxa[i] <- substr(X[j], 1, start.seq - 1)
56 sequ <- getNucleotide(substr(X[j], start.seq, nchar(X[j])))
58 while (length(sequ) < s) {
59 sequ <- c(sequ, getNucleotide(X[j]))
64 if (is.null(seq.names)) seq.names <- getTaxaNames(taxa)
66 if (format == "fasta") {
67 start <- grep("^ {0,}>", X)
70 obj <- vector("list", n)
71 if (is.null(seq.names)) {
72 taxa <- sub("^ {0,}>", "", taxa) # remove the hook and the spaces before
73 seq.names <- getTaxaNames(taxa)
75 start <- c(start, length(X) + 1) # this avoids the following to crash when `i = n'
77 obj[[i]] <- getNucleotide(X[(start[i] + 1):(start[i + 1] - 1)])
79 if (format == "clustal") {
81 ## find where the 1st sequence starts
82 start.seq <- regexpr(pat.base, X[1])[1]
83 ## find the lines with *********....
84 nspaces <- paste("^ {", start.seq - 1, "}", sep = "", collapse = "")
85 stars <- grep(nspaces, X)
86 ## we now know how many sequences in the file:
88 ## get the sequence names in the same way than "interleaved":
89 if (is.null(seq.names))
90 seq.names <- getTaxaNames(substr(X[1:n], 1, start.seq - 1))
91 ## need to remove the sequence names before getting the sequences:
92 X <- substr(X, start.seq, nchar(X))
94 ## find the length of the 1st sequence:
95 tmp <- getNucleotide(X[seq(1, nl, n + 1)])
97 obj <- matrix("", n, s)
100 obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)])
102 if (format != "fasta") {
103 rownames(obj) <- seq.names
106 names(obj) <- seq.names
107 obj <- lapply(obj, tolower)
109 if (!as.character) obj <- as.DNAbin(obj)