1 #' Read a file in clustal format
3 #' \code{read.clustal} reads an alignment file in clustal format
6 #' @param file file name of the clustal alignment
10 #' clust.align <- read.clustal("clustal_alignment.txt")
12 read.clustal <- function(file,...) {
13 ## stolen from ape's read.dna.R
14 findFirstSeq <- function(x) {
15 ## actually find the 1st non-blank character
16 tmp <- regexpr("[[:blank:]]+", x[1]) # consider only a single string
17 tmp[1] + attr(tmp, "match.length")
19 getSeq <- function(x) {
21 x <- strsplit(x, NULL)
25 X <- scan(file = file, what = "", sep = "\n", quiet = TRUE)
26 if (!all(grepl("^CLUSTAL",X[1])))
27 stop("Doesn't appear to be a file in clustal format")
28 ## The first line contains CLUSTAL, and isn't interesting
30 start.seq <- findFirstSeq(X)
31 ## now, find how many sequences there are
32 leading.spaces <- paste("^ {",start.seq-1,"}",sep="")
33 stars <- grep(leading.spaces, X)
35 taxa <- gsub(" *$","",substr(X[1:num.seq],1,start.seq-1))
36 ## remove the sequence names
37 X <- substr(X,start.seq,nchar(X))
38 ## number of lines of sequences
41 first.seq <- getSeq(X[seq(1,nl,num.seq+1)])
42 seqs <- matrix("",num.seq,length(first.seq))
44 for (i in 2:num.seq) {
45 seqs[i,] <- getSeq(X[seq(i,nl,num.seq+1)])
47 rownames(seqs) <- taxa