#' Read a file in clustal format
#'
#' \code{read.clustal} reads an alignment file in clustal format. The file
#' must start with a header line beginning with \code{CLUSTAL}; every block
#' of sequence lines must be followed by a consensus line (a line that is
#' indented with spaces up to the column where the sequences start).
#'
#' @param file file name of the clustal alignment
#' @param ... currently ignored
#' @return The value of \code{as.proteinbin()} applied to an upper-case
#'   character matrix with one row per sequence (row names are the sequence
#'   names found in the file) and one column per alignment position.
#' @export
#' @examples
#' \dontrun{
#' clust.align <- read.clustal("clustal_alignment.txt")
#' }
read.clustal <- function(file, ...) {
  ## stolen from ape's read.dna.R
  findFirstSeq <- function(x) {
    ## column of the 1st character after the first run of blanks,
    ## i.e. where the sequence data start on a "name   SEQUENCE" line
    tmp <- regexpr("[[:blank:]]+", x[1]) # consider only a single string
    tmp[1] + attr(tmp, "match.length")
  }

  ## concatenate the pieces of one sequence and split it into single,
  ## upper-case characters
  getSeq <- function(x) {
    x <- gsub(" ", "", x)
    toupper(unlist(strsplit(x, NULL)))
  }

  ## one element per non-empty line (scan() skips empty lines)
  X <- scan(file = file, what = "", sep = "\n", quiet = TRUE)
  if (length(X) == 0L || !grepl("^CLUSTAL", X[1])) {
    stop("Doesn't appear to be a file in clustal format")
  }

  ## The first line contains CLUSTAL, and isn't interesting
  X <- X[-1]
  if (length(X) == 0L) {
    stop("No sequences found in clustal file", call. = FALSE)
  }

  start.seq <- findFirstSeq(X)
  ## regexpr() returns -1 (match.length -1) when the line has no blank,
  ## which would otherwise produce an invalid regular expression below
  if (is.na(start.seq) || start.seq < 2) {
    stop("Could not locate the start of the sequence data", call. = FALSE)
  }

  ## now, find how many sequences there are: the consensus line is the
  ## first line that is indented all the way to the sequence column
  leading.spaces <- paste0("^ {", start.seq - 1, "}")
  stars <- grep(leading.spaces, X)
  if (length(stars) == 0L) {
    stop("Could not determine the number of sequences: ",
         "no consensus line found", call. = FALSE)
  }
  num.seq <- stars[1] - 1
  if (num.seq < 1) {
    stop("No sequence lines found before the first consensus line",
         call. = FALSE)
  }

  taxa <- sub(" +$", "", substr(X[seq_len(num.seq)], 1, start.seq - 1))

  ## remove the sequence names
  X <- substr(X, start.seq, nchar(X))

  ## number of lines of sequences (including consensus lines)
  nl <- length(X)

  ## sequence i occupies lines i, i + (num.seq + 1), i + 2 * (num.seq + 1), ...
  ## seq_len() keeps this correct for a single-sequence alignment, where
  ## 2:num.seq would count backwards
  rows <- lapply(seq_len(num.seq), function(i) {
    getSeq(X[seq(i, nl, by = num.seq + 1)])
  })

  ## refuse ragged input instead of silently recycling shorter sequences
  seq.len <- lengths(rows)
  if (any(seq.len != seq.len[1])) {
    stop("Sequences in clustal file do not all have the same length",
         call. = FALSE)
  }

  seqs <- matrix(unlist(rows), nrow = num.seq, ncol = seq.len[1],
                 byrow = TRUE)
  rownames(seqs) <- taxa
  as.proteinbin(seqs)
}