R/read.clustal.R

   1 #' Read a file in clustal format
   2 #'
   3 #' \code{read.clustal} reads an alignment file in clustal format
   4 #'
   5 #'
   6 #' @param file file name of the clustal alignment
   7 #' @export
   8 #' @examples
   9 #' \dontrun{
  10 #' clust.align <- read.clustal("clustal_alignment.txt")
  11 #' }
  12 read.clustal <- function(file,...) {
  13     ## stolen from ape's read.dna.R
  14     findFirstSeq <- function(x) {
  15         ## actually find the 1st non-blank character
  16         tmp <- regexpr("[[:blank:]]+", x[1]) # consider only a single string
  17         tmp[1] + attr(tmp, "match.length")
  18     }
  19     getSeq <- function(x) {
  20         x <- gsub(" ", "", x)
  21         x <- strsplit(x, NULL)
  22         toupper(unlist(x))
  23     }
  24
  25     X <- scan(file = file, what = "", sep = "\n", quiet = TRUE)
  26     if (!all(grepl("^CLUSTAL",X[1])))
  27         stop("Doesn't appear to be a file in clustal format")
  28     ## The first line contains CLUSTAL, and isn't interesting
  29     X <- X[-1]
  30     start.seq <- findFirstSeq(X)
  31     ## now, find how many sequences there are
  32     leading.spaces <- paste("^ {",start.seq-1,"}",sep="")
  33     stars <- grep(leading.spaces, X)
  34     num.seq <- stars[1]-1
  35     taxa <- gsub(" *$","",substr(X[1:num.seq],1,start.seq-1))
  36     ## remove the sequence names
  37     X <- substr(X,start.seq,nchar(X))
  38     ## number of lines of sequences
  39     nl <- length(X)
  40     ## sequence length
  41     first.seq <- getSeq(X[seq(1,nl,num.seq+1)])
  42     seqs <- matrix("",num.seq,length(first.seq))
  43     seqs[1,] <- first.seq
  44     for (i in 2:num.seq) {
  45         seqs[i,] <- getSeq(X[seq(i,nl,num.seq+1)])
  46     }
  47     rownames(seqs) <- taxa
  48     as.proteinbin(seqs)
  49 }