add read clustal and protein

author Don Armstrong <don@donarmstrong.com>

Sat, 27 Sep 2014 01:14:12 +0000 (18:14 -0700)

committer Don Armstrong <don@donarmstrong.com>

Sat, 27 Sep 2014 01:14:12 +0000 (18:14 -0700)
author Don Armstrong <don@donarmstrong.com>
Sat, 27 Sep 2014 01:14:12 +0000 (18:14 -0700)
committer Don Armstrong <don@donarmstrong.com>
Sat, 27 Sep 2014 01:14:12 +0000 (18:14 -0700)
diff --git a/R/protein.R b/R/protein.R

new file mode 100644 (file)

index 0000000..ff00fc7
--- /dev/null
+++ b/R/protein.R
@@ -0,0 +1,322 @@
+#' [.proteinbin
+#'
+#'
+"[.proteinbin" <- function(x,i,j,drop=FALSE) {
+    ans <- NextMethod("[",drop=drop)
+    class(ans) <- "proteinbin"
+    ans
+}
+
+as.matrix.proteinbin <- function(x,...) {
+    if (is.matrix(x)) return(x)
+    if (is.vector(x)) {
+        dim(x) <- c(1, length(x))
+        return(x)
+    }
+    s <- unique(unlist(lapply(x, length)))
+    if (length(s) != 1)
+        stop("Protein sequences in list not of the same length.")
+    nms <- names(x)
+    n <- length(x)
+    y <- matrix(as.raw(0), n, s)
+    for (i in seq_len(n)) y[i, ] <- x[[i]]
+    rownames(y) <- nms
+    class(y) <- "proteinbin"
+    y
+}
+
+as.list.proteinbin <- function(x, ...)
+{
+    if (is.list(x)) return(x)
+    if (is.null(dim(x))) obj <- list(x) # cause is.vector() doesn't work
+    else { # matrix
+        n <- nrow(x)
+        obj <- vector("list", n)
+        for (i in 1:n) obj[[i]] <- x[i, ]
+        names(obj) <- rownames(x)
+    }
+    class(obj) <- "proteinbin"
+    obj
+}
+
+rbind.proteinbin <- function(...)
+### works only with matrices for the moment
+{
+    obj <- list(...)
+    n <- length(obj)
+    if (n == 1) return(obj[[1]])
+    for (i in 1:n)
+        if (!is.matrix(obj[[i]]))
+            stop("the 'rbind' method for \"proteinbin\" accepts only matrices")
+    NC <- unlist(lapply(obj, ncol))
+    if (length(unique(NC)) > 1)
+        stop("matrices do not have the same number of columns.")
+    for (i in 1:n) class(obj[[i]]) <- NULL
+    for (i in 2:n) obj[[1]] <- rbind(obj[[1]], obj[[i]])
+    structure(obj[[1]], class = "proteinbin")
+}
+
+cbind.proteinbin <-
+    function(..., check.names = TRUE, fill.with.gaps = FALSE,
+             quiet = FALSE)
+### works only with matrices for the moment
+{
+    obj <- list(...)
+    n <- length(obj)
+    if (n == 1) return(obj[[1]])
+    for (i in 1:n)
+        if (!is.matrix(obj[[i]]))
+            stop("the 'cbind' method for \"proteinbin\" accepts only matrices")
+    NR <- unlist(lapply(obj, nrow))
+    for (i in 1:n) class(obj[[i]]) <- NULL
+    if (check.names) {
+        nms <- unlist(lapply(obj, rownames))
+        if (fill.with.gaps) {
+            NC <- unlist(lapply(obj, ncol))
+            nms <- unique(nms)
+            ans <- matrix(as.raw(4), length(nms), sum(NC))
+            rownames(ans) <- nms
+            from <- 1
+            for (i in 1:n) {
+                to <- from + NC[i] - 1
+                tmp <- rownames(obj[[i]])
+                nmsi <- tmp[tmp %in% nms]
+                ans[nmsi, from:to] <- obj[[i]][nmsi, , drop = FALSE]
+                from <- to + 1
+            }
+        } else {
+            tab <- table(nms)
+            ubi <- tab == n
+            nms <- names(tab)[which(ubi)]
+            ans <- obj[[1]][nms, , drop = FALSE]
+            for (i in 2:n)
+                ans <- cbind(ans, obj[[i]][nms, , drop = FALSE])
+            if (!quiet && !all(ubi))
+                warning("some rows were dropped.")
+        }
+    } else {
+        if (length(unique(NR)) > 1)
+            stop("matrices do not have the same number of rows.")
+        ans <- matrix(unlist(obj), NR)
+        rownames(ans) <- rownames(obj[[1]])
+    }
+    class(ans) <- "proteinbin"
+    ans
+}
+
+c.proteinbin <- function(..., recursive = FALSE)
+{
+    if (!all(unlist(lapply(list(...), is.list))))
+        stop("the 'c' method for \"proteinbin\" accepts only lists")
+    structure(NextMethod("c"), class = "proteinbin")
+}
+
+print.proteinbin <- function(x, printlen = 6, digits = 3, ...)
+{
+    if (is.list(x)) {
+        n <- length(x)
+        nms <- names(x)
+        if (n == 1) {
+            cat("1 Protein sequence stored in a list.\n\n")
+            nTot <- length(x[[1]])
+            cat("Sequence length:", nTot, "\n\n")
+            cat("Label:", nms, "\n\n")
+        } else {
+            cat(n, "Protein sequences stored in a list.\n\n")
+            tmp <- unlist(lapply(x, length))
+            nTot <- sum(tmp)
+            mini <- range(tmp)
+            maxi <- mini[2]
+            mini <- mini[1]
+            if (mini == maxi)
+                cat("All sequences of same length:", maxi, "\n")
+            else {
+                cat("Mean sequence length:", round(mean(tmp), 3), "\n")
+                cat("   Shortest sequence:", mini, "\n")
+                cat("    Longest sequence:", maxi, "\n")
+            }
+            TAIL <- "\n\n"
+            if (printlen < n) {
+                nms <- nms[1:printlen]
+                TAIL <- "...\n\n"
+            }
+            cat("\nLabels:", paste(nms, collapse = " "), TAIL)
+        }
+    } else {
+        nTot <- length(x)
+        if (is.matrix(x)) {
+            nd <- dim(x)
+            nms <- rownames(x)
+            cat(nd[1], "Protein sequences stored in a matrix.\n\n")
+            cat("All sequences of same length:", nd[2], "\n")
+            TAIL <- "\n\n"
+            if (printlen < nd[1]) {
+                nms <- nms[1:printlen]
+                TAIL <- "...\n\n"
+            }
+            cat("\nLabels:", paste(nms, collapse = " "), TAIL)
+        } else {
+            cat("1 Protein sequence in binary format stored in a vector.\n\n")
+            cat("Sequence length:", nTot, "\n\n")
+        }
+    }
+    if (nTot <= 1e6) {
+        cat("Base composition:\n")
+        print(round(base.freq(x), digits))
+    } else cat("More than 1 million nucleotides: not printing base composition\n")
+}
+
+as.proteinbin <- function(x, ...) UseMethod("as.proteinbin")
+
+._letters_ <-
+    c("A","B","C","D","E","F","G","H","I","K","L","M","N","P",
+      "Q","R","S","T","V","W","X","Y","Z","-")
+._letters_alias_ <-
+    list("?" = "X",
+         "." = "-")
+
+### from http://en.wikipedia.org/wiki/Amino_acid and http://www.bioinformatics.org/sms2/iupac.html
+._aa_3symbol_ <-
+    list("A"="Ala",
+         "B"="Asx",
+         "R"="Arg",
+         "N"="Asn",
+         "D"="Asp",
+         "C"="Cys",
+         "E"="Glu",
+         "Q"="Gln",
+         "G"="Gly",
+         "H"="His",
+         "I"="Ile",
+         "L"="Leu",
+         "K"="Lys",
+         "M"="Met",
+         "F"="Phe",
+         "P"="Pro",
+         "S"="Ser",
+         "T"="Thr",
+         "W"="Trp",
+         "Y"="Tyr",
+         "V"="Val",
+         "X"="Xaa",
+         "Z"="Glx",
+         "-"="Gap")
+._aa_names_ <-
+    list(
+        "A"="Alanine",
+        "B"="Apartic acid or Aspargine",
+        "R"="Arginine",     
+        "N"="Asparagine",   
+        "D"="Aspartic acid",
+        "C"="Cysteine",     
+        "E"="Glutamic acid",
+        "Q"="Glutamine",    
+        "G"="Glycine",      
+        "H"="Histidine",    
+        "I"="Isoleucine",   
+        "L"="Leucine",      
+        "K"="Lysine",       
+        "M"="Methionine",   
+        "F"="Phenylalanine",
+        "P"="Proline",      
+        "S"="Serine",       
+        "T"="Threonine",    
+        "W"="Tryptophan",   
+        "Y"="Tyrosine",
+        "V"="Valine",
+        "X"="Any amino acid",
+        "Z"="Glutamine or Glutamic acid",
+        "-"="Gap in alignment")
+
+as.proteinbin.character <- function(x, ...)
+{
+    ##
+    if (any(nchar(x)>1)) {
+        if (is.matrix(x)) {
+            stop("Cannot convert a matrix of multiple characters into a sequence object")
+        }
+        x <- do.call(rbind,strsplit(x,""))
+    }
+    ans <- x
+    for(alias in names(._letters_alias_)) {
+        ans[ans==alias] <- ._letters_alias_[[alias]]
+    }
+    if (any(!(ans %in% ._letters_))) {
+        stop("invalid characters for protein sequence")
+    }
+    
+    if (is.matrix(x)) {
+        dim(ans) <- dim(x)
+        dimnames(ans) <- dimnames(x)
+    }
+    class(ans) <- "proteinbin"
+    ans
+}
+
+
+as.proteinbin.list <- function(x, ...)
+{
+    obj <- lapply(x, as.proteinbin)
+    class(obj) <- "proteinbin"
+    obj
+}
+
+## they're already characters
+as.character.proteinbin <- function(x, ...)
+{
+    f <- function(xx) {
+        xx
+    }
+    if (is.list(x)) lapply(x, f) else f(x)
+}
+
+base.freq <- function(x, freq = FALSE, all = FALSE)
+{
+    f <- function(x) {
+        ans <- table(x,deparse.level=0)
+    }
+
+    if (is.list(x)) {
+        BF <- rowSums(sapply(x, f))
+        n <- sum(sapply(x, length))
+    } else {
+        n <- length(x)
+        BF <- f(x)
+    }
+
+    if (all) {
+        if (!freq) BF <- BF / n
+    } else {
+        if (!freq) BF <- BF / sum(BF)
+    }
+    BF
+}
+
+
+### where <- function(x, pattern)
+### {
+###     pat <- as.proteinbin(strsplit(pattern, NULL)[[1]])
+###     p <- as.integer(length(pat))
+### 
+###     foo <- function(x, pat, p) {
+###         s <- as.integer(length(x))
+###         if (s < p) stop("sequence shorter than the pattern")
+###         ans <- .C("where", x, pat, s, p, integer(s), 0L,
+###                   DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")
+###         n <- ans[[6]]
+###         if (n) ans[[5]][seq_len(n)] - p + 2L else integer()
+###     }
+### 
+###     if (is.list(x)) return(lapply(x, foo, pat = pat, p = p))
+###     if (is.matrix(x)) {
+###         n <- nrow(x)
+###         res <- vector("list", n)
+###         for (i in seq_along(n))
+###             res[[i]] <- foo(x[i, , drop = TRUE], pat, p)
+###         names(res) <- rownames(x)
+###         return(res)
+###     }
+###     foo(x, pat, p) # if x is a vector
+### }
+
+    
diff --git a/R/read.clustal.R b/R/read.clustal.R

new file mode 100644 (file)

index 0000000..6f76acf
--- /dev/null
+++ b/R/read.clustal.R
@@ -0,0 +1,49 @@
+#' Read a file in clustal format
+#'
+#' \code{read.clustal} reads an alignment file in clustal format
+#' 
+#' 
+#' @param file file name of the clustal alignment
+#' @export
+#' @examples
+#' \dontrun{
+#' clust.align <- read.clustal("clustal_alignment.txt")
+#' }
+read.clustal <- function(file,...) {
+    ## stolen from ape's read.dna.R
+    findFirstSeq <- function(x) {
+        ## actually find the 1st non-blank character
+        tmp <- regexpr("[[:blank:]]+", x[1]) # consider only a single string
+        tmp[1] + attr(tmp, "match.length")
+    }
+    getSeq <- function(x) {
+        x <- gsub(" ", "", x)
+        x <- strsplit(x, NULL)
+        toupper(unlist(x))
+    }
+
+    X <- scan(file = file, what = "", sep = "\n", quiet = TRUE)
+    if (!all(grepl("^CLUSTAL",X[1])))
+        stop("Doesn't appear to be a file in clustal format")
+    ## The first line contains CLUSTAL, and isn't interesting
+    X <- X[-1]
+    start.seq <- findFirstSeq(X)
+    ## now, find how many sequences there are
+    leading.spaces <- paste("^ {",start.seq-1,"}",sep="")
+    stars <- grep(leading.spaces, X)
+    num.seq <- stars[1]-1
+    taxa <- gsub(" *$","",substr(X[1:num.seq],1,start.seq-1))
+    ## remove the sequence names
+    X <- substr(X,start.seq,nchar(X))
+    ## number of lines of sequences
+    nl <- length(X)
+    ## sequence length
+    first.seq <- getSeq(X[seq(1,nl,num.seq+1)])
+    seqs <- matrix("",num.seq,length(first.seq))
+    seqs[1,] <- first.seq
+    for (i in 2:num.seq) {
+        seqs[i,] <- getSeq(X[seq(i,nl,num.seq+1)])
+    }
+    rownames(seqs) <- taxa
+    as.proteinbin(seqs)
+}
author	Don Armstrong <don@donarmstrong.com>
	Sat, 27 Sep 2014 01:14:12 +0000 (18:14 -0700)
committer	Don Armstrong <don@donarmstrong.com>
	Sat, 27 Sep 2014 01:14:12 +0000 (18:14 -0700)
R/protein.R	[new file with mode: 0644]	patch \| blob
R/read.clustal.R	[new file with mode: 0644]	patch \| blob