+#' [.proteinbin
+#'
+#'
+"[.proteinbin" <- function(x,i,j,drop=FALSE) {
+ ans <- NextMethod("[",drop=drop)
+ class(ans) <- "proteinbin"
+ ans
+}
+
+as.matrix.proteinbin <- function(x,...) {
+ if (is.matrix(x)) return(x)
+ if (is.vector(x)) {
+ dim(x) <- c(1, length(x))
+ return(x)
+ }
+ s <- unique(unlist(lapply(x, length)))
+ if (length(s) != 1)
+ stop("Protein sequences in list not of the same length.")
+ nms <- names(x)
+ n <- length(x)
+ y <- matrix(as.raw(0), n, s)
+ for (i in seq_len(n)) y[i, ] <- x[[i]]
+ rownames(y) <- nms
+ class(y) <- "proteinbin"
+ y
+}
+
+as.list.proteinbin <- function(x, ...)
+{
+ if (is.list(x)) return(x)
+ if (is.null(dim(x))) obj <- list(x) # cause is.vector() doesn't work
+ else { # matrix
+ n <- nrow(x)
+ obj <- vector("list", n)
+ for (i in 1:n) obj[[i]] <- x[i, ]
+ names(obj) <- rownames(x)
+ }
+ class(obj) <- "proteinbin"
+ obj
+}
+
+rbind.proteinbin <- function(...)
+### works only with matrices for the moment
+{
+ obj <- list(...)
+ n <- length(obj)
+ if (n == 1) return(obj[[1]])
+ for (i in 1:n)
+ if (!is.matrix(obj[[i]]))
+ stop("the 'rbind' method for \"proteinbin\" accepts only matrices")
+ NC <- unlist(lapply(obj, ncol))
+ if (length(unique(NC)) > 1)
+ stop("matrices do not have the same number of columns.")
+ for (i in 1:n) class(obj[[i]]) <- NULL
+ for (i in 2:n) obj[[1]] <- rbind(obj[[1]], obj[[i]])
+ structure(obj[[1]], class = "proteinbin")
+}
+
+cbind.proteinbin <-
+ function(..., check.names = TRUE, fill.with.gaps = FALSE,
+ quiet = FALSE)
+### works only with matrices for the moment
+{
+ obj <- list(...)
+ n <- length(obj)
+ if (n == 1) return(obj[[1]])
+ for (i in 1:n)
+ if (!is.matrix(obj[[i]]))
+ stop("the 'cbind' method for \"proteinbin\" accepts only matrices")
+ NR <- unlist(lapply(obj, nrow))
+ for (i in 1:n) class(obj[[i]]) <- NULL
+ if (check.names) {
+ nms <- unlist(lapply(obj, rownames))
+ if (fill.with.gaps) {
+ NC <- unlist(lapply(obj, ncol))
+ nms <- unique(nms)
+ ans <- matrix(as.raw(4), length(nms), sum(NC))
+ rownames(ans) <- nms
+ from <- 1
+ for (i in 1:n) {
+ to <- from + NC[i] - 1
+ tmp <- rownames(obj[[i]])
+ nmsi <- tmp[tmp %in% nms]
+ ans[nmsi, from:to] <- obj[[i]][nmsi, , drop = FALSE]
+ from <- to + 1
+ }
+ } else {
+ tab <- table(nms)
+ ubi <- tab == n
+ nms <- names(tab)[which(ubi)]
+ ans <- obj[[1]][nms, , drop = FALSE]
+ for (i in 2:n)
+ ans <- cbind(ans, obj[[i]][nms, , drop = FALSE])
+ if (!quiet && !all(ubi))
+ warning("some rows were dropped.")
+ }
+ } else {
+ if (length(unique(NR)) > 1)
+ stop("matrices do not have the same number of rows.")
+ ans <- matrix(unlist(obj), NR)
+ rownames(ans) <- rownames(obj[[1]])
+ }
+ class(ans) <- "proteinbin"
+ ans
+}
+
+c.proteinbin <- function(..., recursive = FALSE)
+{
+ if (!all(unlist(lapply(list(...), is.list))))
+ stop("the 'c' method for \"proteinbin\" accepts only lists")
+ structure(NextMethod("c"), class = "proteinbin")
+}
+
+print.proteinbin <- function(x, printlen = 6, digits = 3, ...)
+{
+ if (is.list(x)) {
+ n <- length(x)
+ nms <- names(x)
+ if (n == 1) {
+ cat("1 Protein sequence stored in a list.\n\n")
+ nTot <- length(x[[1]])
+ cat("Sequence length:", nTot, "\n\n")
+ cat("Label:", nms, "\n\n")
+ } else {
+ cat(n, "Protein sequences stored in a list.\n\n")
+ tmp <- unlist(lapply(x, length))
+ nTot <- sum(tmp)
+ mini <- range(tmp)
+ maxi <- mini[2]
+ mini <- mini[1]
+ if (mini == maxi)
+ cat("All sequences of same length:", maxi, "\n")
+ else {
+ cat("Mean sequence length:", round(mean(tmp), 3), "\n")
+ cat(" Shortest sequence:", mini, "\n")
+ cat(" Longest sequence:", maxi, "\n")
+ }
+ TAIL <- "\n\n"
+ if (printlen < n) {
+ nms <- nms[1:printlen]
+ TAIL <- "...\n\n"
+ }
+ cat("\nLabels:", paste(nms, collapse = " "), TAIL)
+ }
+ } else {
+ nTot <- length(x)
+ if (is.matrix(x)) {
+ nd <- dim(x)
+ nms <- rownames(x)
+ cat(nd[1], "Protein sequences stored in a matrix.\n\n")
+ cat("All sequences of same length:", nd[2], "\n")
+ TAIL <- "\n\n"
+ if (printlen < nd[1]) {
+ nms <- nms[1:printlen]
+ TAIL <- "...\n\n"
+ }
+ cat("\nLabels:", paste(nms, collapse = " "), TAIL)
+ } else {
+ cat("1 Protein sequence in binary format stored in a vector.\n\n")
+ cat("Sequence length:", nTot, "\n\n")
+ }
+ }
+ if (nTot <= 1e6) {
+ cat("Base composition:\n")
+ print(round(base.freq(x), digits))
+ } else cat("More than 1 million nucleotides: not printing base composition\n")
+}
+
+as.proteinbin <- function(x, ...) UseMethod("as.proteinbin")
+
+._letters_ <-
+ c("A","B","C","D","E","F","G","H","I","K","L","M","N","P",
+ "Q","R","S","T","V","W","X","Y","Z","-")
+._letters_alias_ <-
+ list("?" = "X",
+ "." = "-")
+
+### from http://en.wikipedia.org/wiki/Amino_acid and http://www.bioinformatics.org/sms2/iupac.html
+._aa_3symbol_ <-
+ list("A"="Ala",
+ "B"="Asx",
+ "R"="Arg",
+ "N"="Asn",
+ "D"="Asp",
+ "C"="Cys",
+ "E"="Glu",
+ "Q"="Gln",
+ "G"="Gly",
+ "H"="His",
+ "I"="Ile",
+ "L"="Leu",
+ "K"="Lys",
+ "M"="Met",
+ "F"="Phe",
+ "P"="Pro",
+ "S"="Ser",
+ "T"="Thr",
+ "W"="Trp",
+ "Y"="Tyr",
+ "V"="Val",
+ "X"="Xaa",
+ "Z"="Glx",
+ "-"="Gap")
+._aa_names_ <-
+ list(
+ "A"="Alanine",
+ "B"="Apartic acid or Aspargine",
+ "R"="Arginine",
+ "N"="Asparagine",
+ "D"="Aspartic acid",
+ "C"="Cysteine",
+ "E"="Glutamic acid",
+ "Q"="Glutamine",
+ "G"="Glycine",
+ "H"="Histidine",
+ "I"="Isoleucine",
+ "L"="Leucine",
+ "K"="Lysine",
+ "M"="Methionine",
+ "F"="Phenylalanine",
+ "P"="Proline",
+ "S"="Serine",
+ "T"="Threonine",
+ "W"="Tryptophan",
+ "Y"="Tyrosine",
+ "V"="Valine",
+ "X"="Any amino acid",
+ "Z"="Glutamine or Glutamic acid",
+ "-"="Gap in alignment")
+
+as.proteinbin.character <- function(x, ...)
+{
+ ##
+ if (any(nchar(x)>1)) {
+ if (is.matrix(x)) {
+ stop("Cannot convert a matrix of multiple characters into a sequence object")
+ }
+ x <- do.call(rbind,strsplit(x,""))
+ }
+ ans <- x
+ for(alias in names(._letters_alias_)) {
+ ans[ans==alias] <- ._letters_alias_[[alias]]
+ }
+ if (any(!(ans %in% ._letters_))) {
+ stop("invalid characters for protein sequence")
+ }
+
+ if (is.matrix(x)) {
+ dim(ans) <- dim(x)
+ dimnames(ans) <- dimnames(x)
+ }
+ class(ans) <- "proteinbin"
+ ans
+}
+
+
+as.proteinbin.list <- function(x, ...)
+{
+ obj <- lapply(x, as.proteinbin)
+ class(obj) <- "proteinbin"
+ obj
+}
+
+## they're already characters
+as.character.proteinbin <- function(x, ...)
+{
+ f <- function(xx) {
+ xx
+ }
+ if (is.list(x)) lapply(x, f) else f(x)
+}
+
+base.freq <- function(x, freq = FALSE, all = FALSE)
+{
+ f <- function(x) {
+ ans <- table(x,deparse.level=0)
+ }
+
+ if (is.list(x)) {
+ BF <- rowSums(sapply(x, f))
+ n <- sum(sapply(x, length))
+ } else {
+ n <- length(x)
+ BF <- f(x)
+ }
+
+ if (all) {
+ if (!freq) BF <- BF / n
+ } else {
+ if (!freq) BF <- BF / sum(BF)
+ }
+ BF
+}
+
+
+### where <- function(x, pattern)
+### {
+### pat <- as.proteinbin(strsplit(pattern, NULL)[[1]])
+### p <- as.integer(length(pat))
+###
+### foo <- function(x, pat, p) {
+### s <- as.integer(length(x))
+### if (s < p) stop("sequence shorter than the pattern")
+### ans <- .C("where", x, pat, s, p, integer(s), 0L,
+### DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")
+### n <- ans[[6]]
+### if (n) ans[[5]][seq_len(n)] - p + 2L else integer()
+### }
+###
+### if (is.list(x)) return(lapply(x, foo, pat = pat, p = p))
+### if (is.matrix(x)) {
+### n <- nrow(x)
+### res <- vector("list", n)
+### for (i in seq_along(n))
+### res[[i]] <- foo(x[i, , drop = TRUE], pat, p)
+### names(res) <- rownames(x)
+### return(res)
+### }
+### foo(x, pat, p) # if x is a vector
+### }
+
+