#' [.proteinbin #' #' "[.proteinbin" <- function(x,i,j,drop=FALSE) { ans <- NextMethod("[",drop=drop) class(ans) <- "proteinbin" ans } as.matrix.proteinbin <- function(x,...) { if (is.matrix(x)) return(x) if (is.vector(x)) { dim(x) <- c(1, length(x)) return(x) } s <- unique(unlist(lapply(x, length))) if (length(s) != 1) stop("Protein sequences in list not of the same length.") nms <- names(x) n <- length(x) y <- matrix(as.raw(0), n, s) for (i in seq_len(n)) y[i, ] <- x[[i]] rownames(y) <- nms class(y) <- "proteinbin" y } as.list.proteinbin <- function(x, ...) { if (is.list(x)) return(x) if (is.null(dim(x))) obj <- list(x) # cause is.vector() doesn't work else { # matrix n <- nrow(x) obj <- vector("list", n) for (i in 1:n) obj[[i]] <- x[i, ] names(obj) <- rownames(x) } class(obj) <- "proteinbin" obj } rbind.proteinbin <- function(...) ### works only with matrices for the moment { obj <- list(...) n <- length(obj) if (n == 1) return(obj[[1]]) for (i in 1:n) if (!is.matrix(obj[[i]])) stop("the 'rbind' method for \"proteinbin\" accepts only matrices") NC <- unlist(lapply(obj, ncol)) if (length(unique(NC)) > 1) stop("matrices do not have the same number of columns.") for (i in 1:n) class(obj[[i]]) <- NULL for (i in 2:n) obj[[1]] <- rbind(obj[[1]], obj[[i]]) structure(obj[[1]], class = "proteinbin") } cbind.proteinbin <- function(..., check.names = TRUE, fill.with.gaps = FALSE, quiet = FALSE) ### works only with matrices for the moment { obj <- list(...) n <- length(obj) if (n == 1) return(obj[[1]]) for (i in 1:n) if (!is.matrix(obj[[i]])) stop("the 'cbind' method for \"proteinbin\" accepts only matrices") NR <- unlist(lapply(obj, nrow)) for (i in 1:n) class(obj[[i]]) <- NULL if (check.names) { nms <- unlist(lapply(obj, rownames)) if (fill.with.gaps) { NC <- unlist(lapply(obj, ncol)) nms <- unique(nms) ans <- matrix(as.raw(4), length(nms), sum(NC)) rownames(ans) <- nms from <- 1 for (i in 1:n) { to <- from + NC[i] - 1 tmp <- rownames(obj[[i]]) nmsi <- tmp[tmp %in% nms] ans[nmsi, from:to] <- obj[[i]][nmsi, , drop = FALSE] from <- to + 1 } } else { tab <- table(nms) ubi <- tab == n nms <- names(tab)[which(ubi)] ans <- obj[[1]][nms, , drop = FALSE] for (i in 2:n) ans <- cbind(ans, obj[[i]][nms, , drop = FALSE]) if (!quiet && !all(ubi)) warning("some rows were dropped.") } } else { if (length(unique(NR)) > 1) stop("matrices do not have the same number of rows.") ans <- matrix(unlist(obj), NR) rownames(ans) <- rownames(obj[[1]]) } class(ans) <- "proteinbin" ans } c.proteinbin <- function(..., recursive = FALSE) { if (!all(unlist(lapply(list(...), is.list)))) stop("the 'c' method for \"proteinbin\" accepts only lists") structure(NextMethod("c"), class = "proteinbin") } print.proteinbin <- function(x, printlen = 6, digits = 3, ...) { if (is.list(x)) { n <- length(x) nms <- names(x) if (n == 1) { cat("1 Protein sequence stored in a list.\n\n") nTot <- length(x[[1]]) cat("Sequence length:", nTot, "\n\n") cat("Label:", nms, "\n\n") } else { cat(n, "Protein sequences stored in a list.\n\n") tmp <- unlist(lapply(x, length)) nTot <- sum(tmp) mini <- range(tmp) maxi <- mini[2] mini <- mini[1] if (mini == maxi) cat("All sequences of same length:", maxi, "\n") else { cat("Mean sequence length:", round(mean(tmp), 3), "\n") cat(" Shortest sequence:", mini, "\n") cat(" Longest sequence:", maxi, "\n") } TAIL <- "\n\n" if (printlen < n) { nms <- nms[1:printlen] TAIL <- "...\n\n" } cat("\nLabels:", paste(nms, collapse = " "), TAIL) } } else { nTot <- length(x) if (is.matrix(x)) { nd <- dim(x) nms <- rownames(x) cat(nd[1], "Protein sequences stored in a matrix.\n\n") cat("All sequences of same length:", nd[2], "\n") TAIL <- "\n\n" if (printlen < nd[1]) { nms <- nms[1:printlen] TAIL <- "...\n\n" } cat("\nLabels:", paste(nms, collapse = " "), TAIL) } else { cat("1 Protein sequence in binary format stored in a vector.\n\n") cat("Sequence length:", nTot, "\n\n") } } if (nTot <= 1e6) { cat("Base composition:\n") print(round(base.freq(x), digits)) } else cat("More than 1 million nucleotides: not printing base composition\n") } as.proteinbin <- function(x, ...) UseMethod("as.proteinbin") ._letters_ <- c("A","B","C","D","E","F","G","H","I","K","L","M","N","P", "Q","R","S","T","V","W","X","Y","Z","-") ._letters_alias_ <- list("?" = "X", "." = "-") ### from http://en.wikipedia.org/wiki/Amino_acid and http://www.bioinformatics.org/sms2/iupac.html ._aa_3symbol_ <- list("A"="Ala", "B"="Asx", "R"="Arg", "N"="Asn", "D"="Asp", "C"="Cys", "E"="Glu", "Q"="Gln", "G"="Gly", "H"="His", "I"="Ile", "L"="Leu", "K"="Lys", "M"="Met", "F"="Phe", "P"="Pro", "S"="Ser", "T"="Thr", "W"="Trp", "Y"="Tyr", "V"="Val", "X"="Xaa", "Z"="Glx", "-"="Gap") ._aa_names_ <- list( "A"="Alanine", "B"="Apartic acid or Aspargine", "R"="Arginine", "N"="Asparagine", "D"="Aspartic acid", "C"="Cysteine", "E"="Glutamic acid", "Q"="Glutamine", "G"="Glycine", "H"="Histidine", "I"="Isoleucine", "L"="Leucine", "K"="Lysine", "M"="Methionine", "F"="Phenylalanine", "P"="Proline", "S"="Serine", "T"="Threonine", "W"="Tryptophan", "Y"="Tyrosine", "V"="Valine", "X"="Any amino acid", "Z"="Glutamine or Glutamic acid", "-"="Gap in alignment") as.proteinbin.character <- function(x, ...) { ## if (any(nchar(x)>1)) { if (is.matrix(x)) { stop("Cannot convert a matrix of multiple characters into a sequence object") } x <- do.call(rbind,strsplit(x,"")) } ans <- x for(alias in names(._letters_alias_)) { ans[ans==alias] <- ._letters_alias_[[alias]] } if (any(!(ans %in% ._letters_))) { stop("invalid characters for protein sequence") } if (is.matrix(x)) { dim(ans) <- dim(x) dimnames(ans) <- dimnames(x) } class(ans) <- "proteinbin" ans } as.proteinbin.list <- function(x, ...) { obj <- lapply(x, as.proteinbin) class(obj) <- "proteinbin" obj } ## they're already characters as.character.proteinbin <- function(x, ...) { f <- function(xx) { xx } if (is.list(x)) lapply(x, f) else f(x) } base.freq <- function(x, freq = FALSE, all = FALSE) { f <- function(x) { ans <- table(x,deparse.level=0) } if (is.list(x)) { BF <- rowSums(sapply(x, f)) n <- sum(sapply(x, length)) } else { n <- length(x) BF <- f(x) } if (all) { if (!freq) BF <- BF / n } else { if (!freq) BF <- BF / sum(BF) } BF } ### where <- function(x, pattern) ### { ### pat <- as.proteinbin(strsplit(pattern, NULL)[[1]]) ### p <- as.integer(length(pat)) ### ### foo <- function(x, pat, p) { ### s <- as.integer(length(x)) ### if (s < p) stop("sequence shorter than the pattern") ### ans <- .C("where", x, pat, s, p, integer(s), 0L, ### DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") ### n <- ans[[6]] ### if (n) ans[[5]][seq_len(n)] - p + 2L else integer() ### } ### ### if (is.list(x)) return(lapply(x, foo, pat = pat, p = p)) ### if (is.matrix(x)) { ### n <- nrow(x) ### res <- vector("list", n) ### for (i in seq_along(n)) ### res[[i]] <- foo(x[i, , drop = TRUE], pat, p) ### names(res) <- rownames(x) ### return(res) ### } ### foo(x, pat, p) # if x is a vector ### }