From: Don Armstrong Date: Sat, 27 Sep 2014 01:14:12 +0000 (-0700) Subject: add read clustal and protein X-Git-Url: https://git.donarmstrong.com/?p=plotprotein.git;a=commitdiff_plain;h=a963ccc72e08ddb3dc35a65060f76334f36183c0 add read clustal and protein --- a963ccc72e08ddb3dc35a65060f76334f36183c0 diff --git a/R/protein.R b/R/protein.R new file mode 100644 index 0000000..ff00fc7 --- /dev/null +++ b/R/protein.R @@ -0,0 +1,322 @@ +#' [.proteinbin +#' +#' +"[.proteinbin" <- function(x,i,j,drop=FALSE) { + ans <- NextMethod("[",drop=drop) + class(ans) <- "proteinbin" + ans +} + +as.matrix.proteinbin <- function(x,...) { + if (is.matrix(x)) return(x) + if (is.vector(x)) { + dim(x) <- c(1, length(x)) + return(x) + } + s <- unique(unlist(lapply(x, length))) + if (length(s) != 1) + stop("Protein sequences in list not of the same length.") + nms <- names(x) + n <- length(x) + y <- matrix(as.raw(0), n, s) + for (i in seq_len(n)) y[i, ] <- x[[i]] + rownames(y) <- nms + class(y) <- "proteinbin" + y +} + +as.list.proteinbin <- function(x, ...) +{ + if (is.list(x)) return(x) + if (is.null(dim(x))) obj <- list(x) # cause is.vector() doesn't work + else { # matrix + n <- nrow(x) + obj <- vector("list", n) + for (i in 1:n) obj[[i]] <- x[i, ] + names(obj) <- rownames(x) + } + class(obj) <- "proteinbin" + obj +} + +rbind.proteinbin <- function(...) +### works only with matrices for the moment +{ + obj <- list(...) + n <- length(obj) + if (n == 1) return(obj[[1]]) + for (i in 1:n) + if (!is.matrix(obj[[i]])) + stop("the 'rbind' method for \"proteinbin\" accepts only matrices") + NC <- unlist(lapply(obj, ncol)) + if (length(unique(NC)) > 1) + stop("matrices do not have the same number of columns.") + for (i in 1:n) class(obj[[i]]) <- NULL + for (i in 2:n) obj[[1]] <- rbind(obj[[1]], obj[[i]]) + structure(obj[[1]], class = "proteinbin") +} + +cbind.proteinbin <- + function(..., check.names = TRUE, fill.with.gaps = FALSE, + quiet = FALSE) +### works only with matrices for the moment +{ + obj <- list(...) + n <- length(obj) + if (n == 1) return(obj[[1]]) + for (i in 1:n) + if (!is.matrix(obj[[i]])) + stop("the 'cbind' method for \"proteinbin\" accepts only matrices") + NR <- unlist(lapply(obj, nrow)) + for (i in 1:n) class(obj[[i]]) <- NULL + if (check.names) { + nms <- unlist(lapply(obj, rownames)) + if (fill.with.gaps) { + NC <- unlist(lapply(obj, ncol)) + nms <- unique(nms) + ans <- matrix(as.raw(4), length(nms), sum(NC)) + rownames(ans) <- nms + from <- 1 + for (i in 1:n) { + to <- from + NC[i] - 1 + tmp <- rownames(obj[[i]]) + nmsi <- tmp[tmp %in% nms] + ans[nmsi, from:to] <- obj[[i]][nmsi, , drop = FALSE] + from <- to + 1 + } + } else { + tab <- table(nms) + ubi <- tab == n + nms <- names(tab)[which(ubi)] + ans <- obj[[1]][nms, , drop = FALSE] + for (i in 2:n) + ans <- cbind(ans, obj[[i]][nms, , drop = FALSE]) + if (!quiet && !all(ubi)) + warning("some rows were dropped.") + } + } else { + if (length(unique(NR)) > 1) + stop("matrices do not have the same number of rows.") + ans <- matrix(unlist(obj), NR) + rownames(ans) <- rownames(obj[[1]]) + } + class(ans) <- "proteinbin" + ans +} + +c.proteinbin <- function(..., recursive = FALSE) +{ + if (!all(unlist(lapply(list(...), is.list)))) + stop("the 'c' method for \"proteinbin\" accepts only lists") + structure(NextMethod("c"), class = "proteinbin") +} + +print.proteinbin <- function(x, printlen = 6, digits = 3, ...) +{ + if (is.list(x)) { + n <- length(x) + nms <- names(x) + if (n == 1) { + cat("1 Protein sequence stored in a list.\n\n") + nTot <- length(x[[1]]) + cat("Sequence length:", nTot, "\n\n") + cat("Label:", nms, "\n\n") + } else { + cat(n, "Protein sequences stored in a list.\n\n") + tmp <- unlist(lapply(x, length)) + nTot <- sum(tmp) + mini <- range(tmp) + maxi <- mini[2] + mini <- mini[1] + if (mini == maxi) + cat("All sequences of same length:", maxi, "\n") + else { + cat("Mean sequence length:", round(mean(tmp), 3), "\n") + cat(" Shortest sequence:", mini, "\n") + cat(" Longest sequence:", maxi, "\n") + } + TAIL <- "\n\n" + if (printlen < n) { + nms <- nms[1:printlen] + TAIL <- "...\n\n" + } + cat("\nLabels:", paste(nms, collapse = " "), TAIL) + } + } else { + nTot <- length(x) + if (is.matrix(x)) { + nd <- dim(x) + nms <- rownames(x) + cat(nd[1], "Protein sequences stored in a matrix.\n\n") + cat("All sequences of same length:", nd[2], "\n") + TAIL <- "\n\n" + if (printlen < nd[1]) { + nms <- nms[1:printlen] + TAIL <- "...\n\n" + } + cat("\nLabels:", paste(nms, collapse = " "), TAIL) + } else { + cat("1 Protein sequence in binary format stored in a vector.\n\n") + cat("Sequence length:", nTot, "\n\n") + } + } + if (nTot <= 1e6) { + cat("Base composition:\n") + print(round(base.freq(x), digits)) + } else cat("More than 1 million nucleotides: not printing base composition\n") +} + +as.proteinbin <- function(x, ...) UseMethod("as.proteinbin") + +._letters_ <- + c("A","B","C","D","E","F","G","H","I","K","L","M","N","P", + "Q","R","S","T","V","W","X","Y","Z","-") +._letters_alias_ <- + list("?" = "X", + "." = "-") + +### from http://en.wikipedia.org/wiki/Amino_acid and http://www.bioinformatics.org/sms2/iupac.html +._aa_3symbol_ <- + list("A"="Ala", + "B"="Asx", + "R"="Arg", + "N"="Asn", + "D"="Asp", + "C"="Cys", + "E"="Glu", + "Q"="Gln", + "G"="Gly", + "H"="His", + "I"="Ile", + "L"="Leu", + "K"="Lys", + "M"="Met", + "F"="Phe", + "P"="Pro", + "S"="Ser", + "T"="Thr", + "W"="Trp", + "Y"="Tyr", + "V"="Val", + "X"="Xaa", + "Z"="Glx", + "-"="Gap") +._aa_names_ <- + list( + "A"="Alanine", + "B"="Apartic acid or Aspargine", + "R"="Arginine", + "N"="Asparagine", + "D"="Aspartic acid", + "C"="Cysteine", + "E"="Glutamic acid", + "Q"="Glutamine", + "G"="Glycine", + "H"="Histidine", + "I"="Isoleucine", + "L"="Leucine", + "K"="Lysine", + "M"="Methionine", + "F"="Phenylalanine", + "P"="Proline", + "S"="Serine", + "T"="Threonine", + "W"="Tryptophan", + "Y"="Tyrosine", + "V"="Valine", + "X"="Any amino acid", + "Z"="Glutamine or Glutamic acid", + "-"="Gap in alignment") + +as.proteinbin.character <- function(x, ...) +{ + ## + if (any(nchar(x)>1)) { + if (is.matrix(x)) { + stop("Cannot convert a matrix of multiple characters into a sequence object") + } + x <- do.call(rbind,strsplit(x,"")) + } + ans <- x + for(alias in names(._letters_alias_)) { + ans[ans==alias] <- ._letters_alias_[[alias]] + } + if (any(!(ans %in% ._letters_))) { + stop("invalid characters for protein sequence") + } + + if (is.matrix(x)) { + dim(ans) <- dim(x) + dimnames(ans) <- dimnames(x) + } + class(ans) <- "proteinbin" + ans +} + + +as.proteinbin.list <- function(x, ...) +{ + obj <- lapply(x, as.proteinbin) + class(obj) <- "proteinbin" + obj +} + +## they're already characters +as.character.proteinbin <- function(x, ...) +{ + f <- function(xx) { + xx + } + if (is.list(x)) lapply(x, f) else f(x) +} + +base.freq <- function(x, freq = FALSE, all = FALSE) +{ + f <- function(x) { + ans <- table(x,deparse.level=0) + } + + if (is.list(x)) { + BF <- rowSums(sapply(x, f)) + n <- sum(sapply(x, length)) + } else { + n <- length(x) + BF <- f(x) + } + + if (all) { + if (!freq) BF <- BF / n + } else { + if (!freq) BF <- BF / sum(BF) + } + BF +} + + +### where <- function(x, pattern) +### { +### pat <- as.proteinbin(strsplit(pattern, NULL)[[1]]) +### p <- as.integer(length(pat)) +### +### foo <- function(x, pat, p) { +### s <- as.integer(length(x)) +### if (s < p) stop("sequence shorter than the pattern") +### ans <- .C("where", x, pat, s, p, integer(s), 0L, +### DUP = FALSE, NAOK = TRUE, PACKAGE = "ape") +### n <- ans[[6]] +### if (n) ans[[5]][seq_len(n)] - p + 2L else integer() +### } +### +### if (is.list(x)) return(lapply(x, foo, pat = pat, p = p)) +### if (is.matrix(x)) { +### n <- nrow(x) +### res <- vector("list", n) +### for (i in seq_along(n)) +### res[[i]] <- foo(x[i, , drop = TRUE], pat, p) +### names(res) <- rownames(x) +### return(res) +### } +### foo(x, pat, p) # if x is a vector +### } + + diff --git a/R/read.clustal.R b/R/read.clustal.R new file mode 100644 index 0000000..6f76acf --- /dev/null +++ b/R/read.clustal.R @@ -0,0 +1,49 @@ +#' Read a file in clustal format +#' +#' \code{read.clustal} reads an alignment file in clustal format +#' +#' +#' @param file file name of the clustal alignment +#' @export +#' @examples +#' \dontrun{ +#' clust.align <- read.clustal("clustal_alignment.txt") +#' } +read.clustal <- function(file,...) { + ## stolen from ape's read.dna.R + findFirstSeq <- function(x) { + ## actually find the 1st non-blank character + tmp <- regexpr("[[:blank:]]+", x[1]) # consider only a single string + tmp[1] + attr(tmp, "match.length") + } + getSeq <- function(x) { + x <- gsub(" ", "", x) + x <- strsplit(x, NULL) + toupper(unlist(x)) + } + + X <- scan(file = file, what = "", sep = "\n", quiet = TRUE) + if (!all(grepl("^CLUSTAL",X[1]))) + stop("Doesn't appear to be a file in clustal format") + ## The first line contains CLUSTAL, and isn't interesting + X <- X[-1] + start.seq <- findFirstSeq(X) + ## now, find how many sequences there are + leading.spaces <- paste("^ {",start.seq-1,"}",sep="") + stars <- grep(leading.spaces, X) + num.seq <- stars[1]-1 + taxa <- gsub(" *$","",substr(X[1:num.seq],1,start.seq-1)) + ## remove the sequence names + X <- substr(X,start.seq,nchar(X)) + ## number of lines of sequences + nl <- length(X) + ## sequence length + first.seq <- getSeq(X[seq(1,nl,num.seq+1)]) + seqs <- matrix("",num.seq,length(first.seq)) + seqs[1,] <- first.seq + for (i in 2:num.seq) { + seqs[i,] <- getSeq(X[seq(i,nl,num.seq+1)]) + } + rownames(seqs) <- taxa + as.proteinbin(seqs) +}