From a1b67d97d7bf71af111e8675588c78dfc41a0bed Mon Sep 17 00:00:00 2001 From: paradis Date: Thu, 3 Jul 2008 15:13:42 +0000 Subject: [PATCH] new makeLabel() + modifs in read.dna() & write.dna() git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@39 6e262413-ae40-0410-9e79-b911bd7a66b7 --- Changes | 12 +++- DESCRIPTION | 2 +- R/makeLabel.R | 71 ++++++++++++++++++++++ R/read.dna.R | 99 ++++++++++++++++-------------- R/write.dna.R | 152 ++++++++++++++++++++--------------------------- man/makeLabel.Rd | 68 +++++++++++++++++++++ man/read.dna.Rd | 27 ++++++--- man/write.dna.Rd | 30 ++++++---- 8 files changed, 306 insertions(+), 155 deletions(-) create mode 100644 R/makeLabel.R create mode 100644 man/makeLabel.Rd diff --git a/Changes b/Changes index 8d49225..c831e78 100644 --- a/Changes +++ b/Changes @@ -3,9 +3,16 @@ NEW FEATURES + o The new function makeLabel() helps to modify labels of trees, + lists of trees, or DNA sequences, with several utilities to + truncate and/or make them unique, substituting some + characters, and so on. + o The new function del.gaps() removes insertion gaps ("-") in a set of DNA sequences. + o read.dna() can now read Clustal files (*.aln). + BUG FIXES @@ -26,7 +33,10 @@ OTHER CHANGES Minin. o The option 'check.labels' of consensus() and prop.part() is now - TRUE. + TRUE by default. + + o write.dna() now does not truncate names to 10 characters with + the Phylip formats. diff --git a/DESCRIPTION b/DESCRIPTION index af4d120..c511656 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: ape Version: 2.2-1 -Date: 2008-06-28 +Date: 2008-07-03 Title: Analyses of Phylogenetics and Evolution Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, diff --git a/R/makeLabel.R b/R/makeLabel.R new file mode 100644 index 0000000..9d38dc1 --- /dev/null +++ b/R/makeLabel.R @@ -0,0 +1,71 @@ +## makeLabel.R (2008-07-03) + +## Label Management + +## Copyright 2008 Emmanuel Paradis + +## This file is part of the R-package `ape'. +## See the file ../COPYING for licensing issues. + +makeLabel <- function(x, ...) UseMethod("makeLabel") + +makeLabel.character <- function(x, len = 99, space = "_", + make.unique = TRUE, illegal = "():;,[]", quote = FALSE, ...) +{ + x <- gsub("[[:space:]]", space, x) + if (illegal != "") { + illegal <- unlist(strsplit(illegal, NULL)) + for (i in illegal) x <- gsub(i, "", x, fixed = TRUE) + } + if (quote) len <- len - 2 + nc <- nchar(x) > len + if (any(nc)) x[nc] <- substr(x[nc], 1, len) + tab <- table(x) + if (all(tab == 1)) make.unique <- FALSE + if (make.unique) { + dup <- tab[which(tab > 1)] + nms <- names(dup) + for (i in 1:length(dup)) { + j <- which(x == nms[i]) + end <- nchar(x[j][1]) + ## w: number of characters to be added as suffix + w <- floor(log10(dup[i])) + 1 + suffix <- formatC(1:dup[i], width = w, flag = "0") + if (end + w > len) { + start <- end - w + 1 + substr(x[j], start, end) <- suffix + } else x[j] <- paste(x[j], suffix, sep = "") + } + } + if (quote) x <- paste('"', x, '"', sep = "") + x +} + +makeLabel.phylo <- function(x, tips = TRUE, nodes = TRUE, ...) +{ + if (tips) + x$tip.label <- makeLabel.character(x$tip.label, ...) + if (!is.null(x$node.label) && nodes) + x$node.label <- makeLabel.character(x$node.label, ...) + x +} + +makeLabel.multiPhylo <- function(x, tips = TRUE, nodes = TRUE, ...) +{ + y <- attr("TipLabel", x) + if (is.null(y)) { + for (i in 1:length(x)) + x[[i]] <- makeLabel.phylo(x[[i]], tips = tips, nodes = nodes, ...) + } else { + attr("TipLabel", x) <- makeLabel.character(y, ...) + } + x +} + +makeLabel.DNAbin <- function(x, ...) +{ + if (is.vector(x) || is.list(x)) + names(x) <- makeLabel.character(names(x), ...) + else rownames(x) <- makeLabel.character(rownames(x), ...) + x +} diff --git a/R/read.dna.R b/R/read.dna.R index f9513ea..8117906 100644 --- a/R/read.dna.R +++ b/R/read.dna.R @@ -1,4 +1,4 @@ -## read.dna.R (2008-03-28) +## read.dna.R (2008-07-03) ## Read DNA Sequences in a File @@ -12,68 +12,54 @@ read.dna <- function(file, format = "interleaved", skip = 0, as.character = FALSE) { getTaxaNames <- function(x) { - x <- sub("^ +", "", x) # remove the leading spaces - x <- sub(" +$", "", x) # remove the trailing spaces - x <- sub("^['\"]", "", x) # remove the leading quotes - x <- sub("['\"]$", "", x) # remove the trailing quotes + x <- sub("^['\" ]+", "", x) # remove the leading quotes and spaces + x <- sub("['\" ]+$", "", x) # " " trailing " " " x } - format <- match.arg(format, c("interleaved", "sequential", "fasta")) - phylip <- if (format %in% c("interleaved", "sequential")) TRUE else FALSE - X <- scan(file = file, what = character(), sep = "\n", quiet = TRUE, + getNucleotide <- function(x) { + x <- gsub(" ", "", x) + x <- strsplit(x, NULL) + unlist(x) + } + formats <- c("interleaved", "sequential", "fasta", "clustal") + format <- match.arg(format, formats) + phylip <- if (format %in% formats[1:2]) TRUE else FALSE + X <- scan(file = file, what = "", sep = "\n", quiet = TRUE, skip = skip, nlines = nlines, comment.char = comment.char) + pat.base <- "[-AaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn?]{10}" if (phylip) { - fl <- X[1] - oop <- options(warn = -1) ## need to remove the possible leading spaces in the first line - fl.num <- as.numeric(unlist(strsplit(gsub("^ +", "", fl), " +"))) - options(oop) - if (all(is.na(fl.num))) - stop("the first line of the file must contain the dimensions of the data") - if (length(fl.num) != 2) - stop("the first line of the file must contain TWO numbers") - else { - n <- fl.num[1] - s <- fl.num[2] - } + fl <- gsub("^ +", "", X[1]) + fl <- as.numeric(unlist(strsplit(fl, " +"))) + if (length(fl) != 2 || any(is.na(fl))) + stop("the first line of the file must contain the dimensions of the data") + n <- fl[1] + s <- fl[2] + obj <- matrix("", n, s) X <- X[-1] - obj <- vector("character", n*s) - dim(obj) <- c(n, s) } if (format == "interleaved") { - fl <- X[1] - fl <- unlist(strsplit(fl, NULL)) - bases <- grep("[-AaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn?]", fl) - z <- diff(bases) - for (i in 1:length(z)) if (all(z[i:(i + 8)] == 1)) break - start.seq <- bases[i] + start.seq <- regexpr(pat.base, X[1])[1] if (is.null(seq.names)) - seq.names <- getTaxaNames(substr(X[1:n], 1, start.seq - 1)) + seq.names <- getTaxaNames(substr(X[1:n], 1, start.seq - 1)) X[1:n] <- substr(X[1:n], start.seq, nchar(X[1:n])) - X <- gsub(" ", "", X) nl <- length(X) for (i in 1:n) - obj[i, ] <- unlist(strsplit(X[seq(i, nl, n)], NULL)) + obj[i, ] <- getNucleotide(X[seq(i, nl, n)]) } if (format == "sequential") { - fl <- X[1] taxa <- character(n) j <- 1 for (i in 1:n) { - bases <- grep("[-AaCcGgTtUuMmRrWwSsYyKkVvHhDdBbNn?]", - unlist(strsplit(X[j], NULL))) - z <- diff(bases) - for (k in 1:length(z)) if (all(z[k:(k + 8)] == 1)) break - start.seq <- bases[k] + start.seq <- regexpr(pat.base, X[j])[1] taxa[i] <- substr(X[j], 1, start.seq - 1) - sequ <- substr(X[j], start.seq, nchar(X[j])) - sequ <- gsub(" ", "", sequ) + sequ <- getNucleotide(substr(X[j], start.seq, nchar(X[j]))) j <- j + 1 - while (nchar(sequ) < s) { - sequ <- paste(sequ, gsub(" " , "", X[j]), sep = "") + while (length(sequ) < s) { + sequ <- c(sequ, getNucleotide(X[j])) j <- j + 1 } - obj[i, ] <- unlist(strsplit(sequ, NULL)) + obj[i, ] <- sequ } if (is.null(seq.names)) seq.names <- getTaxaNames(taxa) } @@ -88,11 +74,32 @@ read.dna <- function(file, format = "interleaved", skip = 0, } start <- c(start, length(X) + 1) # this avoids the following to crash when `i = n' for (i in 1:n) - obj[[i]] <- unlist(strsplit(gsub(" ", "", - X[(start[i] + 1):(start[i + 1] - 1)]), - NULL)) + obj[[i]] <- getNucleotide(X[(start[i] + 1):(start[i + 1] - 1)]) } - if (phylip) { + if (format == "clustal") { + X <- X[-1] + ## find where the 1st sequence starts + start.seq <- regexpr(pat.base, X[1])[1] + ## find the lines with *********.... + nspaces <- paste("^ {", start.seq - 1, "}", sep = "", collapse = "") + stars <- grep(nspaces, X) + ## we now know how many sequences in the file: + n <- stars[1] - 1 + ## get the sequence names in the same way than "interleaved": + if (is.null(seq.names)) + seq.names <- getTaxaNames(substr(X[1:n], 1, start.seq - 1)) + ## need to remove the sequence names before getting the sequences: + X <- substr(X, start.seq, nchar(X)) + nl <- length(X) + ## find the length of the 1st sequence: + tmp <- getNucleotide(X[seq(1, nl, n + 1)]) + s <- length(tmp) + obj <- matrix("", n, s) + obj[1, ] <- tmp + for (i in 2:n) + obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)]) + } + if (format != "fasta") { rownames(obj) <- seq.names obj <- tolower(obj) } else { diff --git a/R/write.dna.R b/R/write.dna.R index 68070b8..59712ea 100644 --- a/R/write.dna.R +++ b/R/write.dna.R @@ -1,8 +1,8 @@ -## write.dna.R (2003-12-29) +## write.dna.R (2008-07-03) ## Write DNA Sequences in a File -## Copyright 2003-2006 Emmanuel Paradis +## Copyright 2003-2008 Emmanuel Paradis ## This file is part of the R-package `ape'. ## See the file ../COPYING for licensing issues. @@ -14,113 +14,99 @@ write.dna <- function(x, file, format = "interleaved", append = FALSE, format <- match.arg(format, c("interleaved", "sequential", "fasta")) phylip <- if (format %in% c("interleaved", "sequential")) TRUE else FALSE if (class(x) == "DNAbin") x <- as.character(x) + aligned <- TRUE if (is.matrix(x)) { - N <- dim(x)[1] + N <- dim(x) + S <- N[2] + N <- N[1] xx <- vector("list", N) for (i in 1:N) xx[[i]] <- x[i, ] names(xx) <- rownames(x) x <- xx rm(xx) - } else N <- length(x) + } else { + N <- length(x) + S <- unique(unlist(lapply(x, length))) + if (length(S) > 1) aligned <- FALSE + } if (is.null(names(x))) names(x) <- as.character(1:N) if (is.null(indent)) indent <- if (phylip) 10 else 0 - if (indent == "") indent <- 0 - if (is.numeric(indent)) indent <- paste(rep(" ", indent), collapse = "") + if (is.numeric(indent)) + indent <- paste(rep(" ", indent), collapse = "") if (format == "interleaved") { - if (blocksep) { - blockseparation <- TRUE - blocksep <- paste(rep("\n", blocksep), collapse = "") - } else blockseparation <- FALSE + blocksep <- paste(rep("\n", blocksep), collapse = "") if (nbcol < 0) format <- "sequential" } zz <- if (append) file(file, "a") else file(file, "w") + on.exit(close(zz)) if (phylip) { - S <- unique(unlist(lapply(x, length))) - ## check that all sequences have the same length - if (length(S) != 1) - stop("sequences must have the same length for interleaved or sequential format.") - ## truncate names if necessary - if (any(nchar(names(x)) > 10)) { - warning("at least one name was longer than 10 characters;\nthey will be truncated which may lead to some redundancy.\n") - names(x) <- substr(names(x), 1, 10) - } - ## left justify - names(x) <- sprintf("%-10s", names(x)) + if (!aligned) + stop("sequences must have the same length for + interleaved or sequential format.") cat(N, " ", S, "\n", sep = "", file = zz) if (nbcol < 0) { nb.block <- 1 - nbcol <- totalcol <- ceiling(S / colw) + nbcol <- totalcol <- ceiling(S/colw) } else { - nb.block <- ceiling(S / (colw * nbcol)) - totalcol <- ceiling(S / colw) + nb.block <- ceiling(S/(colw * nbcol)) + totalcol <- ceiling(S/colw) } ## Prepare the sequences in a matrix whose elements are ## strings with `colw' characters. - SEQ <- matrix(NA, N, totalcol) - mode(SEQ) <- "character" + SEQ <- matrix("", N, totalcol) for (i in 1:N) { - X <- paste(x[[i]], collapse= "") - for (j in 1:totalcol) SEQ[i, j] <- substr(X, 1 + (j - 1)*colw, colw + (j - 1)*colw) + X <- paste(x[[i]], collapse = "") + for (j in 1:totalcol) + SEQ[i, j] <- substr(X, 1 + (j - 1)*colw, colw + (j - 1)*colw) } + ## Prepare the names so that they all have the same nb of chars + max.nc <- max(nchar(names(x))) + ## in case all names are 10 char long or less, sequences are + ## always started on the 11th column of the file + if (max.nc < 11) max.nc <- 9 + fmt <- paste("%-", max.nc + 1, "s", sep = "") + names(x) <- sprintf(fmt, names(x)) } if (format == "interleaved") { ## Write the first block with the taxon names - if (nb.block == 1) { - for (i in 1:N) { - cat(names(x)[i], file = zz) - cat(SEQ[i, ], sep = colsep, file = zz) - cat("\n", file = zz) - } - } else { - for (i in 1:N) { - cat(names(x)[i], file = zz) - cat(SEQ[i, 1:nbcol], sep = colsep, file = zz) - cat("\n", file = zz) - } + colsel <- if (nb.block == 1) 1:totalcol else 1:nbcol + for (i in 1:N) { + cat(names(x)[i], file = zz) + cat(SEQ[i, colsel], sep = colsep, file = zz) + cat("\n", file = zz) } - ## Write the other blocks + ## Write eventually the other blocks if (nb.block > 1) { for (k in 2:nb.block) { - if (blockseparation) cat(blocksep, file = zz) - if (k == nb.block) { - for (i in 1:N) { - cat(indent, file = zz) - cat(SEQ[i, (1 + (k - 1)*nbcol):ncol(SEQ)], sep = colsep, file = zz) - cat("\n", file = zz) - } - } else { - for (i in 1:N) { - cat(indent, file = zz) - cat(SEQ[i, (1 + (k - 1)*nbcol):(nbcol + (k - 1)*nbcol)], sep = colsep, file = zz) - cat("\n", file = zz) - } + cat(blocksep, file = zz) + endcolsel <- if (k == nb.block) totalcol else nbcol + (k - 1)*nbcol + for (i in 1:N) { + cat(indent, file = zz) + cat(SEQ[i, (1 + (k - 1)*nbcol):endcolsel], sep = colsep, file = zz) + cat("\n", file = zz) } } } + } if (format == "sequential") { if (nb.block == 1) { for (i in 1:N) { - cat(names(x)[i], file = zz) - cat(SEQ[i, 1:nbcol], sep = colsep, file = zz) - cat("\n", file = zz) - } + cat(names(x)[i], file = zz) + cat(SEQ[i, ], sep = colsep, file = zz) + cat("\n", file = zz) + } } else { for (i in 1:N) { cat(names(x)[i], file = zz) cat(SEQ[i, 1:nbcol], sep = colsep, file = zz) cat("\n", file = zz) for (k in 2:nb.block) { - if (k == nb.block) { - cat(indent, file = zz) - cat(SEQ[i, (1 + (k - 1)*nbcol):ncol(SEQ)], sep = colsep, file = zz) - cat("\n", file = zz) - } else { - cat(indent, file = zz) - cat(SEQ[i, (1 + (k - 1)*nbcol):(nbcol + (k - 1)*nbcol)], sep = colsep, file = zz) - cat("\n", file = zz) - } + endcolsel <- if (k == nb.block) totalcol else nbcol + (k - 1)*nbcol + cat(indent, file = zz) + cat(SEQ[i, (1 + (k - 1)*nbcol):endcolsel], sep = colsep, file = zz) + cat("\n", file = zz) } } } @@ -129,29 +115,21 @@ write.dna <- function(x, file, format = "interleaved", append = FALSE, for (i in 1:N) { cat(">", names(x)[i], file = zz) cat("\n", file = zz) - X <- paste(x[[i]], collapse= "") + X <- paste(x[[i]], collapse = "") S <- length(x[[i]]) - if (nbcol < 0) { - nb.block <- 1 - nbcol <- totalcol <- ceiling(S / colw) - } else { - totalcol <- ceiling(S / colw) - nb.block <- ceiling(totalcol / nbcol) - } + totalcol <- ceiling(S/colw) + if (nbcol < 0) nbcol <- totalcol + nb.lines <- ceiling(totalcol/nbcol) SEQ <- character(totalcol) - for (j in 1:totalcol) SEQ[j] <- substr(X, 1 + (j - 1)*colw, colw + (j - 1)*colw) - for (k in 1:nb.block) { - if (k == nb.block) { - cat(indent, file = zz) - cat(SEQ[(1 + (k - 1)*nbcol):length(SEQ)], sep = colsep, file = zz) - cat("\n", file = zz) - } else { - cat(indent, file = zz) - cat(SEQ[(1 + (k - 1)*nbcol):(nbcol + (k - 1)*nbcol)], sep = colsep, file = zz) - cat("\n", file = zz) - } + for (j in 1:totalcol) + SEQ[j] <- substr(X, 1 + (j - 1) * colw, colw + (j - 1) * colw) + for (k in 1:nb.lines) { + endsel <- + if (k == nb.lines) length(SEQ) else nbcol + (k - 1)*nbcol + cat(indent, file = zz) + cat(SEQ[(1 + (k - 1)*nbcol):endsel], sep = colsep, file = zz) + cat("\n", file = zz) } } } - close(zz) } diff --git a/man/makeLabel.Rd b/man/makeLabel.Rd new file mode 100644 index 0000000..5c56dbc --- /dev/null +++ b/man/makeLabel.Rd @@ -0,0 +1,68 @@ +\name{makeLabel} +\alias{makeLabel} +\title{Label Management} +\usage{ +makeLabel(x, ...) +\method{makeLabel}{character}(x, len = 99, space = "_", make.unique = TRUE, + illegal = "():;,[]", quote = FALSE, ...) +\method{makeLabel}{phylo}(x, tips = TRUE, nodes = TRUE, ...) +\method{makeLabel}{multiPhylo}(x, tips = TRUE, nodes = TRUE, ...) +\method{makeLabel}{DNAbin}(x, ...) +} +\arguments{ + \item{x}{a vector of mode character or an object for which labels are + to be changed.} + \item{len}{the maximum length of the labels: those longer than `len' + will be truncated.} + \item{space}{the character to replace spaces, tabulations, and + linebreaks.} + \item{make.unique}{a logical specifying whether duplicate labels + should be made unique by appending numerals; \code{TRUE} by + default.} + \item{illegal}{a string specifying the characters to be deleted.} + \item{quote}{a logical specifying whether to quote the labels; + \code{FALSE} by default.} + \item{tips}{a logical specifying whether tip labels are to be + modified; \code{TRUE} by default.} + \item{nodes}{a logical specifying whether node labels are to be + modified; \code{TRUE} by default.} + \item{...}{further arguments to be passed to or from other methods.} +} +\description{ + This is a generic function with methods for character vectors, trees + of class \code{"phylo"}, lists of trees of class \code{"multiPhylo"}, + and DNA sequences of class \code{"DNAbin"}. All options for the class + character may be used in the other methods. +} +\details{ + The option \code{make.unique} does not work exactly in the same way + then the function of the same name: numbers are suffixed to all labels + that are identical (without separator). See the examples. + + If there are 10--99 identical labels, the labels returned are "xxx01", + "xxx02", etc, or "xxx001", "xxx002", etc, if they are 100--999, and so + on. The number of digits added preserves the option `len'. + + The default for `len' makes labels short enough to be read by + PHYML. Clustal accepts labels up to 30 character long. +} +\note{ + The current version does not perform well when trying to make very + short unique labels (e.g., less than 5 character long). +} +\value{ + An object of the appropriate class. +} +\author{Emmanuel Paradis \email{Emmanuel.Paradis@mpl.ird.fr}} +\seealso{ + \code{\link[base]{make.unique}}, \code{\link[base]{make.names}}, + \code{\link[base]{abbreviate}} +} +\examples{ +x <- rep("a", 3) +makeLabel(x) +make.unique(x) # <- from R's base +x <- rep("aaaaa", 2) +makeLabel(x, len = 3) # made unique and of length 3 +makeLabel(x, len = 3, make.unique = FALSE) +} diff --git a/man/read.dna.Rd b/man/read.dna.Rd index 71473d6..af26aa7 100644 --- a/man/read.dna.Rd +++ b/man/read.dna.Rd @@ -10,9 +10,9 @@ read.dna(file, format = "interleaved", skip = 0, \item{file}{a file name specified by either a variable of mode character, or a double-quoted string.} \item{format}{a character string specifying the format of the DNA - sequences. Three choices are possible: \code{"interleaved"}, - \code{"sequential"}, or \code{"fasta"}, or any unambiguous - abbreviation of these.} + sequences. Four choices are possible: \code{"interleaved"}, + \code{"sequential"}, \code{"clustal"}, or \code{"fasta"}, or any + unambiguous abbreviation of these.} \item{skip}{the number of lines of the input file to skip before beginning to read data.} \item{nlines}{the number of lines to be read (by default the file is @@ -34,8 +34,7 @@ read.dna(file, format = "interleaved", skip = 0, \details{ This function follows the interleaved and sequential formats defined in PHYLIP (Felsenstein, 1993) but with the original feature than there - is no restriction on the lengths of the taxa names (though a data file - with 10-characters-long taxa names is fine as well). For these two + is no restriction on the lengths of the taxa names. For these two formats, the first line of the file must contain the dimensions of the data (the numbers of taxa and the numbers of nucleotides); the sequences are considered as aligned and thus must be of the same @@ -63,6 +62,11 @@ read.dna(file, format = "interleaved", skip = 0, sequences are then read until the number of nucleotides specified in the first line of the file is reached. This is repeated for each taxa.} + \item{Clustal:}{this is the format output by the Clustal programs + (.aln). It is somehow similar to the interleaved format: the + differences being that the dimensions of the data are not indicated + in the file, and the names of the sequences are repeated in each block.} + \item{FASTA:}{This looks like the sequential format but the taxa names (or rather a description of the sequence) are on separate lines beginning with a `greater than' character ``>'' (there may be @@ -111,6 +115,14 @@ cat("3 40", " ACTAAAAATT ATCAATCACT", file = "exdna.txt", sep = "\n") ex.dna2 <- read.dna("exdna.txt") +### ... in clustal format... +cat("CLUSTAL (ape) multiple sequence alignment", "", +"No305 NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT", +"No304 ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT", +"No306 ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT", +" ************************** ****** ****", +file = "exdna.txt", sep = "\n") +ex.dna3 <- read.dna("exdna.txt", format = "clustal") ### ... and in FASTA format cat("> No305", "NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT", @@ -119,10 +131,11 @@ cat("> No305", "> No306", "ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT", file = "exdna.txt", sep = "\n") -ex.dna3 <- read.dna("exdna.txt", format = "fasta") -### These are the same! +ex.dna4 <- read.dna("exdna.txt", format = "fasta") +### The first three are the same! identical(ex.dna, ex.dna2) identical(ex.dna, ex.dna3) +identical(ex.dna, ex.dna4) unlink("exdna.txt") # clean-up } \keyword{IO} diff --git a/man/write.dna.Rd b/man/write.dna.Rd index 3646ef9..8a692f1 100644 --- a/man/write.dna.Rd +++ b/man/write.dna.Rd @@ -33,20 +33,17 @@ write.dna(x, file, format = "interleaved", append = FALSE, } \description{ This function writes in a file a list of DNA sequences in sequential, - interleaved, or FASTA format. The names of the vectors of the list are - used as taxa names. + interleaved, or FASTA format. } \details{ - The same three formats are supported in the present function than in - \code{\link{read.dna}}: see its help page and the references below for - a description of these formats. + Three formats are supported in the present function: see the help page + of \code{\link{read.dna}} and the references below for a description. If the sequences have no names, then they are given "1", "2", ... as names in the file. With the interleaved and sequential formats, the sequences must be all - of the same length; if the taxon names are longer than 10 characters, - they are truncated and a warning message is issued. + of the same length. The names of the sequences are not truncated. The argument `indent' specifies how the rows of nucleotides are indented. In the interleaved and sequential formats, the rows with @@ -58,17 +55,23 @@ write.dna(x, file, format = "interleaved", append = FALSE, it is a numeric). For example, specifying `indent = " "' or `indent = 3' will have exactly the same effect (use `indent = "\t"' for a tabulation). + + The different options are intended to give flexibility in formatting + the sequences. For instance, if the sequences are very long it may be + judicious to remove all the spaces beween columns (colsep = ""), in + the margins (indent = 0), and between the blocks (blocksep = 0) to + produce a smaller file. } \note{ Specifying a negative value for `nbcol' (meaning that the nucleotides are printed on a single line) gives the same result for the interleaved and sequential formats. - The different options are intended to give flexibility in formatting - the sequences. For instance, if the sequences are very long it may be - judicious to remove all the spaces beween columns `(colsep = ""), in - the margins (indent = 0), and between the blocks (blocksep = 0) to - produce a smaller file. + The names of the sequences may be truncated with the function + \code{\link{makeLabel}}. In particular, Clustal is limited to 30 + characters, whereas PHYML seems limited to 99 characters. + + The FASTA format may be used as input for Clustal. } \value{ None (invisible `NULL'). @@ -86,6 +89,7 @@ write.dna(x, file, format = "interleaved", append = FALSE, \url{http://evolution.genetics.washington.edu/phylip/phylip.html} } \seealso{ - \code{\link{read.dna}}, \code{\link{read.GenBank}} + \code{\link{read.dna}}, \code{\link{read.GenBank}}, + \code{\link{makeLabel}} } \keyword{IO} -- 2.39.5