man/read.dna.Rd

   1 \name{read.dna}
   2 \alias{read.dna}
   3 \alias{read.FASTA}
   4 \title{Read DNA Sequences in a File}
   5 \description{
   6   These functions read DNA sequences in a file, and return a matrix or a
   7   list of DNA sequences with the names of the taxa read in the file as
   8   rownames or names, respectively. By default, the sequences are stored
   9   in binary format, otherwise (if \code{as.character = TRUE}) in lower
  10   case.
  11 }
  12 \usage{
  13 read.dna(file, format = "interleaved", skip = 0,
  14          nlines = 0, comment.char = "#",
  15          as.character = FALSE, as.matrix = NULL)
  16 read.FASTA(file)
  17 }
  18 \arguments{
  19   \item{file}{a file name specified by either a variable of mode character,
  20     or a double-quoted string.}
  21   \item{format}{a character string specifying the format of the DNA
  22     sequences. Four choices are possible: \code{"interleaved"},
  23     \code{"sequential"}, \code{"clustal"}, or \code{"fasta"}, or any
  24     unambiguous abbreviation of these.}
  25   \item{skip}{the number of lines of the input file to skip before
  26     beginning to read data (ignored for FASTA files; see below).}
  27   \item{nlines}{the number of lines to be read (by default the file is
  28     read until its end; ignored for FASTA files).}
  29   \item{comment.char}{a single character, the remainder of the line
  30     after this character is ignored (ignored for FASTA files).}
  31   \item{as.character}{a logical: if \code{TRUE}, the sequences are returned
  32     in mode character; otherwise as an object of class \code{"DNAbin"} (the default).}
  33   \item{as.matrix}{(used if \code{format = "fasta"}) one of the
  34     following three: (i) \code{NULL}: returns the sequences in a matrix if
  35     they are of the same length, otherwise in a list; (ii) \code{TRUE}:
  36     returns the sequences in a matrix, or stops with an error if they
  37     are of different lengths; (iii) \code{FALSE}: always returns the
  38     sequences in a list.}
  39 }
  40 \details{
  41   \code{read.dna} follows the interleaved and sequential formats defined
  42   in PHYLIP (Felsenstein, 1993) but with the original feature that there
  43   is no restriction on the lengths of the taxa names. For these two
  44   formats, the first line of the file must contain the dimensions of the
  45   data (the number of taxa and the number of nucleotides); the
  46   sequences are considered as aligned and thus must be of the same
  47   lengths for all taxa. For the FASTA format, the conventions defined in
  48   the URL below (see References) are followed; the sequences are taken as
  49   non-aligned. For all formats, the nucleotides can be arranged in any
  50   way with blanks and line-breaks inside (with the restriction that the
  51   first ten nucleotides must be contiguous for the interleaved and
  52   sequential formats, see below). The names of the sequences are read in
  53   the file. Particularities for each format are detailed below.
  54
  55 \describe{
  56   \item{Interleaved:}{the function starts to read the sequences after it
  57     finds one or more spaces (or tabulations). All characters before the
  58     sequences are taken as the taxa names after removing the leading and
  59     trailing spaces (so spaces in taxa names are allowed). It is assumed
  60     that the taxa names are not repeated in the subsequent blocks of
  61     nucleotides.}
  62
  63   \item{Sequential:}{the same criterion as for the interleaved format
  64     is used to start reading the sequences and the taxa names; the
  65     sequences are then read until the number of nucleotides specified in
  66     the first line of the file is reached. This is repeated for each taxon.}
  67
  68   \item{Clustal:}{this is the format output by the Clustal programs
  69     (.aln). It is somewhat similar to the interleaved format: the
  70     differences being that the dimensions of the data are not indicated
  71     in the file, and the names of the sequences are repeated in each block.}
  72
  73   \item{FASTA:}{This looks like the sequential format but the taxa names
  74     (or rather a description of the sequence) are on separate lines
  75     beginning with a `greater than' character `>' (there may be
  76     leading spaces before this character). These lines are taken as taxa
  77     names after removing the `>' and the possible leading and trailing
  78     spaces. All the data in the file before the first sequence is ignored.}
  79 }}
  80 \value{
  81   a matrix or a list (if \code{format = "fasta"}) of DNA sequences
  82   stored in binary format, or of mode character (if \code{as.character =
  83     TRUE}).
  84
  85   \code{read.FASTA} always returns a list of class \code{"DNAbin"}.
  86 }
  87 \references{
  88   Anonymous. FASTA format description.
  89   \url{http://www.ncbi.nlm.nih.gov/BLAST/fasta.html}
  90
  91   Anonymous. IUPAC ambiguity codes.
  92   \url{http://www.ncbi.nlm.nih.gov/SNP/iupac.html}
  93
  94   Felsenstein, J. (1993) Phylip (Phylogeny Inference Package) version
  95   3.5c. Department of Genetics, University of Washington.
  96   \url{http://evolution.genetics.washington.edu/phylip/phylip.html}
  97 }
  98 \seealso{
  99   \code{\link{read.GenBank}}, \code{\link{write.dna}},
 100   \code{\link{DNAbin}}, \code{\link{dist.dna}}, \code{\link{woodmouse}}
 101 }
 102 \author{Emmanuel Paradis}
 103 \examples{
 104 ### a small extract from `data(woodmouse)'
 105 cat("3 40",
 106 "No305     NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT",
 107 "No304     ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT",
 108 "No306     ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT",
 109 file = "exdna.txt", sep = "\n")
 110 ex.dna <- read.dna("exdna.txt", format = "sequential")
 111 str(ex.dna)
 112 ex.dna
 113 ### the same data in interleaved format...
 114 cat("3 40",
 115 "No305     NTTCGAAAAA CACACCCACT",
 116 "No304     ATTCGAAAAA CACACCCACT",
 117 "No306     ATTCGAAAAA CACACCCACT",
 118 "          ACTAAAANTT ATCAGTCACT",
 119 "          ACTAAAAATT ATCAACCACT",
 120 "          ACTAAAAATT ATCAATCACT",
 121 file = "exdna.txt", sep = "\n")
 122 ex.dna2 <- read.dna("exdna.txt")
 123 ### ... in clustal format...
 124 cat("CLUSTAL (ape) multiple sequence alignment", "",
 125 "No305     NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT",
 126 "No304     ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT",
 127 "No306     ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT",
 128 "           ************************** ******  ****",
 129 file = "exdna.txt", sep = "\n")
 130 ex.dna3 <- read.dna("exdna.txt", format = "clustal")
 131 ### ... and in FASTA format
 132 cat("> No305",
 133 "NTTCGAAAAACACACCCACTACTAAAANTTATCAGTCACT",
 134 "> No304",
 135 "ATTCGAAAAACACACCCACTACTAAAAATTATCAACCACT",
 136 "> No306",
 137 "ATTCGAAAAACACACCCACTACTAAAAATTATCAATCACT",
 138 file = "exdna.txt", sep = "\n")
 139 ex.dna4 <- read.dna("exdna.txt", format = "fasta")
 140 ### The first three are the same!
 141 identical(ex.dna, ex.dna2)
 142 identical(ex.dna, ex.dna3)
 143 identical(ex.dna, ex.dna4)
 144 unlink("exdna.txt") # clean-up
 145 }
 146 \keyword{IO}