man/dist.dna.Rd

   1 \name{dist.dna}
   2 \alias{dist.dna}
   3 \title{Pairwise Distances from DNA Sequences}
   4 \usage{
   5 dist.dna(x, model = "K80", variance = FALSE,
   6          gamma = FALSE, pairwise.deletion = FALSE,
   7          base.freq = NULL, as.matrix = FALSE)
   8 }
   9 \arguments{
  10   \item{x}{a matrix or a list containing the DNA sequences; this must be
  11     of class \code{"DNAbin"} (use \code{\link{as.DNAbin}} if they are
  12     stored as character).}
  13   \item{model}{a character string specifying the evolutionary model to be
  14     used; must be one of \code{"raw"}, \code{"N"}, \code{"TS"},
  15     \code{"TV"}, \code{"JC69"}, \code{"K80"} (the default),
  16     \code{"F81"}, \code{"K81"}, \code{"F84"}, \code{"BH87"},
  17     \code{"T92"}, \code{"TN93"}, \code{"GG95"}, \code{"logdet"},
  18     \code{"paralin"}, \code{"indel"}, or \code{"indelblock"}.}
  19   \item{variance}{a logical indicating whether to compute the variances
  20     of the distances; defaults to \code{FALSE} so the variances are not
  21     computed.}
  22   \item{gamma}{a value for the gamma parameter possibly used to apply a
  23     correction to the distances (by default no correction is applied).}
  24   \item{pairwise.deletion}{a logical indicating whether to delete the
  25     sites with missing data in a pairwise way. The default is to delete
  26     from all sequences any site where at least one sequence has missing data (ignored
  27     if \code{model = "indel"} or \code{"indelblock"}).}
  28   \item{base.freq}{the base frequencies to be used in the computations
  29     (if applicable). By default, the base frequencies are computed from
  30     the whole set of sequences.}
  31   \item{as.matrix}{a logical indicating whether to return the results as
  32     a matrix. The default is to return an object of class
  33     \code{\link[stats]{dist}}.}
  34 }
  35 \description{
  36   This function computes a matrix of pairwise distances from DNA
  37   sequences using a model of DNA evolution. Eleven substitution models
  38   (and the raw distance) are currently available.
  39 }
  40 \details{
  41   The molecular evolutionary models available through the option
  42   \code{model} have been extensively described in the literature. A
  43   brief description is given below; more details can be found in the
  44   References.
  45
  46 \itemize{
  47   \item{\code{raw}, \code{N}: }{This is simply the proportion or the number of
  48     sites that differ between each pair of sequences. This may be useful
  49     to draw ``saturation plots''. The options \code{variance} and
  50     \code{gamma} have no effect, but \code{pairwise.deletion} can.}
  51
  52   \item{\code{TS}, \code{TV}: }{These are the numbers of transitions and
  53     transversions, respectively.}
  54
  55   \item{\code{JC69}: }{This model was developed by Jukes and Cantor (1969). It
  56     assumes that all substitutions (i.e. a change of a base by another
  57     one) have the same probability. This probability is the same for all
  58     sites along the DNA sequence. This last assumption can be relaxed by
  59     assuming that the substitution rate varies among sites following a
  60     gamma distribution whose parameter must be given by the user. By
  61     default, no gamma correction is applied. Another assumption is that
  62     the base frequencies are balanced and thus equal to 0.25.}
  63
  64   \item{\code{K80}: }{The distance derived by Kimura (1980), sometimes referred
  65     to as ``Kimura's 2-parameters distance'', has the same underlying
  66     assumptions as the Jukes--Cantor distance except that two kinds of
  67     substitutions are considered: transitions (A <-> G, C <-> T), and
  68     transversions (A <-> C, A <-> T, C <-> G, G <-> T). They are assumed
  69     to have different probabilities. A transition is the substitution of
  70     a purine (A, G) by another one, or the substitution of a pyrimidine
  71     (C, T) by another one. A transversion is the substitution of a
  72     purine by a pyrimidine, or vice-versa. Both transition and
  73     transversion rates are the same for all sites along the DNA
  74     sequence. Jin and Nei (1990) modified the Kimura model to allow for
  75     variation among sites following a gamma distribution. As for the
  76     Jukes--Cantor model, the gamma parameter must be given by the
  77     user. By default, no gamma correction is applied.}
  78
  79   \item{\code{F81}: }{Felsenstein (1981) generalized the Jukes--Cantor model
  80     by relaxing the assumption of equal base frequencies. The formulae
  81     used in this function were taken from McGuire et al. (1999).}
  82
  83   \item{\code{K81}: }{Kimura (1981) generalized his model (Kimura 1980) by
  84     assuming different rates for two kinds of transversions: A <-> C and
  85     G <-> T on one side, and A <-> T and C <-> G on the other. This is
  86     what Kimura called his ``three substitution types model'' (3ST), and
  87     is sometimes referred to as ``Kimura's 3-parameters distance''.}
  88
  89   \item{\code{F84}: }{This model generalizes K80 by relaxing the assumption
  90     of equal base frequencies. It was first introduced by Felsenstein in
  91     1984 in Phylip, and is fully described by Felsenstein and Churchill
  92     (1996). The formulae used in this function were taken from McGuire
  93     et al. (1999).}
  94
  95   \item{\code{BH87}: }{Barry and Hartigan (1987) developed a distance based
  96     on the observed proportions of changes among the four bases. This
  97     distance is not symmetric.}
  98
  99   \item{\code{T92}: }{Tamura (1992) generalized the Kimura model by relaxing
 100     the assumption of equal base frequencies. This is done by taking
 101     into account the bias in G+C content in the sequences. The
 102     substitution rates are assumed to be the same for all sites along
 103     the DNA sequence.}
 104
 105   \item{\code{TN93}: }{Tamura and Nei (1993) developed a model which assumes
 106     distinct rates for both kinds of transition (A <-> G versus C <->
 107     T), and transversions. The base frequencies are not assumed to be
 108     equal and are estimated from the data. A gamma correction of the
 109     inter-site variation in substitution rates is possible.}
 110
 111   \item{\code{GG95}: }{Galtier and Gouy (1995) introduced a model where the
 112     G+C content may change through time. Different rates are assumed for
 113     transitions and transversions.}
 114
 115   \item{\code{logdet}: }{The Log-Det distance, developed by Lockhart et
 116     al. (1994), is related to BH87. However, this distance is
 117     symmetric. Formulae from Gu and Li (1996) are used.
 118     \code{dist.logdet} in \pkg{phangorn} uses a different
 119     implementation that gives substantially different distances for
 120     low-diverging sequences.}
 121
 122   \item{\code{paralin}: }{Lake (1994) developed the paralinear distance which
 123     can be viewed as another variant of the Barry--Hartigan distance.}
 124
 125   \item{\code{indel}: }{this counts the number of sites where there is an
 126     insertion/deletion gap in one sequence and not in the other.}
 127
 128   \item{\code{indelblock}: }{same as before but contiguous gaps are
 129     counted as a single unit. Note that the distance between \code{-A-} and
 130     \code{A--} is 3 because there are three different blocks of gaps, whereas
 131     the ``indel'' distance will be 2.}
 132 }}
 133 \value{
 134   an object of class \code{\link[stats]{dist}} (by default), or a numeric
 135   matrix if \code{as.matrix = TRUE}. If \code{model = "BH87"}, a numeric
 136   matrix is returned because the Barry--Hartigan distance is not
 137   symmetric.
 138
 139   If \code{variance = TRUE} an attribute called \code{"variance"} is
 140   given to the returned object.
 141 }
 142 \references{
 143   Barry, D. and Hartigan, J. A. (1987) Asynchronous distance between
 144   homologous DNA sequences. \emph{Biometrics}, \bold{43}, 261--276.
 145
 146   Felsenstein, J. (1981) Evolutionary trees from DNA sequences: a
 147   maximum likelihood approach. \emph{Journal of Molecular Evolution},
 148   \bold{17}, 368--376.
 149
 150   Felsenstein, J. and Churchill, G. A. (1996) A Hidden Markov model
 151   approach to variation among sites in rate of evolution.
 152   \emph{Molecular Biology and Evolution}, \bold{13}, 93--104.
 153
 154   Galtier, N. and Gouy, M. (1995) Inferring phylogenies from DNA
 155   sequences of unequal base compositions. \emph{Proceedings of the
 156     National Academy of Sciences USA}, \bold{92}, 11317--11321.
 157
 158   Gu, X. and Li, W.-H. (1996) Bias-corrected paralinear and LogDet
 159   distances and tests of molecular clocks and phylogenies under
 160   nonstationary nucleotide frequencies. \emph{Molecular Biology and
 161     Evolution}, \bold{13}, 1375--1383.
 162
 163   Jukes, T. H. and Cantor, C. R. (1969) Evolution of protein
 164   molecules. In \emph{Mammalian Protein Metabolism}, ed. Munro, H. N.,
 165   pp. 21--132, New York: Academic Press.
 166
 167   Kimura, M. (1980) A simple method for estimating evolutionary rates of
 168   base substitutions through comparative studies of nucleotide
 169   sequences. \emph{Journal of Molecular Evolution}, \bold{16}, 111--120.
 170
 171   Kimura, M. (1981) Estimation of evolutionary distances between
 172   homologous nucleotide sequences. \emph{Proceedings of the National
 173     Academy of Sciences USA}, \bold{78}, 454--458.
 174
 175   Jin, L. and Nei, M. (1990) Limitations of the evolutionary parsimony
 176   method of phylogenetic analysis. \emph{Molecular Biology and
 177     Evolution}, \bold{7}, 82--102.
 178
 179   Lake, J. A. (1994) Reconstructing evolutionary trees from DNA and
 180   protein sequences: paralinear distances. \emph{Proceedings of the
 181     National Academy of Sciences USA}, \bold{91}, 1455--1459.
 182
 183   Lockhart, P. J., Steel, M. A., Hendy, M. D. and Penny, D. (1994)
 184   Recovering evolutionary trees under a more realistic model of sequence
 185   evolution. \emph{Molecular Biology and Evolution}, \bold{11},
 186   605--612.
 187
 188   McGuire, G., Prentice, M. J. and Wright, F. (1999) Improved error
 189   bounds for genetic distances from DNA sequences. \emph{Biometrics},
 190   \bold{55}, 1064--1070.
 191
 192   Tamura, K. (1992) Estimation of the number of nucleotide substitutions
 193   when there are strong transition-transversion and G + C-content
 194   biases. \emph{Molecular Biology and Evolution}, \bold{9}, 678--687.
 195
 196   Tamura, K. and Nei, M. (1993) Estimation of the number of nucleotide
 197   substitutions in the control region of mitochondrial DNA in humans and
 198   chimpanzees. \emph{Molecular Biology and Evolution}, \bold{10}, 512--526.
 199 }
 200 \author{Emmanuel Paradis}
 201 \seealso{
 202   \code{\link{read.GenBank}}, \code{\link{read.dna}},
 203   \code{\link{write.dna}},  \code{\link{DNAbin}},
 204   \code{\link{dist.gene}}, \code{\link{cophenetic.phylo}},
 205   \code{\link[stats]{dist}}
 206 }
 207 \keyword{manip}
 208 \keyword{multivariate}