man/dist.dna.Rd

   1 \name{dist.dna}
   2 \alias{dist.dna}
   3 \title{Pairwise Distances from DNA Sequences}
   4 \usage{
   5 dist.dna(x, model = "K80", variance = FALSE,
   6          gamma = FALSE, pairwise.deletion = FALSE,
   7          base.freq = NULL, as.matrix = FALSE)
   8 }
   9 \arguments{
  10   \item{x}{a matrix or a list containing the DNA sequences; this must be
  11     of class \code{"DNAbin"} (use \code{\link{as.DNAbin}} if they are
  12     stored as character).}
  13   \item{model}{a character string specifying the evolutionary model to be
  14     used; must be one of \code{"raw"}, \code{"N"}, \code{"JC69"},
  15     \code{"K80"} (the default), \code{"F81"}, \code{"K81"},
  16     \code{"F84"}, \code{"BH87"}, \code{"T92"}, \code{"TN93"},
  17     \code{"GG95"}, \code{"logdet"}, or \code{"paralin"}.}
  18   \item{variance}{a logical indicating whether to compute the variances
  19     of the distances; defaults to \code{FALSE} so the variances are not
  20     computed.}
  21   \item{gamma}{a value for the gamma parameter which is possibly used to
  22     apply a gamma correction to the distances (by default \code{gamma =
  23       FALSE} so no correction is applied).}
  24   \item{pairwise.deletion}{a logical indicating whether to delete the
  25     sites with missing data in a pairwise way. The default is to delete
  26     the sites with at least one missing data for all sequences.}
  27   \item{base.freq}{the base frequencies to be used in the computations
  28     (if applicable, i.e. if \code{model = "F84"}). By default, the
  29     base frequencies are computed from the whole sample of sequences.}
  30   \item{as.matrix}{a logical indicating whether to return the results as
  31     a matrix. The default is to return an object of class
  32     \link[stats]{dist}.}
  33 }
  34 \description{
  35   This function computes a matrix of pairwise distances from DNA
  36   sequences using a model of DNA evolution. Eleven substitution models
  37   (and the raw distance) are currently available.
  38 }
  39 \details{
  40   The molecular evolutionary models available through the option
  41   \code{model} have been extensively described in the literature. A
  42   brief description is given below; more details can be found in the
  43   References.
  44
  45 \describe{
  46   \item{``raw'', ``N''}{This is simply the proportion or the number of
  47     sites that differ between each pair of sequences. This may be useful
  48     to draw ``saturation plots''. The options \code{variance} and
  49     \code{gamma} have no effect, but \code{pairwise.deletion} may have.}
  50
  51   \item{``JC69''}{This model was developed by Jukes and Cantor (1969). It
  52     assumes that all substitutions (i.e. a change of a base by another
  53     one) have the same probability. This probability is the same for all
  54     sites along the DNA sequence. This last assumption can be relaxed by
  55     assuming that the substitution rate varies among sites following a
  56     gamma distribution whose parameter must be given by the user. By
  57     default, no gamma correction is applied. Another assumption is that
  58     the base frequencies are balanced and thus equal to 0.25.}
  59
  60   \item{``K80''}{The distance derived by Kimura (1980), sometimes referred
  61     to as ``Kimura's 2-parameters distance'', has the same underlying
  62     assumptions as the Jukes--Cantor distance except that two kinds of
  63     substitutions are considered: transitions (A <-> G, C <-> T), and
  64     transversions (A <-> C, A <-> T, C <-> G, G <-> T). They are assumed
  65     to have different probabilities. A transition is the substitution of
  66     a purine (A, G) by another one, or the substitution of a pyrimidine
  67     (C, T) by another one. A transversion is the substitution of a
  68     purine by a pyrimidine, or vice-versa. Both transition and
  69     transversion rates are the same for all sites along the DNA
  70     sequence. Jin and Nei (1990) modified the Kimura model to allow for
  71     variation among sites following a gamma distribution. As for the
  72     Jukes--Cantor model, the gamma parameter must be given by the
  73     user. By default, no gamma correction is applied.}
  74
  75   \item{``F81''}{Felsenstein (1981) generalized the Jukes--Cantor model
  76     by relaxing the assumption of equal base frequencies. The formulae
  77     used in this function were taken from McGuire et al. (1999).}
  78
  79   \item{``K81''}{Kimura (1981) generalized his model (Kimura 1980) by
  80     assuming different rates for two kinds of transversions: A <-> C and
  81     G <-> T on one side, and A <-> T and C <-> G on the other. This is
  82     what Kimura called his ``three substitution types model'' (3ST), and
  83     is sometimes referred to as ``Kimura's 3-parameters distance''.}
  84
  85   \item{``F84''}{This model generalizes K80 by relaxing the assumption
  86     of equal base frequencies. It was first introduced by Felsenstein in
  87     1984 in Phylip, and is fully described by Felsenstein and Churchill
  88     (1996). The formulae used in this function were taken from McGuire
  89     et al. (1999).}
  90
  91   \item{``BH87''}{Barry and Hartigan (1987) developed a distance based
  92     on the observed proportions of changes among the four bases. This
  93     distance is not symmetric.}
  94
  95   \item{``T92''}{Tamura (1992) generalized the Kimura model by relaxing
  96     the assumption of equal base frequencies. This is done by taking
  97     into account the bias in G+C content in the sequences. The
  98     substitution rates are assumed to be the same for all sites along
  99     the DNA sequence.}
 100
 101   \item{``TN93''}{Tamura and Nei (1993) developed a model which assumes
 102     distinct rates for both kinds of transition (A <-> G versus C <->
 103     T), and transversions. The base frequencies are not assumed to be
 104     equal and are estimated from the data. A gamma correction of the
 105     inter-site variation in substitution rates is possible.}
 106
 107   \item{``GG95''}{Galtier and Gouy (1995) introduced a model where the
 108     G+C content may change through time. Different rates are assumed for
 109     transitions and transversions.}
 110
 111   \item{``logdet''}{The Log-Det distance, developed by Lockhart et
 112     al. (1994), is related to BH87. However, this distance is
 113     symmetric. Formulae from Gu and Li (1996) are used.
 114     \code{dist.logdet} in \pkg{phangorn} uses a different
 115     implementation that gives substantially different distances for
 116     low-diverging sequences.}
 117
 118   \item{``paralin''}{Lake (1994) developed the paralinear distance which
 119     can be viewed as another variant of the Barry--Hartigan distance.}
 120 }}
 121 \value{
 122   an object of class \link[stats]{dist} (by default), or a numeric
 123   matrix if \code{as.matrix = TRUE}. If \code{model = "BH87"}, a numeric
 124   matrix is returned because the Barry--Hartigan distance is not
 125   symmetric.
 126
 127   If \code{variance = TRUE} an attribute called \code{"variance"} is
 128   given to the returned object.
 129 }
 130 \references{
 131   Barry, D. and Hartigan, J. A. (1987) Asynchronous distance between
 132   homologous DNA sequences. \emph{Biometrics}, \bold{43}, 261--276.
 133
 134   Felsenstein, J. (1981) Evolutionary trees from DNA sequences: a
 135   maximum likelihood approach. \emph{Journal of Molecular Evolution},
 136   \bold{17}, 368--376.
 137
 138   Felsenstein, J. and Churchill, G. A. (1996) A Hidden Markov model
 139   approach to variation among sites in rate of evolution.
 140   \emph{Molecular Biology and Evolution}, \bold{13}, 93--104.
 141
 142   Galtier, N. and Gouy, M. (1995) Inferring phylogenies from DNA
 143   sequences of unequal base compositions. \emph{Proceedings of the
 144     National Academy of Sciences USA}, \bold{92}, 11317--11321.
 145
 146   Gu, X. and Li, W.-H. (1996) Bias-corrected paralinear and LogDet
 147   distances and tests of molecular clocks and phylogenies under
 148   nonstationary nucleotide frequencies. \emph{Molecular Biology and
 149     Evolution}, \bold{13}, 1375--1383.
 150
 151   Jukes, T. H. and Cantor, C. R. (1969) Evolution of protein
 152   molecules. In \emph{Mammalian Protein Metabolism}, ed. Munro, H. N.,
 153   pp. 21--132, New York: Academic Press.
 154
 155   Kimura, M. (1980) A simple method for estimating evolutionary rates of
 156   base substitutions through comparative studies of nucleotide
 157   sequences. \emph{Journal of Molecular Evolution}, \bold{16}, 111--120.
 158
 159   Kimura, M. (1981) Estimation of evolutionary distances between
 160   homologous nucleotide sequences. \emph{Proceedings of the National
 161     Academy of Sciences USA}, \bold{78}, 454--458.
 162
 163   Jin, L. and Nei, M. (1990) Limitations of the evolutionary parsimony
 164   method of phylogenetic analysis. \emph{Molecular Biology and
 165     Evolution}, \bold{7}, 82--102.
 166
 167   Lake, J. A. (1994) Reconstructing evolutionary trees from DNA and
 168   protein sequences: paralinear distances. \emph{Proceedings of the
 169     National Academy of Sciences USA}, \bold{91}, 1455--1459.
 170
 171   Lockhart, P. J., Steel, M. A., Hendy, M. D. and Penny, D. (1994)
 172   Recovering evolutionary trees under a more realistic model of sequence
 173   evolution. \emph{Molecular Biology and Evolution}, \bold{11},
 174   605--612.
 175
 176   McGuire, G., Prentice, M. J. and Wright, F. (1999) Improved error
 177   bounds for genetic distances from DNA sequences. \emph{Biometrics},
 178   \bold{55}, 1064--1070.
 179
 180   Tamura, K. (1992) Estimation of the number of nucleotide substitutions
 181   when there are strong transition-transversion and G + C-content
 182   biases. \emph{Molecular Biology and Evolution}, \bold{9}, 678--687.
 183
 184   Tamura, K. and Nei, M. (1993) Estimation of the number of nucleotide
 185   substitutions in the control region of mitochondrial DNA in humans and
 186   chimpanzees. \emph{Molecular Biology and Evolution}, \bold{10}, 512--526.
 187 }
 188 \author{Emmanuel Paradis}
 189 \seealso{
 190   \code{\link{read.GenBank}}, \code{\link{read.dna}},
 191   \code{\link{write.dna}},  \code{\link{DNAbin}},
 192   \code{\link{dist.gene}}, \code{\link{cophenetic.phylo}},
 193   \code{\link[stats]{dist}}
 194 }
 195 \keyword{manip}
 196 \keyword{multivariate}