man/dist.dna.Rd

   1 \name{dist.dna}
   2 \alias{dist.dna}
   3 \title{Pairwise Distances from DNA Sequences}
   4 \usage{
   5 dist.dna(x, model = "K80", variance = FALSE,
   6          gamma = FALSE, pairwise.deletion = FALSE,
   7          base.freq = NULL, as.matrix = FALSE)
   8 }
   9 \arguments{
  10   \item{x}{a matrix or a list containing the DNA sequences.}
  11   \item{model}{a character string specifying the evolutionary model to be
  12     used; must be one of \code{"raw"}, \code{"N"}, \code{"JC69"},
  13     \code{"K80"} (the default), \code{"F81"}, \code{"K81"},
  14     \code{"F84"}, \code{"BH87"}, \code{"T92"}, \code{"TN93"},
  15     \code{"GG95"}, \code{"logdet"}, or \code{"paralin"}.}
  16   \item{variance}{a logical indicating whether to compute the variances
  17     of the distances; defaults to \code{FALSE} so the variances are not
  18     computed.}
  19   \item{gamma}{a value for the gamma parameter which is possibly used to
  20     apply a gamma correction to the distances (by default \code{gamma =
  21       FALSE} so no correction is applied).}
  22   \item{pairwise.deletion}{a logical indicating whether to delete the
  23     sites with missing data in a pairwise way. The default is to delete
  24     the sites with at least one missing data for all sequences.}
  25   \item{base.freq}{the base frequencies to be used in the computations
  26     (if applicable, i.e. if \code{model = "F84"}). By default, the
  27     base frequencies are computed from the whole sample of sequences.}
  28   \item{as.matrix}{a logical indicating whether to return the results as
  29     a matrix. The default is to return an object of class
  30     \link[stats]{dist}.}
  31 }
  32 \description{
  33   This function computes a matrix of pairwise distances from DNA
  34   sequences using a model of DNA evolution. Eleven substitution models
  35   (and the raw distance) are currently available.
  36 }
  37 \details{
  38   The molecular evolutionary models available through the option
  39   \code{model} have been extensively described in the literature. A
  40   brief description is given below; more details can be found in the
  41   References.
  42
  43 \describe{
  44   \item{``raw'', ``N''}{This is simply the proportion or the number of
  45     sites that differ between each pair of sequences. This may be useful
  46     to draw ``saturation plots''. The options \code{variance} and
  47     \code{gamma} have no effect, but \code{pairwise.deletion} can.}
  48
  49   \item{``JC69''}{This model was developed by Jukes and Cantor (1969). It
  50     assumes that all substitutions (i.e. a change of a base by another
  51     one) have the same probability. This probability is the same for all
  52     sites along the DNA sequence. This last assumption can be relaxed by
  53     assuming that the substitution rate varies among sites following a
  54     gamma distribution whose parameter must be given by the user. By
  55     default, no gamma correction is applied. Another assumption is that
  56     the base frequencies are balanced and thus equal to 0.25.}
  57
  58   \item{``K80''}{The distance derived by Kimura (1980), sometimes referred
  59     to as ``Kimura's 2-parameters distance'', has the same underlying
  60     assumptions as the Jukes--Cantor distance except that two kinds of
  61     substitutions are considered: transitions (A <-> G, C <-> T), and
  62     transversions (A <-> C, A <-> T, C <-> G, G <-> T). They are assumed
  63     to have different probabilities. A transition is the substitution of
  64     a purine (A, G) by another one, or the substitution of a pyrimidine
  65     (C, T) by another one. A transversion is the substitution of a
  66     purine by a pyrimidine, or vice-versa. Both transition and
  67     transversion rates are the same for all sites along the DNA
  68     sequence. Jin and Nei (1990) modified the Kimura model to allow for
  69     variation among sites following a gamma distribution. Like for the
  70     Jukes--Cantor model, the gamma parameter must be given by the
  71     user. By default, no gamma correction is applied.}
  72
  73   \item{``F81''}{Felsenstein (1981) generalized the Jukes--Cantor model
  74     by relaxing the assumption of equal base frequencies. The formulae
  75     used in this function were taken from McGuire et al. (1999).}
  76
  77   \item{``K81''}{Kimura (1981) generalized his model (Kimura 1980) by
  78     assuming different rates for two kinds of transversions: A <-> C and
  79     G <-> T on one side, and A <-> T and C <-> G on the other. This is
  80     what Kimura called his ``three substitution types model'' (3ST), and
  81     is sometimes referred to as ``Kimura's 3-parameters distance''.}
  82
  83   \item{``F84''}{This model generalizes K80 by relaxing the assumption
  84     of equal base frequencies. It was first introduced by Felsenstein in
  85     1984 in Phylip, and is fully described by Felsenstein and Churchill
  86     (1996). The formulae used in this function were taken from McGuire
  87     et al. (1999).}
  88
  89   \item{``BH87''}{Barry and Hartigan (1987) developed a distance based
  90     on the observed proportions of changes among the four bases. This
  91     distance is not symmetric.}
  92
  93   \item{``T92''}{Tamura (1992) generalized the Kimura model by relaxing
  94     the assumption of equal base frequencies. This is done by taking
  95     into account the bias in G+C content in the sequences. The
  96     substitution rates are assumed to be the same for all sites along
  97     the DNA sequence.}
  98
  99   \item{``TN93''}{Tamura and Nei (1993) developed a model which assumes
 100     distinct rates for both kinds of transition (A <-> G versus C <->
 101     T), and transversions. The base frequencies are not assumed to be
 102     equal and are estimated from the data. A gamma correction of the
 103     inter-site variation in substitution rates is possible.}
 104
 105   \item{``GG95''}{Galtier and Gouy (1995) introduced a model where the
 106     G+C content may change through time. Different rates are assumed for
 107     transitions and transversions.}
 108
 109   \item{``logdet''}{The Log-Det distance, developed by Lockhart et
 110     al. (1994), is related to BH87. However, this distance is symmetric.}
 111
 112   \item{``paralin''}{Lake (1994) developed the paralinear distance which
 113     can be viewed as another variant of the Barry--Hartigan distance.}
 114 }}
 115 \value{
 116   an object of class \link[stats]{dist} (by default), or a numeric
 117   matrix if \code{as.matrix = TRUE}. If \code{model = "BH87"}, a numeric
 118   matrix is returned because the Barry--Hartigan distance is not
 119   symmetric.
 120
 121   If \code{variance = TRUE} an attribute called \code{"variance"} is
 122   given to the returned object.
 123 }
 124 \references{
 125   Barry, D. and Hartigan, J. A. (1987) Asynchronous distance between
 126   homologous DNA sequences. \emph{Biometrics}, \bold{43}, 261--276.
 127
 128   Felsenstein, J. (1981) Evolutionary trees from DNA sequences: a
 129   maximum likelihood approach. \emph{Journal of Molecular Evolution},
 130   \bold{17}, 368--376.
 131
 132   Felsenstein, J. and Churchill, G. A. (1996) A Hidden Markov model
 133   approach to variation among sites in rate of evolution.
 134   \emph{Molecular Biology and Evolution}, \bold{13}, 93--104.
 135
 136   Galtier, N. and Gouy, M. (1995) Inferring phylogenies from DNA
 137   sequences of unequal base compositions. \emph{Proceedings of the
 138     National Academy of Sciences USA}, \bold{92}, 11317--11321.
 139
 140   Jukes, T. H. and Cantor, C. R. (1969) Evolution of protein
 141   molecules. In \emph{Mammalian Protein Metabolism}, ed. Munro, H. N.,
 142   pp. 21--132, New York: Academic Press.
 143
 144   Kimura, M. (1980) A simple method for estimating evolutionary rates of
 145   base substitutions through comparative studies of nucleotide
 146   sequences. \emph{Journal of Molecular Evolution}, \bold{16}, 111--120.
 147
 148   Kimura, M. (1981) Estimation of evolutionary distances between
 149   homologous nucleotide sequences. \emph{Proceedings of the National
 150     Academy of Sciences USA}, \bold{78}, 454--458.
 151
 152   Jin, L. and Nei, M. (1990) Limitations of the evolutionary parsimony
 153   method of phylogenetic analysis. \emph{Molecular Biology and
 154     Evolution}, \bold{7}, 82--102.
 155
 156   Lake, J. A. (1994) Reconstructing evolutionary trees from DNA and
 157   protein sequences: paralinear distances. \emph{Proceedings of the
 158     National Academy of Sciences USA}, \bold{91}, 1455--1459.
 159
 160   Lockhart, P. J., Steel, M. A., Hendy, M. D. and Penny, D. (1994)
 161   Recovering evolutionary trees under a more realistic model of sequence
 162   evolution. \emph{Molecular Biology and Evolution}, \bold{11},
 163   605--612.
 164
 165   McGuire, G., Prentice, M. J. and Wright, F. (1999). Improved error
 166   bounds for genetic distances from DNA sequences. \emph{Biometrics},
 167   \bold{55}, 1064--1070.
 168
 169   Tamura, K. (1992) Estimation of the number of nucleotide substitutions
 170   when there are strong transition-transversion and G + C-content
 171   biases. \emph{Molecular Biology and Evolution}, \bold{9}, 678--687.
 172
 173   Tamura, K. and Nei, M. (1993) Estimation of the number of nucleotide
 174   substitutions in the control region of mitochondrial DNA in humans and
 175   chimpanzees. \emph{Molecular Biology and Evolution}, \bold{10}, 512--526.
 176 }
 177 \author{Emmanuel Paradis \email{Emmanuel.Paradis@mpl.ird.fr}}
 178 \seealso{
 179   \code{\link{read.GenBank}}, \code{\link{read.dna}},
 180   \code{\link{write.dna}},  \code{\link{DNAbin}},
 181   \code{\link{dist.gene}}, \code{\link{cophenetic.phylo}},
 182   \code{\link[stats]{dist}}
 183 }
 184 \keyword{manip}
 185 \keyword{multivariate}