## R/read.GenBank.R

## read.GenBank.R (2010-07-22)

##   Read DNA Sequences from GenBank via Internet

## Copyright 2002-2010 Emmanuel Paradis

## This file is part of the R-package `ape'.
## See the file ../COPYING for licensing issues.

  10 read.GenBank <-
  11     function(access.nb, seq.names = access.nb, species.names = TRUE,
  12              gene.names = FALSE, as.character = FALSE)
  13 {
  14     N <- length(access.nb)
  15     ## If there are more than 400 sequences, we need to break down the
  16     ## requests, otherwise there is a segmentation fault.
  17     nrequest <- N %/% 400 + as.logical(N %% 400)
  18     X <- character(0)
  19     for (i in 1:nrequest) {
  20         a <- (i - 1) * 400 + 1
  21         b <- 400 * i
  22         if (i == nrequest) b <- N
  23         URL <- paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=",
  24                      paste(access.nb[a:b], collapse = ","),
  25                      "&rettype=gb", sep = "")
  26         X <- c(X, scan(file = URL, what = "", sep = "\n", quiet = TRUE))
  27     }
  28     FI <- grep("^ {0,}ORIGIN", X) + 1
  29     LA <- which(X == "//") - 1
  30     obj <- vector("list", N)
  31     for (i in 1:N) {
  32         ## remove all spaces and digits
  33         tmp <- gsub("[[:digit:] ]", "", X[FI[i]:LA[i]])
  34         obj[[i]] <- unlist(strsplit(tmp, NULL))
  35     }
  36     names(obj) <- seq.names
  37     if (!as.character) obj <- as.DNAbin(obj)
  38     if (species.names) {
  39         tmp <- character(N)
  40         sp <- grep("ORGANISM", X)
  41         for (i in 1:N)
  42             tmp[i] <- unlist(strsplit(X[sp[i]], " +ORGANISM +"))[2]
  43         attr(obj, "species") <- gsub(" ", "_", tmp)
  44     }
  45     if (gene.names) {
  46         tmp <- character(N)
  47         sp <- grep(" +gene +<", X)
  48         for (i in 1:N)
  49             tmp[i] <- unlist(strsplit(X[sp[i + 1L]], " +/gene=\""))[2]
  50         attr(obj, "gene") <- gsub("\"$", "", tmp)
  51     }
  52     obj
  53 }