R/read.nexus.data.R

   1 "read.nexus.data" <- function (file)
   2 {
   3     # Simplified NEXUS data parser.
   4     #
   5     # Version: 09/13/2006 01:01:59 PM CEST
   6     #
   7     # By:      Johan Nylander, nylander @ scs.fsu.edu
   8     #
   9     # WARNING: This is parser reads a restricted nexus format,
  10     #          see README for details.
  11     #
  12     # Argument (x) is a nexus formatted data file.
  13     #
  14     # Returns  (Value) a list of data sequences each made of a single
  15     #          vector of mode character where each element is a character.
  16     #
  17     # TODO:    Error checking, gap/missing, find.datatype, etc.
  18     #------------------------------------------------------------------
  19
  20     "find.ntax" <- function (x)
  21     {
  22         for (i in 1:NROW(x)) {
  23             if(any(f <- grep("\\bntax", x[i], ignore.case = TRUE))) {
  24                 ntax <- as.numeric(sub("(.+?)(ntax\\s*\\=\\s*)(\\d+)(.+)",
  25                                        "\\3", x[i], perl = TRUE, ignore.case = TRUE))
  26                 break
  27             }
  28         }
  29         ntax
  30     }
  31
  32     "find.nchar" <- function (x)
  33     {
  34         for (i in 1:NROW(x)) {
  35             if(any(f <- grep("\\bnchar", x[i], ignore.case = TRUE))) {
  36                 nchar <- as.numeric(sub("(.+?)(nchar\\s*\\=\\s*)(\\d+)(.+)",
  37                                         "\\3", x[i], perl = TRUE, ignore.case = TRUE))
  38                 break
  39             }
  40         }
  41         nchar
  42     }
  43
  44     "find.matrix.line" <- function (x)
  45     {
  46         for (i in 1:NROW(x)) {
  47             if(any(f <- grep("\\bmatrix\\b", x[i], ignore.case = TRUE))) {
  48                 matrix.line <- as.numeric(i)
  49                 break
  50             }
  51         }
  52         matrix.line
  53     }
  54
  55     "trim.whitespace" <- function (x)
  56     {
  57         gsub("\\s+", "", x)
  58     }
  59
  60     "trim.semicolon" <- function (x)
  61     {
  62         gsub(";", "", x)
  63     }
  64
  65     if(file.access(file, mode = 4)) {
  66         stop("file could not be found")
  67     }
  68
  69     X <- scan(file = file, what = character(), sep = "\n",
  70               quiet = TRUE, comment.char = "[", strip.white = TRUE)
  71     ntax <- find.ntax(X)
  72     nchar <- find.nchar(X)
  73     matrix.line <- find.matrix.line(X)
  74     start.reading <- matrix.line + 1
  75     Obj <- list()
  76     length(Obj) <- ntax
  77     i <- 1
  78     pos <- 0
  79     tot.nchar <- 0
  80     tot.ntax <- 0
  81
  82     for (j in start.reading:NROW(X)) {
  83         Xj <- trim.semicolon(X[j])
  84         if(Xj == "") {
  85             break
  86         }
  87         if(any(jtmp <- grep("\\bend\\b", X[j], perl = TRUE, ignore.case = TRUE))) {
  88             break
  89         }
  90         ts <- unlist(strsplit(Xj, "(?<=\\S)(\\s+)(?=\\S)", perl = TRUE))
  91         if (length(ts) > 2) {
  92             stop("nexus parser does not handle spaces in sequences or taxon names (ts>2)")
  93         }
  94         if (length(ts) !=2) {
  95             stop("nexus parser failed to read the sequences (ts!=2)")
  96         }
  97         Seq <- trim.whitespace(ts[2])
  98         Name <- trim.whitespace(ts[1])
  99         nAME <- paste(c("\\b", Name, "\\b"), collapse = "")
 100         if (any(l <- grep(nAME, names(Obj)))) {
 101             tsp <- strsplit(Seq, NULL)[[1]]
 102             for (k in 1:length(tsp)) {
 103                 p <- k + pos
 104                 Obj[[l]][p] <- tsp[k]
 105                 chars.done <- k
 106             }
 107         }
 108         else {
 109             names(Obj)[i] <- Name
 110             tsp <- strsplit(Seq, NULL)[[1]]
 111             for (k in 1:length(tsp)) {
 112                 p <- k + pos
 113                 Obj[[i]][p] <- tsp[k]
 114                 chars.done <- k
 115             }
 116         }
 117         tot.ntax <- tot.ntax + 1
 118         if (tot.ntax == ntax) {
 119             i <- 1
 120             tot.ntax <- 0
 121             tot.nchar <- tot.nchar + chars.done
 122             if (tot.nchar == nchar*ntax) {
 123                 print("ntot was more than nchar*ntax")
 124                 break
 125             }
 126             pos <- tot.nchar
 127         }
 128         else {
 129             i <- i + 1
 130         }
 131     }
 132     if (tot.ntax != 0) {
 133         cat("ntax:",ntax,"differ from actual number of taxa in file?\n")
 134         stop("nexus parser did not read names correctly (tot.ntax!=0)")
 135     }
 136     for (i in 1:length(Obj)) {
 137         if (length(Obj[[i]]) != nchar) {
 138             cat(names(Obj[i]),"has",length(Obj[[i]]),"characters\n")
 139             stop("nchar differ from sequence length (length(Obj[[i]])!=nchar)")
 140         }
 141     }
 142     Obj <- lapply(Obj, tolower)
 143     Obj
 144 }
 145