R/read.nexus.data.R

   1 "read.nexus.data" <- function (file)
   2 {
   3     # Simplified NEXUS data parser.
   4     #
   5     # Version: 09/13/2006 01:01:59 PM CEST
   6     #          (modified by EP 2011-06-01)
   7     #
   8     # By:      Johan Nylander, nylander @ scs.fsu.edu
   9     #
  10     # WARNING: This is parser reads a restricted nexus format,
  11     #          see README for details.
  12     #
  13     # Argument (x) is a nexus formatted data file.
  14     #
  15     # Returns  (Value) a list of data sequences each made of a single
  16     #          vector of mode character where each element is a character.
  17     #
  18     # TODO:    Error checking, gap/missing, find.datatype, etc.
  19     #------------------------------------------------------------------
  20
  21     "find.ntax" <- function (x)
  22     {
  23         for (i in 1:NROW(x)) {
  24             if(any(f <- grep("\\bntax", x[i], ignore.case = TRUE))) {
  25                 ntax <- as.numeric(sub("(.+?)(ntax\\s*\\=\\s*)(\\d+)(.+)",
  26                                        "\\3", x[i], perl = TRUE, ignore.case = TRUE))
  27                 break
  28             }
  29         }
  30         ntax
  31     }
  32
  33     "find.nchar" <- function (x)
  34     {
  35         for (i in 1:NROW(x)) {
  36             if(any(f <- grep("\\bnchar", x[i], ignore.case = TRUE))) {
  37                 nchar <- as.numeric(sub("(.+?)(nchar\\s*\\=\\s*)(\\d+)(.+)",
  38                                         "\\3", x[i], perl = TRUE, ignore.case = TRUE))
  39                 break
  40             }
  41         }
  42         nchar
  43     }
  44
  45     "find.matrix.line" <- function (x)
  46     {
  47         for (i in 1:NROW(x)) {
  48             if(any(f <- grep("\\bmatrix\\b", x[i], ignore.case = TRUE))) {
  49                 matrix.line <- as.numeric(i)
  50                 break
  51             }
  52         }
  53         matrix.line
  54     }
  55
  56     "trim.whitespace" <- function (x)
  57     {
  58         gsub("\\s+", "", x)
  59     }
  60
  61     "trim.semicolon" <- function (x)
  62     {
  63         gsub(";", "", x)
  64     }
  65
  66     X <- scan(file = file, what = character(), sep = "\n",
  67               quiet = TRUE, comment.char = "[", strip.white = TRUE)
  68     ntax <- find.ntax(X)
  69     nchar <- find.nchar(X)
  70     matrix.line <- find.matrix.line(X)
  71     start.reading <- matrix.line + 1
  72     Obj <- list()
  73     length(Obj) <- ntax
  74     i <- 1
  75     pos <- 0
  76     tot.nchar <- 0
  77     tot.ntax <- 0
  78
  79     for (j in start.reading:NROW(X)) {
  80         Xj <- trim.semicolon(X[j])
  81         if(Xj == "") {
  82             break
  83         }
  84         if(any(jtmp <- grep("\\bend\\b", X[j], perl = TRUE, ignore.case = TRUE))) {
  85             break
  86         }
  87         ts <- unlist(strsplit(Xj, "(?<=\\S)(\\s+)(?=\\S)", perl = TRUE))
  88         if (length(ts) > 2) {
  89             stop("nexus parser does not handle spaces in sequences or taxon names (ts>2)")
  90         }
  91         if (length(ts) !=2) {
  92             stop("nexus parser failed to read the sequences (ts!=2)")
  93         }
  94         Seq <- trim.whitespace(ts[2])
  95         Name <- trim.whitespace(ts[1])
  96         nAME <- paste(c("\\b", Name, "\\b"), collapse = "")
  97         if (any(l <- grep(nAME, names(Obj)))) {
  98             tsp <- strsplit(Seq, NULL)[[1]]
  99             for (k in 1:length(tsp)) {
 100                 p <- k + pos
 101                 Obj[[l]][p] <- tsp[k]
 102                 chars.done <- k
 103             }
 104         }
 105         else {
 106             names(Obj)[i] <- Name
 107             tsp <- strsplit(Seq, NULL)[[1]]
 108             for (k in 1:length(tsp)) {
 109                 p <- k + pos
 110                 Obj[[i]][p] <- tsp[k]
 111                 chars.done <- k
 112             }
 113         }
 114         tot.ntax <- tot.ntax + 1
 115         if (tot.ntax == ntax) {
 116             i <- 1
 117             tot.ntax <- 0
 118             tot.nchar <- tot.nchar + chars.done
 119             if (tot.nchar == nchar*ntax) {
 120                 print("ntot was more than nchar*ntax")
 121                 break
 122             }
 123             pos <- tot.nchar
 124         }
 125         else {
 126             i <- i + 1
 127         }
 128     }
 129     if (tot.ntax != 0) {
 130         cat("ntax:",ntax,"differ from actual number of taxa in file?\n")
 131         stop("nexus parser did not read names correctly (tot.ntax!=0)")
 132     }
 133     for (i in 1:length(Obj)) {
 134         if (length(Obj[[i]]) != nchar) {
 135             cat(names(Obj[i]),"has",length(Obj[[i]]),"characters\n")
 136             stop("nchar differ from sequence length (length(Obj[[i]])!=nchar)")
 137         }
 138     }
 139     Obj <- lapply(Obj, tolower)
 140     Obj
 141 }
 142