From: paradis <paradis@6e262413-ae40-0410-9e79-b911bd7a66b7>
Date: Thu, 27 Dec 2012 09:48:23 +0000 (+0000)
Subject: new code for reading FASTA files
X-Git-Url: https://git.donarmstrong.com/?p=ape.git;a=commitdiff_plain;h=be6a044200152fd83d0b72f348012a0adc2593c5

new code for reading FASTA files


git-svn-id: https://svn.mpl.ird.fr/ape/dev/ape@202 6e262413-ae40-0410-9e79-b911bd7a66b7
---

diff --git a/DESCRIPTION b/DESCRIPTION
index c2302fd..e84ea2e 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: ape
 Version: 3.0-7
-Date: 2012-12-19
+Date: 2012-12-27
 Title: Analyses of Phylogenetics and Evolution
 Author: Emmanuel Paradis, Ben Bolker, Julien Claude, Hoa Sien Cuong, Richard Desper, Benoit Durand, Julien Dutheil, Olivier Gascuel, Christoph Heibl, Daniel Lawson, Vincent Lefort, Pierre Legendre, Jim Lemon, Yvonnick Noel, Johan Nylander, Rainer Opgen-Rhein, Andrei-Alin Popescu, Klaus Schliep, Korbinian Strimmer, Damien de Vienne
 Maintainer: Emmanuel Paradis <Emmanuel.Paradis@ird.fr>
diff --git a/NEWS b/NEWS
index f6acbff..57d1924 100644
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,10 @@ NEW FEATURES
     o plot.phylo() has a new option, open.angle, used when plotting
       circular trees.
 
+    o The new function read.FASTA reads FASTA files much faster and
+      more efficiently. It is called internally by read.dna(, "fasta")
+      or can be called directly.
+
 
 BUG FIXES
 
@@ -28,6 +32,8 @@ OTHER CHANGES
 
     o .compressTipLabel() is 10 times faster thanks to Joseph Brown.
 
+    o base.freq() is now faster with lists.
+
 
 
 		CHANGES IN APE VERSION 3.0-6
diff --git a/R/DNA.R b/R/DNA.R
index 31ef4bb..d1b3625 100644
--- a/R/DNA.R
+++ b/R/DNA.R
@@ -1,4 +1,4 @@
-## DNA.R (2012-11-28)
+## DNA.R (2012-12-27)
 
 ##   Manipulations and Comparisons of DNA Sequences
 
@@ -58,21 +58,8 @@ as.alignment <- function(x)
 
 "[.DNAbin" <- function(x, i, j, drop = FALSE)
 {
-    oc <- oldClass(x)
-    class(x) <- NULL
-    if (is.matrix(x)) {
-        if (nargs() == 2 && !missing(i)) ans <- x[i]
-        else {
-            nd <- dim(x)
-            if (missing(i)) i <- 1:nd[1]
-            if (missing(j)) j <- 1:nd[2]
-            ans <- x[i, j, drop = drop]
-        }
-    } else {
-        if (missing(i)) i <- 1:length(x)
-        ans <- x[i]
-    }
-    class(ans) <- oc
+    ans <- NextMethod("[", drop = drop)
+    class(ans) <- "DNAbin"
     ans
 }
 
@@ -293,10 +280,19 @@ as.character.DNAbin <- function(x, ...)
 
 base.freq <- function(x, freq = FALSE, all = FALSE)
 {
-    if (is.list(x)) x <- unlist(x)
-    n <- length(x)
-    BF <-.C("BaseProportion", x, as.integer(n), double(17),
-            DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")[[3]]
+    f <- function(x)
+        .C("BaseProportion", x, as.integer(length(x)), double(17),
+           DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")[[3]]
+
+    if (is.list(x)) {
+        BF <- rowSums(sapply(x, f))
+        n <- sum(sapply(x, length))
+    } else {
+        n <- length(x)
+        BF <-.C("BaseProportion", x, as.integer(n), double(17),
+                DUP = FALSE, NAOK = TRUE, PACKAGE = "ape")[[3]]
+    }
+
     names(BF) <- c("a", "c", "g", "t", "r", "m", "w", "s",
                    "k", "y", "v", "h", "d", "b", "n", "-", "?")
     if (all) {
diff --git a/R/read.dna.R b/R/read.dna.R
index ca4da7a..9e46cba 100644
--- a/R/read.dna.R
+++ b/R/read.dna.R
@@ -1,4 +1,4 @@
-## read.dna.R (2012-05-03)
+## read.dna.R (2012-12-27)
 
 ##   Read DNA Sequences in a File
 
@@ -7,6 +7,19 @@
 ## This file is part of the R-package `ape'.
 ## See the file ../COPYING for licensing issues.
 
+read.FASTA <- function(file)
+{
+    sz <- file.info(file)$size
+    x <- readBin(file, "raw", sz)
+    if (Sys.info()[1] == "Windows") {
+        icr <- which(x == as.raw(0x0d)) # CR
+        x <- x[-icr]
+    }
+    res <- .Call("rawStreamToDNAbin", x, PACKAGE = "ape")
+    class(res) <- "DNAbin"
+    res
+}
+
 read.dna <- function(file, format = "interleaved", skip = 0,
                      nlines = 0, comment.char = "#",
                      as.character = FALSE, as.matrix = NULL)
@@ -29,84 +42,74 @@ read.dna <- function(file, format = "interleaved", skip = 0,
     }
     formats <- c("interleaved", "sequential", "fasta", "clustal")
     format <- match.arg(format, formats)
-    phylip <- if (format %in% formats[1:2]) TRUE else FALSE
-    X <- scan(file = file, what = "", sep = "\n", quiet = TRUE,
-              skip = skip, nlines = nlines, comment.char = comment.char)
+    if (format == "fasta") {
+        obj <- read.FASTA(file)
+    } else {
+        X <- scan(file = file, what = "", sep = "\n", quiet = TRUE,
+                  skip = skip, nlines = nlines, comment.char = comment.char)
 
-    if (phylip) {
-        ## need to remove the possible leading spaces and/or tabs in the first line
-        fl <- gsub("^[[:blank:]]+", "", X[1])
-        fl <- as.numeric(unlist(strsplit(fl, "[[:blank:]]+")))
-        if (length(fl) != 2 || any(is.na(fl)))
-            stop("the first line of the file must contain the dimensions of the data")
-        n <- fl[1]
-        s <- fl[2]
-        obj <- matrix("", n, s)
-        X <- X[-1]
-    }
-    switch(format,
-    "interleaved" = {
-        start.seq <- findFirstNucleotide(X[1])
-        one2n <- 1:n
-        taxa <- getTaxaNames(substr(X[one2n], 1, start.seq - 1))
-        X[one2n] <- substr(X[one2n], start.seq, nchar(X[one2n]))
-        nl <- length(X)
-        for (i in one2n)
-            obj[i, ] <- getNucleotide(X[seq(i, nl, n)])
-    },
-    "sequential" = {
-        taxa <- character(n)
-        j <- 1L # line number
-        for (i in 1:n) {
-            start.seq <- findFirstNucleotide(X[j])
-            taxa[i] <- getTaxaNames(substr(X[j], 1, start.seq - 1))
-            sequ <- getNucleotide(substr(X[j], start.seq, nchar(X[j])))
-            j <- j + 1L
-            while (length(sequ) < s) {
-                sequ <- c(sequ, getNucleotide(X[j]))
-                j <- j + 1L
-            }
-            obj[i, ] <- sequ
+        if (format %in% formats[1:2]) {
+            ## need to remove the possible leading spaces and/or tabs in the first line
+            fl <- gsub("^[[:blank:]]+", "", X[1])
+            fl <- as.numeric(unlist(strsplit(fl, "[[:blank:]]+")))
+            if (length(fl) != 2 || any(is.na(fl)))
+                stop("the first line of the file must contain the dimensions of the data")
+            n <- fl[1]
+            s <- fl[2]
+            obj <- matrix("", n, s)
+            X <- X[-1]
         }
-        taxa <- getTaxaNames(taxa)
-    },
-    "fasta" = {
-        start <- grep("^ {0,}>", X)
-        taxa <- X[start]
-        taxa <- sub("^ {0,}>", "", taxa) # remove the hook and the spaces before
-        taxa <- getTaxaNames(taxa)
-        n <- length(taxa)
-        obj <- vector("list", n)
-        start <- c(start, length(X) + 1) # this avoids the following to crash when `i = n'
-        for (i in 1:n)
-            obj[[i]] <- getNucleotide(X[(start[i] + 1):(start[i + 1] - 1)])
-    },
-    "clustal" = {
-        X <- X[-1] # drop the line with "Clustal bla bla..."
-        ## find where the 1st sequence starts
-        start.seq <- findFirstNucleotide(X[1])
-        ## find the lines with *********....
-        nspaces <- paste("^ {", start.seq - 1, "}", sep = "", collapse = "")
-        stars <- grep(nspaces, X)
-        ## we now know how many sequences in the file:
-        n <- stars[1] - 1
-        taxa <- getTaxaNames(substr(X[1:n], 1, start.seq - 1))
-        ## need to remove the sequence names before getting the sequences:
-        X <- substr(X, start.seq, nchar(X))
-        nl <- length(X)
-        ## find the length of the 1st sequence:
-        tmp <- getNucleotide(X[seq(1, nl, n + 1)])
-        s <- length(tmp)
-        obj <- matrix("", n, s)
-        obj[1, ] <- tmp
-        for (i in 2:n)
-            obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)])
-    })
+        switch(format,
+               "interleaved" = {
+                   start.seq <- findFirstNucleotide(X[1])
+                   one2n <- 1:n
+                   taxa <- getTaxaNames(substr(X[one2n], 1, start.seq - 1))
+                   X[one2n] <- substr(X[one2n], start.seq, nchar(X[one2n]))
+                   nl <- length(X)
+                   for (i in one2n)
+                       obj[i, ] <- getNucleotide(X[seq(i, nl, n)])
+               },
+               "sequential" = {
+                   taxa <- character(n)
+                   j <- 1L # line number
+                   for (i in 1:n) {
+                       start.seq <- findFirstNucleotide(X[j])
+                       taxa[i] <- getTaxaNames(substr(X[j], 1, start.seq - 1))
+                       sequ <- getNucleotide(substr(X[j], start.seq, nchar(X[j])))
+                       j <- j + 1L
+                       while (length(sequ) < s) {
+                           sequ <- c(sequ, getNucleotide(X[j]))
+                           j <- j + 1L
+                       }
+                       obj[i, ] <- sequ
+                   }
+                   taxa <- getTaxaNames(taxa)
+               },
+               "clustal" = {
+                   X <- X[-1] # drop the line with "Clustal bla bla..."
+                   ## find where the 1st sequence starts
+                   start.seq <- findFirstNucleotide(X[1])
+                   ## find the lines with *********....
+                   nspaces <- paste("^ {", start.seq - 1, "}", sep = "", collapse = "")
+                   stars <- grep(nspaces, X)
+                   ## we now know how many sequences in the file:
+                   n <- stars[1] - 1
+                   taxa <- getTaxaNames(substr(X[1:n], 1, start.seq - 1))
+                   ## need to remove the sequence names before getting the sequences:
+                   X <- substr(X, start.seq, nchar(X))
+                   nl <- length(X)
+                   ## find the length of the 1st sequence:
+                   tmp <- getNucleotide(X[seq(1, nl, n + 1)])
+                   s <- length(tmp)
+                   obj <- matrix("", n, s)
+                   obj[1, ] <- tmp
+                   for (i in 2:n)
+                       obj[i, ] <- getNucleotide(X[seq(i, nl, n + 1)])
+               })
 
     if (format != "fasta") {
         rownames(obj) <- taxa
     } else {
-        names(obj) <- taxa
         LENGTHS <- unique(unlist(lapply(obj, length)))
         allSameLength <- length(LENGTHS) == 1
         if (is.logical(as.matrix)) {
diff --git a/man/read.dna.Rd b/man/read.dna.Rd
index 2863ad8..19bab40 100644
--- a/man/read.dna.Rd
+++ b/man/read.dna.Rd
@@ -1,10 +1,12 @@
 \name{read.dna}
 \alias{read.dna}
+\alias{read.FASTA}
 \title{Read DNA Sequences in a File}
 \usage{
 read.dna(file, format = "interleaved", skip = 0,
          nlines = 0, comment.char = "#",
          as.character = FALSE, as.matrix = NULL)
+read.FASTA(file)
 }
 \arguments{
   \item{file}{a file name specified by either a variable of mode character,
@@ -79,6 +81,8 @@ read.dna(file, format = "interleaved", skip = 0,
   a matrix or a list (if \code{format = "fasta"}) of DNA sequences
   stored in binary format, or of mode character (if \code{as.character =
     "TRUE"}).
+
+  \code{read.FASTA} always returns a list.
 }
 \references{
   Anonymous. FASTA format description.
diff --git a/src/read_dna.c b/src/read_dna.c
new file mode 100644
index 0000000..6d44c66
--- /dev/null
+++ b/src/read_dna.c
@@ -0,0 +1,141 @@
+/* read_dna.c       2012-12-27 */
+
+/* Copyright 2008-2012 Emmanuel Paradis */
+
+/* This file is part of the R-package `ape'. */
+/* See the file ../COPYING for licensing issues. */
+
+#include <R.h>
+#include <Rinternals.h>
+
+// The initial code defining and initialising the translation table:
+//
+//	for (i = 0; i < 122; i++) tab_trans[i] = 0x00;
+//
+//	tab_trans[65] = 0x88; /* A */
+//	tab_trans[71] = 0x48; /* G */
+//	tab_trans[67] = 0x28; /* C */
+// 	tab_trans[84] = 0x18; /* T */
+// 	tab_trans[82] = 0xc0; /* R */
+// 	tab_trans[77] = 0xa0; /* M */
+// 	tab_trans[87] = 0x90; /* W */
+// 	tab_trans[83] = 0x60; /* S */
+// 	tab_trans[75] = 0x50; /* K */
+// 	tab_trans[89] = 0x30; /* Y */
+// 	tab_trans[86] = 0xe0; /* V */
+// 	tab_trans[72] = 0xb0; /* H */
+// 	tab_trans[68] = 0xd0; /* D */
+//  	tab_trans[66] = 0x70; /* B */
+// 	tab_trans[78] = 0xf0; /* N */
+//
+//	tab_trans[97] = 0x88; /* a */
+//	tab_trans[103] = 0x48; /* g */
+//	tab_trans[99] = 0x28; /* c */
+// 	tab_trans[116] = 0x18; /* t */
+// 	tab_trans[114] = 0xc0; /* r */
+// 	tab_trans[109] = 0xa0; /* m */
+// 	tab_trans[119] = 0x90; /* w */
+// 	tab_trans[115] = 0x60; /* s */
+// 	tab_trans[107] = 0x50; /* k */
+// 	tab_trans[121] = 0x30; /* y */
+// 	tab_trans[118] = 0xe0; /* v */
+// 	tab_trans[104] = 0xb0; /* h */
+// 	tab_trans[100] = 0xd0; /* d */
+//  	tab_trans[98] = 0x70; /* b */
+// 	tab_trans[110] = 0xf0; /* n */
+//
+//  	tab_trans[45] = 0x04; /* - */
+//  	tab_trans[63] = 0x02; /* ? */
+
+static const unsigned char tab_trans[] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0-9 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10-19 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 20-29 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 30-39 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, /* 40-49 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 50-59 */
+	0x00, 0x00, 0x00, 0x02, 0x00, 0x88, 0x70, 0x28, 0xd0, 0x00, /* 60-69 */
+	0x48, 0x00, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, 0xf0, 0x00, /* 70-79 */
+	0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, 0x00, 0x30, /* 80-89 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x70, 0x28, /* 90-99 */
+	0xd0, 0x00, 0x00, 0x48, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, /* 100-109 */
+	0xf0, 0x00, 0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, /* 110-119 */
+	0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 120-129 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 130-139 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 140-149 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 150-159 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 160-169 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 170-179 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 180-189 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 190-199 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 200-209 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 210-219 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 220-229 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 230-239 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 240-249 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; /* 250-255 */
+
+static const unsigned char hook = 0x3e;
+static const unsigned char lineFeed = 0x0a;
+/* static const unsigned char space = 0x20; */
+
+SEXP rawStreamToDNAbin(SEXP x)
+{
+	int N, i, j, k, n;
+	unsigned char *xr, *rseq, *buffer, tmp;
+	SEXP obj, nms, seq;
+
+	PROTECT(x = coerceVector(x, RAWSXP));
+	N = LENGTH(x);
+	xr = RAW(x);
+
+/* do a 1st pass to find the number of sequences */
+
+	n = 0;
+	j = 0; /* use j as a flag */
+	for (i = 0; i < N; i++) {
+		if (j && xr[i] == lineFeed) {
+			n++;
+			j = 0;
+		} else if (xr[i] == hook) j = 1;
+	}
+
+	PROTECT(obj = allocVector(VECSXP, n));
+	PROTECT(nms = allocVector(STRSXP, n));
+
+/* Refine the way the size of the buffer is set? */
+	buffer = (unsigned char *)R_alloc(N, sizeof(unsigned char *));
+
+	i = j = 0;
+	while (i < N) {
+		/* 1st read the label... */
+		if (xr[i] == hook) {
+			i++;
+			k = 0;
+			while (xr[i] != lineFeed) buffer[k++] = xr[i++];
+			buffer[k] = '\0';
+			SET_STRING_ELT(nms, j, mkChar(buffer));
+		}
+		/* ... then read the sequence */
+		n = 0;
+		while (xr[i] != hook && i < N) {
+			tmp = tab_trans[xr[i++]];
+/* If we are sure that the FASTA file is correct (ie, the sequence on
+   a single line and only the IUAPC code (plus '-' and '?') is used,
+   then the following check would not be needed; additionally the size
+   of tab_trans could be restriced to 0-121. This check has the
+   advantage that all invalid characters are simply ignored without
+   causing error. */
+			if (tmp) buffer[n++] = tmp;
+		}
+		PROTECT(seq = allocVector(RAWSXP, n));
+		rseq = RAW(seq);
+		for (k = 0; k < n; k++) rseq[k] = buffer[k];
+		SET_VECTOR_ELT(obj, j, seq);
+		UNPROTECT(1);
+		j++;
+	}
+	setAttrib(obj, R_NamesSymbol, nms);
+	UNPROTECT(3);
+	return obj;
+}