write shrinking gene names post
authorDon Armstrong <don@donarmstrong.com>
Thu, 23 Mar 2017 20:44:55 +0000 (13:44 -0700)
committerDon Armstrong <don@donarmstrong.com>
Thu, 23 Mar 2017 20:45:37 +0000 (15:45 -0500)
posts/shrinking_gene_names.mdwn [new file with mode: 0644]

diff --git a/posts/shrinking_gene_names.mdwn b/posts/shrinking_gene_names.mdwn
new file mode 100644 (file)
index 0000000..7d2be06
--- /dev/null
@@ -0,0 +1,66 @@
+[[!meta title="Shrinking lists of gene names in R"]]
+
+I've been trying to finish a paper where I compare gene expression in
+14 different placentas. One of the supplemental figures compares
+median expression in gene trees across all 14 species, but because
+tree names like ENT000001 aren't very expressive, and names like
+"COL11A2, COL5A3, COL4A1, COL1A1, COL2A1, COL1A2, COL4A6, COL4A5,
+COL7A1, COL27A1, COL11A1, COL4A4, COL4A3, COL3A1, COL4A2, COL5A2,
+COL5A1, COL24A1" take up too much space, wanted a function which could
+collapse the gene names into something which uses bash glob syntax to
+more succinctly list the gene names, like:
+COL{11A{1,2},1A{1,2},24A1,27A1,2A1,3A1,4A{1,2,3,4,5,6},5A{1,2,3},7A1}.
+
+Thus, a crazy function which uses `lcprefix` from `Biostrings` and
+some looping was born:
+
+[[!format R """
+collapse.gene.names <- function(x,min.collapse=2) {
+    ## longest common substring
+    if (is.null(x) || length(x)==0) {
+        return(as.character(NA))
+    }
+    x <- sort(unique(x))
+    str_collapse <- function(y,len) {
+        if (len == 1 || length(y) < 2) {
+            return(y)
+        }
+        y.tree <-
+            gsub(paste0("^(.{",len,"}).*$"),"\\1",y[1])
+        y.rem <-
+            gsub(paste0("^.{",len,"}"),"",y)
+        y.rem.prefix <-
+            sum(combn(y.rem,2,function(x){Biostrings::lcprefix(x[1],x[2])}) >= 2)
+        if (length(y.rem) > 3 &&
+            y.rem.prefix >= 2
+            ) {
+            y.rem <- 
+                collapse.gene.names(y.rem,min.collapse=1)
+        }
+        paste0(y.tree,
+               "{",paste(collapse=",",
+                         y.rem),"}")
+    }
+    i <- 1
+    ret <- NULL
+    while (i <= length(x)) {
+        col.pmin <-
+            pmin(sapply(x,Biostrings::lcprefix,x[i]))
+        collapseable <-
+            which(col.pmin > min.collapse)
+        if (length(collapseable) == 0) {
+            ret <- c(ret,x[i])
+            i <- i+1
+        } else {
+            ret <- c(ret,
+                     str_collapse(x[collapseable],
+                                  min(col.pmin[collapseable]))
+                     )
+            i <- max(collapseable)+1
+        }
+    }
+    return(paste0(collapse=",",ret))
+}
+"""]]
+
+[[!tag genetics biology tech R]]