--- /dev/null
+[[!meta title="Shrinking lists of gene names in R"]]
+
+I've been trying to finish a paper where I compare gene expression in
+14 different placentas. One of the supplemental figures compares
+median expression in gene trees across all 14 species, but because
+tree names like ENT000001 aren't very expressive, and names like
+"COL11A2, COL5A3, COL4A1, COL1A1, COL2A1, COL1A2, COL4A6, COL4A5,
+COL7A1, COL27A1, COL11A1, COL4A4, COL4A3, COL3A1, COL4A2, COL5A2,
+COL5A1, COL24A1" take up too much space, wanted a function which could
+collapse the gene names into something which uses bash glob syntax to
+more succinctly list the gene names, like:
+COL{11A{1,2},1A{1,2},24A1,27A1,2A1,3A1,4A{1,2,3,4,5,6},5A{1,2,3},7A1}.
+
+Thus, a crazy function which uses `lcprefix` from `Biostrings` and
+some looping was born:
+
+[[!format R """
+collapse.gene.names <- function(x,min.collapse=2) {
+ ## longest common substring
+ if (is.null(x) || length(x)==0) {
+ return(as.character(NA))
+ }
+ x <- sort(unique(x))
+ str_collapse <- function(y,len) {
+ if (len == 1 || length(y) < 2) {
+ return(y)
+ }
+ y.tree <-
+ gsub(paste0("^(.{",len,"}).*$"),"\\1",y[1])
+ y.rem <-
+ gsub(paste0("^.{",len,"}"),"",y)
+ y.rem.prefix <-
+ sum(combn(y.rem,2,function(x){Biostrings::lcprefix(x[1],x[2])}) >= 2)
+ if (length(y.rem) > 3 &&
+ y.rem.prefix >= 2
+ ) {
+ y.rem <-
+ collapse.gene.names(y.rem,min.collapse=1)
+ }
+ paste0(y.tree,
+ "{",paste(collapse=",",
+ y.rem),"}")
+ }
+ i <- 1
+ ret <- NULL
+ while (i <= length(x)) {
+ col.pmin <-
+ pmin(sapply(x,Biostrings::lcprefix,x[i]))
+ collapseable <-
+ which(col.pmin > min.collapse)
+ if (length(collapseable) == 0) {
+ ret <- c(ret,x[i])
+ i <- i+1
+ } else {
+ ret <- c(ret,
+ str_collapse(x[collapseable],
+ min(col.pmin[collapseable]))
+ )
+ i <- max(collapseable)+1
+ }
+ }
+ return(paste0(collapse=",",ret))
+}
+"""]]
+
+[[!tag genetics biology tech R]]