From: Don Armstrong Date: Thu, 23 Mar 2017 20:44:55 +0000 (-0700) Subject: write shrinking gene names post X-Git-Url: https://git.donarmstrong.com/?p=don.git;a=commitdiff_plain;h=41b0016e2504b184cf641c56bec3e8bccc3a9fa8 write shrinking gene names post --- diff --git a/posts/shrinking_gene_names.mdwn b/posts/shrinking_gene_names.mdwn new file mode 100644 index 0000000..7d2be06 --- /dev/null +++ b/posts/shrinking_gene_names.mdwn @@ -0,0 +1,66 @@ +[[!meta title="Shrinking lists of gene names in R"]] + +I've been trying to finish a paper where I compare gene expression in +14 different placentas. One of the supplemental figures compares +median expression in gene trees across all 14 species, but because +tree names like ENT000001 aren't very expressive, and names like +"COL11A2, COL5A3, COL4A1, COL1A1, COL2A1, COL1A2, COL4A6, COL4A5, +COL7A1, COL27A1, COL11A1, COL4A4, COL4A3, COL3A1, COL4A2, COL5A2, +COL5A1, COL24A1" take up too much space, wanted a function which could +collapse the gene names into something which uses bash glob syntax to +more succinctly list the gene names, like: +COL{11A{1,2},1A{1,2},24A1,27A1,2A1,3A1,4A{1,2,3,4,5,6},5A{1,2,3},7A1}. + +Thus, a crazy function which uses `lcprefix` from `Biostrings` and +some looping was born: + +[[!format R """ +collapse.gene.names <- function(x,min.collapse=2) { + ## longest common substring + if (is.null(x) || length(x)==0) { + return(as.character(NA)) + } + x <- sort(unique(x)) + str_collapse <- function(y,len) { + if (len == 1 || length(y) < 2) { + return(y) + } + y.tree <- + gsub(paste0("^(.{",len,"}).*$"),"\\1",y[1]) + y.rem <- + gsub(paste0("^.{",len,"}"),"",y) + y.rem.prefix <- + sum(combn(y.rem,2,function(x){Biostrings::lcprefix(x[1],x[2])}) >= 2) + if (length(y.rem) > 3 && + y.rem.prefix >= 2 + ) { + y.rem <- + collapse.gene.names(y.rem,min.collapse=1) + } + paste0(y.tree, + "{",paste(collapse=",", + y.rem),"}") + } + i <- 1 + ret <- NULL + while (i <= length(x)) { + col.pmin <- + pmin(sapply(x,Biostrings::lcprefix,x[i])) + collapseable <- + which(col.pmin > min.collapse) + if (length(collapseable) == 0) { + ret <- c(ret,x[i]) + i <- i+1 + } else { + ret <- c(ret, + str_collapse(x[collapseable], + min(col.pmin[collapseable])) + ) + i <- max(collapseable)+1 + } + } + return(paste0(collapse=",",ret)) +} +"""]] + +[[!tag genetics biology tech R]]