git.donarmstrong.com Git - imprinted_genes.git/commitdiff
add rules to build combined imprinted genes file
author Don Armstrong <don@donarmstrong.com>
Fri, 8 May 2015 17:58:17 +0000 (10:58 -0700)
committer Don Armstrong <don@donarmstrong.com>
Fri, 8 May 2015 17:58:17 +0000 (10:58 -0700)
Makefile
combine_imprinted_genes.R [new file with mode: 0644]

diff --git a/Makefile b/Makefile
index cb3d0351a80415120c5430118aba313fb4c0c059..b87aa21f22911187f642ab3af9f2eba75ce82c78 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,8 @@
 #!/usr/bin/make -f
 
+# R interpreter and the command-line flags handed to it by the R-based rules.
+R := R
+ROPTS := -q --no-save --no-restore-data
 geneimprint_human.html:
        wget -O $@ "http://www.geneimprint.com/site/genes-by-species.Homo+sapiens"
 
@@ -12,3 +15,8 @@ geneimprint_human.txt: geneimprint_human.html parse_geneimprint.pl
 
 parent_of_origin.txt: parent_of_origin.html parse_parent_of_origin.pl
 	./parse_parent_of_origin.pl $< > $@
+
+# Run the R script (first prerequisite) on the remaining prerequisites, in
+# the order listed; the target name is passed last and used as the output path.
+combined_imprinted_genes.txt: combine_imprinted_genes.R geneimprint_human.txt parent_of_origin.txt
+	$(R) $(ROPTS) -f $< --args $(filter-out $<,$^) $@
diff --git a/combine_imprinted_genes.R b/combine_imprinted_genes.R
new file mode 100644
index 0000000..782da8e
--- /dev/null
+++ b/combine_imprinted_genes.R
@@ -0,0 +1,36 @@
+### Combine the gene names from the geneimprint table and the
+### parent-of-origin table into one de-duplicated list, one name per line.
+###
+### Arguments (after --args): GENEIMPRINT PARENT OUTPUT
+library(data.table)
+
+args <- commandArgs(trailingOnly=TRUE)
+### the last argument is used as the output path, so with fewer than
+### three arguments an input file would be overwritten; refuse to run
+if (length(args) < 3) {
+    stop("usage: combine_imprinted_genes.R GENEIMPRINT PARENT OUTPUT")
+}
+
+geneimprint <- fread(args[1])
+parent <- fread(args[2])
+### fix up the 0 prefixed chromosomes
+### (chr is not used in the output written below)
+parent[,chr:=gsub("^0","",chromosome)]
+### remove aliases in ()
+parent[,Gene:=gsub("\\s*\\([^\\)]+\\)\\s*","",gene)]
+### remove aliases after ,
+parent[,Gene:=gsub("\\s*,\\s*.+","",Gene)]
+
+### keep only rows whose cleaned name is all upper-case letters and digits
+parent <- parent[grepl("^[A-Z0-9]+$",Gene),]
+### setkey sorts each table by Gene, so the union is built from sorted columns
+setkey(parent,"Gene")
+setkey(geneimprint,"Gene")
+
+imprinted.genes <-
+    union(parent[,Gene],geneimprint[,Gene])
+
+write.table(file=args[length(args)],
+            imprinted.genes,
+            sep="\t",row.names=FALSE,col.names=FALSE,quote=FALSE)
+