#! /usr/bin/perl

# Recovered from a flattened "new file" diff of bin/stupid_missing_names.pl
# (function2gene.git, blob b248f9f).  The diff framing and the leading "+"
# markers have been removed and the layout restored.
#
# NOTE(review): the extraction that flattened the file also appears to have
# stripped the HTML tags out of the screen-scraping patterns below.  Every
# pattern is kept exactly as far as it survived and is marked with a
# NOTE(review); restore the original markup from the repository before
# relying on the output.

=head1 NAME

stupid_missing_names - try to locate genes whose names have no location

=head1 SYNOPSIS

 stupid_missing_names.pl < gene_names.txt

=head1 DESCRIPTION

Some of the genes don't actually have locations. This misnamed script
is designed to take the names of those missing locations and try to
figure out where the actual genes are located.

Names are read one per line from standard input. For each name one
line is written to standard output: either the value that was found or
a C<NO DATA:1>, C<NO DATA:2> or C<NO SEQUENCE> marker naming the lookup
step that failed.

=cut


use warnings;
use strict;


our ($DEBUG, $REVISION);

BEGIN{
     ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
     $DEBUG = 0 unless defined $DEBUG;
}

use URI::ParamMunge;
use LWP::UserAgent;

# XXX parse config file

# When true, report the chromosome location instead of the second value
# scraped from the nomenclature page.
my $LOCATION = 0;

my %options = (debug    => 0,
               help     => 0,
               man      => 0,
               format   => 'xml',
               database => 'gene',
               dir      => '.',
               name     => '${search}_results_genecard',
               terms    => '-',
              );

# Source of search terms: '-' means standard input, anything else is taken
# as the name of a file to read.
my $terms;
if ($options{terms} eq '-') {
     $terms = \*STDIN;
}
else {
     open $terms, '<', $options{terms}
          or die "Unable to open terms file '$options{terms}': $!\n";
}

my $ua = LWP::UserAgent->new(agent => "DA_get_harvester_results/$REVISION");

# get_url($url)
#
# Fetch $url with a GET request and return the response body.  When the
# request does not succeed a warning is written to STDERR and the empty
# string is returned, so the caller's patterns simply fail to match instead
# of being run against an error page.
#
# The ($) prototype is kept on purpose: it evaluates the argument in scalar
# context, which is how uri_param_munge() has always been called here.
sub get_url($){
     my $url = shift;

     my $response = $ua->get($url);
     if (not $response->is_success) {
          warn "GET $url failed: " . $response->status_line . "\n";
          return '';
     }
     return $response->content;
}

# For every term
TERM:
while (my $search = <$terms>) {
     chomp $search;

     # Step 1: search Ensembl for the term.
     my $response = get_url(uri_param_munge('http://www.ensembl.org/Homo_sapiens/textview?type=All&x=0&y=0',
                                            {q => $search,
                                            },
                                           )
                           );

     # NOTE(review): markup was lost from this pattern and no capture group
     # survived, so as written $url is 1 on a match rather than a path.
     my ($url) = $response =~ m&
                                1.\s+Ensembl\s+[^<]+\s+
                                [^\"]+
                               &xis;

     if (not defined $url) {
          print "NO DATA:1\n";
          next TERM;
     }

     # Step 2: follow the hit and pull out the text labelled "(HUGO ID)".
     $response = get_url("http://www.ensembl.org$url");

     # NOTE(review): markup was lost from this pattern as well.
     my ($symbol) = $response =~ m{\s*
                                   Gene\s*

                                   ([^<]+)\s*\s*\(HUGO\s*ID\) }xis;

     if (not defined $symbol) {
          print "NO DATA:2\n";
          next TERM;
     }

     # The symbol is scraped text that goes into a query string, so
     # percent-encode everything outside the unreserved set; a stray '&',
     # '#' or '%' would otherwise change the query.
     (my $escaped_symbol = $symbol) =~ s/([^A-Za-z0-9_.~-])/sprintf '%%%02X', ord $1/ge;

     # Step 3: look the symbol up in the HGNC nomenclature database.
     $response = get_url('http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/gdlw.pl?title=&col=gd_hgnc_id&col=gd_app_name&col=gd_status&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_refseq_ids&status=Approved&status=Approved+Non-Human&status=Entry+Withdrawn&status_opt=3&=on&where=gd_app_sym+like+%27%25'
                         . $escaped_symbol
                         . '%25%27&order_by=gd_app_sym_sort&limit=&format=html&submit=submit&.cgifields=&.cgifields=status&.cgifields=chr');

     # NOTE(review): the end of the $LOCATION branch and the start of the
     # alternative branch were lost in extraction (only the location pattern's
     # head and the other pattern's "([^<]+)" tail survived).  The two
     # branches, the "NO LOCATION" message and the closing "<" of the location
     # pattern are reconstructed from the surviving variable names -- confirm
     # against the repository.
     if ($LOCATION) {
          my ($location) = $response =~ m{
                                          Chromosome
                                          \s*\+\s*\s*
                                          ([^<]+?) # The chromosome location
                                          \s*<
                                         }xis;

          if (not defined $location) {
               print "NO LOCATION\n";
               next TERM;
          }
          print $location, "\n";
     }
     else {
          # NOTE(review): the markup that anchored this capture is missing, so
          # as written it matches the first run of text that contains no "<".
          my ($ref_seq) = $response =~ m{([^<]+)}xis;

          if (not defined $ref_seq) {
               print "NO SEQUENCE\n";
               next TERM;
          }
          print $ref_seq, "\n";
     }
}


__END__