#! /usr/bin/perl

=head1 NAME

stupid_missing_names - try to locate genes whose locations are missing

=head1 SYNOPSIS

Some of the genes don't actually have locations. This misnamed script
is designed to take the names of those missing locations and try to
figure out where the actual genes are located.

=head1 DESCRIPTION

Search terms are read one per line (from standard input by default) and
each term is looked up through the Ensembl text search.

=cut

use warnings;
use strict;

# Package globals so that $DEBUG can be preset from outside this file.
our ($DEBUG, $REVISION);

BEGIN {
    # Pull the revision number out of the (SVN-style) keyword string.
    ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
    $DEBUG = 0 unless defined $DEBUG;
}

use URI::ParamMunge;
use LWP::UserAgent;

# XXX parse config file

# When true, the location lookup branch further down the script is used.
my $LOCATION = 0;

my %options = (
    debug    => 0,
    help     => 0,
    man      => 0,
    format   => 'xml',
    database => 'gene',
    dir      => '.',
    name     => '${search}_results_genecard',    # single quotes: kept as a literal template
    terms    => '-',
);

# Source of search terms: '-' means standard input, anything else is
# treated as the name of a file to read.
my $terms;
if ($options{terms} eq '-') {
    $terms = \*STDIN;
}
else {
    open $terms, '<', $options{terms}
        or die "Cannot open terms file '$options{terms}': $!\n";
}

my $ua = LWP::UserAgent->new(agent => "DA_get_harvester_results/$REVISION");

# get_url($url)
#
# Issue an HTTP GET for $url through the shared user agent and return the
# response content.  A failed request is reported on STDERR; the content
# of the response is still returned so callers behave as before and simply
# fail to match their patterns.
#
# The ($) prototype is kept deliberately: it imposes scalar context on the
# argument expression at existing call sites.
sub get_url($) {
    my $url = shift;

    my $request = HTTP::Request->new('GET', $url);
    my $reply   = $ua->request($request);

    warn "GET $url failed: ", $reply->status_line, "\n"
        unless $reply->is_success;

    return $reply->content;
}

#For every term
while (<$terms>) {
    # Get uids to retrieve
    chomp;
    my $search = $_;

    my $response = get_url(
        uri_param_munge(
            'http://www.ensembl.org/Homo_sapiens/textview?type=All&x=0&y=0',
            { q => $search, },
        )
    );

    # The pattern text and its closing delimiter are on the following
    # lines of the file.
    my ($url) = $response =~ m&
1.\s+Ensembl\s+[^<]+\s+ [^\"]+
&xis; print "NO DATA:1\n" and next if not defined $url; $response = get_url("http://www.ensembl.org$url"); ($url) = $response =~ m{\s* }xis; print "NO SEQUENCE\n" and next if not defined $ref_seq; print $ref_seq, "\n"; } } __END__Gene \s*([^<]+)\s*\s*\(HUGO\s*ID\) }xis; print "NO DATA:2\n" and next if not defined $url; $response = get_url("http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/gdlw.pl?title=&col=gd_hgnc_id&col=gd_app_name&col=gd_status&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_refseq_ids&status=Approved&status=Approved+Non-Human&status=Entry+Withdrawn&status_opt=3&=on&where=gd_app_sym+like+%27%25${url}%25%27&order_by=gd_app_sym_sort&limit=&format=html&submit=submit&.cgifields=&.cgifields=status&.cgifields=chr"); if ($LOCATION) { my ($location) = $response =~ m{ Chromosome \s*\+\s* \s* ([^<]+?) # The chromosome location \s* ([^<]+)