10 Some of the genes don't actually have locations. This misnamed script
11 is designed to take the names of the genes that are missing locations and
12 try to figure out where those genes are actually located.
# Declare the package globals $DEBUG and $REVISION.
25 use vars qw($DEBUG $REVISION);
# Capture the revision token that follows "LastChangedRevision:" in the
# keyword string (presumably expanded by the version-control system --
# confirm).
28 ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
# Default $DEBUG to 0 unless something has already defined it.
29 $DEBUG = 0 unless defined $DEBUG;
35 # XXX parse config file
# Default option values; only part of this hash is visible in this excerpt.
# The `name` default is single-quoted, so `${search}` stays literal here
# (presumably a placeholder substituted later -- confirm).
39 my %options = (debug => 0,
45 name => '${search}_results_genecard',
# A terms value of '-' is special-cased; the branch body is not visible here
# (conventionally '-' means "read from standard input" -- confirm).
50 if ($options{terms} eq '-') {
# Fetch $url with an HTTP GET and keep only the response body.
# (These statements appear to be the body of the URL-fetching helper; its
# `sub` line is not visible in this excerpt.)
# The constructor is called with the arrow form: indirect-object syntax
# (`new Class(...)`) is ambiguous to the parser and discouraged by perlobj.
54 my $ua = LWP::UserAgent->new(agent=>"DA_get_harvester_results/$REVISION");
59 my $request = HTTP::Request->new('GET', $url);
60 my $response = $ua->request($request);
# From here on $response holds the body text rather than the response object.
# NOTE(review): the status is never checked, so a failed request passes on
# whatever body the failed response carries instead of raising an error.
61 $response = $response->content;
67 # Get uids to retrieve
# Step 1: run an Ensembl text search. The remaining arguments to
# uri_param_munge() (including the search term) are not visible in this
# excerpt. From the returned HTML, capture the relative link ("/Homo...") of
# the result numbered 1 and labelled "Ensembl".
# NOTE(review): the `.` in `1.` is unescaped, so it matches any character
# after the 1, not only a literal dot.
71 my $response = get_url(uri_param_munge('http://www.ensembl.org/Homo_sapiens/textview?type=All&x=0&y=0',
77 my ($url) = $response =~ m&<blockquote><b>1.\s+Ensembl\s+[^<]+\s+</B>
78 <A\s+HREF="(/Homo[^"]+)">[^\"]+</A><BR>&xis;
# No matching link: report it and skip to the next iteration of the enclosing
# loop (the loop header is not visible in this excerpt).
80 print "NO DATA:1\n" and next if not defined $url;
# Step 2: fetch the page that link points to on www.ensembl.org.
82 $response = get_url("http://www.ensembl.org$url");
# Capture the link text of the "Gene" row that is tagged "(HUGO ID)", i.e.
# the gene symbol as linked to the nomenclature database.
# NOTE: $url is reused to hold that symbol (not a URL); a later statement
# interpolates it into the nomenclature query.
# The `.` and `?` of the literal href are escaped so they match themselves;
# unescaped, `pl?` meant "p, optional l" and each `.` matched any character.
84 ($url) = $response =~ m{<tr\s+align="left"\s+valign="middle">\s*
85 <th\s+width="20%">Gene</th>\s*
86 <td\s+width="80%"><b><a\s+href="http://www\.gene\.ucl\.ac\.uk/cgi-bin/nomenclature/get_data\.pl\?[^"]+">
87 ([^<]+)</a>\s*</b>\s*<small>\(HUGO\s*ID\)</small> }xis;
# No HUGO symbol found: report it and skip to the next loop iteration.
89 print "NO DATA:2\n" and next if not defined $url;
# Step 3: query the gene.ucl.ac.uk nomenclature search for symbols containing
# the captured symbol (the query's where clause decodes to
# `gd_app_sym like '%SYMBOL%'`).
# NOTE(review): ${url} is interpolated into the query string without
# URI-escaping.
91 $response = get_url("http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/gdlw.pl?title=&col=gd_hgnc_id&col=gd_app_name&col=gd_status&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_refseq_ids&status=Approved&status=Approved+Non-Human&status=Entry+Withdrawn&status_opt=3&=on&where=gd_app_sym+like+%27%25${url}%25%27&order_by=gd_app_sym_sort&limit=&format=html&submit=submit&.cgifields=&.cgifields=status&.cgifields=chr");
# Capture the text of the table cell that follows the "Chromosome" header,
# then print it -- or print "NO LOCATION" and skip to the next loop iteration
# when nothing matched. (The end of this pattern is not visible in this
# excerpt.)
94 my ($location) = $response =~ m{<th\s+valign="TOP"\s+align="LEFT"\s+bgcolor="\#E6E6FF">
95 Chromosome<a\s+href="[^"]+">
96 \s*\+\s*</a></th><td\s+valign="TOP"\s+align="LEFT">\s*
97 ([^<]+?) # The chromosome location
100 print "NO LOCATION\n" and next if not defined $location;
101 print $location,"\n";
# Example of the nomenclature query above, for the symbol HLA-A:
103 #http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/gdlw.pl?title=&col=gd_hgnc_id&col=gd_app_name&col=gd_status&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_refseq_ids&status=Approved&status=Approved+Non-Human&status=Entry+Withdrawn&status_opt=3&=on&where=gd_app_sym+like+%27%25HLA-A%25%27&order_by=gd_app_sym_sort&limit=&format=html&submit=submit&.cgifields=&.cgifields=status&.cgifields=chr
# From the same page, capture the link text of a row-ending table cell that
# links to the NCBI Entrez viewer.
105 my ($ref_seq) = $response =~ m{<td><a\s+href="http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi\?val=[^"]+">([^<]+)</a></td></tr>}xis;
# Print that text, or print "NO SEQUENCE" and skip to the next loop iteration
# when nothing matched.
107 print "NO SEQUENCE\n" and next if not defined $ref_seq;
108 print $ref_seq, "\n";