]> git.donarmstrong.com Git - function2gene.git/blob - bin/stupid_missing_names.pl
update combine results
[function2gene.git] / bin / stupid_missing_names.pl
1 #! /usr/bin/perl
2
3
4 =head1 NAME
5
6 stupid_missing_names - try to find the actual location of genes whose locations are missing
7
8 =head1 SYNOPSIS
9
10 Some of the genes don't actually have locations. This misnamed script
11 is designed to take the names of those missing locations and try to
12 figure out where the actual genes are located.
13
14 =head1 DESCRIPTION
15
16
17
18 =cut
19
20
21 use warnings;
22 use strict;
23
24
# Package globals, declared with "our" (the "vars" pragma is obsolete).
#   $DEBUG    - debugging level; defaults to 0 unless already defined.
#   $REVISION - revision number extracted from the Subversion keyword below.
our ($DEBUG, $REVISION);

BEGIN{
     # The q$...$ string is a Subversion keyword; leave it byte-for-byte
     # intact so that keyword expansion keeps working.
     ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
     $DEBUG = 0 unless defined $DEBUG;
}
31
32 use URI::ParamMunge;
33 use LWP::UserAgent;
34
# XXX parse config file

# Output selector for the main loop: when true the chromosome location of
# each gene is printed, otherwise its RefSeq identifier is printed.
my $LOCATION = 0;

# Default settings. Nothing overrides them yet (see the XXX above); in the
# code visible here only 'terms' is consulted.
my %options = (debug    => 0,
               help     => 0,
               man      => 0,
               format   => 'xml',
               database => 'gene',
               dir      => '.',
               name     => '${search}_results_genecard',
               terms    => '-',
              );

# Source of the search terms, one per line: '-' means STDIN, anything else
# is treated as the name of a file to read. Previously a non-'-' value left
# $terms undefined, so the read loop silently processed nothing.
my $terms;
if ($options{terms} eq '-') {
     $terms = \*STDIN;
}
else {
     open($terms, '<', $options{terms})
          or die "Unable to open terms file '$options{terms}': $!\n";
}

# Shared user agent for every request made by get_url().
# NOTE(review): the agent string names "DA_get_harvester_results", which
# looks inherited from another script -- confirm whether that is intended.
my $ua = LWP::UserAgent->new(agent=>"DA_get_harvester_results/$REVISION");
55
# get_url($url)
#
# Issue an HTTP GET for $url through the shared user agent $ua.
#
# Returns the raw response body on success. If the request fails (transport
# error or non-2xx status) a warning is written to STDERR and the empty
# string is returned, so that the callers' pattern matches simply fail
# instead of scraping the text of an error page.
#
# The ($) prototype is kept on purpose: it imposes scalar context on the
# argument expression at the existing call sites.
sub get_url($){
     my $url = shift;

     my $request  = HTTP::Request->new('GET', $url);
     my $response = $ua->request($request);

     if (not $response->is_success) {
          warn "GET $url failed: ", $response->status_line, "\n";
          return '';
     }

     return $response->content;
}
64
# Percent-encode every character outside the RFC 3986 "unreserved" set so
# that a scraped value can be embedded safely in a query string.
sub _percent_encode {
     my ($text) = @_;
     $text =~ s/([^A-Za-z0-9_.~-])/sprintf('%%%02X', ord($1))/ge;
     return $text;
}

# For every search term (one per input line) exactly one line is printed:
# the chromosome location (when $LOCATION is true) or the RefSeq id of the
# gene, or a "NO ..." marker naming the step at which the lookup failed.
TERM:
while (my $search = <$terms>) {
     chomp $search;

     # Step 1: Ensembl text search; follow the first Ensembl hit.
     my $response = get_url(uri_param_munge('http://www.ensembl.org/Homo_sapiens/textview?type=All&x=0&y=0',
                                            {q => $search,
                                            },
                                           )
                           );

     my ($gene_path) = $response =~ m&<blockquote><b>1.\s+Ensembl\s+[^<]+\s+</B>
                                <A\s+HREF="(/Homo[^"]+)">[^\"]+</A><BR>&xis;

     if (not defined $gene_path) {
          print "NO DATA:1\n";
          next TERM;
     }

     # Step 2: pull the HUGO gene symbol out of the Ensembl gene page.
     $response = get_url("http://www.ensembl.org$gene_path");

     # The "?" after get_data.pl is escaped so that it matches a literal
     # question mark rather than making the preceding "l" optional.
     my ($symbol) = $response =~ m{<tr\s+align="left"\s+valign="middle">\s*
                             <th\s+width="20%">Gene</th>\s*
                             <td\s+width="80%"><b><a\s+href="http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/get_data.pl\?[^"]+">
                             ([^<]+)</a>\s*</b>\s*<small>\(HUGO\s*ID\)</small>&nbsp;}xis;

     if (not defined $symbol) {
          print "NO DATA:2\n";
          next TERM;
     }

     # The capture may carry surrounding whitespace from the HTML; strip it
     # and percent-encode the rest before placing it in the query string.
     $symbol =~ s/\A\s+|\s+\z//g;
     my $symbol_param = _percent_encode($symbol);

     # Step 3: query the HGNC table for approved symbols containing the
     # symbol. Example of the resulting URL for HLA-A:
     #http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/gdlw.pl?title=&col=gd_hgnc_id&col=gd_app_name&col=gd_status&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_refseq_ids&status=Approved&status=Approved+Non-Human&status=Entry+Withdrawn&status_opt=3&=on&where=gd_app_sym+like+%27%25HLA-A%25%27&order_by=gd_app_sym_sort&limit=&format=html&submit=submit&.cgifields=&.cgifields=status&.cgifields=chr
     $response = get_url("http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/gdlw.pl?title=&col=gd_hgnc_id&col=gd_app_name&col=gd_status&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_refseq_ids&status=Approved&status=Approved+Non-Human&status=Entry+Withdrawn&status_opt=3&=on&where=gd_app_sym+like+%27%25${symbol_param}%25%27&order_by=gd_app_sym_sort&limit=&format=html&submit=submit&.cgifields=&.cgifields=status&.cgifields=chr");

     if ($LOCATION) {
          my ($location) = $response =~ m{<th\s+valign="TOP"\s+align="LEFT"\s+bgcolor="\#E6E6FF">
                                          Chromosome<a\s+href="[^"]+">
                                          \s*\+\s*</a></th><td\s+valign="TOP"\s+align="LEFT">\s*
                                          ([^<]+?) # The chromosome location
                                          \s*</td><th}xis;

          if (not defined $location) {
               print "NO LOCATION\n";
               next TERM;
          }
          print $location,"\n";
     }
     else{
          my ($ref_seq) = $response =~ m{<td><a\s+href="http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi\?val=[^"]+">([^<]+)</a></td></tr>}xis;

          if (not defined $ref_seq) {
               print "NO SEQUENCE\n";
               next TERM;
          }
          print $ref_seq, "\n";
     }
}
111
112
113
114
115
116
117 __END__