]> git.donarmstrong.com Git - function2gene.git/blobdiff - bin/stupid_missing_names.pl
add bin files for search routines
[function2gene.git] / bin / stupid_missing_names.pl
diff --git a/bin/stupid_missing_names.pl b/bin/stupid_missing_names.pl
new file mode 100644 (file)
index 0000000..b248f9f
--- /dev/null
@@ -0,0 +1,117 @@
+#! /usr/bin/perl
+
+
+=head1 NAME
+
+stupid_missing_names - find locations or sequence ids for genes with missing locations
+
+=head1 SYNOPSIS
+
+Some of the genes don't actually have locations. This misnamed script
+is designed to take the names of those missing locations and try to
+figure out where the actual genes are located.
+
+=head1 DESCRIPTION
+
+
+
+=cut
+
+
+use warnings;
+use strict;
+
+
+# Package globals. $REVISION is the revision number pulled out of the
+# LastChangedRevision keyword string below; it is used in the HTTP
+# user-agent string. $DEBUG defaults to 0 unless it was already defined.
+# Declared with "our" instead of the obsolete "use vars" pragma.
+our ($DEBUG, $REVISION);
+
+BEGIN{
+     ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
+     $DEBUG = 0 unless defined $DEBUG;
+}
+
+use URI::ParamMunge;
+use LWP::UserAgent;
+
+# XXX parse config file
+
+# Output mode switch read by the main loop: when true the chromosome
+# location is printed for each term, otherwise the linked sequence id is.
+my $LOCATION = 0;
+
+# Hard-coded defaults. In this file only the 'terms' entry is read.
+my %options = (debug    => 0,
+              help     => 0,
+              man      => 0,
+              format   => 'xml',
+              database => 'gene',
+              dir      => '.',
+              name     => '${search}_results_genecard',
+              terms    => '-',
+             );
+
+# Input handle for the search terms: '-' means STDIN, anything else is
+# treated as the name of a file to read (previously the handle was simply
+# left undefined in that case).
+my $terms;
+if ($options{terms} eq '-') {
+     $terms = \*STDIN;
+}
+else {
+     open($terms, '<', $options{terms})
+          or die "Unable to open terms file '$options{terms}': $!";
+}
+
+# Shared user agent for every request. Uses a direct class-method call
+# rather than the indirect-object "new Class(...)" form.
+my $ua = LWP::UserAgent->new(agent=>"DA_get_harvester_results/$REVISION");
+
+# get_url($url)
+#
+# Fetch $url with a GET request through the shared $ua and return the
+# response body as a string. A failed request is reported on STDERR and
+# yields the empty string, so the callers' pattern matches find nothing
+# instead of being run against an error page.
+#
+# The ($) prototype is retained deliberately: it makes callers evaluate
+# the argument in scalar context, and dropping it could change what a
+# nested call such as uri_param_munge(...) returns at the call site.
+sub get_url($){
+     my ($url) = @_;
+
+     my $response = $ua->request(HTTP::Request->new('GET', $url));
+     if (not $response->is_success) {
+         warn "GET $url failed: ", $response->status_line, "\n";
+         return '';
+     }
+
+     my $body = $response->content;
+     return $body;
+}
+
+# For every search term (one per input line): look the term up in the
+# Ensembl text search, follow the first hit to its gene page, pull the
+# HUGO symbol from that page, then query the nomenclature table for that
+# symbol. One line is printed per term: the result or a "NO ..." marker.
+while (defined(my $search = <$terms>)) {
+     chomp $search;
+
+     my $response = get_url(uri_param_munge('http://www.ensembl.org/Homo_sapiens/textview?type=All&x=0&y=0',
+                                           {q => $search,
+                                           },
+                                          )
+                          );
+
+     # First numbered Ensembl hit on the results page; capture its link.
+     my ($gene_page) = $response =~ m{<blockquote><b>1\.\s+Ensembl\s+[^<]+\s+</B>
+                                      <A\s+HREF="(/Homo[^"]+)">[^\"]+</A><BR>}xis;
+
+     # Explicit blocks: "print ... and next" would skip the next whenever
+     # print itself failed and carry on with an undefined value.
+     if (not defined $gene_page) {
+         print "NO DATA:1\n";
+         next;
+     }
+
+     $response = get_url("http://www.ensembl.org$gene_page");
+
+     # The "Gene" row of the gene page links to the nomenclature site; the
+     # link text (followed by "(HUGO ID)") is captured as the gene symbol.
+     my ($symbol) = $response =~ m{<tr\s+align="left"\s+valign="middle">\s*
+                                   <th\s+width="20%">Gene</th>\s*
+                                   <td\s+width="80%"><b><a\s+href="http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/get_data\.pl\?[^"]+">
+                                   ([^<]+)</a>\s*</b>\s*<small>\(HUGO\s*ID\)</small>&nbsp;}xis;
+
+     if (not defined $symbol) {
+         print "NO DATA:2\n";
+         next;
+     }
+
+     # Percent-encode the symbol before splicing it into the query string
+     # so characters outside the URL-unreserved set cannot corrupt the URL.
+     (my $escaped_symbol = $symbol) =~ s/([^A-Za-z0-9\-._~])/sprintf('%%%02X', ord($1))/ge;
+
+     # Example of the resulting request for the symbol HLA-A:
+     #http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/gdlw.pl?title=&col=gd_hgnc_id&col=gd_app_name&col=gd_status&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_refseq_ids&status=Approved&status=Approved+Non-Human&status=Entry+Withdrawn&status_opt=3&=on&where=gd_app_sym+like+%27%25HLA-A%25%27&order_by=gd_app_sym_sort&limit=&format=html&submit=submit&.cgifields=&.cgifields=status&.cgifields=chr
+     $response = get_url("http://www.gene.ucl.ac.uk/cgi-bin/nomenclature/gdlw.pl?title=&col=gd_hgnc_id&col=gd_app_name&col=gd_status&col=gd_aliases&col=gd_pub_chrom_map&col=gd_pub_refseq_ids&status=Approved&status=Approved+Non-Human&status=Entry+Withdrawn&status_opt=3&=on&where=gd_app_sym+like+%27%25${escaped_symbol}%25%27&order_by=gd_app_sym_sort&limit=&format=html&submit=submit&.cgifields=&.cgifields=status&.cgifields=chr");
+
+     if ($LOCATION) {
+         # Location mode: print the contents of the "Chromosome" cell.
+         my ($location) = $response =~ m{<th\s+valign="TOP"\s+align="LEFT"\s+bgcolor="\#E6E6FF">
+                                         Chromosome<a\s+href="[^"]+">
+                                         \s*\+\s*</a></th><td\s+valign="TOP"\s+align="LEFT">\s*
+                                         ([^<]+?) # The chromosome location
+                                         \s*</td><th}xis;
+
+         if (not defined $location) {
+             print "NO LOCATION\n";
+             next;
+         }
+         print $location,"\n";
+     }
+     else{
+         # Sequence mode: print the text of the NCBI viewer link that
+         # closes a table row.
+         my ($ref_seq) = $response =~ m{<td><a\s+href="http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi\?val=[^"]+">([^<]+)</a></td></tr>}xis;
+
+         if (not defined $ref_seq) {
+             print "NO SEQUENCE\n";
+             next;
+         }
+         print $ref_seq, "\n";
+     }
+}
+
+
+
+
+
+
+__END__