]> git.donarmstrong.com Git - function2gene.git/blob - bin/get_location_from_uniprot
move get_ncbi_results
[function2gene.git] / bin / get_location_from_uniprot
1 #! /usr/bin/perl
2
3 # get_location_from_uniprot retrieves gene locations and protein names from UniProt,
4 # and is released under the terms of the GPL version 2, or any later
5 # version, at your option. See the file README and COPYING for more
6 # information.
7
8 # Copyright 2004 by Don Armstrong <don@donarmstrong.com>.
9
10 # $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $
11
12
13 use warnings;
14 use strict;
15
16
17 use Getopt::Long;
18 use Pod::Usage;
19
20 =head1 NAME
21
22   get_location_from_uniprot [options]
23
24 =head1 SYNOPSIS
25
26
27  Options:
28   --terms, -t file of search terms [default -]
29   --debug, -d debugging level [default 0]
30   --help, -h display this help
31   --man, -m display manual
32
33 =head1 OPTIONS
34
35 =over
36
37 =item B<--debug, -d>
38
39 Debug verbosity. (Default 0)
40
41 =item B<--help, -h>
42
43 Display brief usage information.
44
45 =item B<--man, -m>
46
47 Display this manual.
48
49 =back
50
51 =head1 EXAMPLES
52
53   get_location_from_uniprot -t terms.txt > output.txt
54
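55 Reads one UniProt protein ID per line from terms.txt and writes a CSV of name, chromosome location and full protein name to output.txt.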
56
57 =cut
58
59 # http://www.ebi.uniprot.org/uniprot-srv/extendedView.do?proteinId=1A01_HUMAN
60
# Package globals shared by the rest of the script:
#   $DEBUG    - debugging verbosity; 0 unless something set it earlier.
#   $REVISION - revision number pulled out of the keyword string below.
# Declared with `our` rather than the obsolete `use vars` pragma; at file
# scope in a single-package script the two are equivalent.
our ($DEBUG, $REVISION);

BEGIN {
     # q$...$ is a single-quoted string using '$' as its delimiter, so the
     # keyword text is matched literally and the revision is captured.
     ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+(\S+)/;
     $DEBUG = 0 unless defined $DEBUG;
}
67
68 use IO::File;
69 use URI::ParamMunge;
70 use LWP::UserAgent;
71
72 # XXX parse config file
73
# Default settings; command-line options parsed below override them.
# This script only reads debug, help, man, terms, uniprot_site and
# uniprot_search_url. The format, database, dir and name entries are set
# here (and --dir is accepted) but are not consulted anywhere else in the
# script.
my %options = (debug    => 0,
               help     => 0,
               man      => 0,
               format   => 'xml',
               database => 'gene',
               dir      => '.',
               name     => '${search}_results_harvester',
               terms    => '-',
               uniprot_site => 'http://www.ebi.uniprot.org',
               uniprot_search_url  => '/uniprot-srv/extendedView.do?proteinId=1A01_HUMAN',
              );

# GetOptions returns false when it meets an unknown or malformed option.
# Stop with the usage message (exit status 2) instead of carrying on with
# a partially parsed command line and the defaults above.
GetOptions(\%options,'terms|t=s','dir|D=s','debug|d+','help|h|?','man|m')
     or pod2usage(2);

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

# Publish the requested verbosity through the package global.
$DEBUG = $options{debug};
92
# Column positions of the three fields in each output record.
use constant NAME     => 0;   # the search term as given
use constant LOCATION => 1;   # text captured after "Location:" on the page
use constant FULLNAME => 2;   # text captured from the "Protein name" row
97
# Open the list of search terms: '-' (the default) means STDIN, anything
# else is taken as the path of a file holding one term per line.
my $terms;
if ($options{terms} eq '-') {
     $terms = \*STDIN;
}
else {
     $terms = IO::File->new($options{terms}, 'r')
          or die "Unable to open file $options{terms}: $!";
}

my $ua = LWP::UserAgent->new(agent => "DA_get_location_from_uniprot/$REVISION");

# Patterns used to scrape the result page. They do not depend on the term,
# so they are compiled once here rather than on every iteration.
# Captures the text following "Location:" inside the chromosome-locus table.
my $location_re = qr{<!--Chromosome\s+locus-->\s*<tr>\s*
                     <td\s+class="import_title"\s+valign="top">&nbsp;</td>\s*
                     <td\s+class="value"\s+colspan="5">\s*
                     <table\s+width="100%"><tr\s+class="value"><td>Gene\s+name:[^\&]+&nbsp;&nbsp;Location:([^\<]+)</td></tr>\s*
                     </table></td></tr>\s*
                     <!--\s*end\s+chromosome\s+locus\s+-->}xis;
# Captures the bold text in the "Protein name" row.
my $fullname_re = qr{>Protein\s+name</a>\s*
                     </td>\s*<td\s+class="value"\s+colspan="5">\s*
                     <b>([^\<]+)</b>\s*
                     </td>\s*</tr>\s*<!--end\s+title-->}xis;

# Render one value as a double-quoted CSV field. An undefined value (the
# pattern did not match) becomes "NO DATA"; embedded double quotes are
# doubled so the field stays well-formed.
sub _csv_field {
     my ($value) = @_;
     return qq("NO DATA") unless defined $value;
     $value =~ s/"/""/g;
     return qq("$value");
}

# For every term: fetch its page, extract the fields, print one CSV row.
print STDOUT qq("NAME","LOCATION","FULL NAME"\n);
TERM: while (my $search = <$terms>) {
     # Drop the line ending (including a CR from DOS-style files) and any
     # surrounding blanks; an empty line is not a term, so skip it.
     $search =~ s/^\s+//;
     $search =~ s/\s+$//;
     next TERM unless length $search;

     # The default search URL carries a sample proteinId; presumably
     # uri_param_munge substitutes the real term for it -- confirm against
     # URI::ParamMunge.
     my $url = uri_param_munge($options{uniprot_site}.$options{uniprot_search_url},
                               {proteinId => $search},
                              );
     my $response = $ua->request(HTTP::Request->new('GET', $url));

     # A failed request still yields a row (with NO DATA fields), but say
     # so on STDERR so it is not mistaken for a page without the fields.
     warn "Request for $search failed: ".$response->status_line."\n"
          unless $response->is_success;
     my $page = $response->content;

     my @gene;
     $gene[NAME] = $search;
     # A match in list context returns the capture, or the empty list (so
     # the element stays undefined) when the pattern does not match.
     ($gene[LOCATION]) = $page =~ $location_re;
     ($gene[FULLNAME]) = $page =~ $fullname_re;

     print STDOUT join(',', map { _csv_field($_) } @gene[NAME, LOCATION, FULLNAME]), qq(\n);

     # Pause between requests.
     sleep 2;
}
137
138
139
140
141
142
143 __END__