3 # parse_uniprot_results retreives files of search results from ncbi,
4 # and is released under the terms of the GPL version 2, or any later
5 # version, at your option. See the file README and COPYING for more
8 # Copyright 2004 by Don Armstrong <don@donarmstrong.com>.
10 # $Id: ss,v 1.1 2004/06/29 05:26:35 don Exp $
22 parse_uniprot_results [options]
28 --dir, -D directory to stick results into [default .]
29 --name, -n file naming scheme [default ${search}_results.$format]
30 --terms, -t file of search terms [default -]
31 --debug, -d debugging level [default 0]
32 --help, -h display this help
33 --man, -m display manual
41 Debug verbosity. (Default 0)
45 Display brief useage information.
55 parse_uniprot_results -D ./uniprot_results/ -n '${search}_name.html' < search_parameters
57 Will pretty much do what you want
63 use vars qw($DEBUG $REVISION);
66 ($REVISION) = q$LastChangedRevision: 1$ =~ /LastChangedRevision:\s+([^\s]+)/;
67 $DEBUG = 0 unless defined $DEBUG;
73 use HTML::TreeBuilder;
74 use HTML::ElementTable;
76 my %options = (debug => 0,
84 GetOptions(\%options,'keyword|k=s','dir|D=s','debug|d+','help|h|?','man|m',
89 pod2usage() if $options{help};
90 pod2usage({verbose=>2}) if $options{man};
92 $DEBUG = $options{debug};
95 use constant {NAME => 0,
106 if ($options{keywords}) {
108 pod2usage("If the --keywords option is used, exactly one argument (the keyword) must be passed");
110 $options{dir} = "$ARGV[0]_results_uniprot";
113 if (not -d $options{dir}) {
114 die "$options{dir} does not exist or is not a directory";
117 my $dir = new IO::Dir $options{dir} or die "Unable to open dir $options{dir}: $!";
119 print join(",", map {qq("$_");} qw(Name RefSeq Location Alias Function Description Keyword DBName Filename)),qq(\n);
121 my ($keyword) = $options{keyword} || $options{dir} =~ m#(?:^|/)([^\/]+)_results_uniprot#;
123 FILE: while ($_ = $dir->read) {
125 next if $file_name =~ /^\./;
126 next unless -f "$options{dir}/$file_name" and -r "$options{dir}/$file_name";
128 my $file = new IO::File "$options{dir}/$file_name", 'r' or die "Unable to open file $file_name";
131 my $current_field = undef;
132 LINE: while (<$file>) {
133 my ($type,$data) = $_ =~ /^(\w{2})\ {3}(.+)/s;
135 next LINE if not defined $data;
136 if (not defined $current_field and $type ne 'ID') {
139 next FILE if $type eq 'DE' and $data =~ /\(Fragment\)/;
140 next FILE if $type eq 'ID' and $data =~ /Unreviewed;/;
141 if (not defined $current_field or $current_field->{type} ne $type) {
142 if (defined $current_field) {
143 push @fields,$current_field;
145 $current_field = {type => $type,
149 $current_field->{data} .= $data;
151 if (defined $current_field) {
152 push @fields,$current_field;
157 for my $field (@fields) {
158 my $type = $field->{type};
161 ($results[NAME]) = $field->{data} =~ m{Name=(.+?);}xis;
163 elsif ($type eq 'DR') {
164 # Find REF SEQ number
165 ($results[REFSEQ]) = $field->{data} =~ m{RefSeq;\s*([^;]+)}xis;
166 if (not defined $results[REFSEQ]) {
167 ($results[REFSEQ]) = $field->{data} =~ m{Ensembl;\s*([^;]+)}xis;
170 elsif ($type eq 'DE') {
171 # Find gene aliases; these are odd bits separated by ()
173 my $alias = $field->{data};
175 my @aliases = split /(?:\)\s+\(|\s+\(|\)(?:\s+|\.\s*$))/, $alias;
176 $results[ALIAS] = join('; ',map {s/(\s)\s+/$1/g; $_;} @aliases);
178 elsif ($type eq 'CC') {
179 my ($function) = $field->{data} =~ m{-!-\s*FUNCTION:\s*(.+?)(?:-!-|$)}xs;
180 if (defined $function) {
181 $function =~ s/\n//g;
182 $function =~ s/(\s)\s+/$1/g;
183 $results[FUNCTION] = $function;
185 my $description = $field->{data};
186 $description =~ s/\n/ /g;
187 $description =~ s/-!-//g;
188 $description =~ s/(\s)\s+/$1/g;
189 $description =~ s/-----{5,}.+$//;
190 $results[DESCRIPTION] = $description;
194 $results[NAME] ||= 'NO NAME';
195 $results[REFSEQ] ||= 'NO REFSEQ';
197 $results[LOCATION] ||= 'NO LOCATION';
198 $results[ALIAS] ||= 'NO ALIASES';
199 $results[FUNCTION] ||= 'NO FUNCTION';
201 # Figure out the keyword used
202 $results[KEYWORD] ||= $keyword || 'NO KEYWORD';
205 $results[DBNAME] = 'uniprot';
206 $results[FILENAME] = $file_name;
208 print join(',',map {qq("$_")} @results),qq(\n);