get_epub_pmid

   1 #! /usr/bin/perl
   2 # get_epub_pmid tries to get epubs from pubmed, and is released
   3 # under the terms of the GPL version 2, or any later version, at your
   4 # option. See the file README and COPYING for more information.
   5 # Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
   6 # $Id: perl_script 1352 2009-01-25 02:04:38Z don $
   7
   8
   9 use warnings;
  10 use strict;
  11
  12 use Getopt::Long;
  13 use Pod::Usage;
  14
  15 =head1 NAME
  16
  17 get_epub_pmid - try to get an epub
  18
  19 =head1 SYNOPSIS
  20
  21 get_epub_pmid [options] reference [references]
  22
  23  Options:
  24   --debug, -d debugging level (Default 0)
  25   --help, -h display this help
  26   --man, -m display manual
  27
  28 =head1 OPTIONS
  29
  30 =over
  31
  32 =item B<--pmid, -p>
  33
  34 The reference is a pmid
  35
  36 =item B<--cgi-proxy, -C>
  37
  38 Use this cgi proxy style proxy
  39
  40 =item B<--debug, -d>
  41
  42 Debug verbosity. (Default 0)
  43
  44 =item B<--help, -h>
  45
  46 Display brief usage information.
  47
  48 =item B<--man, -m>
  49
  50 Display this manual.
  51
  52 =back
  53
  54 =head1 EXAMPLES
  55
  56
  57 =cut
  58
  59
  60 use vars qw($DEBUG);
  61
  62 use Cwd;
  63 use WWW::Mechanize;
  64
  65 my %options = (debug           => 0,
  66                help            => 0,
  67                man             => 0,
  68                use_links       => 0,
  69               );
  70
  71 my %REFERENCE_TYPES = (pmid => 'pmid|p');
  72
  73 GetOptions(\%options,
  74            values %REFERENCE_TYPES,
  75            'use_links|use-links!',
  76            'cgi_proxy|cgi-proxy|C=s',
  77            'http_proxy|http-proxy|H=s',
  78            'debug|d+','help|h|?','man|m');
  79
  80 pod2usage() if $options{help};
  81 pod2usage({verbose=>2}) if $options{man};
  82
  83 $DEBUG = $options{debug};
  84
  85
  86
  87 if (not grep {exists $options{$_} and
  88                   defined $options{$_} and
  89                   $options{$_}} keys %REFERENCE_TYPES) {
  90     $options{pmid} = 1;
  91 }
  92 my @USAGE_ERRORS;
  93 if (grep {exists $options{$_}
  94               and defined $options{$_}
  95                   and $options{$_}} keys %REFERENCE_TYPES > 1) {
  96     push @USAGE_ERRORS,"You can only specify exactly one of the ".(map { "--$_"} keys %REFERENCE_TYPES)." options";
  97 }
  98
  99 if (not @ARGV) {
 100     push @USAGE_ERRORS,"You must specify at least one reference";
 101 }
 102
 103 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
 104
 105
 106 if (exists $options{http_proxy}) {
 107     $ENV{http_proxy} = $options{http_proxy};
 108     $ENV{HTTP_PROXY} = $options{http_proxy};
 109     $ENV{CGI_HTTP_PROXY} = $options{http_proxy};
 110 }
 111
 112 if ($options{pmid}) {
 113     my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
 114     for my $pmid (@ARGV) {
 115         $pmid =~ s/\D//g;
 116         next unless length $pmid;
 117         my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
 118         if (exists $options{cgi_proxy}) {
 119             $url = $options{cgi_proxy}.$url;
 120         }
 121         $url = "http://${url}";
 122         eval {
 123             $m->get($url) or die "Unable to get $url";
 124             my $orig_mech = $m->clone();
 125             use Data::Dumper;
 126             my @possible_links = $m->find_all_links(text_regex => qr/to\s*read|free\s*PMC|full\s*text/i);
 127             # try to find the other links
 128             push @possible_links,
 129             grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
 130                     $m->links();
 131             print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
 132             die "No links" unless @possible_links;
 133             do {
 134             $m = $orig_mech;
 135             eval {
 136                 print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
 137                 $m->get($possible_links[0]->url()) or
 138                     die "Unable to follow link";
 139                 # try to find epub link
 140                 my $epub_m = find_epub_link($m) or
 141                     die "Unable to find epub";
 142                 my $fh = IO::File->new($pmid.'.epub','w') or
 143                     die "Unable to open ${pmid}.epub for writing: $!";
 144                 print {$fh} $epub_m->content or
 145                     die "Unable to write to ${pmid}.epub: $!";
 146                 close $fh or
 147                     die "Unable to close ${pmid}.epub filehandle: $!";
 148                 };
 149                 shift @possible_links;
 150     } while ($@ and @possible_links);
 151             if ($@) {
 152             die "$@";
 153             }
 154         };
 155         if ($@) {
 156             print STDERR "$@\n" if $DEBUG;
 157             if ($options{use_links}) {
 158                 system('links2',
 159                        # links2 doesn't like the leading http:// of proxies for some reason
 160                        exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(),
 161                        $url
 162                       ) == 0 or next;
 163                 rename('temp.epub',"${pmid}.epub") if -e 'temp.epub';
 164             }
 165         }
 166     }
 167 }
 168
 169
 170 sub find_epub_link {
 171     my ($mech,$guess,$call) = @_;
 172     $guess = 1 unless defined $guess;
 173     $call = 0 unless defined $call;
 174     # avoid looping endlessly
 175     return undef if $call > 5;
 176     my $m = $mech->clone();
 177     if ($m->content =~ /select\s*a\s*website\s*below/i) {
 178         print STDERR $m->uri() if $DEBUG;
 179         print STDERR $m->content() if $DEBUG;
 180         my @inputs = $m->find_all_inputs(type => 'hidden',
 181                                          name => q(urls['sd']),
 182                                         );
 183         return unless @inputs;
 184         $m->get($inputs[0]->value);
 185         print STDERR $m->content() if $DEBUG;
 186     }
 187     my @possible_links;
 188     # this is to prioritize the real link at science direct
 189     push @possible_links, grep {my $temp = $_->attrs();
 190                                 exists $temp->{title} and $temp->{title} =~ qr/Download\s*epub/i}
 191         $m->find_all_links(text_regex => qr/epub/i);
 192     push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/epub/i);
 193     push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
 194                                              url_regex  => qr/(reprint|\.epub)/i,
 195                                             );
 196     push @possible_links, $m->find_all_links(text_regex => qr/epub/i);
 197     push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
 198     print STDERR $m->uri() if $DEBUG;
 199     print STDERR $m->content() if $DEBUG;
 200     print STDERR map{"possible epub link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
 201     if (not @possible_links and $DEBUG) {
 202         print STDERR $m->content();
 203     }
 204     my $best_guess = $possible_links[0] if @possible_links;
 205     for my $link (@possible_links) {
 206         print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
 207         my $r = $m->get($link->url_abs());
 208         if ($r->header('Content-Type') =~ /epub|zip/) {
 209             print STDERR "success\n" if $DEBUG;
 210             return $m;
 211         }
 212         print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
 213         print STDERR $m->content() if $DEBUG;
 214     }
 215     my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
 216     for my $frame (@sub_frames) {
 217         my $r = $m->get($frame->url_abs());
 218         if ($r->header('Content-Type') =~ /epub|zip/) {
 219             return $m;
 220         }
 221         my $epub_m = find_epub_link($m,
 222                                   0,
 223                                   $call+1,
 224                                  );
 225         if (defined $epub_m) {
 226             return $epub_m;
 227         }
 228     }
 229 #     if ($guess and defined $best_guess) {
 230 #       $m->get($best_guess->url_abs());
 231 #       return $m;
 232 #     }
 233     return undef;
 234 }
 235
 236
 237 __END__