get_pdf

   1 #! /usr/bin/perl
   2 # get_pdf tries to get pdfs, and is released
   3 # under the terms of the GPL version 2, or any later version, at your
   4 # option. See the file README and COPYING for more information.
   5 # Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
   6 # $Id: perl_script 1352 2009-01-25 02:04:38Z don $
   7
   8
   9 use warnings;
  10 use strict;
  11
  12 use Getopt::Long;
  13 use Pod::Usage;
  14
  15 =head1 NAME
  16
  17 get_pdf - try to get a pdf
  18
  19 =head1 SYNOPSIS
  20
  21 get_pdf [options] reference [references]
  22
  23  Options:
  24   --debug, -d debugging level (Default 0)
  25   --help, -h display this help
  26   --man, -m display manual
  27
  28 =head1 OPTIONS
  29
  30 =over
  31
  32 =item B<--pmid, -p>
  33
  34 The reference is a pmid
  35
  36 =item B<--cgi-proxy, -C>
  37
  38 Prepend this CGI-style proxy prefix (given without the leading http://) to each request URL
  39
  40 =item B<--debug, -d>
  41
  42 Debug verbosity. (Default 0)
  43
  44 =item B<--help, -h>
  45
  46 Display brief usage information.
  47
  48 =item B<--man, -m>
  49
  50 Display this manual.
  51
  52 =back
  53
  54 =head1 EXAMPLES
  55
  56
  57 =cut
  58
  59
  60 use vars qw($DEBUG);
  61
  62 use Cwd;
  63 use WWW::Mechanize;
  64 use Data::Printer;
  65
  # Option defaults; GetOptions() below overwrites or adds keys in place.
  my %options = (debug           => 0,
                 help            => 0,
                 man             => 0,
                 use_links       => 1,
                 );

  # Supported reference types: the key is the %options entry the switch
  # sets, the value is its Getopt::Long specification.
  my %REFERENCE_TYPES = (pmid => 'pmid|p');

  # GetOptions() returns false when the command line could not be parsed;
  # show the usage message instead of running with half-parsed options.
  GetOptions(\%options,
             values %REFERENCE_TYPES,
             'use_links|use-links!',
             'cgi_proxy|cgi-proxy|C=s',
             'http_proxy|http-proxy|H=s',
             'debug|d+','help|h|?','man|m') or pod2usage(2);

  pod2usage() if $options{help};
  pod2usage({verbose=>2}) if $options{man};

  $DEBUG = $options{debug};

  binmode(STDOUT,":encoding(UTF-8)");
  binmode(STDERR,":encoding(UTF-8)");

  # Reference types that were switched on from the command line.
  my @chosen_types = grep { $options{$_} } sort keys %REFERENCE_TYPES;

  # With no explicit type, the references are treated as pmids.
  $options{pmid} = 1 if not @chosen_types;

  my @USAGE_ERRORS;
  # @chosen_types is compared in numeric (count) context; the option names
  # are joined explicitly so the message lists them rather than their count.
  if (@chosen_types > 1) {
      push @USAGE_ERRORS,
          "You can only specify exactly one of the ".
          join(', ', map { "--$_" } sort keys %REFERENCE_TYPES).
          " options";
  }

  if (not @ARGV) {
      push @USAGE_ERRORS,"You must specify at least one reference";
  }

  pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
 106
 107
 # When --http-proxy was given, publish its value under each of the three
 # environment variable spellings used below.
 if (exists $options{http_proxy}) {
     for my $env_name (qw(http_proxy HTTP_PROXY CGI_HTTP_PROXY)) {
         $ENV{$env_name} = $options{http_proxy};
     }
 }
 113
 # Main driver.  Every command-line argument is reduced to its digits and
 # used as a PubMed ID:
 #   1. fetch the PubMed abstract page (through the cgi proxy, if given);
 #   2. collect candidate article links from that page;
 #   3. follow each candidate in turn until find_pdf_link() yields a PDF,
 #      which is written to <pmid>.pdf in the current directory;
 #   4. if that fails and use_links is set, open the abstract page in a
 #      browser (chromium when $DISPLAY is set, links2 otherwise) and pick
 #      up a temp.pdf left behind in the current directory.
 if ($options{pmid}) {
     my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
     for my $pmid (@ARGV) {
         # keep only the digits of the argument
         $pmid =~ s/\D//g;
         next unless length $pmid;
         my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
         # the cgi proxy prefix goes in front before the scheme is added
         if (exists $options{cgi_proxy}) {
             $url = $options{cgi_proxy}.$url;
         }
         $url = "http://${url}";
         my $fetched = eval {
             use Data::Dumper;
             $m->get($url) or die "Unable to get $url";
             # pristine copy of the abstract page; every attempt below
             # starts from a fresh clone of it
             my $orig_mech = $m->clone();
             my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
             # try to find the other links
             push @possible_links,
                 grep {my $attr = $_->attrs();
                       exists $attr->{title} and
                           $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
                     $m->links();
             print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
             die "No links" unless @possible_links;
             my $last_error;
             for my $link (@possible_links) {
                 # Clone rather than alias: following a link must not
                 # navigate $orig_mech away from the abstract page.
                 $m = $orig_mech->clone();
                 my $saved = eval {
                     print "trying ".$link->url_abs()."\n" if $DEBUG;
                     $m->get($link->url_abs()) or
                         die "Unable to follow link";
                     # try to find pdf link
                     my $pdf_m = find_pdf_link($m) or
                         die "Unable to find pdf";
                     # plain open() needs no extra module; :raw keeps the
                     # PDF bytes untouched on every platform
                     open(my $fh,'>:raw',"${pmid}.pdf") or
                         die "Unable to open ${pmid}.pdf for writing: $!";
                     print {$fh} $pdf_m->content or
                         die "Unable to write to ${pmid}.pdf: $!";
                     close $fh or
                         die "Unable to close ${pmid}.pdf filehandle: $!";
                     1;
                 };
                 if ($saved) {
                     $last_error = undef;
                     last;
                 }
                 $last_error = length "$@" ? "$@" : "Unknown error";
             }
             # only the error of the last attempt is reported
             die $last_error if defined $last_error;
             1;
         };
         if (not $fetched) {
             print STDERR "$@\n" if $DEBUG;
             if ($options{use_links}) {
                 # Both browsers are handed the proxy without its http://
                 # prefix (links2 doesn't like the leading http:// of
                 # proxies for some reason).  The prefix is stripped from a
                 # copy so $options{http_proxy} itself stays untouched.
                 my @proxy;
                 if (exists $options{http_proxy}) {
                     (my $proxy_host = $options{http_proxy}) =~ s{http://}{};
                     @proxy = ($proxy_host);
                 }
                 # NOTE(review): chromium normally takes --proxy-server=HOST
                 # as a single argument; the two-argument form is kept as
                 # it was -- confirm against the installed chromium.
                 my @browser = $ENV{DISPLAY}
                     ? ('chromium',(map {('--proxy-server',$_)} @proxy),'--temp-profile')
                     : ('links2',(map {('-http-proxy',$_)} @proxy));
                 system(@browser,$url) == 0 or next;
                 # presumably the user saves the article as temp.pdf from
                 # the browser; if that file exists it becomes <pmid>.pdf
                 rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
             }
         }
     }
 }
 181
 182
 # check_subframes($m, $call)
 #
 # Looks for a PDF behind the <frame>/<iframe> links of the page currently
 # loaded in $m.
 #
 #   $m    - mechanize object; it is navigated in place, so on return it
 #           no longer points at the page it was called with
 #   $call - current recursion depth, handed on (plus one) to
 #           find_pdf_link()
 #
 # First pass: fetch every frame and return $m as soon as one response has
 # a pdf Content-Type.  Second pass: fetch every frame again and let
 # find_pdf_link() search inside it.  Returns the mechanize object holding
 # the PDF, or undef when nothing was found.
 #
 # A frame whose request dies (e.g. an HTTP error when the mechanize
 # object has autocheck enabled) is skipped so the remaining frames are
 # still tried.
 sub check_subframes {
     my ($m,$call) = @_;
     my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
     print STDERR "subframes: \n" if $DEBUG;
     p @sub_frames if $DEBUG;
     for my $frame (@sub_frames) {
         print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
         my $r = eval { $m->get($frame->url_abs()) };
         if (not defined $r) {
             print STDERR "failed: $@\n" if $DEBUG;
             next;
         }
         # a response without a Content-Type header counts as "not a pdf"
         my $type = $r->header('Content-Type');
         $type = '' unless defined $type;
         if ($type =~ /pdf/) {
             return $m;
         }
         print STDERR "failed: ".$type."\n" if $DEBUG;
     }
     for my $frame (@sub_frames) {
         # the first pass moved $m around, so load the frame again
         my $r = eval { $m->get($frame->url_abs()) };
         next unless defined $r;
         my $pdf_m = find_pdf_link($m,
                                   0,
                                   $call+1,
                                  );
         if (defined $pdf_m) {
             return $pdf_m;
         }
     }
     return undef;
 }
 208
 209
 # find_pdf_link($mech [, $guess [, $call]])
 #
 # Searches the page loaded in $mech for a link that leads to a PDF.
 #
 #   $mech  - mechanize object positioned on an article page; it is cloned
 #            first, so the caller's object is not navigated
 #   $guess - accepted for compatibility; currently unused
 #   $call  - recursion depth (default 0); the search gives up beyond 5
 #
 # Candidate links are collected in priority order (most specific matches
 # first; the same link may appear more than once) and fetched one by one.
 # The first response with a pdf Content-Type wins; otherwise the frames
 # of the fetched page are searched with check_subframes().
 #
 # Returns the mechanize object whose current content is the PDF, or
 # undef/empty when none was found.
 sub find_pdf_link {
     use Data::Dumper;
     my ($mech,$guess,$call) = @_;
     $call = 0 unless defined $call;
     # avoid looping endlessly
     return undef if $call > 5;
     my $m = $mech->clone();
     # interstitial "select a website below" page: continue at the URL held
     # in its hidden urls['sd'] input
     if ($m->content =~ /select\s*a\s*website\s*below/i) {
         print STDERR $m->uri() if $DEBUG;
         print STDERR $m->content() if $DEBUG > 1;
         my @inputs = $m->find_all_inputs(type => 'hidden',
                                          name => q(urls['sd']),
                                         );
         return unless @inputs;
         $m->get($inputs[0]->value);
         print STDERR $m->content() if $DEBUG > 1;
     }
     # every link whose text mentions PDF (case-insensitive); scanned once
     # and filtered several ways below
     my @pdf_text_links = $m->find_all_links(text_regex => qr/PDF/i);
     my @possible_links;
     my $stage = 0;
     # debug helper: dump the candidate list collected so far
     my $show_links = sub {
         return unless $DEBUG;
         $stage++;
         print STDERR "possible links[$stage]:\n";
         p @possible_links;
         return;
     };
     # this brings forward the actual link at Science
     push @possible_links,
         grep {my $temp = $_->attrs();
               exists $temp->{rel} and $temp->{rel} =~ qr/view-/i and
                   defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i
               }
         @pdf_text_links;
     push @possible_links,
         grep {my $temp = $_->attrs();
               exists $temp->{rel} and $temp->{rel} =~ qr/alternate/i and
                   exists $temp->{type} and $temp->{type} =~ qr/pdf/i
               }
         $m->find_all_links(url_regex => qr/pdf/);
     # this is to prioritize the real link at science direct
     push @possible_links,
         grep {my $temp = $_->attrs();
               # attribute dump is debugging output only
               print STDERR Dumper($temp) if $DEBUG;
               (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
                   (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
               }
         @pdf_text_links;
     $show_links->();
     push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} @pdf_text_links;
     $show_links->();
     push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
                                              url_regex  => qr/(reprint|\.pdf)/i,
                                             );
     $show_links->();
     # The masthead grep here is to handle PNAS, which has a link to their masthead in every article.
     push @possible_links,
         grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;}
         @pdf_text_links;
     $show_links->();
     push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
     $show_links->();
     print STDERR $m->uri() if $DEBUG;
     print STDERR $m->content() if $DEBUG > 1;
     print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
     if (not @possible_links and $DEBUG) {
         print STDERR $m->content();
     }
     for my $link (@possible_links) {
         print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
         # a candidate whose request dies (e.g. an HTTP error when the
         # mechanize object has autocheck enabled) must not stop the
         # remaining candidates from being tried
         my $r = eval { $m->get($link->url_abs()) };
         if (not defined $r) {
             print STDERR "failure; request died: $@\n" if $DEBUG;
             next;
         }
         my $content = $m->content();
         # a response without a Content-Type header counts as "not a pdf"
         my $type = $r->header('Content-Type');
         $type = '' unless defined $type;
         if ($type =~ /pdf/) {
             print STDERR "success\n" if $DEBUG;
             return $m;
         }
         my $ret = check_subframes($m,$call);
         return $ret if defined $ret;
         print STDERR "failure; content type ".$type."\n" if $DEBUG;
         print STDERR $content if $DEBUG;
     }
     # last resort: frames of whatever page $m is on now
     my $ret = check_subframes($m,$call);
     return $ret if defined $ret;
     return undef;
 }
 313
 314
 315 __END__