--- /dev/null
+#! /usr/bin/perl
+# get_pdf tries to get pdfs, and is released
+# under the terms of the GPL version 2, or any later version, at your
+# option. See the file README and COPYING for more information.
+# Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
+# $Id: perl_script 1352 2009-01-25 02:04:38Z don $
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+get_pdf - try to get a pdf
+
+=head1 SYNOPSIS
+
+get_pdf [options] reference [references]
+
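+ Options:
+  --pmid, -p        the references are pmids (default)
+  --cgi-proxy, -C   cgi proxy to fetch pages through
+  --http-proxy, -H  http proxy to use
+  --debug, -d       debugging level (Default 0)
+  --help, -h        display this help
+  --man, -m         display manual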
+
+=head1 OPTIONS
+
+=over
+
+=item B<--pmid, -p>
+
+Each reference is a PubMed identifier (pmid). This is the default when
+no reference type is given.
+
+=item B<--cgi-proxy, -C>
+
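+Fetch pages through this cgi-style proxy. The value is prepended to
+the PubMed address before the leading C<http://> is added, so it must
+be given without a scheme.
+
+=item B<--http-proxy, -H>
+
+Use this http proxy. The value is exported in the C<http_proxy>,
+C<HTTP_PROXY> and C<CGI_HTTP_PROXY> environment variables.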
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
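+=head1 EXAMPLES
+
+ get_pdf 12345678
+
+Tries to download the pdf of the article with pmid 12345678 and saves
+it as F<12345678.pdf> in the current directory.
+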
+=cut
+
+
+use vars qw($DEBUG);
+
+use Cwd;
+use WWW::Mechanize;
+
+# Defaults for the options GetOptions may leave untouched.
+my %options = (debug => 0,
+               help  => 0,
+               man   => 0,
+              );
+
+# Maps every supported reference type to its Getopt::Long specification.
+my %REFERENCE_TYPES = (pmid => 'pmid|p');
+
+GetOptions(\%options,
+           values %REFERENCE_TYPES,
+           'cgi_proxy|cgi-proxy|C=s',
+           'http_proxy|http-proxy|H=s',
+           'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+# Reference types that were switched on from the command line.  The
+# list is computed once, before the default below is applied.
+my @selected_types = grep { $options{$_} } sort keys %REFERENCE_TYPES;
+
+# Without an explicit reference type the references are taken to be pmids.
+if (not @selected_types) {
+    $options{pmid} = 1;
+}
+
+my @USAGE_ERRORS;
+# The count has to be taken from the whole grep result; writing
+# "grep {...} keys %h > 1" compares the key count with 1 instead and
+# never reports an error.
+if (@selected_types > 1) {
+    # join() is needed here: a map in the middle of a string
+    # concatenation is evaluated in scalar context and yields a count.
+    push @USAGE_ERRORS,"You can only specify exactly one of the ".
+        join(', ',map { "--$_" } sort keys %REFERENCE_TYPES)." options";
+}
+
+if (not @ARGV) {
+    push @USAGE_ERRORS,"You must specify at least one reference";
+}
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+# Export the proxy under each of the environment variable names set by
+# the original script, so that anything consulting them sees it.
+if (exists $options{http_proxy}) {
+    $ENV{http_proxy} = $options{http_proxy};
+    $ENV{HTTP_PROXY} = $options{http_proxy};
+    $ENV{CGI_HTTP_PROXY} = $options{http_proxy};
+}
+
+# For every pmid given on the command line: load its PubMed abstract
+# page, follow the "to read" link found there, locate the pdf with
+# find_pdf_link() and save it as <pmid>.pdf in the current directory.
+# A failure for one pmid is reported only when debugging is enabled and
+# never stops the remaining pmids from being processed.
+if ($options{pmid}) {
+    my $m = WWW::Mechanize->new();
+    for my $pmid (@ARGV) {
+        # keep only the digits of the reference
+        $pmid =~ s/\D//g;
+        next unless length $pmid;
+        my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
+        # the cgi proxy is prepended before the scheme is added
+        if (exists $options{cgi_proxy}) {
+            $url = $options{cgi_proxy}.$url;
+        }
+        $url = "http://${url}";
+        eval {
+            # get() hands back a response object even for failed
+            # requests, so ask the browser whether the request worked
+            # instead of testing the return value.
+            $m->get($url);
+            $m->success or die "Unable to get $url";
+            $m->follow_link(text_regex => qr/to\s*read/i) or
+                die "Unable to follow link";
+            # try to find pdf link
+            my $pdf_m = find_pdf_link($m) or
+                die "Unable to find pdf";
+            # Built-in open: IO::File is not loaded by this script.
+            # :raw keeps the pdf bytes untouched on every platform.
+            open(my $fh,'>:raw',"${pmid}.pdf") or
+                die "Unable to open ${pmid}.pdf for writing: $!";
+            print {$fh} $pdf_m->content or
+                die "Unable to write to ${pmid}.pdf: $!";
+            close $fh or
+                die "Unable to close ${pmid}.pdf filehandle: $!";
+        };
+        if ($@) {
+            print STDERR "$@\n" if $DEBUG;
+            ## system('links',
+            ##        exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(),
+            ##        $url
+            ##       ) == 0 or next;
+            ## rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
+        }
+    }
+}
+
+
+# find_pdf_link($mech, $guess, $call)
+#
+# Starting from the page currently loaded in $mech, try to reach a pdf.
+#
+#   $mech  - WWW::Mechanize object positioned on the starting page; it
+#            is cloned, and all navigation happens on the clone
+#   $guess - when true (the default), fall back to the first candidate
+#            link if no response with a pdf Content-Type was found
+#   $call  - recursion depth (default 0); the search gives up once it
+#            exceeds 5
+#
+# Returns the cloned browser positioned on the pdf (or on the best
+# guess), or nothing if no pdf could be found.
+sub find_pdf_link {
+    my ($mech,$guess,$call) = @_;
+    $guess = 1 unless defined $guess;
+    $call = 0 unless defined $call;
+    # avoid looping endlessly
+    return if $call > 5;
+    my $m = $mech->clone();
+    # Page asking to pick a website: continue at the address stored in
+    # its hidden urls['sd'] input.
+    if ($m->content =~ /select\s*a\s*website\s*below/i) {
+        print STDERR $m->uri() if $DEBUG;
+        print STDERR $m->content() if $DEBUG;
+        my @inputs = $m->find_all_inputs(type => 'hidden',
+                                         name => q(urls['sd']),
+                                        );
+        return unless @inputs;
+        $m->get($inputs[0]->value);
+        print STDERR $m->content() if $DEBUG;
+    }
+    my @possible_links = $m->find_all_links(text_regex => qr/pdf/i);
+    push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
+    print STDERR map{$_->url,qq(\n)} @possible_links if $DEBUG;
+    if (not @possible_links and $DEBUG) {
+        print STDERR $m->content();
+    }
+    # Plain assignment (undef when there are no candidates): a "my"
+    # combined with a statement modifier has undefined behaviour, which
+    # matters in a recursive sub like this one.
+    my $best_guess = $possible_links[0];
+    for my $link (@possible_links) {
+        # Use the absolute address: after the first fetch the browser
+        # has left the page the links came from, so a relative url
+        # would be resolved against the wrong base.  A candidate that
+        # cannot be fetched must not abort the search.
+        my $r = eval { $m->get($link->url_abs()) };
+        next unless defined $r;
+        my $content_type = $r->header('Content-Type');
+        if (defined $content_type and $content_type =~ /pdf/) {
+            return $m;
+        }
+    }
+    # Look into the frames of whatever page the browser is on now.
+    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
+    for my $frame (@sub_frames) {
+        eval { $m->get($frame->url_abs()); 1 } or next;
+        # no guessing inside frames; only a real pdf counts there
+        my $pdf_m = find_pdf_link($m,
+                                  0,
+                                  $call+1,
+                                 );
+        if (defined $pdf_m) {
+            return $pdf_m;
+        }
+    }
+    if ($guess and defined $best_guess) {
+        $m->get($best_guess->url_abs());
+        return $m;
+    }
+    return;
+}
+
+
+__END__