#! /usr/bin/perl
# get_pdf tries to get pdfs, and is released
# under the terms of the GPL version 2, or any later version, at your
# option. See the file README and COPYING for more information.
# Copyright 2008 by Don Armstrong.
# $Id: perl_script 1352 2009-01-25 02:04:38Z don $


use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

=head1 NAME

get_pdf - try to get a pdf

=head1 SYNOPSIS

 get_pdf [options] reference [references]

 Options:
  --pmid, -p          the references are pmids (Default)
  --cgi-proxy, -C     cgi proxy prefix to prepend to the PubMed url
  --http-proxy, -H    http proxy to use
  --[no-]use-links    fall back to a browser on failure (Default on)
  --debug, -d         debugging level (Default 0)
  --help, -h          display this help
  --man, -m           display manual

=head1 OPTIONS

=over

=item B<--pmid, -p>

The reference is a pmid

=item B<--cgi-proxy, -C>

Use this cgi proxy style proxy. The value is prepended to the PubMed
url before C<http://> is added, so it must not include a scheme.

=item B<--http-proxy, -H>

Use this http proxy. The value is exported as C<http_proxy>,
C<HTTP_PROXY> and C<CGI_HTTP_PROXY>, and is handed (without its
leading C<http://>) to the fallback browser.

=item B<--use-links, --no-use-links>

When the automatic retrieval fails, open the PubMed page in chromium
(if C<DISPLAY> is set) or links2 (otherwise). If the browser exits
successfully and a file called F<temp.pdf> exists afterwards, it is
renamed to F<pmid.pdf>. Enabled by default.

=item B<--debug, -d>

Debug verbosity. (Default 0)

=item B<--help, -h>

Display brief usage information.

=item B<--man, -m>

Display this manual.

=back

=head1 EXAMPLES

 get_pdf 18000000

 get_pdf --no-use-links --http-proxy http://proxy.example.com:3128 18000000

=cut


our $DEBUG;

use Cwd;
use WWW::Mechanize;
use Data::Printer;
use Data::Dumper;

my %options = (debug     => 0,
               help      => 0,
               man       => 0,
               use_links => 1,
              );

# Maps each reference type to its Getopt::Long specification.
my %REFERENCE_TYPES = (pmid => 'pmid|p');

GetOptions(\%options,
           values %REFERENCE_TYPES,
           'use_links|use-links!',
           'cgi_proxy|cgi-proxy|C=s',
           'http_proxy|http-proxy|H=s',
           'debug|d+','help|h|?','man|m');

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

binmode(STDOUT,":encoding(UTF-8)");
binmode(STDERR,":encoding(UTF-8)");

# Reference types the user actually asked for; pmid is the default
# when none was given.
my @selected_types = grep { $options{$_} } keys %REFERENCE_TYPES;
if (not @selected_types) {
    $options{pmid} = 1;
}

my @USAGE_ERRORS;
if (@selected_types > 1) {
    push @USAGE_ERRORS,
        "You can only specify exactly one of the ".
        join(', ', map { "--$_" } sort keys %REFERENCE_TYPES).
        " options";
}

if (not @ARGV) {
    push @USAGE_ERRORS,"You must specify at least one reference";
}

pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;

if (exists $options{http_proxy}) {
    $ENV{http_proxy} = $options{http_proxy};
    $ENV{HTTP_PROXY} = $options{http_proxy};
    $ENV{CGI_HTTP_PROXY} = $options{http_proxy};
}

if ($options{pmid}) {
    my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
    for my $pmid (@ARGV) {
        # Keep only the digits of the reference; skip it if nothing is left.
        $pmid =~ s/\D//g;
        next unless length $pmid;
        my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
        if (exists $options{cgi_proxy}) {
            $url = $options{cgi_proxy}.$url;
        }
        $url = "http://${url}";

        my $fetched = eval { _fetch_pmid_pdf($m,$pmid,$url); 1 };
        next if $fetched;

        print STDERR "$@\n" if $DEBUG;
        next unless $options{use_links};
        _browse_for_pdf($pmid,$url);
    }
}

# Fetch the PubMed page at $url with $m, follow each publisher link
# found on it in turn, and write the first pdf found to "${pmid}.pdf".
# Dies with "No links" if the page has no candidate links, or with the
# error of the last attempt if every candidate fails.
sub _fetch_pmid_pdf {
    my ($m,$pmid,$url) = @_;

    $m->get($url) or die "Unable to get $url";
    my $orig_mech = $m->clone();

    my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
    # try to find the other links
    push @possible_links,
        grep { my $attr = $_->attrs();
               exists $attr->{title} and
               defined $attr->{title} and
               $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i
           } $m->links();
    print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
    die "No links" unless @possible_links;

    my $last_error;
    for my $link (@possible_links) {
        # Every attempt starts from a fresh copy of the PubMed page so
        # that a failed attempt cannot influence the next one.
        my $attempt = $orig_mech->clone();
        my $ok = eval {
            print "trying ".$link->url_abs()."\n" if $DEBUG;
            $attempt->get($link->url_abs()) or die "Unable to follow link";
            # try to find pdf link
            my $pdf_m = find_pdf_link($attempt) or die "Unable to find pdf";
            _save_pdf("${pmid}.pdf",$pdf_m->content);
            1;
        };
        return 1 if $ok;
        $last_error = $@ || "Unable to retrieve pdf";
    }
    die $last_error;
}

# Write $content to $file as raw bytes; dies if the file cannot be
# opened, written or closed.
sub _save_pdf {
    my ($file,$content) = @_;
    open(my $fh,'>:raw',$file) or
        die "Unable to open ${file} for writing: $!";
    print {$fh} $content or
        die "Unable to write to ${file}: $!";
    close($fh) or
        die "Unable to close ${file} filehandle: $!";
    return;
}

# Fallback used when automatic retrieval fails: open $url in chromium
# (when DISPLAY is set) or links2, and if the browser exits with status
# 0 and temp.pdf exists afterwards, rename it to "${pmid}.pdf".
sub _browse_for_pdf {
    my ($pmid,$url) = @_;

    # The browsers are given the proxy without its leading http://;
    # work on a copy so that the option value itself is left alone.
    my $proxy_host;
    if (exists $options{http_proxy}) {
        ($proxy_host = $options{http_proxy}) =~ s{http://}{};
    }

    my @command;
    if ($ENV{DISPLAY}) {
        @command = ('chromium',
                    defined $proxy_host ? ('--proxy-server',$proxy_host) : (),
                    '--temp-profile',
                    $url,
                   );
    }
    else {
        @command = ('links2',
                    # links2 doesn't like the leading http:// of proxies for some reason
                    defined $proxy_host ? ('-http-proxy',$proxy_host) : (),
                    $url,
                   );
    }
    system(@command) == 0 or return;
    if (-e 'temp.pdf') {
        rename('temp.pdf',"${pmid}.pdf") or
            warn "Unable to rename temp.pdf to ${pmid}.pdf: $!\n";
    }
    return;
}

# Return the Content-Type header of an HTTP response, or the empty
# string if the response has none.
sub _content_type {
    my ($response) = @_;
    my $type = $response->header('Content-Type');
    return defined $type ? $type : '';
}

# Look for a pdf in the frames and iframes of the page $m is currently
# on. Each frame is first fetched with $m itself; if its Content-Type
# mentions pdf, $m (now positioned on that frame) is returned.
# Otherwise each frame is fetched again and searched with
# find_pdf_link at recursion depth $call+1. Returns a mech object
# positioned on the pdf, or nothing if none was found. Note that $m is
# navigated away from the page it was on.
sub check_subframes {
    my ($m,$call) = @_;
    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
    if ($DEBUG) {
        print STDERR "subframes: \n";
        p @sub_frames;
    }
    for my $frame (@sub_frames) {
        my $r = $m->get($frame->url_abs());
        print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
        my $type = _content_type($r);
        if ($type =~ /pdf/) {
            return $m;
        }
        print STDERR "failed: ".$type."\n" if $DEBUG;
    }
    for my $frame (@sub_frames) {
        $m->get($frame->url_abs());
        my $pdf_m = find_pdf_link($m,
                                  0,
                                  $call+1,
                                 );
        if (defined $pdf_m) {
            return $pdf_m;
        }
    }
    return;
}

# Search the page $mech is currently on for a link leading to a pdf.
#
# Arguments:
#   $mech  - WWW::Mechanize object positioned on the page to search;
#            it is cloned, so the caller's object is not navigated.
#   $guess - defaults to 1; only consulted by the disabled best-guess
#            fallback at the end of this routine.
#   $call  - recursion depth, defaults to 0; the search gives up once
#            it exceeds 5.
#
# Candidate links are collected from most to least specific and tried
# in that order. Returns a mech object whose current response has a
# Content-Type mentioning pdf, or nothing if no pdf was found.
sub find_pdf_link {
    my ($mech,$guess,$call) = @_;
    $guess = 1 unless defined $guess;
    $call = 0 unless defined $call;
    # avoid looping endlessly
    return if $call > 5;
    my $m = $mech->clone();

    # Chooser page ("select a website below"): follow the url held in
    # the hidden urls['sd'] input.
    if ($m->content =~ /select\s*a\s*website\s*below/i) {
        print STDERR $m->uri()."\n" if $DEBUG;
        print STDERR $m->content() if $DEBUG > 1;
        my @inputs = $m->find_all_inputs(type => 'hidden',
                                         name => q(urls['sd']),
                                        );
        return unless @inputs;
        $m->get($inputs[0]->value);
        print STDERR $m->content() if $DEBUG > 1;
    }

    my @possible_links;

    # Dump the candidates gathered so far, numbering each stage.
    my $stage = 0;
    my $dump_stage = sub {
        return unless $DEBUG;
        $stage++;
        print STDERR "possible links[$stage]:\n";
        p @possible_links;
    };

    # this brings forward the actual link at Science
    push @possible_links,
        grep { my $temp = $_->attrs();
               exists $temp->{rel} and
               defined $temp->{rel} and
               $temp->{rel} =~ qr/view-/i and
               defined $_->text() and
               $_->text() =~ qr/Full\s*Text.*PDF/i
           } $m->find_all_links(text_regex => qr/PDF/i);
    push @possible_links,
        grep { my $temp = $_->attrs();
               exists $temp->{rel} and
               defined $temp->{rel} and
               $temp->{rel} =~ qr/alternate/i and
               exists $temp->{type} and
               defined $temp->{type} and
               $temp->{type} =~ qr/pdf/i
           } $m->find_all_links(url_regex => qr/pdf/);
    # this is to prioritize the real link at science direct
    push @possible_links,
        grep { my $temp = $_->attrs();
               print STDERR Dumper($temp) if $DEBUG;
               (exists $temp->{title} and
                defined $temp->{title} and
                $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
               (defined $_->text() and
                $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
           } $m->find_all_links(text_regex => qr/PDF/i);
    $dump_stage->();

    push @possible_links,
        grep { $_->url_abs() !~ /_orig(?:in)?=article/ }
        $m->find_all_links(text_regex => qr/PDF/i);
    $dump_stage->();

    push @possible_links,
        $m->find_all_links(tag_regex => qr/meta/,
                           url_regex => qr/(reprint|\.pdf)/i,
                          );
    $dump_stage->();

    # The masthead grep here is to handle PNAS, which has a link to
    # their masthead in every article.
    push @possible_links,
        grep { my $temp = $_->attrs();
               (not defined $temp->{title}) or
               $temp->{title} !~ qr/Masthead/i;
           } $m->find_all_links(text_regex => qr/pdf/i);
    $dump_stage->();

    push @possible_links,
        $m->find_all_links(text_regex => qr/manual\s*download/i);
    $dump_stage->();

    print STDERR $m->uri()."\n" if $DEBUG;
    print STDERR $m->content() if $DEBUG > 1;
    print STDERR map {"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
    if (not @possible_links and $DEBUG) {
        print STDERR $m->content();
    }

    for my $link (@possible_links) {
        print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
        my $r = $m->get($link->url_abs());
        my $content = $m->content();
        my $type = _content_type($r);
        if ($type =~ /pdf/) {
            print STDERR "success\n" if $DEBUG;
            return $m;
        }
        # The link did not lead straight to a pdf; it may be embedded
        # in a frame of the page we landed on.
        my $ret = check_subframes($m,$call);
        return $ret if defined $ret;
        print STDERR "failure; content type ".$type."\n" if $DEBUG;
        print STDERR $content if $DEBUG;
    }

    my $ret = check_subframes($m,$call);
    return $ret if defined $ret;

    # Disabled best-guess fallback. Re-enabling it requires saving the
    # first candidate (e.g. my $best_guess = $possible_links[0];)
    # before the loop above.
    # if ($guess and defined $best_guess) {
    #     $m->get($best_guess->url_abs());
    #     return $m;
    # }
    return;
}

__END__