X-Git-Url: https://git.donarmstrong.com/?p=bin.git;a=blobdiff_plain;f=get_pdf;h=0c1ad2fa9a5ad90825be15bae55124648911c11d;hp=74f62dc6998e1f6ec494b4411eac95f9495320d8;hb=HEAD;hpb=3e61fdc995e43c2eafa722caca93609922f9c7f3 diff --git a/get_pdf b/get_pdf index 74f62dc..0c1ad2f 100755 --- a/get_pdf +++ b/get_pdf @@ -61,16 +61,19 @@ use vars qw($DEBUG); use Cwd; use WWW::Mechanize; +use Data::Printer; my %options = (debug => 0, help => 0, man => 0, + use_links => 1, ); my %REFERENCE_TYPES = (pmid => 'pmid|p'); GetOptions(\%options, values %REFERENCE_TYPES, + 'use_links|use-links!', 'cgi_proxy|cgi-proxy|C=s', 'http_proxy|http-proxy|H=s', 'debug|d+','help|h|?','man|m'); @@ -80,7 +83,8 @@ pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; - +binmode(STDOUT,":encoding(UTF-8)"); +binmode(STDERR,":encoding(UTF-8)"); if (not grep {exists $options{$_} and defined $options{$_} and @@ -108,7 +112,7 @@ if (exists $options{http_proxy}) { } if ($options{pmid}) { - my $m = WWW::Mechanize->new(); + my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {}); for my $pmid (@ARGV) { $pmid =~ s/\D//g; next unless length $pmid; @@ -124,7 +128,9 @@ if ($options{pmid}) { my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i); # try to find the other links push @possible_links, - grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i} + grep {my $attr = $_->attrs(); + exists $attr->{title} and + $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i} $m->links(); print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG; die "No links" unless @possible_links; @@ -152,16 +158,55 @@ if ($options{pmid}) { }; if ($@) { print STDERR "$@\n" if $DEBUG; - system('links2', - exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(), - $url - ) == 0 or next; - rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf'; - } + if ($options{use_links}) { + if ($ENV{DISPLAY}) { + system('chromium', + # links2 doesn't like the leading http:// of proxies for some reason + exists $options{http_proxy}?('--proxy-server',(map {s{http://}{}; $_} $options{http_proxy})):(), + '--temp-profile', + $url, + ) == 0 or next; + rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf'; + } else { + system('links2', + # links2 doesn't like the leading http:// of proxies for some reason + exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(), + $url + ) == 0 or next; + rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf'; + } + }} } } +sub check_subframes { + my ($m,$call) = @_; + my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/); + print STDERR "subframes: \n" if $DEBUG; + p @sub_frames if $DEBUG; + for my $frame (@sub_frames) { + my $r = $m->get($frame->url_abs()); + print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG; + if ($r->header('Content-Type') =~ /pdf/) { + return $m; + } + print STDERR "failed: ".$r->header('Content-Type')."\n" if $DEBUG; + } + for my $frame (@sub_frames) { + my $r = $m->get($frame->url_abs()); + my $pdf_m = find_pdf_link($m, + 0, + $call+1, + ); + if (defined $pdf_m) { + return $pdf_m; + } + } + return undef; +} + + sub find_pdf_link { my ($mech,$guess,$call) = @_; $guess = 1 unless defined $guess; @@ -170,56 +215,95 @@ sub find_pdf_link { return undef if $call > 5; my $m = $mech->clone(); if ($m->content =~ /select\s*a\s*website\s*below/i) { - print STDERR $m->uri() if $DEBUG; - print STDERR $m->content() if $DEBUG; - my @inputs = $m->find_all_inputs(type => 'hidden', - name => q(urls['sd']), - ); - return unless @inputs; - $m->get($inputs[0]->value); - print STDERR $m->content() if $DEBUG; + print STDERR $m->uri() if $DEBUG; + print STDERR $m->content() if $DEBUG > 1; + my @inputs = $m->find_all_inputs(type => 'hidden', + name => q(urls['sd']), + ); + return unless @inputs; + $m->get($inputs[0]->value); + print STDERR $m->content() if $DEBUG > 1; } my @possible_links; + # this brings forward the actual link at Science + push @possible_links, + grep {my $temp = $_->attrs(); + exists $temp->{rel} and $temp->{rel} =~ qr/view-/i and + defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i + } + $m->find_all_links(text_regex => qr/PDF/i); + push @possible_links, + grep {my $temp = $_->attrs(); + exists $temp->{rel} and $temp->{rel} =~ qr/alternate/i and + exists $temp->{type} and $temp->{type} =~ qr/pdf/i + } + $m->find_all_links(url_regex => qr/pdf/); # this is to prioritize the real link at science direct - push @possible_links, grep {my $temp = $_->attrs(); - exists $temp->{title} and $temp->{title} =~ qr/Download\s*PDF/i} - $m->find_all_links(text_regex => qr/PDF/i); + push @possible_links, + grep {my $temp = $_->attrs(); + use Data::Dumper; + print STDERR Dumper($temp); + (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or + (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i) + } + $m->find_all_links(text_regex => qr/PDF/i); + my $possible_links = 0; + if ($DEBUG) { + $possible_links++; + print STDERR "possible links[$possible_links]:\n"; + p @possible_links; + } push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i); + if ($DEBUG) { + $possible_links++; + print STDERR "possible links[$possible_links]:\n"; + p @possible_links; + } push @possible_links, $m->find_all_links(tag_regex => qr/meta/, url_regex => qr/(reprint|\.pdf)/i, ); - push @possible_links, $m->find_all_links(text_regex => qr/pdf/i); + if ($DEBUG) { + $possible_links++; + print STDERR "possible links[$possible_links]:\n"; + p @possible_links; + } + # The masthead grep here is to handle PNAS, which has a link to their masthead in every article. + push @possible_links, + grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;} + $m->find_all_links(text_regex => qr/pdf/i); + if ($DEBUG) { + $possible_links++; + print STDERR "possible links[$possible_links]:\n"; + p @possible_links; + } push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i); + if ($DEBUG) { + $possible_links++; + print STDERR "possible links[$possible_links]:\n"; + p @possible_links; + } print STDERR $m->uri() if $DEBUG; - print STDERR $m->content() if $DEBUG; + print STDERR $m->content() if $DEBUG > 1; print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG; if (not @possible_links and $DEBUG) { - print STDERR $m->content(); + print STDERR $m->content(); } my $best_guess = $possible_links[0] if @possible_links; for my $link (@possible_links) { - print STDERR "trying ".$link->url_abs()."..." if $DEBUG; - my $r = $m->get($link->url_abs()); - if ($r->header('Content-Type') =~ /pdf/) { - print STDERR "success\n" if $DEBUG; - return $m; - } - print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG; - } - my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/); - for my $frame (@sub_frames) { - my $r = $m->get($frame->url_abs()); - if ($r->header('Content-Type') =~ /pdf/) { - return $m; - } - my $pdf_m = find_pdf_link($m, - 0, - $call+1, - ); - if (defined $pdf_m) { - return $pdf_m; - } + print STDERR "trying ".$link->url_abs()."..." if $DEBUG; + my $r = $m->get($link->url_abs()); + my $content = $m->content(); + if ($r->header('Content-Type') =~ /pdf/) { + print STDERR "success\n" if $DEBUG; + return $m; + } + my $ret = check_subframes($m,$call); + return $ret if defined $ret; + print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG; + print STDERR $content if $DEBUG; } + my $ret = check_subframes($m,$call); + return $ret if defined $ret; # if ($guess and defined $best_guess) { # $m->get($best_guess->url_abs()); # return $m;