From 8ebb268909eaa0b8dc5bc7b31e7c416dd8f40317 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Wed, 10 May 2017 11:37:49 -0700 Subject: [PATCH] use chromium and UTF-8 encoding --- get_pdf | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/get_pdf b/get_pdf index 1b43148..0c1ad2f 100755 --- a/get_pdf +++ b/get_pdf @@ -83,7 +83,8 @@ pod2usage({verbose=>2}) if $options{man}; $DEBUG = $options{debug}; - +binmode(STDOUT,":encoding(UTF-8)"); +binmode(STDERR,":encoding(UTF-8)"); if (not grep {exists $options{$_} and defined $options{$_} and @@ -127,7 +128,9 @@ if ($options{pmid}) { my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i); # try to find the other links push @possible_links, - grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i} + grep {my $attr = $_->attrs(); + exists $attr->{title} and + $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i} $m->links(); print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG; die "No links" unless @possible_links; @@ -155,7 +158,16 @@ if ($options{pmid}) { }; if ($@) { print STDERR "$@\n" if $DEBUG; - if ($options{use_links}) { + if ($options{use_links}) { + if ($ENV{DISPLAY}) { + system('chromium', + # links2 doesn't like the leading http:// of proxies for some reason + exists $options{http_proxy}?('--proxy-server',(map {s{http://}{}; $_} $options{http_proxy})):(), + '--temp-profile', + $url, + ) == 0 or next; + rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf'; + } else { system('links2', # links2 doesn't like the leading http:// of proxies for some reason exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(), @@ -163,7 +175,7 @@ if ($options{pmid}) { ) == 0 or next; rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf'; } - } + }} } } @@ -220,6 +232,12 @@ sub find_pdf_link { defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i } $m->find_all_links(text_regex => qr/PDF/i); + push @possible_links, + grep {my $temp = $_->attrs(); + exists $temp->{rel} and $temp->{rel} =~ qr/alternate/i and + exists $temp->{type} and $temp->{type} =~ qr/pdf/i + } + $m->find_all_links(url_regex => qr/pdf/); # this is to prioritize the real link at science direct push @possible_links, grep {my $temp = $_->attrs(); -- 2.39.2