X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=get_pdf;h=74f62dc6998e1f6ec494b4411eac95f9495320d8;hb=3e61fdc995e43c2eafa722caca93609922f9c7f3;hp=745ea9aedec558b4ddea088c70784c2013f5dd15;hpb=80ccd0fbe2b3f3d6067fb18ce38ed39f49ed4570;p=bin.git diff --git a/get_pdf b/get_pdf index 745ea9a..74f62dc 100755 --- a/get_pdf +++ b/get_pdf @@ -120,11 +120,13 @@ if ($options{pmid}) { eval { $m->get($url) or die "Unable to get $url"; my $orig_mech = $m->clone(); + use Data::Dumper; my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i); # try to find the other links push @possible_links, grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i} $m->links(); + print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG; die "No links" unless @possible_links; do { $m = $orig_mech; @@ -179,7 +181,10 @@ sub find_pdf_link { } my @possible_links; # this is to prioritize the real link at science direct - push @possible_links, grep { $_->url_abs() !~ /_orig=article/} $m->find_all_links(text_regex => qr/PDF/i); + push @possible_links, grep {my $temp = $_->attrs(); + exists $temp->{title} and $temp->{title} =~ qr/Download\s*PDF/i} + $m->find_all_links(text_regex => qr/PDF/i); + push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i); push @possible_links, $m->find_all_links(tag_regex => qr/meta/, url_regex => qr/(reprint|\.pdf)/i, ); @@ -187,16 +192,19 @@ sub find_pdf_link { push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i); print STDERR $m->uri() if $DEBUG; print STDERR $m->content() if $DEBUG; - print STDERR map{$_->url_abs(),qq(\n)} @possible_links if $DEBUG; + print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG; if (not @possible_links and $DEBUG) { print STDERR $m->content(); } my $best_guess = $possible_links[0] if @possible_links; for my $link (@possible_links) { + print STDERR "trying ".$link->url_abs()."..." if $DEBUG; my $r = $m->get($link->url_abs()); if ($r->header('Content-Type') =~ /pdf/) { + print STDERR "success\n" if $DEBUG; return $m; } + print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG; } my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/); for my $frame (@sub_frames) {