$url = "http://${url}";
eval {
$m->get($url) or die "Unable to get $url";
- $m->follow_link(text_regex => qr/to\s*read/i) or
- die "Unable to follow link";
- # try to find pdf link
- my $pdf_m = find_pdf_link($m) or
- die "Unable to find pdf";
- my $fh = IO::File->new($pmid.'.pdf','w') or
- die "Unable to open ${pmid}.pdf for writing: $!";
- print {$fh} $pdf_m->content or
- die "Unable to write to ${pmid}.pdf: $!";
- close $fh or
- die "Unable to close ${pmid}.pdf filehandle: $!";
+ # Keep an untouched snapshot of the landing page so that every candidate
+ # link below is tried starting from that page.
+ my $orig_mech = $m->clone();
+ # Candidate links: first those whose text matches "to read" ...
+ my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
+ # ... then any link whose title attribute mentions "Full Text".
+ push @possible_links,
+ grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i}
+ $m->links();
+ die "No links" unless @possible_links;
+ # Try the candidates in order until one yields a PDF or the list runs out.
+ do {
+ # Work on a fresh copy of the snapshot for each attempt. Assigning
+ # $orig_mech itself would make $m and the snapshot the same object, so
+ # the first get() would overwrite the snapshot and later attempts would
+ # start from whatever page the previous failed attempt ended on.
+ $m = $orig_mech->clone();
+ eval {
+ print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
+ $m->get($possible_links[0]->url()) or
+ die "Unable to follow link";
+ # try to find pdf link
+ my $pdf_m = find_pdf_link($m) or
+ die "Unable to find pdf";
+ # Save the PDF body as <pmid>.pdf in the current directory.
+ my $fh = IO::File->new($pmid.'.pdf','w') or
+ die "Unable to open ${pmid}.pdf for writing: $!";
+ print {$fh} $pdf_m->content or
+ die "Unable to write to ${pmid}.pdf: $!";
+ close $fh or
+ die "Unable to close ${pmid}.pdf filehandle: $!";
+ };
+ # Drop the candidate just tried; shift leaves $@ from the eval untouched.
+ shift @possible_links;
+ } while ($@ and @possible_links);
+ # Every candidate failed: propagate the last error to the outer eval.
+ if ($@) {
+ die "$@";
+ }
};
# The automated download above failed: report the error when debugging and
# fall back to running an external text-mode browser on the URL.
if ($@) {
print STDERR "$@\n" if $DEBUG;
- system('links',
+ system('links2',
# Pass the proxy option only when one is configured in %options.
exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(),
$url
# A non-zero exit status from the browser skips to the next iteration of
# the enclosing loop (not visible in this fragment).
) == 0 or next;
print STDERR $m->content() if $DEBUG;
}
my @possible_links;
+ # this is to prioritize the real link at science direct
+ push @possible_links, grep { $_->url_abs() !~ /_orig=article/} $m->find_all_links(text_regex => qr/PDF/i);
push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
url_regex => qr/(reprint|\.pdf)/i,
);
}
my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
for my $frame (@sub_frames) {
- $m->get($frame->url_abs());
+ my $r = $m->get($frame->url_abs());
+ if ($r->header('Content-Type') =~ /pdf/) {
+ return $m;
+ }
my $pdf_m = find_pdf_link($m,
0,
$call+1,