}
if ($options{pmid}) {
- my $m = WWW::Mechanize->new();
+ my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
for my $pmid (@ARGV) {
$pmid =~ s/\D//g;
next unless length $pmid;
eval {
$m->get($url) or die "Unable to get $url";
my $orig_mech = $m->clone();
+ use Data::Dumper;
my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
# try to find the other links
push @possible_links,
grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i}
$m->links();
+ print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
die "No links" unless @possible_links;
do {
$m = $orig_mech;
};
if ($@) {
print STDERR "$@\n" if $DEBUG;
- system('links',
+ system('links2',
exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(),
$url
) == 0 or next;
}
my @possible_links;
# this is to prioritize the real link at science direct
- push @possible_links, grep { $_->url_abs() !~ /_orig=article/} $m->find_all_links(text_regex => qr/PDF/i);
+ push @possible_links, grep {my $temp = $_->attrs();
+ exists $temp->{title} and $temp->{title} =~ qr/Download\s*PDF/i}
+ $m->find_all_links(text_regex => qr/PDF/i);
+ push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
url_regex => qr/(reprint|\.pdf)/i,
);
push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
print STDERR $m->uri() if $DEBUG;
print STDERR $m->content() if $DEBUG;
- print STDERR map{$_->url_abs(),qq(\n)} @possible_links if $DEBUG;
+ print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
if (not @possible_links and $DEBUG) {
print STDERR $m->content();
}
my $best_guess = $possible_links[0] if @possible_links;
for my $link (@possible_links) {
+ print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
my $r = $m->get($link->url_abs());
if ($r->header('Content-Type') =~ /pdf/) {
+ print STDERR "success\n" if $DEBUG;
return $m;
}
+ print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
+ print STDERR $m->content() if $DEBUG;
}
my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
for my $frame (@sub_frames) {