use Cwd;
use WWW::Mechanize;
+use Data::Printer;
my %options = (debug => 0,
help => 0,
man => 0,
+ use_links => 1,
);
my %REFERENCE_TYPES = (pmid => 'pmid|p');
GetOptions(\%options,
values %REFERENCE_TYPES,
+ 'use_links|use-links!',
'cgi_proxy|cgi-proxy|C=s',
'http_proxy|http-proxy|H=s',
'debug|d+','help|h|?','man|m');
}
if ($options{pmid}) {
- my $m = WWW::Mechanize->new();
+ my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
for my $pmid (@ARGV) {
$pmid =~ s/\D//g;
next unless length $pmid;
$url = "http://${url}";
eval {
$m->get($url) or die "Unable to get $url";
- $m->follow_link(text_regex => qr/to\s*read/i) or
- die "Unable to follow link";
- # try to find pdf link
- my $pdf_m = find_pdf_link($m) or
- die "Unable to find pdf";
- my $fh = IO::File->new($pmid.'.pdf','w') or
- die "Unable to open ${pmid}.pdf for writing: $!";
- print {$fh} $pdf_m->content or
- die "Unable to write to ${pmid}.pdf: $!";
- close $fh or
- die "Unable to close ${pmid}.pdf filehandle: $!";
+ my $orig_mech = $m->clone();
+ use Data::Dumper;
+ my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
+ # try to find the other links
+ push @possible_links,
+ grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
+ $m->links();
+ print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
+ die "No links" unless @possible_links;
+ do {
+ $m = $orig_mech;
+ eval {
+ print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
+ $m->get($possible_links[0]->url()) or
+ die "Unable to follow link";
+ # try to find pdf link
+ my $pdf_m = find_pdf_link($m) or
+ die "Unable to find pdf";
+ my $fh = IO::File->new($pmid.'.pdf','w') or
+ die "Unable to open ${pmid}.pdf for writing: $!";
+ print {$fh} $pdf_m->content or
+ die "Unable to write to ${pmid}.pdf: $!";
+ close $fh or
+ die "Unable to close ${pmid}.pdf filehandle: $!";
+ };
+ shift @possible_links;
+ } while ($@ and @possible_links);
+ if ($@) {
+ die "$@";
+ }
};
if ($@) {
print STDERR "$@\n" if $DEBUG;
- ## system('links',
- ## exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(),
- ## $url
- ## ) == 0 or next;
- ## rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
- }
+ if ($options{use_links}) {
+ system('links2',
+ # links2 doesn't like the leading http:// of proxies for some reason
+ exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(),
+ $url
+ ) == 0 or next;
+ rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
+ }
+ }
}
}
+sub check_subframes {
+ my ($m,$call) = @_;
+ my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
+ print STDERR "subframes: \n" if $DEBUG;
+ p @sub_frames if $DEBUG;
+ for my $frame (@sub_frames) {
+ my $r = $m->get($frame->url_abs());
+ print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
+ if ($r->header('Content-Type') =~ /pdf/) {
+ return $m;
+ }
+ print STDERR "failed: ".$r->header('Content-Type')."\n" if $DEBUG;
+ }
+ for my $frame (@sub_frames) {
+ my $r = $m->get($frame->url_abs());
+ my $pdf_m = find_pdf_link($m,
+ 0,
+ $call+1,
+ );
+ if (defined $pdf_m) {
+ return $pdf_m;
+ }
+ }
+ return undef;
+}
+
+
sub find_pdf_link {
my ($mech,$guess,$call) = @_;
$guess = 1 unless defined $guess;
return undef if $call > 5;
my $m = $mech->clone();
if ($m->content =~ /select\s*a\s*website\s*below/i) {
- print STDERR $m->uri() if $DEBUG;
- print STDERR $m->content() if $DEBUG;
- my @inputs = $m->find_all_inputs(type => 'hidden',
- name => q(urls['sd']),
- );
- return unless @inputs;
- $m->get($inputs[0]->value);
- print STDERR $m->content() if $DEBUG;
+ print STDERR $m->uri() if $DEBUG;
+ print STDERR $m->content() if $DEBUG > 1;
+ my @inputs = $m->find_all_inputs(type => 'hidden',
+ name => q(urls['sd']),
+ );
+ return unless @inputs;
+ $m->get($inputs[0]->value);
+ print STDERR $m->content() if $DEBUG > 1;
+ }
+ my @possible_links;
+ # this brings forward the actual link at Science
+ push @possible_links,
+ grep {my $temp = $_->attrs();
+ exists $temp->{rel} and $temp->{rel} =~ qr/view-/i and
+ defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i
+ }
+ $m->find_all_links(text_regex => qr/PDF/i);
+ # this is to prioritize the real link at science direct
+ push @possible_links,
+ grep {my $temp = $_->attrs();
+ use Data::Dumper;
+ print STDERR Dumper($temp);
+ (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
+ (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
+ }
+ $m->find_all_links(text_regex => qr/PDF/i);
+ my $possible_links = 0;
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
+ push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
+ push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
+ url_regex => qr/(reprint|\.pdf)/i,
+ );
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
+ # The masthead grep here is to handle PNAS, which has a link to their masthead in every article.
+ push @possible_links,
+ grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;}
+ $m->find_all_links(text_regex => qr/pdf/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
}
- my @possible_links = $m->find_all_links(text_regex => qr/pdf/i);
push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
- print STDERR map{$_->url,qq(\n)} @possible_links if $DEBUG;
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
+ print STDERR $m->uri() if $DEBUG;
+ print STDERR $m->content() if $DEBUG > 1;
+ print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
if (not @possible_links and $DEBUG) {
- print STDERR $m->content();
+ print STDERR $m->content();
}
my $best_guess = $possible_links[0] if @possible_links;
for my $link (@possible_links) {
- my $r = $m->get($link->url());
- if ($r->header('Content-Type') =~ /pdf/) {
- return $m;
- }
- }
- my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
- for my $frame (@sub_frames) {
- $m->get($frame->url());
- my $pdf_m = find_pdf_link($m,
- 0,
- $call+1,
- );
- if (defined $pdf_m) {
- return $pdf_m;
- }
- }
- if ($guess and defined $best_guess) {
- $m->get($best_guess->url());
- return $m;
+ print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
+ my $r = $m->get($link->url_abs());
+ my $content = $m->content();
+ if ($r->header('Content-Type') =~ /pdf/) {
+ print STDERR "success\n" if $DEBUG;
+ return $m;
+ }
+ my $ret = check_subframes($m,$call);
+ return $ret if defined $ret;
+ print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
+ print STDERR $content if $DEBUG;
}
+ my $ret = check_subframes($m,$call);
+ return $ret if defined $ret;
+# if ($guess and defined $best_guess) {
+# $m->get($best_guess->url_abs());
+# return $m;
+# }
return undef;
}