use Cwd;
use WWW::Mechanize;
+use Data::Printer;
my %options = (debug => 0,
help => 0,
$DEBUG = $options{debug};
-
+binmode(STDOUT,":encoding(UTF-8)");
+binmode(STDERR,":encoding(UTF-8)");
if (not grep {exists $options{$_} and
defined $options{$_} and
my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
# try to find the other links
push @possible_links,
- grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
+ grep {my $attr = $_->attrs();
+ exists $attr->{title} and
+ $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
$m->links();
print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
die "No links" unless @possible_links;
};
if ($@) {
print STDERR "$@\n" if $DEBUG;
- if ($options{use_links}) {
+ if ($options{use_links}) {
+ if ($ENV{DISPLAY}) {
+ system('chromium',
+ # links2 doesn't like the leading http:// of proxies for some reason
+ exists $options{http_proxy}?('--proxy-server',(map {s{http://}{}; $_} $options{http_proxy})):(),
+ '--temp-profile',
+ $url,
+ ) == 0 or next;
+ rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
+ } else {
system('links2',
# links2 doesn't like the leading http:// of proxies for some reason
exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(),
) == 0 or next;
rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
}
+ }}
+ }
+}
+
+
+sub check_subframes {
+ my ($m,$call) = @_;
+ my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
+ print STDERR "subframes: \n" if $DEBUG;
+ p @sub_frames if $DEBUG;
+ for my $frame (@sub_frames) {
+ my $r = $m->get($frame->url_abs());
+ print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
+ if ($r->header('Content-Type') =~ /pdf/) {
+ return $m;
+ }
+ print STDERR "failed: ".$r->header('Content-Type')."\n" if $DEBUG;
+ }
+ for my $frame (@sub_frames) {
+ my $r = $m->get($frame->url_abs());
+ my $pdf_m = find_pdf_link($m,
+ 0,
+ $call+1,
+ );
+ if (defined $pdf_m) {
+ return $pdf_m;
}
}
+ return undef;
}
return undef if $call > 5;
my $m = $mech->clone();
if ($m->content =~ /select\s*a\s*website\s*below/i) {
- print STDERR $m->uri() if $DEBUG;
- print STDERR $m->content() if $DEBUG;
- my @inputs = $m->find_all_inputs(type => 'hidden',
- name => q(urls['sd']),
- );
- return unless @inputs;
- $m->get($inputs[0]->value);
- print STDERR $m->content() if $DEBUG;
+ print STDERR $m->uri() if $DEBUG;
+ print STDERR $m->content() if $DEBUG > 1;
+ my @inputs = $m->find_all_inputs(type => 'hidden',
+ name => q(urls['sd']),
+ );
+ return unless @inputs;
+ $m->get($inputs[0]->value);
+ print STDERR $m->content() if $DEBUG > 1;
}
my @possible_links;
+ # this brings forward the actual link at Science
+ push @possible_links,
+ grep {my $temp = $_->attrs();
+ exists $temp->{rel} and $temp->{rel} =~ qr/view-/i and
+ defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i
+ }
+ $m->find_all_links(text_regex => qr/PDF/i);
+ push @possible_links,
+ grep {my $temp = $_->attrs();
+ exists $temp->{rel} and $temp->{rel} =~ qr/alternate/i and
+ exists $temp->{type} and $temp->{type} =~ qr/pdf/i
+ }
+ $m->find_all_links(url_regex => qr/pdf/);
# this is to prioritize the real link at science direct
- push @possible_links, grep {my $temp = $_->attrs();
- exists $temp->{title} and $temp->{title} =~ qr/Download\s*PDF/i}
- $m->find_all_links(text_regex => qr/PDF/i);
+ push @possible_links,
+ grep {my $temp = $_->attrs();
+ use Data::Dumper;
+ print STDERR Dumper($temp);
+ (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
+ (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
+ }
+ $m->find_all_links(text_regex => qr/PDF/i);
+ my $possible_links = 0;
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
url_regex => qr/(reprint|\.pdf)/i,
);
- push @possible_links, $m->find_all_links(text_regex => qr/pdf/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
+ # The masthead grep here is to handle PNAS, which has a link to their masthead in every article.
+ push @possible_links,
+ grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;}
+ $m->find_all_links(text_regex => qr/pdf/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
print STDERR $m->uri() if $DEBUG;
- print STDERR $m->content() if $DEBUG;
+ print STDERR $m->content() if $DEBUG > 1;
print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
if (not @possible_links and $DEBUG) {
- print STDERR $m->content();
+ print STDERR $m->content();
}
my $best_guess = $possible_links[0] if @possible_links;
for my $link (@possible_links) {
- print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
- my $r = $m->get($link->url_abs());
- if ($r->header('Content-Type') =~ /pdf/) {
- print STDERR "success\n" if $DEBUG;
- return $m;
- }
- print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
- print STDERR $m->content() if $DEBUG;
- }
- my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
- for my $frame (@sub_frames) {
- my $r = $m->get($frame->url_abs());
- if ($r->header('Content-Type') =~ /pdf/) {
- return $m;
- }
- my $pdf_m = find_pdf_link($m,
- 0,
- $call+1,
- );
- if (defined $pdf_m) {
- return $pdf_m;
- }
+ print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
+ my $r = $m->get($link->url_abs());
+ my $content = $m->content();
+ if ($r->header('Content-Type') =~ /pdf/) {
+ print STDERR "success\n" if $DEBUG;
+ return $m;
+ }
+ my $ret = check_subframes($m,$call);
+ return $ret if defined $ret;
+ print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
+ print STDERR $content if $DEBUG;
}
+ my $ret = check_subframes($m,$call);
+ return $ret if defined $ret;
# if ($guess and defined $best_guess) {
# $m->get($best_guess->url_abs());
# return $m;