use Cwd;
use WWW::Mechanize;
+use Data::Printer;
my %options = (debug => 0,
help => 0,
my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
# try to find the other links
push @possible_links,
- grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i}
+ grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
$m->links();
print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
die "No links" unless @possible_links;
}
+sub check_subframes {
+ my ($m,$call) = @_;
+ my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
+ print STDERR "subframes: \n" if $DEBUG;
+ p @sub_frames if $DEBUG;
+ for my $frame (@sub_frames) {
+ my $r = $m->get($frame->url_abs());
+ print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
+ if ($r->header('Content-Type') =~ /pdf/) {
+ return $m;
+ }
+ print STDERR "failed: ".$r->header('Content-Type')."\n" if $DEBUG;
+ }
+ for my $frame (@sub_frames) {
+ my $r = $m->get($frame->url_abs());
+ my $pdf_m = find_pdf_link($m,
+ 0,
+ $call+1,
+ );
+ if (defined $pdf_m) {
+ return $pdf_m;
+ }
+ }
+ return undef;
+}
+
+
sub find_pdf_link {
my ($mech,$guess,$call) = @_;
$guess = 1 unless defined $guess;
return undef if $call > 5;
my $m = $mech->clone();
if ($m->content =~ /select\s*a\s*website\s*below/i) {
- print STDERR $m->uri() if $DEBUG;
- print STDERR $m->content() if $DEBUG;
- my @inputs = $m->find_all_inputs(type => 'hidden',
- name => q(urls['sd']),
- );
- return unless @inputs;
- $m->get($inputs[0]->value);
- print STDERR $m->content() if $DEBUG;
+ print STDERR $m->uri() if $DEBUG;
+ print STDERR $m->content() if $DEBUG > 1;
+ my @inputs = $m->find_all_inputs(type => 'hidden',
+ name => q(urls['sd']),
+ );
+ return unless @inputs;
+ $m->get($inputs[0]->value);
+ print STDERR $m->content() if $DEBUG > 1;
}
my @possible_links;
+ # this brings forward the actual link at Science
+ push @possible_links,
+ grep {my $temp = $_->attrs();
+ exists $temp->{rel} and $temp->{rel} =~ qr/view-/i and
+ defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i
+ }
+ $m->find_all_links(text_regex => qr/PDF/i);
# this is to prioritize the real link at science direct
- push @possible_links, grep {my $temp = $_->attrs();
- exists $temp->{title} and $temp->{title} =~ qr/Download\s*PDF/i}
- $m->find_all_links(text_regex => qr/PDF/i);
+ push @possible_links,
+ grep {my $temp = $_->attrs();
+ use Data::Dumper;
+ print STDERR Dumper($temp);
+ (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
+ (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
+ }
+ $m->find_all_links(text_regex => qr/PDF/i);
+ my $possible_links = 0;
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
url_regex => qr/(reprint|\.pdf)/i,
);
- push @possible_links, $m->find_all_links(text_regex => qr/pdf/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
+ # The masthead grep here is to handle PNAS, which has a link to their masthead in every article.
+ push @possible_links,
+ grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;}
+ $m->find_all_links(text_regex => qr/pdf/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
+ if ($DEBUG) {
+ $possible_links++;
+ print STDERR "possible links[$possible_links]:\n";
+ p @possible_links;
+ }
print STDERR $m->uri() if $DEBUG;
- print STDERR $m->content() if $DEBUG;
+ print STDERR $m->content() if $DEBUG > 1;
print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
if (not @possible_links and $DEBUG) {
- print STDERR $m->content();
+ print STDERR $m->content();
}
my $best_guess = $possible_links[0] if @possible_links;
for my $link (@possible_links) {
- print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
- my $r = $m->get($link->url_abs());
- if ($r->header('Content-Type') =~ /pdf/) {
- print STDERR "success\n" if $DEBUG;
- return $m;
- }
- print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
- print STDERR $m->content() if $DEBUG;
- }
- my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
- for my $frame (@sub_frames) {
- my $r = $m->get($frame->url_abs());
- if ($r->header('Content-Type') =~ /pdf/) {
- return $m;
- }
- my $pdf_m = find_pdf_link($m,
- 0,
- $call+1,
- );
- if (defined $pdf_m) {
- return $pdf_m;
- }
+ print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
+ my $r = $m->get($link->url_abs());
+ my $content = $m->content();
+ if ($r->header('Content-Type') =~ /pdf/) {
+ print STDERR "success\n" if $DEBUG;
+ return $m;
+ }
+ my $ret = check_subframes($m,$call);
+ return $ret if defined $ret;
+ print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
+ print STDERR $content if $DEBUG;
}
+ my $ret = check_subframes($m,$call);
+ return $ret if defined $ret;
# if ($guess and defined $best_guess) {
# $m->get($best_guess->url_abs());
# return $m;