From: Don Armstrong Date: Wed, 30 Jun 2010 02:16:44 +0000 (+0000) Subject: * handle multiple full text sources X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=1d30f0cb452e9c5b4026be5d074aee74cc3efc79;p=bin.git * handle multiple full text sources * properly deal with iframes --- diff --git a/get_pdf b/get_pdf index 0aadc60..3768e52 100755 --- a/get_pdf +++ b/get_pdf @@ -119,17 +119,34 @@ if ($options{pmid}) { $url = "http://${url}"; eval { $m->get($url) or die "Unable to get $url"; - $m->follow_link(text_regex => qr/to\s*read/i) or - die "Unable to follow link"; - # try to find pdf link - my $pdf_m = find_pdf_link($m) or - die "Unable to find pdf"; - my $fh = IO::File->new($pmid.'.pdf','w') or - die "Unable to open ${pmid}.pdf for writing: $!"; - print {$fh} $pdf_m->content or - die "Unable to write to ${pmid}.pdf: $!"; - close $fh or - die "Unable to close ${pmid}.pdf filehandle: $!"; + my $orig_mech = $m->clone(); + my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i); + # try to find the other links + push @possible_links, + grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i} + $m->links(); + die "No links" unless @possible_links; + do { + $m = $orig_mech; + eval { + print "trying ".$possible_links[0]->url()."\n" if $DEBUG; + $m->get($possible_links[0]->url()) or + die "Unable to follow link"; + # try to find pdf link + my $pdf_m = find_pdf_link($m) or + die "Unable to find pdf"; + my $fh = IO::File->new($pmid.'.pdf','w') or + die "Unable to open ${pmid}.pdf for writing: $!"; + print {$fh} $pdf_m->content or + die "Unable to write to ${pmid}.pdf: $!"; + close $fh or + die "Unable to close ${pmid}.pdf filehandle: $!"; + }; + shift @possible_links; + } while ($@ and @possible_links); + if ($@) { + die "$@"; + } }; if ($@) { print STDERR "$@\n" if $DEBUG; @@ -181,7 +198,10 @@ sub find_pdf_link { } my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/); for my $frame (@sub_frames) { - $m->get($frame->url_abs()); + my $r = $m->get($frame->url_abs()); + if ($r->header('Content-Type') =~ /pdf/) { + return $m; + } my $pdf_m = find_pdf_link($m, 0, $call+1,