git.donarmstrong.com Git - bin.git/commitdiff
* handle multiple full text sources
author Don Armstrong <don@donarmstrong.com>
Wed, 30 Jun 2010 02:16:44 +0000 (02:16 +0000)
committer Don Armstrong <don@donarmstrong.com>
Wed, 30 Jun 2010 02:16:44 +0000 (02:16 +0000)
 * properly deal with iframes

get_pdf

diff --git a/get_pdf b/get_pdf
index 0aadc600c1935f0e5f5a8f775355b2dbdaaece48..3768e5217970af5e4410e527b23b21c9763b0e01 100755 (executable)
--- a/get_pdf
+++ b/get_pdf
@@ -119,17 +119,34 @@ if ($options{pmid}) {
        $url = "http://${url}";
        eval {
            $m->get($url) or die "Unable to get $url";
-           $m->follow_link(text_regex => qr/to\s*read/i) or
-               die "Unable to follow link";
-           # try to find pdf link
-           my $pdf_m = find_pdf_link($m) or
-               die "Unable to find pdf";
-           my $fh = IO::File->new($pmid.'.pdf','w') or
-               die "Unable to open ${pmid}.pdf for writing: $!";
-           print {$fh} $pdf_m->content or
-               die "Unable to write to ${pmid}.pdf: $!";
-           close $fh or
-               die "Unable to close ${pmid}.pdf filehandle: $!";
+           my $orig_mech = $m->clone();
+           my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
+           # try to find the other links
+           push @possible_links,
+               grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i}
+                   $m->links();
+           die "No links" unless @possible_links;
+           do {
+               $m = $orig_mech;
+               eval {
+                   print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
+                   $m->get($possible_links[0]->url()) or
+                       die "Unable to follow link";
+                   # try to find pdf link
+                   my $pdf_m = find_pdf_link($m) or
+                       die "Unable to find pdf";
+                   my $fh = IO::File->new($pmid.'.pdf','w') or
+                       die "Unable to open ${pmid}.pdf for writing: $!";
+                   print {$fh} $pdf_m->content or
+                       die "Unable to write to ${pmid}.pdf: $!";
+                   close $fh or
+                       die "Unable to close ${pmid}.pdf filehandle: $!";
+               };
+               shift @possible_links;
+           } while ($@ and @possible_links);
+           if ($@) {
+               die "$@";
+           }
        };
        if ($@) {
            print STDERR "$@\n" if $DEBUG;
@@ -181,7 +198,10 @@ sub find_pdf_link {
     }
     my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
     for my $frame (@sub_frames) {
-       $m->get($frame->url_abs());
+       my $r = $m->get($frame->url_abs());
+       if ($r->header('Content-Type') =~ /pdf/) {
+           return $m;
+       }
        my $pdf_m = find_pdf_link($m,
                                  0,
                                  $call+1,