git.donarmstrong.com Git - bin.git/blobdiff - get_pdf
* document tex-only and log-only
[bin.git] / get_pdf
diff --git a/get_pdf b/get_pdf
index 7daca792fc7f6332a7e11121b83bb3ad4b211b0b..745ea9aedec558b4ddea088c70784c2013f5dd15 100755 (executable)
--- a/get_pdf
+++ b/get_pdf
@@ -119,25 +119,42 @@ if ($options{pmid}) {
        $url = "http://${url}";
        eval {
            $m->get($url) or die "Unable to get $url";
-           $m->follow_link(text_regex => qr/to\s*read/i) or
-               die "Unable to follow link";
-           # try to find pdf link
-           my $pdf_m = find_pdf_link($m) or
-               die "Unable to find pdf";
-           my $fh = IO::File->new($pmid.'.pdf','w') or
-               die "Unable to open ${pmid}.pdf for writing: $!";
-           print {$fh} $pdf_m->content or
-               die "Unable to write to ${pmid}.pdf: $!";
-           close $fh or
-               die "Unable to close ${pmid}.pdf filehandle: $!";
+           my $orig_mech = $m->clone();
+           my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
+           # try to find the other links
+           push @possible_links,
+               grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i}
+                   $m->links();
+           die "No links" unless @possible_links;
+           do {
+               $m = $orig_mech;
+               eval {
+                   print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
+                   $m->get($possible_links[0]->url()) or
+                       die "Unable to follow link";
+                   # try to find pdf link
+                   my $pdf_m = find_pdf_link($m) or
+                       die "Unable to find pdf";
+                   my $fh = IO::File->new($pmid.'.pdf','w') or
+                       die "Unable to open ${pmid}.pdf for writing: $!";
+                   print {$fh} $pdf_m->content or
+                       die "Unable to write to ${pmid}.pdf: $!";
+                   close $fh or
+                       die "Unable to close ${pmid}.pdf filehandle: $!";
+               };
+               shift @possible_links;
+           } while ($@ and @possible_links);
+           if ($@) {
+               die "$@";
+           }
        };
        if ($@) {
            print STDERR "$@\n" if $DEBUG;
-           ## system('links',
-           ##     exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(),
-           ##     $url
-           ##    ) == 0 or next;
-           ## rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
+           system('links2',
+                  exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(),
+                  $url
+                 ) == 0 or next;
+           rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
        }
     }
 }
@@ -161,6 +178,8 @@ sub find_pdf_link {
        print STDERR $m->content() if $DEBUG;
     }
     my @possible_links;
+    # this is to prioritize the real link at science direct
+    push @possible_links, grep { $_->url_abs() !~ /_orig=article/} $m->find_all_links(text_regex => qr/PDF/i);
     push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
                                             url_regex  => qr/(reprint|\.pdf)/i,
                                            );
@@ -181,7 +200,10 @@ sub find_pdf_link {
     }
     my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
     for my $frame (@sub_frames) {
-       $m->get($frame->url_abs());
+       my $r = $m->get($frame->url_abs());
+       if ($r->header('Content-Type') =~ /pdf/) {
+           return $m;
+       }
        my $pdf_m = find_pdf_link($m,
                                  0,
                                  $call+1,
@@ -190,10 +212,10 @@ sub find_pdf_link {
            return $pdf_m;
        }
     }
-    if ($guess and defined $best_guess) {
-       $m->get($best_guess->url_abs());
-       return $m;
-    }
+    if ($guess and defined $best_guess) {
+#      $m->get($best_guess->url_abs());
+#      return $m;
+    }
     return undef;
 }