X-Git-Url: https://git.donarmstrong.com/?p=bin.git;a=blobdiff_plain;f=get_pdf;h=0c1ad2fa9a5ad90825be15bae55124648911c11d;hp=74f62dc6998e1f6ec494b4411eac95f9495320d8;hb=HEAD;hpb=3e61fdc995e43c2eafa722caca93609922f9c7f3

diff --git a/get_pdf b/get_pdf
index 74f62dc..0c1ad2f 100755
--- a/get_pdf
+++ b/get_pdf
@@ -61,16 +61,19 @@ use vars qw($DEBUG);
 
 use Cwd;
 use WWW::Mechanize;
+use Data::Printer;
 
 my %options = (debug           => 0,
 	       help            => 0,
 	       man             => 0,
+               use_links       => 1,
 	       );
 
 my %REFERENCE_TYPES = (pmid => 'pmid|p');
 
 GetOptions(\%options,
 	   values %REFERENCE_TYPES,
+           'use_links|use-links!',
 	   'cgi_proxy|cgi-proxy|C=s',
 	   'http_proxy|http-proxy|H=s',
 	   'debug|d+','help|h|?','man|m');
@@ -80,7 +83,8 @@ pod2usage({verbose=>2}) if $options{man};
 
 $DEBUG = $options{debug};
 
-
+binmode(STDOUT,":encoding(UTF-8)");
+binmode(STDERR,":encoding(UTF-8)");
 
 if (not grep {exists $options{$_} and
 		  defined $options{$_} and
@@ -108,7 +112,7 @@ if (exists $options{http_proxy}) {
 }
 
 if ($options{pmid}) {
-    my $m = WWW::Mechanize->new();
+    my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
     for my $pmid (@ARGV) {
 	$pmid =~ s/\D//g;
 	next unless length $pmid;
@@ -124,7 +128,9 @@ if ($options{pmid}) {
 	    my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
 	    # try to find the other links
 	    push @possible_links,
-		grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i}
+            grep {my $attr = $_->attrs();
+                  exists $attr->{title} and
+                      $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
 		    $m->links();
 	    print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
 	    die "No links" unless @possible_links;
@@ -152,16 +158,55 @@ if ($options{pmid}) {
 	};
 	if ($@) {
 	    print STDERR "$@\n" if $DEBUG;
-	    system('links2',
-	    	   exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(),
-	    	   $url
-	    	  ) == 0 or next;
-	    rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
-	}
+        if ($options{use_links}) {
+            if ($ENV{DISPLAY}) {
+                system('chromium',
+                       # links2 doesn't like the leading http:// of proxies for some reason
+                       exists $options{http_proxy}?('--proxy-server',(map {s{http://}{}; $_} $options{http_proxy})):(),
+                       '--temp-profile',
+                       $url,
+                      ) == 0 or next;
+                rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
+            } else {
+                system('links2',
+                       # links2 doesn't like the leading http:// of proxies for some reason
+                       exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(),
+                       $url
+                      ) == 0 or next;
+                rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
+            }
+        }}
     }
 }
 
 
+sub check_subframes {
+    my ($m,$call) = @_;
+    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
+    print STDERR "subframes: \n" if $DEBUG;
+    p @sub_frames if $DEBUG;
+    for my $frame (@sub_frames) {
+        my $r = $m->get($frame->url_abs());
+        print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
+        if ($r->header('Content-Type') =~ /pdf/) {
+            return $m;
+        }
+        print STDERR "failed: ".$r->header('Content-Type')."\n" if $DEBUG;
+    }
+    for my $frame (@sub_frames) {
+        my $r = $m->get($frame->url_abs());
+        my $pdf_m = find_pdf_link($m,
+                                  0,
+                                  $call+1,
+                                 );
+        if (defined $pdf_m) {
+            return $pdf_m;
+        }
+    }
+    return undef;
+}
+
+
 sub find_pdf_link {
     my ($mech,$guess,$call) = @_;
     $guess = 1 unless defined $guess;
@@ -170,56 +215,95 @@ sub find_pdf_link {
     return undef if $call > 5;
     my $m = $mech->clone();
     if ($m->content =~ /select\s*a\s*website\s*below/i) {
-	print STDERR $m->uri() if $DEBUG;
-	print STDERR $m->content() if $DEBUG;
-	my @inputs = $m->find_all_inputs(type => 'hidden',
-					 name => q(urls['sd']),
-					);
-	return unless @inputs;
-	$m->get($inputs[0]->value);
-	print STDERR $m->content() if $DEBUG;
+        print STDERR $m->uri() if $DEBUG;
+        print STDERR $m->content() if $DEBUG > 1;
+        my @inputs = $m->find_all_inputs(type => 'hidden',
+                                         name => q(urls['sd']),
+                                        );
+        return unless @inputs;
+        $m->get($inputs[0]->value);
+        print STDERR $m->content() if $DEBUG > 1;
     }
     my @possible_links;
+    # this brings forward the actual link at Science
+    push @possible_links,
+        grep {my $temp = $_->attrs();
+              exists $temp->{rel} and $temp->{rel} =~ qr/view-/i and
+                  defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i
+              }
+        $m->find_all_links(text_regex => qr/PDF/i);
+    push @possible_links,
+        grep {my $temp = $_->attrs();
+              exists $temp->{rel} and $temp->{rel} =~ qr/alternate/i and
+                  exists $temp->{type} and $temp->{type} =~ qr/pdf/i
+              }
+        $m->find_all_links(url_regex => qr/pdf/);
     # this is to prioritize the real link at science direct
-    push @possible_links, grep {my $temp = $_->attrs();
-				exists $temp->{title} and $temp->{title} =~ qr/Download\s*PDF/i}
-	$m->find_all_links(text_regex => qr/PDF/i);
+    push @possible_links,
+        grep {my $temp = $_->attrs();
+              use Data::Dumper;
+              print STDERR Dumper($temp);
+              (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
+                  (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
+              }
+        $m->find_all_links(text_regex => qr/PDF/i);
+    my $possible_links = 0;
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
     push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
     push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
 					     url_regex  => qr/(reprint|\.pdf)/i,
 					    );
-    push @possible_links, $m->find_all_links(text_regex => qr/pdf/i);
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
+    # The masthead grep here is to handle PNAS, which has a link to their masthead in every article.
+    push @possible_links,
+        grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;}
+        $m->find_all_links(text_regex => qr/pdf/i);
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
     push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
     print STDERR $m->uri() if $DEBUG;
-    print STDERR $m->content() if $DEBUG;
+    print STDERR $m->content() if $DEBUG > 1;
     print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
     if (not @possible_links and $DEBUG) {
-	print STDERR $m->content();
+        print STDERR $m->content();
     }
     my $best_guess = $possible_links[0] if @possible_links;
     for my $link (@possible_links) {
-	print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
-	my $r = $m->get($link->url_abs());
-	if ($r->header('Content-Type') =~ /pdf/) {
-	    print STDERR "success\n" if $DEBUG;
-	    return $m;
-	}
-	print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
-    }
-    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
-    for my $frame (@sub_frames) {
-	my $r = $m->get($frame->url_abs());
-	if ($r->header('Content-Type') =~ /pdf/) {
-	    return $m;
-	}
-	my $pdf_m = find_pdf_link($m,
-				  0,
-				  $call+1,
-				 );
-	if (defined $pdf_m) {
-	    return $pdf_m;
-	}
+        print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
+        my $r = $m->get($link->url_abs());
+        my $content = $m->content();
+        if ($r->header('Content-Type') =~ /pdf/) {
+            print STDERR "success\n" if $DEBUG;
+            return $m;
+        }
+        my $ret = check_subframes($m,$call);
+        return $ret if defined $ret;
+        print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
+        print STDERR $content if $DEBUG;
     }
+    my $ret = check_subframes($m,$call);
+    return $ret if defined $ret;
 #     if ($guess and defined $best_guess) {
 # 	$m->get($best_guess->url_abs());
 # 	return $m;