X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=get_pdf;h=74f62dc6998e1f6ec494b4411eac95f9495320d8;hb=3e61fdc995e43c2eafa722caca93609922f9c7f3;hp=745ea9aedec558b4ddea088c70784c2013f5dd15;hpb=80ccd0fbe2b3f3d6067fb18ce38ed39f49ed4570;p=bin.git

diff --git a/get_pdf b/get_pdf
index 745ea9a..74f62dc 100755
--- a/get_pdf
+++ b/get_pdf
@@ -120,11 +120,13 @@ if ($options{pmid}) {
 	eval {
 	    $m->get($url) or die "Unable to get $url";
 	    my $orig_mech = $m->clone();
+	    use Data::Dumper;
 	    my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
 	    # try to find the other links
 	    push @possible_links,
 		grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/Full\s*Text/i}
 		    $m->links();
+	    print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
 	    die "No links" unless @possible_links;
 	    do {
 		$m = $orig_mech;
@@ -179,7 +181,10 @@ sub find_pdf_link {
     }
     my @possible_links;
     # this is to prioritize the real link at science direct
-    push @possible_links, grep { $_->url_abs() !~ /_orig=article/} $m->find_all_links(text_regex => qr/PDF/i);
+    push @possible_links, grep {my $temp = $_->attrs();
+				exists $temp->{title} and $temp->{title} =~ qr/Download\s*PDF/i}
+	$m->find_all_links(text_regex => qr/PDF/i);
+    push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
     push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
 					     url_regex  => qr/(reprint|\.pdf)/i,
 					    );
@@ -187,16 +192,19 @@ sub find_pdf_link {
     push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
     print STDERR $m->uri() if $DEBUG;
     print STDERR $m->content() if $DEBUG;
-    print STDERR map{$_->url_abs(),qq(\n)} @possible_links if $DEBUG;
+    print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
     if (not @possible_links and $DEBUG) {
 	print STDERR $m->content();
     }
     my $best_guess = $possible_links[0] if @possible_links;
     for my $link (@possible_links) {
+	print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
 	my $r = $m->get($link->url_abs());
 	if ($r->header('Content-Type') =~ /pdf/) {
+	    print STDERR "success\n" if $DEBUG;
 	    return $m;
 	}
+	print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
     }
     my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
     for my $frame (@sub_frames) {