add search for subframes to get_pdf

author Don Armstrong <don@donarmstrong.com>

Wed, 20 Nov 2013 23:19:18 +0000 (15:19 -0800)

committer Don Armstrong <don@donarmstrong.com>

Wed, 20 Nov 2013 23:19:18 +0000 (15:19 -0800)
author Don Armstrong <don@donarmstrong.com>
Wed, 20 Nov 2013 23:19:18 +0000 (15:19 -0800)
committer Don Armstrong <don@donarmstrong.com>
Wed, 20 Nov 2013 23:19:18 +0000 (15:19 -0800)
diff --git a/get_pdf b/get_pdf

index 2184c1c80a2314a8e480f5100f1b6974c9739e53..935bfc82ca09dfe5aec7dc3b8fb4e91ae3702d25 100755 (executable)
--- a/get_pdf
+++ b/get_pdf
@@ -61,6 +61,7 @@ use vars qw($DEBUG);
  
  use Cwd;
  use WWW::Mechanize;
+use Data::Printer;
  
  my %options = (debug           => 0,
                help            => 0,
@@ -167,6 +168,33 @@ if ($options{pmid}) {
  }
  
  
+sub check_subframes {
+    my ($m,$call) = @_;
+    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
+    print STDERR "subframes: \n" if $DEBUG;
+    p @sub_frames if $DEBUG;
+    for my $frame (@sub_frames) {
+        my $r = $m->get($frame->url_abs());
+        print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
+        if ($r->header('Content-Type') =~ /pdf/) {
+            return $m;
+        }
+        print STDERR "failed: ".$r->header('Content-Type')."\n" if $DEBUG;
+    }
+    for my $frame (@sub_frames) {
+        my $r = $m->get($frame->url_abs());
+        my $pdf_m = find_pdf_link($m,
+                                  0,
+                                  $call+1,
+                                 );
+        if (defined $pdf_m) {
+            return $pdf_m;
+        }
+    }
+    return undef;
+}
+
+
  sub find_pdf_link {
      my ($mech,$guess,$call) = @_;
      $guess = 1 unless defined $guess;
@@ -175,57 +203,82 @@ sub find_pdf_link {
      return undef if $call > 5;
      my $m = $mech->clone();
      if ($m->content =~ /select\s*a\s*website\s*below/i) {
-       print STDERR $m->uri() if $DEBUG;
-       print STDERR $m->content() if $DEBUG;
-       my @inputs = $m->find_all_inputs(type => 'hidden',
-                                        name => q(urls['sd']),
-                                       );
-       return unless @inputs;
-       $m->get($inputs[0]->value);
-       print STDERR $m->content() if $DEBUG;
+        print STDERR $m->uri() if $DEBUG;
+        print STDERR $m->content() if $DEBUG > 1;
+        my @inputs = $m->find_all_inputs(type => 'hidden',
+                                         name => q(urls['sd']),
+                                        );
+        return unless @inputs;
+        $m->get($inputs[0]->value);
+        print STDERR $m->content() if $DEBUG > 1;
      }
      my @possible_links;
      # this is to prioritize the real link at science direct
-    push @possible_links, grep {my $temp = $_->attrs();
-                               exists $temp->{title} and $temp->{title} =~ qr/Download\s*PDF/i}
-       $m->find_all_links(text_regex => qr/PDF/i);
+    push @possible_links, 
+        grep {my $temp = $_->attrs();
+              use Data::Dumper;
+              print STDERR Dumper($temp);
+              (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
+                  (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
+              }
+        $m->find_all_links(text_regex => qr/PDF/i);
+    my $possible_links = 0;
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
      push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
      push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
                                              url_regex  => qr/(reprint|\.pdf)/i,
                                             );
-    push @possible_links, $m->find_all_links(text_regex => qr/pdf/i);
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
+    # The masthead grep here is to handle PNAS, which has a link to their masthead in every article.
+    push @possible_links,
+        grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;}
+        $m->find_all_links(text_regex => qr/pdf/i);
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
      push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
+    if ($DEBUG) {
+        $possible_links++;
+        print STDERR "possible links[$possible_links]:\n";
+        p @possible_links;
+    }
      print STDERR $m->uri() if $DEBUG;
-    print STDERR $m->content() if $DEBUG;
+    print STDERR $m->content() if $DEBUG > 1;
      print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
      if (not @possible_links and $DEBUG) {
-       print STDERR $m->content();
+        print STDERR $m->content();
      }
      my $best_guess = $possible_links[0] if @possible_links;
      for my $link (@possible_links) {
-       print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
-       my $r = $m->get($link->url_abs());
-       if ($r->header('Content-Type') =~ /pdf/) {
-           print STDERR "success\n" if $DEBUG;
-           return $m;
-       }
-       print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
-       print STDERR $m->content() if $DEBUG;
-    }
-    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
-    for my $frame (@sub_frames) {
-       my $r = $m->get($frame->url_abs());
-       if ($r->header('Content-Type') =~ /pdf/) {
-           return $m;
-       }
-       my $pdf_m = find_pdf_link($m,
-                                 0,
-                                 $call+1,
-                                );
-       if (defined $pdf_m) {
-           return $pdf_m;
-       }
+        print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
+        my $r = $m->get($link->url_abs());
+        my $content = $m->content();
+        if ($r->header('Content-Type') =~ /pdf/) {
+            print STDERR "success\n" if $DEBUG;
+            return $m;
+        }
+        my $ret = check_subframes($m,$call);
+        return $ret if defined $ret;
+        print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
+        print STDERR $content if $DEBUG;
      }
+    my $ret = check_subframes($m,$call);
+    return $ret if defined $ret;
  #     if ($guess and defined $best_guess) {
  #      $m->get($best_guess->url_abs());
  #      return $m;
author	Don Armstrong <don@donarmstrong.com>
	Wed, 20 Nov 2013 23:19:18 +0000 (15:19 -0800)
committer	Don Armstrong <don@donarmstrong.com>
	Wed, 20 Nov 2013 23:19:18 +0000 (15:19 -0800)