add epub ripper

author Don Armstrong <don@donarmstrong.com>

Thu, 12 Sep 2013 22:38:41 +0000 (15:38 -0700)

committer Don Armstrong <don@donarmstrong.com>

Thu, 12 Sep 2013 22:38:41 +0000 (15:38 -0700)
author Don Armstrong <don@donarmstrong.com>
Thu, 12 Sep 2013 22:38:41 +0000 (15:38 -0700)
committer Don Armstrong <don@donarmstrong.com>
Thu, 12 Sep 2013 22:38:41 +0000 (15:38 -0700)
diff --git a/get_epub_pmid b/get_epub_pmid

new file mode 100755 (executable)

index 0000000..e964698
--- /dev/null
+++ b/get_epub_pmid
@@ -0,0 +1,237 @@
+#! /usr/bin/perl
+# get_epub_pmid tries to get epubs from pubmed, and is released
+# under the terms of the GPL version 2, or any later version, at your
+# option. See the file README and COPYING for more information.
+# Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
+# $Id: perl_script 1352 2009-01-25 02:04:38Z don $
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+get_epub_pmid - try to get an epub
+
+=head1 SYNOPSIS
+
+get_epub_pmid [options] reference [references]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--pmid, -p>
+
+The reference is a pmid
+
+=item B<--cgi-proxy, -C>
+
+Use this CGI-style proxy: the value is prepended to the pubmed url, and http:// is added in front of the result.
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+ get_epub_pmid --pmid 12345678   # writes 12345678.epub on success
+
+=cut
+
+use vars qw($DEBUG);
+
+use Cwd;
+use WWW::Mechanize;
+
+# Option defaults; GetOptions below stores the parsed values in this hash.
+my %options = (
+    debug     => 0,
+    help      => 0,
+    man       => 0,
+    use_links => 0,
+);
+
+# Reference type name => Getopt::Long specification that selects it.
+my %REFERENCE_TYPES = (pmid => 'pmid|p');
+
+my @option_specs = (values(%REFERENCE_TYPES),
+                    'use_links|use-links!', 'cgi_proxy|cgi-proxy|C=s',
+                    'http_proxy|http-proxy|H=s', 'debug|d+', 'help|h|?', 'man|m');
+GetOptions(\%options, @option_specs);
+
+if ($options{help}) { pod2usage(); }
+if ($options{man})  { pod2usage({verbose=>2}); }
+$DEBUG = $options{debug};
+
+
+
+my @SELECTED_TYPES = grep {exists $options{$_} and defined $options{$_} and $options{$_}}
+    keys %REFERENCE_TYPES;
+# no reference type given: treat the references as pmids
+$options{pmid} = 1 if not @SELECTED_TYPES;
+my @USAGE_ERRORS;
+# Count the selected types first; without the parentheses "> 1" bound to
+# "keys %REFERENCE_TYPES", so the conflict check could never fire.
+if (@SELECTED_TYPES > 1) {
+    push @USAGE_ERRORS,"You can only specify exactly one of the ".
+        join(', ',map { "--$_"} sort keys %REFERENCE_TYPES)." options";
+}
+
+push @USAGE_ERRORS,"You must specify at least one reference" unless @ARGV;
+
+if (@USAGE_ERRORS) {
+    pod2usage(join("\n",@USAGE_ERRORS));
+}
+
+# hand the --http-proxy value to the environment under all three names
+if (exists $options{http_proxy}) {
+    for my $name (qw(http_proxy HTTP_PROXY CGI_HTTP_PROXY)) {
+        $ENV{$name} = $options{http_proxy};
+    }
+}
+
+# For every pmid given: load its pubmed abstract page, follow the full text
+# links and save the first epub found as <pmid>.epub (links2 is the fallback).
+if ($options{pmid}) {
+    my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
+    for my $pmid (@ARGV) {
+       $pmid =~ s/\D//g;
+       next unless length $pmid;
+       my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
+       $url = $options{cgi_proxy}.$url if exists $options{cgi_proxy};
+       $url = "http://${url}";
+       eval {
+           $m->get($url) or die "Unable to get $url";
+           my $orig_mech = $m->clone();
+           use Data::Dumper;
+           my @possible_links = $m->find_all_links(text_regex => qr/to\s*read|free\s*PMC|full\s*text/i);
+           # try to find the other links
+           push @possible_links,
+            grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
+                   $m->links();
+           print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
+           die "No links" unless @possible_links;
+           do {
+            # fresh copy per attempt: relative links must resolve against the abstract page
+            $m = $orig_mech->clone();
+            eval {
+                print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
+                $m->get($possible_links[0]->url()) or
+                    die "Unable to follow link";
+                # try to find epub link
+                my $epub_m = find_epub_link($m) or
+                    die "Unable to find epub";
+                # plain open (this script never loads IO::File itself); :raw as epubs are binary
+                open(my $fh,'>:raw',"${pmid}.epub") or
+                    die "Unable to open ${pmid}.epub for writing: $!";
+                print {$fh} $epub_m->content or
+                    die "Unable to write to ${pmid}.epub: $!";
+                close $fh or
+                    die "Unable to close ${pmid}.epub filehandle: $!";
+               };
+               shift @possible_links;
+    } while ($@ and @possible_links);
+           die "$@" if $@;
+       };
+       if ($@) {
+           print STDERR "$@\n" if $DEBUG;
+            if ($options{use_links}) {
+                system('links2',
+                       # links2 doesn't like the leading http:// of proxies; strip it from a copy
+                       exists $options{http_proxy}?('-http-proxy',(map {my $p = $_; $p =~ s{^http://}{}; $p} $options{http_proxy})):(),
+                       $url
+                      ) == 0 or next;
+                rename('temp.epub',"${pmid}.epub") if -e 'temp.epub';
+            }
+        }
+    }
+}
+
+
+# find_epub_link($mech,$guess,$call): look for an epub reachable from the page
+# loaded in $mech.  $guess only matters to the disabled best-guess fallback at
+# the end; $call is the frame recursion depth.  Returns a WWW::Mechanize clone
+# whose last response had an epub/zip Content-Type, or nothing on failure.
+sub find_epub_link {
+    my ($mech,$guess,$call) = @_;
+    $guess = 1 unless defined $guess;
+    $call = 0 unless defined $call;
+    # avoid looping endlessly
+    return if $call > 5;
+    my $m = $mech->clone();
+    if ($m->content =~ /select\s*a\s*website\s*below/i) {
+       print STDERR $m->uri() if $DEBUG;
+       print STDERR $m->content() if $DEBUG;
+       # site chooser page: continue with the url stored in the hidden urls['sd'] input
+       my @inputs = $m->find_all_inputs(type => 'hidden', name => q(urls['sd']));
+       return unless @inputs;
+       $m->get($inputs[0]->value);
+       print STDERR $m->content() if $DEBUG;
+    }
+    my @possible_links;
+    # this is to prioritize the real link at science direct
+    push @possible_links, grep {my $temp = $_->attrs();
+                               exists $temp->{title} and $temp->{title} =~ qr/Download\s*epub/i}
+       $m->find_all_links(text_regex => qr/epub/i);
+    push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/epub/i);
+    push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
+                                             url_regex  => qr/(reprint|\.epub)/i,
+                                            );
+    push @possible_links, $m->find_all_links(text_regex => qr/epub/i);
+    push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
+    # the searches above overlap: keep only the first occurrence of each url
+    my %seen_url;
+    @possible_links = grep {not $seen_url{$_->url_abs()}++} @possible_links;
+    print STDERR $m->uri() if $DEBUG;
+    print STDERR $m->content() if $DEBUG;
+    print STDERR map{"possible epub link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
+    my $best_guess = @possible_links ? $possible_links[0] : undef;
+    for my $link (@possible_links) {
+       print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
+       my $r = $m->get($link->url_abs());
+       # a response without a Content-Type header is simply not an epub
+       my $type = $r->header('Content-Type');
+       $type = '' unless defined $type;
+       if ($type =~ /epub|zip/) {
+           print STDERR "success\n" if $DEBUG;
+           return $m;
+       }
+       print STDERR "failure; content type $type\n" if $DEBUG;
+       print STDERR $m->content() if $DEBUG;
+    }
+    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
+    for my $frame (@sub_frames) {
+       my $r = $m->get($frame->url_abs());
+       my $type = $r->header('Content-Type');
+       return $m if defined $type and $type =~ /epub|zip/;
+       my $epub_m = find_epub_link($m,0,$call+1);
+       return $epub_m if defined $epub_m;
+    }
+#     if ($guess and defined $best_guess) {
+#      $m->get($best_guess->url_abs());
+#      return $m;
+#     }
+    return;
+}
+
+
+__END__
author	Don Armstrong <don@donarmstrong.com>
	Thu, 12 Sep 2013 22:38:41 +0000 (15:38 -0700)
committer	Don Armstrong <don@donarmstrong.com>
	Thu, 12 Sep 2013 22:38:41 +0000 (15:38 -0700)