From: Don Armstrong
Date: Thu, 12 Sep 2013 22:38:41 +0000 (-0700)
Subject: add epub ripper
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=853a5a2b3a574c28c9db665acabb3e003e2c896c;p=bin.git

add epub ripper
---

diff --git a/get_epub_pmid b/get_epub_pmid
new file mode 100755
index 0000000..e964698
--- /dev/null
+++ b/get_epub_pmid
@@ -0,0 +1,237 @@
+#! /usr/bin/perl
+# get_epub_pmid tries to get epubs from pubmed, and is released
+# under the terms of the GPL version 2, or any later version, at your
+# option. See the file README and COPYING for more information.
+# Copyright 2008 by Don Armstrong .
+# $Id: perl_script 1352 2009-01-25 02:04:38Z don $
+
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Pod::Usage;
+
+=head1 NAME
+
+get_epub_pmid - try to get an epub from PubMed
+
+=head1 SYNOPSIS
+
+get_epub_pmid [options] reference [references]
+
+ Options:
+  --debug, -d debugging level (Default 0)
+  --help, -h display this help
+  --man, -m display manual
+
+=head1 OPTIONS
+
+=over
+
+=item B<--pmid, -p>
+
+The reference is a PubMed ID (PMID). This is the default reference type.
+
+=item B<--cgi-proxy, -C>
+
+Use this CGI-proxy style proxy; its URL is prepended to the PubMed URL.
+
+=item B<--http-proxy, -H>
+
+Use this HTTP proxy (sets the http_proxy, HTTP_PROXY, and CGI_HTTP_PROXY
+environment variables).
+
+=item B<--use-links>
+
+If the automatic download fails, open the article page in links2 as a
+fallback; a manually saved temp.epub is renamed to the PMID.
+
+=item B<--debug, -d>
+
+Debug verbosity. (Default 0)
+
+=item B<--help, -h>
+
+Display brief usage information.
+
+=item B<--man, -m>
+
+Display this manual.
+
+=back
+
+=head1 EXAMPLES
+
+
+=cut
+
+
+use vars qw($DEBUG);
+
+use Cwd;
+use IO::File;
+use WWW::Mechanize;
+
+my %options = (debug     => 0,
+               help      => 0,
+               man       => 0,
+               use_links => 0,
+              );
+
+my %REFERENCE_TYPES = (pmid => 'pmid|p');
+
+GetOptions(\%options,
+           values %REFERENCE_TYPES,
+           'use_links|use-links!',
+           'cgi_proxy|cgi-proxy|C=s',
+           'http_proxy|http-proxy|H=s',
+           'debug|d+','help|h|?','man|m');
+
+pod2usage() if $options{help};
+pod2usage({verbose=>2}) if $options{man};
+
+$DEBUG = $options{debug};
+
+
+# default to treating references as PMIDs if no reference type was given
+if (not grep {exists $options{$_} and
+              defined $options{$_} and
+              $options{$_}} keys %REFERENCE_TYPES) {
+    $options{pmid} = 1;
+}
+my @USAGE_ERRORS;
+if ((grep {exists $options{$_}
+           and defined $options{$_}
+           and $options{$_}} keys %REFERENCE_TYPES) > 1) {
+    push @USAGE_ERRORS,"You can only specify exactly one of the ".
+        join(", ",map { "--$_"} keys %REFERENCE_TYPES)." options";
+}
+
+if (not @ARGV) {
+    push @USAGE_ERRORS,"You must specify at least one reference";
+}
+
+pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
+
+
+if (exists $options{http_proxy}) {
+    $ENV{http_proxy} = $options{http_proxy};
+    $ENV{HTTP_PROXY} = $options{http_proxy};
+    $ENV{CGI_HTTP_PROXY} = $options{http_proxy};
+}
+
+if ($options{pmid}) {
+    my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
+    for my $pmid (@ARGV) {
+        $pmid =~ s/\D//g;
+        next unless length $pmid;
+        my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
+        if (exists $options{cgi_proxy}) {
+            $url = $options{cgi_proxy}.$url;
+        }
+        $url = "http://${url}";
+        eval {
+            $m->get($url) or die "Unable to get $url";
+            my $orig_mech = $m->clone();
+            use Data::Dumper;
+            my @possible_links = $m->find_all_links(text_regex => qr/to\s*read|free\s*PMC|full\s*text/i);
+            # try to find the other links
+            push @possible_links,
+                grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
+                $m->links();
+            print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
+            die "No links" unless @possible_links;
+            do {
+                # start each attempt from a fresh copy of the abstract page
+                $m = $orig_mech->clone();
+                eval {
+                    print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
+                    $m->get($possible_links[0]->url()) or
+                        die "Unable to follow link";
+                    # try to find epub link
+                    my $epub_m = find_epub_link($m) or
+                        die "Unable to find epub";
+                    my $fh = IO::File->new($pmid.'.epub','w') or
+                        die "Unable to open ${pmid}.epub for writing: $!";
+                    print {$fh} $epub_m->content or
+                        die "Unable to write to ${pmid}.epub: $!";
+                    close $fh or
+                        die "Unable to close ${pmid}.epub filehandle: $!";
+                };
+                shift @possible_links;
+            } while ($@ and @possible_links);
+            if ($@) {
+                die "$@";
+            }
+        };
+        if ($@) {
+            print STDERR "$@\n" if $DEBUG;
+            if ($options{use_links}) {
+                system('links2',
+                       # links2 doesn't like the leading http:// of proxies for some reason
+                       exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(),
+                       $url
+                      ) == 0 or next;
+                rename('temp.epub',"${pmid}.epub") if -e 'temp.epub';
+            }
+        }
+    }
+}
+
+
+sub find_epub_link {
+    my ($mech,$guess,$call) = @_;
+    $guess = 1 unless defined $guess;
+    $call = 0 unless defined $call;
+    # avoid looping endlessly
+    return undef if $call > 5;
+    my $m = $mech->clone();
+    if ($m->content =~ /select\s*a\s*website\s*below/i) {
+        print STDERR $m->uri() if $DEBUG;
+        print STDERR $m->content() if $DEBUG;
+        my @inputs = $m->find_all_inputs(type => 'hidden',
+                                         name => q(urls['sd']),
+                                        );
+        return unless @inputs;
+        $m->get($inputs[0]->value);
+        print STDERR $m->content() if $DEBUG;
+    }
+    my @possible_links;
+    # this is to prioritize the real link at science direct
+    push @possible_links, grep {my $temp = $_->attrs();
+                                exists $temp->{title} and $temp->{title} =~ qr/Download\s*epub/i}
+        $m->find_all_links(text_regex => qr/epub/i);
+    push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/epub/i);
+    push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
+                                             url_regex => qr/(reprint|\.epub)/i,
+                                            );
+    push @possible_links, $m->find_all_links(text_regex => qr/epub/i);
+    push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
+    print STDERR $m->uri() if $DEBUG;
+    print STDERR $m->content() if $DEBUG;
+    print STDERR map{"possible epub link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
+    if (not @possible_links and $DEBUG) {
+        print STDERR $m->content();
+    }
+    my $best_guess = @possible_links ? $possible_links[0] : undef;
+    for my $link (@possible_links) {
+        print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
+        my $r = $m->get($link->url_abs());
+        if ($r->header('Content-Type') =~ /epub|zip/) {
+            print STDERR "success\n" if $DEBUG;
+            return $m;
+        }
+        print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
+        print STDERR $m->content() if $DEBUG;
+    }
+    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
+    for my $frame (@sub_frames) {
+        my $r = $m->get($frame->url_abs());
+        if ($r->header('Content-Type') =~ /epub|zip/) {
+            return $m;
+        }
+        my $epub_m = find_epub_link($m,
+                                    0,
+                                    $call+1,
+                                   );
+        if (defined $epub_m) {
+            return $epub_m;
+        }
+    }
+#    if ($guess and defined $best_guess) {
+#        $m->get($best_guess->url_abs());
+#        return $m;
+#    }
+    return undef;
+}
+
+
+__END__
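
A minimal usage sketch, assuming the script is installed somewhere on the
PATH and WWW::Mechanize is available; the PMID 12345678 below is only a
placeholder, not a real article:

    # treat each argument as a PMID (the default) and try to save 12345678.epub
    get_epub_pmid --pmid 12345678

    # the same, with debugging output and a links2 fallback if scraping fails
    get_epub_pmid -d --use-links 12345678

Each successfully retrieved article is written to <pmid>.epub in the current
directory.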