2 # get_epub_pmid tries to get epubs from pubmed, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
6 # $Id: perl_script 1352 2009-01-25 02:04:38Z don $
17 get_epub_pmid - try to get an epub
21 get_epub_pmid [options] reference [references]
24 --debug, -d debugging level (Default 0)
25 --help, -h display this help
26 --man, -m display manual
34 Each reference is a PubMed ID (PMID).
36 =item B<--cgi-proxy, -C>
38 Use this CGI-style proxy; its URL is prepended to the PubMed query URL.
42 Debug verbosity. (Default 0)
46 Display brief usage information.
# Option values, read below as $options{...}.  Only the first default
# (debug => 0) is visible; the rest of the initialiser is on lines not shown.
65 my %options = (debug => 0,
# Maps each reference type to its option spec (name|alias).  pmid is the
# only reference type defined.
71 my %REFERENCE_TYPES = (pmid => 'pmid|p');
# Getopt::Long-style option specs; the call these arguments belong to starts
# on a line not shown (presumably GetOptions -- confirm).  The reference-type
# specs come first, then a negatable flag ('!'), two string-valued proxy
# options ('=s'), a repeatable debug counter ('+') and the help/man flags.
74 values %REFERENCE_TYPES,
75 'use_links|use-links!',
76 'cgi_proxy|cgi-proxy|C=s',
77 'http_proxy|http-proxy|H=s',
78 'debug|d+','help|h|?','man|m');
# --help prints the usage message; --man asks pod2usage for verbose level 2.
80 pod2usage() if $options{help};
81 pod2usage({verbose=>2}) if $options{man};
# Debug level consulted by the "if $DEBUG" diagnostics further down.
83 $DEBUG = $options{debug};
# Validate the command line, collecting every problem in @USAGE_ERRORS and
# reporting them together through pod2usage at the end.
#
# True when no reference type option was given; the body of this branch is
# on lines not visible in this listing.
87 if (not grep {exists $options{$_} and
88 defined $options{$_} and
89 $options{$_}} keys %REFERENCE_TYPES) {
# More than one reference type was selected.  The grep is parenthesised so
# that its match count is compared with 1; unparenthesised,
# "keys %REFERENCE_TYPES > 1" binds first and grep filters a single boolean,
# so the check could never fire.
93 if ((grep {exists $options{$_}
94 and defined $options{$_}
95 and $options{$_}} keys %REFERENCE_TYPES) > 1) {
# join() the option names; a bare map inside a string concatenation is
# evaluated in scalar context and would interpolate only a count.
96 push @USAGE_ERRORS,"You can only specify exactly one of the ".join(', ',map { "--$_"} keys %REFERENCE_TYPES)." options";
# The condition guarding this message is on a line not shown (presumably an
# empty @ARGV check -- confirm).
100 push @USAGE_ERRORS,"You must specify at least one reference";
# Print all collected problems, one per line, via pod2usage.
103 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
# When --http-proxy was given, export its value under three environment
# variable names (presumably so the HTTP client and any child process pick
# it up -- confirm against the consumers).
106 if (exists $options{http_proxy}) {
107 $ENV{http_proxy} = $options{http_proxy};
108 $ENV{HTTP_PROXY} = $options{http_proxy};
109 $ENV{CGI_HTTP_PROXY} = $options{http_proxy};
# PMID mode: for each PMID on the command line, load its PubMed abstract
# page, follow a full-text link found there, and save the document that
# find_epub_link() locates as <pmid>.epub.  Several control-flow lines (loop
# and eval openers, closing braces) are not visible in this listing, so the
# notes below describe only the statements shown.
112 if ($options{pmid}) {
# One agent with its own cookie jar, reused for every PMID.
113 my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
114 for my $pmid (@ARGV) {
# Skip empty arguments.
116 next unless length $pmid;
# The URL is built without a scheme so that a CGI proxy prefix can be
# prepended to it.
117 my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
118 if (exists $options{cgi_proxy}) {
119 $url = $options{cgi_proxy}.$url;
# NOTE(review): the line between 119 and 121 is not shown; presumably this is
# the no-proxy branch that adds the scheme -- confirm.
121 $url = "http://${url}";
123 $m->get($url) or die "Unable to get $url";
# Copy of the agent as it stands on the abstract page; its later use is not
# visible in this listing.
124 my $orig_mech = $m->clone();
# Candidate full-text links, matched on link text ...
126 my @possible_links = $m->find_all_links(text_regex => qr/to\s*read|free\s*PMC|full\s*text/i);
127 # try to find the other links
# ... and on the link's title attribute (the list being filtered is on a
# line not shown).
128 push @possible_links,
129 grep {my $attr = $_->attrs(); exists $attr->{title} and $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
131 print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
# Give up on this PMID when the page offers no candidate link.
132 die "No links" unless @possible_links;
# Attempt the first remaining candidate.
136 print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
137 $m->get($possible_links[0]->url()) or
138 die "Unable to follow link";
139 # try to find epub link
140 my $epub_m = find_epub_link($m) or
141 die "Unable to find epub";
# Write the retrieved content to <pmid>.epub in the current directory.
142 my $fh = IO::File->new($pmid.'.epub','w') or
143 die "Unable to open ${pmid}.epub for writing: $!";
144 print {$fh} $epub_m->content or
145 die "Unable to write to ${pmid}.epub: $!";
# The close call this message belongs to is on a line not shown.
147 die "Unable to close ${pmid}.epub filehandle: $!";
# Drop the candidate just tried; repeat while the last attempt left an error
# in $@ and candidates remain.
149 shift @possible_links;
150 } while ($@ and @possible_links);
156 print STDERR "$@\n" if $DEBUG;
# --use-links path; the command being assembled is not visible in this
# listing.
157 if ($options{use_links}) {
159 # links2 doesn't like the leading http:// of proxies for some reason
# NOTE(review): map aliases $_ to $options{http_proxy}, so the s{}{} strips
# the prefix from the stored option value itself, not from a copy.
160 exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(),
# Adopt temp.epub, if it exists, as this PMID's output file.
163 rename('temp.epub',"${pmid}.epub") if -e 'temp.epub';
171 my ($mech,$guess,$call) = @_;
172 $guess = 1 unless defined $guess;
173 $call = 0 unless defined $call;
174 # avoid looping endlessly
175 return undef if $call > 5;
176 my $m = $mech->clone();
177 if ($m->content =~ /select\s*a\s*website\s*below/i) {
178 print STDERR $m->uri() if $DEBUG;
179 print STDERR $m->content() if $DEBUG;
180 my @inputs = $m->find_all_inputs(type => 'hidden',
181 name => q(urls['sd']),
183 return unless @inputs;
184 $m->get($inputs[0]->value);
185 print STDERR $m->content() if $DEBUG;
188 # this is to prioritize the real link at science direct
189 push @possible_links, grep {my $temp = $_->attrs();
190 exists $temp->{title} and $temp->{title} =~ qr/Download\s*epub/i}
191 $m->find_all_links(text_regex => qr/epub/i);
192 push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/epub/i);
193 push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
194 url_regex => qr/(reprint|\.epub)/i,
196 push @possible_links, $m->find_all_links(text_regex => qr/epub/i);
197 push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
198 print STDERR $m->uri() if $DEBUG;
199 print STDERR $m->content() if $DEBUG;
200 print STDERR map{"possible epub link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
201 if (not @possible_links and $DEBUG) {
202 print STDERR $m->content();
204 my $best_guess = $possible_links[0] if @possible_links;
205 for my $link (@possible_links) {
206 print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
207 my $r = $m->get($link->url_abs());
208 if ($r->header('Content-Type') =~ /epub|zip/) {
209 print STDERR "success\n" if $DEBUG;
212 print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
213 print STDERR $m->content() if $DEBUG;
215 my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
216 for my $frame (@sub_frames) {
217 my $r = $m->get($frame->url_abs());
218 if ($r->header('Content-Type') =~ /epub|zip/) {
221 my $epub_m = find_epub_link($m,
225 if (defined $epub_m) {
229 # if ($guess and defined $best_guess) {
230 # $m->get($best_guess->url_abs());