2 # get_pdf tries to get pdfs, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
6 # $Id: perl_script 1352 2009-01-25 02:04:38Z don $
17 get_pdf - try to get a pdf
21 get_pdf [options] reference [references]
24 --debug, -d debugging level (Default 0)
25 --help, -h display this help
26 --man, -m display manual
34 The reference is a PubMed ID (pmid)
36 =item B<--cgi-proxy, -C>
38 Use this CGI-proxy-style proxy; the proxy URL is prefixed to the URL being fetched
42 Debug verbosity. (Default 0)
46 Display brief usage information.
# Storage for parsed command-line options; debug defaults to 0.
66 my %options = (debug => 0,
# Maps each supported reference kind to its option spec ("pmid|p" gives
# --pmid / -p). pmid is the only kind defined here.
72 my %REFERENCE_TYPES = (pmid => 'pmid|p');
# Option specs: the reference-type specs come first, followed by the
# negatable --use-links switch, the two string-valued proxy options, and the
# debug counter / help / man switches.
75 values %REFERENCE_TYPES,
76 'use_links|use-links!',
77 'cgi_proxy|cgi-proxy|C=s',
78 'http_proxy|http-proxy|H=s',
79 'debug|d+','help|h|?','man|m');
# --help prints the usage message; --man asks pod2usage for verbose level 2.
81 pod2usage() if $options{help};
82 pod2usage({verbose=>2}) if $options{man};
# $DEBUG is not declared in the visible lines; it mirrors the number of
# times -d was given.
84 $DEBUG = $options{debug};
# Encode everything written to STDOUT and STDERR as UTF-8.
86 binmode(STDOUT,":encoding(UTF-8)");
87 binmode(STDERR,":encoding(UTF-8)");
# Condition is true when none of the reference-type options (keys of
# %REFERENCE_TYPES) was supplied with a true value.
89 if (not grep {exists $options{$_} and
90 defined $options{$_} and
91 $options{$_}} keys %REFERENCE_TYPES) {
# Reject more than one reference-type option. The grep is parenthesised so
# that "> 1" compares the number of matching options; without the
# parentheses "> 1" binds to "keys %REFERENCE_TYPES" and grep only ever sees
# a single boolean, so the check could never fire. The option names are
# joined explicitly because map in scalar (concatenation) context yields an
# element count rather than the names; keys are sorted so the message is
# stable between runs.
95 if ((grep {exists $options{$_}
96 and defined $options{$_}
97 and $options{$_}} keys %REFERENCE_TYPES) > 1) {
98 push @USAGE_ERRORS,"You can only specify exactly one of the ".join(', ',map { "--$_"} sort keys %REFERENCE_TYPES)." options";
# Usage error recorded when no reference argument was supplied.
102 push @USAGE_ERRORS,"You must specify at least one reference";
# Report every collected usage error in one pod2usage call; nothing happens
# when @USAGE_ERRORS is empty.
105 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
# Export --http-proxy through the environment under three variable names.
108 if (exists $options{http_proxy}) {
109 $ENV{http_proxy} = $options{http_proxy};
110 $ENV{HTTP_PROXY} = $options{http_proxy};
111 $ENV{CGI_HTTP_PROXY} = $options{http_proxy};
# PubMed mode: every remaining command-line argument is treated as a pmid.
114 if ($options{pmid}) {
# One mechanize agent (with a cookie jar) is created before the loop and
# reused for every pmid.
115 my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
116 for my $pmid (@ARGV) {
# Skip empty arguments.
118 next unless length $pmid;
# Scheme-less URL of the PubMed abstract page; a prefix is added below.
119 my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
# With --cgi-proxy the proxy URL is prepended to the target URL.
120 if (exists $options{cgi_proxy}) {
121 $url = $options{cgi_proxy}.$url;
# Plain http:// prefix (presumably the else branch of the check above --
# the branch keyword itself is not among the visible lines).
123 $url = "http://${url}";
125 $m->get($url) or die "Unable to get $url";
# Snapshot of the agent taken right after the abstract page was loaded.
126 my $orig_mech = $m->clone();
# Candidate article links: first those whose link text matches "to read" ...
128 my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
129 # try to find the other links
# ... then links whose title attribute mentions "Full Text" or "PMC".
130 push @possible_links,
131 grep {my $attr = $_->attrs();
132 exists $attr->{title} and
133 $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
135 print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
136 die "No links" unless @possible_links;
# Attempt the first remaining candidate.
# NOTE(review): this debug line goes to STDOUT, unlike the other debug
# output in this script, which goes to STDERR.
140 print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
141 $m->get($possible_links[0]->url()) or
142 die "Unable to follow link";
143 # try to find pdf link
144 my $pdf_m = find_pdf_link($m) or
145 die "Unable to find pdf";
# Write the content of the object returned by find_pdf_link to <pmid>.pdf in
# the current directory.
146 my $fh = IO::File->new($pmid.'.pdf','w') or
147 die "Unable to open ${pmid}.pdf for writing: $!";
148 print {$fh} $pdf_m->content or
149 die "Unable to write to ${pmid}.pdf: $!";
151 die "Unable to close ${pmid}.pdf filehandle: $!";
# Drop the candidate just tried; the loop repeats while the last attempt left
# an error in $@ and candidates remain.
153 shift @possible_links;
154 } while ($@ and @possible_links);
160 print STDERR "$@\n" if $DEBUG;
# Optional fallback when --use-links is set: both visible variants end by
# renaming temp.pdf to <pmid>.pdf if that file exists.
# NOTE(review): the s{http://}{} inside map operates on an alias of
# $options{http_proxy}, so the stored option value itself loses its
# "http://" prefix after the first use.
161 if ($options{use_links}) {
164 # links2 doesn't like the leading http:// of proxies for some reason
165 exists $options{http_proxy}?('--proxy-server',(map {s{http://}{}; $_} $options{http_proxy})):(),
169 rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
172 # links2 doesn't like the leading http:// of proxies for some reason
173 exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(),
176 rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
# check_subframes: inspects the frame/iframe links of the page currently held
# by the mechanize object $m. The first pass fetches each frame and tests
# whether the response's Content-Type mentions pdf; the second pass fetches
# each frame again and hands the agent to find_pdf_link.
# NOTE(review): the argument unpacking and the return statements are not
# among the visible lines; callers invoke this as check_subframes($m,$call)
# and test the result with defined().
183 sub check_subframes {
# Links produced by <frame> or <iframe> tags.
185 my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
186 print STDERR "subframes: \n" if $DEBUG;
# p is not defined in the visible lines -- presumably a debug dumper.
187 p @sub_frames if $DEBUG;
# Pass 1: is any frame itself served as a PDF?
188 for my $frame (@sub_frames) {
189 my $r = $m->get($frame->url_abs());
190 print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
191 if ($r->header('Content-Type') =~ /pdf/) {
194 print STDERR "failed: ".$r->header('Content-Type')."\n" if $DEBUG;
# Pass 2: look for a PDF link inside each frame's page.
196 for my $frame (@sub_frames) {
197 my $r = $m->get($frame->url_abs());
198 my $pdf_m = find_pdf_link($m,
202 if (defined $pdf_m) {
# Arguments: a mechanize object, a guess flag (defaults to 1) and a call-depth
# counter (defaults to 0).
# NOTE(review): the enclosing sub line is not among the visible lines; the
# call sites suggest this is the body of find_pdf_link -- confirm.
211 my ($mech,$guess,$call) = @_;
212 $guess = 1 unless defined $guess;
213 $call = 0 unless defined $call;
214 # avoid looping endlessly
215 return undef if $call > 5;
# All navigation below happens on a clone, not on the caller's $mech.
216 my $m = $mech->clone();
# Page asking the user to pick a website: follow the URL stored in the hidden
# input named urls['sd'], or give up when there is no such input.
217 if ($m->content =~ /select\s*a\s*website\s*below/i) {
218 print STDERR $m->uri() if $DEBUG;
219 print STDERR $m->content() if $DEBUG > 1;
220 my @inputs = $m->find_all_inputs(type => 'hidden',
221 name => q(urls['sd']),
223 return unless @inputs;
224 $m->get($inputs[0]->value);
225 print STDERR $m->content() if $DEBUG > 1;
# Candidate PDF links are appended to @possible_links in priority order; the
# fetch loop further down tries them first to last.
228 # this brings forward the actual link at Science
# PDF-text links with a rel attribute matching "view-" and text matching
# "Full Text ... PDF".
229 push @possible_links,
230 grep {my $temp = $_->attrs();
231 exists $temp->{rel} and $temp->{rel} =~ qr/view-/i and
232 defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i
234 $m->find_all_links(text_regex => qr/PDF/i);
# Links whose URL contains "pdf" with rel matching "alternate" and a type
# attribute matching pdf.
235 push @possible_links,
236 grep {my $temp = $_->attrs();
237 exists $temp->{rel} and $temp->{rel} =~ qr/alternate/i and
238 exists $temp->{type} and $temp->{type} =~ qr/pdf/i
240 $m->find_all_links(url_regex => qr/pdf/);
241 # this is to prioritize the real link at science direct
# PDF-text links whose title or text reads like "Download PDF" or
# "Full Text PDF".
# NOTE(review): the Dumper line below carries no "if $DEBUG" of its own;
# whether an enclosing guard exists cannot be seen from the visible lines.
242 push @possible_links,
243 grep {my $temp = $_->attrs();
245 print STDERR Dumper($temp);
246 (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
247 (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
249 $m->find_all_links(text_regex => qr/PDF/i);
# Scalar $possible_links is a separate variable from @possible_links; it is
# interpolated into the "possible links[...]" labels printed below.
250 my $possible_links = 0;
253 print STDERR "possible links[$possible_links]:\n";
# PDF-text links, except those whose URL contains _orig=article or
# _origin=article.
256 push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
259 print STDERR "possible links[$possible_links]:\n";
# Links from meta tags whose URL mentions "reprint" or ".pdf".
262 push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
263 url_regex => qr/(reprint|\.pdf)/i,
267 print STDERR "possible links[$possible_links]:\n";
270 # The masthead grep here is to handle PNAS, which has a link to their masthead in every article.
271 push @possible_links,
272 grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;}
273 $m->find_all_links(text_regex => qr/pdf/i);
276 print STDERR "possible links[$possible_links]:\n";
# Last resort: links whose text reads "manual download".
279 push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
282 print STDERR "possible links[$possible_links]:\n";
285 print STDERR $m->uri() if $DEBUG;
286 print STDERR $m->content() if $DEBUG > 1;
287 print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
# With debugging on, dump the page when no candidate was found at all.
288 if (not @possible_links and $DEBUG) {
289 print STDERR $m->content();
# NOTE(review): "my ... if ..." is documented in perlsyn as having undefined
# behaviour; a plain "my $best_guess = $possible_links[0];" would yield undef
# for an empty list without relying on it. $best_guess is only referenced by
# the commented-out code at the end of this block.
291 my $best_guess = $possible_links[0] if @possible_links;
# Fetch each candidate in order; a response whose Content-Type mentions pdf
# counts as success, otherwise the fetched page's sub-frames are checked.
292 for my $link (@possible_links) {
293 print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
294 my $r = $m->get($link->url_abs());
295 my $content = $m->content();
296 if ($r->header('Content-Type') =~ /pdf/) {
297 print STDERR "success\n" if $DEBUG;
300 my $ret = check_subframes($m,$call);
301 return $ret if defined $ret;
302 print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
303 print STDERR $content if $DEBUG;
# Second visible sub-frame check; its result is returned when defined.
305 my $ret = check_subframes($m,$call);
306 return $ret if defined $ret;
307 # if ($guess and defined $best_guess) {
308 # $m->get($best_guess->url_abs());