2 # get_pdf tries to get pdfs, and is released
3 # under the terms of the GPL version 2, or any later version, at your
4 # option. See the file README and COPYING for more information.
5 # Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
6 # $Id: perl_script 1352 2009-01-25 02:04:38Z don $
17 get_pdf - try to get a pdf
21 get_pdf [options] reference [references]
24 --debug, -d debugging level (Default 0)
25 --help, -h display this help
26 --man, -m display manual
34 The reference is a PubMed ID (pmid)
36 =item B<--cgi-proxy, -C>
38 Use this CGI-proxy-style proxy; its URL is prepended to each request URL
42 Debug verbosity. (Default 0)
46 Display brief usage information.
# Main script body: parse options, validate usage, then download one PDF per
# pmid given on the command line.
# NOTE(review): several original lines (the option-parsing call, closing
# braces, else/eval scaffolding) are not visible in this view; comments below
# describe only what the visible statements do.

# Option defaults; debug verbosity starts at 0 (rest of initializer not visible).
65 my %options = (debug => 0,
# Reference types and their option specs; pmid (alias p) is the only one defined.
70 my %REFERENCE_TYPES = (pmid => 'pmid|p');
# Option specs: every reference-type spec, the two proxy options (each takes a
# string), an incrementing debug flag, and help/man.
# Presumably arguments to Getopt::Long's GetOptions -- the call line is not visible.
73 values %REFERENCE_TYPES,
74 'cgi_proxy|cgi-proxy|C=s',
75 'http_proxy|http-proxy|H=s',
76 'debug|d+','help|h|?','man|m');
# --help prints brief usage; --man prints the full manual (verbose level 2).
78 pod2usage() if $options{help};
79 pod2usage({verbose=>2}) if $options{man};
# Copy the debug level into $DEBUG, which gates the STDERR diagnostics below.
81 $DEBUG = $options{debug};
# True when no reference-type option is present, defined and true.
85 if (not grep {exists $options{$_} and
86 defined $options{$_} and
87 $options{$_}} keys %REFERENCE_TYPES) {
# NOTE(review): intended as "more than one reference-type option given", but
# `keys %REFERENCE_TYPES > 1` parses as `(keys %REFERENCE_TYPES) > 1`, so grep
# iterates over that single boolean instead of the keys. The count of matches
# needs to be parenthesised: (grep {...} keys %REFERENCE_TYPES) > 1.
91 if (grep {exists $options{$_}
92 and defined $options{$_}
93 and $options{$_}} keys %REFERENCE_TYPES > 1) {
# NOTE(review): `map` is concatenated here in scalar context, which yields the
# number of generated elements rather than the "--name" strings; a join() is
# needed to list the option names in the message.
94 push @USAGE_ERRORS,"You can only specify exactly one of the ".(map { "--$_"} keys %REFERENCE_TYPES)." options";
# Presumably guarded by a check for an empty @ARGV -- the guard is not visible.
98 push @USAGE_ERRORS,"You must specify at least one reference";
# Report every accumulated usage error in one pod2usage call.
101 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
# When --http-proxy was given, export it through all three environment
# variable spellings.
104 if (exists $options{http_proxy}) {
105 $ENV{http_proxy} = $options{http_proxy};
106 $ENV{HTTP_PROXY} = $options{http_proxy};
107 $ENV{CGI_HTTP_PROXY} = $options{http_proxy};
# pmid mode: one WWW::Mechanize object is shared across all command-line
# arguments, each of which is treated as a pmid.
110 if ($options{pmid}) {
111 my $m = WWW::Mechanize->new();
112 for my $pmid (@ARGV) {
# Skip empty arguments.
114 next unless length $pmid;
# The URL is built without a scheme so a CGI proxy prefix can be prepended.
# NOTE(review): $pmid comes straight from @ARGV and is interpolated unvalidated
# into both this URL and the output file name below -- consider restricting it
# to digits.
115 my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
116 if (exists $options{cgi_proxy}) {
117 $url = $options{cgi_proxy}.$url;
# Presumably the else branch (no CGI proxy): add the http scheme -- the
# `else` line is not visible.
119 $url = "http://${url}";
# Fetch the abstract page, then follow the link whose text matches
# /to\s*read/i.
121 $m->get($url) or die "Unable to get $url";
122 $m->follow_link(text_regex => qr/to\s*read/i) or
123 die "Unable to follow link";
124 # try to find pdf link
# find_pdf_link returns a mechanize object positioned on the PDF, or a false
# value on failure.
125 my $pdf_m = find_pdf_link($m) or
126 die "Unable to find pdf";
# Write the fetched content to "<pmid>.pdf" in the current directory.
127 my $fh = IO::File->new($pmid.'.pdf','w') or
128 die "Unable to open ${pmid}.pdf for writing: $!";
129 print {$fh} $pdf_m->content or
130 die "Unable to write to ${pmid}.pdf: $!";
# Failure branch of the close check; the close call itself is not visible.
132 die "Unable to close ${pmid}.pdf filehandle: $!";
# Prints $@ when debugging, which suggests the per-pmid work above runs inside
# an eval that is not visible here -- TODO confirm.
135 print STDERR "$@\n" if $DEBUG;
# The two lines below are commented-out remnants of an earlier implementation.
137 ## exists $options{http_proxy}?('-http-proxy',$options{http_proxy}):(),
140 ## rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
147 my ($mech,$guess,$call) = @_;
148 $guess = 1 unless defined $guess;
149 $call = 0 unless defined $call;
150 # avoid looping endlessly
151 return undef if $call > 5;
152 my $m = $mech->clone();
153 if ($m->content =~ /select\s*a\s*website\s*below/i) {
154 print STDERR $m->uri() if $DEBUG;
155 print STDERR $m->content() if $DEBUG;
156 my @inputs = $m->find_all_inputs(type => 'hidden',
157 name => q(urls['sd']),
159 return unless @inputs;
160 $m->get($inputs[0]->value);
161 print STDERR $m->content() if $DEBUG;
164 push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
165 url_regex => qr/(reprint|\.pdf)/i,
167 push @possible_links, $m->find_all_links(text_regex => qr/pdf/i);
168 push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
169 print STDERR $m->uri() if $DEBUG;
170 print STDERR $m->content() if $DEBUG;
171 print STDERR map{$_->url_abs(),qq(\n)} @possible_links if $DEBUG;
172 if (not @possible_links and $DEBUG) {
173 print STDERR $m->content();
175 my $best_guess = $possible_links[0] if @possible_links;
176 for my $link (@possible_links) {
177 my $r = $m->get($link->url_abs());
178 if ($r->header('Content-Type') =~ /pdf/) {
182 my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
183 for my $frame (@sub_frames) {
184 $m->get($frame->url_abs());
185 my $pdf_m = find_pdf_link($m,
189 if (defined $pdf_m) {
193 if ($guess and defined $best_guess) {
194 $m->get($best_guess->url_abs());