From 75086a983f50e94bae03d69f67ad57b2c1450aa2 Mon Sep 17 00:00:00 2001 From: Don Armstrong Date: Wed, 8 Apr 2009 07:38:39 +0000 Subject: [PATCH] * add more debugging information * handle relative urls properly * look in meta tags too --- get_pdf | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/get_pdf b/get_pdf index 23daa99..7daca79 100755 --- a/get_pdf +++ b/get_pdf @@ -160,22 +160,28 @@ sub find_pdf_link { $m->get($inputs[0]->value); print STDERR $m->content() if $DEBUG; } - my @possible_links = $m->find_all_links(text_regex => qr/pdf/i); + my @possible_links; + push @possible_links, $m->find_all_links(tag_regex => qr/meta/, + url_regex => qr/(reprint|\.pdf)/i, + ); + push @possible_links, $m->find_all_links(text_regex => qr/pdf/i); push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i); - print STDERR map{$_->url,qq(\n)} @possible_links if $DEBUG; + print STDERR $m->uri() if $DEBUG; + print STDERR $m->content() if $DEBUG; + print STDERR map{$_->url_abs(),qq(\n)} @possible_links if $DEBUG; if (not @possible_links and $DEBUG) { print STDERR $m->content(); } my $best_guess = $possible_links[0] if @possible_links; for my $link (@possible_links) { - my $r = $m->get($link->url()); + my $r = $m->get($link->url_abs()); if ($r->header('Content-Type') =~ /pdf/) { return $m; } } my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/); for my $frame (@sub_frames) { - $m->get($frame->url()); + $m->get($frame->url_abs()); my $pdf_m = find_pdf_link($m, 0, $call+1, @@ -185,7 +191,7 @@ sub find_pdf_link { } } if ($guess and defined $best_guess) { - $m->get($best_guess->url()); + $m->get($best_guess->url_abs()); return $m; } return undef; -- 2.39.5