#! /usr/bin/perl
# get_pdf tries to get pdfs, and is released
# under the terms of the GPL version 2, or any later version, at your
# option. See the file README and COPYING for more information.
# Copyright 2008 by Don Armstrong <don@donarmstrong.com>.
# $Id: perl_script 1352 2009-01-25 02:04:38Z don $


use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

=head1 NAME

get_pdf - try to get a pdf

=head1 SYNOPSIS

get_pdf [options] reference [references]

 Options:
  --debug, -d debugging level (Default 0)
  --help, -h display this help
  --man, -m display manual

=head1 OPTIONS

=over

=item B<--pmid, -p>

The reference is a pmid

=item B<--cgi-proxy, -C>

Use this cgi proxy style proxy

=item B<--debug, -d>

Debug verbosity. (Default 0)

=item B<--help, -h>

Display brief usage information.

=item B<--man, -m>

Display this manual.

=back

=head1 EXAMPLES


=cut


use vars qw($DEBUG);

use Cwd;
use WWW::Mechanize;
use Data::Printer;

my %options = (debug           => 0,
	       help            => 0,
	       man             => 0,
               use_links       => 1,
	       );

my %REFERENCE_TYPES = (pmid => 'pmid|p');

GetOptions(\%options,
	   values %REFERENCE_TYPES,
           'use_links|use-links!',
	   'cgi_proxy|cgi-proxy|C=s',
	   'http_proxy|http-proxy|H=s',
	   'debug|d+','help|h|?','man|m');

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

binmode(STDOUT,":encoding(UTF-8)");
binmode(STDERR,":encoding(UTF-8)");

if (not grep {exists $options{$_} and
		  defined $options{$_} and
		      $options{$_}} keys %REFERENCE_TYPES) {
    $options{pmid} = 1;
}
my @USAGE_ERRORS;
if (grep {exists $options{$_}
	      and defined $options{$_}
		  and $options{$_}} keys %REFERENCE_TYPES > 1) {
    push @USAGE_ERRORS,"You can only specify exactly one of the ".(map { "--$_"} keys %REFERENCE_TYPES)." options";
}

if (not @ARGV) {
    push @USAGE_ERRORS,"You must specify at least one reference";
}

pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;


if (exists $options{http_proxy}) {
    $ENV{http_proxy} = $options{http_proxy};
    $ENV{HTTP_PROXY} = $options{http_proxy};
    $ENV{CGI_HTTP_PROXY} = $options{http_proxy};
}

if ($options{pmid}) {
    my $m = WWW::Mechanize->new(agent => 'Mozilla',cookie_jar => {});
    for my $pmid (@ARGV) {
	$pmid =~ s/\D//g;
	next unless length $pmid;
	my $url = "www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=${pmid}&dopt=Abstract";
	if (exists $options{cgi_proxy}) {
	    $url = $options{cgi_proxy}.$url;
	}
	$url = "http://${url}";
	eval {
	    $m->get($url) or die "Unable to get $url";
	    my $orig_mech = $m->clone();
	    use Data::Dumper;
	    my @possible_links = $m->find_all_links(text_regex => qr/to\s*read/i);
	    # try to find the other links
	    push @possible_links,
            grep {my $attr = $_->attrs();
                  exists $attr->{title} and
                      $attr->{title} =~ qr/(?:Full\s*Text|PMC)/i}
		    $m->links();
	    print STDERR map {"article link: ".$_->url_abs()."\n"} @possible_links if $DEBUG;
	    die "No links" unless @possible_links;
	    do {
		$m = $orig_mech;
		eval {
		    print "trying ".$possible_links[0]->url()."\n" if $DEBUG;
		    $m->get($possible_links[0]->url()) or
			die "Unable to follow link";
		    # try to find pdf link
		    my $pdf_m = find_pdf_link($m) or
			die "Unable to find pdf";
		    my $fh = IO::File->new($pmid.'.pdf','w') or
			die "Unable to open ${pmid}.pdf for writing: $!";
		    print {$fh} $pdf_m->content or
			die "Unable to write to ${pmid}.pdf: $!";
		    close $fh or
			die "Unable to close ${pmid}.pdf filehandle: $!";
		};
		shift @possible_links;
	    } while ($@ and @possible_links);
	    if ($@) {
		die "$@";
	    }
	};
	if ($@) {
	    print STDERR "$@\n" if $DEBUG;
        if ($options{use_links}) {
            if ($ENV{DISPLAY}) {
                system('chromium',
                       # links2 doesn't like the leading http:// of proxies for some reason
                       exists $options{http_proxy}?('--proxy-server',(map {s{http://}{}; $_} $options{http_proxy})):(),
                       '--temp-profile',
                       $url,
                      ) == 0 or next;
                rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
            } else {
                system('links2',
                       # links2 doesn't like the leading http:// of proxies for some reason
                       exists $options{http_proxy}?('-http-proxy',(map {s{http://}{}; $_} $options{http_proxy})):(),
                       $url
                      ) == 0 or next;
                rename('temp.pdf',"${pmid}.pdf") if -e 'temp.pdf';
            }
        }}
    }
}


sub check_subframes {
    my ($m,$call) = @_;
    my @sub_frames = $m->find_all_links(tag_regex=>qr/^i?frame$/);
    print STDERR "subframes: \n" if $DEBUG;
    p @sub_frames if $DEBUG;
    for my $frame (@sub_frames) {
        my $r = $m->get($frame->url_abs());
        print STDERR "trying: ".$frame->url_abs()."\n" if $DEBUG;
        if ($r->header('Content-Type') =~ /pdf/) {
            return $m;
        }
        print STDERR "failed: ".$r->header('Content-Type')."\n" if $DEBUG;
    }
    for my $frame (@sub_frames) {
        my $r = $m->get($frame->url_abs());
        my $pdf_m = find_pdf_link($m,
                                  0,
                                  $call+1,
                                 );
        if (defined $pdf_m) {
            return $pdf_m;
        }
    }
    return undef;
}


sub find_pdf_link {
    my ($mech,$guess,$call) = @_;
    $guess = 1 unless defined $guess;
    $call = 0 unless defined $call;
    # avoid looping endlessly
    return undef if $call > 5;
    my $m = $mech->clone();
    if ($m->content =~ /select\s*a\s*website\s*below/i) {
        print STDERR $m->uri() if $DEBUG;
        print STDERR $m->content() if $DEBUG > 1;
        my @inputs = $m->find_all_inputs(type => 'hidden',
                                         name => q(urls['sd']),
                                        );
        return unless @inputs;
        $m->get($inputs[0]->value);
        print STDERR $m->content() if $DEBUG > 1;
    }
    my @possible_links;
    # this brings forward the actual link at Science
    push @possible_links,
        grep {my $temp = $_->attrs();
              exists $temp->{rel} and $temp->{rel} =~ qr/view-/i and
                  defined $_->text() and $_->text() =~ qr/Full\s*Text.*PDF/i
              }
        $m->find_all_links(text_regex => qr/PDF/i);
    push @possible_links,
        grep {my $temp = $_->attrs();
              exists $temp->{rel} and $temp->{rel} =~ qr/alternate/i and
                  exists $temp->{type} and $temp->{type} =~ qr/pdf/i
              }
        $m->find_all_links(url_regex => qr/pdf/);
    # this is to prioritize the real link at science direct
    push @possible_links,
        grep {my $temp = $_->attrs();
              use Data::Dumper;
              print STDERR Dumper($temp);
              (exists $temp->{title} and $temp->{title} =~ qr/(Download|Full\s*Text)\s*PDF/i) or
                  (defined $_->text() and $_->text() =~ qr/(Full\s*Text|Download).*PDF/i)
              }
        $m->find_all_links(text_regex => qr/PDF/i);
    my $possible_links = 0;
    if ($DEBUG) {
        $possible_links++;
        print STDERR "possible links[$possible_links]:\n";
        p @possible_links;
    }
    push @possible_links, grep { $_->url_abs() !~ /_orig(?:in)?=article/} $m->find_all_links(text_regex => qr/PDF/i);
    if ($DEBUG) {
        $possible_links++;
        print STDERR "possible links[$possible_links]:\n";
        p @possible_links;
    }
    push @possible_links, $m->find_all_links(tag_regex => qr/meta/,
					     url_regex  => qr/(reprint|\.pdf)/i,
					    );
    if ($DEBUG) {
        $possible_links++;
        print STDERR "possible links[$possible_links]:\n";
        p @possible_links;
    }
    # The masthead grep here is to handle PNAS, which has a link to their masthead in every article.
    push @possible_links,
        grep {my $temp = $_->attrs(); (not defined $temp->{title}) or $temp->{title} !~ qr/Masthead/i;}
        $m->find_all_links(text_regex => qr/pdf/i);
    if ($DEBUG) {
        $possible_links++;
        print STDERR "possible links[$possible_links]:\n";
        p @possible_links;
    }
    push @possible_links,$m->find_all_links(text_regex => qr/manual\s*download/i);
    if ($DEBUG) {
        $possible_links++;
        print STDERR "possible links[$possible_links]:\n";
        p @possible_links;
    }
    print STDERR $m->uri() if $DEBUG;
    print STDERR $m->content() if $DEBUG > 1;
    print STDERR map{"possible pdf link: ".$_->url_abs().qq(\n)} @possible_links if $DEBUG;
    if (not @possible_links and $DEBUG) {
        print STDERR $m->content();
    }
    my $best_guess = $possible_links[0] if @possible_links;
    for my $link (@possible_links) {
        print STDERR "trying ".$link->url_abs()."..." if $DEBUG;
        my $r = $m->get($link->url_abs());
        my $content = $m->content();
        if ($r->header('Content-Type') =~ /pdf/) {
            print STDERR "success\n" if $DEBUG;
            return $m;
        }
        my $ret = check_subframes($m,$call);
        return $ret if defined $ret;
        print STDERR "failure; content type ".$r->header('Content-Type')."\n" if $DEBUG;
        print STDERR $content if $DEBUG;
    }
    my $ret = check_subframes($m,$call);
    return $ret if defined $ret;
#     if ($guess and defined $best_guess) {
# 	$m->get($best_guess->url_abs());
# 	return $m;
#     }
    return undef;
}


__END__