#!/usr/bin/perl
# jobs_to_org finds jobs in different job sites and turns them into org things
# and is released under the terms of the GNU GPL version 3, or any
# later version, at your option. See the file README and COPYING for
# more information.
# Copyright 2016 by Don Armstrong.

use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

=head1 NAME

jobs_to_org - finds jobs in different job sites and turns them into org things

=head1 SYNOPSIS

jobs_to_org [options]

 Options:
   --site, -s site to search (Default herc)
   --pages, -p number of result pages to walk (Default 1)
   --debug, -d debugging level (Default 0)
   --help, -h display this help
   --man, -m display manual

=head1 DESCRIPTION

Fetches the search-result listing of the selected job site, visits every
job posting linked from it, and prints one org-mode TODO heading per
posting on standard output.

=head1 OPTIONS

=over

=item B<--site, -s>

Name of the job site to search; one of C<herc>, C<nature>, C<vitae> or
C<higheredjobs>. (Default herc)

=item B<--pages, -p>

Number of result pages to walk through. (Default 1)

=item B<--debug, -d>

Debug verbosity. (Default 0)

=item B<--help, -h>

Display brief usage information.

=item B<--man, -m>

Display this manual.

=back

=head1 EXAMPLES

jobs_to_org

jobs_to_org --site nature --pages 3

=cut

use OSSP::uuid;
use WWW::Mechanize;
use WWW::Mechanize::TreeBuilder;
use vars qw($DEBUG);
use Data::Printer;
use Text::Wrap;
use POSIX qw(strftime);
use DateTime;

# $uuid is tied through OSSP::uuid's tie interface and configured for
# version-1 UUIDs; format_job() interpolates it to give each entry an :ID:.
tie my $uuid, 'OSSP::uuid::tie';
$uuid = ["v1"];

my %options = (debug => 0,
               help  => 0,
               man   => 0,
               pages => 1,
               site  => 'herc',
              );

# Bail out with the usage message when the command line cannot be parsed
# instead of silently carrying on with the defaults.
GetOptions(\%options,
           'site|s=s',
           'pages|p=i',
           'debug|d+','help|h|?','man|m') or pod2usage(2);

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

# Per-site scraping configuration.
#
#   url                starting search-result page
#   next_selector      find_all_links() criteria locating the "next page" link
#   next_selector_tree look_down() criteria for the "next page" element (used
#                      when the link cannot be matched with find_all_links())
#   job_selector       find_all_links() criteria matching links to postings
#   university, description, date, position
#                      look_down() criteria locating that field on a posting
my %sites =
    (herc =>
     {url => 'http://main.hercjobs.org/jobs/search?keywords=professor+AND+%28genomics+OR+bioinformatics+OR+biology+OR+informatics%29&discipline=academic-faculty&category=academic-faculty&category=allied-health&category=biological-biomedical-sciences&category=computer-information-sciences&category=education&category=interdisciplinary&category=mathematics-statistics&category=medical-research&category=physical-sciences&sort=DATE_POSTED+DESC',
      next_selector => [class => "bti-pagination-previous-link bti-pagination-prev-next",
                        text => '>'],
      job_selector => [url_regex => qr{^\/jobs\/\d+/.+}],
      university => [itemprop => "hiringOrganization",
                     itemtype => qr{https?://schema.org/Organization}],
      description => [class => "bti-jd-description",
                      itemprop => "description"],
      date => [class => "bti-jd-detail-text",
               _parent_has_class('bti-jd-details-action'),
              ],
      position => [class => "bti-jd-title",
                   itemprop => "title"],
     },
     nature =>
     {url => 'http://www.nature.com/naturejobs/science/jobs?utf8=%E2%9C%93&q%5B%5D=professor&job_type%5B%5D=Assistant+Professor&job_type%5B%5D=Professor&order_by=created_on',
      next_selector => [class => "next_page",
                        url_regex => qr{^/naturejobs/science/jobs},
                       ],
      job_selector => [url_regex => qr{^/naturejobs/science/jobs/\d+-.+}],
      university => [href => qr{^/naturejobs/science/employer-directory/\d+$}],
      description => [class => "job-description"],
      # "days?" so that a singular "1 day ago" is recognised as well
      date => [_tag => 'dd',
               sub {$_[0]->as_text() =~ qr/\d+\s+days?\s+ago$/},
              ],
      position => [class => 'job-title heading'],
     },
     vitae =>
     {url => 'https://chroniclevitae.com/job_search?job_search%5Bdistance_from_zip%5D=10&job_search%5Bemployment_type%5D=Full-time&job_search%5Bposition_type%5D=63',
      next_selector => [text_regex => qr{Next},
                        url_regex => qr{^/job_search}],
      job_selector => [url_regex => qr{/jobs/\d+-\d+$}],
      university => [href => qr{/institutions/\d+$}],
      description => [class => 'job-listing__content__description'],
      date => [_tag => 'td',
               sub {$_[0]->as_text() =~ qr/\,\s+20\d{2}$/}],
      position => [_tag => 'h1',
                   _parent_has_class('page-title page-title--two-col'),
                  ],
     },
     higheredjobs =>
     {url => 'https://www.higheredjobs.com/search/advanced_action.cfm?JobCat=113&JobCat=259&JobCat=99&JobCat=100&JobCat=108&JobCat=107&PosType=1&InstType=1&InstType=2&InstType=3&Keyword=&Remote=1&Remote=2&Region=&Submit=Search+Jobs',
      next_selector_tree => [class => 'js-click-submit',
                             href => qr{advanced_action\.cfm},
                             sub {return 1 if defined $_[0]->look_down(src => qr/active-right/);
                                  return 0;
                              },
                            ],
      job_selector => [url_regex => qr{^details\.cfm\?JobCode=\d+},
                      ],
      university => [class => qr/field-value/,
                     _field_with_label(qr/institution/i),
                    ],
      date => [class => qr/field-value/,
               _field_with_label(qr/posted/i),
              ],
      position => [id => 'jobtitle-header',
                   _tag => 'h1',
                  ],
      description => [id => 'jobDesc'],
     },
    );

# Validate the options that depend on the site table before doing any
# network traffic.
my @USAGE_ERRORS;
if (not exists $sites{$options{site}}) {
    push @USAGE_ERRORS,
        "Unknown site '$options{site}'; known sites are: ".
        join(', ',sort keys %sites);
}
if ($options{pages} < 1) {
    push @USAGE_ERRORS,"--pages must be at least 1";
}
pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;

binmode STDOUT,":utf8";

get_jobs($options{site},$options{pages});

# get_jobs($site,$pages)
#
# Walks up to $pages result pages of the site named $site (a key of %sites)
# and prints an org heading for every job posting found.
#
# Dies if $site is unknown, if the initial listing or a "next" page cannot
# be fetched, or if the browser cannot step back to the listing. A posting
# that cannot be fetched is reported on STDERR and skipped.
sub get_jobs {
    my ($site,$pages) = @_;

    if (not defined $sites{$site}) {
        die "Unknown site $site";
    }
    my $s = $sites{$site};

    my $todays_date = strftime('%Y-%m-%d %H:%M:%S',localtime());
    my $m = WWW::Mechanize->new();
    WWW::Mechanize::TreeBuilder->meta->apply($m);

    print "* Jobs from $site\n";
    $m->get($s->{url});

    # Shared across pages so a posting that shows up on two result pages
    # (e.g. because the listing shifted while paginating) is emitted once.
    my %seen;
    for my $page (1..$pages) {
        # Work with absolute URL strings: they sort meaningfully and can be
        # fetched no matter which page the browser is currently on.
        my @job_urls =
            grep { !$seen{$_}++ }
            map { $_->url_abs()->as_string() }
            $m->find_all_links(@{$s->{job_selector}});
        # Must be located now, while the listing page is still loaded.
        my $next = _next_page_link($m,$s);

        for my $url (sort @job_urls) {
            # WWW::Mechanize dies on a failed request (autocheck); trap
            # that so a single broken posting does not abort the run.
            if (eval { $m->get($url); 1 }) {
                my $university =
                    _element_text($m,$s->{university},'No university');
                my $date =
                    _normalize_date(_element_text($m,$s->{date},$todays_date));
                my $description =
                    _element_text($m,$s->{description},'unknown');
                my $position =
                    _element_text($m,$s->{position},'Unknown');
                print format_job($university,$position,$url,
                                 $description,$date);
            }
            else {
                warn "Unable to fetch $url, skipping: $@";
            }
            # Return to the listing so a relative "next" href resolves
            # against the listing page rather than the posting.
            $m->back() or die "Unable to go back";
        }

        # Nothing more to fetch once the requested number of pages is done.
        last if $page == $pages;
        if (not defined $next) {
            warn "No next page link found after page $page of $site; stopping\n";
            last;
        }
        $m->get($next);
    }
}

# _next_page_link($m,$s)
#
# Returns the URL of the next result page for site configuration $s as seen
# from the page currently loaded in $m, or undef when there is none. With
# next_selector the URL is absolute; with next_selector_tree it is the raw
# href attribute of the matching element.
sub _next_page_link {
    my ($m,$s) = @_;

    if (exists $s->{next_selector}) {
        my ($link) = $m->find_all_links(@{$s->{next_selector}});
        return defined $link ? $link->url_abs()->as_string() : undef;
    }
    if (exists $s->{next_selector_tree}) {
        my $element = $m->tree->look_down(@{$s->{next_selector_tree}});
        if (not defined $element) {
            # Keep the diagnostic dump off STDOUT, which carries the org output.
            $m->tree->dump(*STDERR) if $DEBUG;
            return undef;
        }
        return $element->attr('href');
    }
    return undef;
}

# _element_text($m,$selector,$default)
#
# Returns the whitespace-trimmed text of the first element of the current
# page matching the look_down() criteria in the array reference $selector.
# Returns $default when the selector is missing or empty, nothing matches,
# or the lookup fails.
sub _element_text {
    my ($m,$selector,$default) = @_;

    return $default unless defined $selector and @{$selector};
    my $text;
    my $ok = eval {
        my $element = $m->tree->look_down(@{$selector});
        $text = $element->as_text() if defined $element;
        1;
    };
    if (not $ok and $DEBUG) {
        print STDERR "Element lookup failed: $@";
    }
    return $default unless defined $text;
    $text =~ s/^\s+//;
    $text =~ s/\s+$//;
    return $text;
}

# _normalize_date($date)
#
# Turns a relative date of the form "N day(s) ago" into an absolute
# 'YYYY-MM-DD HH:MM:SS' local timestamp; any other value is returned
# unchanged.
sub _normalize_date {
    my ($date) = @_;

    if ($date =~ /^\s*(\d+)\s*days?\s*ago\s*$/) {
        my $then = DateTime->now()->subtract(days => $1);
        return strftime('%Y-%m-%d %H:%M:%S',localtime($then->epoch));
    }
    return $date;
}

# _parent_has_class($class)
#
# Returns a look_down() criterion (code reference) that accepts an element
# whose parent's class attribute is exactly $class. A missing parent or a
# missing class attribute simply does not match.
sub _parent_has_class {
    my ($class) = @_;

    return sub {
        my $parent = $_[0]->parent();
        return 0 unless defined $parent;
        return (($parent->attr('class') // '') eq $class) ? 1 : 0;
    };
}

# _field_with_label($label_re)
#
# Returns a look_down() criterion (code reference) that accepts an element
# whose parent contains an element with a class matching /field-label/ and
# a text matching $label_re.
sub _field_with_label {
    my ($label_re) = @_;

    return sub {
        my $parent = $_[0]->parent();
        return 0 unless defined $parent;
        my $label = $parent->look_down(class => qr/field-label/);
        return (defined $label and $label->as_text() =~ $label_re) ? 1 : 0;
    };
}

# format_job($university,$position,$url,$text,$date)
#
# Returns the org-mode entry for one posting: a second-level TODO heading
# with an :ID: property taken from the tied $uuid, followed by a link to
# the posting, its date and the wrapped description.
# NOTE(review): the original line breaks of the heredoc and the exact
# indent strings passed to wrap() were not recoverable from the collapsed
# source -- confirm the layout.
sub format_job {
    my ($university,$position,$url,$text,$date) = @_;

    $text = wrap(' ',' ',$text);
    my $ret = <<"EOF";
** TODO $university -- $position
:PROPERTIES:
:ID: $uuid
:END:
[[$url]]
[$date]
$text
EOF
    return $ret;
}

__END__