X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=jobs_to_org;h=03777c568300eb17ae03aa29002db009d742e7b1;hb=c006ac2fb92fa2c5f281d47973474000e278060c;hp=3906309d99383f9737f8734d2bb4783afae97b96;hpb=8cde1d885fba2b694923a8674c77a260a8083260;p=bin.git diff --git a/jobs_to_org b/jobs_to_org index 3906309..03777c5 100755 --- a/jobs_to_org +++ b/jobs_to_org @@ -56,6 +56,7 @@ use vars qw($DEBUG); use Data::Printer; use Text::Wrap; use POSIX qw(strftime); +use DateTime; tie my $uuid, 'OSSP::uuid::tie'; $uuid= ["v1"]; @@ -99,9 +100,51 @@ my %sites = job_selector => [url_regex => qr{^/naturejobs/science/jobs/\d+-.+}], university => [href => qr{^/naturejobs/science/employer-directory/\d+$}], description => [class=>"job-description"], - date => [], + date => [_tag => 'dd', sub {$_[0]->as_text() =~ qr/\d+\s+days\s+ago$/},], position => [class=>'job-title heading'], }, + vitae => {url => 'https://chroniclevitae.com/job_search?job_search%5Bdistance_from_zip%5D=10&job_search%5Bemployment_type%5D=Full-time&job_search%5Bposition_type%5D=63', + next_selector => [text_regex => qr{Next}, url_regex=>qr{^/job_search}], + job_selector => [url_regex => qr{/jobs/\d+-\d+$}], + university => [href => qr{/institutions/\d+$}], + description => [class => 'job-listing__content__description'], + date => [_tag => 'td', sub {$_[0]->as_text() =~ qr/\,\s+20\d{2}$/}], + position => [_tag => 'h1', + sub {defined $_[0]->parent()->attr('class') and + $_[0]->parent()->attr('class') eq + 'page-title page-title--two-col'}, + ], + }, + higheredjobs => {url => 'https://www.higheredjobs.com/search/advanced_action.cfm?JobCat=113&JobCat=259&JobCat=99&JobCat=100&JobCat=108&JobCat=107&PosType=1&InstType=1&InstType=2&InstType=3&Keyword=&Remote=1&Remote=2&Region=&Submit=Search+Jobs', + next_selector_tree => [class => 'js-click-submit', + href => qr{advanced_action\.cfm}, + sub {my @c = $_[0]->content_list(); + return 0 unless @c; + return (defined $c[0]->attr('href') and + defined $c[0]->attr('src') =~ /active-right\.gif/); + }, + ], + job_selector => [url_regex => qr{^details.cfm\?JobCode=\d+}, + ], + university => [class => qr/field-value/, + sub {my $p = $_[0]->parent(); + my $c = $p->look_down(class => qr/field-label/, + ); + defined $c and $c->as_text() =~ qr/institution/i; + } + ], + date => [class => qr/field-value/, + sub {my $p = $_[0]->parent(); + my $c = $p->look_down(class => qr/field-label/, + ); + defined $c and $c->as_text() =~ qr/posted/i; + } + ], + position => [id => 'jobtitle-header', + _tag => 'h1', + ], + description => [id => 'jobDesc'], + }, ); binmode STDOUT,":utf8"; @@ -123,8 +166,8 @@ sub get_jobs { my %seen; my @job_urls = grep { ! $seen{ $_->URI()->abs() }++ } $m->find_all_links(@{$s->{job_selector}}); - for my $j_u (@job_urls) { - $m->get($j_u); + for my $j_u (sort @job_urls) { + $m->get($j_u) or next; my $university = 'No university'; eval { $university = $m->tree->look_down(@{$s->{university}})->as_text(); @@ -133,6 +176,11 @@ sub get_jobs { eval { $date = $m->tree->look_down(@{$s->{date}})->as_text() // $todays_date if @{$s->{date}}; + if ($date =~ /^\s*(\d+)\s*days\s*ago\s*$/) { + $date = strftime('%Y-%m-%d %H:%M:%S', + localtime((DateTime->now() - + DateTime::Duration->new(days=>$1))->epoch)); + } }; my $description = 'unknown'; eval { @@ -145,7 +193,14 @@ sub get_jobs { print format_job($university,$position,$j_u->URI->abs(),$description,$date); $m->back(); } - $m->follow_link(@{$s->{next_selector}}) or die "Unable to find next link"; + if (exists $s->{next_selector}) { + $m->follow_link(@{$s->{next_selector}}) or die "Unable to find next link"; + } elsif (exists $s->{next_selector_tree}) { + my $link = $m->tree->look_down(@{$s->{next_selector_tree}}) or + die "Unable to find next link"; + $m->get($link->attr('href')) or + die "Unable to get next page"; + } } }