X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=jobs_to_org;h=67a70424f2e445fb8502e9d407206102f1584e43;hb=56c8d8b286ae38e170ce915472f9aa86fabf9fee;hp=fcdfd35345aab2e61ffc34d3591c0414c6961ad8;hpb=af42ffdc3f8791cd37d1d21b1dcc2538abd9696e;p=bin.git diff --git a/jobs_to_org b/jobs_to_org index fcdfd35..67a7042 100755 --- a/jobs_to_org +++ b/jobs_to_org @@ -89,7 +89,7 @@ my %sites = (herc => {url => 'http://main.hercjobs.org/jobs/search?keywords=professor+AND+%28genomics+OR+bioinformatics+OR+biology+OR+informatics%29&discipline=academic-faculty&category=academic-faculty&category=allied-health&category=biological-biomedical-sciences&category=computer-information-sciences&category=education&category=interdisciplinary&category=mathematics-statistics&category=medical-research&category=physical-sciences&sort=DATE_POSTED+DESC', next_selector => [class => "bti-pagination-previous-link bti-pagination-prev-next",text => '>'], job_selector => [url_regex => qr{^\/jobs\/\d+/.+}], - university => [itemprop=>"hiringOrganization",itemtype=>"http://schema.org/Organization"], + university => [itemprop=>"hiringOrganization",itemtype=> qr{https?://schema.org/Organization}], description => [class=>"bti-jd-description",itemprop=>"description"], date => [class=>"bti-jd-detail-text", sub {scalar $_[0]->parent()->attr('class') eq 'bti-jd-details-action'}], @@ -115,6 +115,37 @@ my %sites = 'page-title page-title--two-col'}, ], }, + higheredjobs => {url => 'https://www.higheredjobs.com/search/advanced_action.cfm?JobCat=113&JobCat=259&JobCat=99&JobCat=100&JobCat=108&JobCat=107&PosType=1&InstType=1&InstType=2&InstType=3&Keyword=&Remote=1&Remote=2&Region=&Submit=Search+Jobs', + next_selector_tree => [class => 'js-click-submit', + href => qr{advanced_action\.cfm}, + sub {return 1 + if defined + $_[0]->look_down(src => + qr/active-right/); + return 0; + }, + ], + job_selector => [url_regex => qr{^details.cfm\?JobCode=\d+}, + ], + university => [class => qr/field-value/, + sub {my $p = $_[0]->parent(); + my $c = $p->look_down(class => qr/field-label/, + ); + defined $c and $c->as_text() =~ qr/institution/i; + } + ], + date => [class => qr/field-value/, + sub {my $p = $_[0]->parent(); + my $c = $p->look_down(class => qr/field-label/, + ); + defined $c and $c->as_text() =~ qr/posted/i; + } + ], + position => [id => 'jobtitle-header', + _tag => 'h1', + ], + description => [id => 'jobDesc'], + }, ); binmode STDOUT,":utf8"; @@ -130,12 +161,25 @@ sub get_jobs { if (not defined $sites{$site}) { die "Unknown site $site"; } + print "* Jobs from $site\n"; my $s = $sites{$site}; $m->get($s->{url}); for (1..$pages) { my %seen; my @job_urls = grep { ! $seen{ $_->URI()->abs() }++ } $m->find_all_links(@{$s->{job_selector}}); + my $link; + if (exists $s->{next_selector}) { + ($link) = map {$_->URI()->abs()} + $m->find_all_links(@{$s->{next_selector}}); + } elsif (exists $s->{next_selector_tree}) { + $link = $m->tree->look_down(@{$s->{next_selector_tree}}); + if (not defined $link) { + $m->tree->dump; + } + die "Unable to find next link" unless defined $link; + $link = $link->attr('href'); + } for my $j_u (sort @job_urls) { $m->get($j_u) or next; my $university = 'No university'; @@ -161,9 +205,9 @@ sub get_jobs { $position = $m->tree->look_down(@{$s->{position}})->as_text(); }; print format_job($university,$position,$j_u->URI->abs(),$description,$date); - $m->back(); + $m->back() or die "Unable to go back"; } - $m->follow_link(@{$s->{next_selector}}) or die "Unable to find next link"; + $m->get($link); } }