2 # jobs_to_org finds jobs on several job sites and turns them into Org-mode entries
3 # and is released under the terms of the GNU GPL version 3, or any
4 # later version, at your option. See the file README and COPYING for
6 # Copyright 2016 by Don Armstrong <don@donarmstrong.com>.
17 jobs_to_org - finds jobs on several job sites and turns them into Org-mode entries
24 --debug, -d debugging level (Default 0)
25 --help, -h display this help
26 --man, -m display manual
34 Debug verbosity. (Default 0)
38 Display brief usage information.
54 use WWW::Mechanize::TreeBuilder;
58 use POSIX qw(strftime);
# Command-line option handling. Several original lines are missing from this
# excerpt (the rest of the %options defaults and the head of the call that the
# option-spec line below terminates), so the notes here cover only what is visible.
#
# $uuid is tied via OSSP::uuid::tie; presumably each read yields a fresh UUID
# (confirm against the module documentation). Its use is not visible here.
61 tie my $uuid, 'OSSP::uuid::tie';
# Option defaults; only debug => 0 is visible.
64 my %options = (debug => 0,
74 'debug|d+','help|h|?','man|m');
# --help prints brief usage; --man prints the manual (verbose => 2).
76 pod2usage() if $options{help};
77 pod2usage({verbose=>2}) if $options{man};
# Copy the requested debug level into $DEBUG.
79 $DEBUG = $options{debug};
# Usage-error collection: the only visible check is commented out, so
# @USAGE_ERRORS stays empty unless lines outside this excerpt push onto it.
# NOTE(review): get_jobs() is later called with $options{site} and
# $options{pages}, but no 'site'/'pages' option specs and no "site is required"
# check are visible here -- confirm they exist on the missing lines.
83 # push @USAGE_ERRORS,"You must pass something";
86 pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
# Per-site scraping configuration, keyed by site name (herc, nature, vitae,
# higheredjobs). Presumably this list initialises %sites, which get_jobs()
# reads as $sites{$site} -- the declaration line is outside this excerpt.
# Keys used by get_jobs():
#   next_selector      - arguments for $m->find_all_links() locating the
#                        next-results-page link
#   next_selector_tree - alternative to next_selector: arguments for
#                        $m->tree->look_down(); the href of the match is used
#   job_selector       - find_all_links() arguments matching job links
#   university, date, description, position
#                      - look_down() criteria for the element whose text
#                        supplies that field on a job page
# The 'url' key holds the site's search URL; where it is fetched is not
# visible in this excerpt (presumably the starting page -- confirm).
#
# HERC (main.hercjobs.org).
89 (herc => {url => 'http://main.hercjobs.org/jobs/search?keywords=professor+AND+%28genomics+OR+bioinformatics+OR+biology+OR+informatics%29&discipline=academic-faculty&category=academic-faculty&category=allied-health&category=biological-biomedical-sciences&category=computer-information-sciences&category=education&category=interdisciplinary&category=mathematics-statistics&category=medical-research&category=physical-sciences&sort=DATE_POSTED+DESC',
# NOTE(review): the class below contains "bti-pagination-previous-link" while
# the link text is '>' -- confirm this selects the next link, not the previous.
90 next_selector => [class => "bti-pagination-previous-link bti-pagination-prev-next",text => '>'],
91 job_selector => [url_regex => qr{^\/jobs\/\d+/.+}],
92 university => [itemprop=>"hiringOrganization",itemtype=> qr{https?://schema.org/Organization}],
93 description => [class=>"bti-jd-description",itemprop=>"description"],
94 date => [class=>"bti-jd-detail-text",
# NOTE(review): attr('class') is compared with eq without a defined() check,
# unlike the vitae 'position' callback further down; a parent without a class
# attribute would trigger an "uninitialized" warning if warnings are enabled.
95 sub {scalar $_[0]->parent()->attr('class') eq 'bti-jd-details-action'}],
96 position => [class=>"bti-jd-title",itemprop=>"title"],
# Nature Jobs (www.nature.com/naturejobs).
98 nature => {url => 'http://www.nature.com/naturejobs/science/jobs?utf8=%E2%9C%93&q%5B%5D=professor&job_type%5B%5D=Assistant+Professor&job_type%5B%5D=Professor&order_by=created_on',
99 next_selector => [class=>"next_page",url_regex=>qr{^/naturejobs/science/jobs},],
100 job_selector => [url_regex => qr{^/naturejobs/science/jobs/\d+-.+}],
101 university => [href => qr{^/naturejobs/science/employer-directory/\d+$}],
102 description => [class=>"job-description"],
# The date is a <dd> whose text ends in "<N> days ago"; get_jobs() converts
# that relative form to a timestamp.
# NOTE(review): the pattern requires the plural "days", so text such as
# "1 day ago" would not match -- confirm what the site actually emits.
103 date => [_tag => 'dd', sub {$_[0]->as_text() =~ qr/\d+\s+days\s+ago$/},],
104 position => [class=>'job-title heading'],
# Chronicle Vitae (chroniclevitae.com).
106 vitae => {url => 'https://chroniclevitae.com/job_search?job_search%5Bdistance_from_zip%5D=10&job_search%5Bemployment_type%5D=Full-time&job_search%5Bposition_type%5D=63',
107 next_selector => [text_regex => qr{Next}, url_regex=>qr{^/job_search}],
108 job_selector => [url_regex => qr{/jobs/\d+-\d+$}],
109 university => [href => qr{/institutions/\d+$}],
110 description => [class => 'job-listing__content__description'],
# The date is a <td> whose text ends in ", 20xx".
111 date => [_tag => 'td', sub {$_[0]->as_text() =~ qr/\,\s+20\d{2}$/}],
# The position is the <h1> whose parent has exactly the class string
# 'page-title page-title--two-col'; the defined() test covers a parent
# without a class attribute.
112 position => [_tag => 'h1',
113 sub {defined $_[0]->parent()->attr('class') and
114 $_[0]->parent()->attr('class') eq
115 'page-title page-title--two-col'},
# HigherEdJobs (www.higheredjobs.com). This site uses next_selector_tree
# instead of next_selector. Parts of its callbacks are outside this excerpt.
118 higheredjobs => {url => 'https://www.higheredjobs.com/search/advanced_action.cfm?JobCat=113&JobCat=259&JobCat=99&JobCat=100&JobCat=108&JobCat=107&PosType=1&InstType=1&InstType=2&InstType=3&Keyword=&Remote=1&Remote=2&Region=&Submit=Search+Jobs',
119 next_selector_tree => [class => 'js-click-submit',
120 href => qr{advanced_action\.cfm},
123 $_[0]->look_down(src =>
128 job_selector => [url_regex => qr{^details.cfm\?JobCode=\d+},
# university and date use the same approach: take an element whose class
# matches field-value, look in its parent for a field-label element, and
# accept when that label's text matches /institution/i or /posted/i.
130 university => [class => qr/field-value/,
131 sub {my $p = $_[0]->parent();
132 my $c = $p->look_down(class => qr/field-label/,
134 defined $c and $c->as_text() =~ qr/institution/i;
137 date => [class => qr/field-value/,
138 sub {my $p = $_[0]->parent();
139 my $c = $p->look_down(class => qr/field-label/,
141 defined $c and $c->as_text() =~ qr/posted/i;
144 position => [id => 'jobtitle-header',
147 description => [id => 'jobDesc'],
# Put STDOUT into UTF-8 mode before printing (presumably because scraped page
# text can contain non-ASCII characters).
151 binmode STDOUT,":utf8";
# Scrape the site named by $options{site}; $options{pages} is passed through
# as the second argument of get_jobs().
152 get_jobs($options{site},$options{pages});
# get_jobs($site, $pages): body of the scraper (the sub header, the paging
# loop, the declarations of %seen and $link, and several closing braces are
# outside this excerpt).
#   $site  - key into %sites; the sub dies with "Unknown site ..." if absent.
#   $pages - its use is not visible here (presumably a page limit -- confirm).
# Visible behaviour: prints a "* Jobs from <site>" heading, collects job links
# from the current page, finds the next-page link, then visits each job link
# and prints format_job() output for it.
155 my ($site,$pages) = @_;
# Timestamp used as the fallback date for a job.
157 my $todays_date = strftime('%Y-%m-%d %H:%M:%S',localtime());
# Apply the TreeBuilder role to the mechanize object; the code below relies on
# $m->tree->look_down().
159 my $m = WWW::Mechanize->new();
160 WWW::Mechanize::TreeBuilder->meta->apply($m);
# NOTE(review): if $site is undef, both the hash lookup and the die message
# interpolate an undefined value.
161 if (not defined $sites{$site}) {
162 die "Unknown site $site";
# Org-mode top-level heading for this site.
164 print "* Jobs from $site\n";
165 my $s = $sites{$site};
# Job links on the current page, keeping only URLs not yet counted in %seen
# (keyed by absolute URL).
169 my @job_urls = grep { ! $seen{ $_->URI()->abs() }++ }
170 $m->find_all_links(@{$s->{job_selector}});
# Find the next-results-page link. next_selector yields the absolute URL of
# the first matching link; next_selector_tree searches the parsed tree and
# uses the matching element's href attribute.
172 if (exists $s->{next_selector}) {
173 ($link) = map {$_->URI()->abs()}
174 $m->find_all_links(@{$s->{next_selector}});
175 } elsif (exists $s->{next_selector_tree}) {
176 $link = $m->tree->look_down(@{$s->{next_selector_tree}});
177 if (not defined $link) {
180 die "Unable to find next link" unless defined $link;
181 $link = $link->attr('href');
# NOTE(review): @job_urls holds link objects (see $j_u->URI->abs() below), so
# a plain sort compares their stringified form rather than their URLs, unless
# the link class overloads stringification -- verify, and consider sorting on
# $_->URI->abs instead.
183 for my $j_u (sort @job_urls) {
# NOTE(review): get() returns a response object, and with WWW::Mechanize's
# autocheck enabled (the default unless overridden -- confirm for the
# installed version) a failed request dies instead of returning false, so
# "or next" may never skip a job.
184 $m->get($j_u) or next;
# Each field starts from a default and is overwritten with the text of the
# element matched by the site's selector.
# NOTE(review): ->as_text() is called directly on look_down()'s result; when
# nothing matches that call dies unless the missing surrounding lines guard it
# (e.g. with eval) -- confirm. The same applies to date, description and
# position below.
185 my $university = 'No university';
187 $university = $m->tree->look_down(@{$s->{university}})->as_text();
# Date: defaults to today. A value of the form "<N> days ago" is converted to
# a timestamp N days before now; only the plural "days" is recognised.
# NOTE(review): the "// $todays_date" fallback applies only after as_text()
# has been called, so it does not cover a missing element.
189 my $date = $todays_date;
191 $date = $m->tree->look_down(@{$s->{date}})->as_text() // $todays_date if
193 if ($date =~ /^\s*(\d+)\s*days\s*ago\s*$/) {
194 $date = strftime('%Y-%m-%d %H:%M:%S',
195 localtime((DateTime->now() -
196 DateTime::Duration->new(days=>$1))->epoch));
199 my $description = 'unknown';
201 $description = $m->tree->look_down(@{$s->{description}})->as_text();
203 my $position = 'Unknown';
205 $position = $m->tree->look_down(@{$s->{position}})->as_text();
# Emit the formatted entry for this job, using the job's absolute URL.
207 print format_job($university,$position,$j_u->URI->abs(),$description,$date);
# Return to the results page before handling the next job link.
208 $m->back() or die "Unable to go back";
215 my ($university,$position,$url,$text,$date) = @_;
216 $text = wrap(' ',' ',$text);
218 ** TODO $university -- $position