]> git.donarmstrong.com Git - bin.git/blob - jobs_to_org
support higheredjobs in jobs_to_org
[bin.git] / jobs_to_org
1 #!/usr/bin/perl
2 # jobs_to_org finds jobs in different job sites and turns them into org things
3 # and is released under the terms of the GNU GPL version 3, or any
4 # later version, at your option. See the file README and COPYING for
5 # more information.
6 # Copyright 2016 by Don Armstrong <don@donarmstrong.com>.
7
8
9 use warnings;
10 use strict;
11
12 use Getopt::Long;
13 use Pod::Usage;
14
15 =head1 NAME
16
17 jobs_to_org - finds jobs in different job sites and turns them into org things
18
19 =head1 SYNOPSIS
20
21 jobs_to_org [options]
22
 Options:
   --site, -s job site to search (Default herc)
   --pages, -p number of result pages to process (Default 1)
   --debug, -d debugging level (Default 0)
   --help, -h display this help
   --man, -m display manual
27
28 =head1 OPTIONS
29
30 =over
31
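=item B<--site, -s>

Job site to search; one of herc, nature, vitae, or higheredjobs.
(Default herc)

=item B<--pages, -p>

Number of search result pages to process. (Default 1)

=item B<--debug, -d>

Debug verbosity. (Default 0)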
35
36 =item B<--help, -h>
37
38 Display brief usage information.
39
40 =item B<--man, -m>
41
42 Display this manual.
43
44 =back
45
46 =head1 EXAMPLES
47
48 jobs_to_org
49
50 =cut
51
52 use OSSP::uuid;
53 use WWW::Mechanize;
54 use WWW::Mechanize::TreeBuilder;
55 use vars qw($DEBUG);
56 use Data::Printer;
57 use Text::Wrap;
58 use POSIX qw(strftime);
59 use DateTime;
60
# $uuid is a tied scalar provided by OSSP::uuid::tie; it is interpolated
# into every entry produced by format_job().
# NOTE(review): assigning ["v1"] presumably selects version-1 UUIDs and each
# read yields a fresh value -- confirm against the OSSP::uuid documentation.
tie my $uuid, 'OSSP::uuid::tie';
$uuid= ["v1"];

# Defaults for every command line option; GetOptions overwrites them in place.
my %options = (debug           => 0,
               help            => 0,
               man             => 0,
               pages           => 1,
               site            => 'herc',
              );

# GetOptions returns false when it sees an unknown or malformed option;
# stop with the usage message instead of carrying on with bad input.
GetOptions(\%options,
           'site|s=s',
           'pages|p=i',
           'debug|d+','help|h|?','man|m') or pod2usage(2);

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

# Collect every problem with the supplied options so they are reported
# together in a single usage message.
my @USAGE_ERRORS;
if ($options{pages} < 1) {
    # get_jobs loops over 1..pages, so anything below 1 would silently do
    # nothing at all.
    push @USAGE_ERRORS,"--pages must be at least 1";
}

pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
87
# Per-site scraping configuration, keyed by the value of --site.
#
# Each entry holds:
#   url                - first page of search results
#   next_selector      - arguments for WWW::Mechanize follow_link() that
#                        locate the "next page" link, or
#   next_selector_tree - look_down() criteria locating the element whose
#                        href leads to the next page
#   job_selector       - arguments for find_all_links() matching job postings
#   university, description, date, position
#                      - look_down() criteria locating that field on a job
#                        page
my %sites =
    (herc => {url => 'http://main.hercjobs.org/jobs/search?keywords=professor+AND+%28genomics+OR+bioinformatics+OR+biology+OR+informatics%29&discipline=academic-faculty&category=academic-faculty&category=allied-health&category=biological-biomedical-sciences&category=computer-information-sciences&category=education&category=interdisciplinary&category=mathematics-statistics&category=medical-research&category=physical-sciences&sort=DATE_POSTED+DESC',
              next_selector => [class => "bti-pagination-previous-link bti-pagination-prev-next",text => '>'],
              job_selector => [url_regex => qr{^\/jobs\/\d+/.+}],
              university => [itemprop=>"hiringOrganization",itemtype=>"http://schema.org/Organization"],
              description => [class=>"bti-jd-description",itemprop=>"description"],
              date => [class=>"bti-jd-detail-text",
                       # the parent may carry no class attribute at all;
                       # treat that as "no match" instead of comparing undef
                       sub {($_[0]->parent()->attr('class') // '') eq 'bti-jd-details-action'}],
              position => [class=>"bti-jd-title",itemprop=>"title"],
             },
     nature => {url => 'http://www.nature.com/naturejobs/science/jobs?utf8=%E2%9C%93&q%5B%5D=professor&job_type%5B%5D=Assistant+Professor&job_type%5B%5D=Professor&order_by=created_on',
                next_selector => [class=>"next_page",url_regex=>qr{^/naturejobs/science/jobs},],
                job_selector => [url_regex => qr{^/naturejobs/science/jobs/\d+-.+}],
                university => [href => qr{^/naturejobs/science/employer-directory/\d+$}],
                description => [class=>"job-description"],
                date => [_tag => 'dd', sub {$_[0]->as_text() =~ qr/\d+\s+days\s+ago$/},],
                position => [class=>'job-title heading'],
               },
     vitae => {url => 'https://chroniclevitae.com/job_search?job_search%5Bdistance_from_zip%5D=10&job_search%5Bemployment_type%5D=Full-time&job_search%5Bposition_type%5D=63',
               next_selector => [text_regex => qr{Next}, url_regex=>qr{^/job_search}],
               job_selector => [url_regex => qr{/jobs/\d+-\d+$}],
               university => [href => qr{/institutions/\d+$}],
               description => [class => 'job-listing__content__description'],
               date => [_tag => 'td', sub {$_[0]->as_text() =~ qr/\,\s+20\d{2}$/}],
               position => [_tag => 'h1',
                            sub {defined $_[0]->parent()->attr('class') and
                                     $_[0]->parent()->attr('class') eq
                                     'page-title page-title--two-col'},
                           ],
              },
     higheredjobs => {url => 'https://www.higheredjobs.com/search/advanced_action.cfm?JobCat=113&JobCat=259&JobCat=99&JobCat=100&JobCat=108&JobCat=107&PosType=1&InstType=1&InstType=2&InstType=3&Keyword=&Remote=1&Remote=2&Region=&Submit=Search+Jobs',
                      next_selector_tree => [class => 'js-click-submit',
                                             href => qr{advanced_action\.cfm},
                                             sub {my @c = $_[0]->content_list();
                                                  # text children are plain
                                                  # strings, not elements
                                                  return 0 unless @c and ref $c[0];
                                                  # NOTE(review): href is
                                                  # tested on the first child,
                                                  # exactly as before; confirm
                                                  # that is the intended
                                                  # element.
                                                  return 0 unless defined $c[0]->attr('href');
                                                  # "defined EXPR =~ /re/"
                                                  # tests definedness of the
                                                  # match result, which is
                                                  # always true; fetch src
                                                  # once and test it properly.
                                                  my $src = $c[0]->attr('src');
                                                  return (defined $src and
                                                          $src =~ /active-right\.gif/) ? 1 : 0;
                                              },
                                            ],
                      job_selector => [url_regex => qr{^details.cfm\?JobCode=\d+},
                                      ],
                      # a "field-value" element whose parent also holds a
                      # "field-label" element mentioning the institution
                      university => [class => qr/field-value/,
                                     sub {my $p = $_[0]->parent();
                                          my $c = $p->look_down(class => qr/field-label/,
                                                               );
                                          defined $c and $c->as_text() =~ qr/institution/i;
                                      }
                                    ],
                      # same layout as university, but labelled "posted"
                      date => [class => qr/field-value/,
                               sub {my $p = $_[0]->parent();
                                    my $c = $p->look_down(class => qr/field-label/,
                                                         );
                                    defined $c and $c->as_text() =~ qr/posted/i;
                                }
                              ],
                      position => [id => 'jobtitle-header',
                                   _tag => 'h1',
                                  ],
                      description => [id => 'jobDesc'],
                     },
    );
149
# Job pages may contain non-ASCII text, so write STDOUT through the utf8
# layer, then scrape the site and page count selected on the command line.
binmode(STDOUT, ':utf8');
get_jobs(@options{qw(site pages)});
152
# get_jobs($site, $pages)
#
# Walk up to $pages pages of search results for the site named $site (a key
# of %sites), fetch every job posting linked from each page and print it as
# an org entry via format_job().
#
# Dies if $site is not configured, if the first results page cannot be
# fetched, or if more pages were requested than next-page links could be
# found for. A job page that cannot be fetched is reported with warn() and
# skipped.
sub get_jobs {
    my ($site,$pages) = @_;

    if (not defined $sites{$site}) {
        die "Unknown site $site";
    }
    my $s = $sites{$site};

    my $todays_date = strftime('%Y-%m-%d %H:%M:%S',localtime());

    my $m = WWW::Mechanize->new();
    WWW::Mechanize::TreeBuilder->meta->apply($m);
    $m->get($s->{url});
    for my $page (1..$pages) {
        my %seen;
        # Order the postings by absolute URL; sorting the link objects
        # themselves would order them by their stringified references.
        my @job_urls =
            sort { $a->URI()->abs() cmp $b->URI()->abs() }
            grep { ! $seen{ $_->URI()->abs() }++ }
            $m->find_all_links(@{$s->{job_selector}});
        for my $j_u (@job_urls) {
            # With autocheck enabled (the WWW::Mechanize default) get() dies
            # on failure instead of returning false, so trap the exception
            # to skip just this posting.
            if (not eval { $m->get($j_u); 1 }) {
                warn "Unable to get ".$j_u->URI->abs().": $@";
                # the failed response became the current page; step back
                # to the results listing before trying the next posting
                $m->back();
                next;
            }
            my $university = _look_down_text($m,$s->{university},'No university');
            my $date = _look_down_text($m,$s->{date},$todays_date);
            eval {
                # relative dates ("3 days ago") become absolute timestamps
                if ($date =~ /^\s*(\d+)\s*days\s*ago\s*$/) {
                    $date = strftime('%Y-%m-%d %H:%M:%S',
                                     localtime((DateTime->now() -
                                                DateTime::Duration->new(days=>$1))->epoch));
                }
            };
            my $description = _look_down_text($m,$s->{description},'unknown');
            my $position = _look_down_text($m,$s->{position},'Unknown');
            print format_job($university,$position,$j_u->URI->abs(),$description,$date);
            $m->back();
        }
        # Do not fetch (or demand the existence of) a page that will never
        # be processed.
        last if $page == $pages;
        if (exists $s->{next_selector}) {
            $m->follow_link(@{$s->{next_selector}}) or die "Unable to find next link";
        } elsif (exists $s->{next_selector_tree}) {
            my $link = $m->tree->look_down(@{$s->{next_selector_tree}}) or
                die "Unable to find next link";
            $m->get($link->attr('href')) or
                die "Unable to get next page";
        } else {
            # no way to advance; stop rather than print this page again
            last;
        }
    }
}

# _look_down_text($m, $selector, $default)
#
# Return the text of the first element of the current page matching the
# look_down() criteria in the array reference $selector, or $default when
# the selector is missing or empty, nothing matches, or the lookup fails.
sub _look_down_text {
    my ($m,$selector,$default) = @_;
    return $default unless defined $selector and @{$selector};
    my $text = eval { $m->tree->look_down(@{$selector})->as_text() };
    return $text // $default;
}
206
# format_job($university, $position, $url, $text, $date)
#
# Build the org entry for a single job posting: a TODO headline naming the
# university and position, a property drawer whose ID comes from the
# file-level $uuid, a link to $url, the bracketed $date and, after a blank
# line, $text wrapped with a three space indent. Returns the entry as one
# string ending in a blank line.
sub format_job {
    my ($university,$position,$url,$text,$date) = @_;
    my $indent = '   ';
    my $body = wrap($indent,$indent,$text);
    my @entry = (
        "** TODO $university -- $position",
        "${indent}:PROPERTIES:",
        "${indent}:ID:       $uuid",
        "${indent}:END:",
        "[[$url]]",
        "[$date]",
        '',
        $body,
        '',
    );
    return join("\n",@entry)."\n";
}
223
224
225
226 __END__