]> git.donarmstrong.com Git - bin.git/blob - jobs_to_org
add jobs to org
[bin.git] / jobs_to_org
1 #!/usr/bin/perl
2 # jobs_to_org adds scraped job postings to org as TODO entries,
3 # and is released under the terms of the GNU GPL version 3, or any
4 # later version, at your option. See the file README and COPYING for
5 # more information.
6 # Copyright 2016 by Don Armstrong <don@donarmstrong.com>.
7
8
9 use warnings;
10 use strict;
11
12 use Getopt::Long;
13 use Pod::Usage;
14
15 =head1 NAME
16
17 jobs_to_org - print scraped job postings as org TODO entries
18
19 =head1 SYNOPSIS
20
21 jobs_to_org [--site NAME] [--pages N] [options]
22
23  Options:
24    --debug, -d debugging level (Default 0)
25    --help, -h display this help
26    --man, -m display manual
27
28 =head1 OPTIONS
29
30 =over
31
32 =item B<--debug, -d>
33
34 Debug verbosity. (Default 0)
35
36 =item B<--help, -h>
37
38 Display brief usage information.
39
40 =item B<--man, -m>
41
42 Display this manual.
43
44 =back
45
46 =head1 EXAMPLES
47
48 jobs_to_org --site herc --pages 2
49
50 =cut
51
52 use OSSP::uuid;
53 use WWW::Mechanize;
54 use WWW::Mechanize::TreeBuilder;
55 use vars qw($DEBUG);
56 use Data::Printer;
57
# $uuid is tied through OSSP::uuid's tie interface; assigning an array
# ref selects the generation mode ("v1"). format_job() interpolates it
# into every entry it emits.
tie my $uuid, 'OSSP::uuid::tie';
$uuid = ["v1"];

# Command-line defaults.
my %options = (debug           => 0,
               help            => 0,
               man             => 0,
               pages           => 1,      # --pages, -p: result pages to walk
               site            => 'herc', # --site, -s: key into %sites
              );

# GetOptions returns false when it has already complained about a bad
# option; stop with the usage text instead of scraping anyway.
GetOptions(\%options,
           'site|s=s',
           'pages|p=i',
           'debug|d+','help|h|?','man|m')
    or pod2usage(2);

pod2usage() if $options{help};
pod2usage({verbose=>2}) if $options{man};

$DEBUG = $options{debug};

# Per-site scraping configuration:
#   url           - search results page to start from
#   next_selector - find_link/follow_link criteria for the pagination link
#   job_selector  - find_all_links criteria matching individual postings
#   university, description, date, position
#                 - look_down criteria for the fields of a posting page
my %sites =
    (herc => {url => 'http://main.hercjobs.org/jobs/search?keywords=professor+AND+%28genomics+OR+bioinformatics+OR+biology+OR+informatics%29&discipline=academic-faculty&category=academic-faculty&category=allied-health&category=biological-biomedical-sciences&category=computer-information-sciences&category=education&category=interdisciplinary&category=mathematics-statistics&category=medical-research&category=physical-sciences&sort=DATE_POSTED+DESC',
              next_selector => [class => "bti-pagination-previous-link bti-pagination-prev-next"],
              job_selector => [url_regex => qr{^\/jobs\/\d+/.+}],
              university => [itemprop=>"hiringOrganization",itemtype=>"http://schema.org/Organization"],
              description => [class=>"bti-jd-description",itemprop=>"description"],
              date => [class=>"bti-jd-detail-text",
                       sub {scalar $_[0]->parent()->attr('class') eq 'bti-jd-details-action'}],
              position => [class=>"bti-jd-title",itemprop=>"title"],
             },
    );

# Validate the arguments before touching the network so mistakes are
# reported together with the usage text.
my @USAGE_ERRORS;
if (not exists $sites{$options{site}}) {
    push @USAGE_ERRORS,"Unknown site '$options{site}' (known sites: ".
        join(', ',sort keys %sites).")";
}
if ($options{pages} < 1) {
    push @USAGE_ERRORS,"--pages must be at least 1";
}

pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;

binmode STDOUT,":encoding(UTF-8)";
get_jobs($options{site},$options{pages});
# Walk the search results of one configured site and print every job
# posting found as an org entry (see format_job).
#
# Arguments:
#   $site  - key into %sites selecting the start URL and the selectors
#   $pages - number of result pages to process
#
# Dies if $site is not configured. Stops early when the current results
# page has no link matching the site's next_selector.
sub get_jobs {
    my ($site,$pages) = @_;

    # Check the configuration before any request is made.
    my $s = $sites{$site};
    if (not defined $s) {
        die "Unknown site $site";
    }

    my $m = WWW::Mechanize->new();
    WWW::Mechanize::TreeBuilder->meta->apply($m);
    $m->get($s->{url});

    # Shared across pages so a posting that appears on two consecutive
    # result pages is only printed once.
    my %seen;
    for my $page (1..$pages) {
        my @job_urls = grep { ! $seen{ $_->URI()->abs() }++ }
            $m->find_all_links(@{$s->{job_selector}});
        for my $j_u (@job_urls) {
            $m->get($j_u);
            my %field = map { $_ => _element_text($m,$s->{$_},$_) }
                qw(university date description position);
            print format_job($field{university},$field{position},
                             $j_u->URI->abs(),
                             $field{description},$field{date});
            # return to the results page for the next posting
            $m->back();
        }
        # No need to fetch a page that will never be processed.
        last if $page == $pages;
        # follow_link throws when autocheck is enabled and nothing
        # matches, so probe with find_link first and stop quietly.
        last if not defined $m->find_link(@{$s->{next_selector}});
        $m->follow_link(@{$s->{next_selector}});
    }
}

# Return the text of the first element of the current page matching the
# look_down criteria in @$selector; warn and return '' if none matches,
# so one odd posting does not abort the whole run.
sub _element_text {
    my ($m,$selector,$what) = @_;
    my $element = $m->tree->look_down(@{$selector});
    if (not defined $element) {
        warn "No $what element found on ".$m->uri()."\n";
        return '';
    }
    return $element->as_text();
}
125
# Render one job posting as an org-mode TODO entry.
#
# Arguments:
#   $university - hiring organisation, used in the headline
#   $position   - job title, used in the headline
#   $url        - link to the posting
#   $text       - description text
#   $date       - date string, emitted in square brackets
#
# Returns the entry as a single string. The :ID: property is filled
# from the file-level tied $uuid.
sub format_job {
    my ($university,$position,$url,$text,$date) = @_;
    # Re-indent every continuation line of the description by three
    # spaces; /g is required, otherwise only the first line break is
    # touched. \s* also swallows blank lines following the break.
    $text =~ s/\n\s*/\n   /g;
    my $ret = <<"EOF";
** TODO $university -- $position
   :PROPERTIES:
   :ID:       $uuid
   :END:
[[$url]]
[$date]

$text

EOF
    return $ret;
}
142
143
144
145 __END__