git.donarmstrong.com Git - bin.git/blobdiff - jobs_to_org
indicate where the jobs came from
[bin.git] / jobs_to_org
index 6a25a06aab30410ea7db8e392746b8683f32dc93..be827d08514e87395dc447a6e828933fab3703c6 100755 (executable)
@@ -1,5 +1,5 @@
 #!/usr/bin/perl
-# SCRIPTNAME DOES_SOMETHING
+# jobs_to_org finds jobs in different job sites and turns them into org things
 # and is released under the terms of the GNU GPL version 3, or any
 # later version, at your option. See the file README and COPYING for
 # more information.
@@ -14,11 +14,11 @@ use Pod::Usage;
 
 =head1 NAME
 
-SCRIPTNAME - DOES_SOMETHING
+jobs_to_org - finds jobs in different job sites and turns them into org things
 
 =head1 SYNOPSIS
 
-SCRIPTNAME [options]
+jobs_to_org [options]
 
  Options:
    --debug, -d debugging level (Default 0)
@@ -45,7 +45,7 @@ Display this manual.
 
 =head1 EXAMPLES
 
-SCRIPTNAME
+jobs_to_org
 
 =cut
 
@@ -55,6 +55,8 @@ use WWW::Mechanize::TreeBuilder;
 use vars qw($DEBUG);
 use Data::Printer;
 use Text::Wrap;
+use POSIX qw(strftime);
+use DateTime;
 
 tie my $uuid, 'OSSP::uuid::tie';
 $uuid= ["v1"];
@@ -93,34 +95,113 @@ my %sites =
                        sub {scalar $_[0]->parent()->attr('class') eq 'bti-jd-details-action'}],
               position => [class=>"bti-jd-title",itemprop=>"title"],
              },
+     nature => {url => 'http://www.nature.com/naturejobs/science/jobs?utf8=%E2%9C%93&q%5B%5D=professor&job_type%5B%5D=Assistant+Professor&job_type%5B%5D=Professor&order_by=created_on',
+                next_selector => [class=>"next_page",url_regex=>qr{^/naturejobs/science/jobs},],
+                job_selector => [url_regex => qr{^/naturejobs/science/jobs/\d+-.+}],
+                university => [href => qr{^/naturejobs/science/employer-directory/\d+$}],
+                description => [class=>"job-description"],
+                date => [_tag => 'dd', sub {$_[0]->as_text() =~ qr/\d+\s+days\s+ago$/},],
+                position => [class=>'job-title heading'],
+               },
+     vitae => {url => 'https://chroniclevitae.com/job_search?job_search%5Bdistance_from_zip%5D=10&job_search%5Bemployment_type%5D=Full-time&job_search%5Bposition_type%5D=63',
+               next_selector => [text_regex => qr{Next}, url_regex=>qr{^/job_search}],
+               job_selector => [url_regex => qr{/jobs/\d+-\d+$}],
+               university => [href => qr{/institutions/\d+$}],
+               description => [class => 'job-listing__content__description'],
+               date => [_tag => 'td', sub {$_[0]->as_text() =~ qr/\,\s+20\d{2}$/}],
+               position => [_tag => 'h1',
+                            sub {defined $_[0]->parent()->attr('class') and # guard: the parent may carry no class attribute at all
+                                     $_[0]->parent()->attr('class') eq # accept the h1 only when the parent's class string is exactly this
+                                     'page-title page-title--two-col'},
+                           ],
+              },
+     higheredjobs => {url => 'https://www.higheredjobs.com/search/advanced_action.cfm?JobCat=113&JobCat=259&JobCat=99&JobCat=100&JobCat=108&JobCat=107&PosType=1&InstType=1&InstType=2&InstType=3&Keyword=&Remote=1&Remote=2&Region=&Submit=Search+Jobs',
+                      next_selector_tree => [class => 'js-click-submit',
+                                             href => qr{advanced_action\.cfm},
+                                             sub {my @c = $_[0]->content_list(); # predicate: true only for the link whose first child is the "active-right" image
+                                                  return 0 unless @c and ref $c[0]; # no children, or a text child (a plain string with no ->attr)
+                                                  return 0 unless defined $c[0]->attr('href'); # NOTE(review): href is still tested on the child, unchanged -- confirm against the page markup
+                                                  return (($c[0]->attr('src') // '') =~ /active-right\.gif/) ? 1 : 0; # match src itself; "defined EXPR =~ /re/" tests the always-defined match result
+                                              },
+                                            ],
+                      job_selector => [url_regex => qr{^details.cfm\?JobCode=\d+},
+                                      ],
+                      university => [class => qr/field-value/,
+                                     sub {my $p = $_[0]->parent(); # predicate for a field-value element: inspect it through its parent
+                                    my $c = $p->look_down(class => qr/field-label/, # first element at or below the parent whose class matches field-label
+                                                         );
+                                    defined $c and $c->as_text() =~ qr/institution/i; # keep the value only when that label's text mentions "institution"
+                                }
+                              ],
+                      date => [class => qr/field-value/,
+                               sub {my $p = $_[0]->parent(); # predicate for a field-value element: inspect it through its parent
+                                    my $c = $p->look_down(class => qr/field-label/, # first element at or below the parent whose class matches field-label
+                                                         );
+                                    defined $c and $c->as_text() =~ qr/posted/i; # keep the value only when that label's text mentions "posted"
+                                }
+                              ],
+                      position => [id => 'jobtitle-header',
+                                   _tag => 'h1',
+                                  ],
+                      description => [id => 'jobDesc'],
+                     },
     );
 
 binmode STDOUT,":utf8";
 get_jobs($options{site},$options{pages});
+# get_jobs($site, $pages): walk $pages result pages of $sites{$site} and print every job found via format_job(); dies on an unknown site or a missing next-page link.
 sub get_jobs {
     my ($site,$pages) = @_;
 
+    my $todays_date = strftime('%Y-%m-%d %H:%M:%S',localtime()); # fallback date for jobs whose posting date cannot be read
+
     my $m = WWW::Mechanize->new();
     WWW::Mechanize::TreeBuilder->meta->apply($m);
     if (not defined $sites{$site}) {
         die "Unknown site $site";
     }
+    print "* Jobs from $site\n"; # printed once, before any job, so the output records which site it came from
     my $s = $sites{$site};
     $m->get($s->{url});
     for (1..$pages) {
         my %seen;
         my @job_urls = grep { ! $seen{ $_->URI()->abs() }++ }
             $m->find_all_links(@{$s->{job_selector}});
-        for my $j_u (@job_urls) {
-            $m->get($j_u);
-            my $university = $m->tree->look_down(@{$s->{university}})->as_text();
-            my $date = $m->tree->look_down(@{$s->{date}})->as_text();
-            my $description = $m->tree->look_down(@{$s->{description}})->as_text();
-            my $position = $m->tree->look_down(@{$s->{position}})->as_text();
+        for my $j_u (sort @job_urls) { # NOTE(review): default sort compares the link objects' string forms -- confirm the resulting order is intended
+            $m->get($j_u) or next; # NOTE(review): if autocheck is on (presumably the WWW::Mechanize default), get() dies instead of returning false -- confirm
+            my $university = 'No university'; # default kept when the lookup below fails
+            eval { # any failure inside (e.g. no matching element) is absorbed so the default survives
+                $university = $m->tree->look_down(@{$s->{university}})->as_text();
+            };
+            my $date = $todays_date; # default kept when the site has no usable date
+            eval { # absorbs lookup failures; $date then stays at today's timestamp
+                $date = $m->tree->look_down(@{$s->{date}})->as_text() // $todays_date if # only when the site defines a non-empty date selector
+                    @{$s->{date}};
+                if ($date =~ /^\s*(\d+)\s*days\s*ago\s*$/) { # relative "N days ago" text becomes an absolute timestamp
+                    $date = strftime('%Y-%m-%d %H:%M:%S',
+                                     localtime((DateTime->now() -
+                                                DateTime::Duration->new(days=>$1))->epoch)); # now minus N days, formatted like $todays_date
+                }
+            };
+            my $description = 'unknown'; # default kept when the lookup below fails
+            eval { # absorbs lookup failures
+                $description = $m->tree->look_down(@{$s->{description}})->as_text();
+            };
+            my $position = 'Unknown'; # default kept when the lookup below fails
+            eval { # absorbs lookup failures
+                $position = $m->tree->look_down(@{$s->{position}})->as_text();
+            };
             print format_job($university,$position,$j_u->URI->abs(),$description,$date);
             $m->back();
         }
-        $m->follow_link(@{$s->{next_selector}}) or die "Unable to find next link";
+        if (exists $s->{next_selector}) { # link-based pagination: let Mechanize find and follow the link
+            $m->follow_link(@{$s->{next_selector}}) or die "Unable to find next link";
+        } elsif (exists $s->{next_selector_tree}) { # tree-based pagination: locate the element in the parsed page, then fetch its href
+            my $link = $m->tree->look_down(@{$s->{next_selector_tree}}) or
+                die "Unable to find next link";
+            $m->get($link->attr('href')) or
+                die "Unable to get next page";
+        } # NOTE(review): a site with neither key never advances, so each iteration re-reads the same page -- confirm
     }
 }