]> git.donarmstrong.com Git - bin.git/blobdiff - jobs_to_org
support vitae
[bin.git] / jobs_to_org
index 2468a78499d42bd9882288dda4949517d8dc5b45..af6a6ff969fc90f98f0d46ee3180897971d29aa8 100755 (executable)
@@ -1,5 +1,5 @@
 #!/usr/bin/perl
-# SCRIPTNAME DOES_SOMETHING
+# jobs_to_org finds jobs on different job sites and turns them into Org-mode TODO entries
 # and is released under the terms of the GNU GPL version 3, or any
 # later version, at your option. See the file README and COPYING for
 # more information.
@@ -14,11 +14,11 @@ use Pod::Usage;
 
 =head1 NAME
 
-SCRIPTNAME - DOES_SOMETHING
+jobs_to_org - finds jobs on different job sites and turns them into Org-mode TODO entries
 
 =head1 SYNOPSIS
 
-SCRIPTNAME [options]
+jobs_to_org [options]
 
  Options:
    --debug, -d debugging level (Default 0)
@@ -45,7 +45,7 @@ Display this manual.
 
 =head1 EXAMPLES
 
-SCRIPTNAME
+jobs_to_org
 
 =cut
 
@@ -54,6 +54,8 @@ use WWW::Mechanize;
 use WWW::Mechanize::TreeBuilder;
 use vars qw($DEBUG);
 use Data::Printer;
+use Text::Wrap;
+use POSIX qw(strftime);
 
 tie my $uuid, 'OSSP::uuid::tie';
 $uuid= ["v1"];
@@ -84,7 +86,7 @@ pod2usage(join("\n",@USAGE_ERRORS)) if @USAGE_ERRORS;
 
 my %sites =
     (herc => {url => 'http://main.hercjobs.org/jobs/search?keywords=professor+AND+%28genomics+OR+bioinformatics+OR+biology+OR+informatics%29&discipline=academic-faculty&category=academic-faculty&category=allied-health&category=biological-biomedical-sciences&category=computer-information-sciences&category=education&category=interdisciplinary&category=mathematics-statistics&category=medical-research&category=physical-sciences&sort=DATE_POSTED+DESC',
-              next_selector => [class => "bti-pagination-previous-link bti-pagination-prev-next"],
+              next_selector => [class => "bti-pagination-previous-link bti-pagination-prev-next",text => '>'],
               job_selector => [url_regex => qr{^\/jobs\/\d+/.+}],
               university => [itemprop=>"hiringOrganization",itemtype=>"http://schema.org/Organization"],
               description => [class=>"bti-jd-description",itemprop=>"description"],
@@ -92,13 +94,34 @@ my %sites =
                        sub {scalar $_[0]->parent()->attr('class') eq 'bti-jd-details-action'}],
               position => [class=>"bti-jd-title",itemprop=>"title"],
              },
+     nature => {url => 'http://www.nature.com/naturejobs/science/jobs?utf8=%E2%9C%93&q%5B%5D=professor&job_type%5B%5D=Assistant+Professor&job_type%5B%5D=Professor&order_by=created_on',
+                next_selector => [class=>"next_page",url_regex=>qr{^/naturejobs/science/jobs},],
+                job_selector => [url_regex => qr{^/naturejobs/science/jobs/\d+-.+}],
+                university => [href => qr{^/naturejobs/science/employer-directory/\d+$}],
+                description => [class=>"job-description"],
+                date => [],
+                position => [class=>'job-title heading'],
+               },
+     vitae => {url => 'https://chroniclevitae.com/job_search?job_search%5Bdistance_from_zip%5D=10&job_search%5Bemployment_type%5D=Full-time&job_search%5Bposition_type%5D=63',
+               next_selector => [text_regex => qr{Next}, url_regex=>qr{^/job_search}],
+               job_selector => [url_regex => qr{/jobs/\d+-\d+$}],
+               university => [href => qr{/institutions/\d+$}],
+               description => [class => 'job-listing__content__description'],
+               date => [_tag => 'td', content => qr/\,\s+20\d{2}$/],
+               position => [_tag => 'h1',
+                            sub {scalar $_[0]->parent()->attr('class') eq 'page-title page-title--two-col'},
+                           ],
+              },
     );
 
 binmode STDOUT,":utf8";
 get_jobs($options{site},$options{pages});
+
 sub get_jobs {
     my ($site,$pages) = @_;
 
+    my $todays_date = strftime('%Y-%m-%d %H:%M:%S',localtime());
+
     my $m = WWW::Mechanize->new();
     WWW::Mechanize::TreeBuilder->meta->apply($m);
     if (not defined $sites{$site}) {
@@ -112,20 +135,33 @@ sub get_jobs {
             $m->find_all_links(@{$s->{job_selector}});
         for my $j_u (@job_urls) {
             $m->get($j_u);
-            my $university = $m->tree->look_down(@{$s->{university}})->as_text();
-            my $date = $m->tree->look_down(@{$s->{date}})->as_text();
-            my $description = $m->tree->look_down(@{$s->{description}})->as_text();
-            my $position = $m->tree->look_down(@{$s->{position}})->as_text();
+            my $university = 'No university';
+            eval {
+                $university = $m->tree->look_down(@{$s->{university}})->as_text();
+            };
+            my $date = $todays_date;
+            eval {
+                $date = $m->tree->look_down(@{$s->{date}})->as_text() // $todays_date if
+                    @{$s->{date}};
+            };
+            my $description = 'unknown';
+            eval {
+                $description = $m->tree->look_down(@{$s->{description}})->as_text();
+            };
+            my $position = 'Unknown';
+            eval {
+                $position = $m->tree->look_down(@{$s->{position}})->as_text();
+            };
             print format_job($university,$position,$j_u->URI->abs(),$description,$date);
             $m->back();
         }
-        $m->follow_link(@{$s->{next_selector}});
+        $m->follow_link(@{$s->{next_selector}}) or die "Unable to find next link";
     }
 }
 
 sub format_job {
     my ($university,$position,$url,$text,$date) = @_;
-    $text =~ s/(\n)(\s*)/$1   /m;
+    $text = wrap('   ','   ',$text);
     my $ret = <<"EOF";
 ** TODO $university -- $position
    :PROPERTIES: