src/Modules/wikipedia.pl

   1 # This program is distributed under the same terms as blootbot.
   2
   3 package wikipedia;
   4 use strict;
   5
   6 my $missing;
   7 my $wikipedia_base_url = 'http://www.wikipedia.org/wiki/';
   8 my $wikipedia_search_url = $wikipedia_base_url . 'Special:Search?';
   9 my $wikipedia_export_url = $wikipedia_base_url . 'Special:Export/';
  10
  11 BEGIN {
  12   # utility functions for encoding the wikipedia request
  13   eval "use URI::Escape";
  14   if ($@) {
  15     $missing++;
  16   }
  17
  18   eval "use LWP::UserAgent";
  19   if ($@) {
  20     $missing++;
  21   }
  22
  23   eval "use HTML::Entities";
  24   if ($@) {
  25     $missing++;
  26   }
  27
  28 }
  29
  30 sub wikipedia {
  31   return '' if $missing;
  32   my ($phrase) = @_;
  33   &main::DEBUG("wikipedia($phrase)");
  34
  35   my $ua = new LWP::UserAgent;
  36   $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));
  37   # Let's pretend
  38   $ua->agent("Mozilla/5.0 " . $ua->agent);
  39   $ua->timeout(5);
  40
  41   # chop ? from the end
  42   $phrase =~ s/\?$//;
  43   # convert phrase to wikipedia conventions
  44   $phrase = uri_escape($phrase);
  45   $phrase =~ s/%20/+/g;
  46
  47   # using the search form will make the request case-insensitive
  48   # HEAD will follow redirects, catching the first mode of redirects
  49   # that wikipedia uses
  50   my $url = $wikipedia_search_url . 'search=' . $phrase . '&go=Go';
  51   my $req = HTTP::Request->new('HEAD', $url);
  52   $req->header('Accept-Language' => 'en');
  53   # &main::DEBUG($url);
  54
  55   my $res = $ua->request($req);
  56   # &main::DEBUG($res->code);
  57
  58   if ($res->is_success) {
  59     # we have been redirected somewhere
  60     # (either content or the generic Search form)
  61     # let's find the title of the article
  62     $url = $res->request->uri;
  63     $phrase = $url;
  64     $phrase =~ s/.*\/wiki\///;
  65
  66     if ($res->code == '200' and $url !~ m/Special:Search/ ) {
  67       # we hit content, let's retrieve it
  68       my $text = wikipedia_get_text($phrase);
  69
  70       # filtering unprintables
  71       $text =~ s/[[:cntrl:]]//g;
  72       # filtering headings
  73       $text =~ s/==+[^=]*=+//g;
  74       # filtering wikipedia tables
  75       &main::DEBUG("START:\n" . $text . " :END");
  76       $text =~ s/\{\|[^}]+\|\}//g;
  77       # some people cannot live without HTML tags, even in a wiki
  78       # $text =~ s/&lt;div.*&gt;//gi;
  79       # $text =~ s/&lt;!--.*&gt;//gi;
  80       # $text =~ s/<[^>]*>//g;
  81       # or HTML entities
  82       $text =~ s/&amp;/&/g;
  83       decode_entities($text);
  84       # or tags, again
  85       $text =~ s/<[^>]*>//g;
  86       #$text =~ s/[&#]+[0-9a-z]+;//gi;
  87       # filter wikipedia tags: [[abc: def]]
  88       $text =~ s/\[\[[[:alpha:]]*:[^]]*\]\]//gi;
  89       # {{abc}}:tag
  90       $text =~ s/\{\{[[:alpha:]]+\}\}:[^\s]+//gi;
  91       # {{abc}}
  92       $text =~ s/\{\{[[:alpha:]]+\}\}//gi;
  93       # unescape quotes
  94       $text =~ s/'''/'/g;
  95       $text =~ s/''/"/g;
  96       # filter wikipedia links: [[tag|link]] -> link
  97       $text =~ s/\[\[[^]]+\|([^]]+)\]\]/$1/g;
  98       # [[link]] -> link
  99       $text =~ s/\[\[([^]]+)\]\]/$1/g;
 100       # shrink whitespace
 101       $text =~ s/[[:space:]]+/ /g;
 102       # chop leading whitespace
 103       $text =~ s/^ //g;
 104
 105       # shorten article to first one or two sentences
 106 #      $text = substr($text, 0, 330);
 107 #      $text =~ s/(.+)\.([^.]*)$/$1./g;
 108
 109       &main::pSReply("At " . $url . " (URL), Wikipedia explains: " . $text);
 110     }
 111   }
 112 }
 113
 114 sub wikipedia_get_text {
 115   return '' if $missing;
 116   my ($article) = @_;
 117   &main::DEBUG("wikipedia_get_text($article)");
 118
 119   my $ua = new LWP::UserAgent;
 120   $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));
 121   # Let's pretend
 122   $ua->agent("Mozilla/5.0 " . $ua->agent);
 123   $ua->timeout(5);
 124
 125   my $req = HTTP::Request->new('GET', $wikipedia_export_url .
 126                                $article);
 127   $req->header('Accept-Language' => 'en');
 128   $req->header('Accept-Charset' => 'utf-8');
 129
 130   my $res = $ua->request($req);
 131   my ($title, $redirect, $text);
 132   # &main::DEBUG($res->code);
 133
 134   if ($res->is_success) {
 135     if ($res->code == '200' ) {
 136       foreach (split(/\n/, $res->as_string)) {
 137         if (/<title>(.*?)<\/title>/) {
 138           $title = $1;
 139           $title =~ s/&amp\;/&/g;
 140         } elsif (/#REDIRECT\s*\[\[(.*?)\]\]/) {
 141           $redirect = $1;
 142           $redirect =~ tr/ /_/;
 143           last;
 144         } elsif (/<text>(.*)/) {
 145           $text = $1;
 146         } elsif (/(.*)<\/text>/) {
 147           $text = $text . " " . $1;
 148           last;
 149         } elsif ($text) {
 150           $text = $text . " " . $_;
 151         }
 152       }
 153       if (!$redirect and !$text) {
 154         return;
 155       }
 156       return ($text or wikipedia_get_text($redirect))
 157     }
 158   }
 159
 160 }
 161
 162 1;