* Rebranding from blootbot to infobot

[infobot.git] / src / Modules / wikipedia.pl
diff --git a/src/Modules/wikipedia.pl b/src/Modules/wikipedia.pl

index 98f5705bb6ec5423386c321cfa45e64d95a95564..652188a8170f82a842d44f9591876e111966d280 100644 (file)
--- a/src/Modules/wikipedia.pl
+++ b/src/Modules/wikipedia.pl
@@ -1,4 +1,4 @@
-# This program is distributed under the same terms as blootbot.
+# This program is distributed under the same terms as infobot.
  
  package wikipedia;
  use strict;
@@ -24,16 +24,33 @@ BEGIN {
    if ($@) {
      $missing++;
    }
-
  }
  
  sub wikipedia {
    return '' if $missing;
    my ($phrase) = @_;
-  &main::DEBUG("wikipedia($phrase)");
+  my ($reply, $valid_result) = wikipedia_lookup(@_);
+  if ($reply) {
+    &::performStrictReply($reply);
+  } else {
+    &::performStrictReply("'$phrase' not found in Wikipedia. Perhaps try a different spelling or case?");
+  }
+}
+
+sub wikipedia_silent {
+  return '' if $missing;
+  my ($reply, $valid_result) = wikipedia_lookup(@_);
+  if ($valid_result and $reply) {
+    &::performStrictReply($reply);
+  }
+}
+
+sub wikipedia_lookup {
+  my ($phrase) = @_;
+  &::DEBUG("wikipedia($phrase)");
  
    my $ua = new LWP::UserAgent;
-  $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));
+  $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam('httpProxy'));
    # Let's pretend
    $ua->agent("Mozilla/5.0 " . $ua->agent);
    $ua->timeout(5);
@@ -41,8 +58,10 @@ sub wikipedia {
    # chop ? from the end
    $phrase =~ s/\?$//;
    # convert phrase to wikipedia conventions
-  $phrase = uri_escape($phrase);
-  $phrase =~ s/%20/+/g;
+#  $phrase = uri_escape($phrase);
+#  $phrase =~ s/%20/+/g;
+#  $phrase =~ s/%25/%/g;
+  $phrase =~ s/ /+/g;
  
    # using the search form will make the request case-insensitive
    # HEAD will follow redirects, catching the first mode of redirects
@@ -50,12 +69,15 @@ sub wikipedia {
    my $url = $wikipedia_search_url . 'search=' . $phrase . '&go=Go';
    my $req = HTTP::Request->new('HEAD', $url);
    $req->header('Accept-Language' => 'en');
-  &main::DEBUG($url);
+  &::DEBUG($url);
  
    my $res = $ua->request($req);
-  &main::DEBUG($res->code);
+  &::DEBUG($res->code);
  
-  if ($res->is_success) {
+  if (!$res->is_success) {
+    return("Wikipedia might be temporarily unavailable (".$res->code."). Please try again in a few minutes...",
+          0);
+  } else {
      # we have been redirected somewhere
      # (either content or the generic Search form)
      # let's find the title of the article
@@ -63,50 +85,61 @@ sub wikipedia {
      $phrase = $url;
      $phrase =~ s/.*\/wiki\///;
  
-    if ($res->code == '200' and $url !~ m/Special:Search/ ) {
-      # we hit content, let's retrieve it
-      my $text = wikipedia_get_text($phrase);
-
-      # filtering unprintables
-      $text =~ s/[[:cntrl:]]//g;
-      # filtering headings
-      $text =~ s/==+[^=]*=+//g;
-      # filtering wikipedia tables
-      &main::DEBUG("START:\n" . $text . " :END");
-      $text =~ s/\{\|[^}]+\|\}//g;
-      # some people cannot live without HTML tags, even in a wiki
-      # $text =~ s/&lt;div.*&gt;//gi;
-      # $text =~ s/&lt;!--.*&gt;//gi;
-      # $text =~ s/<[^>]*>//g;
-      # or HTML entities
-      $text =~ s/&amp;/&/g;
-      decode_entities($text);
-      # or tags, again
-      $text =~ s/<[^>]*>//g;
-      #$text =~ s/[&#]+[0-9a-z]+;//gi;
-      # filter wikipedia tags: [[abc: def]]
-      $text =~ s/\[\[[[:alpha:]]*:[^]]*\]\]//gi;
-      # {{abc}}:tag
-      $text =~ s/\{\{[[:alpha:]]+\}\}:[^\s]+//gi;
-      # {{abc}}
-      $text =~ s/\{\{[[:alpha:]]+\}\}//gi;
-      # unescape quotes
-      $text =~ s/'''/'/g;
-      $text =~ s/''/"/g;
-      # filter wikipedia links: [[tag|link]] -> link
-      $text =~ s/\[\[[^]]+\|([^]]+)\]\]/$1/g;
-      # [[link]] -> link
-      $text =~ s/\[\[([^]]+)\]\]/$1/g;
-      # shrink whitespace
-      $text =~ s/[[:space:]]+/ /g;
-      # chop leading whitespace
-      $text =~ s/^ //g;
-
-      # shorten article to first one or two sentences
-#      $text = substr($text, 0, 330);
-#      $text =~ s/(.+)\.([^.]*)$/$1./g;
-
-      &main::pSReply("At " . $url . " (URL), Wikipedia explains: " . $text);
+    if (!$res->code == '200') {
+      return("Wikipedia might be temporarily unavailable or something is broken (".$res->code."). Please try again later...",
+            0);
+    } else {
+      if ($url =~ m/Special:Search/) {
+       # we were sent to the the search page
+       return("I couldn't find a matching article in wikipedia, look for yerselves: " . $url,
+              0);
+      } else {
+       # we hit content, let's retrieve it
+       my $text = wikipedia_get_text($phrase);
+
+       # filtering unprintables
+       $text =~ s/[[:cntrl:]]//g;
+       # filtering headings
+       $text =~ s/==+[^=]*=+//g;
+       # filtering wikipedia tables
+       $text =~ s/\{\|[^}]+\|\}//g;
+       # some people cannot live without HTML tags, even in a wiki
+       # $text =~ s/&lt;div.*&gt;//gi;
+       # $text =~ s/&lt;!--.*&gt;//gi;
+       # $text =~ s/<[^>]*>//g;
+       # or HTML entities
+       $text =~ s/&amp;/&/g;
+       decode_entities($text);
+       # or tags, again
+       $text =~ s/<[^>]*>//g;
+       #$text =~ s/[&#]+[0-9a-z]+;//gi;
+       # filter wikipedia tags: [[abc: def]]
+       $text =~ s/\[\[[[:alpha:]]*:[^]]*\]\]//gi;
+       # {{abc}}:tag
+       $text =~ s/\{\{[[:alpha:]]+\}\}:[^\s]+//gi;
+       # {{abc}}
+       $text =~ s/\{\{[[:alpha:]]+\}\}//gi;
+       # unescape quotes
+       $text =~ s/'''/'/g;
+       $text =~ s/''/"/g;
+       # filter wikipedia links: [[tag|link]] -> link
+       $text =~ s/\[\[[^]]+\|([^]]+)\]\]/$1/g;
+       # [[link]] -> link
+       $text =~ s/\[\[([^]]+)\]\]/$1/g;
+       # shrink whitespace
+       $text =~ s/[[:space:]]+/ /g;
+       # chop leading whitespace
+       $text =~ s/^ //g;
+
+       # shorten article to first one or two sentences
+       # new: we rely on the output function to know what to do
+       #      with long messages
+       #$text = substr($text, 0, 330);
+       #$text =~ s/(.+)\.([^.]*)$/$1./g;
+
+       return('At ' . $url . " (URL), Wikipedia explains: " . $text,
+              1);
+      }
      }
    }
  }
@@ -114,14 +147,15 @@ sub wikipedia {
  sub wikipedia_get_text {
    return '' if $missing;
    my ($article) = @_;
-  &main::DEBUG("wikipedia_get_text($article)");
+  &::DEBUG("wikipedia_get_text($article)");
  
    my $ua = new LWP::UserAgent;
-  $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));
+  $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam('httpProxy'));
    # Let's pretend
    $ua->agent("Mozilla/5.0 " . $ua->agent);
    $ua->timeout(5);
  
+  &::DEBUG($wikipedia_export_url . $article);
    my $req = HTTP::Request->new('GET', $wikipedia_export_url .
                                $article);
    $req->header('Accept-Language' => 'en');
@@ -129,7 +163,7 @@ sub wikipedia_get_text {
  
    my $res = $ua->request($req);
    my ($title, $redirect, $text);
-  # &main::DEBUG($res->code);
+  &::DEBUG($res->code);
  
    if ($res->is_success) {
      if ($res->code == '200' ) {
@@ -140,18 +174,22 @@ sub wikipedia_get_text {
         } elsif (/#REDIRECT\s*\[\[(.*?)\]\]/i) {
           $redirect = $1;
           $redirect =~ tr/ /_/;
+         &::DEBUG('wiki redirect to ' . $redirect);
           last;
-       } elsif (/<text>(.*)/) {
-         $text = $1;
+       } elsif (/<text[^>]*>(.*)/) {
+         $text = '"' . $1;
         } elsif (/(.*)<\/text>/) {
-         $text = $text . " " . $1;
+         $text = $text . ' ' . $1 . '"';
           last;
         } elsif ($text) {
-         $text = $text . " " . $_;
+         $text = $text . ' ' . $_;
         }
        }
+      &::DEBUG("wikipedia returned text: " . $text .
+                  ', redirect ' . $redirect. "\n");
+
        if (!$redirect and !$text) {
-       return;
+       return ($res->as_string);
        }
        return ($text or wikipedia_get_text($redirect))
      }