From 4769f1a7ac5b15c14de2bb7851d22f8410174e46 Mon Sep 17 00:00:00 2001 From: timriker Date: Sat, 3 Jul 2004 21:16:24 +0000 Subject: [PATCH] wikipedia git-svn-id: https://svn.code.sf.net/p/infobot/code/trunk/blootbot@985 c11ca15a-4712-0410-83d8-924469b57eb5 --- files/blootbot.help | 5 ++ src/CommandStubs.pl | 4 +- src/Modules/wikipedia.pl | 162 +++++++++++++++++++++++++++++++++++++++ src/modules.pl | 1 + 4 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 src/Modules/wikipedia.pl diff --git a/files/blootbot.help b/files/blootbot.help index 90b6b67..c9c2275 100644 --- a/files/blootbot.help +++ b/files/blootbot.help @@ -391,6 +391,11 @@ uptime: U: ## wantnick: If someone's taken my nick (I hope not) and I'm using some temporary nick, I can change back to my original nick if it's not taken (again). +wikipedia: D: Frontend to the Wikipedia at http://www.wikipedia.org/wiki/ Note that utf8 is used for non-ascii characters. +wikipedia: U: wikipedia +wikipedia: U: wiki +wikipedia: E: wiki irc + -host: D: FIXME: -host: U: ## [user] -host: E: ## *!*@owns.org diff --git a/src/CommandStubs.pl b/src/CommandStubs.pl index 5bedd95..b5238f4 100644 --- a/src/CommandStubs.pl +++ b/src/CommandStubs.pl @@ -21,7 +21,6 @@ $w3search_regex = "google"; # addCmdHook("SECTION", 'TEXT_HOOK', # (CODEREF => 'Blah', # Forker => 1, -# CheckModule => 1, # ??? # Module => 'blah.pl' # preload module. # Identifier => 'config_label', # change to Config? # Help => 'help_label', @@ -261,6 +260,9 @@ sub parseCmdHook { &addCmdHook("extra", 'rss', ('CODEREF' => 'Rss::Rss', 'Identifier' => 'rss', 'Cmdstats' => 'rss', 'Forker' => 1, 'Help' => 'rss') ); +&addCmdHook("extra", 'wiki(pedia)?', ('CODEREF' => 'wikipedia::wikipedia', + 'Identifier' => 'wikipedia', 'Cmdstats' => 'wikipedia', + 'Forker' => 1, 'Help' => 'wikipedia') ); ### ### END OF ADDING HOOKS. 
###
diff --git a/src/Modules/wikipedia.pl b/src/Modules/wikipedia.pl
new file mode 100644
index 0000000..77be6f5
--- /dev/null
+++ b/src/Modules/wikipedia.pl
@@ -0,0 +1,164 @@
+# This program is distributed under the same terms as blootbot.
+
+package wikipedia;
+use strict;
+
+# Number of required modules that failed to load.  While it is non-zero
+# wikipedia() and wikipedia_get_text() return '' without doing anything.
+my $missing;
+my $wikipedia_base_url = 'http://www.wikipedia.org/wiki/';
+my $wikipedia_search_url = $wikipedia_base_url . 'Special:Search?';
+my $wikipedia_export_url = $wikipedia_base_url . 'Special:Export/';
+# Maximum number of #REDIRECT pages wikipedia_get_text() follows in a row,
+# so that a redirect loop cannot recurse forever.
+my $wikipedia_max_redirects = 5;
+
+BEGIN {
+    # utility modules for building the request and decoding the answer;
+    # a module that cannot be loaded is counted instead of being fatal
+    foreach my $module (qw(URI::Escape LWP::UserAgent HTML::Entities)) {
+        (my $file = $module . '.pm') =~ s{::}{/}g;
+        eval { require $file; $module->import; 1 } or $missing++;
+    }
+}
+
+# Build the LWP::UserAgent used for both requests: it goes through the
+# bot's httpProxy parameter when that is set, prefixes the default agent
+# string with "Mozilla/5.0" and gives up after 5 seconds.
+sub wikipedia_user_agent {
+    my $ua = LWP::UserAgent->new;
+    $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));
+    # Let's pretend
+    $ua->agent("Mozilla/5.0 " . $ua->agent);
+    $ua->timeout(5);
+    return $ua;
+}
+
+# Command hook: look $phrase up on Wikipedia and send the article text,
+# stripped of wiki markup, with main::pSReply().
+#   $phrase - what the user asked for; one trailing '?' is dropped
+# Returns '' when a required module is missing.  Nothing is sent when the
+# request fails, ends on the search form instead of an article, or the
+# article has no usable text.
+sub wikipedia {
+    return '' if $missing;
+    my ($phrase) = @_;
+    &main::DEBUG("wikipedia($phrase)");
+
+    # chop ? from the end
+    $phrase =~ s/\?$//;
+    # convert phrase to wikipedia conventions
+    $phrase = uri_escape($phrase);
+    $phrase =~ s/%20/+/g;
+
+    # using the search form will make the request case-insensitive
+    # HEAD will follow redirects, catching the first mode of redirects
+    # that wikipedia uses
+    my $url = $wikipedia_search_url . 'search=' . $phrase . '&go=Go';
+    my $req = HTTP::Request->new('HEAD', $url);
+    $req->header('Accept-Language' => 'en');
+
+    my $res = wikipedia_user_agent()->request($req);
+    return unless $res->is_success and $res->code == 200;
+
+    # we have been redirected somewhere
+    # (either content or the generic Search form)
+    $url = $res->request->uri;
+    return if $url =~ m/Special:Search/;
+
+    # we hit content: the article name is whatever follows /wiki/
+    $phrase = $url;
+    $phrase =~ s/.*\/wiki\///;
+
+    my $text = wikipedia_get_text($phrase);
+    return unless defined $text;
+    $text = wikipedia_strip_markup($text);
+    # an article without usable text is not worth a reply
+    return unless length $text;
+
+    # note: the text is not shortened here, the whole article is passed on
+    &main::pSReply("At " . $url . " (URL), Wikipedia explains: " . $text);
+}
+
+# Turn raw wikitext, as returned by wikipedia_get_text(), into one line of
+# plain text.
+#   $text - wikitext, still entity-escaped as it was in the export XML
+# Returns the cleaned text, which may be empty.
+sub wikipedia_strip_markup {
+    my ($text) = @_;
+
+    # filtering unprintables
+    $text =~ s/[[:cntrl:]]//g;
+    # filtering headings
+    $text =~ s/==+[^=]*=+//g;
+    # filtering wikipedia tables
+    $text =~ s/\{\|[^}]+\|\}//g;
+    # HTML entities: undo one level of &amp; escaping first, so that doubly
+    # escaped entities such as &amp;lt; are decoded as well
+    $text =~ s/&amp;/&/g;
+    decode_entities($text);
+    # some people cannot live without HTML tags, even in a wiki
+    $text =~ s/<[^>]*>//g;
+    # filter wikipedia tags: [[abc: def]]
+    $text =~ s/\[\[[[:alpha:]]*:[^]]*\]\]//gi;
+    # {{abc}}:tag
+    $text =~ s/\{\{[[:alpha:]]+\}\}:[^\s]+//gi;
+    # {{abc}}
+    $text =~ s/\{\{[[:alpha:]]+\}\}//gi;
+    # unescape quotes
+    $text =~ s/'''/'/g;
+    $text =~ s/''/"/g;
+    # filter wikipedia links: [[tag|link]] -> link
+    $text =~ s/\[\[[^]]+\|([^]]+)\]\]/$1/g;
+    # [[link]] -> link
+    $text =~ s/\[\[([^]]+)\]\]/$1/g;
+    # shrink whitespace
+    $text =~ s/[[:space:]]+/ /g;
+    # chop leading whitespace
+    $text =~ s/^ //;
+
+    return $text;
+}
+
+# Fetch an article through Special:Export and return its wikitext.
+#   $article - article name as used in the URL after /wiki/
+#   $depth   - optional: number of redirects followed so far (default 0)
+# Line breaks in the article are replaced by spaces.  A page that starts
+# with #REDIRECT [[target]] is replaced by its target, at most
+# $wikipedia_max_redirects times in a row.
+# Returns '' when a required module is missing, and nothing when the
+# request fails or the page has no text.
+sub wikipedia_get_text {
+    return '' if $missing;
+    my ($article, $depth) = @_;
+    $depth ||= 0;
+    &main::DEBUG("wikipedia_get_text($article)");
+
+    my $req = HTTP::Request->new('GET', $wikipedia_export_url . $article);
+    $req->header('Accept-Language' => 'en');
+    $req->header('Accept-Charset' => 'utf-8');
+
+    my $res = wikipedia_user_agent()->request($req);
+    return unless $res->is_success and $res->code == 200;
+
+    # Only the body is searched, not the status line and headers.  The
+    # <text> tag may carry attributes and may be closed on the line it was
+    # opened on, so it is matched over the whole document at once.
+    my ($text) = $res->content =~ m{<text\b[^>]*>(.*?)</text>}s;
+    return unless defined $text and length $text;
+
+    if ($text =~ /\A\s*#REDIRECT\s*\[\[(.*?)\]\]/i) {
+        my $redirect = $1;
+        $redirect =~ tr/ /_/;
+        return unless length $redirect;
+        return if $depth >= $wikipedia_max_redirects;
+        return wikipedia_get_text($redirect, $depth + 1);
+    }
+
+    # wikipedia() deletes control characters, so turn line breaks into
+    # spaces here to keep the words around them apart
+    $text =~ tr/\n/ /;
+    return $text;
+}
+
+1;
diff --git a/src/modules.pl b/src/modules.pl
index ce0b5b2..bbce57c 100644
--- a/src/modules.pl
+++ b/src/modules.pl
@@ -52,6 +52,7 @@ if ($@) {
     "userinfo" => "UserInfo.pl",
     "weather" => "Weather.pl",
     "whatis" => "WhatIs.pl",
+    "wikipedia" => "wikipedia.pl",
     "wingate" => "Wingate.pl",
     "wwwsearch" => "W3Search.pl",
     "zfi" => "zfi.pl",
-- 
2.39.5