1 # This program is distributed under the same terms as blootbot.
# Base URLs for all Wikipedia lookups in this plugin.
# Updated endpoint: the old http://www.wikipedia.org portal no longer serves
# /wiki/ article paths, Special:Search or Special:Export directly; the
# English-language host over HTTPS does.  English is consistent with the
# hard-coded "Accept-Language: en" request headers used below.
# NOTE(review): the httpProxy param elsewhere in this file is registered for
# the 'http' scheme only -- confirm proxy users also proxy https.
my $wikipedia_base_url   = 'https://en.wikipedia.org/wiki/';
my $wikipedia_search_url = $wikipedia_base_url . 'Special:Search?';
my $wikipedia_export_url = $wikipedia_base_url . 'Special:Export/';
# utility functions for encoding the wikipedia request
# Optional dependencies are probed with string-eval so the bot still loads
# when a module is absent; the lines following each eval (not shown in this
# view) presumably inspect $@ and bump $missing -- TODO confirm.  The
# string-eval form is deliberate here: a bare "use" would be fatal at
# compile time if the module is not installed.
eval "use URI::Escape";      # provides uri_escape() used below
eval "use LWP::UserAgent";   # HTTP client; also loads HTTP::Request
eval "use HTML::Entities";   # provides decode_entities() used below
# --- body of the wikipedia() command handler; the "sub wikipedia {" header
# --- and several interior lines are outside this view.
# Give up silently if any optional module above failed to load.
# NOTE(review): $missing and $phrase are declared/assigned in lines not
# shown here -- confirm against the full file.
return '' if $missing;

&main::DEBUG("wikipedia($phrase)");

my $ua = new LWP::UserAgent;
$ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));

# Masquerade as a browser; some servers refuse the default libwww agent.
$ua->agent("Mozilla/5.0 " . $ua->agent);

# convert phrase to wikipedia conventions
$phrase = uri_escape($phrase);

# using the search form will make the request case-insensitive
# HEAD will follow redirects, catching the first mode of redirects
my $url = $wikipedia_search_url . 'search=' . $phrase . '&go=Go';
my $req = HTTP::Request->new('HEAD', $url);
$req->header('Accept-Language' => 'en');

my $res = $ua->request($req);
# &main::DEBUG($res->code);

if ($res->is_success) {
# we have been redirected somewhere
# (either content or the generic Search form)
# let's find the title of the article
$url = $res->request->uri;
# Strip everything up to and including "/wiki/" to recover the article
# title.  NOTE(review): a hidden line just above this presumably copies
# the final URL into $phrase first -- confirm.
$phrase =~ s/.*\/wiki\///;

# A 200 on a non-Search page means we landed on real article content.
if ($res->code == '200' and $url !~ m/Special:Search/ ) {
# we hit content, let's retrieve it
my $text = wikipedia_get_text($phrase);

# filtering unprintables
$text =~ s/[[:cntrl:]]//g;
# strip section headings like "== History ==" down to nothing
$text =~ s/==+[^=]*=+//g;
# filtering wikipedia tables
&main::DEBUG("START:\n" . $text . " :END");
# drop "{| ... |}" wiki-table markup (non-greedy via negated class)
$text =~ s/\{\|[^}]+\|\}//g;
# some people cannot live without HTML tags, even in a wiki
# $text =~ s/<div.*>//gi;
# $text =~ s/<!--.*>//gi;
# $text =~ s/<[^>]*>//g;
# turn &amp; &lt; etc. back into plain characters before tag-stripping
decode_entities($text);
# strip any remaining inline HTML tags
$text =~ s/<[^>]*>//g;
#$text =~ s/[&#]+[0-9a-z]+;//gi;
# filter wikipedia tags: [[abc: def]]
$text =~ s/\[\[[[:alpha:]]*:[^]]*\]\]//gi;
# drop "{{template}}:word" constructs
$text =~ s/\{\{[[:alpha:]]+\}\}:[^\s]+//gi;
# drop bare "{{template}}" inclusions
$text =~ s/\{\{[[:alpha:]]+\}\}//gi;

# filter wikipedia links: [[tag|link]] -> link
$text =~ s/\[\[[^]]+\|([^]]+)\]\]/$1/g;
# and plain [[link]] -> link
$text =~ s/\[\[([^]]+)\]\]/$1/g;
# collapse all runs of whitespace to a single space
$text =~ s/[[:space:]]+/ /g;
# chop leading whitespace

# shorten article to first one or two sentences
# $text = substr($text, 0, 330);
# $text =~ s/(.+)\.([^.]*)$/$1./g;

&main::pSReply("At " . $url . " (URL), Wikipedia explains: " . $text);
# Fetch the raw wikitext of an article via Special:Export, following
# #REDIRECT pages recursively.  Returns '' when the optional modules are
# unavailable.  NOTE(review): this sub continues past the visible lines
# (its closing brace and some interior lines are not shown).
sub wikipedia_get_text {
return '' if $missing;

# NOTE(review): $article is presumably unpacked from @_ on a hidden line
# just above this -- confirm.
&main::DEBUG("wikipedia_get_text($article)");

my $ua = new LWP::UserAgent;
$ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));

# Masquerade as a browser; some servers refuse the default libwww agent.
$ua->agent("Mozilla/5.0 " . $ua->agent);

# Special:Export/<article> returns the page source wrapped in XML.
# NOTE(review): this statement continues on a hidden line (the article
# argument and closing paren are not visible here).
my $req = HTTP::Request->new('GET', $wikipedia_export_url .
$req->header('Accept-Language' => 'en');
$req->header('Accept-Charset' => 'utf-8');

my $res = $ua->request($req);
my ($title, $redirect, $text);
# &main::DEBUG($res->code);

if ($res->is_success) {
if ($res->code == '200' ) {
# crude line-by-line scrape of the export XML: pick out the
# <title>, any #REDIRECT target, and everything inside <text>
foreach (split(/\n/, $res->as_string)) {
if (/<title>(.*?)<\/title>/) {
# unescape ampersands in the title
$title =~ s/&\;/&/g;
} elsif (/#REDIRECT\s*\[\[(.*?)\]\]/) {
# wiki article names use underscores instead of spaces
$redirect =~ tr/ /_/;
} elsif (/<text>(.*)/) {
} elsif (/(.*)<\/text>/) {
# closing line of the body: keep the part before </text>
$text = $text . " " . $1;
# accumulate body lines between <text> and </text>
$text = $text . " " . $_;

# neither a redirect nor body text was found
if (!$redirect and !$text) {
# return the text, or chase the redirect one level deeper
return ($text or wikipedia_get_text($redirect))