1 # This program is distributed under the same terms as blootbot.
# Base URL under which Wikipedia pages are addressed; the two URLs below
# are built by appending a Special: page name to it.
7 my $wikipedia_base_url = 'http://www.wikipedia.org/wiki/';
# Search form endpoint; the query string is appended after the '?'.
8 my $wikipedia_search_url = $wikipedia_base_url . 'Special:Search?';
# Export endpoint; the article name is appended after the trailing '/'.
9 my $wikipedia_export_url = $wikipedia_base_url . 'Special:Export/';
12 # utility functions for encoding the wikipedia request
# Each module is loaded through a string eval, so a module that cannot be
# loaded leaves its error in $@ instead of aborting compilation of this file.
# NOTE(review): the subs in this file test $missing, but the code that sets
# it after a failed eval is not visible in this listing -- confirm it exists.
13 eval "use URI::Escape";
18 eval "use LWP::UserAgent";
23 eval "use HTML::Entities";
# NOTE(review): these lines look like the body of the reporting entry point;
# its `sub` line and the branch structure around the two pSReply calls are
# not visible in this listing.
# Do nothing when $missing is set.
30 return '' if $missing;
# wikipedia_lookup hands back the reply text and a validity flag.
32 my ($reply, $valid_result) = wikipedia_lookup(@_);
# Pass the reply produced by the lookup to the output function.
34 &main::pSReply($reply);
# Fallback message that names the phrase which could not be found.
# NOTE(review): the assignment of $phrase is not visible here.
36 &main::pSReply("'$phrase' not found in Wikipedia. Perhaps try a different spelling or case?");
# Look a phrase up and report the answer only when the lookup flagged the
# result as valid and produced a non-empty reply.
# All arguments are passed through unchanged to wikipedia_lookup.
40 sub wikipedia_silent {
# Do nothing when $missing is set.
41 return '' if $missing;
42 my ($reply, $valid_result) = wikipedia_lookup(@_);
# Both the validity flag and a non-empty reply are required before anything
# is sent to the output function.
43 if ($valid_result and $reply) {
44 &main::pSReply($reply);
# Look a phrase up on Wikipedia and build the reply text.
#
# The phrase is sent to the search form with a HEAD request; the final URL
# after redirects tells whether an article was hit or the generic search page.
# On a hit, the article text is fetched with wikipedia_get_text and stripped
# of wiki/HTML markup.
#
# Every return below starts a list whose first element is the reply string;
# callers unpack the result as ($reply, $valid_result).
# NOTE(review): the unpacking of @_ into $phrase and the second element of
# each return are not visible in this listing.
48 sub wikipedia_lookup {
50 &main::DEBUG("wikipedia($phrase)");
# Direct method call rather than the indirect-object form `new Class`,
# which Perl can mis-parse.
52 my $ua = LWP::UserAgent->new;
53 $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));
# Prefix the default agent string with "Mozilla/5.0 ".
55 $ua->agent("Mozilla/5.0 " . $ua->agent);
60 # convert phrase to wikipedia conventions
61 # $phrase = uri_escape($phrase);
62 # $phrase =~ s/%20/+/g;
63 # $phrase =~ s/%25/%/g;
# NOTE(review): with the escaping above commented out, $phrase goes into the
# query string verbatim; characters such as '&' or '#' would alter the
# request. Confirm whether the caller escapes the phrase.
66 # using the search form will make the request case-insensitive
67 # HEAD will follow redirects, catching the first mode of redirects
69 my $url = $wikipedia_search_url . 'search=' . $phrase . '&go=Go';
70 my $req = HTTP::Request->new('HEAD', $url);
71 $req->header('Accept-Language' => 'en');
74 my $res = $ua->request($req);
75 &main::DEBUG($res->code);
77 if (!$res->is_success) {
78 return("Wikipedia might be temporarily unavailable (".$res->code."). Please try again in a few minutes...",
81 # we have been redirected somewhere
82 # (either content or the generic Search form)
83 # let's find the title of the article
84 $url = $res->request->uri;
# Keep only what follows "/wiki/".
86 $phrase =~ s/.*\/wiki\///;
# Compare the status code itself: `!` binds tighter than `==`, so writing
# `!$res->code == '200'` compares a negated (boolean) value with 200 and can
# never be true.
88 if ($res->code != 200) {
89 return("Wikipedia might be temporarily unavailable or something is broken (".$res->code."). Please try again later...",
92 if ($url =~ m/Special:Search/) {
93 # we were sent to the search page
94 return("I couldn't find a matching article in wikipedia, look for yerselves: " . $url,
97 # we hit content, let's retrieve it
98 my $text = wikipedia_get_text($phrase);
100 # filtering unprintables
101 $text =~ s/[[:cntrl:]]//g;
# drop ==section headings==
103 $text =~ s/==+[^=]*=+//g;
104 # filtering wikipedia tables
105 $text =~ s/\{\|[^}]+\|\}//g;
106 # some people cannot live without HTML tags, even in a wiki
107 # $text =~ s/<div.*>//gi;
108 # $text =~ s/<!--.*>//gi;
109 # $text =~ s/<[^>]*>//g;
# Undo one level of escaping (&amp; -> &) before decode_entities, so that
# doubly escaped entities such as &amp;lt; end up fully decoded. Without the
# entity name the substitution would replace '&' by '&' and do nothing.
111 $text =~ s/&amp;/&/g;
112 decode_entities($text);
# remove the HTML tags exposed by the decoding above
114 $text =~ s/<[^>]*>//g;
115 #$text =~ s/[&#]+[0-9a-z]+;//gi;
116 # filter wikipedia tags: [[abc: def]]
117 $text =~ s/\[\[[[:alpha:]]*:[^]]*\]\]//gi;
# drop {{name}}:value constructs
119 $text =~ s/\{\{[[:alpha:]]+\}\}:[^\s]+//gi;
# drop bare {{name}} markers
121 $text =~ s/\{\{[[:alpha:]]+\}\}//gi;
125 # filter wikipedia links: [[tag|link]] -> link
126 $text =~ s/\[\[[^]]+\|([^]]+)\]\]/$1/g;
# plain links: [[link]] -> link
128 $text =~ s/\[\[([^]]+)\]\]/$1/g;
# collapse every run of whitespace into a single space
130 $text =~ s/[[:space:]]+/ /g;
131 # chop leading whitespace
134 # shorten article to first one or two sentences
135 # new: we rely on the output function to know what to do
137 #$text = substr($text, 0, 330);
138 #$text =~ s/(.+)\.([^.]*)$/$1./g;
140 return("At " . $url . " (URL), Wikipedia explains: " . $text,
# Fetch an article through the Special:Export URL and return its wiki text.
#
# The response is scanned line by line for the <title> element, a #REDIRECT
# directive and the <text> element. When text was found it is returned;
# otherwise the function calls itself with the redirect target.
# NOTE(review): the unpacking of @_ into $article and several lines inside
# the scanning loop are not visible in this listing.
# NOTE(review): nothing visible here bounds the recursion on redirects, so
# two articles redirecting to each other would loop -- confirm.
147 sub wikipedia_get_text {
# Do nothing when $missing is set.
148 return '' if $missing;
150 &main::DEBUG("wikipedia_get_text($article)");
# Direct method call rather than the indirect-object form `new Class`,
# which Perl can mis-parse.
152 my $ua = LWP::UserAgent->new;
153 $ua->proxy('http', $::param{'httpProxy'}) if (&::IsParam("httpProxy"));
# Prefix the default agent string with "Mozilla/5.0 ".
155 $ua->agent("Mozilla/5.0 " . $ua->agent);
158 &main::DEBUG($wikipedia_export_url . $article);
159 my $req = HTTP::Request->new('GET', $wikipedia_export_url .
161 $req->header('Accept-Language' => 'en');
162 $req->header('Accept-Charset' => 'utf-8');
164 my $res = $ua->request($req);
165 my ($title, $redirect, $text);
166 &main::DEBUG($res->code);
168 if ($res->is_success) {
169 if ($res->code == '200' ) {
# Scan the whole response (as_string, i.e. status line and headers included)
# one line at a time.
170 foreach (split(/\n/, $res->as_string)) {
171 if (/<title>(.*?)<\/title>/) {
# Unescape &amp; in the title. Without the entity name the pattern would only
# match the literal two-character sequence "&;".
173 $title =~ s/&amp\;/&/g;
174 } elsif (/#REDIRECT\s*\[\[(.*?)\]\]/i) {
# Article names in URLs use underscores in place of spaces.
176 $redirect =~ tr/ /_/;
177 &main::DEBUG("wiki redirect to " . $redirect);
179 } elsif (/<text>(.*)/) {
181 } elsif (/(.*)<\/text>/) {
# Last line of the text element: append it followed by a double quote.
182 $text = $text . " " . $1 . '"';
# Any other line is appended to the text, separated by a space.
185 $text = $text . " " . $_;
188 &main::DEBUG("wikipedia returned text: " . $text .
189 ", redirect " . $redirect. "\n");
# Neither text nor a redirect was found: hand back the raw response.
191 if (!$redirect and !$text) {
192 return ($res->as_string);
# Prefer the text; fall back to following the redirect.
194 return ($text or wikipedia_get_text($redirect))