1 # This program is distributed under the same terms as infobot.
# Base URL under which articles and special pages are addressed.
8 my $wikipedia_base_url = 'http://www.wikipedia.org/wiki/';
# Search endpoint; wikipedia_lookup appends 'search=<phrase>&go=Go' after the '?'.
9 my $wikipedia_search_url = $wikipedia_base_url . 'Special:Search?';
# Export endpoint; wikipedia_get_text appends the article name after the '/'.
10 my $wikipedia_export_url = $wikipedia_base_url . 'Special:Export/';
14 # utility functions for encoding the wikipedia request
# The modules below are loaded through string eval, so a failing "use" is
# trapped (reported in $@) instead of aborting compilation of this file.
# NOTE(review): the handling of a failed load is not visible here; presumably
# it sets $missing, which the subs below test before doing any work -- confirm.
15 eval "use URI::Escape";
20 eval "use LWP::UserAgent";
25 eval "use HTML::Entities";
# Fragment of a sub whose header and closing lines are not shown here.
# Returns '' straight away when $missing is true; otherwise hands its
# arguments to wikipedia_lookup() and replies through &::performStrictReply.
32 return '' if $missing;
# The lookup result is unpacked as a two-element list: reply text and a flag.
34 my ( $reply, $valid_result ) = wikipedia_lookup(@_);
36 &::performStrictReply($reply);
# Alternative reply telling the user the phrase was not found.
# NOTE(review): the condition choosing between the two replies, and where
# $phrase is assigned, are not visible in this fragment -- confirm.
39 &::performStrictReply(
40 "'$phrase' not found in Wikipedia. Perhaps try a different spelling or case?"
# wikipedia_silent(@args): look a phrase up and reply only on success.
#
# Returns '' immediately when $missing is true. Otherwise the arguments are
# passed unchanged to wikipedia_lookup(), and &::performStrictReply is called
# with the reply text only if both the validity flag and the text are true.
45 sub wikipedia_silent {
46 return '' if $missing;
47 my ( $reply, $valid_result ) = wikipedia_lookup(@_);
# Stay quiet unless the lookup flagged a valid result with a true reply text.
48 if ( $valid_result and $reply ) {
49 &::performStrictReply($reply);
# wikipedia_lookup: resolve a phrase to a Wikipedia article and build a
# plain-text reply from the article's wiki source.
#
# Steps visible in this sub:
#   1. send a HEAD request to the search form and let LWP follow redirects;
#   2. take the final request URI and cut the article title out of it;
#   3. fetch the article source through wikipedia_get_text();
#   4. strip control characters, headings, tables, entities, HTML tags,
#      wiki tags/templates and link markup, then collapse whitespace.
#
# Returns a list whose first element is the reply text. The visible message
# fragments show that failures yield a "temporarily unavailable" or
# "couldn't find a matching article" text instead of article content.
53 sub wikipedia_lookup {
55 &::DEBUG("wikipedia($phrase)");
57 my $ua = new LWP::UserAgent;
58 $ua->proxy( 'http', $::param{'httpProxy'} ) if ( &::IsParam('httpProxy') );
# Prefix LWP's default agent string with a browser-like token.
61 $ua->agent( "Mozilla/5.0 " . $ua->agent );
67 # convert phrase to wikipedia conventions
68 # $phrase = uri_escape($phrase);
69 # $phrase =~ s/%20/+/g;
70 # $phrase =~ s/%25/%/g;
73 # using the search form will make the request case-insensitive
74 # HEAD will follow redirects, catching the first mode of redirects
# NOTE(review): $phrase is concatenated into the query string as-is in the
# lines visible here (the uri_escape call above is commented out) -- confirm
# that it is sanitised in the lines not shown.
76 my $url = $wikipedia_search_url . 'search=' . $phrase . '&go=Go';
77 my $req = HTTP::Request->new( 'HEAD', $url );
78 $req->header( 'Accept-Language' => 'en' );
81 my $res = $ua->request($req);
82 &::DEBUG( $res->code );
# Any non-2xx final response is reported as a temporary outage.
84 if ( !$res->is_success ) {
86 "Wikipedia might be temporarily unavailable ("
88 . "). Please try again in a few minutes...",
94 # we have been redirected somewhere
95 # (either content or the generic Search form)
96 # let's find the title of the article
97 $url = $res->request->uri;
# Drop everything up to and including '/wiki/', leaving the article title.
99 $phrase =~ s/.*\/wiki\///;
# Compare the status code itself. The former test, !$res->code == '200',
# negated the code first (giving '' or 1) and therefore could never be true.
101 if ( $res->code != 200 ) {
103 "Wikipedia might be temporarily unavailable or something is broken ("
105 . "). Please try again later...",
110 if ( $url =~ m/Special:Search/ ) {
112 # we were sent to the search page
114 "I couldn't find a matching article in wikipedia, look for yerselves: "
121 # we hit content, let's retrieve it
122 my $text = wikipedia_get_text($phrase);
124 # filtering unprintables
125 $text =~ s/[[:cntrl:]]//g;
# Remove section headings of the form "== title ==".
128 $text =~ s/==+[^=]*=+//g;
130 # filtering wikipedia tables
131 $text =~ s/\{\|[^}]+\|\}//g;
133 # some people cannot live without HTML tags, even in a wiki
134 # $text =~ s/<div.*>//gi;
135 # $text =~ s/<!--.*>//gi;
136 # $text =~ s/<[^>]*>//g;
# Undo the escaping of '&' first, so that doubly escaped entities such as
# "&amp;nbsp;" are fully decoded by decode_entities below. The former
# pattern replaced '&' with itself and so had no effect.
138 $text =~ s/&amp;/&/g;
139 decode_entities($text);
# Strip HTML tags, including those that only appeared after entity decoding.
142 $text =~ s/<[^>]*>//g;
144 #$text =~ s/[&#]+[0-9a-z]+;//gi;
145 # filter wikipedia tags: [[abc: def]]
146 $text =~ s/\[\[[[:alpha:]]*:[^]]*\]\]//gi;
# Remove "{{name}}:value" template references ...
149 $text =~ s/\{\{[[:alpha:]]+\}\}:[^\s]+//gi;
# ... and then any remaining bare "{{name}}" templates.
152 $text =~ s/\{\{[[:alpha:]]+\}\}//gi;
158 # filter wikipedia links: [[tag|link]] -> link
159 $text =~ s/\[\[[^]]+\|([^]]+)\]\]/$1/g;
# Plain links: [[link]] -> link
162 $text =~ s/\[\[([^]]+)\]\]/$1/g;
# Collapse every run of whitespace into a single space.
165 $text =~ s/[[:space:]]+/ /g;
167 # chop leading whitespace
170 # shorten article to first one or two sentences
171 # new: we rely on the output function to know what to do
173 #$text = substr($text, 0, 330);
174 #$text =~ s/(.+)\.([^.]*)$/$1./g;
176 return ( 'At ' . $url . " (URL), Wikipedia explains: " . $text,
# wikipedia_get_text($article): download an article through Special:Export
# and return the contents of its <text> element.
#
# Returns:
#   - '' when $missing is true;
#   - the collected text (a closing double quote is appended after the last
#     line) when a <text> element was found;
#   - the result of calling itself on the redirect target when the page is a
#     #REDIRECT and no text was collected;
#   - the whole response as a string when neither text nor redirect was seen.
#
# NOTE(review): no limit on redirect depth is visible here, so a redirect
# loop would recurse without bound -- confirm against the lines not shown.
183 sub wikipedia_get_text {
184 return '' if $missing;
186 &::DEBUG("wikipedia_get_text($article)");
188 my $ua = new LWP::UserAgent;
189 $ua->proxy( 'http', $::param{'httpProxy'} ) if ( &::IsParam('httpProxy') );
# Prefix LWP's default agent string with a browser-like token.
192 $ua->agent( "Mozilla/5.0 " . $ua->agent );
195 &::DEBUG( $wikipedia_export_url . $article );
196 my $req = HTTP::Request->new( 'GET', $wikipedia_export_url . $article );
197 $req->header( 'Accept-Language' => 'en' );
198 $req->header( 'Accept-Charset' => 'utf-8' );
200 my $res = $ua->request($req);
201 my ( $title, $redirect, $text );
202 &::DEBUG( $res->code );
# Only a plain 200 response is scanned, one line at a time.
204 if ( $res->is_success ) {
205 if ( $res->code == '200' ) {
206 foreach ( split( /\n/, $res->as_string ) ) {
207 if (/<title>(.*?)<\/title>/) {
# Unescape '&' in the title. The former pattern only matched the literal
# two-character sequence "&;" and never the "&amp;" entity.
209 $title =~ s/&amp;/&/g;
# A redirect page names its target inside [[...]].
211 elsif (/#REDIRECT\s*\[\[(.*?)\]\]/i) {
# Article names use underscores in place of spaces.
213 $redirect =~ tr/ /_/;
214 &::DEBUG( 'wiki redirect to ' . $redirect );
# First line of the article body.
217 elsif (/<text[^>]*>(.*)/) {
# Last line of the article body: append it and close the quotation.
220 elsif (/(.*)<\/text>/) {
221 $text = $text . ' ' . $1 . '"';
# Any other line is appended to the text collected so far.
225 $text = $text . ' ' . $_;
228 &::DEBUG( "wikipedia returned text: " . $text
# Nothing usable was parsed: hand back the raw response instead.
233 if ( !$redirect and !$text ) {
234 return ( $res->as_string );
# Prefer the collected text; otherwise follow the redirect recursively.
236 return ( $text or wikipedia_get_text($redirect) );
244 # vim:ts=4:sw=4:expandtab:tw=80