From 8addf484a5b78fe043ac663129be6c05e075f260 Mon Sep 17 00:00:00 2001 From: dondelelcaro Date: Sat, 26 Apr 2008 08:14:57 +0000 Subject: [PATCH] * Merge back with trunk to r1810 git-svn-id: https://svn.code.sf.net/p/infobot/code/branches/don/dpkg@1812 c11ca15a-4712-0410-83d8-924469b57eb5 --- BUGS | 4 + ChangeLog | 39 + VERSION | 2 +- files/infobot.help | 200 ++- files/sample/infobot.chan | 2 + infobot | 87 +- patches/Google.pm | 335 ---- patches/Net_IRC_Connection_pm.patch | 32 - patches/WWW..Search.patch | 444 ----- patches/WWW..Search.patch.old | 31 - patches/WWW_Search.patch | 444 ----- patches/WWW_Search.patch.old | 31 - scripts/backup_table-slave.pl | 95 +- scripts/dbm2mysql.pl | 32 +- scripts/dbm2txt.pl | 12 +- scripts/findparam.pl | 78 +- scripts/fixbadchars.pl | 60 +- scripts/insertDB.pl | 14 +- scripts/irclog2html.pl | 477 ++--- scripts/makepasswd | 4 +- scripts/mysql2txt.pl | 14 +- scripts/oreilly_dumpvar.pl | 36 +- scripts/oreilly_prettyp.pl | 104 +- scripts/parse_warn.pl | 116 +- scripts/showvars.pl | 134 +- scripts/symname.pl | 168 +- scripts/txt2mysql.pl | 57 +- scripts/vartree.pl | 80 +- scripts/webbackup.pl | 91 +- setup/setup.pl | 82 +- src/CLI/Support.pl | 86 +- src/CommandStubs.pl | 1049 +++++------ src/DynaConfig.pl | 1115 ++++++------ src/Factoids/Core.pl | 1028 ++++++----- src/Factoids/DBCommon.pl | 104 +- src/Factoids/Norm.pl | 151 +- src/Factoids/Question.pl | 448 ++--- src/Factoids/Reply.pl | 494 +++--- src/Factoids/Statement.pl | 152 +- src/Factoids/Update.pl | 416 +++-- src/Files.pl | 79 +- src/IRC/Irc.pl | 1128 ++++++------ src/IRC/IrcHelpers.pl | 493 +++--- src/IRC/IrcHooks.pl | 1522 ++++++++-------- src/IRC/Schedulers.pl | 1445 ++++++++------- src/Misc.pl | 726 ++++---- src/Modules/BZFlag.pl | 698 ++++---- src/Modules/Debian.pl | 1552 ++++++++-------- src/Modules/DebianBugs.pm | 205 ++- src/Modules/DebianExtra.pl | 217 ++- src/Modules/Dict.pl | 283 +-- src/Modules/DumpVars.pl | 165 +- src/Modules/DumpVars2.pl | 49 +- src/Modules/Exchange.pl | 683 +++---- src/Modules/Factoids.pl | 1446 +++++++-------- src/Modules/HTTPDtype.pl | 24 +- src/Modules/Kernel.pl | 95 +- src/Modules/Math.pl | 187 +- src/Modules/News.pl | 1468 ++++++++-------- src/Modules/OnJoin.pl | 227 +-- src/Modules/Plug.pl | 63 +- src/Modules/Quote.pl | 38 +- src/Modules/RSSFeeds.pl | 219 +-- src/Modules/RootWarn.pl | 146 +- src/Modules/Rss.pl | 14 +- src/Modules/Search.pl | 31 +- src/Modules/Topic.pl | 583 +++--- src/Modules/Units.pl | 33 +- src/Modules/Uptime.pl | 116 +- src/Modules/UserDCC.pl | 2538 ++++++++++++++------------- src/Modules/UserInfo.pl | 221 +-- src/Modules/W3Search.pl | 59 +- src/Modules/Weather.pl | 196 ++- src/Modules/Wingate.pl | 102 +- src/Modules/Zippy.pl | 14 +- src/Modules/babelfish.pl | 188 +- src/Modules/botmail.pl | 138 +- src/Modules/case.pl | 9 +- src/Modules/countdown.pl | 160 +- src/Modules/dice.pl | 53 +- src/Modules/dns.pl | 51 +- src/Modules/hex2ip.pl | 11 +- src/Modules/insult.pl | 26 +- src/Modules/md5.pl | 4 +- src/Modules/nickometer.pl | 408 ++--- src/Modules/pager.pl | 153 +- src/Modules/piglatin.pl | 65 +- src/Modules/reverse.pl | 4 +- src/Modules/scramble.pl | 83 +- src/Modules/slashdot.pl | 64 +- src/Modules/spell.pl | 118 +- src/Modules/upsidedown.pl | 158 ++ src/Modules/wikipedia.pl | 386 ++-- src/Modules/wtf.pl | 57 +- src/Modules/zfi.pl | 99 +- src/Modules/zsi.pl | 99 +- src/Net.pl | 258 +-- src/Process.pl | 561 +++--- src/Shm.pl | 371 ++-- src/UserExtra.pl | 1109 ++++++------ src/core.pl | 698 ++++---- src/dbi.pl | 827 +++++---- src/logger.pl | 544 +++--- src/modules.pl | 386 ++-- 104 files changed, 16581 insertions(+), 15920 deletions(-) delete mode 100644 patches/Google.pm delete mode 100644 patches/Net_IRC_Connection_pm.patch delete mode 100644 patches/WWW..Search.patch delete mode 100644 patches/WWW..Search.patch.old delete mode 100644 patches/WWW_Search.patch delete mode 100644 patches/WWW_Search.patch.old create mode 100644 src/Modules/upsidedown.pl diff --git a/BUGS b/BUGS index 48d079c..79763d4 100644 --- a/BUGS +++ b/BUGS @@ -17,4 +17,8 @@ Known bugs that should be dealt with soon as possible: * Bot can be flooded offline with a crash if !+factstats help and /msg nick factstats help, are used at the same time + * Wildcards in --HOSTS section has problems. + eg: nick!*@foo.*.someisp.com doesnt recognize for removing factoids but: + nick!foobar@*.someisp.com does + # vim:ts=4:sw=4:expandtab:tw=80 diff --git a/ChangeLog b/ChangeLog index e69de29..4864c93 100644 --- a/ChangeLog +++ b/ChangeLog @@ -0,0 +1,39 @@ +1.5.2 +===== + +* Correction to factoid updates to treat appending as a modification + +* Code formatting cleanups + +* Removed unmaintained patches directory + +* Changed +chan to chanadd + +* Changed -chan to chandel + +* Changed +ban to banadd + +* Changed -ban to bandel + +* Changed +host to hostadd + +* Changed -host to hostdel + +* Changed adduser to useradd + +* Changed deluser to userdel + +1.5.1 +===== + +* Fixed bug in factoid modification code that prevented matching against +created_by properly + +* New +M flag to allow modifying factoids created by same nick + +1.5.0 +===== + +* Rebranding from blootbot + +# vim:ts=4:sw=4:expandtab:tw=80 diff --git a/VERSION b/VERSION index bc80560..4cda8f1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.5.0 +1.5.2 diff --git a/files/infobot.help b/files/infobot.help index c0370ee..5b3853c 100644 --- a/files/infobot.help +++ b/files/infobot.help @@ -2,76 +2,32 @@ # Author: Tim Riker ### +# Special entry main: I learn mainly by observing declarative statements such as "x is at http://www.xxx.com", and then reply when people ask things like "where can i find x?" action: This is used to override the usual response. "x is does the hokey-pokey". When asked about x, the bot does this "* infobot does the hokey-pokey" +addressing: It is a good idea if I stay in REQUIRE mode so that I won't yell out random crap if I listen in too hard. Currently there is no way to turn this off on-the-fly. (REQUIRE mode requires me to be addressed by name if I am to respond) + alternation: The || symbol in an entry causes an infobot to choose one of the replies at random. "X is Y||Z" will produce "X is Y" or "X is Z" randomly. author: oznoid (mailto:lenzo@ri.cmu.edu) is my original author. -dollar variables: D: To be used in factoids -dollar variables: $Fdunno - ... -dollar variables: $Fquestion - ... -dollar variables: $Fupdate - ... -dollar variables: $channel - channel from which the factoid was requested -dollar variables: $date - current date (GMT) -dollar variables: $day - day of week (full name, locale) -dollar variables: $factoids - factoid count -dollar variables: $host - hostname of factoid requester -dollar variables: $ident - bot nick -dollar variables: $lastspeaker - ... -dollar variables: $memusage - ... -dollar variables: $rand - random number, also $rand100.2 -dollar variables: $randnick - random nick -dollar variables: $startTime - start time -dollar variables: $time - current time (GMT) -dollar variables: $uptime - ... -dollar variables: $user - username of factoid requester -dollar variables: $who - nick of factoid requester - -corrections: If I come back with "...but x is at http://xx.xx.xx" or something like that, and you want to change the entry, use "no, x is at http://sdfsdfsdf". The "No," tells me to supercede the existing value. -corrections: you can append stuff to a factoid with "also". "x is also at ..." - -math: D: math expresions can be evaluated. This uses Perl syntax. -math: E: 1+1 -math: + - add -math: - - subtract -math: * - multiply -math: / - division -math: ** - to the power -math: pi - pi -math: & - and -math: | = or -math: ^ - xor - -redirection: If a factoid x contains simply " see y", then when asked for x, I will deliver factoidor command result y instead. - -reply: There is a special tag, , that is used to override the usual response. Usually, a response is "X is Y", but it can be made "Y" by making the entry "X is Y". - -# now the commands... - -adduser: D: Administrative command to add new user to the .users file -adduser: U: ## -adduser: E: ## bloot bloot!bloot@example.com - -addressing: It is a good idea if I stay in REQUIRE mode so that I won't yell out random crap if I listen in too hard. Currently there is no way to turn this off on-the-fly. (REQUIRE mode requires me to be addressed by name if I am to respond) - babelfish: D: Frontend to babelfish translating service provided by http://babelfish.altavista.com/ Note that utf8 is used for non-ascii characters. babelfish: U: x babelfish: U: translate babelfish: E: x en de your cars rock --ban: D: FIXME: --ban: U: ## --ban: E: ## *!*@owns.org --ban: E: ## MoronMan +bandel: D: FIXME: +bandel: U: ## +bandel: E: ## *!*@owns.org +bandel: E: ## MoronMan -+ban: D: FIXME: -+ban: U: ## [chan] [time] [reason] -+ban: E: ## *!*@owns.org #bots 60 stop flooding. -+ban: E: ## *!*@*microsoft.com STOOPID -+ban: E: ## MoronMan +banadd: D: FIXME: +banadd: U: ## [chan] [time] [reason] +banadd: E: ## *!*@owns.org #bots 60 stop flooding. +banadd: E: ## *!*@*microsoft.com STOOPID +banadd: E: ## MoronMan botmail: D: Send someone botmail botmail: U: ## {for [:] }|stats|check|read @@ -80,13 +36,13 @@ botmail: E: ## stats botmail: E: ## check botmail: E: ## read --chan: D: Leave a channel permanently --chan: U: ## -#channel --chan: E: ## -#botpark +chanadd: D: Join a channel permanently +chanadd: U: ## #channel +chanadd: E: ## #botpark -+chan: D: Join a channel permanently -+chan: U: ## #channel -+chan: E: ## #botpark +chandel: D: Leave a channel permanently +chandel: U: ## -#channel +chandel: E: ## -#botpark chaninfo: D: Display channel statistics on Op, Ban, Deop, Unban, Part, Join, SignOff, PublicMsg, Kick and Topic chaninfo: U: ## [#channel] @@ -129,6 +85,9 @@ contents: E: ## x11amp potato cookie: I can feed your appetite with random factoids. +corrections: If I come back with "...but x is at http://xx.xx.xx" or something like that, and you want to change the entry, use "no, x is at http://sdfsdfsdf". The "No," tells me to supercede the existing value. +corrections: you can append stuff to a factoid with "also". "x is also at ..." + cpustats: cpustats dumps the bot's cpu usage this session crypt: It's good that you thought about encryption. I can do it for you. @@ -149,10 +108,6 @@ dauthor: E: ## Wichert potato dbugs: D: Show the current count of release critical bugs (latest versions) dbugs: U: ## -deluser: D: Administrative command to remove a user from the .users file -deluser: U: ## -deluser: E: ## bloot - ddesc: D: Search the Description: lines in Debian packages ddesc: U: ## [dist] ddesc: E: ## mule @@ -176,6 +131,26 @@ dns: E: ## 3.1.33.7 do: D: operator command to do things in a channel do: U: ## +dollar variables: D: To be used in factoids +dollar variables: $Fdunno - ... +dollar variables: $Fquestion - ... +dollar variables: $Fupdate - ... +dollar variables: $channel - channel from which the factoid was requested +dollar variables: $date - current date (GMT) +dollar variables: $day - day of week (full name, locale) +dollar variables: $factoids - factoid count +dollar variables: $host - hostname of factoid requester +dollar variables: $ident - bot nick +dollar variables: $lastspeaker - ... +dollar variables: $memusage - ... +dollar variables: $rand - random number, also $rand100.2 +dollar variables: $randnick - random nick +dollar variables: $startTime - start time +dollar variables: $time - current time (GMT) +dollar variables: $uptime - ... +dollar variables: $user - username of factoid requester +dollar variables: $who - nick of factoid requester + dstats: D: Show basic stats on the current size of the Debian distros dstats: U: ## [dist] dstats: E: ## @@ -208,6 +183,22 @@ factstats: == unrequest -- unrequested factoids. factstats: == vandalism -- ?? factstats: E: ## new +flags: D: Flags for chattr command +flags: D: "A" - bot administration over /msg (default is only via DCC CHAT) +flags: D: "O" - dynamic ops (as on channel). (automatic +o) +flags: D: "T" - add topics. +flags: D: "a" - ask/request factoid. +flags: D: "m" - modify all factoids. (includes renaming) +flags: D: "M" - modify own factoids. (includes renaming) +flags: D: "n" - bot owner, can "reload" +flags: D: "o" - master of bot (automatic +amrt) +flags: D: - can search on factoid strings shorter than 2 chars +flags: D: - can tell bot to join new channels +flags: D: - can [un]lock factoids +flags: D: "r" - remove factoid. +flags: D: "t" - teach/add factoid. +flags: D: "s" - Bypass +silent on channels + forget: If I have an old/redundant factoid x, "forget x" will cause me to erase it. freshmeat: D: Frontend to www.freshmeat.net @@ -218,6 +209,21 @@ hex: D: Convert ascii to hex hex: U: ## hex: E: ## carrot +hex2ip: D: Convert Hex idents for some gateways to an IP address +hex2ip: U: ## <8 char hex value> +hex2ip: E: ## AabBcC12 + +hostadd: D: admin command to list or add hostmasks to a user account +hostadd: U: ## [user] [] +hostadd: E: ## owner +hostadd: E: ## *!*@owns.org +hostadd: E: ## owner leet!leet@*.heh.org + +hostdel: D: admin command to remove hostmask from a user account +hostdel: U: ## [user] +hostdel: E: ## *!*@owns.org +hostdel: E: ## owner leet!leet@*.heh.org + httpdtype: D: Get httpd server software version / configuration httpdtype: U: ## httpdtype: E: ## example.com @@ -275,6 +281,18 @@ lock: E: ## abuse lock: N: By default, only registered "ops" on the bots or factoids matching the user's nick are able to lock factoids. lock: N: Requires factoid extension (extra) support enabled. +math: D: math expresions can be evaluated. This uses Perl syntax. +math: E: 1+1 +math: + - add +math: - - subtract +math: * - multiply +math: / - division +math: ** - to the power +math: pi - pi +math: & - and +math: | = or +math: ^ - xor + md5: D: calculates the md5sum of a given string md5: U: ## md5: E: ## When will infobot achieve world domination? @@ -349,10 +367,14 @@ quote: D: Frontend to yahoo's online stock market share listing quote: U: ## quote: E: ## RHAT,MSFT +redirection: If a factoid x contains simply " see y", then when asked for x, I will deliver factoidor command result y instead. + rename: D: Factoid renaming rename: U: ## 'from' 'to' rename: E: ## 'infobot' 'infobot' +reply: There is a special tag, , that is used to override the usual response. Usually, a response is "X is Y", but it can be made "Y" by making the entry "X is Y". + reverse: D: reverses a given string reverse: U: ## reverse: E: ## When will infobot achieve world domination? @@ -444,11 +466,27 @@ unlock: D: Factoid unlocking to allow removal by others. unlock: U: ## unlock: E: ## abuse +upsidedown: D: display a string in pseudo upside down unicode text +upsidedown: U: ## +upsidedown: E: ## When will infobot achieve world domination? + uptime: D: Show the current uptime, and the top 3 uptimes recorded uptime: U: ## +useradd: D: Administrative command to add new user to the .users file +useradd: U: ## +useradd: E: ## SomeAccount SomeAccount!someguy@example.com + +userdel: D: Administrative command to remove a user from the .users file +userdel: U: ## +userdel: E: ## SomeAccount + wantnick: If someone's taken my nick (I hope not) and I'm using some temporary nick, I can change back to my original nick if it's not taken (again). +whois: D: List available information for an account on the bot +whois: U: ## +whois: E: ## SomeAccount + wikipedia: D: Frontend to the Wikipedia at http://www.wikipedia.org/wiki/ Note that utf8 is used for non-ascii characters. wikipedia: U: ## wikipedia: U: wiki @@ -458,40 +496,10 @@ wtf: D: Interface to the BSD wtf command wtf: U: ## wtf: E: ## iirc --host: D: admin command to remove hostmask from a user account --host: U: ## [user] --host: E: ## *!*@owns.org --host: E: ## owner leet!leet@*.heh.org - -+host: D: admin command to list or add hostmasks to a user account -+host: U: ## [user] [] -+host: E: ## owner -+host: E: ## *!*@owns.org -+host: E: ## owner leet!leet@*.heh.org - -flags: D: Flags for chattr command -flags: D: "A" - bot administration over /msg (default is only via DCC CHAT) -flags: D: "O" - dynamic ops (as on channel). (automatic +o) -flags: D: "T" - add topics. -flags: D: "a" - ask/request factoid. -flags: D: "m" - modify factoid. (includes renaming) -flags: D: "n" - bot owner, can "reload" -flags: D: "o" - master of bot (automatic +amrt) -flags: D: - can search on factoid strings shorter than 2 chars -flags: D: - can tell bot to join new channels -flags: D: - can [un]lock factoids -flags: D: "r" - remove factoid. -flags: D: "t" - teach/add factoid. -flags: D: "s" - Bypass +silent on channels - rssfeeds: D: rssfeeds is used to control the RSS Feed tracking module rssfeeds: U: rssfeeds [command] rssfeeds: E: rssfeeds flush rssfeeds: D: flush - Will erase the cache file. (Must be chattr +o) rssfeeds: D: update - Force a manual update of the feeds. (Must be chattr +o) -hex2ip: D: Convert Hex idents for some gateways to an IP address -hex2ip: U: ## <8 char hex value> -hex2ip: E: ## AabBcC12 - # vim:ts=4:sw=4:expandtab:tw=80 diff --git a/files/sample/infobot.chan b/files/sample/infobot.chan index c721138..adb5dc1 100644 --- a/files/sample/infobot.chan +++ b/files/sample/infobot.chan @@ -99,6 +99,8 @@ _default +slashdot +spell +tell + +upsidedown + +wikipedia +wtf +zfi +zsi diff --git a/infobot b/infobot index f20af53..7fc8989 100755 --- a/infobot +++ b/infobot @@ -7,27 +7,28 @@ use strict; use vars qw($bot_base_dir $bot_src_dir $bot_misc_dir $bot_state_dir - $bot_data_dir $bot_config_dir $bot_log_dir $bot_run_dir - $bot_pid $memusage %param + $bot_data_dir $bot_config_dir $bot_log_dir $bot_run_dir + $bot_pid $memusage %param ); BEGIN { - if (@ARGV and -f $ARGV[0]) { - # source passed config to allow $bot_*_dir to be set. - do $ARGV[0]; + if ( @ARGV and -f $ARGV[0] ) { + + # source passed config to allow $bot_*_dir to be set. + do $ARGV[0]; } # set any $bot_*_dir var's that aren't already set - $bot_base_dir ||= '.'; - $bot_config_dir ||= 'files/'; - $bot_data_dir ||= 'files/'; - $bot_state_dir ||= 'files/'; - $bot_run_dir ||= '.'; - $bot_src_dir ||= "$bot_base_dir/src"; - $bot_log_dir ||= "$bot_base_dir/log"; - $bot_misc_dir ||= "$bot_base_dir/files"; + $bot_base_dir ||= '.'; + $bot_config_dir ||= 'files/'; + $bot_data_dir ||= 'files/'; + $bot_state_dir ||= 'files/'; + $bot_run_dir ||= '.'; + $bot_src_dir ||= "$bot_base_dir/src"; + $bot_log_dir ||= "$bot_base_dir/log"; + $bot_misc_dir ||= "$bot_base_dir/files"; - $bot_pid = $$; + $bot_pid = $$; require "$bot_src_dir/logger.pl"; require "$bot_src_dir/core.pl"; @@ -36,7 +37,7 @@ BEGIN { # load the configuration (params) file. &setupConfig(); - &showProc(); # to get the first value. + &showProc(); # to get the first value. &status("Initial memory usage: $memusage kB"); &loadCoreModules(); &loadDBModules(); @@ -50,17 +51,19 @@ BEGIN { &duperuncheck(); # initialize everything -&startup(); # first time initialization. +&startup(); # first time initialization. &setup(); -if (!&IsParam("Interface") or $param{'Interface'} =~ /IRC/) { +if ( !&IsParam("Interface") or $param{'Interface'} =~ /IRC/ ) { + # launch the irc event loop &ircloop(); -} else { +} +else { &cliloop(); } -exit 0; # just so you don't look farther down in this file :) +exit 0; # just so you don't look farther down in this file :) # --- support routines @@ -68,30 +71,32 @@ exit 0; # just so you don't look farther down in this file :) # added by the xk sub duperuncheck { - my $pid = $$; - my $file = $file{PID}; - - if ( -f $file) { - open(PIDFILE,$file) or die "error: cannot open $file."; - my $thispid = || "NULL\n"; - close PIDFILE; - chop $thispid; - - if ($thispid =~ /^\D$/) { - &staus("warning: pidfile is invalid; wiping out."); - } else { - if ( -d "/proc/$thispid/") { - &ERROR("bot is already running from this directory."); - &ERROR("if this is incorrect, erase '*.pid'."); - &ERROR("verify with 'ps -axu | grep $thispid'."); - exit 1; - } else { - &status("warning: stale $file found; wiping."); - } - } + my $pid = $$; + my $file = $file{PID}; + + if ( -f $file ) { + open( PIDFILE, $file ) or die "error: cannot open $file."; + my $thispid = || "NULL\n"; + close PIDFILE; + chop $thispid; + + if ( $thispid =~ /^\D$/ ) { + &staus("warning: pidfile is invalid; wiping out."); + } + else { + if ( -d "/proc/$thispid/" ) { + &ERROR("bot is already running from this directory."); + &ERROR("if this is incorrect, erase '*.pid'."); + &ERROR("verify with 'ps -axu | grep $thispid'."); + exit 1; + } + else { + &status("warning: stale $file found; wiping."); + } + } } - open(PIDFILE,">$file") or die "error: cannot write to $file."; + open( PIDFILE, ">$file" ) or die "error: cannot write to $file."; print PIDFILE "$pid\n"; close PIDFILE; diff --git a/patches/Google.pm b/patches/Google.pm deleted file mode 100644 index 04f586e..0000000 --- a/patches/Google.pm +++ /dev/null @@ -1,335 +0,0 @@ -########################################################## -# Google.pm -# by Jim Smyser -# Copyright (C) 1996-1999 by Jim Smyser & USC/ISI -# $Id: Google.pm,v 2.20 2000/07/09 14:29:22 jims Exp $ -########################################################## - - -package WWW::Search::Google; - - -=head1 NAME - -WWW::Search::Google - class for searching Google - - -=head1 SYNOPSIS - -use WWW::Search; -my $Search = new WWW::Search('Google'); # cAsE matters -my $Query = WWW::Search::escape_query("Where is Jimbo"); -$Search->native_query($Query); -while (my $Result = $Search->next_result()) { -print $Result->url, "\n"; -} - -=head1 DESCRIPTION - -This class is a Google specialization of WWW::Search. -It handles making and interpreting Google searches. -F. - -This class exports no public interface; all interaction should -be done through L objects. - -=head1 LINUX SEARCH - -For LINUX lovers like me, you can put Googles in a LINUX only search -mode by changing search URL from: - - 'search_url' => 'http://www.google.com/search', - -to: - - 'search_url' => 'http://www.google.com/linux', - -=head1 SEE ALSO - -To make new back-ends, see L. - -=head1 HOW DOES IT WORK? - -C is called (from C) -before we do anything. It initializes our private variables (which -all begin with underscore) and sets up a URL to the first results -page in C<{_next_url}>. - -C is called (from C) -whenever more hits are needed. It calls C -to fetch the page specified by C<{_next_url}>. -It then parses this page, appending any search hits it finds to -C<{cache}>. If it finds a ``next'' button in the text, -it sets C<{_next_url}> to point to the page for the next -set of results, otherwise it sets it to undef to indicate we''re done. - - -=head1 TESTING - -This module adheres to the C test suite mechanism. - -=head1 AUTHOR - -This backend is written and maintained/supported by Jim Smyser. - - -=head1 BUGS - -Google is not an easy search engine to parse in that it is capable -of altering it's output ever so slightly on different search terms. -There may be new slight results output the author has not yet seen that -will pop at any given time for certain searches. So, if you think you see -a bug keep the above in mind and send me the search words you used so I -may code for any new variations. - -=head1 CHANGES - -2.21.1 -Parsing update from Tim Riker - -2.21 -Minor code correction for empty returned titles - -2.20 -Forgot to add new next url regex in 2.19! - -2.19 -Regex work on some search results url's that has changed. Number found -return should be right now. - -2.17 -Insert url as a title when no title is found. - -2.13 -New regexp to parse newly found results format with certain search terms. - -2.10 -removed warning on absence of description; new test case - -2.09 -Google NOW returning url and title on one line. - -2.07 -Added a new parsing routine for yet another found result line. -Added a substitute for whacky url links some queries can produce. -Added Kingpin's new hash_to_cgi_string() 10/12/99 - -2.06 -Fixed missing links / regexp crap. - -2.05 -Matching overhaul to get the code parsing right due to multiple -tags being used by google on the hit lines. 9/25/99 - -2.02 -Last Minute description changes 7/13/99 - -2.01 -New test mechanism 7/13/99 - -1.00 -First release 7/11/99 - -=head1 LEGALESE - -THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED -WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. - - -=cut -#' - - -##################################################################### - -require Exporter; -@EXPORT = qw(); -@EXPORT_OK = qw(); -@ISA = qw(WWW::Search Exporter); -$VERSION = '2.21.1'; - -$MAINTAINER = 'Jim Smyser '; -$TEST_CASES = <<"ENDTESTCASES"; -# Google looks for partial words it can find results for so it will end up finding "Bogus" pages. -&test('Google', '$MAINTAINER', 'zero', '4036e7757s5', \$TEST_EXACTLY); -&test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99); -&test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101); -ENDTESTCASES - -use Carp (); -use WWW::Search(qw(generic_option strip_tags)); -require WWW::SearchResult; - - -sub undef_to_emptystring { -return defined($_[0]) ? $_[0] : ""; -} -# private -sub native_setup_search - { - my($self, $native_query, $native_options_ref) = @_; - $self->user_agent('user'); - $self->{_next_to_retrieve} = 0; - $self->{'_num_hits'} = 100; - if (!defined($self->{_options})) { - $self->{_options} = { - 'search_url' => 'http://www.google.com/search', - 'num' => $self->{'_num_hits'}, - }; - }; - my($options_ref) = $self->{_options}; - if (defined($native_options_ref)) { - # Copy in new options. - foreach (keys %$native_options_ref) { - $options_ref->{$_} = $native_options_ref->{$_}; - }; - }; - # Process the options. - my($options) = ''; - foreach (keys %$options_ref) { - # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; - next if (generic_option($_)); - $options .= $_ . '=' . $options_ref->{$_} . '&'; - }; - $self->{_debug} = $options_ref->{'search_debug'}; - $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'}); - $self->{_debug} = 0 if (!defined($self->{_debug})); - - # Finally figure out the url. - $self->{_base_url} = - $self->{_next_url} = - $self->{_options}{'search_url'} . - "?" . $options . - "q=" . $native_query; - } - -# private -sub begin_new_hit { - my($self) = shift; - my($old_hit) = shift; - my($old_raw) = shift; - if (defined($old_hit)) { - $old_hit->raw($old_raw) if (defined($old_raw)); - push(@{$self->{cache}}, $old_hit); - }; - return (new WWW::SearchResult, ''); - } -sub native_retrieve_some { - my ($self) = @_; - # fast exit if already done - return undef if (!defined($self->{_next_url})); - # get some - print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug}); - my($response) = $self->http_request('GET', $self->{_next_url}); - $self->{response} = $response; - if (!$response->is_success) { - return undef; - }; - - # parse the output - my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10); - my($hits_found) = 0; - my($state) = ($HEADER); - my($hit) = undef; - my($raw) = ''; - foreach ($self->split_lines($response->content())) { - next if m@^$@; # short circuit for blank lines - - if ($state == $HEADER && m/about ([\d,]+)<\/b>/) - { - my($n) = $1; - $self->approximate_result_count($n); - print STDERR "Found Total: $n\n" ; - $state = $HITS; - } - if ($state == $HITS && - m|

]*)\>(.*?)|i) { - my ($url, $title) = ($1,$2); - ($hit, $raw) = $self->begin_new_hit($hit, $raw); - print STDERR "**Found HIT0 Line** $url - $title\n" if ($self->{_debug}); - $raw .= $_; - $url =~ s/(>.*)//g; - $hit->add_url(strip_tags($url)); - $hits_found++; - $title = "No Title" if ($title =~ /^\s+/); - $hit->title(strip_tags($title)); - $state = $HITS; - } - elsif ($state == $HITS && - m|(.*?)
<.*?>|i) { - my ($url, $title) = ($1,$2); - ($hit, $raw) = $self->begin_new_hit($hit, $raw); - print STDERR "**Found HIT1 Line**\n" if ($self->{_debug}); - $raw .= $_; - $url =~ s/(>.*)//g; - $hit->add_url(strip_tags($url)); - $hits_found++; - $title = "No Title" if ($title =~ /^\s+/); - $hit->title(strip_tags($title)); - $state = $HITS; - } - elsif ($state == $HITS && - m@^

(.*)
(.*)@i || - m@^

(.*).*?
(.*)@i) - { - ($hit, $raw) = $self->begin_new_hit($hit, $raw); - print STDERR "**Found HIT2 Line**\n" if ($self->{_debug}); - my ($url, $title) = ($1,$2); - $mDesc = $3; - $url =~ s/\/url\?sa=\w&start=\d+&q=//g; - $url =~ s/&(.*)//g; - $url =~ s/(>.*)//g; - $raw .= $_; - $hit->add_url(strip_tags($url)); - $hits_found++; - $title = "No Title" if ($title =~ /^\s+/); - $hit->title(strip_tags($title)); - $mDesc =~ s/<.*?>//g; - $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; - $hit->description($mDesc) if (defined($hit)); - $state = $HITS; - } - elsif ($state == $HITS && m@^(\.\.(.+))@i) - { - print STDERR "**Parsing Description Line**\n" if ($self->{_debug}); - $raw .= $_; - $sDesc = $1; - $sDesc ||= ''; - $sDesc =~ s/<.*?>//g; - $sDesc = $mDesc . $sDesc; - $hit->description($sDesc) if $sDesc =~ m@^\.@; - $sDesc = ''; - $state = $HITS; - } - elsif ($state == $HITS && m@