From e945ab941a6f3b081044497f2ed312e667495df2 Mon Sep 17 00:00:00 2001 From: dms Date: Wed, 16 Jan 2002 14:01:28 +0000 Subject: [PATCH] - ok, google is now fixed! git-svn-id: https://svn.code.sf.net/p/infobot/code/trunk/blootbot@545 c11ca15a-4712-0410-83d8-924469b57eb5 --- patches/Google.pm | 308 ---------------------- patches/WWW::Search.patch | 467 ++++++++++++++++++++++++++++++++-- patches/WWW::Search.patch.old | 31 +++ 3 files changed, 471 insertions(+), 335 deletions(-) delete mode 100644 patches/Google.pm create mode 100644 patches/WWW::Search.patch.old diff --git a/patches/Google.pm b/patches/Google.pm deleted file mode 100644 index 0f7aaec..0000000 --- a/patches/Google.pm +++ /dev/null @@ -1,308 +0,0 @@ -########################################################## -# Google.pm -# by Jim Smyser -# Copyright (C) 1996-1999 by Jim Smyser & USC/ISI -# $Id: Google.pm,v 1.1.1.1 2000/07/27 16:10:23 blootbot Exp $ -########################################################## - - -package WWW::Search::Google; - - -=head1 NAME - -WWW::Search::Google - class for searching Google - - -=head1 SYNOPSIS - -use WWW::Search; -my $Search = new WWW::Search('Google'); # cAsE matters -my $Query = WWW::Search::escape_query("Where is Jimbo"); -$Search->native_query($Query); -while (my $Result = $Search->next_result()) { -print $Result->url, "\n"; -} - -=head1 DESCRIPTION - -This class is a Google specialization of WWW::Search. -It handles making and interpreting Google searches. -F. - -Googles returns 100 Hits per page. Custom Linux Only search capable. - -This class exports no public interface; all interaction should -be done through L objects. - -=head1 LINUX SEARCH - -For LINUX lovers like me, you can put Googles in a LINUX only search -mode by changing search URL from: - - 'search_url' => 'http://www.google.com/search', - -to: - - 'search_url' => 'http://www.google.com/linux', - -=head1 SEE ALSO - -To make new back-ends, see L. - -=head1 HOW DOES IT WORK? - -C is called (from C) -before we do anything. It initializes our private variables (which -all begin with underscore) and sets up a URL to the first results -page in C<{_next_url}>. - -C is called (from C) -whenever more hits are needed. It calls C -to fetch the page specified by C<{_next_url}>. -It then parses this page, appending any search hits it finds to -C<{cache}>. If it finds a ``next'' button in the text, -it sets C<{_next_url}> to point to the page for the next -set of results, otherwise it sets it to undef to indicate we''re done. - - -=head1 TESTING - -This module adheres to the C test suite mechanism. - -=head1 BUGS - -2.07 now parses for most of what Google produces, but not all. -Because Google does not produce universial formatting for all -results it produces, there are undoublty a few line formats yet -uncovered by the author. Different search terms creates various -differing format out puts for each line of results. Example, -searching for "visual basic" will create whacky url links, -whereas searching for "Visual C++" does not. It is a parsing -nitemare really! If you think you uncovered a BUG just remember -the above comments! - -With the above said, this back-end will produce proper formated -results for 96+% of what it is asked to produce. Your milage -will vary. - -=head1 AUTHOR - -This backend is maintained and supported by Jim Smyser. - - -=head1 BUGS - -2.09 seems now to parse all hits with the new format change so there really shouldn't be -any like there were with 2.08. - -=head1 VERSION HISTORY - -2.10 -removed warning on absence of description; new test case - -2.09 -Google NOW returning url and title on one line. - -2.07 -Added a new parsing routine for yet another found result line. -Added a substitute for whacky url links some queries can produce. -Added Kingpin's new hash_to_cgi_string() 10/12/99 - -2.06 -Fixed missing links / regexp crap. - -2.05 -Matching overhaul to get the code parsing right due to multiple -tags being used by google on the hit lines. 9/25/99 - -2.02 -Last Minute description changes 7/13/99 - -2.01 -New test mechanism 7/13/99 - -1.00 -First release 7/11/99 - -=head1 LEGALESE - -THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED -WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. - -=cut -#' - -##################################################################### -require Exporter; -@EXPORT = qw(); -@EXPORT_OK = qw(); -@ISA = qw(WWW::Search Exporter); -$VERSION = '2.10'; - -$MAINTAINER = 'Jim Smyser '; -$TEST_CASES = <<"ENDTESTCASES"; -# Google looks for partial words it can find results for so it will end up finding "Bogus" pages. -&test('Google', '$MAINTAINER', 'zero', '4036e7757s5', \$TEST_EXACTLY); -&test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99); -&test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101); -ENDTESTCASES - -use Carp (); -use WWW::Search(generic_option); -require WWW::SearchResult; - -sub native_setup_search { - my($self, $native_query, $native_options_ref) = @_; - $self->{_debug} = $native_options_ref->{'search_debug'}; - $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'}); - $self->{_debug} = 0 if (!defined($self->{_debug})); - $self->{agent_e_mail} = 'jsmyser@bigfoot.com'; - $self->user_agent('user'); - $self->{_next_to_retrieve} = 1; - $self->{'_num_hits'} = 0; - if (!defined($self->{_options})) { - $self->{'search_base_url'} = 'http://www.google.com'; - $self->{_options} = { - 'search_url' => 'http://www.google.com/search', - 'num' => '100', - 'q' => $native_query, - }; - } - my $options_ref = $self->{_options}; - if (defined($native_options_ref)) - { - # Copy in new options. - foreach (keys %$native_options_ref) - { - $options_ref->{$_} = $native_options_ref->{$_}; - } # foreach - } # if - # Process the options. - my($options) = ''; - foreach (sort keys %$options_ref) - { - # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; - next if (generic_option($_)); - $options .= $_ . '=' . $options_ref->{$_} . '&'; - } - chop $options; - # Finally figure out the url. - $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options}); - } # native_setup_search - -# private -sub native_retrieve_some - { - my ($self) = @_; - print STDERR "**Google::native_retrieve_some()**\n" if $self->{_debug}; - # Fast exit if already done: - return undef if (!defined($self->{_next_url})); - - # If this is not the first page of results, sleep so as to not - # overload the server: - $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'}; - - # Get some if were not already scoring somewhere else: - print STDERR "*Sending request (",$self->{_next_url},")\n" if $self->{_debug}; - my($response) = $self->http_request('GET', $self->{_next_url}); - $self->{response} = $response; - if (!$response->is_success) - { - return undef; - } - $self->{'_next_url'} = undef; - print STDERR "**Response\n" if $self->{_debug}; - - # parse the output - my ($HEADER, $START, $HITS, $NEXT) = qw(HE HI ST NX); - my $hits_found = 0; - my $state = $HEADER; - my $hit = (); - foreach ($self->split_lines($response->content())) - { - next if m@^$@; # short circuit for blank lines - print STDERR " $state ===$_=== " if 2 <= $self->{'_debug'}; - if (m|(\d+) matches|i) { - print STDERR "**Found Header Count**\n" if ($self->{_debug}); - $self->approximate_result_count($1); - $state = $START; - # set-up attempting the tricky task of - # fetching the very first HIT line - } - elsif ($state eq $START && m|Search took|i) - { - print STDERR "**Found Start Line**\n" if ($self->{_debug}); - $state = $HITS; - # Attempt to pull the very first hit line - } - if ($state eq $HITS) { - print "\n**state == HITS**\n" if 2 <= $self->{_debug}; - } - if ($state eq $HITS && m@^

(.*)@i) - { - print "**Found HIT**\n" if 2 <= $self->{_debug}; - my ($url, $title) = ($1,$2); - if (defined($hit)) - { - push(@{$self->{cache}}, $hit); - }; - $hit = new WWW::SearchResult; - # some queries *can* create internal junk in the url link - # remove them! - $url =~ s/\/url\?sa=U&start=\d+&q=//g; - $url =~ s/\&exp\=OneBoxNews //g; # ~20000510. - $url =~ s/\&e\=110 //g; # -20000528. - $hits_found++; - $hit->add_url($url); - $hit->title($title); - $state = $HITS; - } - if ($state eq $HITS && m@^
(.*)@i) - { - print "**Found First Description**\n" if 2 <= $self->{_debug}; - $mDesc = $1; - if (not $mDesc =~ m@ @) - { - $mDesc =~ s/<.*?>//g; - $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; - $hit->description($mDesc); - $state = $HITS; - } - } - elsif ($state eq $HITS && - m@^(\.(.+))@i || - m@^
(.*)\s@i) { - print "**Found Second Description**\n" if 2 <= $self->{_debug}; - $sDesc = $1; - $sDesc ||= ''; - $sDesc = $mDesc . $sDesc if (defined $mDesc); - $hit->description($sDesc) if (defined $hit and $sDesc ne ''); - $sDesc =''; - $state = $HITS; - } - elsif ($state eq $HITS && - m|
<.*?>.*?
|i) { - my $nexturl = $self->{'_next_url'}; - if (defined $nexturl) { - print STDERR "**Fetching Next URL-> ", $nexturl, "\n" if 2 <= $self->{_debug}; - } else { - print STDERR "**Fetching Next URL-> UNDEF\n" if 2 <= $self->{_debug}; - } - - my $iURL = $1; - $self->{'_next_url'} = $self->{'search_base_url'} . $iURL; - } - else - { - print STDERR "**Nothing matched.**\n" if 2 <= $self->{_debug}; - } - } - if (defined($hit)) - { - push(@{$self->{cache}}, $hit); - } - return $hits_found; - } # native_retrieve_some -1; diff --git a/patches/WWW::Search.patch b/patches/WWW::Search.patch index eec3ce3..a276101 100644 --- a/patches/WWW::Search.patch +++ b/patches/WWW::Search.patch @@ -1,31 +1,444 @@ ---- WWW/Search/Google.pm.orig Wed May 24 16:55:47 2000 -+++ WWW/Search/Google.pm Wed May 24 16:56:19 2000 -@@ -240,7 +240,7 @@ - if ($state eq $HITS) { - print "\n**state == HITS**\n" if 2 <= $self->{_debug}; - } +--- Google.pm.orig Wed May 24 16:55:47 2000 ++++ Google.pm Wed Jan 16 22:02:53 2002 +@@ -2,7 +2,7 @@ + # Google.pm + # by Jim Smyser + # Copyright (C) 1996-1999 by Jim Smyser & USC/ISI +-# $Id$ ++# $Id$ + ########################################################## + + +@@ -30,8 +30,6 @@ + It handles making and interpreting Google searches. + F. + +-Googles returns 100 Hits per page. Custom Linux Only search capable. +- + This class exports no public interface; all interaction should + be done through L objects. + +@@ -70,33 +68,41 @@ + + This module adheres to the C test suite mechanism. + +-=head1 BUGS +- +-2.07 now parses for most of what Google produces, but not all. +-Because Google does not produce universial formatting for all +-results it produces, there are undoublty a few line formats yet +-uncovered by the author. Different search terms creates various +-differing format out puts for each line of results. Example, +-searching for "visual basic" will create whacky url links, +-whereas searching for "Visual C++" does not. It is a parsing +-nitemare really! If you think you uncovered a BUG just remember +-the above comments! +- +-With the above said, this back-end will produce proper formated +-results for 96+% of what it is asked to produce. Your milage +-will vary. +- + =head1 AUTHOR + +-This backend is maintained and supported by Jim Smyser. ++This backend is written and maintained/supported by Jim Smyser. + + + =head1 BUGS + +-2.09 seems now to parse all hits with the new format change so there really shouldn't be +-any like there were with 2.08. ++Google is not an easy search engine to parse in that it is capable ++of altering it's output ever so slightly on different search terms. ++There may be new slight results output the author has not yet seen that ++will pop at any given time for certain searches. So, if you think you see ++a bug keep the above in mind and send me the search words you used so I ++may code for any new variations. ++ ++=head1 CHANGES ++ ++2.22 ++Fixed up changed format from google ++reformatted code ++ ++2.21 ++Minor code correction for empty returned titles ++ ++2.20 ++Forgot to add new next url regex in 2.19! ++ ++2.19 ++Regex work on some search results url's that has changed. Number found ++return should be right now. ++ ++2.17 ++Insert url as a title when no title is found. + +-=head1 VERSION HISTORY ++2.13 ++New regexp to parse newly found results format with certain search terms. + + 2.10 + removed warning on absence of description; new test case +@@ -131,15 +137,18 @@ + WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + ++ + =cut + #' +- ++ ++ + ##################################################################### ++ + require Exporter; + @EXPORT = qw(); + @EXPORT_OK = qw(); + @ISA = qw(WWW::Search Exporter); +-$VERSION = '2.10'; ++$VERSION = '2.22'; + + $MAINTAINER = 'Jim Smyser '; + $TEST_CASES = <<"ENDTESTCASES"; +@@ -148,160 +157,187 @@ + &test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99); + &test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101); + ENDTESTCASES +- ++ + use Carp (); +-use WWW::Search(generic_option); ++use WWW::Search(qw(generic_option strip_tags)); + require WWW::SearchResult; +- ++ ++ ++sub undef_to_emptystring { ++return defined($_[0]) ? $_[0] : ""; ++} ++# private + sub native_setup_search { +- my($self, $native_query, $native_options_ref) = @_; +- $self->{_debug} = $native_options_ref->{'search_debug'}; +- $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'}); +- $self->{_debug} = 0 if (!defined($self->{_debug})); +- $self->{agent_e_mail} = 'jsmyser@bigfoot.com'; +- $self->user_agent('user'); +- $self->{_next_to_retrieve} = 1; +- $self->{'_num_hits'} = 0; +- if (!defined($self->{_options})) { +- $self->{'search_base_url'} = 'http://www.google.com'; +- $self->{_options} = { +- 'search_url' => 'http://www.google.com/search', +- 'num' => '100', +- 'q' => $native_query, +- }; +- } +- my $options_ref = $self->{_options}; +- if (defined($native_options_ref)) +- { +- # Copy in new options. +- foreach (keys %$native_options_ref) +- { +- $options_ref->{$_} = $native_options_ref->{$_}; +- } # foreach +- } # if +- # Process the options. +- my($options) = ''; +- foreach (sort keys %$options_ref) +- { +- # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; +- next if (generic_option($_)); +- $options .= $_ . '=' . $options_ref->{$_} . '&'; +- } +- chop $options; +- # Finally figure out the url. +- $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options}); +- } # native_setup_search +- ++ my($self, $native_query, $native_options_ref) = @_; ++ $self->user_agent('user'); ++ $self->{_next_to_retrieve} = 0; ++ $self->{'_num_hits'} = 100; ++ ++ if (!defined $self->{_options}) { ++ $self->{_options} = { ++ 'search_url' => 'http://www.google.com/search', ++ 'num' => $self->{'_num_hits'}, ++ }; ++ } ++ ++ my($options_ref) = $self->{_options}; ++ ++ if (defined $native_options_ref) { ++ # Copy in new options. ++ foreach (keys %$native_options_ref) { ++ $options_ref->{$_} = $native_options_ref->{$_}; ++ } ++ } ++ ++ # Process the options. ++ my($options) = ''; ++ foreach (keys %$options_ref) { ++ # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; ++ next if (generic_option($_)); ++ $options .= $_ . '=' . $options_ref->{$_} . '&'; ++ } ++ ++ $self->{_debug} = $options_ref->{'search_debug'}; ++ $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'}); ++ $self->{_debug} = 0 if (!defined $self->{_debug}); ++ ++ # Finally figure out the url. ++ $self->{_base_url} = ++ $self->{_next_url} = ++ $self->{_options}{'search_url'} . ++ "?" . $options . ++ "q=" . $native_query; ++} ++ + # private +-sub native_retrieve_some +- { +- my ($self) = @_; +- print STDERR "**Google::native_retrieve_some()**\n" if $self->{_debug}; +- # Fast exit if already done: +- return undef if (!defined($self->{_next_url})); +- +- # If this is not the first page of results, sleep so as to not +- # overload the server: +- $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'}; +- +- # Get some if were not already scoring somewhere else: +- print STDERR "*Sending request (",$self->{_next_url},")\n" if $self->{_debug}; +- my($response) = $self->http_request('GET', $self->{_next_url}); +- $self->{response} = $response; +- if (!$response->is_success) +- { +- return undef; +- } +- $self->{'_next_url'} = undef; +- print STDERR "**Response\n" if $self->{_debug}; +- +- # parse the output +- my ($HEADER, $START, $HITS, $NEXT) = qw(HE HI ST NX); +- my $hits_found = 0; +- my $state = $HEADER; +- my $hit = (); +- foreach ($self->split_lines($response->content())) +- { +- next if m@^$@; # short circuit for blank lines +- print STDERR " $state ===$_=== " if 2 <= $self->{'_debug'}; +- if (m|(\d+)
matches|i) { +- print STDERR "**Found Header Count**\n" if ($self->{_debug}); +- $self->approximate_result_count($1); +- $state = $START; +- # set-up attempting the tricky task of +- # fetching the very first HIT line +- } +- elsif ($state eq $START && m|Search took|i) +- { +- print STDERR "**Found Start Line**\n" if ($self->{_debug}); +- $state = $HITS; +- # Attempt to pull the very first hit line +- } +- if ($state eq $HITS) { +- print "\n**state == HITS**\n" if 2 <= $self->{_debug}; +- } - if ($state eq $HITS && m@^

(.*)$@i) -+ if ($state eq $HITS && m@^

(.*)@i) - { - print "**Found HIT**\n" if 2 <= $self->{_debug}; - my ($url, $title) = ($1,$2); -@@ -252,6 +252,7 @@ - # some queries *can* create internal junk in the url link - # remove them! - $url =~ s/\/url\?sa=U&start=\d+&q=//g; -+ $url =~ s/\&exp\=OneBoxNews\s//g; # new junk. - $hits_found++; - $hit->add_url($url); - $hit->title($title); -@@ -275,9 +276,8 @@ - print "**Found Second Description**\n" if 2 <= $self->{_debug}; - $sDesc = $1; - $sDesc ||= ''; +- { +- print "**Found HIT**\n" if 2 <= $self->{_debug}; +- my ($url, $title) = ($1,$2); +- if (defined($hit)) +- { +- push(@{$self->{cache}}, $hit); +- }; +- $hit = new WWW::SearchResult; +- # some queries *can* create internal junk in the url link +- # remove them! +- $url =~ s/\/url\?sa=U&start=\d+&q=//g; +- $hits_found++; +- $hit->add_url($url); +- $hit->title($title); +- $state = $HITS; +- } +- if ($state eq $HITS && m@^
(.*)@i) +- { +- print "**Found First Description**\n" if 2 <= $self->{_debug}; +- $mDesc = $1; +- if (not $mDesc =~ m@ @) +- { +- $mDesc =~ s/<.*?>//g; +- $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; +- $hit->description($mDesc); +- $state = $HITS; +- } +- } +- elsif ($state eq $HITS && +- m@^(\.(.+))@i || +- m@^
(.*)\s@i) { +- print "**Found Second Description**\n" if 2 <= $self->{_debug}; +- $sDesc = $1; +- $sDesc ||= ''; - $sDesc =~ s/<.*?>//g; - $sDesc = $mDesc . $sDesc; - $hit->description($sDesc); -+ $sDesc = $mDesc . $sDesc if (defined $mDesc); -+ $hit->description($sDesc) if (defined $hit and $sDesc ne ''); - $sDesc =''; - $state = $HITS; - } +- $sDesc =''; +- $state = $HITS; +- } +- elsif ($state eq $HITS && +- m|
<.*?>.*?
|i) { +- my $nexturl = $self->{'_next_url'}; +- if (defined $nexturl) { +- print STDERR "**Fetching Next URL-> ", $nexturl, "\n" if 2 <= $self->{_debug}; +- } else { +- print STDERR "**Fetching Next URL-> UNDEF\n" if 2 <= $self->{_debug}; +- } +- +- my $iURL = $1; +- $self->{'_next_url'} = $self->{'search_base_url'} . $iURL; +- } +- else +- { +- print STDERR "**Nothing matched.**\n" if 2 <= $self->{_debug}; +- } +- } +- if (defined($hit)) +- { +- push(@{$self->{cache}}, $hit); +- } +- return $hits_found; +- } # native_retrieve_some +-1; ++sub begin_new_hit { ++ my($self) = shift; ++ my($old_hit) = shift; ++ my($old_raw) = shift; ++ ++ if (defined $old_hit) { ++ $old_hit->raw($old_raw) if (defined $old_raw); ++ push(@{$self->{cache}}, $old_hit); ++ } ++ ++ return (new WWW::SearchResult, ''); ++} ++ ++sub native_retrieve_some { ++ my ($self) = @_; ++ # fast exit if already done ++ return undef if (!defined $self->{_next_url}); ++ ++ # get some ++ print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug}); ++ my($response) = $self->http_request('GET', $self->{_next_url}); ++ $self->{response} = $response; ++ ++ return undef if (!$response->is_success); ++ ++ # parse the output ++ my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10); ++ my($hits_found) = 0; ++ my($state) = ($HEADER); ++ my($hit) = undef; ++ my($raw) = ''; ++ ++ foreach ($self->split_lines($response->content())) { ++ next if m@^$@; # short circuit for blank lines ++ ++ if ($state == $HEADER && m/about ([\d,]+)<\/b>/) { ++ my($n) = $1; ++ $self->approximate_result_count($n); ++ print STDERR "Found Total: $n\n" if ($self->{_debug}); ++ $state = $HITS; ++ ++ } elsif ($state == $HITS && ++ m|(.*?)
<.*?>|i ++ ) { ++ ++ my ($url, $title) = ($1,$2); ++ ($hit, $raw) = $self->begin_new_hit($hit, $raw); ++ print STDERR "**Found HIT1 Line**\n" if ($self->{_debug}); ++ $raw .= $_; ++ $url =~ s/(>.*)//g; ++ $hit->add_url(strip_tags($url)); ++ $hits_found++; ++ $title = "No Title" if ($title =~ /^\s+/); ++ $hit->title(strip_tags($title)); ++ $state = $HITS; ++ ++ } elsif ($state == $HITS && ++ m@^

(.*)
(.*)@i || ++ m@^

(.*).*?(.*)@i ++ ) { ++ print STDERR "**Found HIT2 Line**\n" if ($self->{_debug}); ++ ++ ($hit, $raw) = $self->begin_new_hit($hit, $raw); ++ ++ my ($url, $title) = ($1,$2); ++ $mDesc = $3; ++ ++ $url =~ s/\/url\?sa=\w&start=\d+&q=//g; ++ $url =~ s/\?lang=(\S+)$//g; ++ $url =~ s/&(.*)//g; ++ $url =~ s/(>.*)//g; ++ $url =~ s/\/$//g; # kill trailing slash. ++ ++ $raw .= $_; ++ $hit->add_url(strip_tags($url)); ++ $hits_found++; ++ ++ $title = "No Title" if ($title =~ /^\s+/); ++ $hit->title(strip_tags($title)); ++ ++ $mDesc =~ s/<.*?>//g; ++### $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; ++ $hit->description($mDesc) if (defined $hit); ++ $state = $HITS; ++ ++# description parsing ++ } elsif ($state == $HITS && m@(\.\.(.+)) @i ++ ) { ++ print STDERR "**Parsing Description Line**\n" if ($self->{_debug}); ++ $raw .= $_; ++ # uhm... ++ $sDesc = $1 || ""; ++ ++ $sDesc =~ s/<.*?>//g; ++ $mDesc ||= ""; ++ $sDesc = $mDesc . $sDesc; ++# $hit->description($sDesc) if $sDesc =~ m@^\.@; ++ $sDesc = ''; ++ $state = $HITS; ++ ++ } elsif ($state == $HITS && m@

@i ++ ) { ++ ($hit, $raw) = $self->begin_new_hit($hit, $raw); ++ print STDERR "**Found Last Line**\n" if ($self->{_debug}); ++ # end of hits ++ $state = $TRAILER; ++ ++ } elsif ($state == $TRAILER && ++ m|.*?|i ++ ) { ++ my($relative_url) = $1; ++ print STDERR "**Fetching >>Next<< Page**\n" if ($self->{_debug}); ++ $self->{_next_url} = 'http://www.google.com' . $relative_url; ++ $state = $POST_NEXT; ++ } ++ } ++ ++ if ($state != $POST_NEXT) { ++ # No "Next" Tag ++ $self->{_next_url} = undef; ++ $self->begin_new_hit($hit, $raw) if ($state == $HITS); ++ $self->{_next_url} = undef; ++ } ++ ++ # ZZZzzzzZZZZzzzzzzZZZZZZzzz ++ $self->user_agent_delay if (defined($self->{_next_url})); ++ return $hits_found; ++} ++ ++1; ++ diff --git a/patches/WWW::Search.patch.old b/patches/WWW::Search.patch.old new file mode 100644 index 0000000..eec3ce3 --- /dev/null +++ b/patches/WWW::Search.patch.old @@ -0,0 +1,31 @@ +--- WWW/Search/Google.pm.orig Wed May 24 16:55:47 2000 ++++ WWW/Search/Google.pm Wed May 24 16:56:19 2000 +@@ -240,7 +240,7 @@ + if ($state eq $HITS) { + print "\n**state == HITS**\n" if 2 <= $self->{_debug}; + } +- if ($state eq $HITS && m@^

(.*)$@i) ++ if ($state eq $HITS && m@^

(.*)@i) + { + print "**Found HIT**\n" if 2 <= $self->{_debug}; + my ($url, $title) = ($1,$2); +@@ -252,6 +252,7 @@ + # some queries *can* create internal junk in the url link + # remove them! + $url =~ s/\/url\?sa=U&start=\d+&q=//g; ++ $url =~ s/\&exp\=OneBoxNews\s//g; # new junk. + $hits_found++; + $hit->add_url($url); + $hit->title($title); +@@ -275,9 +276,8 @@ + print "**Found Second Description**\n" if 2 <= $self->{_debug}; + $sDesc = $1; + $sDesc ||= ''; +- $sDesc =~ s/<.*?>//g; +- $sDesc = $mDesc . $sDesc; +- $hit->description($sDesc); ++ $sDesc = $mDesc . $sDesc if (defined $mDesc); ++ $hit->description($sDesc) if (defined $hit and $sDesc ne ''); + $sDesc =''; + $state = $HITS; + } -- 2.39.5