From: djmcgrath Date: Thu, 17 Apr 2008 23:18:53 +0000 (+0000) Subject: * Renamed files that break windows svn checkout X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=096e9be93be7012b5e81ccd1ec8f779e0b680922;hp=d3faaba2ad8c02cdf10e1a9698592afc6ed928bc;p=infobot.git * Renamed files that break windows svn checkout git-svn-id: https://svn.code.sf.net/p/infobot/code/branches/don/dpkg@1763 c11ca15a-4712-0410-83d8-924469b57eb5 --- diff --git a/patches/WWW..Search.patch b/patches/WWW..Search.patch new file mode 100644 index 0000000..a276101 --- /dev/null +++ b/patches/WWW..Search.patch @@ -0,0 +1,444 @@ +--- Google.pm.orig Wed May 24 16:55:47 2000 ++++ Google.pm Wed Jan 16 22:02:53 2002 +@@ -2,7 +2,7 @@ + # Google.pm + # by Jim Smyser + # Copyright (C) 1996-1999 by Jim Smyser & USC/ISI +-# $Id$ ++# $Id$ + ########################################################## + + +@@ -30,8 +30,6 @@ + It handles making and interpreting Google searches. + F. + +-Googles returns 100 Hits per page. Custom Linux Only search capable. +- + This class exports no public interface; all interaction should + be done through L objects. + +@@ -70,33 +68,41 @@ + + This module adheres to the C test suite mechanism. + +-=head1 BUGS +- +-2.07 now parses for most of what Google produces, but not all. +-Because Google does not produce universial formatting for all +-results it produces, there are undoublty a few line formats yet +-uncovered by the author. Different search terms creates various +-differing format out puts for each line of results. Example, +-searching for "visual basic" will create whacky url links, +-whereas searching for "Visual C++" does not. It is a parsing +-nitemare really! If you think you uncovered a BUG just remember +-the above comments! +- +-With the above said, this back-end will produce proper formated +-results for 96+% of what it is asked to produce. Your milage +-will vary. +- + =head1 AUTHOR + +-This backend is maintained and supported by Jim Smyser. ++This backend is written and maintained/supported by Jim Smyser. + + + =head1 BUGS + +-2.09 seems now to parse all hits with the new format change so there really shouldn't be +-any like there were with 2.08. ++Google is not an easy search engine to parse in that it is capable ++of altering it's output ever so slightly on different search terms. ++There may be new slight results output the author has not yet seen that ++will pop at any given time for certain searches. So, if you think you see ++a bug keep the above in mind and send me the search words you used so I ++may code for any new variations. ++ ++=head1 CHANGES ++ ++2.22 ++Fixed up changed format from google ++reformatted code ++ ++2.21 ++Minor code correction for empty returned titles ++ ++2.20 ++Forgot to add new next url regex in 2.19! ++ ++2.19 ++Regex work on some search results url's that has changed. Number found ++return should be right now. ++ ++2.17 ++Insert url as a title when no title is found. + +-=head1 VERSION HISTORY ++2.13 ++New regexp to parse newly found results format with certain search terms. + + 2.10 + removed warning on absence of description; new test case +@@ -131,15 +137,18 @@ + WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + ++ + =cut + #' +- ++ ++ + ##################################################################### ++ + require Exporter; + @EXPORT = qw(); + @EXPORT_OK = qw(); + @ISA = qw(WWW::Search Exporter); +-$VERSION = '2.10'; ++$VERSION = '2.22'; + + $MAINTAINER = 'Jim Smyser '; + $TEST_CASES = <<"ENDTESTCASES"; +@@ -148,160 +157,187 @@ + &test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99); + &test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101); + ENDTESTCASES +- ++ + use Carp (); +-use WWW::Search(generic_option); ++use WWW::Search(qw(generic_option strip_tags)); + require WWW::SearchResult; +- ++ ++ ++sub undef_to_emptystring { ++return defined($_[0]) ? $_[0] : ""; ++} ++# private + sub native_setup_search { +- my($self, $native_query, $native_options_ref) = @_; +- $self->{_debug} = $native_options_ref->{'search_debug'}; +- $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'}); +- $self->{_debug} = 0 if (!defined($self->{_debug})); +- $self->{agent_e_mail} = 'jsmyser@bigfoot.com'; +- $self->user_agent('user'); +- $self->{_next_to_retrieve} = 1; +- $self->{'_num_hits'} = 0; +- if (!defined($self->{_options})) { +- $self->{'search_base_url'} = 'http://www.google.com'; +- $self->{_options} = { +- 'search_url' => 'http://www.google.com/search', +- 'num' => '100', +- 'q' => $native_query, +- }; +- } +- my $options_ref = $self->{_options}; +- if (defined($native_options_ref)) +- { +- # Copy in new options. +- foreach (keys %$native_options_ref) +- { +- $options_ref->{$_} = $native_options_ref->{$_}; +- } # foreach +- } # if +- # Process the options. +- my($options) = ''; +- foreach (sort keys %$options_ref) +- { +- # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; +- next if (generic_option($_)); +- $options .= $_ . '=' . $options_ref->{$_} . '&'; +- } +- chop $options; +- # Finally figure out the url. +- $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options}); +- } # native_setup_search +- ++ my($self, $native_query, $native_options_ref) = @_; ++ $self->user_agent('user'); ++ $self->{_next_to_retrieve} = 0; ++ $self->{'_num_hits'} = 100; ++ ++ if (!defined $self->{_options}) { ++ $self->{_options} = { ++ 'search_url' => 'http://www.google.com/search', ++ 'num' => $self->{'_num_hits'}, ++ }; ++ } ++ ++ my($options_ref) = $self->{_options}; ++ ++ if (defined $native_options_ref) { ++ # Copy in new options. ++ foreach (keys %$native_options_ref) { ++ $options_ref->{$_} = $native_options_ref->{$_}; ++ } ++ } ++ ++ # Process the options. ++ my($options) = ''; ++ foreach (keys %$options_ref) { ++ # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; ++ next if (generic_option($_)); ++ $options .= $_ . '=' . $options_ref->{$_} . '&'; ++ } ++ ++ $self->{_debug} = $options_ref->{'search_debug'}; ++ $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'}); ++ $self->{_debug} = 0 if (!defined $self->{_debug}); ++ ++ # Finally figure out the url. ++ $self->{_base_url} = ++ $self->{_next_url} = ++ $self->{_options}{'search_url'} . ++ "?" . $options . ++ "q=" . $native_query; ++} ++ + # private +-sub native_retrieve_some +- { +- my ($self) = @_; +- print STDERR "**Google::native_retrieve_some()**\n" if $self->{_debug}; +- # Fast exit if already done: +- return undef if (!defined($self->{_next_url})); +- +- # If this is not the first page of results, sleep so as to not +- # overload the server: +- $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'}; +- +- # Get some if were not already scoring somewhere else: +- print STDERR "*Sending request (",$self->{_next_url},")\n" if $self->{_debug}; +- my($response) = $self->http_request('GET', $self->{_next_url}); +- $self->{response} = $response; +- if (!$response->is_success) +- { +- return undef; +- } +- $self->{'_next_url'} = undef; +- print STDERR "**Response\n" if $self->{_debug}; +- +- # parse the output +- my ($HEADER, $START, $HITS, $NEXT) = qw(HE HI ST NX); +- my $hits_found = 0; +- my $state = $HEADER; +- my $hit = (); +- foreach ($self->split_lines($response->content())) +- { +- next if m@^$@; # short circuit for blank lines +- print STDERR " $state ===$_=== " if 2 <= $self->{'_debug'}; +- if (m|(\d+) matches|i) { +- print STDERR "**Found Header Count**\n" if ($self->{_debug}); +- $self->approximate_result_count($1); +- $state = $START; +- # set-up attempting the tricky task of +- # fetching the very first HIT line +- } +- elsif ($state eq $START && m|Search took|i) +- { +- print STDERR "**Found Start Line**\n" if ($self->{_debug}); +- $state = $HITS; +- # Attempt to pull the very first hit line +- } +- if ($state eq $HITS) { +- print "\n**state == HITS**\n" if 2 <= $self->{_debug}; +- } +- if ($state eq $HITS && m@^

(.*)$@i) +- { +- print "**Found HIT**\n" if 2 <= $self->{_debug}; +- my ($url, $title) = ($1,$2); +- if (defined($hit)) +- { +- push(@{$self->{cache}}, $hit); +- }; +- $hit = new WWW::SearchResult; +- # some queries *can* create internal junk in the url link +- # remove them! +- $url =~ s/\/url\?sa=U&start=\d+&q=//g; +- $hits_found++; +- $hit->add_url($url); +- $hit->title($title); +- $state = $HITS; +- } +- if ($state eq $HITS && m@^
(.*)@i) +- { +- print "**Found First Description**\n" if 2 <= $self->{_debug}; +- $mDesc = $1; +- if (not $mDesc =~ m@ @) +- { +- $mDesc =~ s/<.*?>//g; +- $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; +- $hit->description($mDesc); +- $state = $HITS; +- } +- } +- elsif ($state eq $HITS && +- m@^(\.(.+))@i || +- m@^
(.*)\s@i) { +- print "**Found Second Description**\n" if 2 <= $self->{_debug}; +- $sDesc = $1; +- $sDesc ||= ''; +- $sDesc =~ s/<.*?>//g; +- $sDesc = $mDesc . $sDesc; +- $hit->description($sDesc); +- $sDesc =''; +- $state = $HITS; +- } +- elsif ($state eq $HITS && +- m|
<.*?>.*?
|i) { +- my $nexturl = $self->{'_next_url'}; +- if (defined $nexturl) { +- print STDERR "**Fetching Next URL-> ", $nexturl, "\n" if 2 <= $self->{_debug}; +- } else { +- print STDERR "**Fetching Next URL-> UNDEF\n" if 2 <= $self->{_debug}; +- } +- +- my $iURL = $1; +- $self->{'_next_url'} = $self->{'search_base_url'} . $iURL; +- } +- else +- { +- print STDERR "**Nothing matched.**\n" if 2 <= $self->{_debug}; +- } +- } +- if (defined($hit)) +- { +- push(@{$self->{cache}}, $hit); +- } +- return $hits_found; +- } # native_retrieve_some +-1; ++sub begin_new_hit { ++ my($self) = shift; ++ my($old_hit) = shift; ++ my($old_raw) = shift; ++ ++ if (defined $old_hit) { ++ $old_hit->raw($old_raw) if (defined $old_raw); ++ push(@{$self->{cache}}, $old_hit); ++ } ++ ++ return (new WWW::SearchResult, ''); ++} ++ ++sub native_retrieve_some { ++ my ($self) = @_; ++ # fast exit if already done ++ return undef if (!defined $self->{_next_url}); ++ ++ # get some ++ print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug}); ++ my($response) = $self->http_request('GET', $self->{_next_url}); ++ $self->{response} = $response; ++ ++ return undef if (!$response->is_success); ++ ++ # parse the output ++ my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10); ++ my($hits_found) = 0; ++ my($state) = ($HEADER); ++ my($hit) = undef; ++ my($raw) = ''; ++ ++ foreach ($self->split_lines($response->content())) { ++ next if m@^$@; # short circuit for blank lines ++ ++ if ($state == $HEADER && m/about ([\d,]+)<\/b>/) { ++ my($n) = $1; ++ $self->approximate_result_count($n); ++ print STDERR "Found Total: $n\n" if ($self->{_debug}); ++ $state = $HITS; ++ ++ } elsif ($state == $HITS && ++ m|(.*?)
<.*?>|i ++ ) { ++ ++ my ($url, $title) = ($1,$2); ++ ($hit, $raw) = $self->begin_new_hit($hit, $raw); ++ print STDERR "**Found HIT1 Line**\n" if ($self->{_debug}); ++ $raw .= $_; ++ $url =~ s/(>.*)//g; ++ $hit->add_url(strip_tags($url)); ++ $hits_found++; ++ $title = "No Title" if ($title =~ /^\s+/); ++ $hit->title(strip_tags($title)); ++ $state = $HITS; ++ ++ } elsif ($state == $HITS && ++ m@^

(.*)
(.*)@i || ++ m@^

(.*).*?(.*)@i ++ ) { ++ print STDERR "**Found HIT2 Line**\n" if ($self->{_debug}); ++ ++ ($hit, $raw) = $self->begin_new_hit($hit, $raw); ++ ++ my ($url, $title) = ($1,$2); ++ $mDesc = $3; ++ ++ $url =~ s/\/url\?sa=\w&start=\d+&q=//g; ++ $url =~ s/\?lang=(\S+)$//g; ++ $url =~ s/&(.*)//g; ++ $url =~ s/(>.*)//g; ++ $url =~ s/\/$//g; # kill trailing slash. ++ ++ $raw .= $_; ++ $hit->add_url(strip_tags($url)); ++ $hits_found++; ++ ++ $title = "No Title" if ($title =~ /^\s+/); ++ $hit->title(strip_tags($title)); ++ ++ $mDesc =~ s/<.*?>//g; ++### $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; ++ $hit->description($mDesc) if (defined $hit); ++ $state = $HITS; ++ ++# description parsing ++ } elsif ($state == $HITS && m@(\.\.(.+)) @i ++ ) { ++ print STDERR "**Parsing Description Line**\n" if ($self->{_debug}); ++ $raw .= $_; ++ # uhm... ++ $sDesc = $1 || ""; ++ ++ $sDesc =~ s/<.*?>//g; ++ $mDesc ||= ""; ++ $sDesc = $mDesc . $sDesc; ++# $hit->description($sDesc) if $sDesc =~ m@^\.@; ++ $sDesc = ''; ++ $state = $HITS; ++ ++ } elsif ($state == $HITS && m@

@i ++ ) { ++ ($hit, $raw) = $self->begin_new_hit($hit, $raw); ++ print STDERR "**Found Last Line**\n" if ($self->{_debug}); ++ # end of hits ++ $state = $TRAILER; ++ ++ } elsif ($state == $TRAILER && ++ m|.*?|i ++ ) { ++ my($relative_url) = $1; ++ print STDERR "**Fetching >>Next<< Page**\n" if ($self->{_debug}); ++ $self->{_next_url} = 'http://www.google.com' . $relative_url; ++ $state = $POST_NEXT; ++ } ++ } ++ ++ if ($state != $POST_NEXT) { ++ # No "Next" Tag ++ $self->{_next_url} = undef; ++ $self->begin_new_hit($hit, $raw) if ($state == $HITS); ++ $self->{_next_url} = undef; ++ } ++ ++ # ZZZzzzzZZZZzzzzzzZZZZZZzzz ++ $self->user_agent_delay if (defined($self->{_next_url})); ++ return $hits_found; ++} ++ ++1; ++ diff --git a/patches/WWW..Search.patch.old b/patches/WWW..Search.patch.old new file mode 100644 index 0000000..eec3ce3 --- /dev/null +++ b/patches/WWW..Search.patch.old @@ -0,0 +1,31 @@ +--- WWW/Search/Google.pm.orig Wed May 24 16:55:47 2000 ++++ WWW/Search/Google.pm Wed May 24 16:56:19 2000 +@@ -240,7 +240,7 @@ + if ($state eq $HITS) { + print "\n**state == HITS**\n" if 2 <= $self->{_debug}; + } +- if ($state eq $HITS && m@^

(.*)$@i) ++ if ($state eq $HITS && m@^

(.*)@i) + { + print "**Found HIT**\n" if 2 <= $self->{_debug}; + my ($url, $title) = ($1,$2); +@@ -252,6 +252,7 @@ + # some queries *can* create internal junk in the url link + # remove them! + $url =~ s/\/url\?sa=U&start=\d+&q=//g; ++ $url =~ s/\&exp\=OneBoxNews\s//g; # new junk. + $hits_found++; + $hit->add_url($url); + $hit->title($title); +@@ -275,9 +276,8 @@ + print "**Found Second Description**\n" if 2 <= $self->{_debug}; + $sDesc = $1; + $sDesc ||= ''; +- $sDesc =~ s/<.*?>//g; +- $sDesc = $mDesc . $sDesc; +- $hit->description($sDesc); ++ $sDesc = $mDesc . $sDesc if (defined $mDesc); ++ $hit->description($sDesc) if (defined $hit and $sDesc ne ''); + $sDesc =''; + $state = $HITS; + } diff --git a/patches/WWW::Search.patch b/patches/WWW::Search.patch deleted file mode 100644 index a276101..0000000 --- a/patches/WWW::Search.patch +++ /dev/null @@ -1,444 +0,0 @@ ---- Google.pm.orig Wed May 24 16:55:47 2000 -+++ Google.pm Wed Jan 16 22:02:53 2002 -@@ -2,7 +2,7 @@ - # Google.pm - # by Jim Smyser - # Copyright (C) 1996-1999 by Jim Smyser & USC/ISI --# $Id$ -+# $Id$ - ########################################################## - - -@@ -30,8 +30,6 @@ - It handles making and interpreting Google searches. - F. - --Googles returns 100 Hits per page. Custom Linux Only search capable. -- - This class exports no public interface; all interaction should - be done through L objects. - -@@ -70,33 +68,41 @@ - - This module adheres to the C test suite mechanism. - --=head1 BUGS -- --2.07 now parses for most of what Google produces, but not all. --Because Google does not produce universial formatting for all --results it produces, there are undoublty a few line formats yet --uncovered by the author. Different search terms creates various --differing format out puts for each line of results. Example, --searching for "visual basic" will create whacky url links, --whereas searching for "Visual C++" does not. It is a parsing --nitemare really! If you think you uncovered a BUG just remember --the above comments! -- --With the above said, this back-end will produce proper formated --results for 96+% of what it is asked to produce. Your milage --will vary. -- - =head1 AUTHOR - --This backend is maintained and supported by Jim Smyser. -+This backend is written and maintained/supported by Jim Smyser. - - - =head1 BUGS - --2.09 seems now to parse all hits with the new format change so there really shouldn't be --any like there were with 2.08. -+Google is not an easy search engine to parse in that it is capable -+of altering it's output ever so slightly on different search terms. -+There may be new slight results output the author has not yet seen that -+will pop at any given time for certain searches. So, if you think you see -+a bug keep the above in mind and send me the search words you used so I -+may code for any new variations. -+ -+=head1 CHANGES -+ -+2.22 -+Fixed up changed format from google -+reformatted code -+ -+2.21 -+Minor code correction for empty returned titles -+ -+2.20 -+Forgot to add new next url regex in 2.19! -+ -+2.19 -+Regex work on some search results url's that has changed. Number found -+return should be right now. -+ -+2.17 -+Insert url as a title when no title is found. - --=head1 VERSION HISTORY -+2.13 -+New regexp to parse newly found results format with certain search terms. - - 2.10 - removed warning on absence of description; new test case -@@ -131,15 +137,18 @@ - WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF - MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. - -+ - =cut - #' -- -+ -+ - ##################################################################### -+ - require Exporter; - @EXPORT = qw(); - @EXPORT_OK = qw(); - @ISA = qw(WWW::Search Exporter); --$VERSION = '2.10'; -+$VERSION = '2.22'; - - $MAINTAINER = 'Jim Smyser '; - $TEST_CASES = <<"ENDTESTCASES"; -@@ -148,160 +157,187 @@ - &test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99); - &test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101); - ENDTESTCASES -- -+ - use Carp (); --use WWW::Search(generic_option); -+use WWW::Search(qw(generic_option strip_tags)); - require WWW::SearchResult; -- -+ -+ -+sub undef_to_emptystring { -+return defined($_[0]) ? $_[0] : ""; -+} -+# private - sub native_setup_search { -- my($self, $native_query, $native_options_ref) = @_; -- $self->{_debug} = $native_options_ref->{'search_debug'}; -- $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'}); -- $self->{_debug} = 0 if (!defined($self->{_debug})); -- $self->{agent_e_mail} = 'jsmyser@bigfoot.com'; -- $self->user_agent('user'); -- $self->{_next_to_retrieve} = 1; -- $self->{'_num_hits'} = 0; -- if (!defined($self->{_options})) { -- $self->{'search_base_url'} = 'http://www.google.com'; -- $self->{_options} = { -- 'search_url' => 'http://www.google.com/search', -- 'num' => '100', -- 'q' => $native_query, -- }; -- } -- my $options_ref = $self->{_options}; -- if (defined($native_options_ref)) -- { -- # Copy in new options. -- foreach (keys %$native_options_ref) -- { -- $options_ref->{$_} = $native_options_ref->{$_}; -- } # foreach -- } # if -- # Process the options. -- my($options) = ''; -- foreach (sort keys %$options_ref) -- { -- # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; -- next if (generic_option($_)); -- $options .= $_ . '=' . $options_ref->{$_} . '&'; -- } -- chop $options; -- # Finally figure out the url. -- $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options}); -- } # native_setup_search -- -+ my($self, $native_query, $native_options_ref) = @_; -+ $self->user_agent('user'); -+ $self->{_next_to_retrieve} = 0; -+ $self->{'_num_hits'} = 100; -+ -+ if (!defined $self->{_options}) { -+ $self->{_options} = { -+ 'search_url' => 'http://www.google.com/search', -+ 'num' => $self->{'_num_hits'}, -+ }; -+ } -+ -+ my($options_ref) = $self->{_options}; -+ -+ if (defined $native_options_ref) { -+ # Copy in new options. -+ foreach (keys %$native_options_ref) { -+ $options_ref->{$_} = $native_options_ref->{$_}; -+ } -+ } -+ -+ # Process the options. -+ my($options) = ''; -+ foreach (keys %$options_ref) { -+ # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; -+ next if (generic_option($_)); -+ $options .= $_ . '=' . $options_ref->{$_} . '&'; -+ } -+ -+ $self->{_debug} = $options_ref->{'search_debug'}; -+ $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'}); -+ $self->{_debug} = 0 if (!defined $self->{_debug}); -+ -+ # Finally figure out the url. -+ $self->{_base_url} = -+ $self->{_next_url} = -+ $self->{_options}{'search_url'} . -+ "?" . $options . -+ "q=" . $native_query; -+} -+ - # private --sub native_retrieve_some -- { -- my ($self) = @_; -- print STDERR "**Google::native_retrieve_some()**\n" if $self->{_debug}; -- # Fast exit if already done: -- return undef if (!defined($self->{_next_url})); -- -- # If this is not the first page of results, sleep so as to not -- # overload the server: -- $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'}; -- -- # Get some if were not already scoring somewhere else: -- print STDERR "*Sending request (",$self->{_next_url},")\n" if $self->{_debug}; -- my($response) = $self->http_request('GET', $self->{_next_url}); -- $self->{response} = $response; -- if (!$response->is_success) -- { -- return undef; -- } -- $self->{'_next_url'} = undef; -- print STDERR "**Response\n" if $self->{_debug}; -- -- # parse the output -- my ($HEADER, $START, $HITS, $NEXT) = qw(HE HI ST NX); -- my $hits_found = 0; -- my $state = $HEADER; -- my $hit = (); -- foreach ($self->split_lines($response->content())) -- { -- next if m@^$@; # short circuit for blank lines -- print STDERR " $state ===$_=== " if 2 <= $self->{'_debug'}; -- if (m|(\d+) matches|i) { -- print STDERR "**Found Header Count**\n" if ($self->{_debug}); -- $self->approximate_result_count($1); -- $state = $START; -- # set-up attempting the tricky task of -- # fetching the very first HIT line -- } -- elsif ($state eq $START && m|Search took|i) -- { -- print STDERR "**Found Start Line**\n" if ($self->{_debug}); -- $state = $HITS; -- # Attempt to pull the very first hit line -- } -- if ($state eq $HITS) { -- print "\n**state == HITS**\n" if 2 <= $self->{_debug}; -- } -- if ($state eq $HITS && m@^

(.*)$@i) -- { -- print "**Found HIT**\n" if 2 <= $self->{_debug}; -- my ($url, $title) = ($1,$2); -- if (defined($hit)) -- { -- push(@{$self->{cache}}, $hit); -- }; -- $hit = new WWW::SearchResult; -- # some queries *can* create internal junk in the url link -- # remove them! -- $url =~ s/\/url\?sa=U&start=\d+&q=//g; -- $hits_found++; -- $hit->add_url($url); -- $hit->title($title); -- $state = $HITS; -- } -- if ($state eq $HITS && m@^
(.*)@i) -- { -- print "**Found First Description**\n" if 2 <= $self->{_debug}; -- $mDesc = $1; -- if (not $mDesc =~ m@ @) -- { -- $mDesc =~ s/<.*?>//g; -- $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; -- $hit->description($mDesc); -- $state = $HITS; -- } -- } -- elsif ($state eq $HITS && -- m@^(\.(.+))@i || -- m@^
(.*)\s@i) { -- print "**Found Second Description**\n" if 2 <= $self->{_debug}; -- $sDesc = $1; -- $sDesc ||= ''; -- $sDesc =~ s/<.*?>//g; -- $sDesc = $mDesc . $sDesc; -- $hit->description($sDesc); -- $sDesc =''; -- $state = $HITS; -- } -- elsif ($state eq $HITS && -- m|
<.*?>.*?
|i) { -- my $nexturl = $self->{'_next_url'}; -- if (defined $nexturl) { -- print STDERR "**Fetching Next URL-> ", $nexturl, "\n" if 2 <= $self->{_debug}; -- } else { -- print STDERR "**Fetching Next URL-> UNDEF\n" if 2 <= $self->{_debug}; -- } -- -- my $iURL = $1; -- $self->{'_next_url'} = $self->{'search_base_url'} . $iURL; -- } -- else -- { -- print STDERR "**Nothing matched.**\n" if 2 <= $self->{_debug}; -- } -- } -- if (defined($hit)) -- { -- push(@{$self->{cache}}, $hit); -- } -- return $hits_found; -- } # native_retrieve_some --1; -+sub begin_new_hit { -+ my($self) = shift; -+ my($old_hit) = shift; -+ my($old_raw) = shift; -+ -+ if (defined $old_hit) { -+ $old_hit->raw($old_raw) if (defined $old_raw); -+ push(@{$self->{cache}}, $old_hit); -+ } -+ -+ return (new WWW::SearchResult, ''); -+} -+ -+sub native_retrieve_some { -+ my ($self) = @_; -+ # fast exit if already done -+ return undef if (!defined $self->{_next_url}); -+ -+ # get some -+ print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug}); -+ my($response) = $self->http_request('GET', $self->{_next_url}); -+ $self->{response} = $response; -+ -+ return undef if (!$response->is_success); -+ -+ # parse the output -+ my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10); -+ my($hits_found) = 0; -+ my($state) = ($HEADER); -+ my($hit) = undef; -+ my($raw) = ''; -+ -+ foreach ($self->split_lines($response->content())) { -+ next if m@^$@; # short circuit for blank lines -+ -+ if ($state == $HEADER && m/about ([\d,]+)<\/b>/) { -+ my($n) = $1; -+ $self->approximate_result_count($n); -+ print STDERR "Found Total: $n\n" if ($self->{_debug}); -+ $state = $HITS; -+ -+ } elsif ($state == $HITS && -+ m|(.*?)
<.*?>|i -+ ) { -+ -+ my ($url, $title) = ($1,$2); -+ ($hit, $raw) = $self->begin_new_hit($hit, $raw); -+ print STDERR "**Found HIT1 Line**\n" if ($self->{_debug}); -+ $raw .= $_; -+ $url =~ s/(>.*)//g; -+ $hit->add_url(strip_tags($url)); -+ $hits_found++; -+ $title = "No Title" if ($title =~ /^\s+/); -+ $hit->title(strip_tags($title)); -+ $state = $HITS; -+ -+ } elsif ($state == $HITS && -+ m@^

(.*)
(.*)@i || -+ m@^

(.*).*?(.*)@i -+ ) { -+ print STDERR "**Found HIT2 Line**\n" if ($self->{_debug}); -+ -+ ($hit, $raw) = $self->begin_new_hit($hit, $raw); -+ -+ my ($url, $title) = ($1,$2); -+ $mDesc = $3; -+ -+ $url =~ s/\/url\?sa=\w&start=\d+&q=//g; -+ $url =~ s/\?lang=(\S+)$//g; -+ $url =~ s/&(.*)//g; -+ $url =~ s/(>.*)//g; -+ $url =~ s/\/$//g; # kill trailing slash. -+ -+ $raw .= $_; -+ $hit->add_url(strip_tags($url)); -+ $hits_found++; -+ -+ $title = "No Title" if ($title =~ /^\s+/); -+ $hit->title(strip_tags($title)); -+ -+ $mDesc =~ s/<.*?>//g; -+### $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; -+ $hit->description($mDesc) if (defined $hit); -+ $state = $HITS; -+ -+# description parsing -+ } elsif ($state == $HITS && m@(\.\.(.+)) @i -+ ) { -+ print STDERR "**Parsing Description Line**\n" if ($self->{_debug}); -+ $raw .= $_; -+ # uhm... -+ $sDesc = $1 || ""; -+ -+ $sDesc =~ s/<.*?>//g; -+ $mDesc ||= ""; -+ $sDesc = $mDesc . $sDesc; -+# $hit->description($sDesc) if $sDesc =~ m@^\.@; -+ $sDesc = ''; -+ $state = $HITS; -+ -+ } elsif ($state == $HITS && m@

@i -+ ) { -+ ($hit, $raw) = $self->begin_new_hit($hit, $raw); -+ print STDERR "**Found Last Line**\n" if ($self->{_debug}); -+ # end of hits -+ $state = $TRAILER; -+ -+ } elsif ($state == $TRAILER && -+ m|.*?|i -+ ) { -+ my($relative_url) = $1; -+ print STDERR "**Fetching >>Next<< Page**\n" if ($self->{_debug}); -+ $self->{_next_url} = 'http://www.google.com' . $relative_url; -+ $state = $POST_NEXT; -+ } -+ } -+ -+ if ($state != $POST_NEXT) { -+ # No "Next" Tag -+ $self->{_next_url} = undef; -+ $self->begin_new_hit($hit, $raw) if ($state == $HITS); -+ $self->{_next_url} = undef; -+ } -+ -+ # ZZZzzzzZZZZzzzzzzZZZZZZzzz -+ $self->user_agent_delay if (defined($self->{_next_url})); -+ return $hits_found; -+} -+ -+1; -+ diff --git a/patches/WWW::Search.patch.old b/patches/WWW::Search.patch.old deleted file mode 100644 index eec3ce3..0000000 --- a/patches/WWW::Search.patch.old +++ /dev/null @@ -1,31 +0,0 @@ ---- WWW/Search/Google.pm.orig Wed May 24 16:55:47 2000 -+++ WWW/Search/Google.pm Wed May 24 16:56:19 2000 -@@ -240,7 +240,7 @@ - if ($state eq $HITS) { - print "\n**state == HITS**\n" if 2 <= $self->{_debug}; - } -- if ($state eq $HITS && m@^

(.*)$@i) -+ if ($state eq $HITS && m@^

(.*)@i) - { - print "**Found HIT**\n" if 2 <= $self->{_debug}; - my ($url, $title) = ($1,$2); -@@ -252,6 +252,7 @@ - # some queries *can* create internal junk in the url link - # remove them! - $url =~ s/\/url\?sa=U&start=\d+&q=//g; -+ $url =~ s/\&exp\=OneBoxNews\s//g; # new junk. - $hits_found++; - $hit->add_url($url); - $hit->title($title); -@@ -275,9 +276,8 @@ - print "**Found Second Description**\n" if 2 <= $self->{_debug}; - $sDesc = $1; - $sDesc ||= ''; -- $sDesc =~ s/<.*?>//g; -- $sDesc = $mDesc . $sDesc; -- $hit->description($sDesc); -+ $sDesc = $mDesc . $sDesc if (defined $mDesc); -+ $hit->description($sDesc) if (defined $hit and $sDesc ne ''); - $sDesc =''; - $state = $HITS; - }