From: timriker Date: Mon, 20 Jan 2003 22:28:38 +0000 (+0000) Subject: parsing update X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=cab2e4e0988bac44f9e7f6eb7ac03e22ef103d89;p=infobot.git parsing update git-svn-id: https://svn.code.sf.net/p/infobot/code/trunk/blootbot@753 c11ca15a-4712-0410-83d8-924469b57eb5 --- diff --git a/patches/Google.pm b/patches/Google.pm index 54997a3..e323eee 100644 --- a/patches/Google.pm +++ b/patches/Google.pm @@ -84,9 +84,8 @@ may code for any new variations. =head1 CHANGES -2.22 -Fixed up changed format from google -reformatted code +2.21.1 +Parsing update from Tim Riker 2.21 Minor code correction for empty returned titles @@ -148,7 +147,7 @@ require Exporter; @EXPORT = qw(); @EXPORT_OK = qw(); @ISA = qw(WWW::Search Exporter); -$VERSION = '2.22'; +$VERSION = '2.21'; $MAINTAINER = 'Jim Smyser '; $TEST_CASES = <<"ENDTESTCASES"; @@ -167,177 +166,170 @@ sub undef_to_emptystring { return defined($_[0]) ? $_[0] : ""; } # private -sub native_setup_search { - my($self, $native_query, $native_options_ref) = @_; - $self->user_agent('user'); - $self->{_next_to_retrieve} = 0; - $self->{'_num_hits'} = 100; - - if (!defined $self->{_options}) { - $self->{_options} = { - 'search_url' => 'http://www.google.com/search', - 'num' => $self->{'_num_hits'}, - }; - } - - my($options_ref) = $self->{_options}; - - if (defined $native_options_ref) { - # Copy in new options. - foreach (keys %$native_options_ref) { - $options_ref->{$_} = $native_options_ref->{$_}; - } - } - - # Process the options. - my($options) = ''; - foreach (keys %$options_ref) { - # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; - next if (generic_option($_)); - $options .= $_ . '=' . $options_ref->{$_} . '&'; - } - - $self->{_debug} = $options_ref->{'search_debug'}; - $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'}); - $self->{_debug} = 0 if (!defined $self->{_debug}); - - # Finally figure out the url. - $self->{_base_url} = - $self->{_next_url} = - $self->{_options}{'search_url'} . - "?" . $options . - "q=" . $native_query; -} +sub native_setup_search + { + my($self, $native_query, $native_options_ref) = @_; + $self->user_agent('user'); + $self->{_next_to_retrieve} = 0; + $self->{'_num_hits'} = 100; + if (!defined($self->{_options})) { + $self->{_options} = { + 'search_url' => 'http://www.google.com/search', + 'num' => $self->{'_num_hits'}, + }; + }; + my($options_ref) = $self->{_options}; + if (defined($native_options_ref)) { + # Copy in new options. + foreach (keys %$native_options_ref) { + $options_ref->{$_} = $native_options_ref->{$_}; + }; + }; + # Process the options. + my($options) = ''; + foreach (keys %$options_ref) { + # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; + next if (generic_option($_)); + $options .= $_ . '=' . $options_ref->{$_} . '&'; + }; + $self->{_debug} = $options_ref->{'search_debug'}; + $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'}); + $self->{_debug} = 0 if (!defined($self->{_debug})); + + # Finally figure out the url. + $self->{_base_url} = + $self->{_next_url} = + $self->{_options}{'search_url'} . + "?" . $options . + "q=" . $native_query; + } # private sub begin_new_hit { - my($self) = shift; - my($old_hit) = shift; - my($old_raw) = shift; - - if (defined $old_hit) { - $old_hit->raw($old_raw) if (defined $old_raw); - push(@{$self->{cache}}, $old_hit); - } - - return (new WWW::SearchResult, ''); -} - + my($self) = shift; + my($old_hit) = shift; + my($old_raw) = shift; + if (defined($old_hit)) { + $old_hit->raw($old_raw) if (defined($old_raw)); + push(@{$self->{cache}}, $old_hit); + }; + return (new WWW::SearchResult, ''); + } sub native_retrieve_some { - my ($self) = @_; - # fast exit if already done - return undef if (!defined $self->{_next_url}); - - # get some - print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug}); - my($response) = $self->http_request('GET', $self->{_next_url}); - $self->{response} = $response; - - return undef if (!$response->is_success); - - # parse the output - my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10); - my($hits_found) = 0; - my($state) = ($HEADER); - my($hit) = undef; - my($raw) = ''; - - foreach ($self->split_lines($response->content())) { - next if m@^$@; # short circuit for blank lines - - if ($state == $HEADER && m/about ([\d,]+)<\/b>/) { - my($n) = $1; - $self->approximate_result_count($n); - print STDERR "Found Total: $n\n" if ($self->{_debug}); - $state = $HITS; - - } elsif ($state == $HITS && - m|(.*?)
<.*?>|i - ) { - - my ($url, $title) = ($1,$2); - ($hit, $raw) = $self->begin_new_hit($hit, $raw); - print STDERR "**Found HIT1 Line**\n" if ($self->{_debug}); - $raw .= $_; - $url =~ s/(>.*)//g; - $hit->add_url(strip_tags($url)); - $hits_found++; - $title = "No Title" if ($title =~ /^\s+/); - $hit->title(strip_tags($title)); - $state = $HITS; - - } elsif ($state == $HITS && - m@^

(.*)
(.*)@i || - m@^

(.*).*?(.*)@i - ) { - print STDERR "**Found HIT2 Line**\n" if ($self->{_debug}); - - ($hit, $raw) = $self->begin_new_hit($hit, $raw); - - my ($url, $title) = ($1,$2); - $mDesc = $3; - - $url =~ s/\/url\?sa=\w&start=\d+&q=//g; - $url =~ s/\?lang=(\S+)$//g; - $url =~ s/&(.*)//g; - $url =~ s/(>.*)//g; - $url =~ s/\/$//g; # kill trailing slash. - - $raw .= $_; - $hit->add_url(strip_tags($url)); - $hits_found++; - - $title = "No Title" if ($title =~ /^\s+/); - $hit->title(strip_tags($title)); - - $mDesc =~ s/<.*?>//g; -### $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; - $hit->description($mDesc) if (defined $hit); - $state = $HITS; - -# description parsing - } elsif ($state == $HITS && m@(\.\.(.+)) @i - ) { - print STDERR "**Parsing Description Line**\n" if ($self->{_debug}); - $raw .= $_; - # uhm... - $sDesc = $1 || ""; - - $sDesc =~ s/<.*?>//g; - $mDesc ||= ""; - $sDesc = $mDesc . $sDesc; -# $hit->description($sDesc) if $sDesc =~ m@^\.@; - $sDesc = ''; - $state = $HITS; - - } elsif ($state == $HITS && m@

@i - ) { - ($hit, $raw) = $self->begin_new_hit($hit, $raw); - print STDERR "**Found Last Line**\n" if ($self->{_debug}); - # end of hits - $state = $TRAILER; - - } elsif ($state == $TRAILER && - m|.*?|i - ) { - my($relative_url) = $1; - print STDERR "**Fetching >>Next<< Page**\n" if ($self->{_debug}); - $self->{_next_url} = 'http://www.google.com' . $relative_url; - $state = $POST_NEXT; - } - } - - if ($state != $POST_NEXT) { - # No "Next" Tag - $self->{_next_url} = undef; - $self->begin_new_hit($hit, $raw) if ($state == $HITS); - $self->{_next_url} = undef; - } - - # ZZZzzzzZZZZzzzzzzZZZZZZzzz - $self->user_agent_delay if (defined($self->{_next_url})); - return $hits_found; -} - + my ($self) = @_; + # fast exit if already done + return undef if (!defined($self->{_next_url})); + # get some + print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug}); + my($response) = $self->http_request('GET', $self->{_next_url}); + $self->{response} = $response; + if (!$response->is_success) { + return undef; + }; + + # parse the output + my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10); + my($hits_found) = 0; + my($state) = ($HEADER); + my($hit) = undef; + my($raw) = ''; + foreach ($self->split_lines($response->content())) { + next if m@^$@; # short circuit for blank lines + + if ($state == $HEADER && m/about ([\d,]+)<\/b>/) + { + my($n) = $1; + $self->approximate_result_count($n); + print STDERR "Found Total: $n\n" ; + $state = $HITS; + } + if ($state == $HITS && + m|

]*)\>(.*?)|i) { + my ($url, $title) = ($1,$2); + ($hit, $raw) = $self->begin_new_hit($hit, $raw); + print STDERR "**Found HIT0 Line** $url - $title\n" if ($self->{_debug}); + $raw .= $_; + $url =~ s/(>.*)//g; + $hit->add_url(strip_tags($url)); + $hits_found++; + $title = "No Title" if ($title =~ /^\s+/); + $hit->title(strip_tags($title)); + $state = $HITS; + } + elsif ($state == $HITS && + m|(.*?)
<.*?>|i) { + my ($url, $title) = ($1,$2); + ($hit, $raw) = $self->begin_new_hit($hit, $raw); + print STDERR "**Found HIT1 Line**\n" if ($self->{_debug}); + $raw .= $_; + $url =~ s/(>.*)//g; + $hit->add_url(strip_tags($url)); + $hits_found++; + $title = "No Title" if ($title =~ /^\s+/); + $hit->title(strip_tags($title)); + $state = $HITS; + } + elsif ($state == $HITS && + m@^

(.*)
(.*)@i || + m@^

(.*).*?
(.*)@i) + { + ($hit, $raw) = $self->begin_new_hit($hit, $raw); + print STDERR "**Found HIT2 Line**\n" if ($self->{_debug}); + my ($url, $title) = ($1,$2); + $mDesc = $3; + $url =~ s/\/url\?sa=\w&start=\d+&q=//g; + $url =~ s/&(.*)//g; + $url =~ s/(>.*)//g; + $raw .= $_; + $hit->add_url(strip_tags($url)); + $hits_found++; + $title = "No Title" if ($title =~ /^\s+/); + $hit->title(strip_tags($title)); + $mDesc =~ s/<.*?>//g; + $mDesc = $mDesc . '
' if not $mDesc =~ m@
@; + $hit->description($mDesc) if (defined($hit)); + $state = $HITS; + } + elsif ($state == $HITS && m@^(\.\.(.+))@i) + { + print STDERR "**Parsing Description Line**\n" if ($self->{_debug}); + $raw .= $_; + $sDesc = $1; + $sDesc ||= ''; + $sDesc =~ s/<.*?>//g; + $sDesc = $mDesc . $sDesc; + $hit->description($sDesc) if $sDesc =~ m@^\.@; + $sDesc = ''; + $state = $HITS; + } + elsif ($state == $HITS && m@