1 --- Google.pm.orig Wed May 24 16:55:47 2000
2 +++ Google.pm Wed Jan 16 22:02:53 2002
6 # Copyright (C) 1996-1999 by Jim Smyser & USC/ISI
9 ##########################################################
13 It handles making and interpreting Google searches.
14 F<http://www.google.com>.
16 -Googles returns 100 Hits per page. Custom Linux Only search capable.
18 This class exports no public interface; all interaction should
19 be done through L<WWW::Search> objects.
23 This module adheres to the C<WWW::Search> test suite mechanism.
27 -2.07 now parses for most of what Google produces, but not all.
28 -Because Google does not produce universial formatting for all
29 -results it produces, there are undoublty a few line formats yet
30 -uncovered by the author. Different search terms creates various
31 -differing format out puts for each line of results. Example,
32 -searching for "visual basic" will create whacky url links,
33 -whereas searching for "Visual C++" does not. It is a parsing
34 -nitemare really! If you think you uncovered a BUG just remember
37 -With the above said, this back-end will produce proper formated
38 -results for 96+% of what it is asked to produce. Your milage
43 -This backend is maintained and supported by Jim Smyser.
44 +This backend is written and maintained/supported by Jim Smyser.
49 -2.09 seems now to parse all hits with the new format change so there really shouldn't be
50 -any like there were with 2.08.
51 +Google is not an easy search engine to parse in that it is capable
52 +of altering it's output ever so slightly on different search terms.
53 +There may be new slight results output the author has not yet seen that
54 +will pop at any given time for certain searches. So, if you think you see
55 +a bug keep the above in mind and send me the search words you used so I
56 +may code for any new variations.
61 +Fixed up changed format from google
65 +Minor code correction for empty returned titles
68 +Forgot to add new next url regex in 2.19!
71 +Regex work on some search results url's that has changed. Number found
72 +return should be right now.
75 +Insert url as a title when no title is found.
77 -=head1 VERSION HISTORY
79 +New regexp to parse newly found results format with certain search terms.
82 removed warning on absence of description; new test case
84 WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
85 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
93 #####################################################################
98 @ISA = qw(WWW::Search Exporter);
102 $MAINTAINER = 'Jim Smyser <jsmyser@bigfoot.com>';
103 $TEST_CASES = <<"ENDTESTCASES";
104 @@ -148,160 +157,187 @@
105 &test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99);
106 &test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101);
111 -use WWW::Search(generic_option);
112 +use WWW::Search(qw(generic_option strip_tags));
113 require WWW::SearchResult;
117 +sub undef_to_emptystring {
118 +return defined($_[0]) ? $_[0] : "";
121 sub native_setup_search {
122 - my($self, $native_query, $native_options_ref) = @_;
123 - $self->{_debug} = $native_options_ref->{'search_debug'};
124 - $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'});
125 - $self->{_debug} = 0 if (!defined($self->{_debug}));
126 - $self->{agent_e_mail} = 'jsmyser@bigfoot.com';
127 - $self->user_agent('user');
128 - $self->{_next_to_retrieve} = 1;
129 - $self->{'_num_hits'} = 0;
130 - if (!defined($self->{_options})) {
131 - $self->{'search_base_url'} = 'http://www.google.com';
132 - $self->{_options} = {
133 - 'search_url' => 'http://www.google.com/search',
135 - 'q' => $native_query,
138 - my $options_ref = $self->{_options};
139 - if (defined($native_options_ref))
141 - # Copy in new options.
142 - foreach (keys %$native_options_ref)
144 - $options_ref->{$_} = $native_options_ref->{$_};
147 - # Process the options.
149 - foreach (sort keys %$options_ref)
151 - # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n";
152 - next if (generic_option($_));
153 - $options .= $_ . '=' . $options_ref->{$_} . '&';
156 - # Finally figure out the url.
157 - $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options});
158 - } # native_setup_search
160 + my($self, $native_query, $native_options_ref) = @_;
161 + $self->user_agent('user');
162 + $self->{_next_to_retrieve} = 0;
163 + $self->{'_num_hits'} = 100;
165 + if (!defined $self->{_options}) {
166 + $self->{_options} = {
167 + 'search_url' => 'http://www.google.com/search',
168 + 'num' => $self->{'_num_hits'},
172 + my($options_ref) = $self->{_options};
174 + if (defined $native_options_ref) {
175 + # Copy in new options.
176 + foreach (keys %$native_options_ref) {
177 + $options_ref->{$_} = $native_options_ref->{$_};
181 + # Process the options.
183 + foreach (keys %$options_ref) {
184 + # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n";
185 + next if (generic_option($_));
186 + $options .= $_ . '=' . $options_ref->{$_} . '&';
189 + $self->{_debug} = $options_ref->{'search_debug'};
190 + $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'});
191 + $self->{_debug} = 0 if (!defined $self->{_debug});
193 + # Finally figure out the url.
194 + $self->{_base_url} =
195 + $self->{_next_url} =
196 + $self->{_options}{'search_url'} .
198 + "q=" . $native_query;
202 -sub native_retrieve_some
205 - print STDERR "**Google::native_retrieve_some()**\n" if $self->{_debug};
206 - # Fast exit if already done:
207 - return undef if (!defined($self->{_next_url}));
209 - # If this is not the first page of results, sleep so as to not
210 - # overload the server:
211 - $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'};
213 - # Get some if were not already scoring somewhere else:
214 - print STDERR "*Sending request (",$self->{_next_url},")\n" if $self->{_debug};
215 - my($response) = $self->http_request('GET', $self->{_next_url});
216 - $self->{response} = $response;
217 - if (!$response->is_success)
221 - $self->{'_next_url'} = undef;
222 - print STDERR "**Response\n" if $self->{_debug};
225 - my ($HEADER, $START, $HITS, $NEXT) = qw(HE HI ST NX);
226 - my $hits_found = 0;
227 - my $state = $HEADER;
229 - foreach ($self->split_lines($response->content()))
231 - next if m@^$@; # short circuit for blank lines
232 - print STDERR " $state ===$_=== " if 2 <= $self->{'_debug'};
233 - if (m|<b>(\d+)</b></font> matches|i) {
234 - print STDERR "**Found Header Count**\n" if ($self->{_debug});
235 - $self->approximate_result_count($1);
237 - # set-up attempting the tricky task of
238 - # fetching the very first HIT line
240 - elsif ($state eq $START && m|Search took|i)
242 - print STDERR "**Found Start Line**\n" if ($self->{_debug});
244 - # Attempt to pull the very first hit line
246 - if ($state eq $HITS) {
247 - print "\n**state == HITS**\n" if 2 <= $self->{_debug};
249 - if ($state eq $HITS && m@^<p><a href=([^<]+)>(.*)</a>$@i)
251 - print "**Found HIT**\n" if 2 <= $self->{_debug};
252 - my ($url, $title) = ($1,$2);
255 - push(@{$self->{cache}}, $hit);
257 - $hit = new WWW::SearchResult;
258 - # some queries *can* create internal junk in the url link
260 - $url =~ s/\/url\?sa=U&start=\d+&q=//g;
262 - $hit->add_url($url);
263 - $hit->title($title);
266 - if ($state eq $HITS && m@^<font size=-1><br>(.*)@i)
268 - print "**Found First Description**\n" if 2 <= $self->{_debug};
270 - if (not $mDesc =~ m@ @)
272 - $mDesc =~ s/<.*?>//g;
273 - $mDesc = $mDesc . '<br>' if not $mDesc =~ m@<br>@;
274 - $hit->description($mDesc);
278 - elsif ($state eq $HITS &&
280 - m@^<br><font color=green>(.*)\s@i) {
281 - print "**Found Second Description**\n" if 2 <= $self->{_debug};
284 - $sDesc =~ s/<.*?>//g;
285 - $sDesc = $mDesc . $sDesc;
286 - $hit->description($sDesc);
290 - elsif ($state eq $HITS &&
291 - m|<a href=([^<]+)><IMG SRC=/nav_next.gif.*?><br><.*?>.*?</A>|i) {
292 - my $nexturl = $self->{'_next_url'};
293 - if (defined $nexturl) {
294 - print STDERR "**Fetching Next URL-> ", $nexturl, "\n" if 2 <= $self->{_debug};
296 - print STDERR "**Fetching Next URL-> UNDEF\n" if 2 <= $self->{_debug};
300 - $self->{'_next_url'} = $self->{'search_base_url'} . $iURL;
304 - print STDERR "**Nothing matched.**\n" if 2 <= $self->{_debug};
309 - push(@{$self->{cache}}, $hit);
311 - return $hits_found;
312 - } # native_retrieve_some
316 + my($old_hit) = shift;
317 + my($old_raw) = shift;
319 + if (defined $old_hit) {
320 + $old_hit->raw($old_raw) if (defined $old_raw);
321 + push(@{$self->{cache}}, $old_hit);
324 + return (new WWW::SearchResult, '');
327 +sub native_retrieve_some {
329 + # fast exit if already done
330 + return undef if (!defined $self->{_next_url});
333 + print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug});
334 + my($response) = $self->http_request('GET', $self->{_next_url});
335 + $self->{response} = $response;
337 + return undef if (!$response->is_success);
340 + my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10);
341 + my($hits_found) = 0;
342 + my($state) = ($HEADER);
346 + foreach ($self->split_lines($response->content())) {
347 + next if m@^$@; # short circuit for blank lines
349 + if ($state == $HEADER && m/about <b>([\d,]+)<\/b>/) {
351 + $self->approximate_result_count($n);
352 + print STDERR "Found Total: $n\n" if ($self->{_debug});
355 + } elsif ($state == $HITS &&
356 + m|<a href=(\S+)\>(.*?)</a><br><font size=-1><font color=\"#008000\"><.*?>|i
359 + my ($url, $title) = ($1,$2);
360 + ($hit, $raw) = $self->begin_new_hit($hit, $raw);
361 + print STDERR "**Found HIT1 Line**\n" if ($self->{_debug});
363 + $url =~ s/(>.*)//g;
364 + $hit->add_url(strip_tags($url));
366 + $title = "No Title" if ($title =~ /^\s+/);
367 + $hit->title(strip_tags($title));
370 + } elsif ($state == $HITS &&
371 + m@^<p><a href=/url\?sa=U&start=\d+&q=([^<]+)\&.*?>(.*)</a><font size=-1><br>(.*)@i ||
372 + m@^<p><a href=(\S+)>(.*)</a>.*?<font size=-1>(.*)@i
374 + print STDERR "**Found HIT2 Line**\n" if ($self->{_debug});
376 + ($hit, $raw) = $self->begin_new_hit($hit, $raw);
378 + my ($url, $title) = ($1,$2);
381 + $url =~ s/\/url\?sa=\w&start=\d+&q=//g;
382 + $url =~ s/\?lang=(\S+)$//g;
383 + $url =~ s/&(.*)//g;
384 + $url =~ s/(>.*)//g;
385 + $url =~ s/\/$//g; # kill trailing slash.
388 + $hit->add_url(strip_tags($url));
391 + $title = "No Title" if ($title =~ /^\s+/);
392 + $hit->title(strip_tags($title));
394 + $mDesc =~ s/<.*?>//g;
395 +### $mDesc = $mDesc . '<br>' if not $mDesc =~ m@<br>@;
396 + $hit->description($mDesc) if (defined $hit);
399 +# description parsing
400 + } elsif ($state == $HITS && m@<b>(\.\.(.+))</b> @i
402 + print STDERR "**Parsing Description Line**\n" if ($self->{_debug});
407 + $sDesc =~ s/<.*?>//g;
409 + $sDesc = $mDesc . $sDesc;
410 +# $hit->description($sDesc) if $sDesc =~ m@^\.@;
414 + } elsif ($state == $HITS && m@<div>@i
416 + ($hit, $raw) = $self->begin_new_hit($hit, $raw);
417 + print STDERR "**Found Last Line**\n" if ($self->{_debug});
421 + } elsif ($state == $TRAILER &&
422 + m|<a href=([^<]+)><img src=/nav_next.gif.*?>.*?|i
424 + my($relative_url) = $1;
425 + print STDERR "**Fetching >>Next<< Page**\n" if ($self->{_debug});
426 + $self->{_next_url} = 'http://www.google.com' . $relative_url;
427 + $state = $POST_NEXT;
431 + if ($state != $POST_NEXT) {
433 + $self->{_next_url} = undef;
434 + $self->begin_new_hit($hit, $raw) if ($state == $HITS);
435 + $self->{_next_url} = undef;
438 + # ZZZzzzzZZZZzzzzzzZZZZZZzzz
439 + $self->user_agent_delay if (defined($self->{_next_url}));
440 + return $hits_found;