1 ##########################################################
4 # Copyright (C) 1996-1999 by Jim Smyser & USC/ISI
5 # $Id: Google.pm,v 2.20 2000/07/09 14:29:22 jims Exp $
6 ##########################################################
9 package WWW::Search::Google;
14 WWW::Search::Google - class for searching Google
20 my $Search = new WWW::Search('Google'); # cAsE matters
21 my $Query = WWW::Search::escape_query("Where is Jimbo");
22 $Search->native_query($Query);
23 while (my $Result = $Search->next_result()) {
24 print $Result->url, "\n";
29 This class is a Google specialization of WWW::Search.
30 It handles making and interpreting Google searches.
31 F<http://www.google.com>.
33 This class exports no public interface; all interaction should
34 be done through L<WWW::Search> objects.
38 For LINUX lovers like me, you can put Googles in a LINUX only search
39 mode by changing search URL from:
41 'search_url' => 'http://www.google.com/search',
45 'search_url' => 'http://www.google.com/linux',
49 To make new back-ends, see L<WWW::Search>.
51 =head1 HOW DOES IT WORK?
53 C<native_setup_search> is called (from C<WWW::Search::setup_search>)
54 before we do anything. It initializes our private variables (which
55 all begin with underscore) and sets up a URL to the first results
56 page in C<{_next_url}>.
58 C<native_retrieve_some> is called (from C<WWW::Search::retrieve_some>)
59 whenever more hits are needed. It calls C<WWW::Search::http_request>
60 to fetch the page specified by C<{_next_url}>.
61 It then parses this page, appending any search hits it finds to
62 C<{cache}>. If it finds a ``next'' button in the text,
63 it sets C<{_next_url}> to point to the page for the next
64 set of results, otherwise it sets it to undef to indicate we''re done.
69 This module adheres to the C<WWW::Search> test suite mechanism.
73 This backend is written and maintained/supported by Jim Smyser.
78 Google is not an easy search engine to parse in that it is capable
79 of altering it's output ever so slightly on different search terms.
80 There may be new slight results output the author has not yet seen that
81 will pop at any given time for certain searches. So, if you think you see
82 a bug keep the above in mind and send me the search words you used so I
83 may code for any new variations.
88 Fixed up changed format from google
92 Minor code correction for empty returned titles
95 Forgot to add new next url regex in 2.19!
98 Regex work on some search results url's that has changed. Number found
99 return should be right now.
102 Insert url as a title when no title is found.
105 New regexp to parse newly found results format with certain search terms.
108 removed warning on absence of description; new test case
111 Google NOW returning url and title on one line.
114 Added a new parsing routine for yet another found result line.
115 Added a substitute for whacky url links some queries can produce.
116 Added Kingpin's new hash_to_cgi_string() 10/12/99
119 Fixed missing links / regexp crap.
122 Matching overhaul to get the code parsing right due to multiple
123 tags being used by google on the hit lines. 9/25/99
126 Last Minute description changes 7/13/99
129 New test mechanism 7/13/99
132 First release 7/11/99
136 THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
137 WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
138 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
145 #####################################################################
150 @ISA = qw(WWW::Search Exporter);
153 $MAINTAINER = 'Jim Smyser <jsmyser@bigfoot.com>';
154 $TEST_CASES = <<"ENDTESTCASES";
155 # Google looks for partial words it can find results for so it will end up finding "Bogus" pages.
156 &test('Google', '$MAINTAINER', 'zero', '4036e7757s5', \$TEST_EXACTLY);
157 &test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99);
158 &test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101);
162 use WWW::Search(qw(generic_option strip_tags));
163 require WWW::SearchResult;
166 sub undef_to_emptystring {
167 return defined($_[0]) ? $_[0] : "";
170 sub native_setup_search {
171 my($self, $native_query, $native_options_ref) = @_;
172 $self->user_agent('user');
173 $self->{_next_to_retrieve} = 0;
174 $self->{'_num_hits'} = 100;
176 if (!defined $self->{_options}) {
177 $self->{_options} = {
178 'search_url' => 'http://www.google.com/search',
179 'num' => $self->{'_num_hits'},
183 my($options_ref) = $self->{_options};
185 if (defined $native_options_ref) {
186 # Copy in new options.
187 foreach (keys %$native_options_ref) {
188 $options_ref->{$_} = $native_options_ref->{$_};
192 # Process the options.
194 foreach (keys %$options_ref) {
195 # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n";
196 next if (generic_option($_));
197 $options .= $_ . '=' . $options_ref->{$_} . '&';
200 $self->{_debug} = $options_ref->{'search_debug'};
201 $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'});
202 $self->{_debug} = 0 if (!defined $self->{_debug});
204 # Finally figure out the url.
207 $self->{_options}{'search_url'} .
209 "q=" . $native_query;
215 my($old_hit) = shift;
216 my($old_raw) = shift;
218 if (defined $old_hit) {
219 $old_hit->raw($old_raw) if (defined $old_raw);
220 push(@{$self->{cache}}, $old_hit);
223 return (new WWW::SearchResult, '');
226 sub native_retrieve_some {
228 # fast exit if already done
229 return undef if (!defined $self->{_next_url});
232 print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug});
233 my($response) = $self->http_request('GET', $self->{_next_url});
234 $self->{response} = $response;
236 return undef if (!$response->is_success);
239 my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10);
241 my($state) = ($HEADER);
245 foreach ($self->split_lines($response->content())) {
246 next if m@^$@; # short circuit for blank lines
248 if ($state == $HEADER && m/about <b>([\d,]+)<\/b>/) {
250 $self->approximate_result_count($n);
251 print STDERR "Found Total: $n\n" if ($self->{_debug});
254 } elsif ($state == $HITS &&
255 m|<a href=(\S+)\>(.*?)</a><br><font size=-1><font color=\"#008000\"><.*?>|i
258 my ($url, $title) = ($1,$2);
259 ($hit, $raw) = $self->begin_new_hit($hit, $raw);
260 print STDERR "**Found HIT1 Line**\n" if ($self->{_debug});
263 $hit->add_url(strip_tags($url));
265 $title = "No Title" if ($title =~ /^\s+/);
266 $hit->title(strip_tags($title));
269 } elsif ($state == $HITS &&
270 m@^<p><a href=/url\?sa=U&start=\d+&q=([^<]+)\&.*?>(.*)</a><font size=-1><br>(.*)@i ||
271 m@^<p><a href=(\S+)>(.*)</a>.*?<font size=-1>(.*)@i
273 print STDERR "**Found HIT2 Line**\n" if ($self->{_debug});
275 ($hit, $raw) = $self->begin_new_hit($hit, $raw);
277 my ($url, $title) = ($1,$2);
280 $url =~ s/\/url\?sa=\w&start=\d+&q=//g;
281 $url =~ s/\?lang=(\S+)$//g;
284 $url =~ s/\/$//g; # kill trailing slash.
287 $hit->add_url(strip_tags($url));
290 $title = "No Title" if ($title =~ /^\s+/);
291 $hit->title(strip_tags($title));
293 $mDesc =~ s/<.*?>//g;
294 ### $mDesc = $mDesc . '<br>' if not $mDesc =~ m@<br>@;
295 $hit->description($mDesc) if (defined $hit);
298 # description parsing
299 } elsif ($state == $HITS && m@<b>(\.\.(.+))</b> @i
301 print STDERR "**Parsing Description Line**\n" if ($self->{_debug});
306 $sDesc =~ s/<.*?>//g;
308 $sDesc = $mDesc . $sDesc;
309 # $hit->description($sDesc) if $sDesc =~ m@^\.@;
313 } elsif ($state == $HITS && m@<div>@i
315 ($hit, $raw) = $self->begin_new_hit($hit, $raw);
316 print STDERR "**Found Last Line**\n" if ($self->{_debug});
320 } elsif ($state == $TRAILER &&
321 m|<a href=([^<]+)><img src=/nav_next.gif.*?>.*?|i
323 my($relative_url) = $1;
324 print STDERR "**Fetching >>Next<< Page**\n" if ($self->{_debug});
325 $self->{_next_url} = 'http://www.google.com' . $relative_url;
330 if ($state != $POST_NEXT) {
332 $self->{_next_url} = undef;
333 $self->begin_new_hit($hit, $raw) if ($state == $HITS);
334 $self->{_next_url} = undef;
337 # ZZZzzzzZZZZzzzzzzZZZZZZzzz
338 $self->user_agent_delay if (defined($self->{_next_url}));