1 ##########################################################
4 # Copyright (C) 1996-1999 by Jim Smyser & USC/ISI
5 # $Id: Google.pm,v 2.20 2000/07/09 14:29:22 jims Exp $
6 ##########################################################
9 package WWW::Search::Google;
14 WWW::Search::Google - class for searching Google
20 my $Search = new WWW::Search('Google'); # cAsE matters
21 my $Query = WWW::Search::escape_query("Where is Jimbo");
22 $Search->native_query($Query);
23 while (my $Result = $Search->next_result()) {
24 print $Result->url, "\n";
29 This class is a Google specialization of WWW::Search.
30 It handles making and interpreting Google searches.
31 F<http://www.google.com>.
33 This class exports no public interface; all interaction should
34 be done through L<WWW::Search> objects.
38 For LINUX lovers like me, you can put Googles in a LINUX only search
39 mode by changing search URL from:
41 'search_url' => 'http://www.google.com/search',
45 'search_url' => 'http://www.google.com/linux',
49 To make new back-ends, see L<WWW::Search>.
51 =head1 HOW DOES IT WORK?
53 C<native_setup_search> is called (from C<WWW::Search::setup_search>)
54 before we do anything. It initializes our private variables (which
55 all begin with underscore) and sets up a URL to the first results
56 page in C<{_next_url}>.
58 C<native_retrieve_some> is called (from C<WWW::Search::retrieve_some>)
59 whenever more hits are needed. It calls C<WWW::Search::http_request>
60 to fetch the page specified by C<{_next_url}>.
61 It then parses this page, appending any search hits it finds to
62 C<{cache}>. If it finds a ``next'' button in the text,
63 it sets C<{_next_url}> to point to the page for the next
64 set of results, otherwise it sets it to undef to indicate we''re done.
69 This module adheres to the C<WWW::Search> test suite mechanism.
73 This backend is written and maintained/supported by Jim Smyser.
78 Google is not an easy search engine to parse in that it is capable
79 of altering it's output ever so slightly on different search terms.
80 There may be new slight results output the author has not yet seen that
81 will pop at any given time for certain searches. So, if you think you see
82 a bug keep the above in mind and send me the search words you used so I
83 may code for any new variations.
88 Parsing update from Tim Riker <Tim@Rikers.org>
91 Minor code correction for empty returned titles
94 Forgot to add new next url regex in 2.19!
97 Regex work on some search results url's that has changed. Number found
98 return should be right now.
101 Insert url as a title when no title is found.
104 New regexp to parse newly found results format with certain search terms.
107 removed warning on absence of description; new test case
110 Google NOW returning url and title on one line.
113 Added a new parsing routine for yet another found result line.
114 Added a substitute for whacky url links some queries can produce.
115 Added Kingpin's new hash_to_cgi_string() 10/12/99
118 Fixed missing links / regexp crap.
121 Matching overhaul to get the code parsing right due to multiple
122 tags being used by google on the hit lines. 9/25/99
125 Last Minute description changes 7/13/99
128 New test mechanism 7/13/99
131 First release 7/11/99
135 THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
136 WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
137 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
144 #####################################################################
149 @ISA = qw(WWW::Search Exporter);
152 $MAINTAINER = 'Jim Smyser <jsmyser@bigfoot.com>';
153 $TEST_CASES = <<"ENDTESTCASES";
154 # Google looks for partial words it can find results for so it will end up finding "Bogus" pages.
155 &test('Google', '$MAINTAINER', 'zero', '4036e7757s5', \$TEST_EXACTLY);
156 &test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99);
157 &test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101);
161 use WWW::Search(qw(generic_option strip_tags));
162 require WWW::SearchResult;
165 sub undef_to_emptystring {
166 return defined($_[0]) ? $_[0] : "";
169 sub native_setup_search
171 my($self, $native_query, $native_options_ref) = @_;
172 $self->user_agent('user');
173 $self->{_next_to_retrieve} = 0;
174 $self->{'_num_hits'} = 100;
175 if (!defined($self->{_options})) {
176 $self->{_options} = {
177 'search_url' => 'http://www.google.com/search',
178 'num' => $self->{'_num_hits'},
181 my($options_ref) = $self->{_options};
182 if (defined($native_options_ref)) {
183 # Copy in new options.
184 foreach (keys %$native_options_ref) {
185 $options_ref->{$_} = $native_options_ref->{$_};
188 # Process the options.
190 foreach (keys %$options_ref) {
191 # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n";
192 next if (generic_option($_));
193 $options .= $_ . '=' . $options_ref->{$_} . '&';
195 $self->{_debug} = $options_ref->{'search_debug'};
196 $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'});
197 $self->{_debug} = 0 if (!defined($self->{_debug}));
199 # Finally figure out the url.
202 $self->{_options}{'search_url'} .
204 "q=" . $native_query;
210 my($old_hit) = shift;
211 my($old_raw) = shift;
212 if (defined($old_hit)) {
213 $old_hit->raw($old_raw) if (defined($old_raw));
214 push(@{$self->{cache}}, $old_hit);
216 return (new WWW::SearchResult, '');
218 sub native_retrieve_some {
220 # fast exit if already done
221 return undef if (!defined($self->{_next_url}));
223 print STDERR "Fetching " . $self->{_next_url} . "\n" if ($self->{_debug});
224 my($response) = $self->http_request('GET', $self->{_next_url});
225 $self->{response} = $response;
226 if (!$response->is_success) {
231 my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10);
233 my($state) = ($HEADER);
236 foreach ($self->split_lines($response->content())) {
237 next if m@^$@; # short circuit for blank lines
239 if ($state == $HEADER && m/about <b>([\d,]+)<\/b>/)
242 $self->approximate_result_count($n);
243 print STDERR "Found Total: $n\n" ;
246 if ($state == $HITS &&
247 m|<p><a href=([^\>]*)\>(.*?)</a\><br\>|i) {
248 my ($url, $title) = ($1,$2);
249 ($hit, $raw) = $self->begin_new_hit($hit, $raw);
250 print STDERR "**Found HIT0 Line** $url - $title\n" if ($self->{_debug});
253 $hit->add_url(strip_tags($url));
255 $title = "No Title" if ($title =~ /^\s+/);
256 $hit->title(strip_tags($title));
259 elsif ($state == $HITS &&
260 m|<a href=(.*)\>(.*?)</a><font size=-1><br><font color=green><.*?>|i) {
261 my ($url, $title) = ($1,$2);
262 ($hit, $raw) = $self->begin_new_hit($hit, $raw);
263 print STDERR "**Found HIT1 Line**\n" if ($self->{_debug});
266 $hit->add_url(strip_tags($url));
268 $title = "No Title" if ($title =~ /^\s+/);
269 $hit->title(strip_tags($title));
272 elsif ($state == $HITS &&
273 m@^<p><a href=/url\?sa=U&start=\d+&q=([^<]+)\&.*?>(.*)</a><font size=-1><br>(.*)@i ||
274 m@^<p><a href=([^<]+)>(.*)</a>.*?<font size=-1><br>(.*)@i)
276 ($hit, $raw) = $self->begin_new_hit($hit, $raw);
277 print STDERR "**Found HIT2 Line**\n" if ($self->{_debug});
278 my ($url, $title) = ($1,$2);
280 $url =~ s/\/url\?sa=\w&start=\d+&q=//g;
284 $hit->add_url(strip_tags($url));
286 $title = "No Title" if ($title =~ /^\s+/);
287 $hit->title(strip_tags($title));
288 $mDesc =~ s/<.*?>//g;
289 $mDesc = $mDesc . '<br>' if not $mDesc =~ m@<br>@;
290 $hit->description($mDesc) if (defined($hit));
293 elsif ($state == $HITS && m@^(\.\.(.+))@i)
295 print STDERR "**Parsing Description Line**\n" if ($self->{_debug});
299 $sDesc =~ s/<.*?>//g;
300 $sDesc = $mDesc . $sDesc;
301 $hit->description($sDesc) if $sDesc =~ m@^\.@;
305 elsif ($state == $HITS && m@<div class=nav>@i)
307 ($hit, $raw) = $self->begin_new_hit($hit, $raw);
308 print STDERR "**Found Last Line**\n" if ($self->{_debug});
312 elsif ($state == $TRAILER &&
313 m|<a href=([^<]+)><IMG SRC=/nav_next.gif.*?>.*?|i)
315 my($relative_url) = $1;
316 print STDERR "**Fetching >>Next<< Page**\n" if ($self->{_debug});
317 $self->{_next_url} = 'http://www.google.com' . $relative_url;
322 if ($state != $POST_NEXT) {
324 $self->{_next_url} = undef;
325 if ($state == $HITS) {
326 $self->begin_new_hit($hit, $raw);
328 $self->{_next_url} = undef;
330 # ZZZzzzzZZZZzzzzzzZZZZZZzzz
331 $self->user_agent_delay if (defined($self->{_next_url}));