1 ##########################################################
4 # Copyright (C) 1996-1999 by Jim Smyser & USC/ISI
5 # $Id: Google.pm,v 1.1.1.1 2000/07/27 16:10:23 blootbot Exp $
6 ##########################################################
9 package WWW::Search::Google;
14 WWW::Search::Google - class for searching Google
20 my $Search = new WWW::Search('Google'); # cAsE matters
21 my $Query = WWW::Search::escape_query("Where is Jimbo");
22 $Search->native_query($Query);
23 while (my $Result = $Search->next_result()) {
24 print $Result->url, "\n";
29 This class is a Google specialization of WWW::Search.
30 It handles making and interpreting Google searches.
31 F<http://www.google.com>.
33 Googles returns 100 Hits per page. Custom Linux Only search capable.
35 This class exports no public interface; all interaction should
36 be done through L<WWW::Search> objects.
40 For LINUX lovers like me, you can put Googles in a LINUX only search
41 mode by changing search URL from:
43 'search_url' => 'http://www.google.com/search',
47 'search_url' => 'http://www.google.com/linux',
51 To make new back-ends, see L<WWW::Search>.
53 =head1 HOW DOES IT WORK?
55 C<native_setup_search> is called (from C<WWW::Search::setup_search>)
56 before we do anything. It initializes our private variables (which
57 all begin with underscore) and sets up a URL to the first results
58 page in C<{_next_url}>.
60 C<native_retrieve_some> is called (from C<WWW::Search::retrieve_some>)
61 whenever more hits are needed. It calls C<WWW::Search::http_request>
62 to fetch the page specified by C<{_next_url}>.
63 It then parses this page, appending any search hits it finds to
64 C<{cache}>. If it finds a ``next'' button in the text,
65 it sets C<{_next_url}> to point to the page for the next
66 set of results, otherwise it sets it to undef to indicate we''re done.
71 This module adheres to the C<WWW::Search> test suite mechanism.
75 2.07 now parses for most of what Google produces, but not all.
76 Because Google does not produce universial formatting for all
77 results it produces, there are undoublty a few line formats yet
78 uncovered by the author. Different search terms creates various
79 differing format out puts for each line of results. Example,
80 searching for "visual basic" will create whacky url links,
81 whereas searching for "Visual C++" does not. It is a parsing
82 nitemare really! If you think you uncovered a BUG just remember
85 With the above said, this back-end will produce proper formated
86 results for 96+% of what it is asked to produce. Your milage
91 This backend is maintained and supported by Jim Smyser.
96 2.09 seems now to parse all hits with the new format change so there really shouldn't be
97 any like there were with 2.08.
99 =head1 VERSION HISTORY
102 removed warning on absence of description; new test case
105 Google NOW returning url and title on one line.
108 Added a new parsing routine for yet another found result line.
109 Added a substitute for whacky url links some queries can produce.
110 Added Kingpin's new hash_to_cgi_string() 10/12/99
113 Fixed missing links / regexp crap.
116 Matching overhaul to get the code parsing right due to multiple
117 tags being used by google on the hit lines. 9/25/99
120 Last Minute description changes 7/13/99
123 New test mechanism 7/13/99
126 First release 7/11/99
130 THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
131 WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
132 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
137 #####################################################################
141 @ISA = qw(WWW::Search Exporter);
144 $MAINTAINER = 'Jim Smyser <jsmyser@bigfoot.com>';
145 $TEST_CASES = <<"ENDTESTCASES";
146 # Google looks for partial words it can find results for so it will end up finding "Bogus" pages.
147 &test('Google', '$MAINTAINER', 'zero', '4036e7757s5', \$TEST_EXACTLY);
148 &test('Google', '$MAINTAINER', 'one_page', '+LS'.'AM +rep'.'lication', \$TEST_RANGE, 2,99);
149 &test('Google', '$MAINTAINER', 'multi', 'dir'.'ty ha'.'rr'.'y bimbo', \$TEST_GREATER_THAN, 101);
153 use WWW::Search(generic_option);
154 require WWW::SearchResult;
156 sub native_setup_search {
157 my($self, $native_query, $native_options_ref) = @_;
158 $self->{_debug} = $native_options_ref->{'search_debug'};
159 $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'});
160 $self->{_debug} = 0 if (!defined($self->{_debug}));
161 $self->{agent_e_mail} = 'jsmyser@bigfoot.com';
162 $self->user_agent('user');
163 $self->{_next_to_retrieve} = 1;
164 $self->{'_num_hits'} = 0;
165 if (!defined($self->{_options})) {
166 $self->{'search_base_url'} = 'http://www.google.com';
167 $self->{_options} = {
168 'search_url' => 'http://www.google.com/search',
170 'q' => $native_query,
173 my $options_ref = $self->{_options};
174 if (defined($native_options_ref))
176 # Copy in new options.
177 foreach (keys %$native_options_ref)
179 $options_ref->{$_} = $native_options_ref->{$_};
182 # Process the options.
184 foreach (sort keys %$options_ref)
186 # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n";
187 next if (generic_option($_));
188 $options .= $_ . '=' . $options_ref->{$_} . '&';
191 # Finally figure out the url.
192 $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options});
193 } # native_setup_search
196 sub native_retrieve_some
199 print STDERR "**Google::native_retrieve_some()**\n" if $self->{_debug};
200 # Fast exit if already done:
201 return undef if (!defined($self->{_next_url}));
203 # If this is not the first page of results, sleep so as to not
204 # overload the server:
205 $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'};
207 # Get some if were not already scoring somewhere else:
208 print STDERR "*Sending request (",$self->{_next_url},")\n" if $self->{_debug};
209 my($response) = $self->http_request('GET', $self->{_next_url});
210 $self->{response} = $response;
211 if (!$response->is_success)
215 $self->{'_next_url'} = undef;
216 print STDERR "**Response\n" if $self->{_debug};
219 my ($HEADER, $START, $HITS, $NEXT) = qw(HE HI ST NX);
223 foreach ($self->split_lines($response->content()))
225 next if m@^$@; # short circuit for blank lines
226 print STDERR " $state ===$_=== " if 2 <= $self->{'_debug'};
227 if (m|<b>(\d+)</b></font> matches|i) {
228 print STDERR "**Found Header Count**\n" if ($self->{_debug});
229 $self->approximate_result_count($1);
231 # set-up attempting the tricky task of
232 # fetching the very first HIT line
234 elsif ($state eq $START && m|Search took|i)
236 print STDERR "**Found Start Line**\n" if ($self->{_debug});
238 # Attempt to pull the very first hit line
240 if ($state eq $HITS) {
241 print "\n**state == HITS**\n" if 2 <= $self->{_debug};
243 if ($state eq $HITS && m@^<p><a href=([^<]+)>(.*)</a>@i)
245 print "**Found HIT**\n" if 2 <= $self->{_debug};
246 my ($url, $title) = ($1,$2);
249 push(@{$self->{cache}}, $hit);
251 $hit = new WWW::SearchResult;
252 # some queries *can* create internal junk in the url link
254 $url =~ s/\/url\?sa=U&start=\d+&q=//g;
255 $url =~ s/\&exp\=OneBoxNews //g; # ~20000510.
256 $url =~ s/\&e\=110 //g; # -20000528.
262 if ($state eq $HITS && m@^<font size=-1><br>(.*)@i)
264 print "**Found First Description**\n" if 2 <= $self->{_debug};
266 if (not $mDesc =~ m@ @)
268 $mDesc =~ s/<.*?>//g;
269 $mDesc = $mDesc . '<br>' if not $mDesc =~ m@<br>@;
270 $hit->description($mDesc);
274 elsif ($state eq $HITS &&
276 m@^<br><font color=green>(.*)\s@i) {
277 print "**Found Second Description**\n" if 2 <= $self->{_debug};
280 $sDesc = $mDesc . $sDesc if (defined $mDesc);
281 $hit->description($sDesc) if (defined $hit and $sDesc ne '');
285 elsif ($state eq $HITS &&
286 m|<a href=([^<]+)><IMG SRC=/nav_next.gif.*?><br><.*?>.*?</A>|i) {
287 my $nexturl = $self->{'_next_url'};
288 if (defined $nexturl) {
289 print STDERR "**Fetching Next URL-> ", $nexturl, "\n" if 2 <= $self->{_debug};
291 print STDERR "**Fetching Next URL-> UNDEF\n" if 2 <= $self->{_debug};
295 $self->{'_next_url'} = $self->{'search_base_url'} . $iURL;
299 print STDERR "**Nothing matched.**\n" if 2 <= $self->{_debug};
304 push(@{$self->{cache}}, $hit);
307 } # native_retrieve_some