src/Modules/babel.pl

   1 # This program is copyright Jonathan Feinberg 1999.
   2 # This program is distributed under the same terms as infobot.
   3
   4 # Jonathan Feinberg
   5 # jdf@pobox.com
   6 # http://pobox.com/~jdf/
   7
   8 # Version 1.0
   9 # First public release.
  10
  11 # hacked by Tim@Rikers.org to handle new URL and layout
  12
  13 package babel;
  14 use strict;
  15
  16 my $no_babel;
  17
  18 BEGIN {
  19     eval "use URI::Escape";    # utility functions for encoding the
  20     if ($@) { $no_babel++};    # babelfish request
  21     eval "use LWP::UserAgent";
  22     if ($@) { $no_babel++};
  23 }
  24
  25 BEGIN {
  26   # Translate some feasible abbreviations into the ones babelfish
  27   # expects.
  28     use vars qw!%lang_code $lang_regex!;
  29     %lang_code = (
  30                 'fr' => 'fr',
  31                 'sp' => 'es',
  32                 'es' => 'es',
  33                 'po' => 'pt',
  34                 'pt' => 'pt',
  35                 'it' => 'it',
  36                 'ge' => 'de',
  37                 'de' => 'de',
  38                 'gr' => 'de',
  39                 'en' => 'en',
  40                 'zh' => 'zh',
  41                 'ja' => 'ja',
  42                 'jp' => 'ja',
  43                 'ko' => 'ko',
  44                 'kr' => 'ko',
  45                 'ru' => 'ru'
  46                );
  47
  48   # Here's how we recognize the language you're asking for.  It looks
  49   # like RTSL saves you a few keystrokes in #perl, huh?
  50   $lang_regex = join '|', keys %lang_code;
  51 }
  52
  53 sub babelfish {
  54     return '' if $no_babel;
  55   my ($from, $to, $phrase) = @_;
  56   #&main::DEBUG("babelfish($from, $to, $phrase)");
  57
  58   $from = $lang_code{$from};
  59   $to = $lang_code{$to};
  60
  61   my $ua = new LWP::UserAgent;
  62   $ua->timeout(5);
  63
  64   my $req =
  65     HTTP::Request->new('POST', 'http://babelfish.altavista.com/raging/translate.dyn');
  66
  67 # babelfish ignored this, but it SHOULD work
  68 # Accept-Charset: iso-8859-1
  69 #  $req->header('Accept-Charset' => 'iso-8859-1');
  70 #  print $req->header('Accept-Charset');
  71   $req->content_type('application/x-www-form-urlencoded');
  72
  73   return translate($phrase, "${from}_${to}", $req, $ua);
  74 }
  75
  76 sub translate {
  77     return '' if $no_babel;
  78   my ($phrase, $languagepair, $req, $ua) = @_;
  79   #&main::DEBUG("translate($phrase, $languagepair, $req, $ua)");
  80
  81   my $urltext = uri_escape($phrase);
  82   $req->content("urltext=$urltext&lp=$languagepair");
  83   #&main::DEBUG("http://babelfish.altavista.com/raging/translate.dyn??urltext=$urltext&lp=$languagepair");
  84
  85   my $res = $ua->request($req);
  86   my $translated;
  87
  88   if ($res->is_success) {
  89       my $html = $res->content;
  90       # This method subject to change with the whims of Altavista's design
  91       # staff.
  92       $html =~ s/\s+/ /sg;
  93       #&main::DEBUG("$html\n===============\n");
  94       # look for the first :< which should be the "To English:<", etc.
  95       # strip any trailing tags, grab text that follows up to the next tag.
  96       #my ($translated) = ($html =~ m{:\s*(<[^>]*>\s*)+([^<]*)}sx);
  97       ($translated) = $html;
  98       #(undef, $translated) = ($html =~ m{(:\s+(<[^>]*>\s*)+)([^<\s]*)<}sx);
  99
 100       # Tim@Rikers.org get's frustrated and splits this into steps:
 101       # 1) remove everything up to the first ':' in the text
 102       $translated =~ s/.*?:\s*</</s;
 103       # 2) remove any <attributes> till the first text
 104       $translated =~ s/(<[^>]*>\s*)*//s;
 105       # 3) remove the first trailing <attribute> and everything after it
 106       $translated =~ s/<.*//s;
 107
 108       # look for contents of first textarea - not anymore cause > 40 char does not get one.
 109       #my ($translated) = ($html =~ m{<textarea[^>]*>+([^<]*)}sx);
 110       #&main::DEBUG("\"$translated\"\n===============\n");
 111 #       ($html =~ m{<textarea[^>]*>
 112 #               \s*
 113 #               ([^<]*)
 114 #               }sx);
 115 #         ($html =~ m{<br>
 116 #                         \s+
 117 #                             <font\ face="arial,\ helvetica">
 118 #                                 \s*
 119 #                                     (?:\*\*\s+time\ out\s+\*\*)?
 120 #                                         \s*
 121 #                                             ([^<]*)
 122 #                                             }sx);
 123       $translated =~ s/\n/ /g;
 124       $translated =~ s/\s*$//;
 125       # need a way to do unicode->iso
 126   } else {
 127       $translated = ":("; # failure
 128   }
 129   &main::pSReply($translated);
 130 }
 131
 132 if (0) {
 133     if (-t STDIN) {
 134         #my $result = babel::babelfish('en','sp','hello world');
 135         #my $result = babel::babelfish('en','sp','The cheese is old and moldy, where is the bathroom?');
 136         my $result = babel::babelfish('en','gr','doesn\'t seem to translate things longer than 40 characters');
 137         $result =~ s/; /\n/g;
 138         print "Babelfish says: \"$result\"\n";
 139     }
 140 }
 141
 142 1;