fix babelfish and add new languages

author timriker <timriker@c11ca15a-4712-0410-83d8-924469b57eb5>

Wed, 30 Oct 2002 10:18:35 +0000 (10:18 +0000)

committer timriker <timriker@c11ca15a-4712-0410-83d8-924469b57eb5>

Wed, 30 Oct 2002 10:18:35 +0000 (10:18 +0000)
author timriker <timriker@c11ca15a-4712-0410-83d8-924469b57eb5>
Wed, 30 Oct 2002 10:18:35 +0000 (10:18 +0000)
committer timriker <timriker@c11ca15a-4712-0410-83d8-924469b57eb5>
Wed, 30 Oct 2002 10:18:35 +0000 (10:18 +0000)
diff --git a/files/sample/sample.config b/files/sample/sample.config

index 25a99daf3412933b29795ec7713e2d95723f9b49..751d19e697a32ca0331b662d80ba8af3c337d252 100644 (file)
--- a/files/sample/sample.config
+++ b/files/sample/sample.config
@@ -188,8 +188,8 @@ set backlog         24
  # [str] anything which requires LWP + http proxy.
  ###set httpProxy               http://HOSTNAME:PORT/
  
-# [0/1] babelfish translator.  jdf++. FIXME [DOES NOT WORK]
-set babelfish          false
+# [0/1] babelfish translator.  jdf++.
+set babelfish          true
  
  # [0/1] offer free factoid cookies
  set cookie             true
diff --git a/src/CommandStubs.pl b/src/CommandStubs.pl

index b94f4a40c68cf54e127f360feae6b8bfefb3742f..3e76e16d1da6a05d515c0581643722b7a73176bb 100644 (file)
--- a/src/CommandStubs.pl
+++ b/src/CommandStubs.pl
@@ -5,7 +5,7 @@
  
  #use strict;
  
-$babel_lang_regex = "fr|sp|po|pt|it|ge|de|gr|en";
+$babel_lang_regex = "fr|sp|es|po|pt|it|ge|de|gr|en|zh|ja|jp|ko|kr|ru";
  
  ### COMMAND HOOK IMPLEMENTATION.
  # addCmdHook("SECTION", 'TEXT_HOOK',
@@ -235,9 +235,9 @@ sub Modules {
                 ^\s*
                 (?:babel(?:fish)?|x|xlate|translate)
                 \s+
-               (to|from)               # direction of translation (through)
+               ($babel_lang_regex)\w*  # from language?
                 \s+
-               ($babel_lang_regex)\w*  # which language?
+               ($babel_lang_regex)\w*  # to language?
                 \s*
                 (.+)                    # The phrase to be translated
      }xoi) {
diff --git a/src/Modules/babel.pl b/src/Modules/babel.pl

index 0eb50c4aa988bb4fcce31321361d2226bd4bed9f..4b1a957b597ddfa9323727c74a371040b7067e4e 100644 (file)
--- a/src/Modules/babel.pl
+++ b/src/Modules/babel.pl
@@ -8,114 +8,135 @@
  # Version 1.0
  # First public release.
  
+# hacked by Tim@Rikers.org to handle new URL and layout
+
  package babel;
  use strict;
  
+my $no_babel;
+
+BEGIN {
+    eval "use URI::Escape";    # utility functions for encoding the
+    if ($@) { $no_babel++};    # babelfish request
+    eval "use LWP::UserAgent";
+    if ($@) { $no_babel++};
+}
+
  BEGIN {
-    # Translate some feasible abbreviations into the ones babelfish
-    # expects.
+  # Translate some feasible abbreviations into the ones babelfish
+  # expects.
      use vars qw!%lang_code $lang_regex!;
      %lang_code = (
-               'fr' => 'fr',
-               'sp' => 'es',
-               'po' => 'pt',
-               'pt' => 'pt',
-               'it' => 'it',
-               'ge' => 'de',
-               'de' => 'de',
-               'gr' => 'de',
-               'en' => 'en'
-              );
-
-    # Here's how we recognize the language you're asking for.  It looks
-    # like RTSL saves you a few keystrokes in #perl, huh?
-    $lang_regex = join '|', keys %lang_code;
+                'fr' => 'fr',
+                'sp' => 'es',
+                'es' => 'es',
+                'po' => 'pt',
+                'pt' => 'pt',
+                'it' => 'it',
+                'ge' => 'de',
+                'de' => 'de',
+                'gr' => 'de',
+                'en' => 'en',
+                'zh' => 'zh',
+                'ja' => 'ja',
+                'jp' => 'ja',
+                'ko' => 'ko',
+                'kr' => 'ko',
+                'ru' => 'ru'
+               );
+
+  # Here's how we recognize the language you're asking for.  It looks
+  # like RTSL saves you a few keystrokes in #perl, huh?
+  $lang_regex = join '|', keys %lang_code;
  }
  
  sub babelfish {
-    my ($direction, $lang, $phrase) = @_;
+    return '' if $no_babel;
+  my ($from, $to, $phrase) = @_;
+  #&main::DEBUG("babelfish($from, $to, $phrase)");
  
-    return unless &::loadPerlModule("URI::Escape");
-    return unless &::loadPerlModule("LWP::UserAgent");
+  $from = $lang_code{$from};
+  $to = $lang_code{$to};
  
-    $lang = $lang_code{$lang};
+  my $ua = new LWP::UserAgent;
+  $ua->timeout(5);
  
-    my $ua = new LWP::UserAgent;
-    $ua->timeout(10);
-    $ua->proxy('http', $::param{'httpProxy'}) if &::IsParam("httpProxy");
+  my $req =
+    HTTP::Request->new('POST', 'http://babelfish.altavista.com/raging/translate.dyn');
  
-    my $url = 'http://babelfish.altavista.com/raging/translate.dyn';
-    my $req = HTTP::Request->new('POST',$url);
-
-    $req->content_type('application/x-www-form-urlencoded');
-
-    my $tolang = "en_$lang";
-    my $toenglish = "${lang}_en";
-
-    if ($direction eq 'to') {
-       my $xlate = translate($phrase, $tolang, $req, $ua);
-       &::pSReply($xlate) if ($xlate);
-       return;
-    } elsif ($direction eq 'from') {
-       my $xlate = translate($phrase, $toenglish, $req, $ua);
-       &::pSReply($xlate) if ($xlate);
-       return;
-    }
-    &::DEBUG("what's this junk?");
-
-    my $last_english = $phrase;
-    my $last_lang;
-    my %results = ();
-    my $i = 0;
-    while ($i++ < 7) {
-       last if $results{$phrase}++;    # REMOVE!
-       $last_lang = $phrase = translate($phrase, $tolang, $req, $ua);
-       last if $results{$phrase}++;    # REMOVE!
-       $last_english = $phrase = translate($phrase, $toenglish, $req, $ua);
-    }
+# babelfish ignored this, but it SHOULD work
+# Accept-Charset: iso-8859-1
+#  $req->header('Accept-Charset' => 'iso-8859-1');
+#  print $req->header('Accept-Charset');
+  $req->content_type('application/x-www-form-urlencoded');
  
-    &::pSReply($last_english);
+  return translate($phrase, "${from}_${to}", $req, $ua);
  }
  
  sub translate {
-    my ($phrase, $languagepair, $req, $ua) = @_;
-
-    my $urltext = URI::Escape::uri_escape($phrase);
-    $req->content("urltext=$urltext&lp=$languagepair&doit=done");
-
-    my $res = $ua->request($req);
-
-    my $translated;
-    if ($res->is_success) {            # success.
-       my $html = $res->content;
-       my $textarea    = 0;
-       foreach (split "\n", $html) {
-           $textarea   = 1     if (/<textarea/i);
-           next unless ($textarea);
-
-           &::DEBUG("   '$_'");
-
-           $textarea   = 0     if (/<\/textarea/i);
-       }
-
-       $html   =~ s/\cM//g;
-       $html   =~ s/\n\s*\n/\n/g;
-       $html   =~ s/\n/ /g;    # ...
-
-       if ($html =~ /<textarea.*?>(.*?)<\/textarea/si) {
-           $translated = $1;
-           $translated =~ s/^[\n ]|[\n ]$//g;
-       } else {
-           &::WARN("failed regex for babelfish.");
-       }
+    return '' if $no_babel;
+  my ($phrase, $languagepair, $req, $ua) = @_;
+  #&main::DEBUG("translate($phrase, $languagepair, $req, $ua)");
+
+  my $urltext = uri_escape($phrase);
+  $req->content("urltext=$urltext&lp=$languagepair");
+  #&main::DEBUG("http://babelfish.altavista.com/raging/translate.dyn??urltext=$urltext&lp=$languagepair");
+
+  my $res = $ua->request($req);
+  my $translated;
+
+  if ($res->is_success) {
+      my $html = $res->content;
+      # This method subject to change with the whims of Altavista's design
+      # staff.
+      $html =~ s/\s+/ /sg;
+      #&main::DEBUG("$html\n===============\n");
+      # look for the first :< which should be the "To English:<", etc.
+      # strip any trailing tags, grab text that follows up to the next tag.
+      #my ($translated) = ($html =~ m{:\s*(<[^>]*>\s*)+([^<]*)}sx);
+      ($translated) = $html;
+      #(undef, $translated) = ($html =~ m{(:\s+(<[^>]*>\s*)+)([^<\s]*)<}sx);
+
+      # Tim@Rikers.org get's frustrated and splits this into steps:
+      # 1) remove everything up to the first ':' in the text
+      $translated =~ s/.*?:\s*</</s;
+      # 2) remove any <attributes> till the first text
+      $translated =~ s/(<[^>]*>\s*)*//s;
+      # 3) remove the first trailing <attribute> and everything after it
+      $translated =~ s/<.*//s;
+
+      # look for contents of first textarea - not anymore cause > 40 char does not get one.
+      #my ($translated) = ($html =~ m{<textarea[^>]*>+([^<]*)}sx);
+      #&main::DEBUG("\"$translated\"\n===============\n");
+#       ($html =~ m{<textarea[^>]*>
+#               \s*
+#               ([^<]*)
+#               }sx);
+#         ($html =~ m{<br>
+#                         \s+
+#                             <font\ face="arial,\ helvetica">
+#                                 \s*
+#                                     (?:\*\*\s+time\ out\s+\*\*)?
+#                                         \s*
+#                                             ([^<]*)
+#                                             }sx);
+      $translated =~ s/\n/ /g;
+      $translated =~ s/\s*$//;
+      # need a way to do unicode->iso
+  } else {
+      $translated = ":("; # failure
+  }
+  &main::pSReply($translated);
+}
  
-    } else {                           # failure
-       $translated = "FAILURE w/ babelfish";
+if (0) {
+    if (-t STDIN) {
+        #my $result = babel::babelfish('en','sp','hello world');
+        #my $result = babel::babelfish('en','sp','The cheese is old and moldy, where is the bathroom?');
+        my $result = babel::babelfish('en','gr','doesn\'t seem to translate things longer than 40 characters');
+        $result =~ s/; /\n/g;
+        print "Babelfish says: \"$result\"\n";
      }
-
-    $translated        ||= "NULL reply from babelfish.";
-
-    return $translated;
  }
  
  1;
author	timriker <timriker@c11ca15a-4712-0410-83d8-924469b57eb5>
	Wed, 30 Oct 2002 10:18:35 +0000 (10:18 +0000)
committer	timriker <timriker@c11ca15a-4712-0410-83d8-924469b57eb5>
	Wed, 30 Oct 2002 10:18:35 +0000 (10:18 +0000)
files/sample/sample.config		patch \| blob \| history
src/CommandStubs.pl		patch \| blob \| history
src/Modules/babel.pl		patch \| blob \| history