From a1cda318c9d4c54e1ad5882dc171b8b5a36660ae Mon Sep 17 00:00:00 2001 From: timriker Date: Fri, 16 May 2008 19:41:58 +0000 Subject: [PATCH] babelfish: force utf8 git-svn-id: https://svn.code.sf.net/p/infobot/code/trunk@1819 c11ca15a-4712-0410-83d8-924469b57eb5 --- src/Modules/babelfish.pl | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/Modules/babelfish.pl b/src/Modules/babelfish.pl index ed3ea1b..d1d17ee 100644 --- a/src/Modules/babelfish.pl +++ b/src/Modules/babelfish.pl @@ -74,11 +74,8 @@ sub babelfishParam { my $req = HTTP::Request->new( 'POST', $url ); - # babelfish ignored this, but it SHOULD work - # Accept-Charset: iso-8859-1 - # $req->header('Accept-Charset' => 'iso-8859-1'); - # print $req->header('Accept-Charset'); - $req->header( 'Accept-Language' => 'en' ); + $req->header('Accept-Language' => 'en-us'); + $req->header('Accept-Charset' => 'UTF-8,*'); $req->content_type('application/x-www-form-urlencoded'); return translate( $phrase, "${from}_${to}", $req, $ua ); @@ -103,19 +100,21 @@ sub translate { ($translated) = $html; # strip page head $translated =~ s/.*<\/head>//sg; - &::DEBUG("================================\n$translated\n========================\n"); + # clean before doc-body + $translated =~ s/.*
]*>//sg; + # clean after first form + $translated =~ s/<\/form>.*//sg; # convert back to spaces $translated =~ s/ / /sg; - # strip multiple whitespace - $translated =~ s/\s+/ /sg; + &::DEBUG("================================\n$translated\n========================\n"); # strip up to result $translated =~ s/.*
//sg; # strip rest of page $translated =~ s/<\/div.*//sg; # strip all markup - $translated =~ s/<[^>]*>//sg; + $translated =~ s/<[^>]*>/ /sg; # \n to space - $translated =~ s/\n/ /g; + $translated =~ s/[\n\r\t]/ /g; # strip multiple whitespace $translated =~ s/\s+/ /sg; -- 2.39.5