X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=src%2FModules%2Fbabelfish.pl;h=920bf2e10fcc3e2a00dcc4d3e9b2a0dbfff21388;hb=d043c6d781806b2015b92c4b1e89393d21511111;hp=ed3ea1bda25b20cacbdfd22b58ca3eed777ef39c;hpb=45e51047c2df703229843193e49fb9829489170f;p=infobot.git diff --git a/src/Modules/babelfish.pl b/src/Modules/babelfish.pl index ed3ea1b..920bf2e 100644 --- a/src/Modules/babelfish.pl +++ b/src/Modules/babelfish.pl @@ -74,11 +74,8 @@ sub babelfishParam { my $req = HTTP::Request->new( 'POST', $url ); - # babelfish ignored this, but it SHOULD work - # Accept-Charset: iso-8859-1 - # $req->header('Accept-Charset' => 'iso-8859-1'); - # print $req->header('Accept-Charset'); - $req->header( 'Accept-Language' => 'en' ); + $req->header('Accept-Language' => 'en-us'); + $req->header('Accept-Charset' => 'UTF-8,*'); $req->content_type('application/x-www-form-urlencoded'); return translate( $phrase, "${from}_${to}", $req, $ua ); @@ -103,19 +100,25 @@ sub translate { ($translated) = $html; # strip page head $translated =~ s/.*<\/head>//sg; - &::DEBUG("================================\n$translated\n========================\n"); + # clean before doc-body + $translated =~ s/.*
]*>//sg; + # clean after first form + $translated =~ s/<\/form>.*//sg; # convert back to spaces $translated =~ s/ / /sg; - # strip multiple whitespace - $translated =~ s/\s+/ /sg; + &::DEBUG("================================\n$translated\n========================\n"); # strip up to result $translated =~ s/.*
//sg; # strip rest of page $translated =~ s/<\/div.*//sg; # strip all markup - $translated =~ s/<[^>]*>//sg; + $translated =~ s/<[^>]*>/ /sg; # \n to space - $translated =~ s/\n/ /g; + $translated =~ s/[\n\r\t]/ /g; + # strip leading whitespace + $translated =~ s/^\s+//sg; + # strip trailing whitespace + $translated =~ s/\s+$//sg; # strip multiple whitespace $translated =~ s/\s+/ /sg;