+++ /dev/null
-# This module is part of debbugs, and is released
-# under the terms of the GPL version 2, or any later
-# version at your option.
-# See the file README and COPYING for more information.
-#
-# Copyright 2013 by Don Armstrong <don@donarmstrong.com>.
-
-package Debbugs::UTF8;
-
-=head1 NAME
-
-Debbugs::UTF8 -- Routines for handling conversion of charsets to UTF8
-
-=head1 SYNOPSIS
-
-use Debbugs::UTF8;
-
-
-=head1 DESCRIPTION
-
-This module contains routines which convert from various different
-charsets to UTF8.
-
-=head1 FUNCTIONS
-
-=cut
-
-use warnings;
-use strict;
-use vars qw($VERSION $DEBUG %EXPORT_TAGS @EXPORT_OK @EXPORT);
-use Exporter qw(import);
-
-BEGIN{
- $VERSION = 1.00;
- $DEBUG = 0 unless defined $DEBUG;
-
- %EXPORT_TAGS = (utf8 => [qw(encode_utf8_structure encode_utf8_safely),
- qw(convert_to_utf8 decode_utf8_safely)],
- );
- @EXPORT = (@{$EXPORT_TAGS{utf8}});
- @EXPORT_OK = ();
- Exporter::export_ok_tags(keys %EXPORT_TAGS);
- $EXPORT_TAGS{all} = [@EXPORT_OK];
-}
-
-use Carp;
-$Carp::Verbose = 1;
-
-use Encode qw(encode_utf8 is_utf8 decode decode_utf8);
-use Text::Iconv;
-use Storable qw(dclone);
-
-
-=head1 UTF-8
-
-These functions are exported with the :utf8 tag
-
-=head2 encode_utf8_structure
-
- %newdata = encode_utf8_structure(%newdata);
-
-Takes a complex data structure and encodes any strings with is_utf8
-set into their constituent octets.
-
-=cut
-
-our $depth = 0;
-sub encode_utf8_structure {
- ++$depth;
- my @ret;
- for $_ (@_) {
- if (ref($_) eq 'HASH') {
- push @ret, {encode_utf8_structure(%{$depth == 1 ? dclone($_):$_})};
- }
- elsif (ref($_) eq 'ARRAY') {
- push @ret, [encode_utf8_structure(@{$depth == 1 ? dclone($_):$_})];
- }
- elsif (ref($_)) {
- # we don't know how to handle non hash or non arrays
- push @ret,$_;
- }
- else {
- push @ret,encode_utf8_safely($_);
- }
- }
- --$depth;
- return @ret;
-}
-
-=head2 encode_utf8_safely
-
- $octets = encode_utf8_safely($string);
-
-Given a $string, returns the octet equivalent of $string if $string is
-in perl's internal encoding; otherwise returns $string.
-
-Silently returns REFs without encoding them. [If you want to deeply
-encode REFs, see encode_utf8_structure.]
-
-=cut
-
-
-sub encode_utf8_safely{
- my @ret;
- for my $r (@_) {
- if (not ref($r) and is_utf8($r)) {
- $r = encode_utf8($r);
- }
- push @ret,$r;
- }
- return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]);
-}
-
-=head2 decode_utf8_safely
-
- $string = decode_utf8_safely($octets);
-
-Given $octets in UTF8, returns the perl-internal equivalent of $octets
-if $octets does not have is_utf8 set; otherwise returns $octets.
-
-Silently returns REFs without encoding them.
-
-=cut
-
-
-sub decode_utf8_safely{
- my @ret;
- for my $r (@_) {
- if (not ref($r) and not is_utf8($r)) {
- $r = decode_utf8($r);
- }
- push @ret, $r;
- }
- return wantarray ? @ret : (@_ > 1 ? @ret : $ret[0]);
-}
-
-
-
-
-=head2 convert_to_utf8
-
- $utf8 = convert_to_utf8("text","charset");
-
-=cut
-
-sub convert_to_utf8 {
- my ($data,$charset,$internal_call) = @_;
- $internal_call //= 0;
- if (is_utf8($data)) {
- cluck("utf8 flag is set when calling convert_to_utf8");
- return $data;
- }
- $charset = uc($charset//'UTF-8');
- if ($charset eq 'RAW') {
- croak("Charset must not be raw when calling convert_to_utf8");
- }
- ## if the charset is unknown or unknown 8 bit, assume that it's UTF-8.
- if ($charset =~ /unknown/i) {
- $charset = 'UTF-8'
- }
- my $iconv_converter;
- eval {
- $iconv_converter = Text::Iconv->new($charset,"UTF-8") or
- die "Unable to create converter for '$charset'";
- };
- if ($@) {
- return undef if $internal_call;
- warn $@;
- # We weren't able to create the converter, so use Encode
- # instead
- return __fallback_convert_to_utf8($data,$charset);
- }
- my $converted_data = $iconv_converter->convert($data);
- # if the conversion failed, retval will be undefined or perhaps
- # -1.
- my $retval = $iconv_converter->retval();
- if (not defined $retval or
- $retval < 0
- ) {
- # try iso8559-1 first
- if (not $internal_call) {
- my $call_back_data = convert_to_utf8($data,'ISO8859-1',1);
- # if there's an à (0xC3), it's probably something
- # horrible, and we shouldn't try to convert it.
- if (defined $call_back_data and $call_back_data !~ /\x{C3}/) {
- return $call_back_data;
- }
- }
- # Fallback to encode, which will probably also fail.
- return __fallback_convert_to_utf8($data,$charset);
- }
- return decode("UTF-8",$converted_data);
-}
-
-# this returns data in perl's internal encoding
-sub __fallback_convert_to_utf8 {
- my ($data, $charset) = @_;
- # raw data just gets returned (that's the charset WordDecorder
- # uses when it doesn't know what to do)
- return $data if $charset eq 'raw';
- if (not defined $charset and not is_utf8($data)) {
- warn ("Undefined charset, and string '$data' is not in perl's internal encoding");
- return $data;
- }
- # lets assume everything that doesn't have a charset is utf8
- $charset //= 'utf8';
- ## if the charset is unknown, assume it's UTF-8
- if ($charset =~ /unknown/i) {
- $charset = 'utf8';
- }
- my $result;
- eval {
- $result = decode($charset,$data,0);
- };
- if ($@) {
- warn "Unable to decode charset; '$charset' and '$data': $@";
- return $data;
- }
- return $result;
-}
-
-
-
-1;
-
-__END__