5 ---------------------------------------------------------------------------------
8 ---------------------------------------------------------------------------------
9 Author: Alexander Minkovsky (a_minkovsky@hotmail.com)
10 ---------------------------------------------------------------------------------
11 License: Choose the more appropriated for You - I don't care.
12 ---------------------------------------------------------------------------------
14 Class provides functionality to convert single byte strings, such as CP1251
15 to UTF-8 multibyte format and vice versa.
16 Class loads a concrete charset map, for example CP1251.
17 (Refer to ftp://ftp.unicode.org/Public/MAPPINGS/ for map files)
18 The directory containing the MAP files is predefined as a constant (UTF8_MAP_DIR).
19 Each supported charset name is mapped to the path of its MAP file.
20 ---------------------------------------------------------------------------------
22 Pass the desired charset name to the class constructor:
23 $utfConverter = new utf8("CP1251"); //defaults to ISO-8859-1.
24 or load the charset MAP using the loadCharset method like this:
25 $utfConverter->loadCharset("CP1252");
27 $res = $utfConverter->strToUtf8($str);
29 $res = $utfConverter->utf8ToStr($utf);
30 to get the needed encoding.
31 ---------------------------------------------------------------------------------
33 Rewrite or override the onError method if needed. It is the error handler used throughout the class and takes 2 parameters:
34 err_code and err_text. By default it reports the error through raise_error().
38 // Adapted to fit RoundCube
// Directory (relative path) that holds the charset .map files.
39 define("UTF8_MAP_DIR", "program/lib/encoding");
// Charset name => path of its map file inside UTF8_MAP_DIR.
// loadCharset() looks these paths up via $utf8_maps[$charset].
// NOTE(review): the statement that opens this array is not visible in this
// excerpt - confirm that the entries are assigned to $utf8_maps.
41 "CP1250" => UTF8_MAP_DIR . "/CP1250.map",
42 "CP1251" => UTF8_MAP_DIR . "/CP1251.map",
43 "CP1252" => UTF8_MAP_DIR . "/CP1252.map",
44 "CP1253" => UTF8_MAP_DIR . "/CP1253.map",
45 "CP1254" => UTF8_MAP_DIR . "/CP1254.map",
46 "CP1255" => UTF8_MAP_DIR . "/CP1255.map",
47 "CP1256" => UTF8_MAP_DIR . "/CP1256.map",
48 "CP1257" => UTF8_MAP_DIR . "/CP1257.map",
49 "CP1258" => UTF8_MAP_DIR . "/CP1258.map",
50 "ISO-8859-1" => UTF8_MAP_DIR . "/ISO-8859-1.map",
51 "ISO-8859-2" => UTF8_MAP_DIR . "/ISO-8859-2.map",
52 "ISO-8859-3" => UTF8_MAP_DIR . "/ISO-8859-3.map",
53 "ISO-8859-4" => UTF8_MAP_DIR . "/ISO-8859-4.map");
// Error code handed to onError() when a charset map file cannot be opened.
56 define("ERR_OPEN_MAP_FILE","ERR_OPEN_MAP_FILE");
// Name of the currently active charset; updated by loadCharset().
61 var $charset = "ISO-8859-1";
// Forward maps, cached per charset: $ascMap[charset][byte value] => Unicode code point.
62 var $ascMap = array();
// Reverse map of the active charset (array_flip of its forward map):
// Unicode code point => byte value.
63 var $utfMap = array();
/**
 * PHP 5 constructor (added by RoundCube).
 *
 * Loads the conversion map of the requested charset so that the
 * converter can be used right after instantiation.
 *
 * @param string $charset Charset name; "ISO-8859-1" when omitted
 */
function __construct($charset="ISO-8859-1")
{
  $this->loadCharset($charset);
}
/**
 * Old-style constructor (method named after the class), kept so that
 * existing callers keep working. It only forwards to __construct().
 *
 * @param string $charset Charset name; "ISO-8859-1" when omitted
 */
function utf8($charset="ISO-8859-1")
{
  $this->__construct($charset);
}
/**
 * Load the conversion map of $charset and make it the active charset.
 *
 * A map file is parsed only once per charset; the parsed table is cached in
 * $this->ascMap[$charset]. If the charset is unknown or its map file cannot
 * be read, onError() is called and the previously active charset stays in
 * effect.
 *
 * @param string $charset Charset name, expected to be a key of $utf8_maps
 */
function loadCharset($charset){
  // The charset => map file table is defined at file level. The excerpt this
  // was rewritten from does not show how it enters this scope; `global` is
  // assumed - confirm against the full file.
  global $utf8_maps;

  // Unknown charset names are reported like missing files instead of
  // triggering an undefined-index notice first.
  if (!isset($utf8_maps[$charset]) || !is_file($utf8_maps[$charset])) {
    $this->onError(ERR_OPEN_MAP_FILE, "Failed to open map file for $charset");
    return;
  }

  if (empty($this->ascMap[$charset])) {
    $content = file_get_contents($utf8_maps[$charset]);
    if ($content === false) {
      $this->onError(ERR_OPEN_MAP_FILE, "Failed to open map file for $charset");
      return;
    }

    // Parse line by line. The lines are never glued together, so a blank or
    // comment-only line cannot merge the entries before and after it.
    $map = array();
    foreach (explode("\n", $content) as $line) {
      // Drop the trailing "# ..." comment of the line.
      $line = preg_replace("/#.*$/", "", $line);

      // A usable entry looks like "0xNN <whitespace> 0xNNNN".
      $parts = explode('0x', $line);
      if (count($parts) < 3)
        continue;

      $asc = hexdec(substr($parts[1], 0, 2));
      $utf = hexdec(substr($parts[2], 0, 4));
      $map[$asc] = $utf;
    }

    // Always an array, even for a file without entries, so array_flip()
    // below never receives an undefined value.
    $this->ascMap[$charset] = $map;
  }

  $this->charset = $charset;
  $this->utfMap = array_flip($this->ascMap[$charset]);
}
// Error handler used by the class (loadCharset() calls it when a map file is missing).
// $err_code: error identifier such as ERR_OPEN_MAP_FILE; not used by the lines visible here.
// $err_text: human-readable message, forwarded as the 'message' entry.
// The problem is reported through raise_error() with code 500; the original
// print() output is commented out.
// NOTE(review): raise_error() is defined outside this excerpt - confirm the
// meaning of its TRUE/FALSE arguments there. One line of the array literal
// is also missing from this excerpt.
106 function onError($err_code,$err_text){
107 //print($err_code . " : " . $err_text . "<hr>\n");
108 raise_error(array('code' => 500,
110 'message' => $err_text), TRUE, FALSE);
/**
 * Translate a string from the active single byte charset to UTF-8.
 *
 * @param string $str Text encoded in the active charset
 * @return string The same text as UTF-8
 */
function strToUtf8($str)
{
  // unpack('C*') yields the byte values under the keys 1..n.
  $bytes = unpack('C*', $str);
  $total = count($bytes);

  $pos = 1;
  while ($pos <= $total) {
    // The helper replaces the byte value in place with its UTF-8 sequence.
    $this->_charToUtf8($bytes[$pos]);
    $pos++;
  }

  return implode("", $bytes);
}
/**
 * Translate a UTF-8 string to a single byte string in the active charset.
 *
 * @param string $utf UTF-8 encoded text
 * @return string The text converted character by character via _utf8ToChar()
 */
function utf8ToStr($utf){
  // unpack('C*') yields the byte values under the keys 1..n.
  $bytes = unpack('C*', $utf);
  $total = count($bytes);

  // A UTF-8 character spans a variable number of bytes, so the result is
  // built by concatenation rather than in place.
  $result = "";
  for ($pos = 1; $pos <= $total; $pos++) {
    // $pos is passed by reference: the helper moves it over the continuation
    // bytes it consumes, the loop increment then steps to the next lead byte.
    $result .= $this->_utf8ToChar($bytes, $pos);
  }

  return $result;
}
/**
 * Replace a byte value, in place, with the UTF-8 sequence of the code point
 * it maps to in the active charset.
 *
 * @param int $char In: byte value. Out: UTF-8 encoded string.
 */
function _charToUtf8(&$char){
  // A byte without a map entry resolves to code point 0, exactly what the
  // (int) cast of the missing entry produced, but without raising a notice.
  $c = isset($this->ascMap[$this->charset][$char])
     ? (int)$this->ascMap[$this->charset][$char]
     : 0;

  if ($c < 0x80) {            // 1 byte
    $char = chr($c);
  }
  else if ($c < 0x800) {      // 2 bytes
    $char = chr(0xC0 | ($c >> 6))
          . chr(0x80 | ($c & 0x3F));
  }
  else if ($c < 0x10000) {    // 3 bytes
    $char = chr(0xE0 | ($c >> 12))
          . chr(0x80 | (($c >> 6) & 0x3F))
          . chr(0x80 | ($c & 0x3F));
  }
  else if ($c < 0x200000) {   // 4 bytes
    $char = chr(0xF0 | ($c >> 18))
          . chr(0x80 | (($c >> 12) & 0x3F))
          . chr(0x80 | (($c >> 6) & 0x3F))
          . chr(0x80 | ($c & 0x3F));
  }
  // Larger values leave $char untouched; map entries are parsed from at most
  // four hex digits, so they never get that large.
}
146 //UTF-8 sequence to single byte character
// Decodes the UTF-8 sequence starting at $chars[$idx] into a code point and
// returns the byte that $this->utfMap assigns to that code point.
// $chars: array of byte values (utf8ToStr() passes the result of unpack('C*', ...)).
// $idx:   passed by reference; incremented once for every continuation byte
//         consumed, so afterwards it indexes the last byte of the sequence.
// The sequence length is derived from the lead byte alone.
147 function _utf8ToChar(&$chars, &$idx){
// NOTE(review): a valid 4-byte lead byte is 240-247; 248-255 are accepted here too.
148 if(($chars[$idx] >= 240) && ($chars[$idx] <= 255)){ // 4 bytes
// NOTE(review): ++$idx is never checked against the array size, so a truncated
// sequence reads undefined offsets. Continuation bytes are not verified to be
// in the range 128-191 either. This applies to all three branches.
149 $utf = (intval($chars[$idx]-240) << 18) +
150 (intval($chars[++$idx]-128) << 12) +
151 (intval($chars[++$idx]-128) << 6) +
152 (intval($chars[++$idx]-128) << 0);
154 else if (($chars[$idx] >= 224) && ($chars[$idx] <= 239)){ // 3 bytes
155 $utf = (intval($chars[$idx]-224) << 12) +
156 (intval($chars[++$idx]-128) << 6) +
157 (intval($chars[++$idx]-128) << 0);
159 else if (($chars[$idx] >= 192) && ($chars[$idx] <= 223)){ // 2 bytes
160 $utf = (intval($chars[$idx]-192) << 6) +
161 (intval($chars[++$idx]-128) << 0);
// NOTE(review): the branch that sets $utf for a lead byte below 192 is not
// visible in this excerpt.
// Look the code point up in the reverse map of the active charset.
166 if(array_key_exists($utf,$this->utfMap))
167 return chr($this->utfMap[$utf]);
// NOTE(review): what is returned for a code point missing from utfMap is not
// visible in this excerpt.