35 '([\x00-\x7F]'. # ASCII (including control chars)
36 '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
37 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
38 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
39 '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
40 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
41 '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
42 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
43 '|(.{1}))'; # invalid byte
46 while (preg_match(
'/'.
$UTF8_BAD.
'/S', $str, $matches)) {
47 $bytes = strlen($matches[0]);
48 if ( isset($matches[2])) {
52 $str = substr($str,$bytes);
72 '([\x00-\x7F]'. # ASCII (including control chars)
73 '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
74 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
75 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
76 '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
77 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
78 '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
79 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
80 '|(.{1}))'; # invalid byte
83 while (preg_match(
'/'.
$UTF8_BAD.
'/S', $str, $matches)) {
84 $bytes = strlen($matches[0]);
85 if ( isset($matches[2])) {
89 $str = substr($str,$bytes);
91 if ( count($badList) > 0 ) {
111 '([\x00-\x7F]'. # ASCII (including control chars)
112 '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
113 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
114 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
115 '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
116 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
117 '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
118 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
119 '|(.{1}))'; # invalid byte
121 while (preg_match(
'/'.
$UTF8_BAD.
'/S', $str, $matches)) {
122 if ( !isset($matches[2])) {
125 $str = substr($str,strlen($matches[0]));
127 $result = ob_get_contents();
148 '([\x00-\x7F]'. # ASCII (including control chars)
149 '|[\xC2-\xDF][\x80-\xBF]'. # non-overlong 2-byte
150 '|\xE0[\xA0-\xBF][\x80-\xBF]'. # excluding overlongs
151 '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'. # straight 3-byte
152 '|\xED[\x80-\x9F][\x80-\xBF]'. # excluding surrogates
153 '|\xF0[\x90-\xBF][\x80-\xBF]{2}'. # planes 1-3
154 '|[\xF1-\xF3][\x80-\xBF]{3}'. # planes 4-15
155 '|\xF4[\x80-\x8F][\x80-\xBF]{2}'. # plane 16
156 '|(.{1}))'; # invalid byte
158 while (preg_match(
'/'.
$UTF8_BAD.
'/S', $str, $matches)) {
159 if ( !isset($matches[2])) {
164 $str = substr($str,strlen($matches[0]));
166 $result = ob_get_contents();
# Integer error codes (1 through 7) for the kinds of malformed UTF-8 input.
# NOTE(review): the code that returns these values and the code that maps
# them to messages are not visible in this excerpt; the per-constant notes
# below are inferred from the constant names - confirm against the callers.

# Code 1: presumably a five-octet sequence was found.
180 define(
'UTF8_BAD_5OCTET',1);
# Code 2: presumably a six-octet sequence was found.
190 define(
'UTF8_BAD_6OCTET',2);
# Code 3: presumably an octet that cannot start a multi-byte sequence.
199 define(
'UTF8_BAD_SEQID',3);
# Code 4: presumably a non-shortest-form (overlong) encoding.
208 define(
'UTF8_BAD_NONSHORT',4);
# Code 5: presumably an encoded surrogate code point.
217 define(
'UTF8_BAD_SURROGATE',5);
# Code 6: presumably a code point outside the Unicode range.
226 define(
'UTF8_BAD_UNIOUTRANGE',6);
# Code 7: presumably a multi-octet sequence that ended before completion.
236 define(
'UTF8_BAD_SEQINCOMPLETE',7);
259 for($i = 0; $i < $len; $i++) {
267 if (0 == (0x80 & ($in))) {
271 }
else if (0xC0 == (0xE0 & ($in))) {
274 $mUcs4 = ($mUcs4 & 0x1F) << 6;
278 }
else if (0xE0 == (0xF0 & ($in))) {
281 $mUcs4 = ($mUcs4 & 0x0F) << 12;
285 }
else if (0xF0 == (0xF8 & ($in))) {
288 $mUcs4 = ($mUcs4 & 0x07) << 18;
292 }
else if (0xF8 == (0xFC & ($in))) {
303 }
else if (0xFC == (0xFE & ($in))) {
319 if (0x80 == (0xC0 & ($in))) {
322 $shift = ($mState - 1) * 6;
324 $tmp = ($tmp & 0x0000003F) << $shift;
331 if (0 == --$mState) {
334 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
335 ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
336 ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
340 }
else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
344 }
else if ($mUcs4 > 0x10FFFF) {
363 if ( $mState != 0 ) {
389 return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
393 return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
397 return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
401 return 'From Unicode 3.1, non-shortest form is illegal';
405 return 'From Unicode 3.2, surrogate characters are illegal';
409 return 'Codepoints outside the Unicode range are illegal';
413 return 'Incomplete multi-octet sequence';
418 trigger_error(
'Unknown error code: '.$code,E_USER_WARNING);