Joomla Platform  13.1
Documentation des API du framework Joomla Platform
 Tout Classes Espaces de nommage Fichiers Fonctions Variables Pages
bad.php
Aller à la documentation de ce fichier.
1 <?php
2 /**
3 * @version $Id$
4 * Tools for locating / replacing bad bytes in UTF-8 strings
5 * The Original Code is Mozilla Communicator client code.
6 * The Initial Developer of the Original Code is
7 * Netscape Communications Corporation.
8 * Portions created by the Initial Developer are Copyright (C) 1998
9 * the Initial Developer. All Rights Reserved.
10 * Ported to PHP by Henri Sivonen (http://hsivonen.iki.fi)
11 * Slight modifications to fit with phputf8 library by Harry Fuecks (hfuecks gmail com)
12 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUTF8ToUnicode.cpp
13 * @see http://lxr.mozilla.org/seamonkey/source/intl/uconv/src/nsUnicodeToUTF8.cpp
14 * @see http://hsivonen.iki.fi/php-utf8/
15 * @package utf8
16 * @subpackage bad
17 * @see utf8_is_valid
18 */
19 
20 //--------------------------------------------------------------------
/**
* Locates the first bad byte in a UTF-8 string, returning its
* byte index in the string.
*
* The string is consumed one well-formed UTF-8 character at a time using a
* PCRE pattern derived from the W3 FAQ "Multilingual Forms" (modified to
* accept the full ASCII range, including control characters). The last
* alternative of the pattern captures any single byte that could not be
* read as part of a well-formed sequence; that byte is the "bad" byte.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to scan
* @return mixed integer byte index of the first bad byte, or FALSE if none found
* @package utf8
* @subpackage bad
*/
function utf8_bad_find($str) {
    $UTF8_BAD =
        '([\x00-\x7F]'.                         // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.              // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.          // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.          // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.           // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       // plane 16
        '|(.{1}))';                             // invalid byte

    // \G pins each match to the current offset, so the subject string never
    // has to be re-copied with substr() after every character.
    $pattern = '/\G'.$UTF8_BAD.'/S';
    $len = strlen($str);
    $pos = 0;

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 only participates when the "invalid byte" alternative matched.
        if (isset($matches[2])) {
            return $pos;
        }
        $pos += strlen($matches[0]);
    }

    return FALSE;
}
56 
57 //--------------------------------------------------------------------
/**
* Locates all bad bytes in a UTF-8 string and returns a list of their
* byte indexes in the string.
*
* The string is consumed one well-formed UTF-8 character at a time using a
* PCRE pattern derived from the W3 FAQ "Multilingual Forms" (modified to
* accept the full ASCII range, including control characters). Every single
* byte that cannot be read as part of a well-formed sequence is recorded.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to scan
* @return mixed array of integer byte indexes, or FALSE if no bad byte found
* @package utf8
* @subpackage bad
*/
function utf8_bad_findall($str) {
    $UTF8_BAD =
        '([\x00-\x7F]'.                         // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.              // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.          // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.          // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.           // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       // plane 16
        '|(.{1}))';                             // invalid byte

    // \G pins each match to the current offset, so the subject string never
    // has to be re-copied with substr() after every character.
    $pattern = '/\G'.$UTF8_BAD.'/S';
    $len = strlen($str);
    $pos = 0;
    $badList = array();

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 only participates when the "invalid byte" alternative matched.
        if (isset($matches[2])) {
            $badList[] = $pos;
        }
        $pos += strlen($matches[0]);
    }

    if (count($badList) > 0) {
        return $badList;
    }
    return FALSE;
}
96 
97 //--------------------------------------------------------------------
/**
* Strips out any bad bytes from a UTF-8 string and returns the rest.
*
* The string is consumed one well-formed UTF-8 character at a time using a
* PCRE pattern derived from the W3 FAQ "Multilingual Forms" (modified to
* accept the full ASCII range, including control characters). Well-formed
* characters are copied to the result; any single byte that cannot be read
* as part of a well-formed sequence is dropped.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to clean
* @return string the input with all bad bytes removed
* @package utf8
* @subpackage bad
*/
function utf8_bad_strip($str) {
    $UTF8_BAD =
        '([\x00-\x7F]'.                         // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.              // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.          // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.          // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.           // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       // plane 16
        '|(.{1}))';                             // invalid byte

    // \G pins each match to the current offset, so the subject string never
    // has to be re-copied with substr() after every character.
    $pattern = '/\G'.$UTF8_BAD.'/S';
    $len = strlen($str);
    $pos = 0;

    // The result is accumulated in a plain string rather than through
    // ob_start()/echo, so the function does not touch the output buffer stack.
    $result = '';

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 only participates when the "invalid byte" alternative matched.
        if (!isset($matches[2])) {
            $result .= $matches[0];
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}
131 
132 //--------------------------------------------------------------------
/**
* Replaces bad bytes with an alternative character. An ASCII character is
* recommended as the replacement.
*
* The string is consumed one well-formed UTF-8 character at a time using a
* PCRE pattern derived from the W3 FAQ "Multilingual Forms" (modified to
* accept the full ASCII range, including control characters). Well-formed
* characters are copied to the result; each single byte that cannot be read
* as part of a well-formed sequence is replaced by $replace.
*
* @see http://www.w3.org/International/questions/qa-forms-utf-8
* @param string $str string to search
* @param string $replace string to substitute for each bad byte (defaults to '?') - use ASCII
* @return string the input with every bad byte replaced
* @package utf8
* @subpackage bad
*/
function utf8_bad_replace($str, $replace = '?') {
    $UTF8_BAD =
        '([\x00-\x7F]'.                         // ASCII (including control chars)
        '|[\xC2-\xDF][\x80-\xBF]'.              // non-overlong 2-byte
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.          // excluding overlongs
        '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   // straight 3-byte
        '|\xED[\x80-\x9F][\x80-\xBF]'.          // excluding surrogates
        '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       // planes 1-3
        '|[\xF1-\xF3][\x80-\xBF]{3}'.           // planes 4-15
        '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       // plane 16
        '|(.{1}))';                             // invalid byte

    // \G pins each match to the current offset, so the subject string never
    // has to be re-copied with substr() after every character.
    $pattern = '/\G'.$UTF8_BAD.'/S';
    $len = strlen($str);
    $pos = 0;

    // The result is accumulated in a plain string rather than through
    // ob_start()/echo, so the function does not touch the output buffer stack.
    $result = '';

    while ($pos < $len && preg_match($pattern, $str, $matches, 0, $pos)) {
        // Group 2 only participates when the "invalid byte" alternative matched.
        if (isset($matches[2])) {
            $result .= $replace;
        } else {
            $result .= $matches[0];
        }
        $pos += strlen($matches[0]);
    }

    return $result;
}
170 
171 //--------------------------------------------------------------------
/**
* Status code returned by utf8_bad_identify() when the lead octet of a
* five octet sequence is encountered.
* Note: a five octet sequence can only encode a non-shortest form or a
* code point beyond the Unicode range, so it never represents a usable
* character.
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_5OCTET', 1);

/**
* Status code returned by utf8_bad_identify() when the lead octet of a
* six octet sequence is encountered.
* Note: as with five octet sequences, a six octet sequence never represents
* a usable Unicode character.
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_6OCTET', 2);

/**
* Status code returned by utf8_bad_identify().
* The octet cannot start a multi-byte UTF-8 sequence (and is not ASCII).
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQID', 3);

/**
* Status code returned by utf8_bad_identify().
* The sequence is not in shortest form, which is illegal from Unicode 3.1.
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_NONSHORT', 4);

/**
* Status code returned by utf8_bad_identify().
* The sequence encodes a surrogate, which is illegal from Unicode 3.2.
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SURROGATE', 5);

/**
* Status code returned by utf8_bad_identify().
* The sequence encodes a code point outside the Unicode range.
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_UNIOUTRANGE', 6);

/**
* Status code returned by utf8_bad_identify().
* A multi-octet sequence was started but not completed.
* Note: this acts as a kind of "catch-all".
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
define('UTF8_BAD_SEQINCOMPLETE', 7);
237 
238 //--------------------------------------------------------------------
/**
* Reports on the type of bad byte found in a UTF-8 string. Returns a
* status code for the first problem found and reports its position
* through the $i reference parameter.
*
* The string is decoded octet by octet with a small state machine. On
* return, $i holds the byte index at which scanning stopped: the index of
* the offending octet for most codes, or (for UTF8_BAD_SEQINCOMPLETE) the
* index of the last octet read before the sequence was found to be cut
* short. $i is set to NULL when the whole string is valid.
*
* @author <hsivonen@iki.fi>
* @param string $str UTF-8 encoded string
* @param int $i (by reference) receives the byte index of the problem, or NULL if none
* @return mixed integer constant describing problem or FALSE if valid UTF-8
* @see utf8_bad_explain
* @see http://hsivonen.iki.fi/php-utf8/
* @package utf8
* @subpackage bad
*/
function utf8_bad_identify($str, &$i) {

    $mState = 0; // cached expected number of octets after the current octet
                 // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0; // cached Unicode character
    $mBytes = 1; // cached expected number of octets in the current sequence

    $len = strlen($str);

    for ($i = 0; $i < $len; $i++) {

        // Square-bracket offset: the curly-brace form ($str{$i}) was removed
        // in PHP 8.0.
        $in = ord($str[$i]);

        if ($mState == 0) {

            // When mState is zero we expect either a US-ASCII character or
            // the lead octet of a multi-octet sequence.
            if (0 == (0x80 & $in)) {
                // US-ASCII, pass straight through.
                $mBytes = 1;

            } else if (0xC0 == (0xE0 & $in)) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;

            } else if (0xE0 == (0xF0 & $in)) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;

            } else if (0xF0 == (0xF8 & $in)) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;

            } else if (0xF8 == (0xFC & $in)) {
                /* First octet of 5 octet sequence.
                 *
                 * This is illegal because the encoded codepoint must be either
                 * (a) not the shortest form or
                 * (b) outside the Unicode range of 0-0x10FFFF.
                 */
                return UTF8_BAD_5OCTET;

            } else if (0xFC == (0xFE & $in)) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;

            } else {
                // Current octet is neither in the US-ASCII range nor a legal
                // first octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;
            }

        } else {

            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence
            if (0x80 == (0xC0 & $in)) {

                // Legal continuation: merge its six payload bits into place.
                $shift = ($mState - 1) * 6;
                $mUcs4 |= ($in & 0x0000003F) << $shift;

                // End of the multi-octet sequence. mUcs4 now contains the
                // final Unicode codepoint.
                if (0 == --$mState) {

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000))) {
                        return UTF8_BAD_NONSHORT;

                    // From Unicode 3.2, surrogate characters are illegal
                    } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
                        return UTF8_BAD_SURROGATE;

                    // Codepoints outside the Unicode range are illegal
                    } else if ($mUcs4 > 0x10FFFF) {
                        return UTF8_BAD_UNIOUTRANGE;
                    }

                    // Reset the decoder state for the next character
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }

            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence: step back to the last
                // octet that belonged to it.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }

    if ($mState != 0) {
        // The string ended in the middle of a multi-octet sequence.
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }

    // No bad octets found
    $i = NULL;
    return FALSE;
}
373 
374 //--------------------------------------------------------------------
/**
* Takes a return code from utf8_bad_identify() and returns a message
* (in English) explaining what the problem is.
*
* An unrecognised code raises an E_USER_WARNING and yields FALSE.
*
* @param int $code return code from utf8_bad_identify
* @return mixed string message or FALSE if return code unknown
* @see utf8_bad_identify
* @package utf8
* @subpackage bad
*/
function utf8_bad_explain($code) {

    switch ($code) {

        case UTF8_BAD_5OCTET:
            return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_6OCTET:
            return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';

        case UTF8_BAD_SEQID:
            return 'Invalid octet for use as start of multi-byte UTF-8 sequence';

        case UTF8_BAD_NONSHORT:
            return 'From Unicode 3.1, non-shortest form is illegal';

        case UTF8_BAD_SURROGATE:
            return 'From Unicode 3.2, surrogate characters are illegal';

        // Without this label the message below is unreachable and the code
        // is reported as unknown.
        case UTF8_BAD_UNIOUTRANGE:
            return 'Codepoints outside the Unicode range are illegal';

        // Same as above: the label is required for the message to be reachable.
        case UTF8_BAD_SEQINCOMPLETE:
            return 'Incomplete multi-octet sequence';

    }

    trigger_error('Unknown error code: '.$code,E_USER_WARNING);
    return FALSE;

}