Joomla Platform  13.1
Documentation des API du framework Joomla Platform
 Tout Classes Espaces de nommage Fichiers Fonctions Variables Pages
string.php
Aller à la documentation de ce fichier.
1 <?php
2 /**
3  * @package Joomla.Platform
4  * @subpackage String
5  *
6  * @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
7  * @license GNU General Public License version 2 or later; see LICENSE
8  */
9 
10 defined('JPATH_PLATFORM') or die;
11 
// PHP mbstring and iconv local configuration

// If the mbstring extension is loaded, switch its encodings to UTF-8.
// (Nothing here tries to load the extension when it is missing.)
if (extension_loaded('mbstring'))
{
	// Make sure to suppress the output in case ini_set is disabled
	@ini_set('mbstring.internal_encoding', 'UTF-8');
	@ini_set('mbstring.http_input', 'UTF-8');
	@ini_set('mbstring.http_output', 'UTF-8');
}

// Same for iconv
if (function_exists('iconv'))
{
	if (version_compare(PHP_VERSION, '5.6', '>='))
	{
		// iconv_set_encoding() raises E_DEPRECATED for these settings as of PHP 5.6;
		// the supported way to select the encoding there is default_charset.
		@ini_set('default_charset', 'UTF-8');
	}
	else
	{
		// These are settings that can be set inside code
		iconv_set_encoding("internal_encoding", "UTF-8");
		iconv_set_encoding("input_encoding", "UTF-8");
		iconv_set_encoding("output_encoding", "UTF-8");
	}
}
31 
32 /**
33  * Include the utf8 package
34  */
35 jimport('phputf8.utf8');
36 jimport('phputf8.strcasecmp');
37 
/**
 * String handling class for UTF-8 data.
 *
 * Static facade that wraps the phputf8 library and adds a few helpers that
 * rely on PHP itself (increment, valid, compliant, parse_url, transcode).
 * All functions assume the validity of the UTF-8 strings they are given.
 *
 * @package     Joomla.Platform
 * @subpackage  String
 * @since       11.1
 */
abstract class JString
{
	/**
	 * Increment styles.
	 *
	 * Each style is a two element array: element 0 holds the regular
	 * expression(s) and element 1 the sprintf format(s). Either element may be
	 * a single string (used for both purposes) or a pair. For the patterns the
	 * pair is array(search, replace); for the formats it is
	 * array(format for a new suffix, format for an existing suffix).
	 *
	 * @var    array
	 * @since  11.3
	 */
	protected static $incrementStyles = array(
		'dash' => array(
			'#-(\d+)$#',
			'-%d'
		),
		'default' => array(
			array('#\((\d+)\)$#', '#\(\d+\)$#'),
			array(' (%d)', '(%d)'),
		),
	);

	/**
	 * Split a string in camel case format
	 *
	 * "FooBarABCDef" becomes array("Foo", "Bar", "ABC", "Def")
	 * "JFooBar" becomes array("J", "Foo", "Bar")
	 * "J001FooBar002" becomes array("J001", "Foo", "Bar002")
	 * "abcDef" becomes array("abc", "Def")
	 * "abc_defGhi_Jkl" becomes array("abc_def", "Ghi_Jkl")
	 * "ThisIsA_NASAAstronaut" becomes array("This", "Is", "A_NASA", "Astronaut")
	 * "JohnFitzgerald_Kennedy" becomes array("John", "Fitzgerald_Kennedy")
	 *
	 * This method only logs a deprecation warning and delegates to
	 * JStringNormalise::fromCamelCase().
	 *
	 * @param   string  $string  The source string.
	 *
	 * @return  array   The split string.
	 *
	 * @deprecated  12.3 (Platform) & 4.0 (CMS) - Use JStringNormalise::fromCamelCase()
	 * @since   11.3
	 */
	public static function splitCamelCase($string)
	{
		JLog::add('JString::splitCamelCase has been deprecated. Use JStringNormalise::fromCamelCase.', JLog::WARNING, 'deprecated');

		return JStringNormalise::fromCamelCase($string, true);
	}

	/**
	 * Increments a trailing number in a string.
	 *
	 * Used to easily create distinct labels when copying objects. The method has the following styles:
	 *
	 * default: "Label" becomes "Label (2)"
	 * dash:    "Label" becomes "Label-2"
	 *
	 * An unknown style name falls back to the default style.
	 *
	 * @param   string   $string  The source string.
	 * @param   string   $style   The style (default|dash).
	 * @param   integer  $n       If supplied (and not empty), this number is used for the copy, otherwise it is the 'next' number.
	 *
	 * @return  string  The incremented string.
	 *
	 * @since   11.3
	 */
	public static function increment($string, $style = 'default', $n = 0)
	{
		$styleSpec = isset(self::$incrementStyles[$style]) ? self::$incrementStyles[$style] : self::$incrementStyles['default'];

		// Regular expression search and replace patterns.
		if (is_array($styleSpec[0]))
		{
			$rxSearch  = $styleSpec[0][0];
			$rxReplace = $styleSpec[0][1];
		}
		else
		{
			$rxSearch = $rxReplace = $styleSpec[0];
		}

		// New and old (existing) sprintf formats.
		if (is_array($styleSpec[1]))
		{
			$newFormat = $styleSpec[1][0];
			$oldFormat = $styleSpec[1][1];
		}
		else
		{
			$newFormat = $oldFormat = $styleSpec[1];
		}

		// No existing suffix: append a new one, starting at 2 unless told otherwise.
		if (!preg_match($rxSearch, $string, $matches))
		{
			$n = empty($n) ? 2 : $n;

			return $string . sprintf($newFormat, $n);
		}

		// Existing suffix: replace it with the next (or the requested) number.
		$n = empty($n) ? ($matches[1] + 1) : $n;

		return preg_replace($rxReplace, sprintf($oldFormat, $n), $string);
	}

	/**
	 * UTF-8 aware alternative to strpos.
	 *
	 * Find position of first occurrence of a string.
	 *
	 * @param   string   $str     String being examined
	 * @param   string   $search  String being searched for
	 * @param   integer  $offset  Optional, specifies the position from which the search should be performed
	 *
	 * @return  mixed  Number of characters before the first match or FALSE on failure
	 *
	 * @see     http://www.php.net/strpos
	 * @since   11.1
	 */
	public static function strpos($str, $search, $offset = false)
	{
		if ($offset === false)
		{
			return utf8_strpos($str, $search);
		}

		return utf8_strpos($str, $search, $offset);
	}

	/**
	 * UTF-8 aware alternative to strrpos
	 * Finds position of last occurrence of a string
	 *
	 * @param   string   $str     String being examined.
	 * @param   string   $search  String being searched for.
	 * @param   integer  $offset  Offset from the left of the string.
	 *
	 * @return  mixed  Number of characters before the last match or false on failure
	 *
	 * @see     http://www.php.net/strrpos
	 * @since   11.1
	 */
	public static function strrpos($str, $search, $offset = 0)
	{
		return utf8_strrpos($str, $search, $offset);
	}

	/**
	 * UTF-8 aware alternative to substr
	 * Return part of a string given character offset (and optionally length)
	 *
	 * @param   string   $str     String being processed
	 * @param   integer  $offset  Number of UTF-8 characters offset (from left)
	 * @param   integer  $length  Optional length in UTF-8 characters from offset
	 *
	 * @return  mixed  string or FALSE if failure
	 *
	 * @see     http://www.php.net/substr
	 * @since   11.1
	 */
	public static function substr($str, $offset, $length = false)
	{
		if ($length === false)
		{
			return utf8_substr($str, $offset);
		}

		return utf8_substr($str, $offset, $length);
	}

	/**
	 * UTF-8 aware alternative to strtolower
	 *
	 * Make a string lowercase
	 * Note: The concept of a character's "case" only exists in some alphabets
	 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
	 * not exist in the Chinese alphabet, for example. See Unicode Standard
	 * Annex #21: Case Mappings
	 *
	 * @param   string  $str  String being processed
	 *
	 * @return  mixed  Either string in lowercase or FALSE if UTF-8 invalid
	 *
	 * @see     http://www.php.net/strtolower
	 * @since   11.1
	 */
	public static function strtolower($str)
	{
		return utf8_strtolower($str);
	}

	/**
	 * UTF-8 aware alternative to strtoupper
	 * Make a string uppercase
	 * Note: The concept of a character's "case" only exists in some alphabets
	 * such as Latin, Greek, Cyrillic, Armenian and archaic Georgian - it does
	 * not exist in the Chinese alphabet, for example. See Unicode Standard
	 * Annex #21: Case Mappings
	 *
	 * @param   string  $str  String being processed
	 *
	 * @return  mixed  Either string in uppercase or FALSE if UTF-8 invalid
	 *
	 * @see     http://www.php.net/strtoupper
	 * @since   11.1
	 */
	public static function strtoupper($str)
	{
		return utf8_strtoupper($str);
	}

	/**
	 * UTF-8 aware alternative to strlen.
	 *
	 * Returns the number of characters in the string (NOT THE NUMBER OF BYTES).
	 *
	 * @param   string  $str  UTF-8 string.
	 *
	 * @return  integer  Number of UTF-8 characters in string.
	 *
	 * @see     http://www.php.net/strlen
	 * @since   11.1
	 */
	public static function strlen($str)
	{
		return utf8_strlen($str);
	}

	/**
	 * UTF-8 aware alternative to str_ireplace
	 * Case-insensitive version of str_replace
	 *
	 * @param   string   $search   String to search for
	 * @param   string   $replace  String to replace each match with
	 * @param   string   $str      String in which the replacement takes place
	 * @param   integer  $count    Optional value forwarded to utf8_ireplace(). The parameter is NOT declared
	 *                             by reference here, so nothing is written back to the caller's variable.
	 *
	 * @return  string  UTF-8 String
	 *
	 * @see     http://www.php.net/str_ireplace
	 * @since   11.1
	 */
	public static function str_ireplace($search, $replace, $str, $count = null)
	{
		jimport('phputf8.str_ireplace');

		// The declared default is null; false is still accepted for callers that pass it explicitly.
		if ($count === null || $count === false)
		{
			return utf8_ireplace($search, $replace, $str);
		}

		return utf8_ireplace($search, $replace, $str, $count);
	}

	/**
	 * UTF-8 aware alternative to str_split
	 * Convert a string to an array
	 *
	 * @param   string   $str        UTF-8 encoded string to process
	 * @param   integer  $split_len  Number of characters to split string by
	 *
	 * @return  array
	 *
	 * @see     http://www.php.net/str_split
	 * @since   11.1
	 */
	public static function str_split($str, $split_len = 1)
	{
		jimport('phputf8.str_split');

		return utf8_str_split($str, $split_len);
	}

	/**
	 * UTF-8/LOCALE aware alternative to strcasecmp
	 * A case insensitive string comparison
	 *
	 * NOTE: when $locale is given, setlocale(LC_COLLATE, ...) is called and the
	 * previous collation locale is not restored afterwards.
	 *
	 * @param   string  $str1    string 1 to compare
	 * @param   string  $str2    string 2 to compare
	 * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
	 *
	 * @return  integer   < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
	 *
	 * @see     http://www.php.net/strcasecmp
	 * @see     http://www.php.net/strcoll
	 * @see     http://www.php.net/setlocale
	 * @since   11.1
	 */
	public static function strcasecmp($str1, $str2, $locale = false)
	{
		if (!$locale)
		{
			return utf8_strcasecmp($str1, $str2);
		}

		$encoding = self::collationEncoding($locale);

		$lower1 = utf8_strtolower($str1);
		$lower2 = utf8_strtolower($str2);

		// If we successfully set encoding to UTF-8 or encoding is something weird, don't recode
		if ($encoding == 'UTF-8' || $encoding == 'nonrecodable')
		{
			return strcoll($lower1, $lower2);
		}

		return strcoll(
			self::transcode($lower1, 'UTF-8', $encoding),
			self::transcode($lower2, 'UTF-8', $encoding)
		);
	}

	/**
	 * UTF-8/LOCALE aware alternative to strcmp
	 * A case sensitive string comparison
	 *
	 * NOTE: when $locale is given, setlocale(LC_COLLATE, ...) is called and the
	 * previous collation locale is not restored afterwards.
	 *
	 * @param   string  $str1    string 1 to compare
	 * @param   string  $str2    string 2 to compare
	 * @param   mixed   $locale  The locale used by strcoll or false to use classical comparison
	 *
	 * @return  integer  < 0 if str1 is less than str2; > 0 if str1 is greater than str2, and 0 if they are equal.
	 *
	 * @see     http://www.php.net/strcmp
	 * @see     http://www.php.net/strcoll
	 * @see     http://www.php.net/setlocale
	 * @since   11.1
	 */
	public static function strcmp($str1, $str2, $locale = false)
	{
		if (!$locale)
		{
			return strcmp($str1, $str2);
		}

		$encoding = self::collationEncoding($locale);

		// If we successfully set encoding to UTF-8 or encoding is something weird, don't recode
		if ($encoding == 'UTF-8' || $encoding == 'nonrecodable')
		{
			return strcoll($str1, $str2);
		}

		return strcoll(self::transcode($str1, 'UTF-8', $encoding), self::transcode($str2, 'UTF-8', $encoding));
	}

	/**
	 * Activates a collation locale and works out which encoding strcoll() will expect.
	 *
	 * Shared by strcasecmp() and strcmp(). Tries to switch LC_COLLATE to the
	 * requested locale; if that fails, the locale that was active before is
	 * inspected instead.
	 *
	 * @param   mixed  $locale  The locale (or list of locales) to hand to setlocale().
	 *
	 * @return  string  'UTF-8', 'CP' followed by the numeric code page found at the end of the
	 *                  locale name, or 'nonrecodable' when neither could be determined.
	 */
	private static function collationEncoding($locale)
	{
		// Get current locale
		$previous = setlocale(LC_COLLATE, 0);
		$active   = setlocale(LC_COLLATE, $locale);

		if (!$active)
		{
			$active = $previous;
		}

		// Locale names such as "xx_YY.1252" carry a numeric code page suffix
		if (!stristr($active, 'UTF-8') && stristr($active, '_') && preg_match('~\.(\d+)$~', $active, $m))
		{
			return 'CP' . $m[1];
		}

		if (stristr($active, 'UTF-8') || stristr($active, 'utf8'))
		{
			return 'UTF-8';
		}

		return 'nonrecodable';
	}

	/**
	 * UTF-8 aware alternative to strcspn
	 * Find length of initial segment not matching mask
	 *
	 * @param   string   $str     The string to process
	 * @param   string   $mask    The mask
	 * @param   integer  $start   Optional starting character position (in characters)
	 * @param   integer  $length  Optional length
	 *
	 * @return  integer  The length of the initial segment of $str which does not contain any of the characters in $mask
	 *
	 * @see     http://www.php.net/strcspn
	 * @since   11.1
	 */
	public static function strcspn($str, $mask, $start = null, $length = null)
	{
		jimport('phputf8.strcspn');

		// The declared defaults are null; false is still accepted for callers that pass it explicitly.
		$noStart  = ($start === null || $start === false);
		$noLength = ($length === null || $length === false);

		if ($noStart && $noLength)
		{
			return utf8_strcspn($str, $mask);
		}

		if ($noLength)
		{
			return utf8_strcspn($str, $mask, $start);
		}

		return utf8_strcspn($str, $mask, $start, $length);
	}

	/**
	 * UTF-8 aware alternative to stristr
	 * Returns all of haystack from the first occurrence of needle to the end.
	 * needle and haystack are examined in a case-insensitive manner
	 * Find first occurrence of a string using case insensitive comparison
	 *
	 * @param   string  $str     The haystack
	 * @param   string  $search  The needle
	 *
	 * @return  string  the sub string
	 *
	 * @see     http://www.php.net/stristr
	 * @since   11.1
	 */
	public static function stristr($str, $search)
	{
		jimport('phputf8.stristr');

		return utf8_stristr($str, $search);
	}

	/**
	 * UTF-8 aware alternative to strrev
	 * Reverse a string
	 *
	 * @param   string  $str  String to be reversed
	 *
	 * @return  string   The string in reverse character order
	 *
	 * @see     http://www.php.net/strrev
	 * @since   11.1
	 */
	public static function strrev($str)
	{
		jimport('phputf8.strrev');

		return utf8_strrev($str);
	}

	/**
	 * UTF-8 aware alternative to strspn
	 * Find length of initial segment matching mask
	 *
	 * @param   string   $str     The haystack
	 * @param   string   $mask    The mask
	 * @param   integer  $start   Start optional
	 * @param   integer  $length  Length optional
	 *
	 * @return  integer
	 *
	 * @see     http://www.php.net/strspn
	 * @since   11.1
	 */
	public static function strspn($str, $mask, $start = null, $length = null)
	{
		jimport('phputf8.strspn');

		if ($start === null && $length === null)
		{
			return utf8_strspn($str, $mask);
		}

		if ($length === null)
		{
			return utf8_strspn($str, $mask, $start);
		}

		return utf8_strspn($str, $mask, $start, $length);
	}

	/**
	 * UTF-8 aware substr_replace
	 * Replace text within a portion of a string
	 *
	 * @param   string   $str     The haystack
	 * @param   string   $repl    The replacement string
	 * @param   integer  $start   Start
	 * @param   integer  $length  Length (optional)
	 *
	 * @return  string
	 *
	 * @see     http://www.php.net/substr_replace
	 * @since   11.1
	 */
	public static function substr_replace($str, $repl, $start, $length = null)
	{
		// No jimport() here: utf8_substr_replace() is expected to be loaded already by the library loader.
		// The declared default is null; false is still accepted for callers that pass it explicitly.
		if ($length === null || $length === false)
		{
			return utf8_substr_replace($str, $repl, $start);
		}

		return utf8_substr_replace($str, $repl, $start, $length);
	}

	/**
	 * UTF-8 aware replacement for ltrim()
	 *
	 * Strip whitespace (or other characters) from the beginning of a string
	 * You only need to use this if you are supplying the charlist
	 * optional arg and it contains UTF-8 characters. Otherwise ltrim will
	 * work normally on a UTF-8 string
	 *
	 * @param   string  $str       The string to be trimmed
	 * @param   string  $charlist  The optional charlist of additional characters to trim.
	 *                             An empty string or null means "nothing to trim" and returns $str unchanged.
	 *
	 * @return  string  The trimmed string
	 *
	 * @see     http://www.php.net/ltrim
	 * @since   11.1
	 */
	public static function ltrim($str, $charlist = false)
	{
		// Compared explicitly instead of with empty(), which would also reject the valid charlist '0'.
		if ($charlist === '' || $charlist === null)
		{
			return $str;
		}

		jimport('phputf8.trim');

		if ($charlist === false)
		{
			return utf8_ltrim($str);
		}

		return utf8_ltrim($str, $charlist);
	}

	/**
	 * UTF-8 aware replacement for rtrim()
	 * Strip whitespace (or other characters) from the end of a string
	 * You only need to use this if you are supplying the charlist
	 * optional arg and it contains UTF-8 characters. Otherwise rtrim will
	 * work normally on a UTF-8 string
	 *
	 * @param   string  $str       The string to be trimmed
	 * @param   string  $charlist  The optional charlist of additional characters to trim.
	 *                             An empty string or null means "nothing to trim" and returns $str unchanged.
	 *
	 * @return  string  The trimmed string
	 *
	 * @see     http://www.php.net/rtrim
	 * @since   11.1
	 */
	public static function rtrim($str, $charlist = false)
	{
		// Compared explicitly instead of with empty(), which would also reject the valid charlist '0'.
		if ($charlist === '' || $charlist === null)
		{
			return $str;
		}

		jimport('phputf8.trim');

		if ($charlist === false)
		{
			return utf8_rtrim($str);
		}

		return utf8_rtrim($str, $charlist);
	}

	/**
	 * UTF-8 aware replacement for trim()
	 * Strip whitespace (or other characters) from the beginning and end of a string
	 * Note: you only need to use this if you are supplying the charlist
	 * optional arg and it contains UTF-8 characters. Otherwise trim will
	 * work normally on a UTF-8 string
	 *
	 * @param   string  $str       The string to be trimmed
	 * @param   string  $charlist  The optional charlist of additional characters to trim.
	 *                             An empty string or null means "nothing to trim" and returns $str unchanged.
	 *
	 * @return  string  The trimmed string
	 *
	 * @see     http://www.php.net/trim
	 * @since   11.1
	 */
	public static function trim($str, $charlist = false)
	{
		// Compared explicitly instead of with empty(), which would also reject the valid charlist '0'.
		if ($charlist === '' || $charlist === null)
		{
			return $str;
		}

		jimport('phputf8.trim');

		if ($charlist === false)
		{
			return utf8_trim($str);
		}

		return utf8_trim($str, $charlist);
	}

	/**
	 * UTF-8 aware alternative to ucfirst
	 * Make a string's first character uppercase or all words' first character uppercase
	 *
	 * @param   string  $str           String to be processed
	 * @param   string  $delimiter     The words delimiter (null means do not split the string)
	 * @param   string  $newDelimiter  The new words delimiter (null means equal to $delimiter)
	 *
	 * @return  string  If $delimiter is null, return the string with first character as upper case (if applicable)
	 *                  else consider the string of words separated by the delimiter, apply the ucfirst to each words
	 *                  and return the string with the new delimiter
	 *
	 * @see     http://www.php.net/ucfirst
	 * @since   11.1
	 */
	public static function ucfirst($str, $delimiter = null, $newDelimiter = null)
	{
		jimport('phputf8.ucfirst');

		if ($delimiter === null)
		{
			return utf8_ucfirst($str);
		}

		if ($newDelimiter === null)
		{
			$newDelimiter = $delimiter;
		}

		return implode($newDelimiter, array_map('utf8_ucfirst', explode($delimiter, $str)));
	}

	/**
	 * UTF-8 aware alternative to ucwords
	 * Uppercase the first character of each word in a string
	 *
	 * @param   string  $str  String to be processed
	 *
	 * @return  string  String with first char of each word uppercase
	 *
	 * @see     http://www.php.net/ucwords
	 * @since   11.1
	 */
	public static function ucwords($str)
	{
		jimport('phputf8.ucwords');

		return utf8_ucwords($str);
	}

	/**
	 * Transcode a string.
	 *
	 * The iconv suffix differs per iconv implementation (see the linked PHP bug);
	 * on glibc the call is additionally silenced with @.
	 *
	 * @param   string  $source         The string to transcode.
	 * @param   string  $from_encoding  The source encoding.
	 * @param   string  $to_encoding    The target encoding.
	 *
	 * @return  mixed  The transcoded string, or null if the source was not a string.
	 *
	 * @link    https://bugs.php.net/bug.php?id=48147
	 *
	 * @since   11.1
	 */
	public static function transcode($source, $from_encoding, $to_encoding)
	{
		if (!is_string($source))
		{
			return null;
		}

		switch (ICONV_IMPL)
		{
			case 'glibc':
				return @iconv($from_encoding, $to_encoding . '//TRANSLIT,IGNORE', $source);

			case 'libiconv':
			default:
				return iconv($from_encoding, $to_encoding . '//IGNORE//TRANSLIT', $source);
		}
	}

	/**
	 * Tests a string as to whether it's valid UTF-8 and supported by the Unicode standard.
	 *
	 * Note: this function has been modified to simply return true or false.
	 *
	 * Rejected input: bytes that cannot start a sequence, missing or stray
	 * continuation bytes, non-shortest forms, 5 and 6 octet sequences,
	 * surrogates, code points above 0x10FFFF, and strings that end in the
	 * middle of a multi-octet sequence.
	 *
	 * @param   string  $str  UTF-8 encoded string.
	 *
	 * @return  boolean  true if valid
	 *
	 * @author  <hsivonen@iki.fi>
	 * @see     http://hsivonen.iki.fi/php-utf8/
	 * @see     JString::compliant()
	 * @since   11.1
	 */
	public static function valid($str)
	{
		// Cached expected number of octets after the current octet
		// until the beginning of the next UTF8 character sequence
		$mState = 0;

		// Cached Unicode character
		$mUcs4 = 0;

		// Cached expected number of octets in the current sequence
		$mBytes = 1;

		$len = strlen($str);

		for ($i = 0; $i < $len; $i++)
		{
			// Square brackets: the curly brace offset syntax no longer parses on PHP 8
			$in = ord($str[$i]);

			if ($mState == 0)
			{
				// When mState is zero we expect either a US-ASCII character or a
				// multi-octet sequence.
				if (0 == (0x80 & ($in)))
				{
					// US-ASCII, pass straight through.
					$mBytes = 1;
				}
				elseif (0xC0 == (0xE0 & ($in)))
				{
					// First octet of 2 octet sequence
					$mUcs4 = ($in);
					$mUcs4 = ($mUcs4 & 0x1F) << 6;
					$mState = 1;
					$mBytes = 2;
				}
				elseif (0xE0 == (0xF0 & ($in)))
				{
					// First octet of 3 octet sequence
					$mUcs4 = ($in);
					$mUcs4 = ($mUcs4 & 0x0F) << 12;
					$mState = 2;
					$mBytes = 3;
				}
				elseif (0xF0 == (0xF8 & ($in)))
				{
					// First octet of 4 octet sequence
					$mUcs4 = ($in);
					$mUcs4 = ($mUcs4 & 0x07) << 18;
					$mState = 3;
					$mBytes = 4;
				}
				elseif (0xF8 == (0xFC & ($in)))
				{
					/* First octet of 5 octet sequence.
					 *
					 * This is illegal because the encoded codepoint must be either
					 * (a) not the shortest form or
					 * (b) outside the Unicode range of 0-0x10FFFF.
					 * Rather than trying to resynchronize, we will carry on until the end
					 * of the sequence and let the later error handling code catch it.
					 */
					$mUcs4 = ($in);
					$mUcs4 = ($mUcs4 & 0x03) << 24;
					$mState = 4;
					$mBytes = 5;
				}
				elseif (0xFC == (0xFE & ($in)))
				{
					// First octet of 6 octet sequence, see comments for 5 octet sequence.
					$mUcs4 = ($in);
					$mUcs4 = ($mUcs4 & 1) << 30;
					$mState = 5;
					$mBytes = 6;
				}
				else
				{
					/* Current octet is neither in the US-ASCII range nor a legal first
					 * octet of a multi-octet sequence.
					 */
					return false;
				}
			}
			else
			{
				// When mState is non-zero, we expect a continuation of the multi-octet
				// sequence
				if (0x80 == (0xC0 & ($in)))
				{
					// Legal continuation.
					$shift = ($mState - 1) * 6;
					$tmp = $in;
					$tmp = ($tmp & 0x0000003F) << $shift;
					$mUcs4 |= $tmp;

					/*
					 * End of the multi-octet sequence. mUcs4 now contains the final
					 * Unicode codepoint to be output
					 */
					if (0 == --$mState)
					{
						/*
						 * Check for illegal sequences and codepoints.
						 */
						// From Unicode 3.1, non-shortest form is illegal
						if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || ((3 == $mBytes) && ($mUcs4 < 0x0800)) || ((4 == $mBytes) && ($mUcs4 < 0x10000))
							|| (4 < $mBytes)
							|| (($mUcs4 & 0xFFFFF800) == 0xD800) // From Unicode 3.2, surrogate characters are illegal
							|| ($mUcs4 > 0x10FFFF)) // Codepoints outside the Unicode range are illegal
						{
							return false;
						}

						// Initialize UTF8 cache.
						$mState = 0;
						$mUcs4 = 0;
						$mBytes = 1;
					}
				}
				else
				{
					/*
					 * ((0xC0 & (*in) != 0x80) && (mState != 0))
					 * Incomplete multi-octet sequence.
					 */
					return false;
				}
			}
		}

		// Octets still outstanding here mean the string stopped in the middle of a sequence.
		return ($mState == 0);
	}

	/**
	 * Tests whether a string complies as UTF-8. This will be much
	 * faster than utf8_is_valid but will pass five and six octet
	 * UTF-8 sequences, which are not supported by Unicode and
	 * so cannot be displayed correctly in a browser. In other words
	 * it is not as strict as utf8_is_valid but it's faster. If you use
	 * it to validate user input, you place yourself at the risk that
	 * attackers will be able to inject 5 and 6 byte sequences (which
	 * may or may not be a significant risk, depending on what you
	 * are doing)
	 *
	 * @param   string  $str  UTF-8 string to check
	 *
	 * @return  boolean  TRUE if string is valid UTF-8
	 *
	 * @see     JString::valid()
	 * @see     http://www.php.net/manual/en/reference.pcre.pattern.modifiers.php#54805
	 * @since   11.1
	 */
	public static function compliant($str)
	{
		if (strlen($str) == 0)
		{
			return true;
		}

		/*
		 * If even just the first character can be matched, when the /u
		 * modifier is used, then it's valid UTF-8. If the UTF-8 is somehow
		 * invalid, nothing at all will match, even if the string contains
		 * some valid sequences
		 */
		return (preg_match('/^.{1}/us', $str) == 1);
	}

	/**
	 * Does a UTF-8 safe version of PHP parse_url function
	 *
	 * The URL is url-encoded first (with the URL delimiters turned back into
	 * literals), parsed by PHP's own parse_url(), and every resulting part is
	 * decoded again. All returned values are strings, including the port.
	 *
	 * @param   string  $url  URL to parse
	 *
	 * @return  mixed  Associative array or false if badly formed URL.
	 *
	 * @see     http://us3.php.net/manual/en/function.parse-url.php
	 * @since   11.1
	 */
	public static function parse_url($url)
	{
		// Build arrays of values we need to decode before parsing
		$entities = array('%21', '%2A', '%27', '%28', '%29', '%3B', '%3A', '%40', '%26', '%3D', '%24', '%2C', '%2F', '%3F', '%23', '%5B', '%5D');
		$replacements = array('!', '*', "'", "(", ")", ";", ":", "@", "&", "=", "$", ",", "/", "?", "#", "[", "]");

		// Create encoded URL with special URL characters decoded so it can be parsed
		// All other characters will be encoded
		$encodedURL = str_replace($entities, $replacements, urlencode($url));

		// Parse the encoded URL
		$encodedParts = parse_url($encodedURL);

		if (!$encodedParts)
		{
			return false;
		}

		// Start from a real array: writing into false is deprecated as of PHP 8.1
		$result = array();

		// Now, decode each value of the resulting array
		foreach ($encodedParts as $key => $value)
		{
			$result[$key] = urldecode(str_replace($replacements, $entities, $value));
		}

		return $result;
	}
}