Joomla Platform  13.1
Documentation des API du framework Joomla Platform
 Tout Classes Espaces de nommage Fichiers Fonctions Variables Pages
idna_convert.class.php
Aller à la documentation de ce fichier.
1 <?php
2 // {{{ license
3 
4 /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
5 //
6 // +----------------------------------------------------------------------+
7 // | This library is free software; you can redistribute it and/or modify |
8 // | it under the terms of the GNU Lesser General Public License as |
9 // | published by the Free Software Foundation; either version 2.1 of the |
10 // | License, or (at your option) any later version. |
11 // | |
12 // | This library is distributed in the hope that it will be useful, but |
13 // | WITHOUT ANY WARRANTY; without even the implied warranty of |
14 // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 // | Lesser General Public License for more details. |
16 // | |
17 // | You should have received a copy of the GNU Lesser General Public |
18 // | License along with this library; if not, write to the Free Software |
19 // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
20 // | USA. |
21 // +----------------------------------------------------------------------+
22 //
23 
24 // }}}
25 
26 /**
27  * Encode/decode Internationalized Domain Names.
28  *
29  * The class allows to convert internationalized domain names
30  * (see RFC 3490 for details) as they can be used with various registries worldwide
31  * to be translated between their original (localized) form and their encoded form
32  * as it will be used in the DNS (Domain Name System).
33  *
34  * The class provides two public methods, encode() and decode(), which do exactly
35  * what you would expect them to do. You are allowed to use complete domain names,
36  * simple strings and complete email addresses as well. That means, that you might
37  * use any of the following notations:
38  *
39  * - www.nörgler.com
40  * - xn--nrgler-wxa
41  * - xn--brse-5qa.xn--knrz-1ra.info
42  *
43  * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
44  * array. Unicode output is available in the same formats.
45  * You can select your preferred format via {@link set_parameter()}.
46  *
47  * ACE input and output is always expected to be ASCII.
48  *
49  * @author Matthias Sommerfeld <mso@phlylabs.de>
50  * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
51  * @version 0.5.1
52  *
53  */
55 {
56  /**
57  * Holds all relevant mapping tables, loaded from a seperate file on construct
58  * See RFC3454 for details
59  *
60  * @var array
61  * @access private
62  */
63  var $NP = array();
64 
65  // Internal settings, do not mess with them
66  var $_punycode_prefix = 'xn--';
67  var $_invalid_ucs = 0x80000000;
68  var $_max_ucs = 0x10FFFF;
69  var $_base = 36;
70  var $_tmin = 1;
71  var $_tmax = 26;
72  var $_skew = 38;
73  var $_damp = 700;
74  var $_initial_bias = 72;
75  var $_initial_n = 0x80;
76  var $_sbase = 0xAC00;
77  var $_lbase = 0x1100;
78  var $_vbase = 0x1161;
79  var $_tbase = 0x11A7;
80  var $_lcount = 19;
81  var $_vcount = 21;
82  var $_tcount = 28;
83  var $_ncount = 588; // _vcount * _tcount
84  var $_scount = 11172; // _lcount * _tcount * _vcount
85  var $_error = false;
86 
87  // See {@link set_paramter()} for details of how to change the following
88  // settings from within your script / application
89  var $_api_encoding = 'utf8'; // Default input charset is UTF-8
90  var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden
91  var $_strict_mode = false; // Behave strict or not
92 
93  // The constructor
94  function idna_convert($options = false)
95  {
96  $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;
97  if (function_exists('file_get_contents')) {
98  $this->NP = unserialize(file_get_contents(dirname(__FILE__).'/npdata.ser'));
99  } else {
100  $this->NP = unserialize(join('', file(dirname(__FILE__).'/npdata.ser')));
101  }
102  // If parameters are given, pass these to the respective method
103  if (is_array($options)) {
104  return $this->set_parameter($options);
105  }
106  return true;
107  }
108 
109  /**
110  * Sets a new option value. Available options and values:
111  * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
112  * 'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
113  * [overlong - Unicode does not allow unnecessarily long encodings of chars,
114  * to allow this, set this parameter to true, else to false;
115  * default is false.]
116  * [strict - true: strict mode, good for registration purposes - Causes errors
117  * on failures; false: loose mode, ideal for "wildlife" applications
118  * by silently ignoring errors and returning the original input instead
119  *
120  * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
121  * @param string Value to use (if parameter 1 is a string)
122  * @return boolean true on success, false otherwise
123  * @access public
124  */
125  function set_parameter($option, $value = false)
126  {
127  if (!is_array($option)) {
128  $option = array($option => $value);
129  }
130  foreach ($option as $k => $v) {
131  switch ($k) {
132  case 'encoding':
133  switch ($v) {
134  case 'utf8':
135  case 'ucs4_string':
136  case 'ucs4_array':
137  $this->_api_encoding = $v;
138  break;
139  default:
140  $this->_error('Set Parameter: Unknown parameter '.$v.' for option '.$k);
141  return false;
142  }
143  break;
144  case 'overlong':
145  $this->_allow_overlong = ($v) ? true : false;
146  break;
147  case 'strict':
148  $this->_strict_mode = ($v) ? true : false;
149  break;
150  default:
151  $this->_error('Set Parameter: Unknown option '.$k);
152  return false;
153  }
154  }
155  return true;
156  }
157 
158  /**
159  * Decode a given ACE domain name
160  * @param string Domain name (ACE string)
161  * [@param string Desired output encoding, see {@link set_parameter}]
162  * @return string Decoded Domain name (UTF-8 or UCS-4)
163  * @access public
164  */
165  function decode($input, $one_time_encoding = false)
166  {
167  // Optionally set
168  if ($one_time_encoding) {
169  switch ($one_time_encoding) {
170  case 'utf8':
171  case 'ucs4_string':
172  case 'ucs4_array':
173  break;
174  default:
175  $this->_error('Unknown encoding '.$one_time_encoding);
176  return false;
177  }
178  }
179  // Make sure to drop any newline characters around
180  $input = trim($input);
181 
182  // Negotiate input and try to determine, whether it is a plain string,
183  // an email address or something like a complete URL
184  if (strpos($input, '@')) { // Maybe it is an email address
185  // No no in strict mode
186  if ($this->_strict_mode) {
187  $this->_error('Only simple domain name parts can be handled in strict mode');
188  return false;
189  }
190  list ($email_pref, $input) = explode('@', $input, 2);
191  $arr = explode('.', $input);
192  foreach ($arr as $k => $v) {
193  if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
194  $conv = $this->_decode($v);
195  if ($conv) $arr[$k] = $conv;
196  }
197  }
198  $input = join('.', $arr);
199  $arr = explode('.', $email_pref);
200  foreach ($arr as $k => $v) {
201  if (preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $v)) {
202  $conv = $this->_decode($v);
203  if ($conv) $arr[$k] = $conv;
204  }
205  }
206  $email_pref = join('.', $arr);
207  $return = $email_pref . '@' . $input;
208  } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
209  // No no in strict mode
210  if ($this->_strict_mode) {
211  $this->_error('Only simple domain name parts can be handled in strict mode');
212  return false;
213  }
214  $parsed = parse_url($input);
215  if (isset($parsed['host'])) {
216  $arr = explode('.', $parsed['host']);
217  foreach ($arr as $k => $v) {
218  $conv = $this->_decode($v);
219  if ($conv) $arr[$k] = $conv;
220  }
221  $parsed['host'] = join('.', $arr);
222  $return =
223  (empty($parsed['scheme']) ? '' : $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://'))
224  .(empty($parsed['user']) ? '' : $parsed['user'].(empty($parsed['pass']) ? '' : ':'.$parsed['pass']).'@')
225  .$parsed['host']
226  .(empty($parsed['port']) ? '' : ':'.$parsed['port'])
227  .(empty($parsed['path']) ? '' : $parsed['path'])
228  .(empty($parsed['query']) ? '' : '?'.$parsed['query'])
229  .(empty($parsed['fragment']) ? '' : '#'.$parsed['fragment']);
230  } else { // parse_url seems to have failed, try without it
231  $arr = explode('.', $input);
232  foreach ($arr as $k => $v) {
233  $conv = $this->_decode($v);
234  $arr[$k] = ($conv) ? $conv : $v;
235  }
236  $return = join('.', $arr);
237  }
238  } else { // Otherwise we consider it being a pure domain name string
239  $return = $this->_decode($input);
240  if (!$return) $return = $input;
241  }
242  // The output is UTF-8 by default, other output formats need conversion here
243  // If one time encoding is given, use this, else the objects property
244  switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
245  case 'utf8':
246  return $return;
247  break;
248  case 'ucs4_string':
249  return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
250  break;
251  case 'ucs4_array':
252  return $this->_utf8_to_ucs4($return);
253  break;
254  default:
255  $this->_error('Unsupported output format');
256  return false;
257  }
258  }
259 
260  /**
261  * Encode a given UTF-8 domain name
262  * @param string Domain name (UTF-8 or UCS-4)
263  * [@param string Desired input encoding, see {@link set_parameter}]
264  * @return string Encoded Domain name (ACE string)
265  * @access public
266  */
267  function encode($decoded, $one_time_encoding = false)
268  {
269  // Forcing conversion of input to UCS4 array
270  // If one time encoding is given, use this, else the objects property
271  switch ($one_time_encoding ? $one_time_encoding : $this->_api_encoding) {
272  case 'utf8':
273  $decoded = $this->_utf8_to_ucs4($decoded);
274  break;
275  case 'ucs4_string':
276  $decoded = $this->_ucs4_string_to_ucs4($decoded);
277  case 'ucs4_array':
278  break;
279  default:
280  $this->_error('Unsupported input format: '.($one_time_encoding ? $one_time_encoding : $this->_api_encoding));
281  return false;
282  }
283 
284  // No input, no output, what else did you expect?
285  if (empty($decoded)) return '';
286 
287  // Anchors for iteration
288  $last_begin = 0;
289  // Output string
290  $output = '';
291  foreach ($decoded as $k => $v) {
292  // Make sure to use just the plain dot
293  switch($v) {
294  case 0x3002:
295  case 0xFF0E:
296  case 0xFF61:
297  $decoded[$k] = 0x2E;
298  // Right, no break here, the above are converted to dots anyway
299  // Stumbling across an anchoring character
300  case 0x2E:
301  case 0x2F:
302  case 0x3A:
303  case 0x3F:
304  case 0x40:
305  // Neither email addresses nor URLs allowed in strict mode
306  if ($this->_strict_mode) {
307  $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
308  return false;
309  } else {
310  // Skip first char
311  if ($k) {
312  $encoded = '';
313  $encoded = $this->_encode(array_slice($decoded, $last_begin, (($k)-$last_begin)));
314  if ($encoded) {
315  $output .= $encoded;
316  } else {
317  $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($k)-$last_begin)));
318  }
319  $output .= chr($decoded[$k]);
320  }
321  $last_begin = $k + 1;
322  }
323  }
324  }
325  // Catch the rest of the string
326  if ($last_begin) {
327  $inp_len = sizeof($decoded);
328  $encoded = '';
329  $encoded = $this->_encode(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
330  if ($encoded) {
331  $output .= $encoded;
332  } else {
333  $output .= $this->_ucs4_to_utf8(array_slice($decoded, $last_begin, (($inp_len)-$last_begin)));
334  }
335  return $output;
336  } else {
337  if ($output = $this->_encode($decoded)) {
338  return $output;
339  } else {
340  return $this->_ucs4_to_utf8($decoded);
341  }
342  }
343  }
344 
345  /**
346  * Use this method to get the last error ocurred
347  * @param void
348  * @return string The last error, that occured
349  * @access public
350  */
351  function get_last_error()
352  {
353  return $this->_error;
354  }
355 
356  /**
357  * The actual decoding algorithm
358  * @access private
359  */
360  function _decode($encoded)
361  {
362  // We do need to find the Punycode prefix
363  if (!preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $encoded)) {
364  $this->_error('This is not a punycode string');
365  return false;
366  }
367  $encode_test = preg_replace('!^'.preg_quote($this->_punycode_prefix, '!').'!', '', $encoded);
368  // If nothing left after removing the prefix, it is hopeless
369  if (!$encode_test) {
370  $this->_error('The given encoded string was empty');
371  return false;
372  }
373  // Find last occurence of the delimiter
374  $delim_pos = strrpos($encoded, '-');
375  if ($delim_pos > strlen($this->_punycode_prefix)) {
376  for ($k = strlen($this->_punycode_prefix); $k < $delim_pos; ++$k) {
377  $decoded[] = ord($encoded{$k});
378  }
379  } else {
380  $decoded = array();
381  }
382  $deco_len = count($decoded);
383  $enco_len = strlen($encoded);
384 
385  // Wandering through the strings; init
386  $is_first = true;
387  $bias = $this->_initial_bias;
388  $idx = 0;
389  $char = $this->_initial_n;
390 
391  for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
392  for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
393  $digit = $this->_decode_digit($encoded{$enco_idx++});
394  $idx += $digit * $w;
395  $t = ($k <= $bias) ? $this->_tmin :
396  (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
397  if ($digit < $t) break;
398  $w = (int) ($w * ($this->_base - $t));
399  }
400  $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
401  $is_first = false;
402  $char += (int) ($idx / ($deco_len + 1));
403  $idx %= ($deco_len + 1);
404  if ($deco_len > 0) {
405  // Make room for the decoded char
406  for ($i = $deco_len; $i > $idx; $i--) {
407  $decoded[$i] = $decoded[($i - 1)];
408  }
409  }
410  $decoded[$idx++] = $char;
411  }
412  return $this->_ucs4_to_utf8($decoded);
413  }
414 
415  /**
416  * The actual encoding algorithm
417  * @access private
418  */
419  function _encode($decoded)
420  {
421  // We cannot encode a domain name containing the Punycode prefix
422  $extract = strlen($this->_punycode_prefix);
423  $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
424  $check_deco = array_slice($decoded, 0, $extract);
425 
426  if ($check_pref == $check_deco) {
427  $this->_error('This is already a punycode string');
428  return false;
429  }
430  // We will not try to encode strings consisting of basic code points only
431  $encodable = false;
432  foreach ($decoded as $k => $v) {
433  if ($v > 0x7a) {
434  $encodable = true;
435  break;
436  }
437  }
438  if (!$encodable) {
439  $this->_error('The given string does not contain encodable chars');
440  return false;
441  }
442 
443  // Do NAMEPREP
444  $decoded = $this->_nameprep($decoded);
445  if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed
446 
447  $deco_len = count($decoded);
448  if (!$deco_len) return false; // Empty array
449 
450  $codecount = 0; // How many chars have been consumed
451 
452  $encoded = '';
453  // Copy all basic code points to output
454  for ($i = 0; $i < $deco_len; ++$i) {
455  $test = $decoded[$i];
456  // Will match [-0-9a-zA-Z]
457  if ((0x2F < $test && $test < 0x40) || (0x40 < $test && $test < 0x5B)
458  || (0x60 < $test && $test <= 0x7B) || (0x2D == $test)) {
459  $encoded .= chr($decoded[$i]);
460  $codecount++;
461  }
462  }
463  if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones
464 
465  // Start with the prefix; copy it to output
466  $encoded = $this->_punycode_prefix.$encoded;
467 
468  // If we have basic code points in output, add an hyphen to the end
469  if ($codecount) $encoded .= '-';
470 
471  // Now find and encode all non-basic code points
472  $is_first = true;
473  $cur_code = $this->_initial_n;
474  $bias = $this->_initial_bias;
475  $delta = 0;
476  while ($codecount < $deco_len) {
477  // Find the smallest code point >= the current code point and
478  // remember the last ouccrence of it in the input
479  for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
480  if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
481  $next_code = $decoded[$i];
482  }
483  }
484 
485  $delta += ($next_code - $cur_code) * ($codecount + 1);
486  $cur_code = $next_code;
487 
488  // Scan input again and encode all characters whose code point is $cur_code
489  for ($i = 0; $i < $deco_len; $i++) {
490  if ($decoded[$i] < $cur_code) {
491  $delta++;
492  } elseif ($decoded[$i] == $cur_code) {
493  for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
494  $t = ($k <= $bias) ? $this->_tmin :
495  (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
496  if ($q < $t) break;
497  $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
498  $q = (int) (($q - $t) / ($this->_base - $t));
499  }
500  $encoded .= $this->_encode_digit($q);
501  $bias = $this->_adapt($delta, $codecount+1, $is_first);
502  $codecount++;
503  $delta = 0;
504  $is_first = false;
505  }
506  }
507  $delta++;
508  $cur_code++;
509  }
510  return $encoded;
511  }
512 
513  /**
514  * Adapt the bias according to the current code point and position
515  * @access private
516  */
517  function _adapt($delta, $npoints, $is_first)
518  {
519  $delta = intval($is_first ? ($delta / $this->_damp) : ($delta / 2));
520  $delta += intval($delta / $npoints);
521  for ($k = 0; $delta > (($this->_base - $this->_tmin) * $this->_tmax) / 2; $k += $this->_base) {
522  $delta = intval($delta / ($this->_base - $this->_tmin));
523  }
524  return intval($k + ($this->_base - $this->_tmin + 1) * $delta / ($delta + $this->_skew));
525  }
526 
527  /**
528  * Encoding a certain digit
529  * @access private
530  */
531  function _encode_digit($d)
532  {
533  return chr($d + 22 + 75 * ($d < 26));
534  }
535 
536  /**
537  * Decode a certain digit
538  * @access private
539  */
540  function _decode_digit($cp)
541  {
542  $cp = ord($cp);
543  return ($cp - 48 < 10) ? $cp - 22 : (($cp - 65 < 26) ? $cp - 65 : (($cp - 97 < 26) ? $cp - 97 : $this->_base));
544  }
545 
546  /**
547  * Internal error handling method
548  * @access private
549  */
550  function _error($error = '')
551  {
552  $this->_error = $error;
553  }
554 
555  /**
556  * Do Nameprep according to RFC3491 and RFC3454
557  * @param array Unicode Characters
558  * @return string Unicode Characters, Nameprep'd
559  * @access private
560  */
561  function _nameprep($input)
562  {
563  $output = array();
564  $error = false;
565  //
566  // Mapping
567  // Walking through the input array, performing the required steps on each of
568  // the input chars and putting the result into the output array
569  // While mapping required chars we apply the cannonical ordering
570  foreach ($input as $v) {
571  // Map to nothing == skip that code point
572  if (in_array($v, $this->NP['map_nothing'])) continue;
573 
574  // Try to find prohibited input
575  if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
576  $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
577  return false;
578  }
579  foreach ($this->NP['prohibit_ranges'] as $range) {
580  if ($range[0] <= $v && $v <= $range[1]) {
581  $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
582  return false;
583  }
584  }
585  //
586  // Hangul syllable decomposition
587  if (0xAC00 <= $v && $v <= 0xD7AF) {
588  foreach ($this->_hangul_decompose($v) as $out) {
589  $output[] = (int) $out;
590  }
591  // There's a decomposition mapping for that code point
592  } elseif (isset($this->NP['replacemaps'][$v])) {
593  foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
594  $output[] = (int) $out;
595  }
596  } else {
597  $output[] = (int) $v;
598  }
599  }
600  // Before applying any Combining, try to rearrange any Hangul syllables
601  $output = $this->_hangul_compose($output);
602  //
603  // Combine code points
604  //
605  $last_class = 0;
606  $last_starter = 0;
607  $out_len = count($output);
608  for ($i = 0; $i < $out_len; ++$i) {
609  $class = $this->_get_combining_class($output[$i]);
610  if ((!$last_class || $last_class > $class) && $class) {
611  // Try to match
612  $seq_len = $i - $last_starter;
613  $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
614  // On match: Replace the last starter with the composed character and remove
615  // the now redundant non-starter(s)
616  if ($out) {
617  $output[$last_starter] = $out;
618  if (count($out) != $seq_len) {
619  for ($j = $i+1; $j < $out_len; ++$j) {
620  $output[$j-1] = $output[$j];
621  }
622  unset($output[$out_len]);
623  }
624  // Rewind the for loop by one, since there can be more possible compositions
625  $i--;
626  $out_len--;
627  $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
628  continue;
629  }
630  }
631  // The current class is 0
632  if (!$class) $last_starter = $i;
633  $last_class = $class;
634  }
635  return $output;
636  }
637 
638  /**
639  * Decomposes a Hangul syllable
640  * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
641  * @param integer 32bit UCS4 code point
642  * @return array Either Hangul Syllable decomposed or original 32bit value as one value array
643  * @access private
644  */
645  function _hangul_decompose($char)
646  {
647  $sindex = (int) $char - $this->_sbase;
648  if ($sindex < 0 || $sindex >= $this->_scount) {
649  return array($char);
650  }
651  $result = array();
652  $result[] = (int) $this->_lbase + $sindex / $this->_ncount;
653  $result[] = (int) $this->_vbase + ($sindex % $this->_ncount) / $this->_tcount;
654  $T = intval($this->_tbase + $sindex % $this->_tcount);
655  if ($T != $this->_tbase) $result[] = $T;
656  return $result;
657  }
658  /**
659  * Ccomposes a Hangul syllable
660  * (see http://www.unicode.org/unicode/reports/tr15/#Hangul
661  * @param array Decomposed UCS4 sequence
662  * @return array UCS4 sequence with syllables composed
663  * @access private
664  */
665  function _hangul_compose($input)
666  {
667  $inp_len = count($input);
668  if (!$inp_len) return array();
669  $result = array();
670  $last = (int) $input[0];
671  $result[] = $last; // copy first char from input to output
672 
673  for ($i = 1; $i < $inp_len; ++$i) {
674  $char = (int) $input[$i];
675  $sindex = $last - $this->_sbase;
676  $lindex = $last - $this->_lbase;
677  $vindex = $char - $this->_vbase;
678  $tindex = $char - $this->_tbase;
679  // Find out, whether two current characters are LV and T
680  if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
681  && 0 <= $tindex && $tindex <= $this->_tcount) {
682  // create syllable of form LVT
683  $last += $tindex;
684  $result[(count($result) - 1)] = $last; // reset last
685  continue; // discard char
686  }
687  // Find out, whether two current characters form L and V
688  if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
689  // create syllable of form LV
690  $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
691  $result[(count($result) - 1)] = $last; // reset last
692  continue; // discard char
693  }
694  // if neither case was true, just add the character
695  $last = $char;
696  $result[] = $char;
697  }
698  return $result;
699  }
700 
701  /**
702  * Returns the combining class of a certain wide char
703  * @param integer Wide char to check (32bit integer)
704  * @return integer Combining class if found, else 0
705  * @access private
706  */
707  function _get_combining_class($char)
708  {
709  return isset($this->NP['norm_combcls'][$char]) ? $this->NP['norm_combcls'][$char] : 0;
710  }
711 
712  /**
713  * Apllies the cannonical ordering of a decomposed UCS4 sequence
714  * @param array Decomposed UCS4 sequence
715  * @return array Ordered USC4 sequence
716  * @access private
717  */
718  function _apply_cannonical_ordering($input)
719  {
720  $swap = true;
721  $size = count($input);
722  while ($swap) {
723  $swap = false;
724  $last = $this->_get_combining_class(intval($input[0]));
725  for ($i = 0; $i < $size-1; ++$i) {
726  $next = $this->_get_combining_class(intval($input[$i+1]));
727  if ($next != 0 && $last > $next) {
728  // Move item leftward until it fits
729  for ($j = $i + 1; $j > 0; --$j) {
730  if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break;
731  $t = intval($input[$j]);
732  $input[$j] = intval($input[$j-1]);
733  $input[$j-1] = $t;
734  $swap = true;
735  }
736  // Reentering the loop looking at the old character again
737  $next = $last;
738  }
739  $last = $next;
740  }
741  }
742  return $input;
743  }
744 
745  /**
746  * Do composition of a sequence of starter and non-starter
747  * @param array UCS4 Decomposed sequence
748  * @return array Ordered USC4 sequence
749  * @access private
750  */
751  function _combine($input)
752  {
753  $inp_len = count($input);
754  foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
755  if ($np_target[0] != $input[0]) continue;
756  if (count($np_target) != $inp_len) continue;
757  $hit = false;
758  foreach ($input as $k2 => $v2) {
759  if ($v2 == $np_target[$k2]) {
760  $hit = true;
761  } else {
762  $hit = false;
763  break;
764  }
765  }
766  if ($hit) return $np_src;
767  }
768  return false;
769  }
770 
771  /**
772  * This converts an UTF-8 encoded string to its UCS-4 representation
773  * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
774  * each of the "chars". This is due to PHP not being able to handle strings with
775  * bit depth different from 8. This apllies to the reverse method _ucs4_to_utf8(), too.
776  * The following UTF-8 encodings are supported:
777  * bytes bits representation
778  * 1 7 0xxxxxxx
779  * 2 11 110xxxxx 10xxxxxx
780  * 3 16 1110xxxx 10xxxxxx 10xxxxxx
781  * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
782  * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
783  * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
784  * Each x represents a bit that can be used to store character data.
785  * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
786  * @access private
787  */
788  function _utf8_to_ucs4($input)
789  {
790  $output = array();
791  $out_len = 0;
792  $inp_len = strlen($input);
793  $mode = 'next';
794  $test = 'none';
795  for ($k = 0; $k < $inp_len; ++$k) {
796  $v = ord($input{$k}); // Extract byte from input string
797 
798  if ($v < 128) { // We found an ASCII char - put into stirng as is
799  $output[$out_len] = $v;
800  ++$out_len;
801  if ('add' == $mode) {
802  $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
803  return false;
804  }
805  continue;
806  }
807  if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
808  $start_byte = $v;
809  $mode = 'add';
810  $test = 'range';
811  if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
812  $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
813  $v = ($v - 192) << 6;
814  } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
815  $next_byte = 1;
816  $v = ($v - 224) << 12;
817  } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
818  $next_byte = 2;
819  $v = ($v - 240) << 18;
820  } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
821  $next_byte = 3;
822  $v = ($v - 248) << 24;
823  } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
824  $next_byte = 4;
825  $v = ($v - 252) << 30;
826  } else {
827  $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
828  return false;
829  }
830  if ('add' == $mode) {
831  $output[$out_len] = (int) $v;
832  ++$out_len;
833  continue;
834  }
835  }
836  if ('add' == $mode) {
837  if (!$this->_allow_overlong && $test == 'range') {
838  $test = 'none';
839  if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
840  $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
841  return false;
842  }
843  }
844  if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
845  $v = ($v - 128) << ($next_byte * 6);
846  $output[($out_len - 1)] += $v;
847  --$next_byte;
848  } else {
849  $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
850  return false;
851  }
852  if ($next_byte < 0) {
853  $mode = 'next';
854  }
855  }
856  } // for
857  return $output;
858  }
859 
860  /**
861  * Convert UCS-4 string into UTF-8 string
862  * See _utf8_to_ucs4() for details
863  * @access private
864  */
865  function _ucs4_to_utf8($input)
866  {
867  $output = '';
868  $k = 0;
869  foreach ($input as $v) {
870  ++$k;
871  // $v = ord($v);
872  if ($v < 128) { // 7bit are transferred literally
873  $output .= chr($v);
874  } elseif ($v < (1 << 11)) { // 2 bytes
875  $output .= chr(192 + ($v >> 6)) . chr(128 + ($v & 63));
876  } elseif ($v < (1 << 16)) { // 3 bytes
877  $output .= chr(224 + ($v >> 12)) . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
878  } elseif ($v < (1 << 21)) { // 4 bytes
879  $output .= chr(240 + ($v >> 18)) . chr(128 + (($v >> 12) & 63))
880  . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
881  } elseif ($v < (1 << 26)) { // 5 bytes
882  $output .= chr(248 + ($v >> 24)) . chr(128 + (($v >> 18) & 63))
883  . chr(128 + (($v >> 12) & 63)) . chr(128 + (($v >> 6) & 63))
884  . chr(128 + ($v & 63));
885  } elseif ($v < (1 << 31)) { // 6 bytes
886  $output .= chr(252 + ($v >> 30)) . chr(128 + (($v >> 24) & 63))
887  . chr(128 + (($v >> 18) & 63)) . chr(128 + (($v >> 12) & 63))
888  . chr(128 + (($v >> 6) & 63)) . chr(128 + ($v & 63));
889  } else {
890  $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$k);
891  return false;
892  }
893  }
894  return $output;
895  }
896 
897  /**
898  * Convert UCS-4 array into UCS-4 string
899  *
900  * @access private
901  */
902  function _ucs4_to_ucs4_string($input)
903  {
904  $output = '';
905  // Take array values and split output to 4 bytes per value
906  // The bit mask is 255, which reads &11111111
907  foreach ($input as $v) {
908  $output .= chr(($v >> 24) & 255).chr(($v >> 16) & 255).chr(($v >> 8) & 255).chr($v & 255);
909  }
910  return $output;
911  }
912 
913  /**
914  * Convert UCS-4 strin into UCS-4 garray
915  *
916  * @access private
917  */
918  function _ucs4_string_to_ucs4($input)
919  {
920  $output = array();
921  $inp_len = strlen($input);
922  // Input length must be dividable by 4
923  if ($inp_len % 4) {
924  $this->_error('Input UCS4 string is broken');
925  return false;
926  }
927  // Empty input - return empty output
928  if (!$inp_len) return $output;
929  for ($i = 0, $out_len = -1; $i < $inp_len; ++$i) {
930  // Increment output position every 4 input bytes
931  if (!($i % 4)) {
932  $out_len++;
933  $output[$out_len] = 0;
934  }
935  $output[$out_len] += ord($input{$i}) << (8 * (3 - ($i % 4) ) );
936  }
937  return $output;
938  }
939 }
940 
941 /**
942 * Adapter class for aligning the API of idna_convert with that of Net_IDNA
943 * @author Matthias Sommerfeld <mso@phlylabs.de>
944 */
946 {
    /**
     * Sets a new option value by forwarding both arguments unchanged to
     * set_parameters() on the object held in $this->IC.
     *
     * NOTE(review): no set_parameters() method and no assignment to
     * $this->IC are visible in this file; the converter class above defines
     * set_parameter() (singular). This call presumably fails at runtime -
     * confirm against the class declaration and fix the target.
     *
     * The option names and values are presumably those documented on
     * set_parameter() of the converter class (encoding, overlong, strict) -
     * verify once the call target is settled.
     *
     * @param mixed Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param string Value to use (if parameter 1 is a string)
     * @return boolean true on success, false otherwise
     * @access public
     */
    function setParams($option, $param = false)
    {
        return $this->IC->set_parameters($option, $param);
    }
967 }
968 
969 ?>