Joomla Platform  13.1
Documentation des API du framework Joomla Platform
 Tout Classes Espaces de nommage Fichiers Fonctions Variables Pages
porteren.php
Aller à la documentation de ce fichier.
1 <?php
2 /**
3  * @package Joomla.Platform
4  * @subpackage Language
5  *
6  * @copyright Copyright (C) 2005 - 2013 Open Source Matters, Inc. All rights reserved.
7  * @copyright Copyright (C) 2005 Richard Heyes (http://www.phpguru.org/). All rights reserved.
8  * @license GNU General Public License version 2 or later; see LICENSE
9  */
10 
11 defined('JPATH_PLATFORM') or die;
12 
13 /**
14  * Porter English stemmer class.
15  *
16  * This class was adapted from one written by Richard Heyes.
17  * See copyright and link information above.
18  *
19  * @package Joomla.Platform
20  * @subpackage Language
21  * @since 12.1
22  */
24 {
/**
 * Regex fragment matching a single consonant.
 *
 * Matches any lowercase letter in the bracketed list, or a "y" that either
 * directly follows one of a, e, i, o, u or sits at the very start of the subject.
 * The fragment is a non-capturing group so that callers can append quantifiers
 * to it when embedding it in a larger pattern.
 *
 * @var    string
 * @since  12.1
 */
private static $_regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';

/**
 * Regex fragment matching a single vowel.
 *
 * Matches a, e, i, o, u, or a "y" that is not directly preceded by one of
 * those letters. Like the consonant fragment it is a non-capturing group.
 *
 * @var    string
 * @since  12.1
 */
private static $_regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';
39 
/**
 * Reduce an English token to its root using the Porter steps.
 *
 * Tokens of two bytes or fewer, and tokens whose language is anything other
 * than 'en', are handed back untouched. Computed roots are memoised in
 * $this->cache, keyed first by language and then by the original token.
 * NOTE(review): $this->cache is not declared in the visible part of the
 * class; presumably it is inherited from the parent stemmer - confirm.
 *
 * @param   string  $token  The token to stem.
 * @param   string  $lang   The language of the token.
 *
 * @return  string  The root token.
 *
 * @since   12.1
 */
public function stem($token, $lang)
{
    // Very short tokens and non-English tokens are never stemmed.
    if (strlen($token) <= 2 || $lang !== 'en')
    {
        return $token;
    }

    // Serve a previously computed root when one exists.
    if (isset($this->cache[$lang][$token]))
    {
        return $this->cache[$lang][$token];
    }

    // Run the steps in their fixed order: 1a/1b, 1c, 2, 3, 4, 5.
    $root = self::_step1c(self::_step1ab($token));
    $root = self::_step3(self::_step2($root));
    $root = self::_step5(self::_step4($root));

    // Remember the result for the next lookup of the same token.
    $this->cache[$lang][$token] = $root;

    return $this->cache[$lang][$token];
}
82 
/**
 * Steps 1a and 1b: strip plural endings, then "eed" / "ing" / "ed".
 *
 * Step 1a only runs for words ending in "s" and applies the first matching
 * rule of: sses -> ss, ies -> i, ss -> ss, s -> (nothing).
 *
 * Step 1b shortens "eed" to "ee" when the remaining stem has a measure
 * above zero; a word ending in "eed" is never considered for the "ed" rule.
 * Otherwise "ing" or "ed" is removed when the part in front of it contains
 * a vowel, and the result is tidied up: at -> ate, bl -> ble, iz -> ize,
 * or else a doubled final consonant (other than ll, ss, zz) is reduced to
 * one, or else an "e" is appended when the measure is 1 and _cvc() holds.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   12.1
 */
private static function _step1ab($word)
{
    // Step 1a: the first plural ending that matches wins.
    if (substr($word, -1) == 's')
    {
        $plurals = array('sses' => 'ss', 'ies' => 'i', 'ss' => 'ss', 's' => '');

        foreach ($plurals as $ending => $replacement)
        {
            if (self::_replace($word, $ending, $replacement))
            {
                break;
            }
        }
    }

    // Step 1b: a word ending in "eed" is dealt with by that rule alone.
    if (substr($word, -2, 1) == 'e' && self::_replace($word, 'eed', 'ee', 0))
    {
        return $word;
    }

    $hasVowel = '#' . self::$_regex_vowel . '+#';

    // Each ending is only removed when what precedes it contains a vowel.
    $stripped = (preg_match($hasVowel, substr($word, 0, -3)) && self::_replace($word, 'ing', ''))
        || (preg_match($hasVowel, substr($word, 0, -2)) && self::_replace($word, 'ed', ''));

    if (!$stripped)
    {
        return $word;
    }

    // Restore a trailing "e" on at / bl / iz; the first match ends the step.
    if (self::_replace($word, 'at', 'ate') || self::_replace($word, 'bl', 'ble') || self::_replace($word, 'iz', 'ize'))
    {
        return $word;
    }

    // Collapse a doubled final consonant, except for ll, ss and zz.
    if (self::_doubleConsonant($word) && !in_array(substr($word, -2), array('ll', 'ss', 'zz'), true))
    {
        return substr($word, 0, -1);
    }

    // Short stems ending consonant-vowel-consonant get their "e" back.
    if (self::_m($word) == 1 && self::_cvc($word))
    {
        return $word . 'e';
    }

    return $word;
}
132 
/**
 * Step 1c: turn a final "y" into "i" when the rest of the word has a vowel.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   12.1
 */
private static function _step1c($word)
{
    // Only words ending in "y" are candidates.
    if (substr($word, -1) != 'y')
    {
        return $word;
    }

    $hasVowel = '#' . self::$_regex_vowel . '+#';

    // The part in front of the "y" must contain at least one vowel.
    if (preg_match($hasVowel, substr($word, 0, -1)))
    {
        self::_replace($word, 'y', 'i');
    }

    return $word;
}
153 
/**
 * Step 2: map longer suffixes onto shorter ones (e.g. "ational" to "ate").
 *
 * The rules are grouped by the penultimate letter of the word, so only the
 * group that can possibly match is tried. Within a group the first suffix
 * found at the end of the word ends the search, whether or not the stem was
 * long enough for the replacement to actually be made.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   12.1
 */
private static function _step2($word)
{
    // Penultimate letter => ordered list of suffix => replacement.
    $rules = array(
        'a' => array('ational' => 'ate', 'tional' => 'tion'),
        'c' => array('enci' => 'ence', 'anci' => 'ance'),
        'e' => array('izer' => 'ize'),
        'g' => array('logi' => 'log'),
        'l' => array('entli' => 'ent', 'ousli' => 'ous', 'alli' => 'al', 'bli' => 'ble', 'eli' => 'e'),
        'o' => array('ization' => 'ize', 'ation' => 'ate', 'ator' => 'ate'),
        's' => array('iveness' => 'ive', 'fulness' => 'ful', 'ousness' => 'ous', 'alism' => 'al'),
        't' => array('biliti' => 'ble', 'aliti' => 'al', 'iviti' => 'ive')
    );

    $penultimate = substr($word, -2, 1);

    if (!isset($rules[$penultimate]))
    {
        return $word;
    }

    foreach ($rules[$penultimate] as $suffix => $replacement)
    {
        // The stem in front of the suffix needs a measure above zero.
        if (self::_replace($word, $suffix, $replacement, 0))
        {
            break;
        }
    }

    return $word;
}
208 
/**
 * Step 3: shorten or drop a further set of suffixes (e.g. "ical" to "ic").
 *
 * As in step 2, the rules are grouped by the penultimate letter of the word
 * and the first suffix found at the end of the word ends the search.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   12.1
 */
private static function _step3($word)
{
    // Penultimate letter => ordered list of suffix => replacement.
    $rules = array(
        'a' => array('ical' => 'ic'),
        's' => array('ness' => ''),
        't' => array('icate' => 'ic', 'iciti' => 'ic'),
        'u' => array('ful' => ''),
        'v' => array('ative' => ''),
        'z' => array('alize' => 'al')
    );

    $penultimate = substr($word, -2, 1);

    if (!isset($rules[$penultimate]))
    {
        return $word;
    }

    foreach ($rules[$penultimate] as $suffix => $replacement)
    {
        // The stem in front of the suffix needs a measure above zero.
        if (self::_replace($word, $suffix, $replacement, 0))
        {
            break;
        }
    }

    return $word;
}
245 
/**
 * Step 4: remove a suffix outright when the remaining stem has a measure above one.
 *
 * The suffixes are grouped by the penultimate letter of the word. A word
 * whose penultimate letter is "o" is treated separately: "ion" is the
 * candidate when the word ends in "tion" or "sion", otherwise "ou" is.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   12.1
 */
private static function _step4($word)
{
    $penultimate = substr($word, -2, 1);

    // "ion" may only go when it follows an "s" or a "t".
    if ($penultimate == 'o')
    {
        $ending = substr($word, -4);
        $suffix = ($ending == 'tion' || $ending == 'sion') ? 'ion' : 'ou';

        self::_replace($word, $suffix, '', 1);

        return $word;
    }

    // Penultimate letter => ordered list of removable suffixes.
    $suffixes = array(
        'a' => array('al'),
        'c' => array('ance', 'ence'),
        'e' => array('er'),
        'i' => array('ic'),
        'l' => array('able', 'ible'),
        'n' => array('ant', 'ement', 'ment', 'ent'),
        's' => array('ism'),
        't' => array('ate', 'iti'),
        'u' => array('ous'),
        'v' => array('ive'),
        'z' => array('ize')
    );

    if (!isset($suffixes[$penultimate]))
    {
        return $word;
    }

    foreach ($suffixes[$penultimate] as $suffix)
    {
        // The first suffix present at the end of the word ends the search.
        if (self::_replace($word, $suffix, '', 1))
        {
            break;
        }
    }

    return $word;
}
312 
/**
 * Step 5: tidy up a final "e" and a final double "l".
 *
 * Part a drops a trailing "e" when the stem in front of it has a measure
 * above one, or a measure of exactly one without satisfying _cvc().
 * Part b shortens a trailing "ll" to "l" when the measure of the word is
 * above one.
 *
 * @param   string  $word  The token to stem.
 *
 * @return  string
 *
 * @since   12.1
 */
private static function _step5($word)
{
    // Part a
    if (substr($word, -1) == 'e')
    {
        $stem    = substr($word, 0, -1);
        $measure = self::_m($stem);

        if ($measure > 1 || ($measure == 1 && !self::_cvc($stem)))
        {
            self::_replace($word, 'e', '');
        }
    }

    // Part b
    if (self::_m($word) > 1 && self::_doubleConsonant($word) && substr($word, -1) == 'l')
    {
        $word = substr($word, 0, -1);
    }

    return $word;
}
348 
/**
 * Swap one ending of a string for another, in place.
 *
 * When $str ends with $check, the part in front of that ending is taken as
 * the stem. The ending is replaced with $repl if no $m was given, or if the
 * measure of the stem (see _m()) is strictly greater than $m. When $m is
 * given and the stem is too short, $str is left unchanged.
 *
 * @param   string   &$str   String to check; modified in place on replacement.
 * @param   string   $check  Ending to check for.
 * @param   string   $repl   Replacement string.
 * @param   integer  $m      Optional measure the stem has to exceed.
 *
 * @return  boolean  Whether $check was found at the end of $str. True does
 *                   not necessarily mean that it was replaced.
 *
 * @since   12.1
 */
private static function _replace(&$str, $check, $repl, $m = null)
{
    $offset = -strlen($check);

    // Nothing to do unless the string actually ends with the given ending.
    if (substr($str, $offset) != $check)
    {
        return false;
    }

    $stem = substr($str, 0, $offset);

    if ($m === null || self::_m($stem) > $m)
    {
        $str = $stem . $repl;
    }

    return true;
}
382 
/**
 * Compute the measure m of a string: the number of vowel-run / consonant-run
 * pairs left once a leading consonant run and a trailing vowel run have been
 * removed. With c a consonant sequence, v a vowel sequence and <..> marking
 * an optional part:
 *
 * <c><v>          gives 0
 * <c>vc<v>        gives 1
 * <c>vcvc<v>      gives 2
 * <c>vcvcvc<v>    gives 3
 *
 * @param   string  $str  The string to return the m count for
 *
 * @return  integer  The m count
 *
 * @since   12.1
 */
private static function _m($str)
{
    $c = self::$_regex_consonant;
    $v = self::$_regex_vowel;

    // Patterns are applied in order: leading consonants first, then trailing vowels.
    $core = preg_replace(array("#^$c+#", "#$v+$#"), '', $str);

    // Every vowel run followed by a consonant run contributes one to the measure.
    preg_match_all("#($v+$c+)#", $core, $pairs);

    return count($pairs[1]);
}
411 
/**
 * Returns true/false as to whether the given string ends with two
 * identical consonants next to each other.
 *
 * @param   string  $str  String to check
 *
 * @return  boolean  Result
 *
 * @since   12.1
 */
private static function _doubleConsonant($str)
{
    $c = self::$_regex_consonant;

    // Character offsets use square brackets: the curly-brace form ($s{0})
    // was deprecated in PHP 7.4 and is a parse error from PHP 8.0 onwards.
    return preg_match('#' . $c . '{2}$#', $str, $matches) && $matches[0][0] == $matches[0][1];
}
428 
/**
 * Checks for an ending consonant-vowel-consonant sequence where the second
 * consonant is not w, x or y.
 *
 * @param   string  $str  String to check
 *
 * @return  boolean  Result
 *
 * @since   12.1
 */
private static function _cvc($str)
{
    $c = self::$_regex_consonant;
    $v = self::$_regex_vowel;

    // The whole condition is returned directly and joined with &&. Writing it
    // as "$result = preg_match(...) and ..." assigns only the preg_match()
    // result, because "=" binds tighter than "and", which silently drops the
    // w / x / y exclusion. Offsets use [] because the {} form is gone in PHP 8.
    return preg_match("#($c$v$c)$#", $str, $matches)
        && strlen($matches[1]) == 3
        && $matches[1][2] != 'w'
        && $matches[1][2] != 'x'
        && $matches[1][2] != 'y';
}
451 }