jplatform/position_8php_source.html

<?php

/**

* Locate a byte index given a UTF-8 character index

* @version $Id$

* @package utf8

* @subpackage position

*/


//--------------------------------------------------------------------

/**
* Given a string and one or more character indexes into that string,
* expressed as UTF-8 character positions, returns the byte index of
* each of those characters. Can be useful when you want to use PHP's
* native (byte oriented) string functions, but be warned: locating
* the byte can be expensive.
*
* Takes a variable number of parameters - the first must be the
* search string, followed by 1 to n UTF-8 character positions to
* obtain byte indexes for. It is more efficient to search the string
* for multiple characters at once than to make repeated calls to
* this function.
*
* The character positions are sorted into ascending order before they
* are processed, so when an array is returned its entries follow
* ascending character position, not the order the arguments were
* passed in. The positions themselves are not validated.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string string to locate index in
* @param int (n times) UTF-8 character position
* @return mixed - int if exactly one character position was supplied,
*                 array of ints otherwise; boolean FALSE if the first
*                 argument is not a string
* @package utf8
* @subpackage position
*/
function utf8_byte_position() {

    $args = func_get_args();

    // array_shift() returns a value, not a variable, so it has to be
    // assigned by value. Binding it with =& raises "Only variables should
    // be assigned by reference" on every call.
    $str = array_shift($args);
    if (!is_string($str)) return false;

    $result = array();

    // trivial byte index, character offset pair
    $prev = array(0,0);

    // use a short piece of str to estimate bytes per character
    // $i (& $j) -> byte indexes into $str
    $i = utf8_locate_next_chr($str, 300);

    // $c -> character offset into $str
    // strlen(utf8_decode()) is used as a character counter for the slice
    // NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - revisit
    $c = strlen(utf8_decode(substr($str,0,$i)));

    // deal with arguments from lowest to highest, so that the search for
    // each offset can carry on from where the previous one finished
    sort($args);

    foreach ($args as $offset) {
        // sanity checks FIXME

        // 0 is an easy check
        if ($offset == 0) { $result[] = 0; continue; }

        // ensure no endless looping
        $safety_valve = 50;

        do {

            // no characters lie between the two most recent samples, so
            // there is no bytes-per-character ratio to extrapolate from
            if ( ($c - $prev[1]) == 0 ) {
                // Hack: gone past end of string
                $error = 0;
                $i = strlen($str);
                break;
            }

            // estimate the target byte index by linear extrapolation from
            // the bytes-per-character ratio between the last two samples
            $j = $i + (int)(($offset-$c) * ($i - $prev[0]) / ($c - $prev[1]));

            // correct to utf8 character boundary
            $j = utf8_locate_next_chr($str, $j);

            // save the index, offset for use next iteration
            $prev = array($i,$c);

            if ($j > $i) {
                // moved forwards: add the characters between $i and $j
                $c += strlen(utf8_decode(substr($str,$i,$j-$i)));
            } else {
                // moved backwards: subtract the characters between $j and $i
                $c -= strlen(utf8_decode(substr($str,$j,$i-$j)));
            }

            // how many characters the estimate is away from the target
            $error = abs($c-$offset);

            // ready for next time around
            $i = $j;

        // from 7 it is faster to iterate over the string
        } while ( ($error > 7) && --$safety_valve) ;

        if ($error && $error <= 7) {

            // close enough - step one character at a time to the target
            if ($c < $offset) {
                // move up
                while ($error--) { $i = utf8_locate_next_chr($str,++$i); }
            } else {
                // move down
                while ($error--) { $i = utf8_locate_current_chr($str,--$i); }
            }

            // ready for next arg
            $c = $offset;
        }
        $result[] = $i;
    }

    // a single requested position is returned as a bare int
    if ( count($result) == 1 ) {
        return $result[0];
    }

    return $result;
}


//--------------------------------------------------------------------

/**
* Given a string and any byte index, returns the byte index of the
* start of the UTF-8 character that the supplied position falls in.
* If a character begins exactly at the supplied byte index, that
* index is returned unchanged. Otherwise the function steps backwards
* over continuation bytes until it reaches the index where the
* current UTF-8 character begins.
*
* An index of zero or less yields 0; an index at or beyond the end of
* the string yields the byte length of the string.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of current UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_current_chr( &$str, $idx ) {

    // at or before the first byte: the string start is the only answer
    if ($idx <= 0) {
        return 0;
    }

    // at or past the end: clamp to the byte length
    $length = strlen($str);
    if ($idx >= $length) {
        return $length;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks
    // like 10xxxxxx, so masking with 0xC0 and comparing against 0x80
    // identifies it - assuming well formed UTF-8. Walk backwards until a
    // byte that is not of that kind (or byte 0) is reached.
    for ($pos = $idx; $pos; $pos--) {
        if ((ord($str[$pos]) & 0xC0) != 0x80) {
            break;
        }
    }

    return $pos;
}


//--------------------------------------------------------------------

/**
* Given a string and any byte index, returns the byte index of the
* start of the next UTF-8 character, relative to the supplied
* position. If a character begins exactly at the supplied byte index,
* that index is returned unchanged. Otherwise the function steps
* forwards over continuation bytes until it reaches the start of the
* following character or the end of the string.
*
* An index of zero or less yields 0; an index at or beyond the end of
* the string yields the byte length of the string.
*
* @author Chris Smith<chris@jalakai.co.uk>
* @param string
* @param int byte index in the string
* @return int byte index of start of next UTF-8 character
* @package utf8
* @subpackage position
*/
function utf8_locate_next_chr( &$str, $idx ) {

    // at or before the first byte: the string start is the only answer
    if ($idx <= 0) {
        return 0;
    }

    // at or past the end: clamp to the byte length
    $length = strlen($str);
    if ($idx >= $length) {
        return $length;
    }

    // Every byte after the first in a multi-byte UTF-8 character looks
    // like 10xxxxxx, so masking with 0xC0 and comparing against 0x80
    // identifies it - assuming well formed UTF-8. Walk forwards until a
    // byte that is not of that kind (or the end of the string) is reached.
    for ($pos = $idx; $pos < $length; $pos++) {
        if ((ord($str[$pos]) & 0xC0) != 0x80) {
            break;
        }
    }

    return $pos;
}