Espaces de nommage
namespace	utf8

Fonctions
	utf8_bad_find ($str)
	utf8_bad_findall ($str)
	utf8_bad_strip ($str)
	utf8_bad_replace ($str, $replace= '?')
	utf8_bad_identify ($str, &$i)
	utf8_bad_explain ($code)

Variables
const	UTF8_BAD_5OCTET = 1
const	UTF8_BAD_6OCTET = 2
const	UTF8_BAD_SEQID = 3
const	UTF8_BAD_NONSHORT = 4
const	UTF8_BAD_SURROGATE = 5
const	UTF8_BAD_UNIOUTRANGE = 6
const	UTF8_BAD_SEQINCOMPLETE = 7

Documentation des fonctions

utf8_bad_explain ( $code )

Définition à la ligne 384 du fichier bad.php.

Références UTF8_BAD_5OCTET, UTF8_BAD_6OCTET, UTF8_BAD_NONSHORT, UTF8_BAD_SEQID, UTF8_BAD_SEQINCOMPLETE, UTF8_BAD_SURROGATE, et UTF8_BAD_UNIOUTRANGE.

                                 {
    switch ($code) {
        case UTF8_BAD_5OCTET:
            return 'Five octet sequences are valid UTF-8 but are not supported by Unicode';
        break;
        case UTF8_BAD_6OCTET:
            return 'Six octet sequences are valid UTF-8 but are not supported by Unicode';
        break;
        case UTF8_BAD_SEQID:
            return 'Invalid octet for use as start of multi-byte UTF-8 sequence';
        break;
        case UTF8_BAD_NONSHORT:
            return 'From Unicode 3.1, non-shortest form is illegal';
        break;
        case UTF8_BAD_SURROGATE:
            return 'From Unicode 3.2, surrogate characters are illegal';
        break;
        case UTF8_BAD_UNIOUTRANGE:
            return 'Codepoints outside the Unicode range are illegal';
        break;
        case UTF8_BAD_SEQINCOMPLETE:
            return 'Incomplete multi-octet sequence';
        break;
    }
    trigger_error('Unknown error code: '.$code,E_USER_WARNING);
    return FALSE;
}

utf8_bad_find ( $str )

Définition à la ligne 33 du fichier bad.php.

Références $UTF8_BAD.

                             {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
    '|(.{1}))';                              # invalid byte
    $pos = 0;
    $badList = array();
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        $bytes = strlen($matches[0]);
        if ( isset($matches[2])) {
            return $pos;
        }
        $pos += $bytes;
        $str = substr($str,$bytes);
    }
    return FALSE;
}

utf8_bad_findall ( $str )

Définition à la ligne 70 du fichier bad.php.

Références $UTF8_BAD.

                                {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
    '|(.{1}))';                              # invalid byte
    $pos = 0;
    $badList = array();
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        $bytes = strlen($matches[0]);
        if ( isset($matches[2])) {
            $badList[] = $pos;
        }
        $pos += $bytes;
        $str = substr($str,$bytes);
    }
    if ( count($badList) > 0 ) {
        return $badList;
    }
    return FALSE;
}

utf8_bad_identify	(		$str,
		&	$i
	)

End of the multi-octet sequence. mUcs4 now contains the final Unicode codepoint to be output

Définition à la ligne 250 du fichier bad.php.

Références UTF8_BAD_5OCTET, UTF8_BAD_6OCTET, UTF8_BAD_NONSHORT, UTF8_BAD_SEQID, UTF8_BAD_SEQINCOMPLETE, UTF8_BAD_SURROGATE, et UTF8_BAD_UNIOUTRANGE.

                                      {
    $mState = 0;     // cached expected number of octets after the current octet
                     // until the beginning of the next UTF8 character sequence
    $mUcs4  = 0;     // cached Unicode character
    $mBytes = 1;     // cached expected number of octets in the current sequence
    $len = strlen($str);
    for($i = 0; $i < $len; $i++) {
        $in = ord($str{$i});
        if ( $mState == 0) {
            // When mState is zero we expect either a US-ASCII character or a
            // multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                $mBytes = 1;
            } else if (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } else if (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } else if (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } else if (0xF8 == (0xFC & ($in))) {
                /* First octet of 5 octet sequence.
                *
                * This is illegal because the encoded codepoint must be either
                * (a) not the shortest form or
                * (b) outside the Unicode range of 0-0x10FFFF.
                */
                return UTF8_BAD_5OCTET;
            } else if (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                return UTF8_BAD_6OCTET;
            } else {
                // Current octet is neither in the US-ASCII range nor a legal first
                // octet of a multi-octet sequence.
                return UTF8_BAD_SEQID;
            }
        } else {
            // When mState is non-zero, we expect a continuation of the multi-octet
            // sequence
            if (0x80 == (0xC0 & ($in))) {
                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;
                /**
                * End of the multi-octet sequence. mUcs4 now contains the final
                * Unicode codepoint to be output
                */
                if (0 == --$mState) {
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ) {
                        return UTF8_BAD_NONSHORT;
                    // From Unicode 3.2, surrogate characters are illegal
                    } else if (($mUcs4 & 0xFFFFF800) == 0xD800) {
                        return UTF8_BAD_SURROGATE;
                    // Codepoints outside the Unicode range are illegal
                    } else if ($mUcs4 > 0x10FFFF) {
                        return UTF8_BAD_UNIOUTRANGE;
                    }
                    //initialize UTF8 cache
                    $mState = 0;
                    $mUcs4  = 0;
                    $mBytes = 1;
                }
            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence.
                $i--;
                return UTF8_BAD_SEQINCOMPLETE;
            }
        }
    }
    if ( $mState != 0 ) {
        // Incomplete multi-octet sequence.
        $i--;
        return UTF8_BAD_SEQINCOMPLETE;
    }
    // No bad octets found
    $i = NULL;
    return FALSE;
}

utf8_bad_replace	(	$str,
		$replace = `'?'`
	)

Définition à la ligne 146 du fichier bad.php.

Références $UTF8_BAD.

                                                {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
    '|(.{1}))';                              # invalid byte
    ob_start();
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        if ( !isset($matches[2])) {
            echo $matches[0];
        } else {
            echo $replace;
        }
        $str = substr($str,strlen($matches[0]));
    }
    $result = ob_get_contents();
    ob_end_clean();
    return $result;
}

utf8_bad_strip ( $str )

Définition à la ligne 109 du fichier bad.php.

Références $UTF8_BAD.

                              {
    $UTF8_BAD =
    '([\x00-\x7F]'.                          # ASCII (including control chars)
    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
    '|(.{1}))';                              # invalid byte
    ob_start();
    while (preg_match('/'.$UTF8_BAD.'/S', $str, $matches)) {
        if ( !isset($matches[2])) {
            echo $matches[0];
        }
        $str = substr($str,strlen($matches[0]));
    }
    $result = ob_get_contents();
    ob_end_clean();
    return $result;
}