Make WordPress Core

Changeset 60630


Ignore:
Timestamp:
08/12/2025 06:13:48 PM (8 months ago)
Author:
dmsnell
Message:

Add wp_is_valid_utf8() for normalizing UTF-8 checks.

There are several existing mechanisms in Core to determine if a given string contains valid UTF-8 bytes or not. These are spread out and depend on which extensions are installed on the running system and what is set for blog_charset. The seems_utf8() function is one of these mechanisms.

seems_utf8() does not properly validate UTF-8, unfortunately, and is slow, and the purpose of the function is veiled behind its name and historic legacy.

This patch deprecates seems_utf8() and introduces wp_is_valid_utf8(); a new, spec-compliant, efficient, and focused UTF-8 validator. This new validator defers to mb_check_encoding() where present, otherwise validating with a pure-PHP implementation. This makes the spec-compliant validator available on all systems regardless of their runtime environment.

Developed in https://github.com/WordPress/wordpress-develop/pull/9317
Discussed in https://core.trac.wordpress.org/ticket/38044

Props dmsnell, jonsurrell, jorbin.
Fixes #38044.

Location:
trunk
Files:
7 added
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-admin/includes/export.php

    r58009 r60630  
    244244     */
    245245    function wxr_cdata( $str ) {
    246         if ( ! seems_utf8( $str ) ) {
     246        if ( ! wp_is_valid_utf8( $str ) ) {
    247247            $str = utf8_encode( $str );
    248248        }
  • trunk/src/wp-admin/includes/image.php

    r60475 r60630  
    10401040
    10411041    foreach ( array( 'title', 'caption', 'credit', 'copyright', 'camera', 'iso' ) as $key ) {
    1042         if ( $meta[ $key ] && ! seems_utf8( $meta[ $key ] ) ) {
     1042        if ( $meta[ $key ] && ! wp_is_valid_utf8( $meta[ $key ] ) ) {
    10431043            $meta[ $key ] = utf8_encode( $meta[ $key ] );
    10441044        }
     
    10461046
    10471047    foreach ( $meta['keywords'] as $key => $keyword ) {
    1048         if ( ! seems_utf8( $keyword ) ) {
     1048        if ( ! wp_is_valid_utf8( $keyword ) ) {
    10491049            $meta['keywords'][ $key ] = utf8_encode( $keyword );
    10501050        }
  • trunk/src/wp-includes/formatting.php

    r60399 r60630  
    877877 * @author bmorel at ssi dot fr (modified)
    878878 * @since 1.2.1
     879 * @deprecated 6.9.0 Use {@see wp_is_valid_utf8()} instead.
    879880 *
    880881 * @param string $str The string to be checked.
     
    882883 */
    883884function seems_utf8( $str ) {
     885    _deprecated_function( __FUNCTION__, '6.9.0', 'wp_is_valid_utf8()' );
     886
    884887    mbstring_binary_safe_encoding();
    885888    $length = strlen( $str );
     
    912915    }
    913916
     917    return true;
     918}
     919
     920/**
     921 * Determines if a given byte string represents a valid UTF-8 encoding.
     922 *
     923 * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but
     924 * it is still possible. Many texts are simultaneously valid UTF-8,
     925 * valid US-ASCII, and valid ISO-8859-1 (`latin1`).
     926 *
     927 * Example:
     928 *
     929 *     true === wp_is_valid_utf8( '' );
     930 *     true === wp_is_valid_utf8( 'just a test' );
     931 *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" );    // Pencil, U+270F.
     932 *     true === wp_is_valid_utf8( "\u{270F}" );        // Pencil, U+270F.
     933 *     true === wp_is_valid_utf8( '✏' );              // Pencil, U+270F.
     934 *
     935 *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
     936 *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
     937 *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
     938 *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
     939 *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
     940 *                                                     // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,
     941 *                                                     // but in UTF-8 is the two-byte sequence 0xC3 0xBC.
     942 *
     943 * @see _wp_is_valid_utf8_fallback
     944 *
     945 * @since 6.9.0
     946 *
     947 * @param string $bytes String which might contain text encoded as UTF-8.
     948 * @return bool Whether the provided bytes can decode as valid UTF-8.
     949 */
     950function wp_is_valid_utf8( string $bytes ): bool {
     951    /*
     952     * Since PHP 8.3.0 the UTF-8 validity is cached internally
     953     * on string objects, making this a direct property lookup.
     954     *
     955     * This is to be preferred exclusively once PHP 8.3.0 is
     956     * the minimum supported version, because even when the
     957     * status isn’t cached, it uses highly-optimized code to
     958     * validate the byte stream.
     959     */
     960    return function_exists( 'mb_check_encoding' )
     961        ? mb_check_encoding( $bytes, 'UTF-8' )
     962        : _wp_is_valid_utf8_fallback( $bytes );
     963}
     964
     965/**
     966 * Fallback mechanism for safely validating UTF-8 bytes.
     967 *
     968 * By implementing a raw method here the code will behave in the same way on
     969 * all installed systems, regardless of what extensions are installed.
     970 *
     971 * @see wp_is_valid_utf8
     972 *
     973 * @since 6.9.0
     974 * @access private
     975 *
     976 * @param string $bytes String which might contain text encoded as UTF-8.
     977 * @return bool Whether the provided bytes can decode as valid UTF-8.
     978 */
     979function _wp_is_valid_utf8_fallback( string $bytes ): bool {
     980    $end = strlen( $bytes );
     981
     982    for ( $i = 0; $i < $end; $i++ ) {
     983        /*
     984         * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
     985         *
     986         * This optimization step improves the speed by 10x to 100x
     987         * depending on whether the JIT has optimized the function.
     988         */
     989        $i += strspn(
     990            $bytes,
     991            "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
     992            "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
     993            " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
     994            $i
     995        );
     996        if ( $i >= $end ) {
     997            break;
     998        }
     999
     1000        /**
     1001         * The above fast-track handled all single-byte UTF-8 characters. What
     1002         * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
     1003         *
     1004         * Therefore everything past here is checking those multibyte sequences.
     1005         * Because it’s possible that there are truncated characters, the use of
     1006         * the null-coalescing operator with "\xC0" is a convenience for skipping
     1007         * length checks on every continuation byte. This works because 0xC0 is
     1008         * always invalid in a UTF-8 string, meaning that if the string has been
     1009         * truncated, it will find 0xC0 and reject as invalid UTF-8.
     1010         *
     1011         * > [The following table] lists all of the byte sequences that are well-formed
     1012         * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
     1013         * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
     1014         * > outside of the ranges listed is ill-formed.
     1015         *
     1016         * > Table 3-7. Well-Formed UTF-8 Byte Sequences
     1017         *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
     1018         *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
     1019         *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
     1020         *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
     1021         *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
     1022         *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
     1023         *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
     1024         *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
     1025         *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
     1026         *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
     1027         *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
     1028         *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
     1029         *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
     1030         *
     1031         * Notice that all valid third and fourth bytes are in the range 80..BF. This
     1032         * validator takes advantage of that to only check the range of those bytes once.
     1033         *
     1034         * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
     1035         * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
     1036         */
     1037
     1038        $b1 = ord( $bytes[ $i ] );
     1039        $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
     1040
     1041        // Valid two-byte code points.
     1042
     1043        if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
     1044            $i++;
     1045            continue;
     1046        }
     1047
     1048        $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
     1049
     1050        // Valid three-byte code points.
     1051
     1052        if ( $b3 < 0x80 || $b3 > 0xBF ) {
     1053            return false;
     1054        }
     1055
     1056        if (
     1057            ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
     1058            ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
     1059            ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
     1060            ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
     1061        ) {
     1062            $i += 2;
     1063            continue;
     1064        }
     1065
     1066        $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
     1067
     1068        // Valid four-byte code points.
     1069
     1070        if ( $b4 < 0x80 || $b4 > 0xBF ) {
     1071            return false;
     1072        }
     1073
     1074        if (
     1075            ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
     1076            ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
     1077            ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
     1078        ) {
     1079            $i += 3;
     1080            continue;
     1081        }
     1082
     1083        // Any other sequence is invalid.
     1084        return false;
     1085    }
     1086
     1087    // Reaching the end implies validating every byte.
    9141088    return true;
    9151089}
     
    15981772    }
    15991773
    1600     if ( seems_utf8( $text ) ) {
     1774    if ( wp_is_valid_utf8( $text ) ) {
    16011775
    16021776        /*
     
    20292203    }
    20302204
    2031     if ( ! seems_utf8( $filename ) ) {
     2205    if ( ! wp_is_valid_utf8( $filename ) ) {
    20322206        $_ext     = pathinfo( $filename, PATHINFO_EXTENSION );
    20332207        $_name    = pathinfo( $filename, PATHINFO_FILENAME );
     
    22782452    $title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title );
    22792453
    2280     if ( seems_utf8( $title ) ) {
     2454    if ( wp_is_valid_utf8( $title ) ) {
    22812455        if ( function_exists( 'mb_strtolower' ) ) {
    22822456            $title = mb_strtolower( $title, 'UTF-8' );
  • trunk/tests/phpunit/tests/formatting/seemsUtf8.php

    r56536 r60630  
    1 <?php
    2 
    3 /**
    4  * @group formatting
    5  *
    6  * @covers ::seems_utf8
    7  */
    8 class Tests_Formatting_SeemsUtf8 extends WP_UnitTestCase {
    9 
    10     /**
    11      * `seems_utf8` returns true for utf-8 strings, false otherwise.
    12      *
    13      * @dataProvider data_seems_utf8_returns_true_for_utf8_strings
    14      */
    15     public function test_seems_utf8_returns_true_for_utf8_strings( $utf8_string ) {
    16         // From http://www.i18nguy.com/unicode-example.html
    17         $this->assertTrue( seems_utf8( $utf8_string ) );
    18     }
    19 
    20     public function data_seems_utf8_returns_true_for_utf8_strings() {
    21         $utf8_strings = file( DIR_TESTDATA . '/formatting/utf-8/utf-8.txt' );
    22         foreach ( $utf8_strings as &$string ) {
    23             $string = (array) trim( $string );
    24         }
    25         unset( $string );
    26         return $utf8_strings;
    27     }
    28 
    29     /**
    30      * @dataProvider data_seems_utf8_returns_false_for_non_utf8_strings
    31      */
    32     public function test_seems_utf8_returns_false_for_non_utf8_strings( $big5_string ) {
    33         $this->assertFalse( seems_utf8( $big5_string ) );
    34     }
    35 
    36     public function data_seems_utf8_returns_false_for_non_utf8_strings() {
    37         // Get data from formatting/big5.txt.
    38         $big5_strings = file( DIR_TESTDATA . '/formatting/big5.txt' );
    39         foreach ( $big5_strings as &$string ) {
    40             $string = (array) trim( $string );
    41         }
    42         unset( $string );
    43         return $big5_strings;
    44     }
    45 }
Note: See TracChangeset for help on using the changeset viewer.