Context Navigation

← Previous Changeset
Next Changeset →

Changeset 60630

Timestamp:

08/12/2025 06:13:48 PM (8 months ago)

Author:

dmsnell

Message:

Add wp_is_valid_utf8() for normalizing UTF-8 checks.

There are several existing mechanisms in Core to determine if a given string contains valid UTF-8 bytes or not. These are spread out and depend on which extensions are installed on the running system and what is set for blog_charset. The seems_utf8() function is one of these mechanisms.

seems_utf8() does not properly validate UTF-8, unfortunately, and is slow, and the purpose of the function is veiled behind its name and historic legacy.

This patch deprecates seems_utf8() and introduces wp_is_valid_utf8(): a new, spec-compliant, efficient, and focused UTF-8 validator. This new validator defers to mb_check_encoding() where present, otherwise validating with a pure-PHP implementation. This makes the spec-compliant validator available on all systems regardless of their runtime environment.

Developed in https://github.com/WordPress/wordpress-develop/pull/9317
Discussed in https://core.trac.wordpress.org/ticket/38044

Props dmsnell, jonsurrell, jorbin.
Fixes #38044.

Location:

trunk

Files:

: 7 added
: 4 edited

src/wp-admin/includes/export.php (modified) (1 diff)
src/wp-admin/includes/image.php (modified) (2 diffs)
src/wp-includes/formatting.php (modified) (6 diffs)
tests/phpunit/data/unicode (added)
tests/phpunit/data/unicode/utf8tests (added)
tests/phpunit/data/unicode/utf8tests/LICENSE (added)
tests/phpunit/data/unicode/utf8tests/README.md (added)
tests/phpunit/data/unicode/utf8tests/utf8tests.txt (added)
tests/phpunit/tests/formatting/seemsUtf8.php (modified) (1 diff)
tests/phpunit/tests/unicode (added)
tests/phpunit/tests/unicode/wpIsValidUtf8.php (added)

Legend:

: Unmodified
: Added
: Removed

trunk/src/wp-admin/includes/export.php

r58009	r60630
244	244	*/
245	245	function wxr_cdata( $str ) {
246		if ( ! ~~seems~~_utf8( $str ) ) {
	246	if ( ! wp_is_valid_utf8( $str ) ) {
247	247	$str = utf8_encode( $str );
248	248	}

trunk/src/wp-admin/includes/image.php

-                      r60475
+                      r60630
     foreach ( array( 'title', 'caption', 'credit', 'copyright', 'camera', 'iso' ) as $key ) {
         if ( $meta[ $key ] && ! seems_utf8( $meta[ $key ] ) ) {
+        if ( $meta[ $key ] && ! wp_is_valid_utf8( $meta[ $key ] ) ) {
             $meta[ $key ] = utf8_encode( $meta[ $key ] );
+        }
 …
     foreach ( $meta['keywords'] as $key => $keyword ) {
         if ( ! seems_utf8( $keyword ) ) {
+        if ( ! wp_is_valid_utf8( $keyword ) ) {
             $meta['keywords'][ $key ] = utf8_encode( $keyword );
+        }

trunk/src/wp-includes/formatting.php

-                      r60399
+                      r60630
  * @author bmorel at ssi dot fr (modified)
  * @since 1.2.1
+ * @deprecated 6.9.0 Use {@see wp_is_valid_utf8()} instead.
+ *
  * @param string $str The string to be checked.
 …
  */
 function seems_utf8( $str ) {
+    _deprecated_function( __FUNCTION__, '6.9.0', 'wp_is_valid_utf8()' );
     mbstring_binary_safe_encoding();
     $length = strlen( $str );
 …
+    }
+    return true;
+}
+/**
+ * Determines if a given byte string represents a valid UTF-8 encoding.
+ *
+ * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but
+ * it is still possible. Many texts are simultaneously valid UTF-8,
+ * valid US-ASCII, and valid ISO-8859-1 (`latin1`).
+ *
+ * Example:
+ *
+ *     true === wp_is_valid_utf8( '' );
+ *     true === wp_is_valid_utf8( 'just a test' );
+ *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" );    // Pencil, U+270F.
+ *     true === wp_is_valid_utf8( "\u{270F}" );        // Pencil, U+270F.
+ *     true === wp_is_valid_utf8( '✏' );              // Pencil, U+270F.
+ *
+ *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
+ *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
+ *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
+ *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
+ *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
+ *                                                     // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,
+ *                                                     // but in UTF-8 is the two-byte sequence 0xC3 0xBC.
+ *
+ * @see _wp_is_valid_utf8_fallback
+ *
+ * @since 6.9.0
+ *
+ * @param string $bytes String which might contain text encoded as UTF-8.
+ * @return bool Whether the provided bytes can decode as valid UTF-8.
+ */
+function wp_is_valid_utf8( string $bytes ): bool {
+    /*
+     * Since PHP 8.3.0 the UTF-8 validity is cached internally
+     * on string objects, making this a direct property lookup.
+     *
+     * This is to be preferred exclusively once PHP 8.3.0 is
+     * the minimum supported version, because even when the
+     * status isn’t cached, it uses highly-optimized code to
+     * validate the byte stream.
+     *
+     * Until then, mb_check_encoding() is only called when the
+     * function exists; otherwise the pure-PHP fallback validates
+     * the same bytes so that a result is always available.
+     */
+    return function_exists( 'mb_check_encoding' )
+        ? mb_check_encoding( $bytes, 'UTF-8' )
+        : _wp_is_valid_utf8_fallback( $bytes );
+}
+/**
+ * Fallback mechanism for safely validating UTF-8 bytes.
+ *
+ * By implementing a raw method here the code will behave in the same way on
+ * all installed systems, regardless of what extensions are installed.
+ *
+ * @see wp_is_valid_utf8
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $bytes String which might contain text encoded as UTF-8.
+ * @return bool Whether the provided bytes can decode as valid UTF-8.
+ */
+function _wp_is_valid_utf8_fallback( string $bytes ): bool {
+    // Byte length of the input; `$i` below is always a byte offset into `$bytes`.
+    $end = strlen( $bytes );
+    for ( $i = 0; $i < $end; $i++ ) {
+        /*
+         * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
+         *
+         * This optimization step improves the speed from 10x to 100x
+         * depending on whether the JIT has optimized the function.
+         */
+        $i += strspn(
+            $bytes,
+            "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+            "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+            " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+            $i
+        );
+        // The ASCII run reached the end of the string: nothing left to validate.
+        if ( $i >= $end ) {
+            break;
+        }
+        /*
+         * The above fast-track handled all single-byte UTF-8 characters. What
+         * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
+         *
+         * Therefore everything past here is checking those multibyte sequences.
+         * Because it’s possible that there are truncated characters, the use of
+         * the null-coalescing operator with "\xC0" is a convenience for skipping
+         * length checks on every continuation byte. This works because 0xC0 is
+         * always invalid in a UTF-8 string, meaning that if the string has been
+         * truncated, it will find 0xC0 and reject as invalid UTF-8.
+         *
+         * > [The following table] lists all of the byte sequences that are well-formed
+         * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
+         * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
+         * > outside of the ranges listed is ill-formed.
+         *
+         * > Table 3-7. Well-Formed UTF-8 Byte Sequences
+         *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
+         *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
+         *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
+         *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
+         *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
+         *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
+         *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
+         *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
+         *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
+         *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
+         *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
+         *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
+         *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
+         *
+         * Notice that all valid third and fourth bytes are in the range 80..BF. This
+         * validator takes advantage of that to only check the range of those bytes once.
+         *
+         * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
+         * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
+         */
+        $b1 = ord( $bytes[ $i ] );
+        $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
+        // Valid two-byte code points.
+        if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
+            // Together with the loop’s own `$i++`, this advances past both bytes.
+            $i++;
+            continue;
+        }
+        $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
+        /*
+         * Valid three-byte code points.
+         *
+         * Every remaining well-formed sequence in the table has a third byte
+         * in 80..BF, so anything else (including the 0xC0 stand-in for a
+         * missing byte) is rejected before inspecting the leading bytes.
+         */
+        if ( $b3 < 0x80 || $b3 > 0xBF ) {
+            return false;
+        }
+        // One clause per three-byte row of the table above.
+        if (
+            ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+            ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+            ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+            ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
+        ) {
+            // Together with the loop’s own `$i++`, this advances past all three bytes.
+            $i += 2;
+            continue;
+        }
+        $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
+        /*
+         * Valid four-byte code points.
+         *
+         * Only four-byte rows remain, and each requires a fourth byte in
+         * 80..BF; the third byte was already range-checked above.
+         */
+        if ( $b4 < 0x80 || $b4 > 0xBF ) {
+            return false;
+        }
+        // One clause per four-byte row of the table above.
+        if (
+            ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+            ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+            ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
+        ) {
+            // Together with the loop’s own `$i++`, this advances past all four bytes.
+            $i += 3;
+            continue;
+        }
+        // Any other sequence is invalid.
+        return false;
+    }
+    // Reaching the end implies validating every byte.
     return true;
+}
 …
+    }
     if ( seems_utf8( $text ) ) {
+    if ( wp_is_valid_utf8( $text ) ) {
         /*
 …
+    }
     if ( ! seems_utf8( $filename ) ) {
+    if ( ! wp_is_valid_utf8( $filename ) ) {
         $_ext     = pathinfo( $filename, PATHINFO_EXTENSION );
         $_name    = pathinfo( $filename, PATHINFO_FILENAME );
 …
     $title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title );
     if ( seems_utf8( $title ) ) {
+    if ( wp_is_valid_utf8( $title ) ) {
         if ( function_exists( 'mb_strtolower' ) ) {
             $title = mb_strtolower( $title, 'UTF-8' );

trunk/tests/phpunit/tests/formatting/seemsUtf8.php

-                      r56536
+                      r60630
-<?php
-/**
- * @group formatting
+ *
- * @covers ::seems_utf8
- */
-class Tests_Formatting_SeemsUtf8 extends WP_UnitTestCase {
-    /**
-     * `seems_utf8` returns true for utf-8 strings, false otherwise.
+     *
-     * @dataProvider data_seems_utf8_returns_true_for_utf8_strings
-     */
-    public function test_seems_utf8_returns_true_for_utf8_strings( $utf8_string ) {
-        // From http://www.i18nguy.com/unicode-example.html
-        $this->assertTrue( seems_utf8( $utf8_string ) );
+    }
-    public function data_seems_utf8_returns_true_for_utf8_strings() {
-        $utf8_strings = file( DIR_TESTDATA . '/formatting/utf-8/utf-8.txt' );
-        foreach ( $utf8_strings as &$string ) {
-            $string = (array) trim( $string );
+        }
-        unset( $string );
-        return $utf8_strings;
+    }
-    /**
-     * @dataProvider data_seems_utf8_returns_false_for_non_utf8_strings
-     */
-    public function test_seems_utf8_returns_false_for_non_utf8_strings( $big5_string ) {
-        $this->assertFalse( seems_utf8( $big5_string ) );
+    }
-    public function data_seems_utf8_returns_false_for_non_utf8_strings() {
-        // Get data from formatting/big5.txt.
-        $big5_strings = file( DIR_TESTDATA . '/formatting/big5.txt' );
-        foreach ( $big5_strings as &$string ) {
-            $string = (array) trim( $string );
+        }
-        unset( $string );
-        return $big5_strings;
+    }
+}

Note: See TracChangeset for help on using the changeset viewer.

Trac UI Preferences

Make WordPress Core