Changeset 60630
- Timestamp:
- 08/12/2025 06:13:48 PM (8 months ago)
- Location:
- trunk
- Files:
-
- 7 added
- 4 edited
-
src/wp-admin/includes/export.php (modified) (1 diff)
-
src/wp-admin/includes/image.php (modified) (2 diffs)
-
src/wp-includes/formatting.php (modified) (6 diffs)
-
tests/phpunit/data/unicode (added)
-
tests/phpunit/data/unicode/utf8tests (added)
-
tests/phpunit/data/unicode/utf8tests/LICENSE (added)
-
tests/phpunit/data/unicode/utf8tests/README.md (added)
-
tests/phpunit/data/unicode/utf8tests/utf8tests.txt (added)
-
tests/phpunit/tests/formatting/seemsUtf8.php (modified) (1 diff)
-
tests/phpunit/tests/unicode (added)
-
tests/phpunit/tests/unicode/wpIsValidUtf8.php (added)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-admin/includes/export.php
r58009 r60630 244 244 */ 245 245 function wxr_cdata( $str ) { 246 if ( ! seems_utf8( $str ) ) { 246 if ( ! wp_is_valid_utf8( $str ) ) { 247 247 $str = utf8_encode( $str ); 248 248 } -
trunk/src/wp-admin/includes/image.php
r60475 r60630 1040 1040 1041 1041 foreach ( array( 'title', 'caption', 'credit', 'copyright', 'camera', 'iso' ) as $key ) { 1042 if ( $meta[ $key ] && ! seems_utf8( $meta[ $key ] ) ) { 1042 if ( $meta[ $key ] && ! wp_is_valid_utf8( $meta[ $key ] ) ) { 1043 1043 $meta[ $key ] = utf8_encode( $meta[ $key ] ); 1044 1044 } … … 1046 1046 1047 1047 foreach ( $meta['keywords'] as $key => $keyword ) { 1048 if ( ! seems_utf8( $keyword ) ) { 1048 if ( ! wp_is_valid_utf8( $keyword ) ) { 1049 1049 $meta['keywords'][ $key ] = utf8_encode( $keyword ); 1050 1050 } -
trunk/src/wp-includes/formatting.php
r60399 r60630 877 877 * @author bmorel at ssi dot fr (modified) 878 878 * @since 1.2.1 879 * @deprecated 6.9.0 Use {@see wp_is_valid_utf8()} instead. 879 880 * 880 881 * @param string $str The string to be checked. … … 882 883 */ 883 884 function seems_utf8( $str ) { 885 _deprecated_function( __FUNCTION__, '6.9.0', 'wp_is_valid_utf8()' ); 886 884 887 mbstring_binary_safe_encoding(); 885 888 $length = strlen( $str ); … … 912 915 } 913 916 917 return true; 918 } 919 920 /** 921 * Determines if a given byte string represents a valid UTF-8 encoding. 922 * 923 * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but 924 * it is still possible. Many texts are simultaneously valid UTF-8, 925 * valid US-ASCII, and valid ISO-8859-1 (`latin1`). 926 * 927 * Example: 928 * 929 * true === wp_is_valid_utf8( '' ); 930 * true === wp_is_valid_utf8( 'just a test' ); 931 * true === wp_is_valid_utf8( "\xE2\x9C\x8F" ); // Pencil, U+270F. 932 * true === wp_is_valid_utf8( "\u{270F}" ); // Pencil, U+270F. 933 * true === wp_is_valid_utf8( '✏' ); // Pencil, U+270F. 934 * 935 * false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes. 936 * false === wp_is_valid_utf8( "\xE2\x9C" ); // Invalid/incomplete sequences. 937 * false === wp_is_valid_utf8( "\xC1\xBF" ); // Overlong sequences. 938 * false === wp_is_valid_utf8( "\xED\xB0\x80" ); // Surrogate halves. 939 * false === wp_is_valid_utf8( "B\xFCch" ); // ISO-8859-1 high-bytes. 940 * // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC, 941 * // but in UTF-8 is the two-byte sequence 0xC3 0xBC. 942 * 943 * @see _wp_is_valid_utf8_fallback 944 * 945 * @since 6.9.0 946 * 947 * @param string $bytes String which might contain text encoded as UTF-8. 948 * @return bool Whether the provided bytes can decode as valid UTF-8. 949 */ 950 function wp_is_valid_utf8( string $bytes ): bool { 951 /* 952 * Since PHP 8.3.0 the UTF-8 validity is cached internally 953 * on string objects, making this a direct property lookup. 
954 * 955 * This is to be preferred exclusively once PHP 8.3.0 is 956 * the minimum supported version, because even when the 957 * status isn’t cached, it uses highly-optimized code to 958 * validate the byte stream. 959 */ 960 return function_exists( 'mb_check_encoding' ) 961 ? mb_check_encoding( $bytes, 'UTF-8' ) 962 : _wp_is_valid_utf8_fallback( $bytes ); 963 } 964 965 /** 966 * Fallback mechanism for safely validating UTF-8 bytes. 967 * 968 * By implementing a raw method here the code will behave in the same way on 969 * all installed systems, regardless of what extensions are installed. 970 * 971 * @see wp_is_valid_utf8 972 * 973 * @since 6.9.0 974 * @access private 975 * 976 * @param string $bytes String which might contain text encoded as UTF-8. 977 * @return bool Whether the provided bytes can decode as valid UTF-8. 978 */ 979 function _wp_is_valid_utf8_fallback( string $bytes ): bool { 980 $end = strlen( $bytes ); 981 982 for ( $i = 0; $i < $end; $i++ ) { 983 /* 984 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8. 985 * 986 * This optimization step improves the speed from 10x to 100x 987 * depending on whether the JIT has optimized the function. 988 */ 989 $i += strspn( 990 $bytes, 991 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . 992 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . 993 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", 994 $i 995 ); 996 if ( $i >= $end ) { 997 break; 998 } 999 1000 /** 1001 * The above fast-track handled all single-byte UTF-8 characters. What 1002 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8. 1003 * 1004 * Therefore everything past here is checking those multibyte sequences. 1005 * Because it’s possible that there are truncated characters, the use of 1006 * the null-coalescing operator with "\xC0" is a convenience for skipping 1007 * length checks on every continuation bytes. 
This works because 0xC0 is 1008 * always invalid in a UTF-8 string, meaning that if the string has been 1009 * truncated, it will find 0xC0 and reject as invalid UTF-8. 1010 * 1011 * > [The following table] lists all of the byte sequences that are well-formed 1012 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte 1013 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value 1014 * > outside of the ranges listed is ill-formed. 1015 * 1016 * > Table 3-7. Well-Formed UTF-8 Byte Sequences 1017 * ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮ 1018 * │ Code Points │ First Byte │ Second Byte │ Third Byte │ Fourth Byte │ 1019 * ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤ 1020 * │ U+0000..U+007F │ 00..7F │ │ │ │ 1021 * │ U+0080..U+07FF │ C2..DF │ 80..BF │ │ │ 1022 * │ U+0800..U+0FFF │ E0 │ A0..BF │ 80..BF │ │ 1023 * │ U+1000..U+CFFF │ E1..EC │ 80..BF │ 80..BF │ │ 1024 * │ U+D000..U+D7FF │ ED │ 80..9F │ 80..BF │ │ 1025 * │ U+E000..U+FFFF │ EE..EF │ 80..BF │ 80..BF │ │ 1026 * │ U+10000..U+3FFFF │ F0 │ 90..BF │ 80..BF │ 80..BF │ 1027 * │ U+40000..U+FFFFF │ F1..F3 │ 80..BF │ 80..BF │ 80..BF │ 1028 * │ U+100000..U+10FFFF │ F4 │ 80..8F │ 80..BF │ 80..BF │ 1029 * ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯ 1030 * 1031 * Notice that all valid third and forth bytes are in the range 80..BF. This 1032 * validator takes advantage of that to only check the range of those bytes once. 1033 * 1034 * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/ 1035 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 1036 */ 1037 1038 $b1 = ord( $bytes[ $i ] ); 1039 $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" ); 1040 1041 // Valid two-byte code points. 
1042 1043 if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) { 1044 $i++; 1045 continue; 1046 } 1047 1048 $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" ); 1049 1050 // Valid three-byte code points. 1051 1052 if ( $b3 < 0x80 || $b3 > 0xBF ) { 1053 return false; 1054 } 1055 1056 if ( 1057 ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) || 1058 ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) || 1059 ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) || 1060 ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) 1061 ) { 1062 $i += 2; 1063 continue; 1064 } 1065 1066 $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" ); 1067 1068 // Valid four-byte code points. 1069 1070 if ( $b4 < 0x80 || $b4 > 0xBF ) { 1071 return false; 1072 } 1073 1074 if ( 1075 ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) || 1076 ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) || 1077 ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) 1078 ) { 1079 $i += 3; 1080 continue; 1081 } 1082 1083 // Any other sequence is invalid. 1084 return false; 1085 } 1086 1087 // Reaching the end implies validating every byte. 914 1088 return true; 915 1089 } … … 1598 1772 } 1599 1773 1600 if ( seems_utf8( $text ) ) { 1774 if ( wp_is_valid_utf8( $text ) ) { 1601 1775 1602 1776 /* … … 2029 2203 } 2030 2204 2031 if ( ! seems_utf8( $filename ) ) { 2205 if ( ! wp_is_valid_utf8( $filename ) ) { 2032 2206 $_ext = pathinfo( $filename, PATHINFO_EXTENSION ); 2033 2207 $_name = pathinfo( $filename, PATHINFO_FILENAME ); … … 2278 2452 $title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title ); 2279 2453 2280 if ( seems_utf8( $title ) ) { 2454 if ( wp_is_valid_utf8( $title ) ) { 2281 2455 if ( function_exists( 'mb_strtolower' ) ) { 2282 2456 $title = mb_strtolower( $title, 'UTF-8' ); -
trunk/tests/phpunit/tests/formatting/seemsUtf8.php
r56536 r60630 1 <?php 2 3 /** 4 * @group formatting 5 * 6 * @covers ::seems_utf8 7 */ 8 class Tests_Formatting_SeemsUtf8 extends WP_UnitTestCase { 9 10 /** 11 * `seems_utf8` returns true for utf-8 strings, false otherwise. 12 * 13 * @dataProvider data_seems_utf8_returns_true_for_utf8_strings 14 */ 15 public function test_seems_utf8_returns_true_for_utf8_strings( $utf8_string ) { 16 // From http://www.i18nguy.com/unicode-example.html 17 $this->assertTrue( seems_utf8( $utf8_string ) ); 18 } 19 20 public function data_seems_utf8_returns_true_for_utf8_strings() { 21 $utf8_strings = file( DIR_TESTDATA . '/formatting/utf-8/utf-8.txt' ); 22 foreach ( $utf8_strings as &$string ) { 23 $string = (array) trim( $string ); 24 } 25 unset( $string ); 26 return $utf8_strings; 27 } 28 29 /** 30 * @dataProvider data_seems_utf8_returns_false_for_non_utf8_strings 31 */ 32 public function test_seems_utf8_returns_false_for_non_utf8_strings( $big5_string ) { 33 $this->assertFalse( seems_utf8( $big5_string ) ); 34 } 35 36 public function data_seems_utf8_returns_false_for_non_utf8_strings() { 37 // Get data from formatting/big5.txt. 38 $big5_strings = file( DIR_TESTDATA . '/formatting/big5.txt' ); 39 foreach ( $big5_strings as &$string ) { 40 $string = (array) trim( $string ); 41 } 42 unset( $string ); 43 return $big5_strings; 44 } 45 }
Note: See TracChangeset
for help on using the changeset viewer.