Changeset 60630 for trunk/src/wp-includes/formatting.php
- Timestamp:
- 08/12/2025 06:13:48 PM (8 months ago)
- File:
-
- 1 edited
-
trunk/src/wp-includes/formatting.php (modified) (6 diffs)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/formatting.php
r60399 r60630 877 877 * @author bmorel at ssi dot fr (modified) 878 878 * @since 1.2.1 879 * @deprecated 6.9.0 Use {@see wp_is_valid_utf8()} instead. 879 880 * 880 881 * @param string $str The string to be checked. … … 882 883 */ 883 884 function seems_utf8( $str ) { 885 _deprecated_function( __FUNCTION__, '6.9.0', 'wp_is_valid_utf8()' ); 886 884 887 mbstring_binary_safe_encoding(); 885 888 $length = strlen( $str ); … … 912 915 } 913 916 917 return true; 918 } 919 920 /** 921 * Determines if a given byte string represents a valid UTF-8 encoding. 922 * 923 * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but 924 * it is still possible. Many texts are simultaneously valid UTF-8, 925 * valid US-ASCII, and valid ISO-8859-1 (`latin1`). 926 * 927 * Example: 928 * 929 * true === wp_is_valid_utf8( '' ); 930 * true === wp_is_valid_utf8( 'just a test' ); 931 * true === wp_is_valid_utf8( "\xE2\x9C\x8F" ); // Pencil, U+270F. 932 * true === wp_is_valid_utf8( "\u{270F}" ); // Pencil, U+270F. 933 * true === wp_is_valid_utf8( '✏' ); // Pencil, U+270F. 934 * 935 * false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes. 936 * false === wp_is_valid_utf8( "\xE2\x9C" ); // Invalid/incomplete sequences. 937 * false === wp_is_valid_utf8( "\xC1\xBF" ); // Overlong sequences. 938 * false === wp_is_valid_utf8( "\xED\xB0\x80" ); // Surrogate halves. 939 * false === wp_is_valid_utf8( "B\xFCch" ); // ISO-8859-1 high-bytes. 940 * // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC, 941 * // but in UTF-8 is the two-byte sequence 0xC3 0xBC. 942 * 943 * @see _wp_is_valid_utf8_fallback 944 * 945 * @since 6.9.0 946 * 947 * @param string $bytes String which might contain text encoded as UTF-8. 948 * @return bool Whether the provided bytes can decode as valid UTF-8. 949 */ 950 function wp_is_valid_utf8( string $bytes ): bool { 951 /* 952 * Since PHP 8.3.0 the UTF-8 validity is cached internally 953 * on string objects, making this a direct property lookup. 
954 * 955 * This is to be preferred exclusively once PHP 8.3.0 is 956 * the minimum supported version, because even when the 957 * status isn’t cached, it uses highly-optimized code to 958 * validate the byte stream. 959 */ 960 return function_exists( 'mb_check_encoding' ) 961 ? mb_check_encoding( $bytes, 'UTF-8' ) 962 : _wp_is_valid_utf8_fallback( $bytes ); 963 } 964 965 /** 966 * Fallback mechanism for safely validating UTF-8 bytes. 967 * 968 * By implementing a raw method here the code will behave in the same way on 969 * all installed systems, regardless of what extensions are installed. 970 * 971 * @see wp_is_valid_utf8 972 * 973 * @since 6.9.0 974 * @access private 975 * 976 * @param string $bytes String which might contain text encoded as UTF-8. 977 * @return bool Whether the provided bytes can decode as valid UTF-8. 978 */ 979 function _wp_is_valid_utf8_fallback( string $bytes ): bool { 980 $end = strlen( $bytes ); 981 982 for ( $i = 0; $i < $end; $i++ ) { 983 /* 984 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8. 985 * 986 * This optimization step improves the speed from 10x to 100x 987 * depending on whether the JIT has optimized the function. 988 */ 989 $i += strspn( 990 $bytes, 991 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" . 992 "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" . 993 " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f", 994 $i 995 ); 996 if ( $i >= $end ) { 997 break; 998 } 999 1000 /** 1001 * The above fast-track handled all single-byte UTF-8 characters. What 1002 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8. 1003 * 1004 * Therefore everything past here is checking those multibyte sequences. 1005 * Because it’s possible that there are truncated characters, the use of 1006 * the null-coalescing operator with "\xC0" is a convenience for skipping 1007 * length checks on every continuation byte. 
This works because 0xC0 is 1008 * always invalid in a UTF-8 string, meaning that if the string has been 1009 * truncated, it will find 0xC0 and reject as invalid UTF-8. 1010 * 1011 * > [The following table] lists all of the byte sequences that are well-formed 1012 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte 1013 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value 1014 * > outside of the ranges listed is ill-formed. 1015 * 1016 * > Table 3-7. Well-Formed UTF-8 Byte Sequences 1017 * ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮ 1018 * │ Code Points │ First Byte │ Second Byte │ Third Byte │ Fourth Byte │ 1019 * ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤ 1020 * │ U+0000..U+007F │ 00..7F │ │ │ │ 1021 * │ U+0080..U+07FF │ C2..DF │ 80..BF │ │ │ 1022 * │ U+0800..U+0FFF │ E0 │ A0..BF │ 80..BF │ │ 1023 * │ U+1000..U+CFFF │ E1..EC │ 80..BF │ 80..BF │ │ 1024 * │ U+D000..U+D7FF │ ED │ 80..9F │ 80..BF │ │ 1025 * │ U+E000..U+FFFF │ EE..EF │ 80..BF │ 80..BF │ │ 1026 * │ U+10000..U+3FFFF │ F0 │ 90..BF │ 80..BF │ 80..BF │ 1027 * │ U+40000..U+FFFFF │ F1..F3 │ 80..BF │ 80..BF │ 80..BF │ 1028 * │ U+100000..U+10FFFF │ F4 │ 80..8F │ 80..BF │ 80..BF │ 1029 * ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯ 1030 * 1031 * Notice that all valid third and fourth bytes are in the range 80..BF. This 1032 * validator takes advantage of that to only check the range of those bytes once. 1033 * 1034 * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/ 1035 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506 1036 */ 1037 1038 $b1 = ord( $bytes[ $i ] ); 1039 $b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" ); 1040 1041 // Valid two-byte code points. 
1042 1043 if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) { 1044 $i++; 1045 continue; 1046 } 1047 1048 $b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" ); 1049 1050 // Valid three-byte code points. 1051 1052 if ( $b3 < 0x80 || $b3 > 0xBF ) { 1053 return false; 1054 } 1055 1056 if ( 1057 ( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) || 1058 ( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) || 1059 ( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) || 1060 ( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF ) 1061 ) { 1062 $i += 2; 1063 continue; 1064 } 1065 1066 $b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" ); 1067 1068 // Valid four-byte code points. 1069 1070 if ( $b4 < 0x80 || $b4 > 0xBF ) { 1071 return false; 1072 } 1073 1074 if ( 1075 ( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) || 1076 ( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) || 1077 ( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F ) 1078 ) { 1079 $i += 3; 1080 continue; 1081 } 1082 1083 // Any other sequence is invalid. 1084 return false; 1085 } 1086 1087 // Reaching the end implies validating every byte. 914 1088 return true; 915 1089 } … … 1598 1772 } 1599 1773 1600 if ( seems_utf8( $text ) ) {1774 if ( wp_is_valid_utf8( $text ) ) { 1601 1775 1602 1776 /* … … 2029 2203 } 2030 2204 2031 if ( ! seems_utf8( $filename ) ) {2205 if ( ! wp_is_valid_utf8( $filename ) ) { 2032 2206 $_ext = pathinfo( $filename, PATHINFO_EXTENSION ); 2033 2207 $_name = pathinfo( $filename, PATHINFO_FILENAME ); … … 2278 2452 $title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title ); 2279 2453 2280 if ( seems_utf8( $title ) ) {2454 if ( wp_is_valid_utf8( $title ) ) { 2281 2455 if ( function_exists( 'mb_strtolower' ) ) { 2282 2456 $title = mb_strtolower( $title, 'UTF-8' );
Note: See TracChangeset
for help on using the changeset viewer.