Changeset 61000
- Timestamp:
- 10/21/2025 02:33:54 AM (5 weeks ago)
- Location:
- trunk
- Files:
-
- 1 added
- 2 edited
-
src/wp-includes/compat-utf8.php (modified) (4 diffs)
-
src/wp-includes/utf8.php (modified) (1 diff)
-
tests/phpunit/tests/unicode/wpHasNoncharacters.php (added)
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/wp-includes/compat-utf8.php
r60969 r61000 36 36 * @access private 37 37 * 38 * @param string $bytes UTF-8 encoded string which might include invalid spans of bytes. 39 * @param int $at Where to start scanning. 40 * @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`. 41 * @param int|null $max_bytes Stop scanning after this many bytes have been seen. 42 * @param int|null $max_code_points Stop scanning after this many code points have been seen. 38 * @param string $bytes UTF-8 encoded string which might include invalid spans of bytes. 39 * @param int $at Where to start scanning. 40 * @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`. 41 * @param int|null $max_bytes Stop scanning after this many bytes have been seen. 42 * @param int|null $max_code_points Stop scanning after this many code points have been seen. 43 * @param bool $has_noncharacters Set to indicate if scanned string contained noncharacters. 43 44 * @return int How many code points were successfully scanned. 44 45 */ 45 function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int { 46 $byte_length = strlen( $bytes ); 47 $end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) ); 48 $invalid_length = 0; 49 $count = 0; 50 $max_count = $max_code_points ?? PHP_INT_MAX; 46 function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int { 47 $byte_length = strlen( $bytes ); 48 $end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) ); 49 $invalid_length = 0; 50 $count = 0; 51 $max_count = $max_code_points ?? PHP_INT_MAX; 52 $has_noncharacters = false; 51 53 52 54 for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) { … … 146 148 ++$count; 147 149 $i += 2; 150 151 // Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF. 
152 if ( 0xEF === $b1 ) { 153 $has_noncharacters |= ( 154 ( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) || 155 ( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) ) 156 ); 157 } 158 148 159 continue; 149 160 } … … 163 174 ++$count; 164 175 $i += 3; 176 177 // Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF. 178 $has_noncharacters |= ( 179 ( 0x0F === ( $b2 & 0x0F ) ) && 180 0xBF === $b3 && 181 ( 0xBE === $b4 || 0xBF === $b4 ) 182 ); 183 165 184 continue; 166 185 } … … 382 401 383 402 /** 403 * Fallback support for determining if a string contains Unicode noncharacters. 404 * 405 * @since 6.9.0 406 * @access private 407 * 408 * @see \wp_has_noncharacters() 409 * 410 * @param string $text Are there noncharacters in this string? 411 * @return bool Whether noncharacters were found in the string. 412 */ 413 function _wp_has_noncharacters_fallback( string $text ): bool { 414 $at = 0; 415 $invalid_length = 0; 416 $has_noncharacters = false; 417 $end = strlen( $text ); 418 419 while ( $at < $end && ! $has_noncharacters ) { 420 _wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters ); 421 $at += $invalid_length; 422 } 423 424 return $has_noncharacters; 425 } 426 427 /** 384 428 * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility 385 429 * with the deprecated function from the PHP standard library. -
trunk/src/wp-includes/utf8.php
if ( _wp_can_use_pcre_u() ) :
	/**
	 * Returns whether the given string contains Unicode noncharacters.
	 *
	 * XML recommends against using noncharacters and HTML forbids their
	 * use in attribute names. Unicode recommends that they not be used
	 * in open exchange of data.
	 *
	 * Noncharacters are code points within the following ranges:
	 *  - U+FDD0–U+FDEF
	 *  - U+FFFE–U+FFFF
	 *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
	 *
	 * When PCRE cannot process the input (for example, because it is not
	 * valid UTF-8 and the `u` flag rejects the subject), the byte-level
	 * fallback scanner is used instead so that the result does not depend
	 * on whether PCRE Unicode support is available.
	 *
	 * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
	 * @see https://www.w3.org/TR/xml/#charsets
	 * @see https://html.spec.whatwg.org/#attributes-2
	 * @see _wp_has_noncharacters_fallback()
	 *
	 * @since 6.9.0
	 *
	 * @param string $text Are there noncharacters in this string?
	 * @return bool Whether noncharacters were found in the string.
	 */
	function wp_has_noncharacters( string $text ): bool {
		$match_result = preg_match(
			'/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u',
			$text
		);

		/*
		 * `preg_match()` returns `false` on failure, which includes subjects
		 * that are not valid UTF-8 when the `u` flag is set. Treating that as
		 * "no match" would hide noncharacters sitting next to invalid bytes,
		 * and would disagree with the non-PCRE implementation, which steps
		 * over invalid spans and keeps scanning.
		 */
		if ( false === $match_result ) {
			return _wp_has_noncharacters_fallback( $text );
		}

		return 1 === $match_result;
	}
else :
	/**
	 * Fallback function for detecting noncharacters in a text.
	 *
	 * Used when PCRE Unicode support is unavailable; delegates to the
	 * byte-level scanner.
	 *
	 * @ignore
	 * @access private
	 *
	 * @see _wp_has_noncharacters_fallback()
	 *
	 * @since 6.9.0
	 *
	 * @param string $text Are there noncharacters in this string?
	 * @return bool Whether noncharacters were found in the string.
	 */
	function wp_has_noncharacters( string $text ): bool {
		return _wp_has_noncharacters_fallback( $text );
	}
endif;
Note: See TracChangeset
for help on using the changeset viewer.