Make WordPress Core

Changeset 61000


Ignore:
Timestamp:
10/21/2025 02:33:54 AM (5 weeks ago)
Author:
dmsnell
Message:

Charset: wp_has_noncharacters() for more-specific Unicode handling.

Noncharacters are code points that are permanently reserved in the Unicode Standard for internal use. They are not recommended for use in open interchange of Unicode text data. However, they are valid code points, and their presence does not cause a string to be reported as invalid.

Still, HTML and XML both impose semantic rules on their use and it may be important for code to know whether they are present in a string. This patch introduces a new function, wp_has_noncharacters(), which answers this question.

See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612

Developed in https://github.com/WordPress/wordpress-develop/pull/9827
Discussed in https://core.trac.wordpress.org/ticket/63863

See #63863.

Location:
trunk
Files:
1 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/wp-includes/compat-utf8.php

    r60969 r61000  
    3636 * @access private
    3737 *
    38  * @param string   $bytes           UTF-8 encoded string which might include invalid spans of bytes.
    39  * @param int      $at              Where to start scanning.
    40  * @param int      $invalid_length  Will be set to how many bytes are to be ignored after `$at`.
    41  * @param int|null $max_bytes       Stop scanning after this many bytes have been seen.
    42  * @param int|null $max_code_points Stop scanning after this many code points have been seen.
     38 * @param string   $bytes             UTF-8 encoded string which might include invalid spans of bytes.
     39 * @param int      $at                Where to start scanning.
     40 * @param int      $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
     41 * @param int|null $max_bytes         Stop scanning after this many bytes have been seen.
     42 * @param int|null $max_code_points   Stop scanning after this many code points have been seen.
     43 * @param bool     $has_noncharacters Set to indicate if scanned string contained noncharacters.
    4344 * @return int How many code points were successfully scanned.
    4445 */
    45 function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
    46     $byte_length    = strlen( $bytes );
    47     $end            = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
    48     $invalid_length = 0;
    49     $count          = 0;
    50     $max_count      = $max_code_points ?? PHP_INT_MAX;
     46function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
     47    $byte_length       = strlen( $bytes );
     48    $end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
     49    $invalid_length    = 0;
     50    $count             = 0;
     51    $max_count         = $max_code_points ?? PHP_INT_MAX;
     52    $has_noncharacters = false;
    5153
    5254    for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
     
    146148            ++$count;
    147149            $i += 2;
     150
     151            // Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
     152            if ( 0xEF === $b1 ) {
     153                $has_noncharacters |= (
     154                    ( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
     155                    ( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
     156                );
     157            }
     158
    148159            continue;
    149160        }
     
    163174            ++$count;
    164175            $i += 3;
     176
     177            // Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
     178            $has_noncharacters |= (
     179                ( 0x0F === ( $b2 & 0x0F ) ) &&
     180                0xBF === $b3 &&
     181                ( 0xBE === $b4 || 0xBF === $b4 )
     182            );
     183
    165184            continue;
    166185        }
     
    382401
    383402/**
     403 * Fallback support for determining if a string contains Unicode noncharacters.
     404 *
         * Runs `_wp_scan_utf8()` over the text in a loop, stepping past each span of
         * invalid bytes it reports, and stops once a scan has flagged a noncharacter
         * or the end of the text has been reached.
         *
     405 * @since 6.9.0
     406 * @access private
     407 *
     408 * @see \wp_has_noncharacters()
     409 *
     410 * @param string $text Text to examine for noncharacters.
     411 * @return bool Whether noncharacters were found in the string.
     412 */
     413function _wp_has_noncharacters_fallback( string $text ): bool {
     414    $at                = 0;
     415    $invalid_length    = 0;
     416    $has_noncharacters = false;
     417    $end               = strlen( $text );
     418
     419    while ( $at < $end && ! $has_noncharacters ) {
     420        _wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters ); // `$at`, `$invalid_length`, and `$has_noncharacters` are passed by reference.
     421        $at += $invalid_length; // Step over the bytes the scanner reported as invalid before resuming.
     422    }
     423
     424    return $has_noncharacters;
     425}
     426
     427/**
    384428 * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
    385429 * with the deprecated function from the PHP standard library.
  • trunk/src/wp-includes/utf8.php

    r60793 r61000  
    134134    }
    135135endif;
     136
     137if ( _wp_can_use_pcre_u() ) :
     138    /**
     139     * Returns whether the given string contains Unicode noncharacters.
     140     *
     141     * XML recommends against using noncharacters and HTML forbids their
     142     * use in attribute names. Unicode recommends that they not be used
     143     * in open exchange of data.
     144     *
     145     * Noncharacters are code points within the following ranges:
     146     *  - U+FDD0–U+FDEF
     147     *  - U+FFFE–U+FFFF
     148     *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
     149     *
             * NOTE(review): with the `u` flag, preg_match() returns false (not 0 or 1)
             * when `$text` is not valid UTF-8, so this variant reports `false` for such
             * input. The byte-scanning fallback instead skips invalid spans and keeps
             * looking; confirm that this difference is intended.
             *
     150     * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
     151     * @see https://www.w3.org/TR/xml/#charsets
     152     * @see https://html.spec.whatwg.org/#attributes-2
     153     *
     154     * @since 6.9.0
     155     *
     156     * @param string $text Text to examine for noncharacters.
     157     * @return bool Whether noncharacters were found in the string.
     158     */
     159    function wp_has_noncharacters( string $text ): bool {
     160        return 1 === preg_match(
     161            '/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u',
     162            $text
     163        );
     164    }
     165else :
     166    /**
     167     * Fallback function for detecting noncharacters in a text.
     168     *
             * Defined when `_wp_can_use_pcre_u()` returns a falsy value; defers
             * entirely to `_wp_has_noncharacters_fallback()`.
             *
     169     * @ignore
     170     * @private
     171     *
     172     * @since 6.9.0
             *
             * @see _wp_has_noncharacters_fallback()
             *
             * @param string $text Text to examine for noncharacters.
             * @return bool Whether noncharacters were found in the string.
     173     */
     174    function wp_has_noncharacters( string $text ): bool {
     175        return _wp_has_noncharacters_fallback( $text );
     176    }
     177endif;
Note: See TracChangeset for help on using the changeset viewer.