Skips contents of script tags.
Source
/**
 * Skips contents of script tags.
 *
 * Scans forward from `$this->bytes_already_parsed` while tracking the
 * "unescaped", "escaped", and "double-escaped" script data states, looking
 * for the closing SCRIPT tag that ends the script region.
 *
 * @since 6.2.0
 *
 * @return bool Whether a complete closing SCRIPT tag was found. When true,
 *              `$this->bytes_already_parsed` points just past its ">".
 */
private function skip_script_data(): bool {
	$state      = 'unescaped';
	$html       = $this->html;
	$doc_length = strlen( $html );
	$at         = $this->bytes_already_parsed;

	while ( false !== $at && $at < $doc_length ) {
		// Jump to the next byte that could start a token of interest.
		$at += strcspn( $html, '-<', $at );

		/*
		 * Optimization: Terminating a complete script element requires at least eight
		 * additional bytes in the document. Some checks below may cause local escaped
		 * state transitions when processing shorter strings, but those transitions are
		 * irrelevant if the script tag is incomplete and the function must return false.
		 *
		 * This may need updating if those transitions become significant or exported from
		 * this function in some way, such as when building safe methods to embed JavaScript
		 * or data inside a SCRIPT element.
		 *
		 *         $at may be here.
		 *                ↓
		 *             ...</script>
		 *                 ╰──┬───╯
		 *     $at + 8 additional bytes are required for a non-false return value.
		 *
		 * This single check eliminates the need to check lengths for the shorter spans:
		 *
		 *           $at may be here.
		 *                  ↓
		 *     <script><!-- --></script>
		 *                   ├╯
		 *     $at + 2 additional characters does not require a length check.
		 *
		 * The transition from "escaped" to "unescaped" is not relevant if the document ends:
		 *
		 *           $at may be here.
		 *                  ↓
		 *     <script><!-- -->[[END-OF-DOCUMENT]]
		 *                   ╰──┬───╯
		 *     $at + 8 additional bytes is not satisfied, return false.
		 */
		if ( $at + 8 >= $doc_length ) {
			return false;
		}

		/*
		 * For all script states a "-->" transitions
		 * back into the normal unescaped script mode,
		 * even if that's the current state.
		 */
		if (
			'-' === $html[ $at ] &&
			'-' === $html[ $at + 1 ] &&
			'>' === $html[ $at + 2 ]
		) {
			$at   += 3;
			$state = 'unescaped';
			continue;
		}

		/*
		 * Everything of interest past here starts with "<".
		 * Check this character and advance position regardless.
		 */
		if ( '<' !== $html[ $at++ ] ) {
			continue;
		}

		/*
		 * "<!--" only transitions from _unescaped_ to _escaped_. This byte sequence is only
		 * significant in the _unescaped_ state and is ignored in any other state.
		 */
		if (
			'unescaped' === $state &&
			'!' === $html[ $at ] &&
			'-' === $html[ $at + 1 ] &&
			'-' === $html[ $at + 2 ]
		) {
			$at += 3;

			/*
			 * The parser is ready to enter the _escaped_ state, but may remain in the
			 * _unescaped_ state. This occurs when "<!--" is immediately followed by a
			 * sequence of 0 or more "-" followed by ">". This is similar to abruptly closed
			 * HTML comments like "<!-->" or "<!--->".
			 *
			 * Note that this check may advance the position significantly and requires a
			 * length check to prevent bad offsets on inputs like `<script><!---------`.
			 */
			$at += strspn( $html, '-', $at );
			if ( $at < $doc_length && '>' === $html[ $at ] ) {
				++$at;
				continue;
			}

			$state = 'escaped';
			continue;
		}

		// Remember where a potential closing tag starts: at its "<".
		if ( '/' === $html[ $at ] ) {
			$closer_potentially_starts_at = $at - 1;
			$is_closing                   = true;
			++$at;
		} else {
			$is_closing = false;
		}

		/*
		 * At this point the only remaining state-changes occur with the
		 * <script> and </script> tags; unless one of these appears next,
		 * proceed scanning to the next potential token in the text.
		 */
		if ( ! (
			( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
			( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
			( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
			( 'i' === $html[ $at + 3 ] || 'I' === $html[ $at + 3 ] ) &&
			( 'p' === $html[ $at + 4 ] || 'P' === $html[ $at + 4 ] ) &&
			( 't' === $html[ $at + 5 ] || 'T' === $html[ $at + 5 ] )
		) ) {
			/*
			 * Do not advance here: the byte at $at has only been compared against
			 * "!", "/", and the letters of "script"; it may itself be a "<" or "-"
			 * that begins the next token, as in "<</script>" or "<-->". The position
			 * already moved past the "<" above, so the scan still makes progress, and
			 * the `strcspn()` call at the top of the loop skips uninteresting bytes.
			 */
			continue;
		}

		/*
		 * Ensure that the script tag terminates to avoid matching on
		 * substrings of a non-match. For example, the sequence
		 * "<script123" should not end a script region even though
		 * "<script" is found within the text.
		 */
		$at += 6;
		$c   = $html[ $at ];
		if (
			/*
			 * These characters trigger state transitions of interest:
			 *
			 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state}
			 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state}
			 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state}
			 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state}
			 *
			 * The "\r" character is not present in the above references. However, "\r" must be
			 * treated the same as "\n". This is because the HTML Standard requires newline
			 * normalization during preprocessing which applies this replacement.
			 *
			 * - @see https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
			 * - @see https://infra.spec.whatwg.org/#normalize-newlines
			 */
			'>' !== $c &&
			' ' !== $c &&
			"\n" !== $c &&
			'/' !== $c &&
			"\t" !== $c &&
			"\f" !== $c &&
			"\r" !== $c
		) {
			// $c is rescanned by the next iteration since it may be "<" or "-".
			continue;
		}

		// An opening "<script" inside the escaped state enters the double-escaped state.
		if ( 'escaped' === $state && ! $is_closing ) {
			$state = 'double-escaped';
			continue;
		}

		// A closing "</script" inside the double-escaped state only drops back to escaped.
		if ( 'double-escaped' === $state && $is_closing ) {
			$state = 'escaped';
			continue;
		}

		if ( $is_closing ) {
			/*
			 * This closer can end the script region. Rewind the parser to the "<"
			 * of the closing tag and let the attribute parser consume the rest of
			 * the tag; presumably each call advances `bytes_already_parsed`.
			 */
			$this->bytes_already_parsed = $closer_potentially_starts_at;
			$this->tag_name_starts_at   = $closer_potentially_starts_at;
			if ( $this->bytes_already_parsed >= $doc_length ) {
				return false;
			}

			while ( $this->parse_next_attribute() ) {
				continue;
			}

			if ( $this->bytes_already_parsed >= $doc_length ) {
				return false;
			}

			// Only a tag that is actually terminated by ">" ends the script.
			if ( '>' === $html[ $this->bytes_already_parsed ] ) {
				++$this->bytes_already_parsed;
				return true;
			}
		}

		/*
		 * $at rests on the tag-name terminator matched above, which is never
		 * "<" or "-", so stepping over it cannot hide a token of interest.
		 */
		++$at;
	}

	return false;
}
Changelog
| Version | Description |
|---|---|
| 6.2.0 | Introduced. |
User Contributed Notes
You must log in before being able to contribute a note or feedback.