Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 22 additions & 24 deletions lib/cli/cli.php
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ function safe_strlen( $str, $encoding = false ) {
if ( ! $encoding ) {
$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
}
$length = mb_strlen( $str, $encoding );
$length = $encoding ? mb_strlen( $str, $encoding ) : mb_strlen( $str ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
if ( 'UTF-8' === $encoding ) {
// Subtract combining characters.
$length -= preg_match_all( get_unicode_regexs( 'm' ), $str, $dummy /*needed for PHP 5.3*/ );
Expand Down Expand Up @@ -209,14 +209,22 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
if ( $length < 0 || ( $is_width && ( null === $length || false === $length ) ) ) {
return false;
}
$have_safe_strlen = false;
// PHP 5.3 substr takes false as full length, PHP > 5.3 takes null - for compat. do `safe_strlen()`.
// Need this for normalization below and other uses.
$safe_strlen = safe_strlen( $str, $encoding );

// Normalize `$length` when not specified - PHP 5.3 substr takes false as full length, PHP > 5.3 takes null.
if ( null === $length || false === $length ) {
$length = safe_strlen( $str, $encoding );
$have_safe_strlen = true;
$length = $safe_strlen;
}
// Normalize `$start` - various methods treat this differently.
if ( $start > $safe_strlen ) {
return '';
}
if ( $start < 0 && -$start > $safe_strlen ) {
$start = 0;
}

// Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_match( '/\X/' ), "4" mb_substr(), "8" substr().
// Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_split( '/\X/' ), "4" mb_substr(), "8" substr().
$test_safe_substr = getenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );

// Assume UTF-8 if no encoding given - `grapheme_substr()` will return false (not null like `grapheme_strlen()`) if given non-UTF-8 string.
Expand All @@ -225,22 +233,12 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
return $is_width ? _safe_substr_eaw( $try, $length ) : $try;
}
}
// Assume UTF-8 if no encoding given - `preg_match()` will return false if given non-UTF-8 string.
// Assume UTF-8 if no encoding given - `preg_split()` returns a one element array if given non-UTF-8 string (PHP bug) so need to check `preg_last_error()`.
if ( ( ! $encoding || 'UTF-8' === $encoding ) && can_use_pcre_x() ) {
if ( $start < 0 ) {
$start = max( $start + ( $have_safe_strlen ? $length : safe_strlen( $str, $encoding ) ), 0 );
}
if ( $start ) {
if ( preg_match( '/^\X{' . $start . '}(\X{0,' . $length . '})/u', $str, $matches ) ) {
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
return $is_width ? _safe_substr_eaw( $matches[1], $length ) : $matches[1];
}
}
} else {
if ( preg_match( '/^\X{0,' . $length . '}/u', $str, $matches ) ) {
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
return $is_width ? _safe_substr_eaw( $matches[0], $length ) : $matches[0];
}
if ( false !== ( $try = preg_split( '/(\X)/u', $str, $safe_strlen + 1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ) ) && ! preg_last_error() ) {
$try = implode( '', array_slice( $try, $start, $length ) );
if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
return $is_width ? _safe_substr_eaw( $try, $length ) : $try;
}
}
}
Expand All @@ -250,7 +248,7 @@ function safe_substr( $str, $start, $length = false, $is_width = false, $encodin
$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
}
// Bug: not adjusting for combining chars.
$try = mb_substr( $str, $start, $length, $encoding );
$try = $encoding ? mb_substr( $str, $start, $length, $encoding ) : mb_substr( $str, $start, $length ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
if ( 'UTF-8' === $encoding && $is_width ) {
$try = _safe_substr_eaw( $try, $length );
}
Expand All @@ -275,7 +273,7 @@ function _safe_substr_eaw( $str, $length ) {
// Note that if the length ends in the middle of a double-width char, the char is excluded, not included.

// See if it's all EAW.
if ( preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) {
if ( function_exists( 'mb_substr' ) && preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) {
// Just halve the length so (rounded down to a minimum of 1).
$str = mb_substr( $str, 0, max( (int) ( $length / 2 ), 1 ), 'UTF-8' );
} else {
Expand Down Expand Up @@ -344,7 +342,7 @@ function strwidth( $string, $encoding = false ) {
if ( ! $encoding ) {
$encoding = mb_detect_encoding( $string, null, true /*strict*/ );
}
$width = mb_strwidth( $string, $encoding );
$width = $encoding ? mb_strwidth( $string, $encoding ) : mb_strwidth( $string ); // mbstring funcs can fail if given `$encoding` arg that evals to false.
if ( 'UTF-8' === $encoding ) {
// Subtract combining characters.
$width -= preg_match_all( $m_regex, $string, $dummy /*needed for PHP 5.3*/ );
Expand Down
52 changes: 40 additions & 12 deletions tests/test-cli.php
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,12 @@ function test_various_substr() {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );

// Latin, kana, Latin, Latin combining, Thai combining, Hangul.
$str = 'lムnöม้p를';
$str = 'lムnöม้p를'; // 18 bytes.

// Large string.
$large_str_str_start = 65536 * 2;
$large_str = str_repeat( 'a', $large_str_str_start ) . $str;
$large_str_len = strlen( $large_str ); // 128K + 18 bytes.

if ( \cli\can_use_icu() ) {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=1' ); // Tests grapheme_substr().
Expand All @@ -124,6 +129,10 @@ function test_various_substr() {
$this->assertSame( 'lムnöม้p', \cli\safe_substr( $str, 0, 6 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 7 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 8 ) );
$this->assertSame( '', \cli\safe_substr( $str, 19 ) ); // Start too large.
$this->assertSame( '', \cli\safe_substr( $str, 19, 7 ) ); // Start too large, with length.
$this->assertSame( '', \cli\safe_substr( $str, 8 ) ); // Start same as length.
$this->assertSame( '', \cli\safe_substr( $str, 8, 0 ) ); // Start same as length, with zero length.
$this->assertSame( '를', \cli\safe_substr( $str, -1 ) );
$this->assertSame( 'p를', \cli\safe_substr( $str, -2 ) );
$this->assertSame( 'ม้p를', \cli\safe_substr( $str, -3 ) );
Expand All @@ -134,11 +143,18 @@ function test_various_substr() {
$this->assertSame( 'ムnöม้p를', \cli\safe_substr( $str, -6 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -7 ) );
$this->assertSame( 'lムnö', \cli\safe_substr( $str, -7, 4 ) );
// $this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) ); // grapheme_substr() returns false on this.
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -9 ) ); // Negative start too large.

$this->assertSame( $large_str, \cli\safe_substr( $large_str, 0 ) );
$this->assertSame( '', \cli\safe_substr( $large_str, $large_str_str_start, 0 ) );
$this->assertSame( 'l', \cli\safe_substr( $large_str, $large_str_str_start, 1 ) );
$this->assertSame( 'lム', \cli\safe_substr( $large_str, $large_str_str_start, 2 ) );
$this->assertSame( 'p를', \cli\safe_substr( $large_str, -2 ) );
}

if ( \cli\can_use_pcre_x() ) {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=2' ); // Tests preg_match( '/\X/u' ).
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR=2' ); // Tests preg_split( '/\X/u' ).
$this->assertSame( '', \cli\safe_substr( $str, 0, 0 ) );
$this->assertSame( 'l', \cli\safe_substr( $str, 0, 1 ) );
$this->assertSame( 'lム', \cli\safe_substr( $str, 0, 2 ) );
Expand All @@ -148,6 +164,10 @@ function test_various_substr() {
$this->assertSame( 'lムnöม้p', \cli\safe_substr( $str, 0, 6 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 7 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, 0, 8 ) );
$this->assertSame( '', \cli\safe_substr( $str, 19 ) ); // Start too large.
$this->assertSame( '', \cli\safe_substr( $str, 19, 7 ) ); // Start too large, with length.
$this->assertSame( '', \cli\safe_substr( $str, 8 ) ); // Start same as length.
$this->assertSame( '', \cli\safe_substr( $str, 8, 0 ) ); // Start same as length, with zero length.
$this->assertSame( '를', \cli\safe_substr( $str, -1 ) );
$this->assertSame( 'p를', \cli\safe_substr( $str, -2 ) );
$this->assertSame( 'ม้p를', \cli\safe_substr( $str, -3 ) );
Expand All @@ -159,6 +179,13 @@ function test_various_substr() {
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -7 ) );
$this->assertSame( 'lムnö', \cli\safe_substr( $str, -7, 4 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -8 ) );
$this->assertSame( 'lムnöม้p를', \cli\safe_substr( $str, -9 ) ); // Negative start too large.

$this->assertSame( $large_str, \cli\safe_substr( $large_str, 0 ) );
$this->assertSame( '', \cli\safe_substr( $large_str, $large_str_str_start, 0 ) );
$this->assertSame( 'l', \cli\safe_substr( $large_str, $large_str_str_start, 1 ) );
$this->assertSame( 'lム', \cli\safe_substr( $large_str, $large_str_str_start, 2 ) );
$this->assertSame( 'p를', \cli\safe_substr( $large_str, -2 ) );
}

if ( function_exists( 'mb_substr' ) ) {
Expand All @@ -174,8 +201,9 @@ function test_various_substr() {
$this->assertSame( '', \cli\safe_substr( $str, 0, 0 ) );
$this->assertSame( 'l', \cli\safe_substr( $str, 0, 1 ) );
$this->assertSame( "l\xe3", \cli\safe_substr( $str, 0, 2 ) ); // Corrupt.
$this->assertSame( '', \cli\safe_substr( $str, strlen( $str ) + 1 ) ); // Return '' not false to match behavior of other methods when `$start` > strlen.

// Non-UTF-8 - both grapheme_substr() and preg_match( '/\X/u' ) will fail.
// Non-UTF-8 - both grapheme_substr() and preg_split( '/\X/u' ) will fail.

putenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );

Expand Down Expand Up @@ -392,10 +420,10 @@ function test_strwidth() {

if ( \cli\can_use_icu() ) {
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests grapheme_strlen().
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_match_all( '/\X/u' ).
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_split( '/\X/u' ).
$this->assertSame( 5, \cli\strwidth( $str ) );
} else {
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests preg_match_all( '/\X/u' ).
$this->assertSame( 5, \cli\strwidth( $str ) ); // Tests preg_split( '/\X/u' ).
}

if ( function_exists( 'mb_strwidth' ) && function_exists( 'mb_detect_order' ) ) {
Expand All @@ -422,10 +450,10 @@ function test_strwidth() {

if ( \cli\can_use_icu() ) {
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests grapheme_strlen().
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_match_all( '/\X/u' ).
putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH=2' ); // Test preg_split( '/\X/u' ).
$this->assertSame( 11, \cli\strwidth( $str ) );
} else {
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests preg_match_all( '/\X/u' ).
$this->assertSame( 11, \cli\strwidth( $str ) ); // Tests preg_split( '/\X/u' ).
}

if ( function_exists( 'mb_strwidth' ) && function_exists( 'mb_detect_order' ) ) {
Expand All @@ -434,7 +462,7 @@ function test_strwidth() {
$this->assertSame( 11, \cli\strwidth( $str ) );
}

// Non-UTF-8 - both grapheme_strlen() and preg_match_all( '/\X/u' ) will fail.
// Non-UTF-8 - both grapheme_strlen() and preg_split( '/\X/u' ) will fail.

putenv( 'PHP_CLI_TOOLS_TEST_STRWIDTH' );

Expand Down Expand Up @@ -486,11 +514,11 @@ function test_safe_strlen() {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' ); // Test grapheme_strlen().
$this->assertSame( 7, \cli\safe_strlen( $str ) );
if ( \cli\can_use_pcre_x() ) {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=2' ); // Test preg_match_all( '/\X/u' ).
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=2' ); // Test preg_split( '/\X/u' ).
$this->assertSame( 7, \cli\safe_strlen( $str ) );
}
} elseif ( \cli\can_use_pcre_x() ) {
$this->assertSame( 7, \cli\safe_strlen( $str ) ); // Tests preg_match_all( '/\X/u' ).
$this->assertSame( 7, \cli\safe_strlen( $str ) ); // Tests preg_split( '/\X/u' ).
} else {
putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN=8' ); // Test strlen().
$this->assertSame( 18, \cli\safe_strlen( $str ) ); // strlen() - no. of bytes.
Expand All @@ -504,7 +532,7 @@ function test_safe_strlen() {
$this->assertSame( 9, mb_strlen( $str, 'UTF-8' ) ); // mb_strlen() - counts the 2 combining chars.
}

// Non-UTF-8 - both grapheme_strlen() and preg_match_all( '/\X/u' ) will fail.
// Non-UTF-8 - both grapheme_strlen() and preg_split( '/\X/u' ) will fail.

putenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' );

Expand Down