Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 140 additions & 6 deletions src/System.Management.Automation/engine/parser/tokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,9 @@ private static readonly Dictionary<string, TokenKind> s_keywordTable
private static readonly Dictionary<string, TokenKind> s_operatorTable
= new Dictionary<string, TokenKind>(StringComparer.OrdinalIgnoreCase);

// Sentinel meaning "no character": the Unicode escape scanner returns it after reporting an error,
// and it is stored in the surrogate out-parameter when an escape does not yield a surrogate pair.
private static readonly char s_invalidChar = char.MaxValue;
// Maximum number of hex digits accepted between the braces of a `u{...} escape sequence.
private static readonly int s_maxNumberOfUnicodeHexDigits = 6;

private readonly Parser _parser;
private PositionHelper _positionHelper;
private int _nestedTokensAdjustment;
Expand Down Expand Up @@ -1229,8 +1232,10 @@ private int SkipBlockComment(int i)
return i;
}

private static char Backtick(char c)
private char Backtick(char c, out char surrogateCharacter)
{
surrogateCharacter = s_invalidChar;

switch (c)
{
case '0': return '\0';
Expand All @@ -1241,11 +1246,105 @@ private static char Backtick(char c)
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'u': return ScanUnicodeEscape(out surrogateCharacter);
case 'v': return '\v';
default: return c;
}
}

/// <summary>
/// Scans the remainder of a `u{X...} Unicode escape sequence and converts it to UTF-16.
/// The start of the sequence is taken to be two characters before the current index,
/// i.e. the backtick and the 'u' are expected to have been consumed already.
/// </summary>
/// <param name="surrogateCharacter">
/// Receives the low surrogate when the escape denotes a code point above U+FFFF;
/// otherwise it is set to s_invalidChar.
/// </param>
/// <returns>
/// The decoded character (the high surrogate when a pair is produced), or s_invalidChar
/// once a parse error has been reported.
/// </returns>
private char ScanUnicodeEscape(out char surrogateCharacter)
{
    int sequenceStart = _currentIndex - 2;
    surrogateCharacter = s_invalidChar;

    // The escape must open with '{'.
    if (GetChar() != '{')
    {
        UngetChar();

        IScriptExtent badOpenExtent = NewScriptExtent(sequenceStart, _currentIndex);
        ReportError(badOpenExtent, () => ParserStrings.InvalidUnicodeEscapeSequence);
        return s_invalidChar;
    }

    // Collect one to six hex digits, which must then be terminated by the closing '}'.
    var hexDigits = GetStringBuilder();
    int digitCount = 0;
    while (true)
    {
        char current = GetChar();

        if (current == '}')
        {
            if (digitCount > 0)
            {
                // Properly terminated sequence.
                break;
            }

            // Empty braces: at least one hex digit is required.
            Release(hexDigits);
            IScriptExtent emptyExtent = NewScriptExtent(sequenceStart, _currentIndex);
            ReportError(emptyExtent, () => ParserStrings.InvalidUnicodeEscapeSequence);
            return s_invalidChar;
        }

        if (!current.IsHexDigit())
        {
            UngetChar();
            Release(hexDigits);

            // Before the digit limit the character itself is the problem; once the limit
            // has been reached, the only thing missing is the terminator.
            Expression<Func<string>> message;
            if (digitCount < s_maxNumberOfUnicodeHexDigits)
            {
                message = () => ParserStrings.InvalidUnicodeEscapeSequence;
            }
            else
            {
                message = () => ParserStrings.MissingUnicodeEscapeSequenceTerminator;
            }

            ReportError(_currentIndex, message);
            return s_invalidChar;
        }

        if (digitCount == s_maxNumberOfUnicodeHexDigits)
        {
            // One hex digit more than the maximum allowed.
            UngetChar();
            Release(hexDigits);
            ReportError(_currentIndex, () => ParserStrings.TooManyDigitsInUnicodeEscapeSequence);
            return s_invalidChar;
        }

        hexDigits.Append(current);
        digitCount++;
    }

    string hexText = GetStringAndRelease(hexDigits);
    uint codePoint = uint.Parse(hexText, NumberStyles.AllowHexSpecifier, NumberFormatInfo.InvariantInfo);

    if (codePoint > 0x10FFFF)
    {
        // Place the error indicator under only the hex digits in the escape sequence:
        // skip the backtick, 'u' and '{' at the front and the '}' at the back.
        IScriptExtent valueExtent = NewScriptExtent(sequenceStart + 3, _currentIndex - 1);
        ReportError(valueExtent, () => ParserStrings.InvalidUnicodeEscapeSequenceValue);
        return s_invalidChar;
    }

    // Values up to char.MaxValue fit in a single UTF-16 code unit; larger ones need a surrogate pair.
    return codePoint <= Char.MaxValue
        ? (char)codePoint
        : GetCharsFromUtf32(codePoint, out surrogateCharacter);
}

/// <summary>
/// Converts a UTF-32 code point into its UTF-16 form.
/// </summary>
/// <param name="codepoint">
/// The code point to convert. Values above U+FFFF are asserted to be no greater than U+10FFFF.
/// </param>
/// <param name="lowSurrogate">
/// Receives the low surrogate when a surrogate pair is produced; otherwise s_invalidChar.
/// </param>
/// <returns>
/// The character itself when the code point is below U+10000; otherwise the high surrogate of the pair.
/// </returns>
private static char GetCharsFromUtf32(uint codepoint, out char lowSurrogate)
{
    const uint FirstSupplementaryCodePoint = 0x00010000;

    if (codepoint >= FirstSupplementaryCodePoint)
    {
        Diagnostics.Assert((codepoint > 0x0000FFFF) && (codepoint <= 0x0010FFFF), "Codepoint is out of range for a surrogate pair");

        // The offset into the supplementary planes is split into two 10-bit halves:
        // the upper half selects the high surrogate, the lower half the low surrogate.
        uint offset = codepoint - FirstSupplementaryCodePoint;
        lowSurrogate = (char)(0xDC00 + (offset & 0x03FF));
        return (char)(0xD800 + (offset >> 10));
    }

    // A single UTF-16 code unit is enough; signal that there is no second char.
    lowSurrogate = s_invalidChar;
    return (char)codepoint;
}

private void ScanToEndOfCommentLine(out bool sawBeginSig, out bool matchedRequires)
{
// When we get here, we are scanning a line comment. To avoid rescanning,
Expand Down Expand Up @@ -2029,7 +2128,13 @@ private TokenFlags ScanStringExpandable(StringBuilder sb, StringBuilder formatSb
if (c1 != 0)
{
SkipChar();
c = Backtick(c1);
c = Backtick(c1, out char surrogateCharacter);
if (surrogateCharacter != s_invalidChar)
{
sb.Append(c).Append(surrogateCharacter);
formatSb.Append(c).Append(surrogateCharacter);
continue;
}
}
}
if (c == '{' || c == '}')
Expand Down Expand Up @@ -2338,7 +2443,13 @@ private Token ScanHereStringExpandable()
if (c1 != 0)
{
SkipChar();
c = Backtick(c1);
c = Backtick(c1, out char surrogateCharacter);
if (surrogateCharacter != s_invalidChar)
{
sb.Append(c).Append(surrogateCharacter);
formatSb.Append(c).Append(surrogateCharacter);
continue;
}
}
}
if (c == '{' || c == '}')
Expand Down Expand Up @@ -2407,7 +2518,12 @@ private Token ScanVariable(bool splatted, bool inStringExpandable)
UngetChar();
goto end_braced_variable_scan;
}
c = Backtick(c1);
c = Backtick(c1, out char surrogateCharacter);
if (surrogateCharacter != s_invalidChar)
{
sb.Append(c).Append(surrogateCharacter);
continue;
}
break;
}
case '"':
Expand Down Expand Up @@ -2845,6 +2961,17 @@ private Token ScanGenericToken(char firstChar)
return ScanGenericToken(sb);
}

/// <summary>
/// Starts a generic token from an already-scanned first character, which may be the
/// high half of a surrogate pair, and continues scanning from there.
/// </summary>
/// <param name="firstChar">The first character of the token.</param>
/// <param name="surrogateCharacter">
/// The low surrogate accompanying <paramref name="firstChar"/>, or s_invalidChar when there is none.
/// </param>
/// <returns>The token produced by the StringBuilder-based ScanGenericToken overload.</returns>
private Token ScanGenericToken(char firstChar, char surrogateCharacter)
{
    var tokenText = GetStringBuilder().Append(firstChar);

    if (surrogateCharacter == s_invalidChar)
    {
        // Single code unit: nothing more to seed the token with.
        return ScanGenericToken(tokenText);
    }

    return ScanGenericToken(tokenText.Append(surrogateCharacter));
}

private Token ScanGenericToken(StringBuilder sb)
{
// On entry, we've already scanned an unknown number of characters
Expand Down Expand Up @@ -2885,7 +3012,13 @@ private Token ScanGenericToken(StringBuilder sb)
if (c1 != 0)
{
SkipChar();
c = Backtick(c1);
c = Backtick(c1, out char surrogateCharacter);
if (surrogateCharacter != s_invalidChar)
{
sb.Append(c).Append(surrogateCharacter);
formatSb.Append(c).Append(surrogateCharacter);
continue;
}
}
}
else if (c.IsSingleQuote())
Expand Down Expand Up @@ -3801,7 +3934,8 @@ internal Token NextToken()
goto again;
}

return ScanGenericToken(Backtick(c1));
c = Backtick(c1, out char surrogateCharacter);
return ScanGenericToken(c, surrogateCharacter);

case '=':
return CheckOperatorInCommandMode(c, TokenKind.Equals);
Expand Down
12 changes: 12 additions & 0 deletions src/System.Management.Automation/resources/ParserStrings.resx
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,18 @@
<data name="IncompleteString" xml:space="preserve">
<value>Incomplete string token.</value>
</data>
<data name="InvalidUnicodeEscapeSequence" xml:space="preserve">
<value>The Unicode escape sequence is not valid. A valid sequence is `u{ followed by one to six hex digits and a closing '}'.</value>
</data>
<data name="InvalidUnicodeEscapeSequenceValue" xml:space="preserve">
<value>The Unicode escape sequence value is out of range. The maximum value is 0x10FFFF.</value>
</data>
<data name="MissingUnicodeEscapeSequenceTerminator" xml:space="preserve">
<value>The Unicode escape sequence is missing the closing '}'.</value>
</data>
<data name="TooManyDigitsInUnicodeEscapeSequence" xml:space="preserve">
<value>The Unicode escape sequence contains more than the maximum of six hex digits between braces.</value>
</data>
<data name="NumberBothLongAndFloatingPoint" xml:space="preserve">
<value>A number cannot be both a long and floating point.</value>
</data>
Expand Down
78 changes: 77 additions & 1 deletion test/powershell/Language/Parser/Parser.Tests.ps1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Describe "ParserTests (admin\monad\tests\monad\src\engine\core\ParserTests.cs)" -Tags "CI" {
Describe "ParserTests (admin\monad\tests\monad\src\engine\core\ParserTests.cs)" -Tags "CI" {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, I believe this line appears changed because I had to save this file with UTF-8 with BOM encoding; otherwise, the Unicode characters I use in this file don't get read back in properly when Pester runs.

BeforeAll {
$functionDefinitionFile = Join-Path -Path $TestDrive -ChildPath "functionDefinition.ps1"
$functionDefinition = @'
Expand Down Expand Up @@ -270,6 +270,82 @@ Describe "ParserTests (admin\monad\tests\monad\src\engine\core\ParserTests.cs)"
$result | should be ([char]0x1b)
}

Context "Test Unicode escape sequences." {
# These tests require the file to be saved with a BOM. Unfortunately when this UTF8 file is read by
# PowerShell without a BOM, the file is incorrectly interpreted as ASCII.
It 'Test that the bracketed Unicode escape sequence `u{0} returns minimum char.' {
$result = ExecuteCommand '"`u{0}"'
[int]$result[0] | should be 0
}

It 'Test that the bracketed Unicode escape sequence `u{10FFFF} returns maximum surrogate char pair.' {
$result = ExecuteCommand '"`u{10FFFF}"'
[int]$result[0] | should be 0xDBFF # max value for high surrogate of surrogate pair
[int]$result[1] | should be 0xDFFF # max value for low surrogate of surrogate pair
}

It 'Test that the bracketed Unicode escape sequence `u{a9} returns the © character.' {
$result = ExecuteCommand '"`u{a9}"'
$result | should be '©'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add here a test without brackets too.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the first test in this context is sufficient for testing the non-bracketed "`uXXXX" form. And "`uA9" would be a parse error.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Closed.

}

It 'Test that Unicode escape sequence `u{2195} in string returns the ↕ character.' {
$result = ExecuteCommand '"foo`u{2195}abc"'
$result | should be "foo↕abc"
}

It 'Test that the bracketed Unicode escape sequence `u{1f44d} returns surrogate pair for emoji 👍 character.' {
$result = ExecuteCommand '"`u{1f44d}"'
$result | should be "👍"
}

It 'Test that Unicode escape sequence `u{2195} in here string returns the ↕ character.' {
$result = ExecuteCommand ("@`"`n`n" + 'foo`u{2195}abc' + "`n`n`"@")
$result | should be "`nfoo↕abc`n"
}

It 'Test that Unicode escape sequence in single quoted is not processed.' {
$result = ExecuteCommand '''foo`u{2195}abc'''
$result | should be 'foo`u{2195}abc'
}

It 'Test that Unicode escape sequence in single quoted here string is not processed.' {
$result = ExecuteCommand @"
@'

foo``u{2195}abc

'@
"@
$result | should be "`r`nfoo``u{2195}abc`r`n"
}

It "Test that two consecutive Unicode escape sequences are tokenized correctly." {
$result = ExecuteCommand '"`u{007b}`u{007d}"'
$result | should be '{}'
}

It "Test that a Unicode escape sequence can be used in a command name." {
function xyzzy`u{2195}($p) {$p}
$cmd = Get-Command xyzzy`u{2195} -ErrorAction SilentlyContinue
$cmd | should not BeNullOrEmpty
$cmd.Name | should be 'xyzzy↕'
xyzzy`u{2195} 42 | should be 42
}

It "Test that a Unicode escape sequence can be used in a variable name." {
${fooxyzzy`u{2195}} = 42
$var = Get-Variable -Name fooxyzzy* -ErrorAction SilentlyContinue
$var | should not BeNullOrEmpty
$var.Name | should be "fooxyzzy↕"
$var.Value | should be 42
}

It "Test that a Unicode escape sequence can be used in an argument." {
Write-Output `u{a9}` Acme` Inc | should be "© Acme Inc"
}
}

It "Test that escaping any character with no special meaning just returns that char. (line 602)" {
$result = ExecuteCommand '"fo`obar"'
$result | should be "foobar"
Expand Down
13 changes: 13 additions & 0 deletions test/powershell/Language/Parser/Parsing.Tests.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,16 @@ Describe 'expressions parsing' -Tags "CI" {
Describe 'Hash Expression parsing' -Tags "CI" {
ShouldBeParseError '@{ a=1;b=2;c=3;' MissingEndCurlyBrace 2
}

# Parse-error cases for the `u{...} Unicode escape sequence.
# Each call passes a script fragment, the expected ParserStrings error id(s) and, per the
# trailing comments, the offset(s) at which each error is expected to be reported.
Describe 'Unicode escape sequence parsing' -Tag "CI" {
ShouldBeParseError '"`u{}"' InvalidUnicodeEscapeSequence 1 # error span is >>`u{}<<
ShouldBeParseError '"`u{219z}"' InvalidUnicodeEscapeSequence 7 # error offset is "`u{219>>z<<}"
ShouldBeParseError '"`u{12345z}"' InvalidUnicodeEscapeSequence 9 # error offset is "`u{12345>>z<<}"
ShouldBeParseError '"`u{1234567}"' TooManyDigitsInUnicodeEscapeSequence 10 # error offset is "`u{123456>>7<<}"
ShouldBeParseError '"`u{110000}"' InvalidUnicodeEscapeSequenceValue 4 # value > 0x10FFFF; error span covers only the hex digits "`u{>>110000<<}"
ShouldBeParseError '"`u2195}"' InvalidUnicodeEscapeSequence 1 # missing '{' after `u; error span starts at the backtick
ShouldBeParseError '"`u{' InvalidUnicodeEscapeSequence,TerminatorExpectedAtEndOfString 4,0 # input ends right after '{'
ShouldBeParseError '"`u{1' InvalidUnicodeEscapeSequence,TerminatorExpectedAtEndOfString 5,0 # input ends after a single hex digit
ShouldBeParseError '"`u{123456' MissingUnicodeEscapeSequenceTerminator,TerminatorExpectedAtEndOfString 10,0 # six hex digits but no closing '}'
ShouldBeParseError '"`u{1234567' TooManyDigitsInUnicodeEscapeSequence,TerminatorExpectedAtEndOfString 10,0 # seventh hex digit reported before the missing '}'
}