PowerShell · TravisEz13 · Jun 28, 2017 · Jun 3, 2017 · Jun 23, 2017 · Jun 23, 2017
@@ -507,6 +507,9 @@ private static readonly Dictionary<string, TokenKind> s_keywordTable
         private static readonly Dictionary<string, TokenKind> s_operatorTable
             = new Dictionary<string, TokenKind>(StringComparer.OrdinalIgnoreCase);
 
+        // Sentinel value (char.MaxValue): returned by the Unicode escape scanner after it reports an error, and
+        // assigned to the 'surrogateCharacter' out parameter when no low surrogate was produced.
+        private static readonly char s_invalidChar = char.MaxValue;
+        // Maximum number of hex digits accepted between the braces of a `u{...} escape sequence.
+        private static readonly int s_maxNumberOfUnicodeHexDigits = 6;
+
         private readonly Parser _parser;
         private PositionHelper _positionHelper;
         private int _nestedTokensAdjustment;
@@ -1229,8 +1232,10 @@ private int SkipBlockComment(int i)
             return i;
         }
 
-        private static char Backtick(char c)
+        private char Backtick(char c, out char surrogateCharacter)
         {
+            surrogateCharacter = s_invalidChar;
+
             switch (c)
             {
                 case '0': return '\0';
@@ -1241,11 +1246,105 @@ private static char Backtick(char c)
                 case 'n': return '\n';
                 case 'r': return '\r';
                 case 't': return '\t';
+                case 'u': return ScanUnicodeEscape(out surrogateCharacter);
                 case 'v': return '\v';
                 default: return c;
             }
         }
 
+        /// <summary>
+        /// Scans the remainder of a Unicode escape sequence of the form `u{X}, where X is one to six hex digits,
+        /// and converts the value to UTF-16.
+        /// </summary>
+        /// <param name="surrogateCharacter">
+        /// Receives the low surrogate when the value is above char.MaxValue (and at most 0x10FFFF);
+        /// otherwise it is set to s_invalidChar.
+        /// </param>
+        /// <returns>
+        /// The character for the escape sequence (the high surrogate when a surrogate pair is required),
+        /// or s_invalidChar after a parse error has been reported.
+        /// </returns>
+        // NOTE(review): `u{FFFF} takes the success path and returns char.MaxValue, which is the same value as
+        // s_invalidChar, so the return value alone does not distinguish success from failure.
+        private char ScanUnicodeEscape(out char surrogateCharacter)
+        {
+            // Start of the escape sequence; presumably the backtick and the 'u' were already consumed by the
+            // caller, hence the -2 - confirm against the call sites.
+            int escSeqStartIndex = _currentIndex - 2;
+            surrogateCharacter = s_invalidChar;
+
+            // The 'u' must be immediately followed by an opening brace.
+            char c = GetChar();
+            if (c != '{')
+            {
+                UngetChar();
+
+                IScriptExtent errorExtent = NewScriptExtent(escSeqStartIndex, _currentIndex);
+                ReportError(errorExtent, () => ParserStrings.InvalidUnicodeEscapeSequence);
+                return s_invalidChar;
+            }
+
+            // Scan the rest of the Unicode escape sequence - one to six hex digits followed by the closing '}'.
+            // The loop allows one extra iteration so that the character after the sixth digit can be examined.
+            var sb = GetStringBuilder();
+            int i;
+            for (i = 0; i < s_maxNumberOfUnicodeHexDigits + 1; i++)
+            {
+                c = GetChar();
+
+                // Sequence has been terminated.
+                if (c == '}')
+                {
+                    if (i == 0)
+                    {
+                        // Sequence must have at least one hex char.
+                        Release(sb);
+                        IScriptExtent errorExtent = NewScriptExtent(escSeqStartIndex, _currentIndex);
+                        ReportError(errorExtent, () => ParserStrings.InvalidUnicodeEscapeSequence);
+                        return s_invalidChar;
+                    }
+
+                    break;
+                }
+                else if (!c.IsHexDigit())
+                {
+                    // Neither '}' nor a hex digit: unget it and report at the current position. After fewer than
+                    // six digits the sequence is invalid; after exactly six the closing '}' is what is missing.
+                    UngetChar();
+
+                    Release(sb);
+                    ReportError(_currentIndex,
+                        i < s_maxNumberOfUnicodeHexDigits
+                            ? (Expression<Func<string>>)(() => ParserStrings.InvalidUnicodeEscapeSequence)
+                            : () => ParserStrings.MissingUnicodeEscapeSequenceTerminator);
+                    return s_invalidChar;
+                }
+                else if (i == s_maxNumberOfUnicodeHexDigits) {
+                    // A seventh hex digit was found.
+                    UngetChar();
+
+                    Release(sb);
+                    ReportError(_currentIndex, () => ParserStrings.TooManyDigitsInUnicodeEscapeSequence);
+                    return s_invalidChar;
+                }
+
+                sb.Append(c);
+            }
+
+            string hexStr = GetStringAndRelease(sb);
+
+            // hexStr holds one to six hex digits at this point, so the value always fits in a uint.
+            uint unicodeValue = uint.Parse(hexStr, NumberStyles.AllowHexSpecifier, NumberFormatInfo.InvariantInfo);
+            if (unicodeValue <= Char.MaxValue)
+            {
+                // Fits in a single UTF-16 code unit.
+                return ((char)unicodeValue);
+            }
+            else if (unicodeValue <= 0x10FFFF)
+            {
+                // Needs a surrogate pair: the high surrogate is returned, the low one goes to the out parameter.
+                return GetCharsFromUtf32(unicodeValue, out surrogateCharacter);
+            }
+            else
+            {
+                // Place the error indicator under only the hex digits in the esc sequence.
+                IScriptExtent errorExtent = NewScriptExtent(escSeqStartIndex + 3, _currentIndex - 1);
+                ReportError(errorExtent, () => ParserStrings.InvalidUnicodeEscapeSequenceValue);
+                return s_invalidChar;
+            }
+        }
+
+        /// <summary>
+        /// Converts a UTF-32 code point into one or two UTF-16 code units.
+        /// </summary>
+        /// <param name="codepoint">The code point to convert; expected to be at most 0x10FFFF.</param>
+        /// <param name="lowSurrogate">
+        /// Receives the low surrogate when a surrogate pair is required; otherwise s_invalidChar.
+        /// </param>
+        /// <returns>The single character, or the high surrogate of the pair.</returns>
+        private static char GetCharsFromUtf32(uint codepoint, out char lowSurrogate)
+        {
+            // Values up to U+FFFF fit in one UTF-16 code unit, so there is no second character.
+            if (codepoint <= 0x0000FFFF)
+            {
+                lowSurrogate = s_invalidChar;
+                return (char)codepoint;
+            }
+
+            Diagnostics.Assert((codepoint > 0x0000FFFF) && (codepoint <= 0x0010FFFF), "Codepoint is out of range for a surrogate pair");
+
+            // Split the offset from U+10000 into its upper bits (high surrogate) and lower ten bits (low surrogate).
+            uint offset = codepoint - 0x00010000;
+            lowSurrogate = (char)(0xDC00 + (offset & 0x03FF));
+            return (char)(0xD800 + (offset >> 10));
+        }
+
         private void ScanToEndOfCommentLine(out bool sawBeginSig, out bool matchedRequires)
         {
             // When we get here, we are scanning a line comment.  To avoid rescanning,
@@ -2029,7 +2128,13 @@ private TokenFlags ScanStringExpandable(StringBuilder sb, StringBuilder formatSb
                     if (c1 != 0)
                     {
                         SkipChar();
-                        c = Backtick(c1);
+                        c = Backtick(c1, out char surrogateCharacter);
+                        if (surrogateCharacter != s_invalidChar)
+                        {
+                            sb.Append(c).Append(surrogateCharacter);
+                            formatSb.Append(c).Append(surrogateCharacter);
+                            continue;
+                        }
                     }
                 }
                 if (c == '{' || c == '}')
@@ -2338,7 +2443,13 @@ private Token ScanHereStringExpandable()
                         if (c1 != 0)
                         {
                             SkipChar();
-                            c = Backtick(c1);
+                            c = Backtick(c1, out char surrogateCharacter);
+                            if (surrogateCharacter != s_invalidChar)
+                            {
+                                sb.Append(c).Append(surrogateCharacter);
+                                formatSb.Append(c).Append(surrogateCharacter);
+                                continue;
+                            }
                         }
                     }
                     if (c == '{' || c == '}')
@@ -2407,7 +2518,12 @@ private Token ScanVariable(bool splatted, bool inStringExpandable)
                                     UngetChar();
                                     goto end_braced_variable_scan;
                                 }
-                                c = Backtick(c1);
+                                c = Backtick(c1, out char surrogateCharacter);
+                                if (surrogateCharacter != s_invalidChar)
+                                {
+                                    sb.Append(c).Append(surrogateCharacter);
+                                    continue;
+                                }
                                 break;
                             }
                         case '"':
@@ -2845,6 +2961,17 @@ private Token ScanGenericToken(char firstChar)
             return ScanGenericToken(sb);
         }
 
+        /// <summary>
+        /// Starts a generic token from an already-scanned character, plus its low surrogate when one is supplied,
+        /// and hands the accumulated text to the StringBuilder-based overload.
+        /// </summary>
+        /// <param name="firstChar">The first character of the token.</param>
+        /// <param name="surrogateCharacter">
+        /// The low surrogate that follows <paramref name="firstChar"/>, or s_invalidChar when there is none.
+        /// </param>
+        /// <returns>The token produced by the StringBuilder-based overload.</returns>
+        private Token ScanGenericToken(char firstChar, char surrogateCharacter)
+        {
+            StringBuilder tokenText = GetStringBuilder();
+            tokenText.Append(firstChar);
+
+            // s_invalidChar marks "no second code unit".
+            bool hasLowSurrogate = surrogateCharacter != s_invalidChar;
+            if (hasLowSurrogate)
+            {
+                tokenText.Append(surrogateCharacter);
+            }
+
+            return ScanGenericToken(tokenText);
+        }
+
         private Token ScanGenericToken(StringBuilder sb)
         {
             // On entry, we've already scanned an unknown number of characters
@@ -2885,7 +3012,13 @@ private Token ScanGenericToken(StringBuilder sb)
                     if (c1 != 0)
                     {
                         SkipChar();
-                        c = Backtick(c1);
+                        c = Backtick(c1, out char surrogateCharacter);
+                        if (surrogateCharacter != s_invalidChar)
+                        {
+                            sb.Append(c).Append(surrogateCharacter);
+                            formatSb.Append(c).Append(surrogateCharacter);
+                            continue;
+                        }
                     }
                 }
                 else if (c.IsSingleQuote())
@@ -3801,7 +3934,8 @@ internal Token NextToken()
                         goto again;
                     }
 
-                    return ScanGenericToken(Backtick(c1));
+                    c = Backtick(c1, out char surrogateCharacter);
+                    return ScanGenericToken(c, surrogateCharacter);
 
                 case '=':
                     return CheckOperatorInCommandMode(c, TokenKind.Equals);

diff --git a/src/System.Management.Automation/resources/ParserStrings.resx b/src/System.Management.Automation/resources/ParserStrings.resx
@@ -129,6 +129,18 @@
   <data name="IncompleteString" xml:space="preserve">
     <value>Incomplete string token.</value>
   </data>
+  <data name="InvalidUnicodeEscapeSequence" xml:space="preserve">
+    <value>The Unicode escape sequence is not valid. A valid sequence is `u{ followed by one to six hex digits and a closing '}'.</value>
+  </data>
+  <data name="InvalidUnicodeEscapeSequenceValue" xml:space="preserve">
+    <value>The Unicode escape sequence value is out of range. The maximum value is 0x10FFFF.</value>
+  </data>
+  <data name="MissingUnicodeEscapeSequenceTerminator" xml:space="preserve">
+    <value>The Unicode escape sequence is missing the closing '}'.</value>
+  </data>
+  <data name="TooManyDigitsInUnicodeEscapeSequence" xml:space="preserve">
+    <value>The Unicode escape sequence contains more than the maximum of six hex digits between braces.</value>
+  </data>
   <data name="NumberBothLongAndFloatingPoint" xml:space="preserve">
     <value>A number cannot be both a long and floating point.</value>
   </data>

diff --git a/test/powershell/Language/Parser/Parser.Tests.ps1 b/test/powershell/Language/Parser/Parser.Tests.ps1
@@ -1,4 +1,4 @@
-Describe "ParserTests (admin\monad\tests\monad\src\engine\core\ParserTests.cs)" -Tags "CI" {
+Describe "ParserTests (admin\monad\tests\monad\src\engine\core\ParserTests.cs)" -Tags "CI" {
     BeforeAll {
 		$functionDefinitionFile = Join-Path -Path $TestDrive -ChildPath "functionDefinition.ps1"
 		$functionDefinition = @'
@@ -270,6 +270,82 @@ Describe "ParserTests (admin\monad\tests\monad\src\engine\core\ParserTests.cs)"
         $result | should be ([char]0x1b)
     }
 
+    Context "Test Unicode escape sequences." {
+        # These tests require the file to be saved with a BOM.  Unfortunately when this UTF8 file is read by
+        # PowerShell without a BOM, the file is incorrectly interpreted as ASCII.
+        # (Several expected values below contain literal non-ASCII characters.)
+        It 'Test that the bracketed Unicode escape sequence `u{0} returns minimum char.' {
+            $result = ExecuteCommand '"`u{0}"'
+            [int]$result[0] | should be 0
+        }
+
+        It 'Test that the bracketed Unicode escape sequence `u{10FFFF} returns maximum surrogate char pair.' {
+            $result = ExecuteCommand '"`u{10FFFF}"'
+            [int]$result[0] | should be 0xDBFF # max value for high surrogate of surrogate pair
+            [int]$result[1] | should be 0xDFFF # max value for low surrogate of surrogate pair
+        }
+
+        It 'Test that the bracketed Unicode escape sequence `u{a9} returns the © character.' {
+            $result = ExecuteCommand '"`u{a9}"'
+            $result | should be '©'
+        }
+
+        It 'Test that Unicode escape sequence `u{2195} in string returns the ↕ character.' {
+            $result = ExecuteCommand '"foo`u{2195}abc"'
+            $result | should be "foo↕abc"
+        }
+
+        It 'Test that the bracketed Unicode escape sequence `u{1f44d} returns surrogate pair for emoji 👍 character.' {
+            $result = ExecuteCommand '"`u{1f44d}"'
+            $result | should be "👍"
+        }
+
+        It 'Test that Unicode escape sequence `u{2195} in here string returns the ↕ character.' {
+            $result = ExecuteCommand ("@`"`n`n" + 'foo`u{2195}abc' + "`n`n`"@")
+            $result | should be "`nfoo↕abc`n"
+        }
+
+        It 'Test that Unicode escape sequence in single quoted is not processed.' {
+            $result = ExecuteCommand '''foo`u{2195}abc'''
+            $result | should be 'foo`u{2195}abc'
+        }
+
+        It 'Test that Unicode escape sequence in single quoted here string is not processed.' {
+            # NOTE(review): the expected value hard-codes CRLF, while the newlines inside the here-string below
+            # come from this file's own line endings - confirm this still passes when the file uses LF.
+            $result = ExecuteCommand @"
+@'
+
+foo``u{2195}abc
+
+'@
+"@
+            $result | should be "`r`nfoo``u{2195}abc`r`n"
+        }
+
+        It "Test that two consecutive Unicode escape sequences are tokenized correctly." {
+            $result = ExecuteCommand '"`u{007b}`u{007d}"'
+            $result | should be '{}'
+        }
+
+        It "Test that a Unicode escape sequence can be used in a command name." {
+            function xyzzy`u{2195}($p) {$p}
+            $cmd = Get-Command xyzzy`u{2195} -ErrorAction SilentlyContinue
+            $cmd | should not BeNullOrEmpty
+            $cmd.Name | should be 'xyzzy↕'
+            xyzzy`u{2195} 42 | should be 42
+        }
+
+        It "Test that a Unicode escape sequence can be used in a variable name." {
+            ${fooxyzzy`u{2195}} = 42
+            $var = Get-Variable -Name fooxyzzy* -ErrorAction SilentlyContinue
+            $var | should not BeNullOrEmpty
+            $var.Name | should be "fooxyzzy↕"
+            $var.Value | should be 42
+        }
+
+        It "Test that a Unicode escape sequence can be used in an argument." {
+            Write-Output `u{a9}` Acme` Inc | should be "© Acme Inc"
+        }
+    }
+
 	It "Test that escaping any character with no special meaning just returns that char. (line 602)" {
         $result = ExecuteCommand '"fo`obar"'
 		$result | should be "foobar"

diff --git a/test/powershell/Language/Parser/Parsing.Tests.ps1 b/test/powershell/Language/Parser/Parsing.Tests.ps1
@@ -302,3 +302,16 @@ Describe 'expressions parsing' -Tags "CI" {
 Describe 'Hash Expression parsing' -Tags "CI" {
     ShouldBeParseError '@{ a=1;b=2;c=3;' MissingEndCurlyBrace 2
 }
+
+Describe 'Unicode escape sequence parsing' -Tag "CI" {
+    # Each case gives the script text, the expected parser error id(s) and the expected error offset(s).
+    # (Argument meaning is inferred from the trailing comments; ShouldBeParseError is defined elsewhere - confirm.)
+    ShouldBeParseError '"`u{}"' InvalidUnicodeEscapeSequence 1                 # error span is >>`u{}<<
+    ShouldBeParseError '"`u{219z}"' InvalidUnicodeEscapeSequence 7             # error offset is "`u{219>>z<<}"
+    ShouldBeParseError '"`u{12345z}"' InvalidUnicodeEscapeSequence 9           # error offset is "`u{12345>>z<<}"
+    ShouldBeParseError '"`u{1234567}"' TooManyDigitsInUnicodeEscapeSequence 10 # error offset is "`u{123456>>7<<}"
+    ShouldBeParseError '"`u{110000}"' InvalidUnicodeEscapeSequenceValue 4      # error offset is "`u{>>1<<10000}"
+    ShouldBeParseError '"`u2195}"' InvalidUnicodeEscapeSequence 1
+    # The remaining inputs never close the string, so a second error (TerminatorExpectedAtEndOfString) is expected too.
+    ShouldBeParseError '"`u{' InvalidUnicodeEscapeSequence,TerminatorExpectedAtEndOfString 4,0
+    ShouldBeParseError '"`u{1' InvalidUnicodeEscapeSequence,TerminatorExpectedAtEndOfString 5,0
+    ShouldBeParseError '"`u{123456' MissingUnicodeEscapeSequenceTerminator,TerminatorExpectedAtEndOfString 10,0
+    ShouldBeParseError '"`u{1234567' TooManyDigitsInUnicodeEscapeSequence,TerminatorExpectedAtEndOfString 10,0
+}