gh-91760: More strict rules for numerical group references and group …

…names in RE Only sequence of ASCII digits not starting with 0 (except group 0) is now accepted as a numerical reference. The group name in bytes patterns and replacement strings can now only contain ASCII letters and digits and underscore.
python · serhiy-storchaka · May 8, 2022 · Apr 19, 2022 · Apr 23, 2022 · Apr 24, 2022
commit 8909d1481f4e1a96e9d01ba4fa4a776dccfea635
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
@@ -395,7 +395,8 @@ The special characters are:
 ``(?P<name>...)``
    Similar to regular parentheses, but the substring matched by the group is
    accessible via the symbolic group name *name*.  Group names must be valid
-   Python identifiers, and each group name must be defined only once within a
+   Python identifiers, and in bytes patterns they must contain only characters
+   in the ASCII range.  Each group name must be defined only once within a
    regular expression.  A symbolic group is also a numbered group, just as if
    the group were not named.
 
@@ -417,6 +418,10 @@ The special characters are:
    |                                       | * ``\1``                         |
    +---------------------------------------+----------------------------------+
 
+   .. versionchanged:: 3.11
+      In bytes patterns group names must contain only characters in
+      the ASCII range.
+
 .. index:: single: (?P=; in regular expressions
 
 ``(?P=name)``
@@ -486,6 +491,9 @@ The special characters are:
    will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
    not with ``'<user@host.com'`` nor ``'user@host.com>'``.
 
+   .. versionchanged:: 3.11
+      Group *id* can only contain ASCII digits and cannot start with ``0``.
+
 
 The special sequences consist of ``'\'`` and a character from the list below.
 If the ordinary character is not an ASCII digit or an ASCII letter, then the
@@ -995,6 +1003,12 @@ form.
       Empty matches for the pattern are replaced when adjacent to a previous
       non-empty match.
 
+   .. versionchanged:: 3.11
+      Group *id* can only contain ASCII digits and cannot start with ``0``
+      (except group 0).
+      In bytes replacement strings group names must contain only characters
+      in the ASCII range.
+
 
 .. function:: subn(pattern, repl, string, count=0, flags=0)
 

diff --git a/Doc/whatsnew/3.11.rst b/Doc/whatsnew/3.11.rst
@@ -1060,6 +1060,14 @@ Changes in the Python API
   before.
   (Contributed by Ma Lin in :issue:`35859`.)
 
+* More strict rules are now applied for numerical group references and
+  group names in regular expressions.
+  Only sequence of ASCII digits not starting with ``0`` (except group 0) is
+  now accepted as a numerical reference.
+  The group name in bytes patterns and replacement strings can now only
+  contain ASCII letters and digits and underscore.
+  (Contributed by Serhiy Storchaka in :issue:`91760`.)
+
 * The *population* parameter of :func:`random.sample` must be a sequence.
   Automatic conversion of sets to lists is no longer supported. If the sample size
   is larger than the population size, a :exc:`ValueError` is raised.

diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
@@ -295,6 +295,14 @@ def seek(self, index):
     def error(self, msg, offset=0):
         return error(msg, self.string, self.tell() - offset)
 
+    def checkgroupname(self, name, offset):
+        if not (self.istext or name.isascii()):
+            msg = "bad character in group name %a" % name
+            raise self.error(msg, len(name) + offset)
+        if not name.isidentifier():
+            msg = "bad character in group name %r" % name
+            raise self.error(msg, len(name) + offset)
+
 def _class_escape(source, escape):
     # handle escape code inside character class
     code = ESCAPES.get(escape)
@@ -707,15 +715,11 @@ def _parse(source, state, verbose, nested, first=False):
                     if sourcematch("<"):
                         # named group: skip forward to end of name
                         name = source.getuntil(">", "group name")
-                        if not name.isidentifier():
-                            msg = "bad character in group name %r" % name
-                            raise source.error(msg, len(name) + 1)
+                        source.checkgroupname(name, 1)
                     elif sourcematch("="):
                         # named backreference
                         name = source.getuntil(")", "group name")
-                        if not name.isidentifier():
-                            msg = "bad character in group name %r" % name
-                            raise source.error(msg, len(name) + 1)
+                        source.checkgroupname(name, 1)
                         gid = state.groupdict.get(name)
                         if gid is None:
                             msg = "unknown group name %r" % name
@@ -776,25 +780,21 @@ def _parse(source, state, verbose, nested, first=False):
                 elif char == "(":
                     # conditional backreference group
                     condname = source.getuntil(")", "group name")
-                    if condname.isidentifier():
-                        condgroup = state.groupdict.get(condname)
-                        if condgroup is None:
-                            msg = "unknown group name %r" % condname
-                            raise source.error(msg, len(condname) + 1)
-                    else:
-                        try:
-                            condgroup = int(condname)
-                            if condgroup < 0:
-                                raise ValueError
-                        except ValueError:
-                            msg = "bad character in group name %r" % condname
-                            raise source.error(msg, len(condname) + 1) from None
+                    if (condname.isdecimal() and condname.isascii() and
+                            (condname[0] != "0" or condname == "0")):
+                        condgroup = int(condname)
                         if not condgroup:
                             raise source.error("bad group number",
                                                len(condname) + 1)
                         if condgroup >= MAXGROUPS:
                             msg = "invalid group reference %d" % condgroup
                             raise source.error(msg, len(condname) + 1)
+                    else:
+                        source.checkgroupname(condname, 1)
+                        condgroup = state.groupdict.get(condname)
+                        if condgroup is None:
+                            msg = "unknown group name %r" % condname
+                            raise source.error(msg, len(condname) + 1)
                     state.checklookbehindgroup(condgroup, source)
                     item_yes = _parse(source, state, verbose, nested + 1)
                     if source.match("|"):
@@ -1006,26 +1006,21 @@ def addgroup(index, pos):
             # group
             c = this[1]
             if c == "g":
-                name = ""
                 if not s.match("<"):
                     raise s.error("missing <")
                 name = s.getuntil(">", "group name")
-                if name.isidentifier():
+                if (name.isdecimal() and name.isascii() and
+                        (name[0] != "0" or name == "0")):
+                    index = int(name)
+                    if index >= MAXGROUPS:
+                        raise s.error("invalid group reference %d" % index,
+                                      len(name) + 1)
+                else:
+                    s.checkgroupname(name, 1)
                     try:
                         index = groupindex[name]
                     except KeyError:
                         raise IndexError("unknown group name %r" % name) from None
-                else:
-                    try:
-                        index = int(name)
-                        if index < 0:
-                            raise ValueError
-                    except ValueError:
-                        raise s.error("bad character in group name %r" % name,
-                                      len(name) + 1) from None
-                    if index >= MAXGROUPS:
-                        raise s.error("invalid group reference %d" % index,
-                                      len(name) + 1)
                 addgroup(index, len(name) + 1)
             elif c == "0":
                 if s.next in OCTDIGITS:

diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
@@ -135,6 +135,7 @@ def test_basic_re_sub(self):
         self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
         self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
+        self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
 
         self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
         self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
@@ -274,6 +275,12 @@ def test_symbolic_groups_errors(self):
         self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
         self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
         self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
+        self.checkPatternError(b'(?P<\xc2\xb5>x)',
+                               r"bad character in group name '\xc2\xb5'", 4)
+        self.checkPatternError(b'(?P=\xc2\xb5)',
+                               r"bad character in group name '\xc2\xb5'", 4)
+        self.checkPatternError(b'(?(\xc2\xb5)y)',
+                               r"bad character in group name '\xc2\xb5'", 3)
 
     def test_symbolic_refs(self):
         self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@@ -306,12 +313,24 @@ def test_symbolic_refs_errors(self):
             re.sub('(?P<a>x)', r'\g<ab>', 'xx')
         self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                                 "bad character in group name '-1'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
+                                "bad character in group name '+1'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g<01>', 'xx',
+                                "bad character in group name '01'", 3)
+        self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
+                                "bad character in group name '1_0'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
+                                "bad character in group name ' 1 '", 3)
         self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                                 "bad character in group name '©'", 3)
+        self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
+                                r"bad character in group name '\xc2\xb5'", 3)
         self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
                                 "bad character in group name '㊀'", 3)
         self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
                                 "bad character in group name '¹'", 3)
+        self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
+                                "bad character in group name '१'", 3)
 
     def test_re_subn(self):
         self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@@ -577,10 +596,20 @@ def test_re_groupref_exists_errors(self):
         self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
         self.checkPatternError(r'()(?(-1)a|b)',
                                "bad character in group name '-1'", 5)
+        self.checkPatternError(r'()(?(+1)a|b)',
+                               "bad character in group name '+1'", 5)
+        self.checkPatternError(r'()(?(01)a|b)',
+                               "bad character in group name '01'", 5)
+        self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
+                               "bad character in group name '1_0'", 23)
+        self.checkPatternError(r'()(?( 1 )a|b)',
+                               "bad character in group name ' 1 '", 5)
         self.checkPatternError(r'()(?(㊀)a|b)',
                                "bad character in group name '㊀'", 5)
         self.checkPatternError(r'()(?(¹)a|b)',
                                "bad character in group name '¹'", 5)
+        self.checkPatternError(r'()(?(१)a|b)',
+                               "bad character in group name '१'", 5)
         self.checkPatternError(r'()(?(1',
                                "missing ), unterminated name", 5)
         self.checkPatternError(r'()(?(1)a',

diff --git a/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst b/Misc/NEWS.d/next/Library/2022-04-21-19-14-29.gh-issue-91760.54AR-m.rst
@@ -0,0 +1,5 @@
+Apply more strict rules for numerical group references and group names in
+regular expressions. Only sequence of ASCII digits not starting with 0
+(except group 0) is now accepted as a numerical reference. The group name in
+bytes patterns and replacement strings can now only contain ASCII letters
+and digits and underscore.