Skip to content
Merged
Next Next commit
gh-91760: More strict rules for numerical group references and group …
…names in RE

Only a sequence of ASCII digits not starting with 0 (except group 0 itself)
is now accepted as a numerical reference.
The group name in bytes patterns and replacement strings can now only
contain ASCII letters, digits, and underscores.
  • Loading branch information
serhiy-storchaka committed Apr 21, 2022
commit 8909d1481f4e1a96e9d01ba4fa4a776dccfea635
16 changes: 15 additions & 1 deletion Doc/library/re.rst
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,8 @@ The special characters are:
``(?P<name>...)``
Similar to regular parentheses, but the substring matched by the group is
accessible via the symbolic group name *name*. Group names must be valid
Python identifiers, and each group name must be defined only once within a
Python identifiers, and in bytes patterns they must contain only characters
in the ASCII range. Each group name must be defined only once within a
regular expression. A symbolic group is also a numbered group, just as if
the group were not named.

Expand All @@ -417,6 +418,10 @@ The special characters are:
| | * ``\1`` |
+---------------------------------------+----------------------------------+

.. versionchanged:: 3.11
In bytes patterns, group names must contain only characters in
the ASCII range.

.. index:: single: (?P=; in regular expressions

``(?P=name)``
Expand Down Expand Up @@ -486,6 +491,9 @@ The special characters are:
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
not with ``'<user@host.com'`` nor ``'user@host.com>'``.

.. versionchanged:: 3.11
Group *id* can only contain ASCII digits and cannot start with ``0``.


The special sequences consist of ``'\'`` and a character from the list below.
If the ordinary character is not an ASCII digit or an ASCII letter, then the
Expand Down Expand Up @@ -995,6 +1003,12 @@ form.
Empty matches for the pattern are replaced when adjacent to a previous
non-empty match.

.. versionchanged:: 3.11
Group *id* can only contain ASCII digits and cannot start with ``0``
(except for group 0 itself).
In bytes replacement strings, group names must contain only characters
in the ASCII range.


.. function:: subn(pattern, repl, string, count=0, flags=0)

Expand Down
8 changes: 8 additions & 0 deletions Doc/whatsnew/3.11.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,14 @@ Changes in the Python API
before.
(Contributed by Ma Lin in :issue:`35859`.)

* Stricter rules are now applied for numerical group references and
group names in regular expressions.
Only a sequence of ASCII digits not starting with ``0`` (except group 0) is
now accepted as a numerical reference.
The group name in bytes patterns and replacement strings can now only
contain ASCII letters, digits, and underscores.
(Contributed by Serhiy Storchaka in :issue:`91760`.)

* The *population* parameter of :func:`random.sample` must be a sequence.
Automatic conversion of sets to lists is no longer supported. If the sample size
is larger than the population size, a :exc:`ValueError` is raised.
Expand Down
59 changes: 27 additions & 32 deletions Lib/re/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,14 @@ def seek(self, index):
# Build an error object for this source and return it; it is NOT raised here
# (callers write ``raise self.error(...)``).  The third argument handed to
# error() is the current position, self.tell(), moved back by ``offset``.
def error(self, msg, offset=0):
return error(msg, self.string, self.tell() - offset)

# Validate a group name, raising self.error(...) when it is not acceptable.
# ``offset`` is added to len(name) to form the offset passed to self.error(),
# which subtracts it from the current position.
def checkgroupname(self, name, offset):
# Unless self.istext is true, the name must be pure ASCII.  %a formats the
# name with ascii(), so non-ASCII characters appear escaped in the message.
# NOTE(review): istext is presumably false for bytes patterns -- confirm
# against the Tokenizer constructor, which is not visible here.
if not (self.istext or name.isascii()):
msg = "bad character in group name %a" % name
raise self.error(msg, len(name) + offset)
# In every case the name must also be a valid Python identifier.
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset)

def _class_escape(source, escape):
# handle escape code inside character class
code = ESCAPES.get(escape)
Expand Down Expand Up @@ -707,15 +715,11 @@ def _parse(source, state, verbose, nested, first=False):
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
source.checkgroupname(name, 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
source.checkgroupname(name, 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name %r" % name
Expand Down Expand Up @@ -776,25 +780,21 @@ def _parse(source, state, verbose, nested, first=False):
elif char == "(":
# conditional backreference group
condname = source.getuntil(")", "group name")
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1)
else:
try:
condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError:
msg = "bad character in group name %r" % condname
raise source.error(msg, len(condname) + 1) from None
if (condname.isdecimal() and condname.isascii() and
(condname[0] != "0" or condname == "0")):
condgroup = int(condname)
if not condgroup:
raise source.error("bad group number",
len(condname) + 1)
if condgroup >= MAXGROUPS:
msg = "invalid group reference %d" % condgroup
raise source.error(msg, len(condname) + 1)
else:
source.checkgroupname(condname, 1)
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1)
state.checklookbehindgroup(condgroup, source)
item_yes = _parse(source, state, verbose, nested + 1)
if source.match("|"):
Expand Down Expand Up @@ -1006,26 +1006,21 @@ def addgroup(index, pos):
# group
c = this[1]
if c == "g":
name = ""
if not s.match("<"):
raise s.error("missing <")
name = s.getuntil(">", "group name")
if name.isidentifier():
if (name.isdecimal() and name.isascii() and
(name[0] != "0" or name == "0")):
index = int(name)
if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index,
len(name) + 1)
else:
s.checkgroupname(name, 1)
try:
index = groupindex[name]
except KeyError:
raise IndexError("unknown group name %r" % name) from None
else:
try:
index = int(name)
if index < 0:
raise ValueError
except ValueError:
raise s.error("bad character in group name %r" % name,
len(name) + 1) from None
if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index,
len(name) + 1)
addgroup(index, len(name) + 1)
elif c == "0":
if s.next in OCTDIGITS:
Expand Down
29 changes: 29 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ def test_basic_re_sub(self):
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')

self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
Expand Down Expand Up @@ -274,6 +275,12 @@ def test_symbolic_groups_errors(self):
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
self.checkPatternError(b'(?P<\xc2\xb5>x)',
r"bad character in group name '\xc2\xb5'", 4)
self.checkPatternError(b'(?P=\xc2\xb5)',
r"bad character in group name '\xc2\xb5'", 4)
self.checkPatternError(b'(?(\xc2\xb5)y)',
r"bad character in group name '\xc2\xb5'", 3)

def test_symbolic_refs(self):
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
Expand Down Expand Up @@ -306,12 +313,24 @@ def test_symbolic_refs_errors(self):
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
"bad character in group name '-1'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
"bad character in group name '+1'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<01>', 'xx',
"bad character in group name '01'", 3)
self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
"bad character in group name '1_0'", 3)
self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
"bad character in group name ' 1 '", 3)
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
"bad character in group name '©'", 3)
self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
r"bad character in group name '\xc2\xb5'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
"bad character in group name '㊀'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
"bad character in group name '¹'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
"bad character in group name '१'", 3)

def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
Expand Down Expand Up @@ -577,10 +596,20 @@ def test_re_groupref_exists_errors(self):
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
self.checkPatternError(r'()(?(-1)a|b)',
"bad character in group name '-1'", 5)
self.checkPatternError(r'()(?(+1)a|b)',
"bad character in group name '+1'", 5)
self.checkPatternError(r'()(?(01)a|b)',
"bad character in group name '01'", 5)
self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
"bad character in group name '1_0'", 23)
self.checkPatternError(r'()(?( 1 )a|b)',
"bad character in group name ' 1 '", 5)
self.checkPatternError(r'()(?(㊀)a|b)',
"bad character in group name '㊀'", 5)
self.checkPatternError(r'()(?(¹)a|b)',
"bad character in group name '¹'", 5)
self.checkPatternError(r'()(?(१)a|b)',
"bad character in group name '१'", 5)
self.checkPatternError(r'()(?(1',
"missing ), unterminated name", 5)
self.checkPatternError(r'()(?(1)a',
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Apply stricter rules for numerical group references and group names in
regular expressions. Only a sequence of ASCII digits not starting with 0
(except group 0) is now accepted as a numerical reference. The group name in
bytes patterns and replacement strings can now only contain ASCII letters,
digits, and underscores.