Skip to content

Commit 29db6ca

Browse files
committed
This PR references issue gh-140797
It adds validation to re.Scanner.__init__ that rejects lexicon patterns containing capturing groups. If a user-supplied pattern contains any capturing groups, Scanner now raises ValueError with a clear message advising the use of non-capturing groups (?:...) instead.
1 parent 349de57 commit 29db6ca

File tree

3 files changed

+31
-1
lines changed

3 files changed

+31
-1
lines changed

Lib/re/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,9 +397,16 @@ def __init__(self, lexicon, flags=0):
397397
s = _parser.State()
398398
s.flags = flags
399399
for phrase, action in lexicon:
400+
sub_pattern = _parser.parse(phrase, flags)
401+
if sub_pattern.state.groups != 1: # <- 1 means always has \0
402+
raise ValueError(
403+
"re.Scanner lexicon patterns must not contain capturing groups;\n"
404+
"Please use non-capturing groups (?:...) instead"
405+
)
406+
400407
gid = s.opengroup()
401408
p.append(_parser.SubPattern(s, [
402-
(SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))),
409+
(SUBPATTERN, (gid, 0, 0, sub_pattern)),
403410
]))
404411
s.closegroup(gid, p[-1])
405412
p = _parser.SubPattern(s, [(BRANCH, (None, p))])

Lib/test/test_re.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,6 +1639,25 @@ def s_int(scanner, token): return int(token)
16391639
(['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
16401640
'op+', 'bar'], ''))
16411641

1642+
def test_bug_140797(self):
1643+
#bug 140797: remove capturing groups compilation from re.Scanner
1644+
1645+
#Presence of Capturing group throws an error
1646+
lex = [("(a)b", None)]
1647+
with self.assertRaises(ValueError):
1648+
Scanner(lex)
1649+
1650+
#Presence of non-capturing groups should pass normally
1651+
s = Scanner([("(?:a)b", lambda scanner, token: token)])
1652+
result, rem = s.scan("ab")
1653+
self.assertEqual(result,['ab'])
1654+
self.assertEqual(rem,'')
1655+
1656+
#Testing a very complex capturing group
1657+
pattern= "(?P<name>a)"
1658+
with self.assertRaises(ValueError):
1659+
Scanner([(pattern, None)])
1660+
16421661
def test_bug_448951(self):
16431662
# bug 448951 (similar to 429357, but with single char match)
16441663
# (Also test greedy matches.)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
The re.Scanner class now forbids regular expressions containing capturing
2+
groups in its lexicon patterns. Patterns using capturing groups could
3+
previously lead to crashes with a segmentation fault. Use non-capturing groups
4+
(?:...) instead.

0 commit comments

Comments
 (0)