Skip to content

Commit 12bf67b

Browse files
committed
This PR references issue gh-140797
It adds validation to re.Scanner.__init__ that rejects lexicon patterns containing capturing groups. If a user-supplied pattern contains any capturing groups, Scanner now raises ValueError with a clear message advising the use of non-capturing groups (?:...) instead.
1 parent 349de57 commit 12bf67b

File tree

3 files changed

+39
-0
lines changed

3 files changed

+39
-0
lines changed

Lib/re/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,17 @@ def _pickle(p):
389389
class Scanner:
390390
def __init__(self, lexicon, flags=0):
391391
from ._constants import BRANCH, SUBPATTERN
392+
393+
394+
for phrase,action in lexicon:
395+
_compiled_phrase= _compiler.compile(phrase,flags)
396+
397+
if _compiled_phrase.groups !=0:
398+
raise ValueError(
399+
"re.Scanner lexicon patterns must not contain capturing groups;\n"
400+
"Please use non-capturing groups (?:...) instead"
401+
)
402+
392403
if isinstance(flags, RegexFlag):
393404
flags = flags.value
394405
self.lexicon = lexicon

Lib/test/test_re.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,6 +1639,30 @@ def s_int(scanner, token): return int(token)
16391639
(['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
16401640
'op+', 'bar'], ''))
16411641

1642+
def test_bug_140797(self):
1643+
#bug 140797: re.Scanner must reject lexicon patterns that contain capturing groups
1644+
1645+
#Presence of a capturing group raises ValueError
1646+
lex = [("(a)b", None)]
1647+
with self.assertRaises(ValueError):
1648+
Scanner(lex)
1649+
1650+
#Presence of non-capturing groups should pass normally
1651+
s = Scanner([("(?:a)b", lambda scanner, token: token)])
1652+
result, rem = s.scan("ab")
1653+
self.assertEqual(result,['ab'])
1654+
self.assertEqual(rem,'')
1655+
1656+
#Testing a quantified named capturing group
1657+
pattern= "(?P<name>[A-Za-z]+)*="
1658+
with self.assertRaises(ValueError):
1659+
Scanner([(pattern, None)])
1660+
1661+
#Testing the pattern given by the user
1662+
lexicon = [('(?P<group1>a)(?P<group2>b)', None)]
1663+
with self.assertRaises(ValueError):
1664+
Scanner(lexicon)
1665+
16421666
def test_bug_448951(self):
16431667
# bug 448951 (similar to 429357, but with single char match)
16441668
# (Also test greedy matches.)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
The re.Scanner class now forbids regular expressions containing capturing
2+
groups in its lexicon patterns. Patterns using capturing groups could
3+
previously lead to crashes with a segmentation fault. Use non-capturing groups
4+
(?:...) instead.

0 commit comments

Comments
 (0)