Skip to content

Commit 8977c39

Browse files
authored
Updated the re library + test (#6648)
* Updated re library + test
* Copied over generate_sre_constants from cpython/Tools
* Customized `generate_sre_constants.py` + ran to update `constants.rs`
* Clarified `dump_enum` docstring in `generate_sre_constants.py`
1 parent bdf3b36 commit 8977c39

File tree

10 files changed

+565
-227
lines changed

10 files changed

+565
-227
lines changed

Lib/re/__init__.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@
117117
U UNICODE For compatibility only. Ignored for string patterns (it
118118
is the default), and forbidden for bytes patterns.
119119
120-
This module also defines an exception 'error'.
120+
This module also defines exception 'PatternError', aliased to 'error' for
121+
backward compatibility.
121122
122123
"""
123124

@@ -133,7 +134,7 @@
133134
"findall", "finditer", "compile", "purge", "escape",
134135
"error", "Pattern", "Match", "A", "I", "L", "M", "S", "X", "U",
135136
"ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
136-
"UNICODE", "NOFLAG", "RegexFlag",
137+
"UNICODE", "NOFLAG", "RegexFlag", "PatternError"
137138
]
138139

139140
__version__ = "2.2.1"
@@ -155,7 +156,7 @@ class RegexFlag:
155156
_numeric_repr_ = hex
156157

157158
# sre exception
158-
error = _compiler.error
159+
PatternError = error = _compiler.PatternError
159160

160161
# --------------------------------------------------------------------
161162
# public interface

Lib/re/_casefix.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Auto-generated by Tools/scripts/generate_re_casefix.py.
1+
# Auto-generated by Tools/build/generate_re_casefix.py.
22

33
# Maps the code of lowercased character to codes of different lowercased
44
# characters which have the same uppercase.

Lib/re/_compiler.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,6 @@ def _compile(code, pattern, flags):
101101
else:
102102
emit(ANY)
103103
elif op in REPEATING_CODES:
104-
if flags & SRE_FLAG_TEMPLATE:
105-
raise error("internal: unsupported template operator %r" % (op,))
106104
if _simple(av[2]):
107105
emit(REPEATING_CODES[op][2])
108106
skip = _len(code); emit(0)
@@ -152,7 +150,7 @@ def _compile(code, pattern, flags):
152150
if lo > MAXCODE:
153151
raise error("looks too much behind")
154152
if lo != hi:
155-
raise error("look-behind requires fixed-width pattern")
153+
raise PatternError("look-behind requires fixed-width pattern")
156154
emit(lo) # look behind
157155
_compile(code, av[1], flags)
158156
emit(SUCCESS)
@@ -211,7 +209,7 @@ def _compile(code, pattern, flags):
211209
else:
212210
code[skipyes] = _len(code) - skipyes + 1
213211
else:
214-
raise error("internal: unsupported operand type %r" % (op,))
212+
raise PatternError(f"internal: unsupported operand type {op!r}")
215213

216214
def _compile_charset(charset, flags, code):
217215
# compile charset subprogram
@@ -237,7 +235,7 @@ def _compile_charset(charset, flags, code):
237235
else:
238236
emit(av)
239237
else:
240-
raise error("internal: unsupported set operator %r" % (op,))
238+
raise PatternError(f"internal: unsupported set operator {op!r}")
241239
emit(FAILURE)
242240

243241
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
@@ -250,19 +248,19 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
250248
while True:
251249
try:
252250
if op is LITERAL:
253-
if fixup:
254-
lo = fixup(av)
255-
charmap[lo] = 1
256-
if fixes and lo in fixes:
257-
for k in fixes[lo]:
251+
if fixup: # IGNORECASE and not LOCALE
252+
av = fixup(av)
253+
charmap[av] = 1
254+
if fixes and av in fixes:
255+
for k in fixes[av]:
258256
charmap[k] = 1
259257
if not hascased and iscased(av):
260258
hascased = True
261259
else:
262260
charmap[av] = 1
263261
elif op is RANGE:
264262
r = range(av[0], av[1]+1)
265-
if fixup:
263+
if fixup: # IGNORECASE and not LOCALE
266264
if fixes:
267265
for i in map(fixup, r):
268266
charmap[i] = 1
@@ -289,8 +287,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
289287
# Character set contains non-BMP character codes.
290288
# For range, all BMP characters in the range are already
291289
processed.
292-
if fixup:
293-
hascased = True
290+
if fixup: # IGNORECASE and not LOCALE
294291
# For now, IN_UNI_IGNORE+LITERAL and
295292
# IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
296293
# characters, because two characters (at least one of
@@ -301,7 +298,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
301298
# Also, both c.lower() and c.lower().upper() are single
302299
# characters for every non-BMP character.
303300
if op is RANGE:
304-
op = RANGE_UNI_IGNORE
301+
if fixes: # not ASCII
302+
op = RANGE_UNI_IGNORE
303+
hascased = True
304+
else:
305+
assert op is LITERAL
306+
if not hascased and iscased(av):
307+
hascased = True
305308
tail.append((op, av))
306309
break
307310

@@ -763,4 +766,3 @@ def compile(p, flags=0):
763766
p.state.groups-1,
764767
groupindex, tuple(indexgroup)
765768
)
766-

Lib/re/_constants.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313

1414
# update when constants are added or removed
1515

16-
MAGIC = 20221023
16+
MAGIC = 20230612
1717

1818
from _sre import MAXREPEAT, MAXGROUPS
1919

2020
# SRE standard exception (access as sre.error)
2121
# should this really be here?
2222

23-
class error(Exception):
23+
class PatternError(Exception):
2424
"""Exception raised for invalid regular expressions.
2525
2626
Attributes:
@@ -53,6 +53,9 @@ def __init__(self, msg, pattern=None, pos=None):
5353
super().__init__(msg)
5454

5555

56+
# Backward compatibility after renaming in 3.13
57+
error = PatternError
58+
5659
class _NamedIntConstant(int):
5760
def __new__(cls, value, name):
5861
self = super(_NamedIntConstant, cls).__new__(cls, value)
@@ -204,7 +207,6 @@ def _makecodes(*names):
204207
}
205208

206209
# flags
207-
SRE_FLAG_TEMPLATE = 1 # template mode (unknown purpose, deprecated)
208210
SRE_FLAG_IGNORECASE = 2 # case insensitive
209211
SRE_FLAG_LOCALE = 4 # honour system locale
210212
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
@@ -218,4 +220,3 @@ def _makecodes(*names):
218220
SRE_INFO_PREFIX = 1 # has prefix
219221
SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
220222
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
221-
RE_INFO_CHARSET = 4 # pattern starts with character from given set

Lib/re/_parser.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,11 @@
6161
"x": SRE_FLAG_VERBOSE,
6262
# extensions
6363
"a": SRE_FLAG_ASCII,
64-
"t": SRE_FLAG_TEMPLATE,
6564
"u": SRE_FLAG_UNICODE,
6665
}
6766

6867
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
69-
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
68+
GLOBAL_FLAGS = SRE_FLAG_DEBUG
7069

7170
# Maximal value returned by SubPattern.getwidth().
7271
# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
@@ -781,8 +780,10 @@ def _parse(source, state, verbose, nested, first=False):
781780
source.tell() - start)
782781
if char == "=":
783782
subpatternappend((ASSERT, (dir, p)))
784-
else:
783+
elif p:
785784
subpatternappend((ASSERT_NOT, (dir, p)))
785+
else:
786+
subpatternappend((FAILURE, ()))
786787
continue
787788

788789
elif char == "(":

Lib/sre_constants.py

Lines changed: 0 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -5,81 +5,3 @@
55

66
from re import _constants as _
77
globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'})
8-
9-
if __name__ == "__main__":
10-
def dump(f, d, typ, int_t, prefix):
11-
items = sorted(d)
12-
f.write(f"""\
13-
#[derive(num_enum::TryFromPrimitive, Debug)]
14-
#[repr({int_t})]
15-
#[allow(non_camel_case_types, clippy::upper_case_acronyms)]
16-
pub enum {typ} {{
17-
""")
18-
for item in items:
19-
name = str(item).removeprefix(prefix)
20-
val = int(item)
21-
f.write(f" {name} = {val},\n")
22-
f.write("""\
23-
}
24-
""")
25-
import sys
26-
if len(sys.argv) > 1:
27-
constants_file = sys.argv[1]
28-
else:
29-
import os
30-
constants_file = os.path.join(os.path.dirname(__file__), "../../sre-engine/src/constants.rs")
31-
with open(constants_file, "w") as f:
32-
f.write("""\
33-
/*
34-
* Secret Labs' Regular Expression Engine
35-
*
36-
* regular expression matching engine
37-
*
38-
* NOTE: This file is generated by sre_constants.py. If you need
39-
* to change anything in here, edit sre_constants.py and run it.
40-
*
41-
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
42-
*
43-
* See the _sre.c file for information on usage and redistribution.
44-
*/
45-
46-
""")
47-
48-
f.write("use bitflags::bitflags;\n\n");
49-
50-
f.write("pub const SRE_MAGIC: usize = %d;\n" % MAGIC)
51-
52-
dump(f, OPCODES, "SreOpcode", "u32", "")
53-
dump(f, ATCODES, "SreAtCode", "u32", "AT_")
54-
dump(f, CHCODES, "SreCatCode", "u32", "CATEGORY_")
55-
56-
def bitflags(typ, int_t, prefix, flags):
57-
f.write(f"""\
58-
bitflags! {{
59-
pub struct {typ}: {int_t} {{
60-
""")
61-
for name in flags:
62-
val = globals()[prefix + name]
63-
f.write(f" const {name} = {val};\n")
64-
f.write("""\
65-
}
66-
}
67-
""")
68-
69-
bitflags("SreFlag", "u16", "SRE_FLAG_", [
70-
"TEMPLATE",
71-
"IGNORECASE",
72-
"LOCALE",
73-
"MULTILINE",
74-
"DOTALL",
75-
"UNICODE",
76-
"VERBOSE",
77-
"DEBUG",
78-
"ASCII",
79-
])
80-
81-
bitflags("SreInfo", "u32", "SRE_INFO_", [
82-
"PREFIX", "LITERAL", "CHARSET",
83-
])
84-
85-
print("done")

0 commit comments

Comments
 (0)