Skip to content

Commit 280337a

Browse files
qingshi163 authored and youknowone committed
Add Lib/re/* from CPython 3.12
1 parent 02cec85 commit 280337a

5 files changed

Lines changed: 2301 additions & 89 deletions

File tree

Lib/re.py renamed to Lib/re/__init__.py

Lines changed: 132 additions & 89 deletions
Original file line number | Diff line number | Diff line change
@@ -122,65 +122,40 @@
122122
"""
123123

124124
import enum
125-
import sre_compile
126-
import sre_parse
125+
from . import _compiler, _parser
127126
import functools
128-
try:
129-
import _locale
130-
except ImportError:
131-
_locale = None
127+
import _sre
132128

133129

134130
# public symbols
135131
__all__ = [
136132
"match", "fullmatch", "search", "sub", "subn", "split",
137-
"findall", "finditer", "compile", "purge", "template", "escape",
133+
"findall", "finditer", "compile", "purge", "escape",
138134
"error", "Pattern", "Match", "A", "I", "L", "M", "S", "X", "U",
139135
"ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
140-
"UNICODE",
136+
"UNICODE", "NOFLAG", "RegexFlag",
141137
]
142138

143139
__version__ = "2.2.1"
144140

145-
class RegexFlag(enum.IntFlag):
146-
ASCII = A = sre_compile.SRE_FLAG_ASCII # assume ascii "locale"
147-
IGNORECASE = I = sre_compile.SRE_FLAG_IGNORECASE # ignore case
148-
LOCALE = L = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
149-
UNICODE = U = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale"
150-
MULTILINE = M = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
151-
DOTALL = S = sre_compile.SRE_FLAG_DOTALL # make dot match newline
152-
VERBOSE = X = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
141+
@enum.global_enum
142+
@enum._simple_enum(enum.IntFlag, boundary=enum.KEEP)
143+
class RegexFlag:
144+
NOFLAG = 0
145+
ASCII = A = _compiler.SRE_FLAG_ASCII # assume ascii "locale"
146+
IGNORECASE = I = _compiler.SRE_FLAG_IGNORECASE # ignore case
147+
LOCALE = L = _compiler.SRE_FLAG_LOCALE # assume current 8-bit locale
148+
UNICODE = U = _compiler.SRE_FLAG_UNICODE # assume unicode "locale"
149+
MULTILINE = M = _compiler.SRE_FLAG_MULTILINE # make anchors look for newline
150+
DOTALL = S = _compiler.SRE_FLAG_DOTALL # make dot match newline
151+
VERBOSE = X = _compiler.SRE_FLAG_VERBOSE # ignore whitespace and comments
153152
# sre extensions (experimental, don't rely on these)
154-
TEMPLATE = T = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
155-
DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation
156-
157-
def __repr__(self):
158-
if self._name_ is not None:
159-
return f're.{self._name_}'
160-
value = self._value_
161-
members = []
162-
negative = value < 0
163-
if negative:
164-
value = ~value
165-
for m in self.__class__:
166-
if value & m._value_:
167-
value &= ~m._value_
168-
members.append(f're.{m._name_}')
169-
if value:
170-
members.append(hex(value))
171-
res = '|'.join(members)
172-
if negative:
173-
if len(members) > 1:
174-
res = f'~({res})'
175-
else:
176-
res = f'~{res}'
177-
return res
153+
DEBUG = _compiler.SRE_FLAG_DEBUG # dump pattern after compilation
178154
__str__ = object.__str__
179-
180-
globals().update(RegexFlag.__members__)
155+
_numeric_repr_ = hex
181156

182157
# sre exception
183-
error = sre_compile.error
158+
error = _compiler.error
184159

185160
# --------------------------------------------------------------------
186161
# public interface
@@ -200,16 +175,39 @@ def search(pattern, string, flags=0):
200175
a Match object, or None if no match was found."""
201176
return _compile(pattern, flags).search(string)
202177

# An int subclass whose single instance compares equal to 0 but can still be
# recognised by identity.  Used as the default for ``count``/``maxsplit``/
# ``flags`` so the functions below can tell "argument omitted" apart from an
# explicit 0.
class _ZeroSentinel(int):
    pass
_zero_sentinel = _ZeroSentinel()

def sub(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel):
    """Return the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in string by the
    replacement repl.  repl can be either a string or a callable;
    if a string, backslash escapes in it are processed.  If it is
    a callable, it's passed the Match object and must return
    a replacement string to be used.

    count and flags should be passed by keyword.  For backward
    compatibility they are still accepted as the 4th and 5th positional
    arguments, in which case a DeprecationWarning is issued.

    Raises TypeError if count or flags is given both positionally and
    by keyword, or if more than five positional arguments are passed."""
    if args:
        # Legacy call style: unpack the extra positionals by hand so that a
        # clash with the keyword form is reported like a normal signature
        # mismatch would be.
        if count is not _zero_sentinel:
            raise TypeError("sub() got multiple values for argument 'count'")
        count, *args = args
        if args:
            if flags is not _zero_sentinel:
                raise TypeError("sub() got multiple values for argument 'flags'")
            flags, *args = args
        if args:
            raise TypeError("sub() takes from 3 to 5 positional arguments "
                            "but %d were given" % (5 + len(args)))

        # Imported lazily: only the deprecated call style needs it.
        import warnings
        warnings.warn(
            "'count' is passed as positional argument",
            DeprecationWarning, stacklevel=2
        )

    return _compile(pattern, flags).sub(repl, string, count)
# Advertise the conventional signature instead of the *args-based one.
sub.__text_signature__ = '(pattern, repl, string, count=0, flags=0)'
211209

def subn(pattern, repl, string, *args, count=_zero_sentinel, flags=_zero_sentinel):
    """Return a 2-tuple containing (new_string, number).
    new_string is the string obtained by replacing the leftmost
    non-overlapping occurrences of the pattern in the source
    string by the replacement repl.  number is the number of
    substitutions that were made.  repl can be either a string or a
    callable; if a string, backslash escapes in it are processed.
    If it is a callable, it's passed the Match object and must
    return a replacement string to be used.

    count and flags should be passed by keyword.  For backward
    compatibility they are still accepted as the 4th and 5th positional
    arguments, in which case a DeprecationWarning is issued.

    Raises TypeError if count or flags is given both positionally and
    by keyword, or if more than five positional arguments are passed."""
    if args:
        # Legacy call style: unpack the extra positionals by hand so that a
        # clash with the keyword form is reported like a normal signature
        # mismatch would be.
        if count is not _zero_sentinel:
            raise TypeError("subn() got multiple values for argument 'count'")
        count, *args = args
        if args:
            if flags is not _zero_sentinel:
                raise TypeError("subn() got multiple values for argument 'flags'")
            flags, *args = args
        if args:
            raise TypeError("subn() takes from 3 to 5 positional arguments "
                            "but %d were given" % (5 + len(args)))

        # Imported lazily: only the deprecated call style needs it.
        import warnings
        warnings.warn(
            "'count' is passed as positional argument",
            DeprecationWarning, stacklevel=2
        )

    return _compile(pattern, flags).subn(repl, string, count)
# Advertise the conventional signature instead of the *args-based one.
subn.__text_signature__ = '(pattern, repl, string, count=0, flags=0)'
222239

def split(pattern, string, *args, maxsplit=_zero_sentinel, flags=_zero_sentinel):
    """Split the source string by the occurrences of the pattern,
    returning a list containing the resulting substrings.  If
    capturing parentheses are used in pattern, then the text of all
    groups in the pattern are also returned as part of the resulting
    list.  If maxsplit is nonzero, at most maxsplit splits occur,
    and the remainder of the string is returned as the final element
    of the list.

    maxsplit and flags should be passed by keyword.  For backward
    compatibility they are still accepted as the 3rd and 4th positional
    arguments, in which case a DeprecationWarning is issued.

    Raises TypeError if maxsplit or flags is given both positionally and
    by keyword, or if more than four positional arguments are passed."""
    if args:
        # Legacy call style: unpack the extra positionals by hand so that a
        # clash with the keyword form is reported like a normal signature
        # mismatch would be.
        if maxsplit is not _zero_sentinel:
            raise TypeError("split() got multiple values for argument 'maxsplit'")
        maxsplit, *args = args
        if args:
            if flags is not _zero_sentinel:
                raise TypeError("split() got multiple values for argument 'flags'")
            flags, *args = args
        if args:
            raise TypeError("split() takes from 2 to 4 positional arguments "
                            "but %d were given" % (4 + len(args)))

        # Imported lazily: only the deprecated call style needs it.
        import warnings
        warnings.warn(
            "'maxsplit' is passed as positional argument",
            DeprecationWarning, stacklevel=2
        )

    return _compile(pattern, flags).split(string, maxsplit)
# Advertise the conventional signature instead of the *args-based one.
split.__text_signature__ = '(pattern, string, maxsplit=0, flags=0)'
232268

233269
def findall(pattern, string, flags=0):
234270
"""Return a list of all non-overlapping matches in the string.
@@ -254,11 +290,9 @@ def compile(pattern, flags=0):
def purge():
    "Clear the regular expression caches"
    # Both compiled-pattern caches (_cache and _cache2) are emptied, as is
    # the memoisation cache of the replacement-template compiler.
    _cache.clear()
    _cache2.clear()
    _compile_template.cache_clear()
258295

259-
def template(pattern, flags=0):
260-
"Compile a template pattern, returning a Pattern object"
261-
return _compile(pattern, flags|T)
262296

263297
# SPECIAL_CHARS
264298
# closing ')', '}' and ']'
@@ -277,60 +311,69 @@ def escape(pattern):
277311
pattern = str(pattern, 'latin1')
278312
return pattern.translate(_special_chars_map).encode('latin1')
279313

280-
Pattern = type(sre_compile.compile('', 0))
281-
Match = type(sre_compile.compile('', 0).match(''))
314+
Pattern = type(_compiler.compile('', 0))
315+
Match = type(_compiler.compile('', 0).match(''))
282316

283317
# --------------------------------------------------------------------
284318
# internals
285319

# Use the fact that dict keeps the insertion order.
# _cache2 uses the simple FIFO policy which has better latency.
# _cache uses the LRU policy which has better hit rate.
_cache = {}  # LRU
_cache2 = {}  # FIFO
_MAXCACHE = 512
_MAXCACHE2 = 256
assert _MAXCACHE2 < _MAXCACHE

def _compile(pattern, flags):
    """Internal: return a compiled Pattern for (pattern, flags).

    Lookup order: the small FIFO cache (_cache2), then the larger LRU
    cache (_cache), and only then an actual compilation.

    An already compiled Pattern is returned unchanged; combining it with
    non-zero flags raises ValueError.  Anything that is neither a string
    nor a compiled pattern raises TypeError.  Patterns compiled with the
    DEBUG flag are returned without being cached.
    """
    if isinstance(flags, RegexFlag):
        flags = flags.value
    try:
        # Fast path: a plain dict hit with no reordering.
        return _cache2[type(pattern), pattern, flags]
    except KeyError:
        pass

    key = (type(pattern), pattern, flags)
    # Item in _cache should be moved to the end if found.
    p = _cache.pop(key, None)
    if p is None:
        if isinstance(pattern, Pattern):
            if flags:
                raise ValueError(
                    "cannot process flags argument with a compiled pattern")
            return pattern
        if not _compiler.isstring(pattern):
            raise TypeError("first argument must be string or compiled pattern")
        p = _compiler.compile(pattern, flags)
        if flags & DEBUG:
            return p
        if len(_cache) >= _MAXCACHE:
            # Drop the least recently used item.
            # next(iter(_cache)) is known to have linear amortized time,
            # but it is used here to avoid a dependency from using OrderedDict.
            # For the small _MAXCACHE value it doesn't make much of a difference.
            try:
                del _cache[next(iter(_cache))]
            except (StopIteration, RuntimeError, KeyError):
                # Best-effort eviction; presumably tolerates the dict being
                # mutated concurrently -- TODO confirm.
                pass
    # Append to the end.
    _cache[key] = p

    if len(_cache2) >= _MAXCACHE2:
        # Drop the oldest item.
        try:
            del _cache2[next(iter(_cache2))]
        except (StopIteration, RuntimeError, KeyError):
            pass
    _cache2[key] = p
    return p
314372

@functools.lru_cache(_MAXCACHE)
def _compile_template(pattern, repl):
    """Internal: compile the replacement template `repl` for `pattern`.

    The template is parsed by _parser.parse_template() and the parsed form
    is handed to _sre.template().  Results are memoised by lru_cache with a
    maximum size of _MAXCACHE.
    """
    return _sre.template(pattern, _parser.parse_template(repl, pattern))
334377

335378
# register myself for pickling
336379

@@ -346,22 +389,22 @@ def _pickle(p):
346389

347390
class Scanner:
def __init__(self, lexicon, flags=0):
    """Build one compound pattern out of all phrases in `lexicon`.

    lexicon is an iterable of (phrase, action) pairs and is stored as
    self.lexicon.  Each phrase is parsed with `flags` and wrapped in its
    own group; the groups are combined as alternatives of a single BRANCH,
    and the compiled result is stored as self.scanner.  The actions are
    not used here.
    """
    from ._constants import BRANCH, SUBPATTERN
    if isinstance(flags, RegexFlag):
        flags = flags.value
    self.lexicon = lexicon
    # combine phrases into a compound pattern
    p = []
    s = _parser.State()
    s.flags = flags
    for phrase, action in lexicon:
        gid = s.opengroup()
        p.append(_parser.SubPattern(s, [
            (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))),
            ]))
        s.closegroup(gid, p[-1])
    p = _parser.SubPattern(s, [(BRANCH, (None, p))])
    self.scanner = _compiler.compile(p)
365408
def scan(self, string):
366409
result = []
367410
append = result.append

0 commit comments

Comments
 (0)