Skip to content

Commit dfa6790

Browse files
committed
New re version from AMK
1 parent f3d729c commit dfa6790

File tree

3 files changed

+201
-58
lines changed

3 files changed

+201
-58
lines changed

Lib/re.py

Lines changed: 93 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#!/usr/bin/env python
22
# -*- mode: python -*-
3-
# $Id$
4-
53

64
import sys
75
import string
@@ -12,11 +10,12 @@
1210
#
1311

1412
# pcre.error and re.error should be the same, since exceptions can be
15-
# raised from either module.
13+
# raised from either module.
1614

1715
# compilation flags
1816

1917
I = IGNORECASE
18+
L = LOCALE
2019
M = MULTILINE
2120
S = DOTALL
2221
X = VERBOSE
@@ -61,8 +60,25 @@ def split(pattern, string, maxsplit=0):
6160
pattern = _cachecompile(pattern)
6261
return pattern.split(string, maxsplit)
6362

63+
def escape(pattern):
64+
"Escape all non-alphanumeric characters in pattern."
65+
result = []
66+
alphanum=string.letters+'_'+string.digits
67+
for char in pattern:
68+
if char not in alphanum:
69+
result.append('\\')
70+
result.append(char)
71+
return string.join(result, '')
72+
73+
def compile(pattern, flags=0):
74+
"Compile a regular expression pattern, returning a RegexObject."
75+
groupindex={}
76+
code=pcre_compile(pattern, flags, groupindex)
77+
return RegexObject(pattern, flags, code, groupindex)
78+
79+
6480
#
65-
#
81+
# Class definitions
6682
#
6783

6884
class RegexObject:
@@ -71,31 +87,54 @@ def __init__(self, pattern, flags, code, groupindex):
7187
self.flags = flags
7288
self.pattern = pattern
7389
self.groupindex = groupindex
74-
def search(self, string, pos=0):
75-
regs = self.code.match(string, pos, 0)
90+
91+
def search(self, string, pos=0, endpos=None):
92+
"""Scan through string looking for a match to the pattern, returning
93+
a MatchObject instance, or None if no match was found."""
94+
95+
if endpos is None or endpos>len(string):
96+
endpos=len(string)
97+
if endpos<pos: endpos=pos
98+
regs = self.code.match(string, pos, endpos, 0)
7699
if regs is None:
77100
return None
78-
self.num_regs=len(regs)
101+
self._num_regs=len(regs)
79102

80103
return MatchObject(self,
81104
string,
82-
pos,
105+
pos, endpos,
83106
regs)
84107

85-
def match(self, string, pos=0):
86-
regs = self.code.match(string, pos, ANCHORED)
108+
def match(self, string, pos=0, endpos=None):
109+
"""Try to apply the pattern at the start of the string, returning
110+
a MatchObject instance, or None if no match was found."""
111+
112+
if endpos is None or endpos>len(string):
113+
endpos=len(string)
114+
if endpos<pos: endpos=pos
115+
regs = self.code.match(string, pos, endpos, ANCHORED)
87116
if regs is None:
88117
return None
89-
self.num_regs=len(regs)
118+
self._num_regs=len(regs)
90119
return MatchObject(self,
91120
string,
92-
pos,
121+
pos, endpos,
93122
regs)
94123

95124
def sub(self, repl, string, count=0):
125+
"""Return the string obtained by replacing the leftmost
126+
non-overlapping occurrences of the pattern in string by the
127+
replacement repl"""
128+
96129
return self.subn(repl, string, count)[0]
97130

98-
def subn(self, repl, source, count=0):
131+
def subn(self, repl, source, count=0):
132+
"""Return a 2-tuple containing (new_string, number).
133+
new_string is the string obtained by replacing the leftmost
134+
non-overlapping occurrences of the pattern in string by the
135+
replacement repl. number is the number of substitutions that
136+
were made."""
137+
99138
if count < 0:
100139
raise error, "negative substitution count"
101140
if count == 0:
@@ -134,6 +173,9 @@ def subn(self, repl, source, count=0):
134173
return (string.join(results, ''), n)
135174

136175
def split(self, source, maxsplit=0):
176+
"""Split the source string by the occurrences of the pattern,
177+
returning a list containing the resulting substrings."""
178+
137179
if maxsplit < 0:
138180
raise error, "negative split count"
139181
if maxsplit == 0:
@@ -156,50 +198,77 @@ def split(self, source, maxsplit=0):
156198
pos = pos+1
157199
continue
158200
results.append(source[lastmatch:i])
159-
g = m.group()
201+
g = m.groups()
160202
if g:
203+
if type(g)==type( "" ): g = [g]
161204
results[len(results):] = list(g)
162205
pos = lastmatch = j
163206
results.append(source[lastmatch:])
164207
return results
165208

209+
# The following 3 functions were contributed by Mike Fletcher, and
210+
# allow pickling and unpickling of RegexObject instances.
211+
def __getinitargs__(self):
212+
return (None,None,None,None) # any 4 elements, to work around
213+
# problems with the
214+
# pickle/cPickle modules not yet
215+
# ignoring the __init__ function
216+
def __getstate__(self):
217+
return self.pattern, self.flags, self.groupindex
218+
def __setstate__(self, statetuple):
219+
self.pattern = statetuple[0]
220+
self.flags = statetuple[1]
221+
self.groupindex = statetuple[2]
222+
self.code = apply(pcre_compile, statetuple)
223+
166224
class MatchObject:
167-
def __init__(self, re, string, pos, regs):
225+
def __init__(self, re, string, pos, endpos, regs):
168226
self.re = re
169227
self.string = string
170-
self.pos = pos
228+
self.pos = pos
229+
self.endpos = endpos
171230
self.regs = regs
172231

173-
def start(self, g):
232+
def start(self, g = 0):
233+
"Return the start of the substring matched by group g"
174234
if type(g) == type(''):
175235
try:
176236
g = self.re.groupindex[g]
177237
except (KeyError, TypeError):
178238
raise IndexError, ('group "' + g + '" is undefined')
179239
return self.regs[g][0]
180240

181-
def end(self, g):
241+
def end(self, g = 0):
242+
"Return the end of the substring matched by group g"
182243
if type(g) == type(''):
183244
try:
184245
g = self.re.groupindex[g]
185246
except (KeyError, TypeError):
186247
raise IndexError, ('group "' + g + '" is undefined')
187248
return self.regs[g][1]
188249

189-
def span(self, g):
250+
def span(self, g = 0):
251+
"""Return a tuple containing the start,end of the substring
252+
matched by group g"""
190253
if type(g) == type(''):
191254
try:
192255
g = self.re.groupindex[g]
193256
except (KeyError, TypeError):
194257
raise IndexError, ('group "' + g + '" is undefined')
195258
return self.regs[g]
196259

260+
def groups(self):
261+
"Return a tuple containing all subgroups of the match object"
262+
263+
# If _num_regs==1, we don't want to call self.group with an
264+
# empty tuple.
265+
if self.re._num_regs == 1: return ()
266+
return apply(self.group, tuple(range(1, self.re._num_regs) ) )
267+
197268
def group(self, *groups):
269+
"Return one or more groups of the match."
198270
if len(groups) == 0:
199-
groups = range(1, self.re.num_regs)
200-
use_all = 1
201-
else:
202-
use_all = 0
271+
groups = (0,)
203272
result = []
204273
for g in groups:
205274
if type(g) == type(''):
@@ -212,25 +281,10 @@ def group(self, *groups):
212281
result.append(None)
213282
else:
214283
result.append(self.string[self.regs[g][0]:self.regs[g][1]])
215-
if use_all or len(result) > 1:
284+
if len(result) > 1:
216285
return tuple(result)
217286
elif len(result) == 1:
218287
return result[0]
219288
else:
220289
return ()
221290

222-
def escape(pattern):
223-
result = []
224-
alphanum=string.letters+'_'+string.digits
225-
for char in pattern:
226-
if char not in alphanum:
227-
result.append('\\')
228-
result.append(char)
229-
return string.join(result, '')
230-
231-
def compile(pattern, flags=0):
232-
groupindex={}
233-
code=pcre_compile(pattern, flags, groupindex)
234-
return RegexObject(pattern, flags, code, groupindex)
235-
236-

Lib/test/re_tests.py

Lines changed: 66 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# -*- mode: python -*-
33
# $Id$
44

5-
# Re test suite and benchmark suite v1.5a2
5+
# Re test suite and benchmark suite v1.5b2
66

77
# The 3 possible outcomes for each pattern
88
[SUCCEED, FAIL, SYNTAX_ERROR] = range(3)
@@ -47,7 +47,62 @@
4747
#
4848
# If the regex isn't expected to work, the latter two elements can be omitted.
4949

50-
tests = [
50+
tests = [
51+
# Test ?P< and ?P= extensions
52+
('(?P<foo_123', '', SYNTAX_ERROR), # Unterminated group identifier
53+
('(?P<1>a)', '', SYNTAX_ERROR), # Begins with a digit
54+
('(?P<!>a)', '', SYNTAX_ERROR), # Begins with an illegal char
55+
('(?P<foo!>a)', '', SYNTAX_ERROR), # Contains an illegal char
56+
57+
# Same tests, for the ?P= form
58+
('(?P<foo_123>a)(?P=foo_123', 'aa', SYNTAX_ERROR),
59+
('(?P<foo_123>a)(?P=1)', 'aa', SYNTAX_ERROR),
60+
('(?P<foo_123>a)(?P=!)', 'aa', SYNTAX_ERROR),
61+
('(?P<foo_123>a)(?P=foo_124', 'aa', SYNTAX_ERROR), # Backref to undefined group
62+
63+
('(?P<foo_123>a)', 'a', SUCCEED, 'g1', 'a'),
64+
('(?P<foo_123>a)(?P=foo_123)', 'aa', SUCCEED, 'g1', 'a'),
65+
66+
# Test octal escapes
67+
('\\1', 'a', SYNTAX_ERROR),
68+
('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'),
69+
('\\141', 'a', SUCCEED, 'found', 'a'),
70+
('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'),
71+
72+
# Test that a literal \0 is handled everywhere
73+
('\0', '\0', SUCCEED, 'found', '\0'),
74+
(r'\0', '\0', SUCCEED, 'found', '\0'),
75+
('[\0a]', '\0', SUCCEED, 'found', '\0'),
76+
('[a\0]', '\0', SUCCEED, 'found', '\0'),
77+
('[^a\0]', '\0', FAIL),
78+
(r'[\0a]', '\0', SUCCEED, 'found', '\0'),
79+
(r'[a\0]', '\0', SUCCEED, 'found', '\0'),
80+
(r'[^a\0]', '\0', FAIL),
81+
82+
# Test various letter escapes
83+
(r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
84+
(r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'),
85+
(r'\u', '', SYNTAX_ERROR), # A Perl escape
86+
(r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'),
87+
(r'\xff', '\377', SUCCEED, 'found', chr(255)),
88+
(r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)),
89+
(r'\x00f', '\017', SUCCEED, 'found', chr(15)),
90+
(r'\x00fe', '\376', SUCCEED, 'found', chr(254)),
91+
92+
(r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c",
93+
SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"),
94+
95+
# Test that . only matches \n in DOTALL mode
96+
('a.b', 'acb', SUCCEED, 'found', 'acb'),
97+
('a.b', 'a\nb', FAIL),
98+
('a.*b', 'acc\nccb', FAIL),
99+
('a.{4,5}b', 'acc\nccb', FAIL),
100+
('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'),
101+
('a.b(?s)', 'a\nb', SUCCEED, 'found', 'a\nb'),
102+
('a.*(?s)b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'),
103+
('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'),
104+
('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'),
105+
51106
('abc', 'abc', SUCCEED, 'found', 'abc'),
52107
('abc', 'xbc', FAIL),
53108
('abc', 'axc', FAIL),
@@ -338,8 +393,9 @@
338393
('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'),
339394
('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'),
340395
('[k]', 'ab', FAIL),
341-
##('abcd', 'abcd', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'abcd-$&-\\abcd'),
342-
##('a(bc)d', 'abcd', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'bc-$1-\\bc'),
396+
# XXX
397+
# ('abcd', 'abcd', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'abcd-$&-\\abcd'),
398+
# ('a(bc)d', 'abcd', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'bc-$1-\\bc'),
343399
('a[-]?c', 'ac', SUCCEED, 'found', 'ac'),
344400
('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'),
345401
('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'),
@@ -470,15 +526,14 @@
470526
('(?i)(.*)c(.*)', 'ABCDE', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCDE-AB-DE'),
471527
('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, 'g2+"-"+g1', 'B-A'),
472528
('(?i)[k]', 'AB', FAIL),
473-
##('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'),
474-
##('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'),
529+
# ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'),
530+
# ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'),
475531
('(?i)a[-]?c', 'AC', SUCCEED, 'found', 'AC'),
476532
('(?i)(abc)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'),
477533
('(?i)([a-c]*)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'),
478-
# these zero-width assertions are not supported
479-
#('a(?!b).', 'abad', SUCCEED, 'found', 'ad'),
480-
#('a(?=d).', 'abad', SUCCEED, 'found', 'ad'),
481-
#('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'),
534+
('a(?!b).', 'abad', SUCCEED, 'found', 'ad'),
535+
('a(?=d).', 'abad', SUCCEED, 'found', 'ad'),
536+
('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'),
482537
('a(?:b|c|d)(.)', 'ace', SUCCEED, 'g1', 'e'),
483538
('a(?:b|c|d)*(.)', 'ace', SUCCEED, 'g1', 'e'),
484539
('a(?:b|c|d)+?(.)', 'ace', SUCCEED, 'g1', 'e'),
@@ -535,5 +590,5 @@
535590
(r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
536591
('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'),
537592
(r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)),
538-
(r'[\t][\n][\v][\r][\f][\a][\A][\b][\B][\Z][\g]', '\t\n\v\r\f\aA\bBZg', SUCCEED, 'found', '\t\n\v\r\f\aA\bBZg'),
593+
(r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'),
539594
]

0 commit comments

Comments
 (0)