Skip to content

Commit c5694c8

Browse files
committed
Moved gencodec.py to the Tools/unicode/ directory.
Added new support for decoding tables. Cleaned up the implementation a bit.
1 parent 3144130 commit c5694c8

File tree

1 file changed

+179
-88
lines changed

1 file changed

+179
-88
lines changed
Lines changed: 179 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,22 @@
1515
The tool also writes marshalled versions of the mapping tables to the
1616
same location (with .mapping extension).
1717
18-
Written by Marc-Andre Lemburg (mal@lemburg.com).
18+
Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
19+
Unicode table maps for decoding.
1920
2021
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
2122
(c) Copyright Guido van Rossum, 2000.
23+
(c) Copyright Marc-Andre Lemburg, 2005.
2224
2325
"""#"
2426

25-
import re,os,time,marshal
27+
import re, os, time, marshal, codecs
2628

27-
# Create numeric tables or character based ones ?
28-
numeric = 1
29+
# Maximum allowed size of charmap tables
30+
MAX_TABLE_SIZE = 8192
31+
32+
# Standard undefined Unicode code point
33+
UNI_UNDEFINED = unichr(0xFFFE)
2934

3035
mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
3136
'\s+'
@@ -69,8 +74,15 @@ def readmap(filename):
6974
enc2uni = {}
7075
identity = []
7176
unmapped = range(256)
72-
for i in range(256):
73-
unmapped[i] = i
77+
78+
# UTC mapping tables per convention don't include the identity
79+
# mappings for code points 0x00 - 0x1F and 0x7F, unless these are
80+
# explicitly mapped to different characters or undefined
81+
for i in range(32) + [127]:
82+
identity.append(i)
83+
unmapped.remove(i)
84+
enc2uni[i] = (i, 'CONTROL CHARACTER')
85+
7486
for line in lines:
7587
line = line.strip()
7688
if not line or line[0] == '#':
@@ -82,22 +94,23 @@ def readmap(filename):
8294
enc,uni,comment = m.groups()
8395
enc = parsecodes(enc)
8496
uni = parsecodes(uni)
85-
if not comment:
97+
if comment is None:
8698
comment = ''
8799
else:
88-
comment = comment[1:]
100+
comment = comment[1:].strip()
89101
if enc < 256:
90-
unmapped.remove(enc)
102+
if enc in unmapped:
103+
unmapped.remove(enc)
91104
if enc == uni:
92105
identity.append(enc)
93-
else:
94-
enc2uni[enc] = (uni,comment)
106+
enc2uni[enc] = (uni,comment)
95107
else:
96108
enc2uni[enc] = (uni,comment)
109+
97110
# If there are more identity-mapped entries than unmapped entries,
98111
# it pays to generate an identity dictionary first, and add explicit
99112
# mappings to None for the rest
100-
if len(identity)>=len(unmapped):
113+
if len(identity) >= len(unmapped):
101114
for enc in unmapped:
102115
enc2uni[enc] = (None, "")
103116
enc2uni['IDENTITY'] = 256
@@ -112,44 +125,146 @@ def hexrepr(t):
112125
len(t)
113126
except:
114127
return '0x%04x' % t
115-
return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
128+
try:
129+
return '(' + ', '.join(map(lambda t: '0x%04x' % t, t)) + ')'
130+
except TypeError, why:
131+
print '* failed to convert %r: %s' % (t, why)
132+
raise
116133

117-
def unicoderepr(t):
134+
def python_mapdef_code(varname, map, comments=1):
118135

119-
if t is None:
120-
return 'None'
121-
if numeric:
122-
return hexrepr(t)
136+
l = []
137+
append = l.append
138+
if map.has_key("IDENTITY"):
139+
append("%s = codecs.make_identity_dict(range(%d))" %
140+
(varname, map["IDENTITY"]))
141+
append("%s.update({" % varname)
142+
splits = 1
143+
del map["IDENTITY"]
144+
identity = 1
123145
else:
124-
try:
125-
len(t)
126-
except:
127-
return repr(unichr(t))
128-
return repr(''.join(map(unichr, t)))
129-
130-
def keyrepr(t):
146+
append("%s = {" % varname)
147+
splits = 0
148+
identity = 0
131149

132-
if t is None:
133-
return 'None'
134-
if numeric:
135-
return hexrepr(t)
150+
mappings = map.items()
151+
mappings.sort()
152+
i = 0
153+
for mapkey, mapvalue in mappings:
154+
mapcomment = ''
155+
if isinstance(mapkey, tuple):
156+
(mapkey, mapcomment) = mapkey
157+
if isinstance(mapvalue, tuple):
158+
(mapvalue, mapcomment) = mapvalue
159+
if mapkey is None:
160+
continue
161+
if (identity and
162+
mapkey == mapvalue and
163+
mapkey < 256):
164+
# No need to include identity mappings, since these
165+
# are already set for the first 256 code points.
166+
continue
167+
key = hexrepr(mapkey)
168+
value = hexrepr(mapvalue)
169+
if mapcomment and comments:
170+
append(' %s: %s,\t# %s' % (key, value, mapcomment))
171+
else:
172+
append(' %s: %s,' % (key, value))
173+
i += 1
174+
if i == 4096:
175+
# Split the definition into parts so that the Python
176+
# parser doesn't dump core
177+
if splits == 0:
178+
append('}')
179+
else:
180+
append('})')
181+
append('%s.update({' % varname)
182+
i = 0
183+
splits = splits + 1
184+
if splits == 0:
185+
append('}')
136186
else:
137-
try:
138-
len(t)
139-
except:
140-
if t < 256:
141-
return repr(chr(t))
187+
append('})')
188+
189+
return l
190+
191+
def python_tabledef_code(varname, map, comments=1):
192+
193+
l = []
194+
append = l.append
195+
append('%s = (' % varname)
196+
197+
# Analyze map and create table dict
198+
mappings = map.items()
199+
mappings.sort()
200+
table = {}
201+
maxkey = 0
202+
if map.has_key('IDENTITY'):
203+
for key in range(256):
204+
table[key] = (key, '')
205+
maxkey = 255
206+
del map['IDENTITY']
207+
for mapkey, mapvalue in mappings:
208+
mapcomment = ''
209+
if isinstance(mapkey, tuple):
210+
(mapkey, mapcomment) = mapkey
211+
if isinstance(mapvalue, tuple):
212+
(mapvalue, mapcomment) = mapvalue
213+
if mapkey is None:
214+
continue
215+
table[mapkey] = (mapvalue, mapcomment)
216+
if mapkey > maxkey:
217+
maxkey = mapkey
218+
if maxkey > MAX_TABLE_SIZE:
219+
# Table too large
220+
return None
221+
222+
# Create table code
223+
for key in range(maxkey + 1):
224+
if key not in table:
225+
mapvalue = None
226+
mapcomment = 'UNDEFINED'
227+
else:
228+
mapvalue, mapcomment = table[key]
229+
if mapvalue is None:
230+
mapchar = UNI_UNDEFINED
231+
else:
232+
if isinstance(mapvalue, tuple):
233+
# 1-n mappings not supported
234+
return None
142235
else:
143-
return repr(unichr(t))
144-
return repr(''.join(map(chr, t)))
236+
mapchar = unichr(mapvalue)
237+
if mapcomment and comments:
238+
append(' %r\t# %s -> %s' % (mapchar,
239+
hexrepr(key),
240+
mapcomment))
241+
else:
242+
append(' %r' % mapchar)
145243

146-
def codegen(name,map,comments=1):
244+
append(')')
245+
return l
246+
247+
def codegen(name, map, comments=1):
147248

148249
""" Returns Python source for the given map.
149250
150251
Comments are included in the source, if comments is true (default).
151252
152253
"""
254+
# Generate code
255+
decoding_map_code = python_mapdef_code(
256+
'decoding_map',
257+
map,
258+
comments=comments)
259+
decoding_table_code = python_tabledef_code(
260+
'decoding_table',
261+
map,
262+
comments=comments)
263+
encoding_map_code = python_mapdef_code(
264+
'encoding_map',
265+
codecs.make_encoding_map(map),
266+
comments=comments)
267+
153268
l = [
154269
'''\
155270
""" Python Character Mapping Codec generated from '%s' with gencodec.py.
@@ -167,9 +282,16 @@ def encode(self,input,errors='strict'):
167282
return codecs.charmap_encode(input,errors,encoding_map)
168283
169284
def decode(self,input,errors='strict'):
170-
171-
return codecs.charmap_decode(input,errors,decoding_map)
172-
285+
''' % name
286+
]
287+
if decoding_table_code:
288+
l.append('''\
289+
return codecs.charmap_decode(input,errors,decoding_table)''')
290+
else:
291+
l.append('''\
292+
return codecs.charmap_decode(input,errors,decoding_map)''')
293+
294+
l.append('''
173295
class StreamWriter(Codec,codecs.StreamWriter):
174296
pass
175297
@@ -183,54 +305,21 @@ def getregentry():
183305
return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
184306
185307
### Decoding Map
186-
''' % name,
187-
]
308+
''')
309+
l.extend(decoding_map_code)
188310

189-
if map.has_key("IDENTITY"):
190-
l.append("decoding_map = codecs.make_identity_dict(range(%d))"
191-
% map["IDENTITY"])
192-
l.append("decoding_map.update({")
193-
splits = 1
194-
del map["IDENTITY"]
195-
else:
196-
l.append("decoding_map = {")
197-
splits = 0
311+
# Add optional decoding table
312+
if decoding_table_code:
313+
l.append('''
314+
### Decoding Table
315+
''')
316+
l.extend(decoding_table_code)
198317

199-
mappings = map.items()
200-
mappings.sort()
201-
append = l.append
202-
i = 0
203-
for e,value in mappings:
204-
try:
205-
(u,c) = value
206-
except TypeError:
207-
u = value
208-
c = ''
209-
key = keyrepr(e)
210-
if c and comments:
211-
append('\t%s: %s,\t# %s' % (key,unicoderepr(u),c))
212-
else:
213-
append('\t%s: %s,' % (key,unicoderepr(u)))
214-
i += 1
215-
if i == 4096:
216-
# Split the definition into parts so that the Python
217-
# parser doesn't dump core
218-
if splits == 0:
219-
append('}')
220-
else:
221-
append('})')
222-
append('decoding_map.update({')
223-
i = 0
224-
splits = splits + 1
225-
if splits == 0:
226-
append('}')
227-
else:
228-
append('})')
229-
append('''
318+
l.append('''
230319
### Encoding Map
231-
232-
encoding_map = codecs.make_encoding_map(decoding_map)
233320
''')
321+
l.extend(encoding_map_code)
322+
234323
return '\n'.join(l)
235324

236325
def pymap(name,map,pyfile,comments=1):
@@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1):
253342

254343
mapnames = os.listdir(dir)
255344
for mapname in mapnames:
345+
mappathname = os.path.join(dir, mapname)
256346
name = os.path.split(mapname)[1]
257347
name = name.replace('-','_')
258348
name = name.split('.')[0]
@@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1):
267357
if not map:
268358
print '* map is empty; skipping'
269359
else:
270-
pymap(mapname, map, prefix + codefile,comments)
271-
marshalmap(mapname, map, prefix + marshalfile)
272-
except ValueError:
273-
print '* conversion failed'
360+
pymap(mappathname, map, prefix + codefile,comments)
361+
marshalmap(mappathname, map, prefix + marshalfile)
362+
except ValueError, why:
363+
print '* conversion failed: %s' % why
364+
raise
274365

275366
def rewritepythondir(dir,prefix='',comments=1):
276367

0 commit comments

Comments
 (0)