1515The tool also writes marshalled versions of the mapping tables to the
1616same location (with .mapping extension).
1717
18- Written by Marc-Andre Lemburg (mal@lemburg.com).
18+ Written by Marc-Andre Lemburg (mal@lemburg.com). Modified to generate
19+ Unicode table maps for decoding.
1920
2021(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
2122(c) Copyright Guido van Rossum, 2000.
23+ (c) Copyright Marc-Andre Lemburg, 2005.
2224
2325""" #"
2426
25- import re ,os ,time ,marshal
27+ import re , os , time , marshal , codecs
2628
27- # Create numeric tables or character based ones ?
28- numeric = 1
29+ # Maximum allowed size of charmap tables
30+ MAX_TABLE_SIZE = 8192
31+
32+ # Standard undefined Unicode code point
33+ UNI_UNDEFINED = unichr (0xFFFE )
2934
3035mapRE = re .compile ('((?:0x[0-9a-fA-F]+\+?)+)'
3136 '\s+'
@@ -69,8 +74,15 @@ def readmap(filename):
6974 enc2uni = {}
7075 identity = []
7176 unmapped = range (256 )
72- for i in range (256 ):
73- unmapped [i ] = i
77+
78+ # UTC mapping tables per convention don't include the identity
79+ # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
80+ # explicitly mapped to different characters or undefined
81+ for i in range (32 ) + [127 ]:
82+ identity .append (i )
83+ unmapped .remove (i )
84+ enc2uni [i ] = (i , 'CONTROL CHARACTER' )
85+
7486 for line in lines :
7587 line = line .strip ()
7688 if not line or line [0 ] == '#' :
@@ -82,22 +94,23 @@ def readmap(filename):
8294 enc ,uni ,comment = m .groups ()
8395 enc = parsecodes (enc )
8496 uni = parsecodes (uni )
85- if not comment :
97+ if comment is None :
8698 comment = ''
8799 else :
88- comment = comment [1 :]
100+ comment = comment [1 :]. strip ()
89101 if enc < 256 :
90- unmapped .remove (enc )
102+ if enc in unmapped :
103+ unmapped .remove (enc )
91104 if enc == uni :
92105 identity .append (enc )
93- else :
94- enc2uni [enc ] = (uni ,comment )
106+ enc2uni [enc ] = (uni ,comment )
95107 else :
96108 enc2uni [enc ] = (uni ,comment )
109+
97110 # If there are more identity-mapped entries than unmapped entries,
98111 # it pays to generate an identity dictionary first, and add explicit
99112 # mappings to None for the rest
100- if len (identity )>= len (unmapped ):
113+ if len (identity ) >= len (unmapped ):
101114 for enc in unmapped :
102115 enc2uni [enc ] = (None , "" )
103116 enc2uni ['IDENTITY' ] = 256
@@ -112,44 +125,146 @@ def hexrepr(t):
112125 len (t )
113126 except :
114127 return '0x%04x' % t
115- return '(' + ', ' .join (map (lambda t : '0x%04x' % t , t )) + ')'
128+ try :
129+ return '(' + ', ' .join (map (lambda t : '0x%04x' % t , t )) + ')'
130+ except TypeError , why :
131+ print '* failed to convert %r: %s' % (t , why )
132+ raise
116133
117- def unicoderepr ( t ):
def python_mapdef_code(varname, map, comments=1):

    """ Returns a list of Python source lines which define the
        dictionary varname from the given map.

        Keys and values of map may be plain code points or
        (codepoint, comment) tuples; the comment is emitted as trailing
        remark if comments is true (default).

        If map contains an 'IDENTITY' entry, the generated code starts
        out from codecs.make_identity_dict(range(n)) and entries below
        256 which map to themselves are left out.  Note that the
        'IDENTITY' entry is removed from map as a side effect.

    """
    lines = []
    emit = lines.append

    # closer is the token which terminates the currently open dict
    # literal: a bare '}' for the initial assignment, '})' once we are
    # inside an .update({...}) call.
    has_identity = "IDENTITY" in map
    if has_identity:
        emit("%s = codecs.make_identity_dict(range(%d))" %
             (varname, map["IDENTITY"]))
        emit("%s.update({" % varname)
        del map["IDENTITY"]
        closer = '})'
    else:
        emit("%s = {" % varname)
        closer = '}'

    emitted = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if has_identity and mapkey == mapvalue and mapkey < 256:
            # Already covered by the identity dictionary created for
            # the first 256 code points.
            continue

        key = hexrepr(mapkey)
        value = hexrepr(mapvalue)
        if mapcomment and comments:
            emit(' %s: %s,\t # %s' % (key, value, mapcomment))
        else:
            emit(' %s: %s,' % (key, value))

        emitted += 1
        if emitted == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core on very large literals
            emit(closer)
            emit('%s.update({' % varname)
            closer = '})'
            emitted = 0

    emit(closer)
    return lines
190+
def python_tabledef_code(varname, map, comments=1):

    """ Returns a list of Python source lines which define the
        decoding table varname from the given map, or None if no
        table can be generated.

        The table is emitted as a parenthesized sequence of adjacent
        Unicode character literals, one per code point from 0 up to
        the largest key found in map.  Code points without a mapping
        (or mapped to None) are represented by UNI_UNDEFINED.

        None is returned if the largest key exceeds MAX_TABLE_SIZE or
        if map contains a 1-n mapping (a tuple as mapped value).

        Keys and values of map may be plain code points or
        (codepoint, comment) tuples; comments are added to the output
        if comments is true (default).  An 'IDENTITY' entry pre-fills
        the first 256 code points with identity mappings; it is
        removed from map as a side effect.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']

    # Take the snapshot of the mappings only *after* the 'IDENTITY'
    # marker has been removed: otherwise the marker would be processed
    # like a regular mapping, end up as (string) maxkey and cause the
    # size check below to reject every identity based map.
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        elif isinstance(mapvalue, tuple):
            # 1-n mappings not supported
            return None
        else:
            mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append(' %r\t # %s -> %s' % (mapchar,
                                        hexrepr(key),
                                        mapcomment))
        else:
            append(' %r' % mapchar)

    append(')')
    return l
246+
247+ def codegen (name , map , comments = 1 ):
147248
148249 """ Returns Python source for the given map.
149250
150251 Comments are included in the source, if comments is true (default).
151252
152253 """
254+ # Generate code
255+ decoding_map_code = python_mapdef_code (
256+ 'decoding_map' ,
257+ map ,
258+ comments = comments )
259+ decoding_table_code = python_tabledef_code (
260+ 'decoding_table' ,
261+ map ,
262+ comments = comments )
263+ encoding_map_code = python_mapdef_code (
264+ 'encoding_map' ,
265+ codecs .make_encoding_map (map ),
266+ comments = comments )
267+
153268 l = [
154269 '''\
155270 """ Python Character Mapping Codec generated from '%s' with gencodec.py.
@@ -167,9 +282,16 @@ def encode(self,input,errors='strict'):
167282 return codecs.charmap_encode(input,errors,encoding_map)
168283
169284 def decode(self,input,errors='strict'):
170-
171- return codecs.charmap_decode(input,errors,decoding_map)
172-
285+ ''' % name
286+ ]
287+ if decoding_table_code :
288+ l .append ('''\
289+ return codecs.charmap_decode(input,errors,decoding_table)''' )
290+ else :
291+ l .append ('''\
292+ return codecs.charmap_decode(input,errors,decoding_map)''' )
293+
294+ l .append ('''
173295class StreamWriter(Codec,codecs.StreamWriter):
174296 pass
175297
@@ -183,54 +305,21 @@ def getregentry():
183305 return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
184306
185307### Decoding Map
186- ''' % name ,
187- ]
308+ ''' )
309+ l . extend ( decoding_map_code )
188310
189- if map .has_key ("IDENTITY" ):
190- l .append ("decoding_map = codecs.make_identity_dict(range(%d))"
191- % map ["IDENTITY" ])
192- l .append ("decoding_map.update({" )
193- splits = 1
194- del map ["IDENTITY" ]
195- else :
196- l .append ("decoding_map = {" )
197- splits = 0
311+ # Add optional decoding table
312+ if decoding_table_code :
313+ l .append ('''
314+ ### Decoding Table
315+ ''' )
316+ l .extend (decoding_table_code )
198317
199- mappings = map .items ()
200- mappings .sort ()
201- append = l .append
202- i = 0
203- for e ,value in mappings :
204- try :
205- (u ,c ) = value
206- except TypeError :
207- u = value
208- c = ''
209- key = keyrepr (e )
210- if c and comments :
211- append ('\t %s: %s,\t # %s' % (key ,unicoderepr (u ),c ))
212- else :
213- append ('\t %s: %s,' % (key ,unicoderepr (u )))
214- i += 1
215- if i == 4096 :
216- # Split the definition into parts to that the Python
217- # parser doesn't dump core
218- if splits == 0 :
219- append ('}' )
220- else :
221- append ('})' )
222- append ('decoding_map.update({' )
223- i = 0
224- splits = splits + 1
225- if splits == 0 :
226- append ('}' )
227- else :
228- append ('})' )
229- append ('''
318+ l .append ('''
230319### Encoding Map
231-
232- encoding_map = codecs.make_encoding_map(decoding_map)
233320''' )
321+ l .extend (encoding_map_code )
322+
234323 return '\n ' .join (l )
235324
236325def pymap (name ,map ,pyfile ,comments = 1 ):
@@ -253,6 +342,7 @@ def convertdir(dir,prefix='',comments=1):
253342
254343 mapnames = os .listdir (dir )
255344 for mapname in mapnames :
345+ mappathname = os .path .join (dir , mapname )
256346 name = os .path .split (mapname )[1 ]
257347 name = name .replace ('-' ,'_' )
258348 name = name .split ('.' )[0 ]
@@ -267,10 +357,11 @@ def convertdir(dir,prefix='',comments=1):
267357 if not map :
268358 print '* map is empty; skipping'
269359 else :
270- pymap (mapname , map , prefix + codefile ,comments )
271- marshalmap (mapname , map , prefix + marshalfile )
272- except ValueError :
273- print '* conversion failed'
360+ pymap (mappathname , map , prefix + codefile ,comments )
361+ marshalmap (mappathname , map , prefix + marshalfile )
362+ except ValueError , why :
363+ print '* conversion failed: %s' % why
364+ raise
274365
275366def rewritepythondir (dir ,prefix = '' ,comments = 1 ):
276367
0 commit comments