11#
2- # generate a compact version of the unicode property database
2+ # (re)generate unicode property and type databases
3+ #
4+ # this script converts a unicode 3.0 database file to
5+ # Modules/unicodedata_db.h and Objects/unicodetype_db.h
36#
47# history:
58# 2000-09-24 fl created (based on bits and pieces from unidb)
69# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
10+ # 2000-09-25 fl added character type table
711#
812# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
913#
1317SCRIPT = sys .argv [0 ]
1418VERSION = "1.1"
1519
16- UNICODE_DATA = "../ UnicodeData-Latest.txt"
20+ UNICODE_DATA = "UnicodeData-Latest.txt"
1721
1822CATEGORY_NAMES = [ "Cn" , "Lu" , "Ll" , "Lt" , "Mn" , "Mc" , "Me" , "Nd" ,
1923 "Nl" , "No" , "Zs" , "Zl" , "Zp" , "Cc" , "Cf" , "Cs" , "Co" , "Cn" , "Lm" ,
2428 "PDF" , "EN" , "ES" , "ET" , "AN" , "CS" , "NSM" , "BN" , "B" , "S" , "WS" ,
2529 "ON" ]
2630
27- def maketable ():
31+ ALPHA_MASK = 0x01
32+ DECIMAL_MASK = 0x02
33+ DIGIT_MASK = 0x04
34+ LOWER_MASK = 0x08
35+ NUMERIC_MASK = 0x10
36+ SPACE_MASK = 0x20
37+ TITLE_MASK = 0x40
38+ UPPER_MASK = 0x80
39+
40+ def maketables ():
2841
2942 unicode = UnicodeData (UNICODE_DATA )
3043
@@ -74,7 +87,7 @@ def maketable():
7487 i = 0
7588 decomp_index [char ] = i
7689
77- FILE = "unicodedata_db.h"
90+ FILE = "Modules/ unicodedata_db.h"
7891
7992 sys .stdout = open (FILE , "w" )
8093
@@ -87,6 +100,9 @@ def maketable():
87100 print "};"
88101 print
89102
103+ # FIXME: the following tables should be made static, and
104+ # the support code moved into unicodedatabase.c
105+
90106 print "/* string literals */"
91107 print "const char *_PyUnicode_CategoryNames[] = {"
92108 for name in CATEGORY_NAMES :
@@ -106,24 +122,96 @@ def maketable():
106122 print " NULL"
107123 print "};"
108124
109- # split index table
125+ # split record index table
110126 index1 , index2 , shift = splitbins (index )
111127
112- print "/* index tables used to find the right database record */"
128+ print "/* index tables for the database records */"
113129 print "#define SHIFT" , shift
114130 Array ("index1" , index1 ).dump (sys .stdout )
115131 Array ("index2" , index2 ).dump (sys .stdout )
116132
117- # split index table
133+ # split decomposition index table
118134 index1 , index2 , shift = splitbins (decomp_index )
119135
120- print "/* same, for the decomposition data */"
136+ print "/* index tables for the decomposition data */"
121137 print "#define DECOMP_SHIFT" , shift
122138 Array ("decomp_index1" , index1 ).dump (sys .stdout )
123139 Array ("decomp_index2" , index2 ).dump (sys .stdout )
124140
125141 sys .stdout = sys .__stdout__
126142
143+ #
144+ # 3) unicode type data
145+
146+ # extract unicode types
147+ dummy = (0 , 0 , 0 , 0 )
148+ table = [dummy ]
149+ cache = {0 : dummy }
150+ index = [0 ] * len (unicode .chars )
151+
152+ for char in unicode .chars :
153+ record = unicode .table [char ]
154+ if record :
155+ # extract database properties
156+ category = record [2 ]
157+ bidirectional = record [4 ]
158+ flags = 0
159+ if category in ["Lm" , "Lt" , "Lu" , "Ll" , "Lo" ]:
160+ flags |= ALPHA_MASK
161+ if category == "Ll" :
162+ flags |= LOWER_MASK
163+ if category == "Zs" or bidirectional in ("WS" , "B" , "S" ):
164+ flags |= SPACE_MASK
165+ if category in ["Lt" , "Lu" ]:
166+ flags |= TITLE_MASK
167+ if category == "Lu" :
168+ flags |= UPPER_MASK
169+ # use delta predictor for upper/lower/title: store each case mapping as its offset from char, masked to 16 bits (0 if there is no mapping)
170+ if record [12 ]:
171+ upper = (int (record [12 ], 16 ) - char ) & 0xffff
172+ else :
173+ upper = 0
174+ if record [13 ]:
175+ lower = (int (record [13 ], 16 ) - char ) & 0xffff
176+ else :
177+ lower = 0
178+ if record [14 ]:
179+ title = (int (record [14 ], 16 ) - char ) & 0xffff
180+ else :
181+ title = 0
182+ item = (
183+ flags , upper , lower , title
184+ )
185+ # add entry to index and item tables
186+ i = cache .get (item )
187+ if i is None :
188+ cache [item ] = i = len (table )
189+ table .append (item )
190+ index [char ] = i
191+
192+ FILE = "Objects/unicodetype_db.h"
193+
194+ sys .stdout = open (FILE , "w" )
195+
196+ print "/* this file was generated by %s %s */" % (SCRIPT , VERSION )
197+ print
198+ print "/* a list of unique character type descriptors */"
199+ print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
200+ for item in table :
201+ print " {%d, %d, %d, %d}," % item
202+ print "};"
203+ print
204+
205+ # split type index table
206+ index1 , index2 , shift = splitbins (index )
207+
208+ print "/* type indexes */"
209+ print "#define SHIFT" , shift
210+ Array ("index1" , index1 ).dump (sys .stdout )
211+ Array ("index2" , index2 ).dump (sys .stdout )
212+
213+ sys .stdout = sys .__stdout__
214+
127215# --------------------------------------------------------------------
128216# the following support code is taken from the unidb utilities
129217# Copyright (c) 1999-2000 by Secret Labs AB
@@ -259,4 +347,4 @@ def dump(t1, t2, shift, bytes):
259347 return best
260348
261349if __name__ == "__main__" :
262- maketable ()
350+ maketables ()
0 commit comments