Skip to content

Commit e9133f7

Browse files
author
Fredrik Lundh
committed
unicode database compression, step 3:
- use unidb compression for the unicodectype module. smaller, faster, and slightly more portable...
- also mention the unicode directory in Tools/README
1 parent e53793b commit e9133f7

File tree

2 files changed

+100
-9
lines changed

2 files changed

+100
-9
lines changed

Tools/README

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ scripts A number of useful single-file programs, e.g. tabnanny.py
2121
(by Tim Peters), which checks for inconsistent mixing
2222
of tabs and spaces.
2323

24+
unicode Tools used to generate unicode database files for
25+
Python 2.0 (by Fredrik Lundh).
26+
2427
versioncheck A tool to automate checking whether you have the latest
2528
version of a package (by Jack Jansen).
2629

Tools/unicode/makeunicodedata.py

Lines changed: 97 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
#
2-
# generate a compact version of the unicode property database
2+
# (re)generate unicode property and type databases
3+
#
4+
# this script converts a unicode 3.0 database file to
5+
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
36
#
47
# history:
58
# 2000-09-24 fl created (based on bits and pieces from unidb)
69
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
10+
# 2000-09-25 fl added character type table
711
#
812
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
913
#
@@ -13,7 +17,7 @@
1317
SCRIPT = sys.argv[0]
1418
VERSION = "1.1"
1519

16-
UNICODE_DATA = "../UnicodeData-Latest.txt"
20+
UNICODE_DATA = "UnicodeData-Latest.txt"
1721

1822
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
1923
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@@ -24,7 +28,16 @@
2428
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
2529
"ON" ]
2630

27-
def maketable():
31+
ALPHA_MASK = 0x01
32+
DECIMAL_MASK = 0x02
33+
DIGIT_MASK = 0x04
34+
LOWER_MASK = 0x08
35+
NUMERIC_MASK = 0x10
36+
SPACE_MASK = 0x20
37+
TITLE_MASK = 0x40
38+
UPPER_MASK = 0x80
39+
40+
def maketables():
2841

2942
unicode = UnicodeData(UNICODE_DATA)
3043

@@ -74,7 +87,7 @@ def maketable():
7487
i = 0
7588
decomp_index[char] = i
7689

77-
FILE = "unicodedata_db.h"
90+
FILE = "Modules/unicodedata_db.h"
7891

7992
sys.stdout = open(FILE, "w")
8093

@@ -87,6 +100,9 @@ def maketable():
87100
print "};"
88101
print
89102

103+
# FIXME: the following tables should be made static, and
104+
# the support code moved into unicodedatabase.c
105+
90106
print "/* string literals */"
91107
print "const char *_PyUnicode_CategoryNames[] = {"
92108
for name in CATEGORY_NAMES:
@@ -106,24 +122,96 @@ def maketable():
106122
print " NULL"
107123
print "};"
108124

109-
# split index table
125+
# split record index table
110126
index1, index2, shift = splitbins(index)
111127

112-
print "/* index tables used to find the right database record */"
128+
print "/* index tables for the database records */"
113129
print "#define SHIFT", shift
114130
Array("index1", index1).dump(sys.stdout)
115131
Array("index2", index2).dump(sys.stdout)
116132

117-
# split index table
133+
# split decomposition index table
118134
index1, index2, shift = splitbins(decomp_index)
119135

120-
print "/* same, for the decomposition data */"
136+
print "/* index tables for the decomposition data */"
121137
print "#define DECOMP_SHIFT", shift
122138
Array("decomp_index1", index1).dump(sys.stdout)
123139
Array("decomp_index2", index2).dump(sys.stdout)
124140

125141
sys.stdout = sys.__stdout__
126142

143+
#
144+
# 3) unicode type data
145+
146+
# extract unicode types
147+
dummy = (0, 0, 0, 0)
148+
table = [dummy]
149+
cache = {0: dummy}
150+
index = [0] * len(unicode.chars)
151+
152+
for char in unicode.chars:
153+
record = unicode.table[char]
154+
if record:
155+
# extract database properties
156+
category = record[2]
157+
bidirectional = record[4]
158+
flags = 0
159+
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
160+
flags |= ALPHA_MASK
161+
if category == "Ll":
162+
flags |= LOWER_MASK
163+
if category == "Zs" or bidirectional in ("WS", "B", "S"):
164+
flags |= SPACE_MASK
165+
if category in ["Lt", "Lu"]:
166+
flags |= TITLE_MASK
167+
if category == "Lu":
168+
flags |= UPPER_MASK
169+
# use delta predictor for upper/lower/title
170+
if record[12]:
171+
upper = (int(record[12], 16) - char) & 0xffff
172+
else:
173+
upper = 0
174+
if record[13]:
175+
lower = (int(record[13], 16) - char) & 0xffff
176+
else:
177+
lower = 0
178+
if record[14]:
179+
title = (int(record[14], 16) - char) & 0xffff
180+
else:
181+
title = 0
182+
item = (
183+
flags, upper, lower, title
184+
)
185+
# add entry to index and item tables
186+
i = cache.get(item)
187+
if i is None:
188+
cache[item] = i = len(table)
189+
table.append(item)
190+
index[char] = i
191+
192+
FILE = "Objects/unicodetype_db.h"
193+
194+
sys.stdout = open(FILE, "w")
195+
196+
print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
197+
print
198+
print "/* a list of unique character type descriptors */"
199+
print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
200+
for item in table:
201+
print " {%d, %d, %d, %d}," % item
202+
print "};"
203+
print
204+
205+
# split type index table
206+
index1, index2, shift = splitbins(index)
207+
208+
print "/* type indexes */"
209+
print "#define SHIFT", shift
210+
Array("index1", index1).dump(sys.stdout)
211+
Array("index2", index2).dump(sys.stdout)
212+
213+
sys.stdout = sys.__stdout__
214+
127215
# --------------------------------------------------------------------
128216
# the following support code is taken from the unidb utilities
129217
# Copyright (c) 1999-2000 by Secret Labs AB
@@ -259,4 +347,4 @@ def dump(t1, t2, shift, bytes):
259347
return best
260348

261349
if __name__ == "__main__":
262-
maketable()
350+
maketables()

0 commit comments

Comments
 (0)