unicode database compression, step 3:

Fredrik Lundh · Fredrik Lundh · commit e9133f7e2efb · 2000-09-25T17:59:57.000Z
- use unidb compression for the unicodectype module.  smaller,
  faster, and slightly more portable...

- also mention the unicode directory in Tools/README
diff --git a/Tools/README b/Tools/README
@@ -21,6 +21,9 @@ scripts		A number of useful single-file programs, e.g. tabnanny.py
 		(by Tim Peters), which checks for inconsistent mixing
 		of tabs and spaces.
 
+unicode		Tools used to generate unicode database files for
+		Python 2.0 (by Fredrik Lundh).
+
 versioncheck	A tool to automate checking whether you have the latest
 		version of a package (by Jack Jansen).
 
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
@@ -1,9 +1,13 @@
 #
-# generate a compact version of the unicode property database
+# (re)generate unicode property and type databases
+#
+# this script converts a unicode 3.0 database file to
+# Modules/unicodedata_db.h and Objects/unicodetype_db.h
 #
 # history:
 # 2000-09-24 fl   created (based on bits and pieces from unidb)
 # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
+# 2000-09-25 fl   added character type table
 #
 # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
 #
@@ -13,7 +17,7 @@
 SCRIPT = sys.argv[0]
 VERSION = "1.1"
 
-UNICODE_DATA = "../UnicodeData-Latest.txt"
+UNICODE_DATA = "UnicodeData-Latest.txt"
 
 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
     "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@@ -24,7 +28,16 @@
     "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
     "ON" ]
 
-def maketable():
+ALPHA_MASK = 0x01
+DECIMAL_MASK = 0x02
+DIGIT_MASK = 0x04
+LOWER_MASK = 0x08
+NUMERIC_MASK = 0x10
+SPACE_MASK = 0x20
+TITLE_MASK = 0x40
+UPPER_MASK = 0x80
+
+def maketables():
 
     unicode = UnicodeData(UNICODE_DATA)
 
@@ -74,7 +87,7 @@ def maketable():
                 i = 0
             decomp_index[char] = i
 
-    FILE = "unicodedata_db.h"
+    FILE = "Modules/unicodedata_db.h"
 
     sys.stdout = open(FILE, "w")
 
@@ -87,6 +100,9 @@ def maketable():
     print "};"
     print
 
+    # FIXME: the following tables should be made static, and
+    # the support code moved into unicodedatabase.c
+
     print "/* string literals */"
     print "const char *_PyUnicode_CategoryNames[] = {"
     for name in CATEGORY_NAMES:
@@ -106,24 +122,96 @@ def maketable():
     print "    NULL"
     print "};"
 
-    # split index table
+    # split record index table
     index1, index2, shift = splitbins(index)
 
-    print "/* index tables used to find the right database record */"
+    print "/* index tables for the database records */"
     print "#define SHIFT", shift
     Array("index1", index1).dump(sys.stdout)
     Array("index2", index2).dump(sys.stdout)
 
-    # split index table
+    # split decomposition index table
     index1, index2, shift = splitbins(decomp_index)
 
-    print "/* same, for the decomposition data */"
+    print "/* index tables for the decomposition data */"
     print "#define DECOMP_SHIFT", shift
     Array("decomp_index1", index1).dump(sys.stdout)
     Array("decomp_index2", index2).dump(sys.stdout)
 
     sys.stdout = sys.__stdout__
 
+    #
+    # 3) unicode type data
+
+    # extract unicode types
+    dummy = (0, 0, 0, 0)
+    table = [dummy]
+    cache = {0: dummy}
+    index = [0] * len(unicode.chars)
+
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            # extract database properties
+            category = record[2]
+            bidirectional = record[4]
+            flags = 0
+            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
+                flags |= ALPHA_MASK
+            if category == "Ll":
+                flags |= LOWER_MASK
+            if category == "Zs" or bidirectional in ("WS", "B", "S"):
+                flags |= SPACE_MASK
+            if category in ["Lt", "Lu"]:
+                flags |= TITLE_MASK
+            if category == "Lu":
+                flags |= UPPER_MASK
+            # use delta predictor for upper/lower/title
+            if record[12]:
+                upper = (int(record[12], 16) - char) & 0xffff
+            else:
+                upper = 0
+            if record[13]:
+                lower = (int(record[13], 16) - char) & 0xffff
+            else:
+                lower = 0
+            if record[14]:
+                title = (int(record[14], 16) - char) & 0xffff
+            else:
+                title = 0
+            item = (
+                flags, upper, lower, title
+                )
+            # add entry to index and item tables
+            i = cache.get(item)
+            if i is None:
+                cache[item] = i = len(table)
+                table.append(item)
+            index[char] = i
+
+    FILE = "Objects/unicodetype_db.h"
+
+    sys.stdout = open(FILE, "w")
+
+    print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+    print
+    print "/* a list of unique character type descriptors */"
+    print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
+    for item in table:
+        print "    {%d, %d, %d, %d}," % item
+    print "};"
+    print
+
+    # split type index table
+    index1, index2, shift = splitbins(index)
+
+    print "/* type indexes */"
+    print "#define SHIFT", shift
+    Array("index1", index1).dump(sys.stdout)
+    Array("index2", index2).dump(sys.stdout)
+
+    sys.stdout = sys.__stdout__
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@@ -259,4 +347,4 @@ def dump(t1, t2, shift, bytes):
     return best
 
 if __name__ == "__main__":
-    maketable()
+    maketables()