@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
155155 line = " " * indent + chunk
156156 f .write (line )
157157
158- def load_properties (f , interestingprops ):
158+ def load_properties (f , interestingprops : "list[str | tuple[str, str]] | None" = None ):
159159 fetch (f )
160160 props = {}
161- re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+)" )
162- re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)" )
161+ re1 = re .compile (r"^\s *([0-9A-F]+)\s*;\s *(\w+)(?:\s*;\s*(\w+))? " )
162+ re2 = re .compile (r"^\s *([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s *(\w+)(?:\s*;\s*(\w+))? " )
163163
164164 for line in fileinput .input (os .path .basename (f )):
165165 prop = None
@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
168168 m = re1 .match (line )
169169 if m :
170170 d_lo = m .group (1 )
171- d_hi = m . group ( 1 )
171+ d_hi = d_lo
172172 prop = m .group (2 )
173+ value = m .group (3 )
173174 else :
174175 m = re2 .match (line )
175176 if m :
176177 d_lo = m .group (1 )
177178 d_hi = m .group (2 )
178179 prop = m .group (3 )
180+ value = m .group (4 )
179181 else :
180182 continue
181- if interestingprops and prop not in interestingprops :
183+ if value is not None :
184+ prop = (prop , value )
185+ if interestingprops is not None and prop not in interestingprops :
182186 continue
183187 d_lo = int (d_lo , 16 )
184188 d_hi = int (d_hi , 16 )
@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
195199def escape_char (c ):
196200 return "'\\ u{%x}'" % c
197201
198- def emit_table (f , name , t_data , t_type = "&'static [(char, char)]" , is_pub = True ,
202+ def emit_table (f , name , t_data , t_type = "&[(char, char)]" , is_pub = True ,
199203 pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])), is_const = True ):
200204 pub_string = "const"
201205 if not is_const :
@@ -217,7 +221,7 @@ def emit_util_mod(f):
217221 f .write ("""
218222pub mod util {
219223 #[inline]
220- pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
224+ pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
221225 use core::cmp::Ordering::{Equal, Less, Greater};
222226 r.binary_search_by(|&(lo,hi)| {
223227 if lo <= c && c <= hi { Equal }
@@ -252,13 +256,22 @@ def emit_util_mod(f):
252256
253257""" )
254258
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
    """Write a public Rust module holding one lookup function per property.

    For every key in ``emit`` a private ``<name>_table`` is written via
    ``emit_table``, followed by ``pub fn <name>(c: char) -> bool`` that
    searches that table with ``super::util::bsearch_range_table``.

    Args:
        f: Writable text stream receiving the generated Rust source.
        mod: Name of the Rust module to emit.
        tbl: Mapping from property key to the table data handed to
            ``emit_table``.
        emit: Property keys to emit. A key is either a plain name or a
            ``(property, value)`` tuple; a tuple is rendered as
            ``property_value`` in the generated identifiers.
    """
    f.write("pub mod %s {\n" % mod)

    # Pair every key with its Rust identifier and order by identifier so the
    # generated module does not depend on the caller's ordering.
    cats = sorted(
        (
            (f"{cat[0]}_{cat[1]}" if isinstance(cat, tuple) else cat, cat)
            for cat in emit
        ),
        key=lambda pair: pair[0],
    )

    for cat_str, cat in cats:
        emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
        f.write("    #[inline]\n")
        f.write("    pub fn %s(c: char) -> bool {\n" % cat_str)
        f.write("        super::util::bsearch_range_table(c, %s_table)\n" % cat_str)
        f.write("    }\n\n")
    f.write("}\n\n")
264277
@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
303316 f .write ((" %sC_" % Name [0 ]) + cat + ",\n " )
304317 f .write (""" }
305318
306- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
319+ fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
307320 use core::cmp::Ordering::{Equal, Less, Greater};
308321 match r.binary_search_by(|&(lo, hi, _)| {
309322 if lo <= c && c <= hi { Equal }
@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
355368 else :
356369 lookup_type = "u32"
357370
358- emit_table (f , "%s_cat_lookup" % name , lookup_table , "&'static [%s]" % lookup_type ,
371+ emit_table (f , "%s_cat_lookup" % name , lookup_table , "&[%s]" % lookup_type ,
359372 pfun = lambda x : "%d" % x ,
360373 is_pub = False , is_const = True )
361374
362- emit_table (f , "%s_cat_table" % name , break_table , "&'static [(char, char, %sCat)]" % Name ,
375+ emit_table (f , "%s_cat_table" % name , break_table , "&[(char, char, %sCat)]" % Name ,
363376 pfun = lambda x : "(%s,%s,%sC_%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), Name [0 ], x [2 ]),
364377 is_pub = False , is_const = True )
365378 f .write ("}\n " )
@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):
379392
380393 # download and parse all the data
381394 gencats = load_gencats ("UnicodeData.txt" )
382- derived = load_properties ("DerivedCoreProperties.txt" , ["Alphabetic" ])
395+ derived = load_properties ("DerivedCoreProperties.txt" , ["Alphabetic" , ( "InCB" , "Consonant" ), ( "InCB" , "Extend" ), ( "InCB" , "Linker" ) ])
383396
384397 emit_util_mod (rf )
385398 for (name , cat , pfuns ) in ("general_category" , gencats , ["N" ]), \
386- ("derived_property" , derived , ["Alphabetic" ]):
399+ ("derived_property" , derived , ["Alphabetic" , ( "InCB" , "Extend" ) ]):
387400 emit_property_module (rf , name , cat , pfuns )
388401
# Emit `is_incb_linker`, a Rust predicate written as a single `matches!` over
# every InCB=Linker entry; a single code point becomes a plain char pattern.
rf.write("""pub fn is_incb_linker(c: char) -> bool {
    matches!(c,""")

for (lo, hi) in derived[("InCB", "Linker")]:
    rf.write(f" | '\\u{{{lo:X}}}'")
    if lo != hi:
        # Close the range on `hi` with `..=` so the upper bound is included;
        # repeating `lo` here would produce an empty range pattern.
        rf.write(f"..='\\u{{{hi:X}}}'")

rf.write(")\n}\n\n")
411+
389412 ### grapheme cluster module
390413 # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
391- grapheme_cats = load_properties ("auxiliary/GraphemeBreakProperty.txt" , [])
392-
414+ grapheme_cats = load_properties ("auxiliary/GraphemeBreakProperty.txt" )
393415 # Control
394416 # Note:
395417 # This category also includes Cs (surrogate codepoints), but Rust's `char`s are
@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
398420 grapheme_cats ["Control" ] = group_cat (list (
399421 set (ungroup_cat (grapheme_cats ["Control" ]))
400422 - set (ungroup_cat ([surrogate_codepoints ]))))
401-
423+ grapheme_cats ["InCB_Consonant" ] = derived [("InCB" , "Consonant" )]
424+ emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
425+ grapheme_cats ["Extended_Pictographic" ] = emoji_props ["Extended_Pictographic" ]
402426 grapheme_table = []
403427 for cat in grapheme_cats :
404428 grapheme_table .extend ([(x , y , cat ) for (x , y ) in grapheme_cats [cat ]])
405- emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
406- grapheme_table .extend ([(x , y , "Extended_Pictographic" ) for (x , y ) in emoji_props ["Extended_Pictographic" ]])
407429 grapheme_table .sort (key = lambda w : w [0 ])
# The table was just sorted by lower bound, so a range that starts at or
# before the previous range's upper bound overlaps it.
last = -1
for chars in grapheme_table:
    if chars[0] <= last:
        # Raising a bare string is itself a TypeError in Python 3 and would
        # hide this message, so raise a real exception instead.
        raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
    last = chars[1]
413- emit_break_module (rf , grapheme_table , list (grapheme_cats .keys ()) + [ "Extended_Pictographic" ] , "grapheme" )
435+ emit_break_module (rf , grapheme_table , list (grapheme_cats .keys ()), "grapheme" )
414436 rf .write ("\n " )
415437
416- word_cats = load_properties ("auxiliary/WordBreakProperty.txt" , [] )
438+ word_cats = load_properties ("auxiliary/WordBreakProperty.txt" )
417439 word_table = []
418440 for cat in word_cats :
419441 word_table .extend ([(x , y , cat ) for (x , y ) in word_cats [cat ]])
@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
425447 emoji_table = [(x , y , "Extended_Pictographic" ) for (x , y ) in emoji_props ["Extended_Pictographic" ]]
426448 emit_break_module (rf , emoji_table , ["Extended_Pictographic" ], "emoji" )
427449
428- sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" , [] )
450+ sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" )
429451 sentence_table = []
430452 for cat in sentence_cats :
431453 sentence_table .extend ([(x , y , cat ) for (x , y ) in sentence_cats [cat ]])
0 commit comments