5454# these are the surrogate codepoints, which are not valid rust characters
5555surrogate_codepoints = (0xd800 , 0xdfff )
5656
57- UNICODE_VERSION = (10 , 0 , 0 )
57+ UNICODE_VERSION = (11 , 0 , 0 )
5858
5959UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
6060
@@ -63,8 +63,12 @@ def is_surrogate(n):
6363
6464def fetch (f ):
6565 if not os .path .exists (os .path .basename (f )):
66- os .system ("curl -O http://www.unicode.org/Public/%s/ucd/%s"
67- % (UNICODE_VERSION_NUMBER , f ))
66+ if "emoji" in f :
67+ os .system ("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
68+ % (UNICODE_VERSION [0 ], UNICODE_VERSION [1 ], f ))
69+ else :
70+ os .system ("curl -O http://www.unicode.org/Public/%s/ucd/%s"
71+ % (UNICODE_VERSION_NUMBER , f ))
6872
6973 if not os .path .exists (os .path .basename (f )):
7074 sys .stderr .write ("cannot load %s" % f )
@@ -266,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
266270 pub use self::%sCat::*;
267271
268272 #[allow(non_camel_case_types)]
269- #[derive(Clone, Copy, PartialEq, Eq)]
273+ #[derive(Clone, Copy, PartialEq, Eq, Debug )]
270274 pub enum %sCat {
271275""" % (name , Name , Name ))
272276
@@ -340,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
340344 grapheme_table = []
341345 for cat in grapheme_cats :
342346 grapheme_table .extend ([(x , y , cat ) for (x , y ) in grapheme_cats [cat ]])
347+ emoji_props = load_properties ("emoji-data.txt" , ["Extended_Pictographic" ])
348+ grapheme_table .extend ([(x , y , "Extended_Pictographic" ) for (x , y ) in emoji_props ["Extended_Pictographic" ]])
343349 grapheme_table .sort (key = lambda w : w [0 ])
344- emit_break_module (rf , grapheme_table , list (grapheme_cats .keys ()), "grapheme" )
# Sanity check: every range in grapheme_table must be disjoint from the others.
# The table holds (first, last, category) tuples and has just been sorted by
# `first`, so it is enough to compare each range's start against the end of
# the range before it. The comparison treats `last` as an inclusive end.
last = -1
for chars in grapheme_table:
    if chars[0] <= last:
        # Raising a plain string is a TypeError ("exceptions must derive from
        # BaseException"), which would hide this message; raise a real
        # exception so the diagnostic actually reaches the user.
        raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
    last = chars[1]
355+ emit_break_module (rf , grapheme_table , list (grapheme_cats .keys ()) + ["Extended_Pictographic" ], "grapheme" )
345356 rf .write ("\n " )
346357
347358 word_cats = load_properties ("auxiliary/WordBreakProperty.txt" , [])
@@ -351,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
351362 word_table .sort (key = lambda w : w [0 ])
352363 emit_break_module (rf , word_table , list (word_cats .keys ()), "word" )
353364
365+ # There are some emoji which are also ALetter, so this needs to be stored separately
366+ # For efficiency, we could still merge the two tables and produce an ALetterEP state
367+ emoji_table = [(x , y , "Extended_Pictographic" ) for (x , y ) in emoji_props ["Extended_Pictographic" ]]
368+ emit_break_module (rf , emoji_table , ["Extended_Pictographic" ], "emoji" )
369+
354370 sentence_cats = load_properties ("auxiliary/SentenceBreakProperty.txt" , [])
355371 sentence_table = []
356372 for cat in sentence_cats :
0 commit comments