Skip to content

Commit 2a1804e

Browse files
committed
Generate new tables for isalpha/toupper/tolower from UnicodeData.txt
1 parent 1b8aae1 commit 2a1804e

6 files changed

Lines changed: 2158 additions & 1150 deletions

File tree

genucd.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# Create utfdata.h from UnicodeData.txt
2+
3+
# Tables filled from UnicodeData.txt and consumed by dumpalpha()/dumpmap():
#   isalpha - code points whose general category (field 2) starts with 'L'
#   toupper - (code, mapped) pairs taken from field 12
#   tolower - (code, mapped) pairs taken from field 13
tolower = []
toupper = []
isalpha = []

# UnicodeData.txt abbreviates some large blocks as a pair of lines whose
# names end in ", First>" and ", Last>"; every code point between the two
# shares their properties. Remember the code of a pending "First" line so the
# matching "Last" line can fill in the interior of the range.
range_first = None

# 'with' closes the file even if a malformed line raises.
with open("UnicodeData.txt") as ucd:
    for line in ucd:
        line = line.split(";")
        code = int(line[0], 16)
        # if code > 65535: continue # skip non-BMP codepoints
        char_name = line[1]
        if range_first is not None and char_name.endswith(", Last>"):
            # The "First" line already contributed range_first itself.
            codes = range(range_first + 1, code + 1)
        else:
            codes = (code,)
        range_first = code if char_name.endswith(", First>") else None
        if line[2][0] == 'L':
            isalpha.extend(codes)
        if line[12]:
            toupper.append((code, int(line[12], 16)))
        if line[13]:
            tolower.append((code, int(line[13], 16)))
17+
18+
def dumpalpha():
    """Print the C tables ucd_alpha2 and ucd_alpha1 to stdout.

    Reads the module-level list ``isalpha`` and merges consecutive code
    points (each exactly one greater than the previous) into inclusive
    (first, last) runs. Runs covering more than one code point are written
    to ucd_alpha2 as "first,last," lines; single code points are written to
    ucd_alpha1 as "code," lines. An empty ``isalpha`` yields two empty
    tables.
    """
    runs = []
    # None (not 0) marks "no run open yet": 0 is itself a valid code point,
    # and an empty input must not produce a fabricated (0, 0) entry.
    first = None
    last = None
    for code in isalpha:
        if first is None:
            first = code
        elif code != last + 1:
            runs.append((first, last))
            first = code
        last = code
    if first is not None:
        runs.append((first, last))

    print("")
    print("static const Rune ucd_alpha2[] = {")
    for a, b in runs:
        if b - a > 0:
            print(hex(a) + "," + hex(b) + ",")
    print("};")

    print("")
    print("static const Rune ucd_alpha1[] = {")
    for a, b in runs:
        if b - a == 0:
            print(hex(a) + ",")
    print("};")
43+
44+
def dumpmap(name, input):
    """Print the C tables <name>2 and <name>1 for a code point mapping.

    ``input`` is an iterable of (code, mapped) pairs. Adjacent pairs in which
    both the code and the mapped value are exactly one greater than in the
    previous pair are merged into a run. Runs covering more than one code
    are written to <name>2 as "first,last,delta," lines; single-code runs
    are written to <name>1 as "code,delta," lines, where delta is the mapped
    value of the run's first pair minus its code. An empty ``input`` yields
    two empty tables.
    """
    runs = []
    # None (not 0) marks "no run open yet": 0 is itself a valid code, and an
    # empty input must not produce a fabricated (0, 0, 0) entry.
    first_a = first_b = None
    last_a = last_b = None
    for a, b in input:
        if first_a is None:
            first_a, first_b = a, b
        elif a != last_a + 1 or b != last_b + 1:
            runs.append((first_a, last_a, first_b))
            first_a, first_b = a, b
        last_a, last_b = a, b
    if first_a is not None:
        runs.append((first_a, last_a, first_b))

    print("")
    print("static const Rune " + name + "2[] = {")
    for a, b, n in runs:
        if b - a > 0:
            print(hex(a) + "," + hex(b) + "," + str(n - a) + ",")
    print("};")

    print("")
    print("static const Rune " + name + "1[] = {")
    for a, b, n in runs:
        if b - a == 0:
            print(hex(a) + "," + str(n - a) + ",")
    print("};")
73+
74+
# Emit the generated header on stdout: a banner comment, then the alpha
# tables, then the lowercase and uppercase mapping tables in that order.
print("/* This file was automatically created from UnicodeData.txt */")
dumpalpha()
for table_name, mapping in (("ucd_tolower", tolower), ("ucd_toupper", toupper)):
    dumpmap(table_name, mapping)

one.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,3 @@
2424
#include "jsvalue.c"
2525
#include "regexp.c"
2626
#include "utf.c"
27-
#include "utftype.c"

utf.c

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
#include <string.h>
1616

1717
#include "utf.h"
18+
#include "utfdata.h"
19+
20+
/*
 * Number of elements in the array a, as an int. Only valid for true arrays,
 * not pointers. The whole expansion is parenthesized so the macro behaves as
 * a single primary expression wherever it is used.
 */
#define nelem(a) ((int)(sizeof (a) / sizeof (a)[0]))
1821

1922
typedef unsigned char uchar;
2023

@@ -210,3 +213,93 @@ utflen(const char *s)
210213
n++;
211214
}
212215
}
216+
217+
/*
 * Binary search over a table t of n records, each ne Runes wide, keyed by
 * the first Rune of every record. The table is assumed to be sorted in
 * ascending key order.
 *
 * Returns a pointer to the last record whose key is <= c, or a null pointer
 * when the table is empty or c is below the first key. The caller must still
 * check that the returned record actually covers c.
 */
static const Rune *
ucd_bsearch(Rune c, const Rune *t, int n, int ne)
{
	while(n > 1) {
		int half = n/2;
		const Rune *mid = t + half*ne;

		if(c < mid[0]) {
			/* c lies before mid: keep only the lower half. */
			n = half;
			continue;
		}
		/* c is at or after mid: keep the upper half, mid included. */
		t = mid;
		n -= half;
	}
	return (n != 0 && t[0] <= c) ? t : NULL;
}
236+
237+
Rune
238+
tolowerrune(Rune c)
239+
{
240+
const Rune *p;
241+
242+
p = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3);
243+
if(p && c >= p[0] && c <= p[1])
244+
return c + p[2];
245+
p = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2);
246+
if(p && c == p[0])
247+
return c + p[1];
248+
return c;
249+
}
250+
251+
Rune
252+
toupperrune(Rune c)
253+
{
254+
const Rune *p;
255+
256+
p = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3);
257+
if(p && c >= p[0] && c <= p[1])
258+
return c + p[2];
259+
p = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2);
260+
if(p && c == p[0])
261+
return c + p[1];
262+
return c;
263+
}
264+
265+
int
266+
islowerrune(Rune c)
267+
{
268+
const Rune *p;
269+
270+
p = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3);
271+
if(p && c >= p[0] && c <= p[1])
272+
return 1;
273+
p = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2);
274+
if(p && c == p[0])
275+
return 1;
276+
return 0;
277+
}
278+
279+
int
280+
isupperrune(Rune c)
281+
{
282+
const Rune *p;
283+
284+
p = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3);
285+
if(p && c >= p[0] && c <= p[1])
286+
return 1;
287+
p = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2);
288+
if(p && c == p[0])
289+
return 1;
290+
return 0;
291+
}
292+
293+
int
294+
isalpharune(Rune c)
295+
{
296+
const Rune *p;
297+
298+
p = ucd_bsearch(c, ucd_alpha2, nelem(ucd_alpha2)/2, 2);
299+
if(p && c >= p[0] && c <= p[1])
300+
return 1;
301+
p = ucd_bsearch(c, ucd_alpha1, nelem(ucd_alpha1), 1);
302+
if(p && c == p[0])
303+
return 1;
304+
return 0;
305+
}

utf.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,8 @@ typedef int Rune; /* 32 bits */
2323

2424
#define isalpharune jsU_isalpharune
2525
#define islowerrune jsU_islowerrune
26-
#define isspacerune jsU_isspacerune
27-
#define istitlerune jsU_istitlerune
2826
#define isupperrune jsU_isupperrune
2927
#define tolowerrune jsU_tolowerrune
30-
#define totitlerune jsU_totitlerune
3128
#define toupperrune jsU_toupperrune
3229

3330
enum
@@ -46,11 +43,8 @@ int utflen(const char *s);
4643

4744
int isalpharune(Rune c);
4845
int islowerrune(Rune c);
49-
int isspacerune(Rune c);
50-
int istitlerune(Rune c);
5146
int isupperrune(Rune c);
5247
Rune tolowerrune(Rune c);
53-
Rune totitlerune(Rune c);
5448
Rune toupperrune(Rune c);
5549

5650
#endif

0 commit comments

Comments
 (0)