diff --git a/index.html b/index.html index d164786..fb20d9d 100644 --- a/index.html +++ b/index.html @@ -5283,7 +5283,6 @@
The following dictionary provides a compact form for the operator dictionary, suitable for @@ -5316,23 +5315,58 @@
fence
, separator
to false
.
Content
is a single character in the
- BMP Private Use Area (range U+E000–U+F8FF)
+ range U+0320–U+03FF
then exit with NotFound
status.Content
a UTF-16 string of length more than 1
(including the case of surrogate pairs) and is listed in
Operators_multichar
then
replace Content
with the Unicode character
- "U+E000 plus the index of Content
in
- Operators_multichar
". Otherwise, exit with
- NotFound
status.
+ "U+0320 plus the index of Content
in
+ Operators_multichar
". If it is not listed, then
+ exit with NotFound
status.
Content
, Form
)
- corresponds to one category of
- then
- set the properties according to
- .
- Otherwise, exit with NotFound
status.
+ Content
, Form
) from
+ and
+ either exit with NotFound
status or move to
+ the next point. More precisely, this can be done as follows:
+ Content
, Form
)
+ according to .
+ If a result is found then set the properties according to
+ .
+ Otherwise exit with NotFound
status.
+ Key
to Content
if it is in
+ range U+0000–U+03FF ; or to Content
− 0x1C00
+ if it is in range U+2000–U+2BFF. Otherwise, exit with
+ NotFound
status.
+ Key
is at most 0x0FFF.
+ Key
according to whether Form
+ is infix
, prefix
,
+ postfix
respectively.
+ Key
is at most 0x2FFF.
+ Entry
in table
+
+ such that Entry
% 0x4000 is equal to
+ Key
. Either exit with
+ NotFound
status or
+ set the properties corresponding to the category with
+ encoding Entry
/ 0x1000 in
+ .
+ Content
is in
Operators_fence
then set property fence
to true. fence
and separator
properties do not
+ have any visible effect on the layout described in this
+ specification. So steps 5 and 6 as well as the corresponding tables
+ may be ignored.
+ - After conversion to a single UTF-16 character, determining the - category of ('Content', 'Form') can be done by binary searches - on the tables corresponding to the 'Form' value - of . - For tables of ranges, the binary search can be performed on the - range start code point. Note that small tables only have a few - ranges or code points to check and so can be handled by direct - comparaisons. + When encoded as ranges, one can perform a binary search by looking + for the range start, followed by an extra check on the range length. + Since log is concave, + it is worse to do one binary search on each large subtable + of than one + binary search on the whole table of + . + One can see that there are several contiguous Unicode blocks, so + encoding tables as ranges allows one to get almost 8 bits per entry.
- The possible characters 'Content' values after conversion - characters are located into the three small ranges - U+0000–U+03FF, U+2000–U+2BFF and - U+E000–U+E04F and after simple offset shift can be encoded on - 12 bits. Note that all Unicode ranges from - and - contain between 1 and 32 characters. By splitting ranges into - at most two parts, each range can be encoded on 16 bits. - Due to several contiguous Unicode blocks, the tables would still be - encoded in significantly less than 16bits/entry but all the - tables are now encoded and treated the same way. -
-- Alternatively, discarding the smallest tables as explained above, - one can consider only those having a 4bits encoding in - . - Using the 12-bit encoding of the 'Content' described - above this means that these tables can be encoded with - 16bits/entry but binary search would now be performed on a single - table. -
-- Continuing on the previous approach, it is possible to + Alternatively, it is possible to use a perfect hash function to implement table lookup in constant - time [[?gperf]] [[?CMPH]]. This would add 16 bits per empty entry + time [[?gperf]] [[?CMPH]]. This would instead take + 16 bits per entry, plus 16 bits per extra empty entry (for non-minimal perfect hash function) as well as extra data to store the hash function parameters. For minimal perfect hash function, the theoretical lower bound for storing these parameters is @@ -5395,11 +5417,6 @@
Special Table | Entries |
---|---|
Operators_multichar | 41 entries (null-terminated UTF-16 strings): {U+0021,U+0021,U+0000}, {U+0021,U+003D,U+0000}, {U+0026,U+0026,U+0000}, {U+002A,U+003D,U+0000}, {U+002B,U+002B,U+0000}, {U+002B,U+003D,U+0000}, {U+002D,U+002D,U+0000}, {U+002D,U+003D,U+0000}, {U+002D,U+003E,U+0000}, {U+002E,U+002E,U+0000}, {U+002E,U+002E,U+002E,U+0000}, {U+002F,U+003D,U+0000}, {U+003A,U+003D,U+0000}, {U+003C,U+003D,U+0000}, {U+003D,U+003D,U+0000}, {U+003E,U+003D,U+0000}, {U+007C,U+007C,U+0000}, {U+007C,U+007C,U+007C,U+0000}, {U+223D,U+0331,U+0000}, {U+2242,U+0338,U+0000}, {U+224E,U+0338,U+0000}, {U+224F,U+0338,U+0000}, {U+2266,U+0338,U+0000}, {U+226A,U+0338,U+0000}, {U+226B,U+0338,U+0000}, {U+227F,U+0338,U+0000}, {U+2282,U+20D2,U+0000}, {U+2283,U+20D2,U+0000}, {U+228F,U+0338,U+0000}, {U+2290,U+0338,U+0000}, {U+29CF,U+0338,U+0000}, {U+29D0,U+0338,U+0000}, {U+2A7D,U+0338,U+0000}, {U+2A7E,U+0338,U+0000}, {U+2AA1,U+0338,U+0000}, {U+2AA2,U+0338,U+0000}, {U+2AAF,U+0338,U+0000}, {U+2AB0,U+0338,U+0000}, {U+2ADD,U+0338,U+0000}, {U+D83B,U+DEF0,U+0000}, {U+D83B,U+DEF1,U+0000}, |
Operators_fence | 57 entries (15 Unicode ranges): [U+0028–U+0029], {U+005B}, {U+005D}, [U+007B–U+007D], {U+2016}, [U+2018–U+2019], [U+201C–U+201D], [U+2308–U+230B], [U+2329–U+232A], [U+2772–U+2773], [U+27E6–U+27EF], {U+2980}, [U+2983–U+2998], [U+29FC–U+29FD], [U+E010–U+E011], |
Operators_separator | 3 entries: U+002C, U+003B, U+2063, |
(Content, Form) keys | Category |
---|---|
138 entries (18 Unicode ranges) in infix form: [U+2190–U+2199], [U+219C–U+21AD], [U+21AF–U+21B5], {U+21B9}, [U+21BC–U+21CC], [U+21D0–U+21DD], [U+21E0–U+21F0], {U+21F3}, [U+21F5–U+21F6], [U+21FD–U+21FF], [U+27F0–U+27F1], [U+27F5–U+27FF], [U+290A–U+2910], [U+2912–U+2913], [U+2921–U+2922], [U+294E–U+2961], [U+296E–U+296F], [U+2B45–U+2B46], | A |
103 entries (36 Unicode ranges) in infix form: {U+002B}, {U+002D}, {U+002F}, {U+00B1}, {U+00F7}, [U+2212–U+2214], {U+2216}, {U+2218}, {U+2224}, [U+2227–U+222A], {U+2236}, {U+2238}, [U+228C–U+228F], [U+2293–U+2296], {U+2298}, [U+229D–U+229F], [U+22BB–U+22BD], {U+22C4}, {U+22C6}, [U+22CE–U+22CF], [U+22D2–U+22D3], [U+2795–U+2797], {U+27F4}, {U+29BC}, {U+29F6}, [U+2A22–U+2A2E], [U+2A38–U+2A3A], [U+2A40–U+2A4F], [U+2A51–U+2A63], [U+2ADA–U+2ADB], {U+2AFB}, {U+2AFD}, {U+2B32}, {U+E002}, {U+E005}, {U+E007}, | B |
89 entries (42 Unicode ranges) in infix form: {U+0025}, {U+002A}, {U+002E}, {U+0040}, {U+00B7}, {U+00D7}, {U+2022}, {U+2043}, {U+2206}, {U+220E}, {U+2217}, [U+223F–U+2240], {U+2297}, {U+2299}, [U+22A0–U+22A1], {U+22C5}, {U+22C7}, [U+22C9–U+22CC], [U+2305–U+2306], [U+25A0–U+25A1], [U+25AA–U+25AB], [U+25AD–U+25B1], [U+2981–U+2982], [U+2999–U+299A], {U+29B5}, [U+29C2–U+29C3], [U+29C9–U+29CD], [U+29D8–U+29D9], {U+29DB}, [U+29DF–U+29E0], {U+29E2}, [U+29E7–U+29ED], [U+29F8–U+29FB], [U+2A1D–U+2A21], [U+2A2F–U+2A37], [U+2A3B–U+2A3D], {U+2A3F}, {U+2A50}, [U+2ADC–U+2ADD], {U+2AFE}, [U+E010–U+E012], {U+E026}, | C |
53 entries (22 Unicode ranges) in prefix form: {U+0021}, {U+002B}, {U+002D}, {U+00AC}, {U+00B1}, {U+2018}, {U+201C}, [U+2200–U+2201], [U+2203–U+2204], {U+2207}, [U+2212–U+2213], [U+221B–U+221C], [U+221F–U+2222], {U+223C}, [U+22BE–U+22BF], {U+2310}, {U+2319}, [U+2795–U+2796], {U+27C0}, [U+299B–U+29AF], [U+2AEC–U+2AED], [U+E010–U+E011], | D |
42 entries (22 Unicode ranges) in postfix form: [U+0021–U+0022], [U+0026–U+0027], {U+0060}, {U+00A8}, {U+00B0}, [U+00B2–U+00B4], [U+00B8–U+00B9], [U+02CA–U+02CB], [U+02D8–U+02DA], {U+02DD}, {U+0311}, [U+2019–U+201B], [U+201D–U+201F], [U+2032–U+2037], {U+2057}, [U+20DB–U+20DC], {U+23CD}, {U+E000}, {U+E004}, {U+E006}, [U+E009–U+E00A], [U+E010–U+E011], | E |
26 entries (16 Unicode ranges) in postfix form: [U+005E–U+005F], {U+007E}, {U+00AF}, [U+02C6–U+02C7], {U+02C9}, {U+02CD}, {U+02DC}, {U+02F7}, {U+0302}, {U+2016}, {U+203E}, [U+2322–U+2323], [U+23B4–U+23B5], [U+23DC–U+23E1], {U+2980}, [U+E027–U+E028], | F |
25 entries in prefix form: U+0028, U+005B, U+007B, U+007C, U+2308, U+230A, U+2329, U+2772, U+27E6, U+27E8, U+27EA, U+27EC, U+27EE, U+2983, U+2985, U+2987, U+2989, U+298B, U+298D, U+298F, U+2991, U+2993, U+2995, U+2997, U+29FC, | G |
25 entries in postfix form: U+0029, U+005D, U+007C, U+007D, U+2309, U+230B, U+232A, U+2773, U+27E7, U+27E9, U+27EB, U+27ED, U+27EF, U+2984, U+2986, U+2988, U+298A, U+298C, U+298E, U+2990, U+2992, U+2994, U+2996, U+2998, U+29FD, | H |
22 entries (3 Unicode ranges) in prefix form: [U+222B–U+2233], [U+2A0B–U+2A0F], [U+2A15–U+2A1C], | I |
18 entries (5 Unicode ranges) in prefix form: [U+220F–U+2210], [U+22C0–U+22C3], [U+2A00–U+2A09], {U+2AFC}, {U+2AFF}, | J |
7 entries (3 Unicode ranges) in prefix form: {U+2211}, {U+2A0A}, [U+2A10–U+2A14], | K |
6 entries (3 Unicode ranges) in infix form: {U+005C}, [U+2061–U+2064], {U+2396}, | L |
3 entries in infix form: U+002C, U+003A, U+003B, | M |
3 entries in prefix form: U+2145, U+2146, U+2202, | N |
Category | encoding | rspace | lspace | properties |
---|---|---|---|---|
A | 0x0 | 0.2777777777777778em | 0.2777777777777778em | stretchy |
B | 0x4 | 0.2222222222222222em | 0.2222222222222222em | N/A |
C | 0x8 | 0.16666666666666666em | 0.16666666666666666em | N/A |
D | 0x1 | 0 | 0 | N/A |
E | 0x2 | 0 | 0 | N/A |
F | 0x6 | 0 | 0 | stretchy |
G | 0x5 | 0 | 0 | stretchy symmetric |
H | 0xA | 0 | 0 | stretchy symmetric |
I | 0x9 | 0.16666666666666666em | 0.16666666666666666em | symmetric largeop |
J | 0xD | 0.05555555555555555em | 0.1111111111111111em | symmetric largeop movablelimits |
K | N/A | 0.16666666666666666em | 0.16666666666666666em | symmetric largeop movablelimits |
L | 0xC | 0 | 0 | N/A |
M | N/A | 0 | 0.16666666666666666em | N/A |
N | N/A | 0.16666666666666666em | 0 | N/A |
Special Table | Entries |
---|---|
Operators_multichar | 41 entries (null-terminated UTF-16 strings): {U+0021,U+0021,U+0000}, {U+0021,U+003D,U+0000}, {U+0026,U+0026,U+0000}, {U+002A,U+003D,U+0000}, {U+002B,U+002B,U+0000}, {U+002B,U+003D,U+0000}, {U+002D,U+002D,U+0000}, {U+002D,U+003D,U+0000}, {U+002D,U+003E,U+0000}, {U+002E,U+002E,U+0000}, {U+002E,U+002E,U+002E,U+0000}, {U+002F,U+003D,U+0000}, {U+003A,U+003D,U+0000}, {U+003C,U+003D,U+0000}, {U+003D,U+003D,U+0000}, {U+003E,U+003D,U+0000}, {U+007C,U+007C,U+0000}, {U+007C,U+007C,U+007C,U+0000}, {U+223D,U+0331,U+0000}, {U+2242,U+0338,U+0000}, {U+224E,U+0338,U+0000}, {U+224F,U+0338,U+0000}, {U+2266,U+0338,U+0000}, {U+226A,U+0338,U+0000}, {U+226B,U+0338,U+0000}, {U+227F,U+0338,U+0000}, {U+2282,U+20D2,U+0000}, {U+2283,U+20D2,U+0000}, {U+228F,U+0338,U+0000}, {U+2290,U+0338,U+0000}, {U+29CF,U+0338,U+0000}, {U+29D0,U+0338,U+0000}, {U+2A7D,U+0338,U+0000}, {U+2A7E,U+0338,U+0000}, {U+2AA1,U+0338,U+0000}, {U+2AA2,U+0338,U+0000}, {U+2AAF,U+0338,U+0000}, {U+2AB0,U+0338,U+0000}, {U+2ADD,U+0338,U+0000}, {U+D83B,U+DEF0,U+0000}, {U+D83B,U+DEF1,U+0000}, |
Operators_fence | 57 entries (15 Unicode ranges): [U+0028–U+0029], {U+005B}, {U+005D}, [U+007B–U+007D], [U+0330–U+0331], {U+2016}, [U+2018–U+2019], [U+201C–U+201D], [U+2308–U+230B], [U+2329–U+232A], [U+2772–U+2773], [U+27E6–U+27EF], {U+2980}, [U+2983–U+2998], [U+29FC–U+29FD], |
Operators_separator | 3 entries: U+002C, U+003B, U+2063, |
(Content, Form) keys | Category |
---|---|
138 entries (18 Unicode ranges) in infix form: [U+2190–U+2199], [U+219C–U+21AD], [U+21AF–U+21B5], {U+21B9}, [U+21BC–U+21CC], [U+21D0–U+21DD], [U+21E0–U+21F0], {U+21F3}, [U+21F5–U+21F6], [U+21FD–U+21FF], [U+27F0–U+27F1], [U+27F5–U+27FF], [U+290A–U+2910], [U+2912–U+2913], [U+2921–U+2922], [U+294E–U+2961], [U+296E–U+296F], [U+2B45–U+2B46], | A |
103 entries (36 Unicode ranges) in infix form: {U+002B}, {U+002D}, {U+002F}, {U+00B1}, {U+00F7}, {U+0322}, {U+0325}, {U+0327}, [U+2212–U+2214], {U+2216}, {U+2218}, {U+2224}, [U+2227–U+222A], {U+2236}, {U+2238}, [U+228C–U+228F], [U+2293–U+2296], {U+2298}, [U+229D–U+229F], [U+22BB–U+22BD], {U+22C4}, {U+22C6}, [U+22CE–U+22CF], [U+22D2–U+22D3], [U+2795–U+2797], {U+27F4}, {U+29BC}, {U+29F6}, [U+2A22–U+2A2E], [U+2A38–U+2A3A], [U+2A40–U+2A4F], [U+2A51–U+2A63], [U+2ADA–U+2ADB], {U+2AFB}, {U+2AFD}, {U+2B32}, | B |
89 entries (42 Unicode ranges) in infix form: {U+0025}, {U+002A}, {U+002E}, {U+0040}, {U+00B7}, {U+00D7}, [U+0330–U+0332], {U+0346}, {U+2022}, {U+2043}, {U+2206}, {U+220E}, {U+2217}, [U+223F–U+2240], {U+2297}, {U+2299}, [U+22A0–U+22A1], {U+22C5}, {U+22C7}, [U+22C9–U+22CC], [U+2305–U+2306], [U+25A0–U+25A1], [U+25AA–U+25AB], [U+25AD–U+25B1], [U+2981–U+2982], [U+2999–U+299A], {U+29B5}, [U+29C2–U+29C3], [U+29C9–U+29CD], [U+29D8–U+29D9], {U+29DB}, [U+29DF–U+29E0], {U+29E2}, [U+29E7–U+29ED], [U+29F8–U+29FB], [U+2A1D–U+2A21], [U+2A2F–U+2A37], [U+2A3B–U+2A3D], {U+2A3F}, {U+2A50}, [U+2ADC–U+2ADD], {U+2AFE}, | C |
53 entries (22 Unicode ranges) in prefix form: {U+0021}, {U+002B}, {U+002D}, {U+00AC}, {U+00B1}, [U+0330–U+0331], {U+2018}, {U+201C}, [U+2200–U+2201], [U+2203–U+2204], {U+2207}, [U+2212–U+2213], [U+221B–U+221C], [U+221F–U+2222], {U+223C}, [U+22BE–U+22BF], {U+2310}, {U+2319}, [U+2795–U+2796], {U+27C0}, [U+299B–U+29AF], [U+2AEC–U+2AED], | D |
42 entries (22 Unicode ranges) in postfix form: [U+0021–U+0022], [U+0026–U+0027], {U+0060}, {U+00A8}, {U+00B0}, [U+00B2–U+00B4], [U+00B8–U+00B9], [U+02CA–U+02CB], [U+02D8–U+02DA], {U+02DD}, {U+0311}, {U+0320}, {U+0324}, {U+0326}, [U+0329–U+032A], [U+0330–U+0331], [U+2019–U+201B], [U+201D–U+201F], [U+2032–U+2037], {U+2057}, [U+20DB–U+20DC], {U+23CD}, | E |
26 entries (16 Unicode ranges) in postfix form: [U+005E–U+005F], {U+007E}, {U+00AF}, [U+02C6–U+02C7], {U+02C9}, {U+02CD}, {U+02DC}, {U+02F7}, {U+0302}, [U+0347–U+0348], {U+2016}, {U+203E}, [U+2322–U+2323], [U+23B4–U+23B5], [U+23DC–U+23E1], {U+2980}, | F |
25 entries in prefix form: U+0028, U+005B, U+007B, U+007C, U+2308, U+230A, U+2329, U+2772, U+27E6, U+27E8, U+27EA, U+27EC, U+27EE, U+2983, U+2985, U+2987, U+2989, U+298B, U+298D, U+298F, U+2991, U+2993, U+2995, U+2997, U+29FC, | G |
25 entries in postfix form: U+0029, U+005D, U+007C, U+007D, U+2309, U+230B, U+232A, U+2773, U+27E7, U+27E9, U+27EB, U+27ED, U+27EF, U+2984, U+2986, U+2988, U+298A, U+298C, U+298E, U+2990, U+2992, U+2994, U+2996, U+2998, U+29FD, | H |
22 entries (3 Unicode ranges) in prefix form: [U+222B–U+2233], [U+2A0B–U+2A0F], [U+2A15–U+2A1C], | I |
18 entries (5 Unicode ranges) in prefix form: [U+220F–U+2210], [U+22C0–U+22C3], [U+2A00–U+2A09], {U+2AFC}, {U+2AFF}, | J |
7 entries (3 Unicode ranges) in prefix form: {U+2211}, {U+2A0A}, [U+2A10–U+2A14], | K |
6 entries (3 Unicode ranges) in infix form: {U+005C}, [U+2061–U+2064], {U+2396}, | L |
3 entries in infix form: U+002C, U+003A, U+003B, | M |
3 entries in prefix form: U+2145, U+2146, U+2202, | N |
Category | encoding | rspace | lspace | properties |
---|---|---|---|---|
A | 0x0 | 0.2777777777777778em | 0.2777777777777778em | stretchy |
B | 0x4 | 0.2222222222222222em | 0.2222222222222222em | N/A |
C | 0x8 | 0.16666666666666666em | 0.16666666666666666em | N/A |
D | 0x1 | 0 | 0 | N/A |
E | 0x2 | 0 | 0 | N/A |
F | 0x6 | 0 | 0 | stretchy |
G | 0x5 | 0 | 0 | stretchy symmetric |
H | 0xA | 0 | 0 | stretchy symmetric |
I | 0x9 | 0.16666666666666666em | 0.16666666666666666em | symmetric largeop |
J | 0xD | 0.05555555555555555em | 0.1111111111111111em | symmetric largeop movablelimits |
K | N/A | 0.16666666666666666em | 0.16666666666666666em | symmetric largeop movablelimits |
L | 0xC | 0 | 0 | N/A |
M | N/A | 0 | 0.16666666666666666em | N/A |
N | N/A | 0.16666666666666666em | 0 | N/A |
Operators_%s
" % (count, len(ranges)))
for entry in ranges:
@@ -587,7 +593,7 @@ def serializeValue(value, fence, separator):
md.write("")
totalEntryCount += count
- ranges = toUnicodeRanges(knownTables[name]["singleChar"])
+ ranges = toRanges(knownTables[name]["singleChar"])
if (3 * len(ranges) < 2 * count):
md.write("%d entries (%d Unicode ranges) in %s form: " % (count, len(ranges), knownTables[name]["value"]["form"]))
for entry in ranges:
@@ -606,6 +612,14 @@ def serializeValue(value, fence, separator):
md.write('Mapping from operator (Content, Form) to a category.
Total size: %d entries, %d bytes.
(assuming characters are UTF-16 and 1-byte range lengths) ' % (totalEntryCount, totalBytes))
md.write('')
+def formValueFromString(value):
+ form = knownTables[name]["value"]["form"]
+ if form == "infix":
+ return 0
+ if form == "prefix":
+ return 1
+ assert form == "postfix"
+ return 2
category_for_form = [0, 0, 0]
value_index = 0
@@ -620,13 +634,7 @@ def serializeValue(value, fence, separator):
for entry in knownTables[name]["singleChar"]:
md.write("");
md.write("%s " % chr(ord('A') + value_index))
- form = knownTables[name]["value"]["form"]
- if form == "infix":
- form = 0
- elif form == "prefix":
- form = 1
- elif form == "postfix":
- form = 2
+ form = formValueFromString(knownTables[name]["singleChar"])
if category_for_form[form] >= 4:
md.write("N/A ")
else:
@@ -644,7 +652,43 @@ def serializeValue(value, fence, separator):
print("done.");
-# Print more statistics
-print()
-printCodePointStats()
-printRangeStats()
+# Calculate compact form for the largest categories.
+compact_table = []
+category_for_form = [0, 0, 0]
+totalEntryCount = 0
+for name, item in sorted(knownTables.items(),
+ key=(lambda v: len(v[1]["singleChar"])),
+ reverse=True):
+ if name in ["fence", "separator"]:
+ continue
+ count = len(knownTables[name]["singleChar"])
+ form = formValueFromString(knownTables[name]["singleChar"])
+ if category_for_form[form] >= 4:
+ continue
+ totalEntryCount += count
+ hexa = form + (category_for_form[form] << 2)
+ category_for_form[form] += 1
+
+ for entry in knownTables[name]["singleChar"]:
+ assert entry <= 0x3FF or (0x2000 <= entry and entry <= 0x2BFF)
+ if 0x2000 <= entry and entry <= 0x2BFF:
+ entry = entry - 0x1C00
+ entry = entry + (hexa << 12)
+ compact_table.append(entry)
+
+bits_per_range = 4
+compact_table = toRanges(compact_table, 1 << bits_per_range)
+rangeCount = 0
+
+md.write('')