-
Notifications
You must be signed in to change notification settings - Fork 0
/
make_data.py
140 lines (111 loc) · 4.59 KB
/
make_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import json
import sys
import xml.etree.ElementTree as ET
# Maps a code point sequence (str) to the set of keywords collected for it.
merged_keywords = {}
# Characters recorded later, while processing the UCD, as belonging to a "C"
# General_Category.
control_chars = set()

# -----------------------------------------------------------------------------
# Process CLDR annotations
#
# See https://unicode.org/reports/tr35/tr35-general.html#Annotations
# -----------------------------------------------------------------------------
for annotation_file in (
    "common/annotations/en.xml",
    "common/annotationsDerived/en.xml",
):
    annotations = ET.parse(annotation_file).getroot().find("annotations")
    for entry in annotations.findall("annotation"):
        sequence = entry.get("cp")

        # TR35 allows the cp attribute to be either a plain string or a
        # UnicodeSet wrapped in [...], which may list several code points or
        # strings. The UnicodeSet form is not parsed here; such entries are
        # simply ignored.
        if (sequence[0], sequence[-1]) == ("[", "]"):
            continue

        if entry.get("type") == "tts":
            # Text-to-speech name: split once on the first colon so that both
            # the part before it and the part after it become keywords.
            parts = entry.text.split(":", 1)
        else:
            # Regular annotation: a "|"-separated keyword list.
            parts = entry.text.split("|")

        # Merge into any keywords already collected for this sequence.
        merged_keywords.setdefault(sequence, set()).update(
            part.strip() for part in parts
        )
# -----------------------------------------------------------------------------
# Process UCD
#
# See https://unicode.org/reports/tr42/ and section "4.4 Properties" specifically
# -----------------------------------------------------------------------------
NS = {"ucd": "http://www.unicode.org/ns/2003/ucd/1.0"}

# Only list Latin, Greek and Common (Zyyy) characters.
# See "Script (sc)" in PropertyValueAliases.txt (make fetch-docs).
INCLUDE_SCRIPTS = {"Latn", "Grek", "Zyyy"}

tree = ET.parse("ucd.nounihan.grouped.xml")
root = tree.getroot()

for group in root.find("ucd:repertoire", NS).findall("ucd:group", NS):
    for char in group.findall("ucd:char", NS):
        # Merge char and group attributes; attributes set directly on the
        # char override those inherited from its group.
        attrs = dict(group.attrib)
        attrs.update(char.attrib)

        # General_Category (two-letter value such as "Lu", "Mn", "Cc").
        # Default to "" so a missing attribute cannot raise below.
        gc = attrs.get("gc", "")

        # Skip unassigned characters (Cn). The full two-letter value must be
        # compared: testing only the first letter against "Cn" never matches.
        if gc == "Cn":
            continue

        # Skip combining marks (every "M*" category)
        if gc.startswith("M"):
            continue

        # Skip modifiers (Modifier_Letter and Modifier_Symbol)
        if gc in ("Lm", "Sk"):
            continue

        # Skip deprecated codepoints
        if attrs.get("Dep") == "Y":
            continue

        # Only include certain scripts
        if attrs.get("sc") not in INCLUDE_SCRIPTS:
            continue

        # Ignore ranges (elements that carry first-cp instead of cp)
        if attrs.get("first-cp"):
            continue

        # Get name; default to "" so the "#" test below is always valid.
        na = attrs.get("na", "")

        # In the UCD XML a "#" inside the na attribute is a placeholder for
        # the code point value. Such templated names are not expanded here;
        # the character is skipped instead.
        if "#" in na:
            continue

        # Build keyword set from na, na1 and any name aliases. Empty values
        # are left in here; they are filtered out when the output is built.
        na1 = attrs.get("na1", "")
        keywords = {na, na1}
        for alias in char.findall("ucd:name-alias", NS):
            keywords.add(alias.get("alias"))

        # The cp attribute is the code point in hexadecimal; store the
        # character itself as the key.
        cp = chr(int(attrs.get("cp"), 16))

        # Merge with any keywords already collected for this character
        # (for example from the CLDR annotations).
        merged_keywords.setdefault(cp, set()).update(keywords)

        # Remember characters in any "C*" category so the output stage can
        # flag them and show an escaped form.
        if gc.startswith("C"):
            control_chars.add(cp)
# -----------------------------------------------------------------------------
# Format output
# -----------------------------------------------------------------------------
# Build one record per code point sequence, ordered by sequence length and
# then by the sequence itself, and print the whole list as a single JSON line.
output = []
for sequence in sorted(merged_keywords, key=lambda s: (len(s), s)):
    # Lowercase every keyword and drop empty values; the set removes
    # duplicates that only differed by case.
    lowered = {word.lower() for word in merged_keywords[sequence] if word}
    # Shortest keywords first, ties broken alphabetically.
    ordered = sorted(lowered, key=lambda word: (len(word), word))

    flagged = sequence in control_chars
    if flagged:
        # Display the escaped form: repr() of the string minus its quotes.
        shown = repr(sequence)[1:-1]
    else:
        shown = sequence

    output.append(
        {
            "cp": sequence,
            "keywords": ordered,
            "display": shown,
            "is_control": flagged,
        }
    )

print(json.dumps(output))