-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update Ren(G_i) per WD-imsc-hrm-20220322
Added missing entries to codepoint sets used to compute `GCpy` and `Ren(G_i)`
- Loading branch information
Showing 5 changed files with 46,383 additions and 5,366 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,9 +23,8 @@ | |
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | ||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
"""Generates a Python source file that initializes a set of unicode codepoints that belong | ||
to the following scripts: latin, greek, cyrillic, hebrew or common. | ||
Used to determine the value of GCpy at IMSC, 10.5""" | ||
"""Generates a Python source file that contains sets of Unicode codepoints used | ||
by the IMSC HRM, e.g., to determine the value of GCpy at IMSC, 10.5""" | ||
|
||
__author__ = "Pierre-Anthony Lemieux <[email protected]>" | ||
|
||
|
@@ -35,35 +34,44 @@ | |
TEMPLATE="""#!/usr/bin/env python | ||
# -*- coding: UTF-8 -*- | ||
'''Set of unicode codepoints that belong to the following scripts: latin, greek, cyrillic, hebrew or common. | ||
Used to determine the value of GCpy at IMSC, 10.5''' | ||
'''Sets of Unicode codepoints used by the IMSC HRM.''' | ||
# Generated from the Unicode Character Database (http://www.unicode.org/Public/UNIDATA/Scripts.txt) | ||
# Version: {version} | ||
# Date: {date} | ||
# Normalized glyph copy performance factor (GCpy) for Latin, Greek, Cyrillic, Hebrew or Common | ||
GCPY_12 = set( | ||
( | ||
{codepoints} | ||
{codepoints_GCPY_12} | ||
) | ||
) | ||
# Text rendering performance factor Ren(Gi) for Han, Katakana, Hiragana, Bopomofo or Hangul scripts | ||
RENGI_06 = set( | ||
( | ||
{codepoints_RENGI_06} | ||
) | ||
) | ||
""" | ||
|
||
SCRIPT_LINE_PATTERN = re.compile(r"(?P<start>[a-fA-F0-9]{4})(?:\.\.(?P<end>[a-fA-F0-9]{4})?)\s+;\s+(?P<script>\w*)") | ||
SCRIPT_LINE_PATTERN = re.compile(r"(?P<start>[a-fA-F0-9]{4})(?:\.\.(?P<end>[a-fA-F0-9]{4}))?\s+;\s+(?P<script>\w*)") | ||
|
||
VERSION_LINE_PATTERN = re.compile(r"^#\s+(.+)$") | ||
|
||
DATE_LINE_PATTERN = re.compile(r"^#\s*Date:\s*(.+)$") | ||
|
||
if __name__ == '__main__': | ||
parser = argparse.ArgumentParser(description='Generates the _GCPY_12.py file') | ||
parser = argparse.ArgumentParser(description='Generates the _codepoints_sets.py file') | ||
parser.add_argument('scripts_file', type=str, help='Path to the input unicode Scripts.txt file') | ||
parser.add_argument('gcpy_py_file', type=str, help='Path to the generated Python source file') | ||
parser.add_argument('py_file', type=str, help='Path to the generated Python source file') | ||
|
||
args = parser.parse_args() | ||
|
||
gcpy_12 = set() | ||
|
||
rengi_06 = set() | ||
|
||
with open(args.scripts_file, encoding="latin-1") as f: | ||
|
||
file_version = VERSION_LINE_PATTERN.match(f.readline()).group(1) | ||
|
@@ -76,21 +84,28 @@ | |
if m is None: | ||
continue | ||
|
||
if m.group("script").lower() not in ("common", "latin", "greek", "hebrew", "cyrillic"): | ||
script = m.group("script").lower() | ||
|
||
if script in ("common", "latin", "greek", "hebrew", "cyrillic"): | ||
cp_group = gcpy_12 | ||
elif script in ("han", "katakana", "hiragana", "bopomofo", "hangul"): | ||
cp_group = rengi_06 | ||
else: | ||
continue | ||
|
||
start = int(m.group("start"), 16) | ||
|
||
end = int(m.group("end"), 16) if m.group("end") is not None else start + 1 | ||
|
||
for i in range(start, end): | ||
gcpy_12.add(i) | ||
cp_group.add(i) | ||
|
||
with open(args.gcpy_py_file, "w", encoding="utf-8") as f: | ||
with open(args.py_file, "w", encoding="utf-8") as f: | ||
|
||
f.write( | ||
TEMPLATE.format( | ||
codepoints=",\n".join(map(str, gcpy_12)), | ||
codepoints_GCPY_12=",\n".join(map(str, gcpy_12)), | ||
codepoints_RENGI_06=",\n".join(map(str, rengi_06)), | ||
date=file_date, | ||
version=file_version | ||
) | ||
|
Oops, something went wrong.