Skip to content

Commit

Permalink
Update Ren(G_i) per WD-imsc-hrm-20220322
Browse files Browse the repository at this point in the history
Added missing entries to codepoint sets used to compute `GCpy` and `Ren(G_i)`
  • Loading branch information
palemieux authored Apr 6, 2022
1 parent 61181f7 commit 2792530
Show file tree
Hide file tree
Showing 5 changed files with 46,383 additions and 5,366 deletions.
41 changes: 28 additions & 13 deletions scripts/make_gcpy_source.py → scripts/make_codepoint_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Generates a Python source file that initializes a set of unicode codepoints that belong
to the following scripts: latin, greek, cyrillic, hebrew or common.
Used to determine the value of GCpy at IMSC, 10.5"""
"""Generates a Python source file that contains sets of Unicode codepoints used
by the IMSC HRM, e.g., to determine the value of GCpy at IMSC, 10.5"""

__author__ = "Pierre-Anthony Lemieux <[email protected]>"

Expand All @@ -35,35 +34,44 @@
TEMPLATE="""#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''Set of unicode codepoints that belong to the following scripts: latin, greek, cyrillic, hebrew or common.
Used to determine the value of GCpy at IMSC, 10.5'''
'''Sets of Unicode codepoints used by the IMSC HRM.'''
# Generated from the Unicode Character Database (http://www.unicode.org/Public/UNIDATA/Scripts.txt)
# Version: {version}
# Date: {date}
# Normalized glyph copy performance factor (GCpy) for Latin, Greek, Cyrillic, Hebrew or Common
GCPY_12 = set(
(
{codepoints}
{codepoints_GCPY_12}
)
)
# Text rendering performance factor Ren(Gi) for Han, Katakana, Hiragana, Bopomofo or Hangul scripts
RENGI_06 = set(
(
{codepoints_RENGI_06}
)
)
"""

SCRIPT_LINE_PATTERN = re.compile(r"(?P<start>[a-fA-F0-9]{4})(?:\.\.(?P<end>[a-fA-F0-9]{4})?)\s+;\s+(?P<script>\w*)")
# Matches a data line of the Unicode `Scripts.txt` file, e.g.
#   "0041..005A    ; Latin # L&  [26] ..."   (range)
#   "00AA          ; Latin # Lo ..."         (single code point)
#   "20000..2A6DF  ; Han # Lo [42720] ..."   (supplementary plane)
# Named groups: `start` (hex), `end` (hex, None for a single code point) and
# `script` (script name). Code points are 4 to 6 hex digits, since Unicode
# extends to U+10FFFF; a fixed width of 4 would silently drop every line that
# describes code points above U+FFFF.
# NOTE(review): ranges in Scripts.txt are inclusive of `end`; confirm that the
# code consuming this pattern iterates through `end` and not up to `end - 1`.
SCRIPT_LINE_PATTERN = re.compile(
  r"(?P<start>[a-fA-F0-9]{4,6})(?:\.\.(?P<end>[a-fA-F0-9]{4,6}))?\s+;\s+(?P<script>\w*)"
)

# Matches a `#` comment line and captures everything after the leading `#` and
# whitespace. Applied to the first line read from the input scripts file to
# obtain the version string written into the generated source.
VERSION_LINE_PATTERN = re.compile(r"^#\s+(.+)$")

# Matches a `# Date: ...` comment line and captures the text that follows
# `Date:`. Presumably applied to the second header line of the scripts file to
# obtain the date written into the generated source -- the use site is not
# visible in this view, so confirm against the caller.
DATE_LINE_PATTERN = re.compile(r"^#\s*Date:\s*(.+)$")

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Generates the _GCPY_12.py file')
parser = argparse.ArgumentParser(description='Generates the _codepoints_sets.py file')
parser.add_argument('scripts_file', type=str, help='Path to the input unicode Scripts.txt file')
parser.add_argument('gcpy_py_file', type=str, help='Path to the generated Python source file')
parser.add_argument('py_file', type=str, help='Path to the generated Python source file')

args = parser.parse_args()

gcpy_12 = set()

rengi_06 = set()

with open(args.scripts_file, encoding="latin-1") as f:

file_version = VERSION_LINE_PATTERN.match(f.readline()).group(1)
Expand All @@ -76,21 +84,28 @@
if m is None:
continue

if m.group("script").lower() not in ("common", "latin", "greek", "hebrew", "cyrillic"):
script = m.group("script").lower()

if script in ("common", "latin", "greek", "hebrew", "cyrillic"):
cp_group = gcpy_12
elif script in ("han", "katakana", "hiragana", "bopomofo", "hangul"):
cp_group = rengi_06
else:
continue

start = int(m.group("start"), 16)

end = int(m.group("end"), 16) if m.group("end") is not None else start + 1

for i in range(start, end):
gcpy_12.add(i)
cp_group.add(i)

with open(args.gcpy_py_file, "w", encoding="utf-8") as f:
with open(args.py_file, "w", encoding="utf-8") as f:

f.write(
TEMPLATE.format(
codepoints=",\n".join(map(str, gcpy_12)),
codepoints_GCPY_12=",\n".join(map(str, gcpy_12)),
codepoints_RENGI_06=",\n".join(map(str, rengi_06)),
date=file_date,
version=file_version
)
Expand Down
Loading

0 comments on commit 2792530

Please sign in to comment.