some 'Sk' fixes and a small table size optimization

jquast · Oct 3, 2023 · 08242d0 · 08242d0
1 parent 9e22c0c
commit 08242d0
Show file tree

Hide file tree

Showing 6 changed files with 231 additions and 466 deletions.
diff --git a/bin/update-tables.py b/bin/update-tables.py
@@ -127,7 +127,38 @@ class SequenceEntry:
 class TableDef:
     filename: str
     date: str
-    values: list[tuple[str, str, str]]
+    values: set[tuple[int, int]]
+
+    def optimized_values(self) -> list[tuple[int, int]]:
+        """
+        Compress the unsorted, possibly overlapping inclusive (start, end)
+        ranges of 'values' into a sorted list of non-overlapping ranges.
+        """
+        result: list[tuple[int, int]] = []
+        for start, end in sorted(self.values):
+            # merge when adjacent to, overlapping, or inside the previous range
+            if result and start <= result[-1][1] + 1:
+                result[-1] = (result[-1][0], max(result[-1][1], end))
+            else:
+                result.append((start, end))
+        return result
+
+
+    @property
+    def hex_range_descriptions(self) -> list[tuple[str, str, str]]:
+        """Render each optimized range as a (hex_start, hex_end, txt_description) row."""
+        rows: list[tuple[str, str, str]] = []
+        for first, last in self.optimized_values():
+            # describe the range by the unicode names of its two endpoints
+            first_name = name_ucs(chr(first)) or '(nil)'
+            last_name = name_ucs(chr(last)) or '(nil)'
+            if first_name == last_name:
+                description = f'{first_name[:48]}'
+            else:
+                head, tail = first_name[:24].rstrip(), last_name[:24].rstrip()
+                description = f'{head:24s}..{tail}'
+            rows.append((f'0x{first:05x}', f'0x{last:05x}', description))
+        return rows
 
 
 @dataclass
@@ -323,9 +354,11 @@ def fetch_table_wide_data() -> UnicodeTableRenderCtx:
     table: dict[UnicodeVersion, TableDef] = {}
     for version in fetch_unicode_versions():
         fname = os.path.join(PATH_DATA, f'EastAsianWidth-{version}.txt')
+        fname_dgc = os.path.join(PATH_DATA, f'DerivedGeneralCategory-{version}.txt')
         do_retrieve(url=URL_EASTASIAN_WIDTH.format(version=version), fname=fname)
         # , 'Sk'
-        table[version] = parse_category(fname=fname, category_codes=('W', 'F', 'Sk'), wide=2)
+        table[version] = parse_category(fname=fname, category_codes=('W', 'F'), wide=2)
+        table[version].values.update(parse_category(fname=fname_dgc, category_codes=('Sk',), wide=2).values)
     return UnicodeTableRenderCtx('WIDE_EASTASIAN', table)
 
 
@@ -337,10 +370,8 @@ def fetch_table_zero_data() -> UnicodeTableRenderCtx:
         do_retrieve(url=URL_DERIVED_CATEGORY.format(version=version), fname=fname)
         # Determine values of zero-width character lookup table by the following category codes
         table[version] = parse_category(fname=fname, category_codes=('Me', 'Mn', 'Cf', 'Zl', 'Zp', 'Sk'), wide=0)
-        #, 'Sk'
         # Inject NULL into all table versions.
-        table[version].values.append(('0x00000', '0x00000', 'NULL'))
-        table[version].values.sort()
+        table[version].values.add((0, 0))
     return UnicodeTableRenderCtx('ZERO_WIDTH', table)
 
 
@@ -394,23 +425,6 @@ def name_ucs(ucs: str) -> str:
     except ValueError:
         return None
 
-def convert_values_to_string_table(
-    values: Collection[tuple[int, int]],
-) -> list[tuple[str, str, str]]:
-    """Convert integers into string table of (hex_start, hex_end, txt_description)."""
-    pytable_values: list[tuple[str, str, str]] = []
-    for start, end in values:
-        hex_start, hex_end = f'0x{start:05x}', f'0x{end:05x}'
-        ucs_start, ucs_end = chr(start), chr(end)
-        name_start = name_ucs(ucs_start) or '(nil)'
-        name_end = name_ucs(ucs_end) or '(nil)'
-        if name_start != name_end:
-            txt_description = f'{name_start[:24].rstrip():24s}..{name_end[:24].rstrip()}'
-        else:
-            txt_description = f'{name_start[:48]}'
-        pytable_values.append((hex_start, hex_end, txt_description))
-    return pytable_values
-
 
 def parse_unicode_table(file: Iterable[str]) -> Iterator[TableEntry]:
     """
@@ -447,22 +461,22 @@ def parse_category(fname: str, category_codes: Container[str], wide: int) -> Tab
         version = next(table_iter).comment.strip()
         # and "date string" from second line
         date = next(table_iter).comment.split(':', 1)[1].strip()
-
         values = parse_category_values(category_codes, table_iter, wide)
-        txt_values = convert_values_to_string_table(make_table(values))
     print('ok')
-    return TableDef(version, date, txt_values)
+    return TableDef(version, date, values)
 
 def parse_category_values(category_codes: str,
                           table_iter: Iterator[TableEntry],
-                          wide: int) -> list[int]:
+                          wide: int) -> set[int]:
     """
     Parse value ranges of unicode data files, by given categories.
     """
     def filter_entry(entry):
-        # I doubt the unicode folks wanted to design the 'Sk' category this way, but for the most
-        # part, we can distinguish narrow, zero, and wide characters by their individual categories,
-        # but 'Zk' contains characters of all widths -- 0, 1, and 2 !!
+        # For the most part, narrow, zero, and wide characters can be told
+        # apart by general category alone. 'Sk' is the exception (probably not
+        # by design): it contains characters of all widths -- 0, 1, and 2 --
+        # so the width of an 'Sk' character is instead determined by the
+        # presence of the phrases 'MODIFIER' or 'FULLWIDTH' !!
         if entry.code_range is None:
             return False
         if entry.properties[0] != 'Sk':
@@ -473,12 +487,7 @@ def filter_entry(entry):
             return wide == 2
         else:
             return wide == 1
-    return [
-            codepoint
-            for entry in table_iter
-            if filter_entry(entry)
-            for codepoint in range(*entry.code_range)
-            ]
+    return {(entry.code_range[0], entry.code_range[1]-1) for entry in table_iter if filter_entry(entry)}
 
 
 def parse_zwj_file(file: Iterable[str]) -> Iterator[SequenceEntry]:

diff --git a/code_templates/python_table.py.j2 b/code_templates/python_table.py.j2
@@ -9,7 +9,7 @@ This code generated by {{this_filepath}} on {{utc_now}}.
         # Source: {{ table_def.filename }}
         # Date: {{ table_def.date }}
         #
-{%- for hex_start, hex_end, txt_description in table_def.values %}
+{%- for hex_start, hex_end, txt_description in table_def.hex_range_descriptions %}
         ({{ hex_start }}, {{ hex_end }},),  # {{txt_description}}
 {%- endfor %}
     ),

diff --git a/wcwidth/emoji_zwj_sequences.py b/wcwidth/emoji_zwj_sequences.py
@@ -1,7 +1,7 @@
 """
 Exports EMOJI_ZWJ_SEQUENCES sequences keyed by supporting emoji version.
 
-This code generated by wcwidth/bin/update-tables.py on 2023-09-30 06:56:40 UTC.
+This code generated by wcwidth/bin/update-tables.py on 2023-10-03 11:06:48 UTC.
 
 This module is used only for testing purposes, it is not imported or used by the
 public API.

diff --git a/wcwidth/table_wide.py b/wcwidth/table_wide.py
@@ -1,7 +1,7 @@
 """
 Exports WIDE_EASTASIAN table keyed by supporting unicode version level.
 
-This code generated by wcwidth/bin/update-tables.py on 2023-10-03 10:32:28 UTC.
+This code generated by wcwidth/bin/update-tables.py on 2023-10-03 11:06:48 UTC.
 """
 WIDE_EASTASIAN = {
     '4.1.0': (