Skip to content

Commit

Permalink
Support for UTF-8 Character Class Processing
Browse files Browse the repository at this point in the history
- Added UTF-8 utility functions to Tf to determine whether
  a code point is in the XID_Start / XID_Continue character classes
- Added static data for holding code point flags for XID_Start /
  XID_Continue
- Added tests for XID_Start / XID_Continue code point validity
- Added pre-processing script to generate character class ranges
  for XID_Start / XID_Continue from source DerivedCoreProperties.txt
  • Loading branch information
erslavin committed Dec 6, 2023
1 parent 0e11364 commit c0e2930
Show file tree
Hide file tree
Showing 8 changed files with 2,842 additions and 1 deletion.
1 change: 1 addition & 0 deletions pxr/base/tf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ pxr_library(tf
atomicRenameUtil
debugCodes
noticeRegistry
unicodeCharacterClasses

PYTHON_PRIVATE_CLASSES
pyErrorInternal
Expand Down
163 changes: 162 additions & 1 deletion pxr/base/tf/testenv/unicodeUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,171 @@ TestUtf8CodePointView()
return true;
}

static bool
TestCharacterClasses()
{
    // Validates TfIsUtf8CodePointXidStart / TfIsUtf8CodePointXidContinue
    // against representative code points from every Unicode character class
    // contributing to each derived property, against code points belonging
    // to neither property, and against boundary / overflow values.

    // a mix of code points that should fall into the following
    // character classes that make up XID_Start:
    // Lu | Ll | Lt | Lm | Lo | Nl
    const std::vector<uint32_t> xidStartCodePoints = {
        0x0043u,  // Latin capital letter C (Lu)
        0x006Au,  // Latin small letter j (Ll)
        0x0254u,  // Latin small letter Open o (Ll)
        0x01C6u,  // Latin small letter DZ with Caron (Ll)
        0x01CBu,  // Latin capital letter N with small letter j (Lt)
        0x02B3u,  // Modifier letter small r (Lm)
        0x10464u, // Shavian letter Loll (Lo)
        0x132B5u, // Egyptian hieroglyph R0004 (Lo)
        0x12421u, // Cuneiform numeric sign four geshu (Nl)
        0xFDABu,  // Arabic Ligature seen with Khah
                  // with Alef Maksura Final Form (Lo)
        0x18966u, // Tangut Component-359 (Lo)
        0x10144u, // Greek acrophonic attic fifty (Nl)
        0x037Fu,  // Greek capital letter YOT (Lu)
                  // [test singular code point range]
        0x2F800u, // CJK Compatibility Ideograph-2F800 (Lo)
                  // [test start range]
        0x3134Au, // CJK Ideograph Extension G Last (Lo)
                  // [test end range]
    };

    // a mix of code points that should fall into the following
    // character classes that make up XID_Continue
    // XID_Start | Nd | Mn | Mc | Pc
    const std::vector<uint32_t> xidContinueCodePoints = {
        0x0032u,  // Digit two (Nd)
        0x0668u,  // Arabic-Indic Digit Eight (Nd)
        0x07C0u,  // NKO Digit Zero (Nd)
        0x1E145u, // Nyiakeng Puachue Hmong Digit Five (Nd)
        0x0300u,  // Combining Grave Accent (Mn)
        0x2CEFu,  // Coptic Combining NI Above (Mn)
        0x10A02u, // Kharoshthi Vowel Sign U (Mn)
        0x16F92u, // Miao Tone Below (Mn)
        0x0903u,  // Devanagari Sign Visarga (Mc)
        0x16F55u, // Miao Vowel Sign AA (Mc)
        0x1D172u, // Musical Symbol Combining Flag-5 (Mc)
        0x203Fu,  // Undertie (Pc)
        0x005Fu,  // Low line (underscore) (Pc)
        0xFE4Fu,  // Wavy Low Line (Pc)
        0x05BFu,  // Hebrew Point Rafe (Mn) [test singular code point range]
        0x1E2ECu, // Wancho Tone Tup (Mn) [test start range]
        0xE01EFu, // Variation Selector-256 (Mn) [test end range]
    };

    // code points that shouldn't fall into either XID_Start
    // or XID_Continue
    const std::vector<uint32_t> invalidCodePoints = {
        0x002Du,  // Hyphen-Minus (Pd)
        0x00ABu,  // Left-Pointing Double Angle Quotation Mark (Pi)
        0x2019u,  // Right Single Quotation Mark (Pf)
        0x2021u,  // Double Dagger (Po)
        0x1ECB0u, // Indic Siyaq Rupee Mark (Sc)
        0x0020u,  // Space (Zs)
        0x3000u,  // Ideographic Space (Zs)
        0x000Bu,  // Line tabulation (Cc)
        0xF8FEu,  // Private Use (Co)
    };

    for (const uint32_t codePoint : xidStartCodePoints)
    {
        TF_AXIOM(TfIsUtf8CodePointXidStart(codePoint));

        // XID_Continue sets contain XID_Start
        TF_AXIOM(TfIsUtf8CodePointXidContinue(codePoint));
    }

    for (const uint32_t codePoint : xidContinueCodePoints)
    {
        TF_AXIOM(TfIsUtf8CodePointXidContinue(codePoint));
    }

    for (const uint32_t codePoint : invalidCodePoints)
    {
        TF_AXIOM(!TfIsUtf8CodePointXidStart(codePoint));
        TF_AXIOM(!TfIsUtf8CodePointXidContinue(codePoint));
    }

    // now test some strings with some characters from each of these sets
    // such that we can exercise the iterator converting from UTF-8 char
    // to code point
    std::string s1 = "ⅈ75_hgòð㤻";
    std::string s2 = "㤼01৪∫";
    std::string s3 = "㤻üaf-∫⁇…🔗";
    std::string s3_1 = s3.substr(0, s3.find("-"));
    std::string s3_2 = s3.substr(s3.find("-"));
    std::string_view sv1 {s1};
    std::string_view sv2 {s2};
    std::string_view sv3 {s3_1};
    std::string_view sv4 {s3_2};

    TfUtf8CodePointView view1 {sv1};
    TfUtf8CodePointView view2 {sv2};
    TfUtf8CodePointView view3 {sv3};
    TfUtf8CodePointView view4 {sv4};

    // s1 should start with XID_Start and then have XID_Continue
    bool first = true;
    for (const uint32_t codePoint : view1)
    {
        const bool result = first ? TfIsUtf8CodePointXidStart(codePoint)
                                  : TfIsUtf8CodePointXidContinue(codePoint);
        TF_AXIOM(result);

        first = false;
    }

    // s2 should start with XID_Start, have three characters that are
    // XID_Continue, then one that isn't in either
    size_t count = 0;
    for (const uint32_t codePoint : view2)
    {
        if (count == 0)
        {
            TF_AXIOM(TfIsUtf8CodePointXidStart(codePoint));
        }
        else if (count == 4)
        {
            TF_AXIOM(!TfIsUtf8CodePointXidContinue(codePoint));
        }
        else
        {
            TF_AXIOM(TfIsUtf8CodePointXidContinue(codePoint));
        }

        count++;
    }

    // s3 should have all XID_Start characters in the first set
    // (before the "-") and all invalid characters after
    for (const uint32_t codePoint : view3)
    {
        TF_AXIOM(TfIsUtf8CodePointXidStart(codePoint));
    }
    for (const uint32_t codePoint : view4)
    {
        TF_AXIOM(!TfIsUtf8CodePointXidContinue(codePoint));
    }

    // test uint32_t max, which should overflow the number of code points
    // and make sure it returns invalid
    TF_AXIOM(!TfIsUtf8CodePointXidStart(
        std::numeric_limits<uint32_t>::max()));
    TF_AXIOM(!TfIsUtf8CodePointXidContinue(
        std::numeric_limits<uint32_t>::max()));

    // also TF_MAX_CODE_POINT is our upper limit, and should be an invalid
    // code point due to its being reserved
    TF_AXIOM(!TfIsUtf8CodePointXidStart(TF_MAX_CODE_POINT));
    TF_AXIOM(!TfIsUtf8CodePointXidContinue(TF_MAX_CODE_POINT));

    return true;
}

static bool
Test_TfUnicodeUtils()
{
    // Run all unicode utility sub-tests. The text as scraped contained an
    // unconditional early return that made the second sub-test unreachable
    // dead code; a single combined return runs both.
    return TestUtf8CodePointView() && TestCharacterClasses();
}

TF_ADD_REGTEST(TfUnicodeUtils);
46 changes: 46 additions & 0 deletions pxr/base/tf/unicode/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Generating Character Classes from the Unicode Database

To properly process UTF-8 encoded strings, the system needs to know what code
points fall into what Unicode character class. This is useful for e.g.,
processing identifiers to determine whether the first character is represented
by a Unicode code point that falls into the `XidStart` Unicode character class.
Unicode defines a maximum of 17 * 2^16 code points, and we need an efficient
way of representing character class containment for each of these code points.
The chosen data structures of interest are implemented in the
`pxr/base/tf/unicode/unicodeCharacterClasses.template.cpp` file, but the code
points of interest for each character class must be generated from a source
version of the Unicode database.

This directory contains a script `tfGenCharacterClasses.py` that will read in
character class information from a source version of the Unicode database and
generate the `pxr/base/tf/unicodeCharacterClasses.cpp` file from the provided
`pxr/base/tf/unicode/unicodeCharacterClasses.template.cpp` file. The Unicode
database provides a post-processed file called `DerivedCoreProperties.txt` in
its core collateral. For the script to function, this file must be present
locally on disk (see below for information about where to obtain the source
Unicode character class data). Once the file is present locally, the
character classes can be generated via the following command:

```
# example run from the pxr/base/tf/unicode directory
python tfGenCharacterClasses.py \
    --srcDir <path/to/directory/containing/DerivedCoreProperties.txt> \
    --destDir .. --srcTemplate unicodeCharacterClasses.template.cpp
```

This command will overwrite the current
`pxr/base/tf/unicodeCharacterClasses.cpp` file with the newly generated
version.

**NOTE: This script need only be run once when upgrading to a new**
**Unicode version**

## Source Unicode Database

The Unicode Character Database consists of a set of files representing
Unicode character properties and can be found at https://unicode.org/ucd/
and the `DerivedCoreProperties.txt` file can be obtained in the `ucd`
directory of the collateral at whatever version you are interested in
supporting.

The current version of `pxr/base/tf/unicodeCharacterClasses.cpp`
was generated from `DerivedCoreProperties.txt` for Unicode Version 15.1.0.
164 changes: 164 additions & 0 deletions pxr/base/tf/unicode/tfGenCharacterClasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/usr/bin/env python
#
# Copyright 2023 Pixar
#
# Licensed under the Apache License, Version 2.0 (the "Apache License")
# with the following modification; you may not use this file except in
# compliance with the Apache License and the following modification to it:
# Section 6. Trademarks. is deleted and replaced with:
#
# 6. Trademarks. This License does not grant permission to use the trade
# names, trademarks, service marks, or product names of the Licensor
# and its affiliates, except as required to comply with Section 4(c) of
# the License and to reproduce the content of the NOTICE file.
#
# You may obtain a copy of the Apache License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the Apache License with the above modification is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the Apache License for the specific
# language governing permissions and limitations under the Apache License.
#
# A script for generating the character class sets for XID_Start and
# XID_Continue character classes. This takes a source
# DerivedCoreProperties.txt file from the Unicode standard and generates
# C++ source files that populate data structures with the appropriate
# code points.
'''This script reads the DerivedCoreProperties.txt from a versioned set of
Unicode collateral and generates appropriate data structures for the XID_Start
and XID_Continue character classes used to process UTF-8 encoded strings in
the Tf library.'''

import os

from argparse import ArgumentParser

# Name of the source Unicode data file this script consumes.
DERIVEDCOREPROPERTIES_FILE = "DerivedCoreProperties.txt"
# Default name of the input template file and the name of the generated
# C++ output file, respectively.
TEMPLATE_FILE_NAME = "unicodeCharacterClasses.template.cpp"
CPP_FILE_NAME = "unicodeCharacterClasses.cpp"

# Inclusive (start, end) code point range pairs collected from the
# "; XID_Start" / "; XID_Continue" entries of DerivedCoreProperties.txt.
xid_start_range_pairs = []
xid_continue_range_pairs = []

def _write_cpp_file(source_template_path : str, destination_directory : str):
    """
    Writes the C++ code file that will initialize character class
    sets with the values read by this script.

    The generated file is produced by substituting the markers
    {xid_start_ranges} and {xid_continue_ranges} in the template with
    brace-initializer lists built from the module-level range pair lists.

    Args:
        source_template_path : A string defining the path at which the source
                               template file exists.
        destination_directory: A string defining the path at which the
                               generated cpp file will be written to.
                               If the specified directory does not exist,
                               it will be created (including intermediate
                               directories).

    Raises:
        ValueError: If the source template file does not exist.
    """
    if not os.path.exists(source_template_path):
        # implicit string concatenation keeps the message on one line
        # without embedding source indentation in it
        raise ValueError("Provided source template file "
                         f"{source_template_path} does not exist!")

    with open(source_template_path, 'r') as source_template_file:
        source_template_content = source_template_file.read()

    # makedirs (not mkdir) so that a nested destination path is created
    # in full, as the docstring promises
    os.makedirs(destination_directory, exist_ok=True)

    def _format_ranges(range_pairs):
        # Builds the C++ brace-initializer text for a list of
        # (start, end) code point range pairs.
        expression = "ranges = {\n"
        for start, end in range_pairs:
            range_expression = "{" + str(start) + ", " + str(end) + "}"
            expression += f"        {range_expression},\n"
        expression += "    };"
        return expression

    generated_cpp_file_name = os.path.join(destination_directory,
                                           CPP_FILE_NAME)
    with open(generated_cpp_file_name, 'w') as generated_cpp_file:
        # we need to replace two markers, {xid_start_ranges}
        # and {xid_continue_ranges} with the content we derived
        # from DerivedCoreProperties.txt
        destination_template_content = source_template_content.replace(
            r"{xid_start_ranges}", _format_ranges(xid_start_range_pairs))
        destination_template_content = destination_template_content.replace(
            r"{xid_continue_ranges}", _format_ranges(xid_continue_range_pairs))

        generated_cpp_file.write(destination_template_content)

def _parseArguments():
    """
    Parses the arguments sent to the script.

    Returns:
        An object containing the parsed arguments as accessible fields
        (srcDir, destDir, and srcTemplate).
    """
    parser = ArgumentParser(
        description='Generate character class sets for Unicode characters.')
    # all arguments are optional; by default the script reads
    # DerivedCoreProperties.txt from, and writes the generated file to,
    # the current working directory
    parser.add_argument('--srcDir', required=False, default=os.getcwd(),
        help='The source directory where the DerivedCoreProperties.txt \
file exists.')
    parser.add_argument('--destDir', required=False, default=os.getcwd(),
        help='The destination directory where the processed cpp file will \
be written to.')
    parser.add_argument("--srcTemplate", required=False,
        default=os.path.join(os.getcwd(), TEMPLATE_FILE_NAME),
        help='The full path to the source template file to use.')

    return parser.parse_args()

if __name__ == '__main__':
arguments = _parseArguments()

# parse the DerivedCoreProperties.txt file
# sections of that file contain the derived properties XID_Start
# and XID_Continue based on the allowed character classes and code points
# sourced from UnicodeData.txt each line in the file that we are interested
# in is of one of two forms:
# codePoint ; XID_Start # character class Character Name
# codePointRangeStart..codePointRangeEnd ; XID_Start
# # character class [# of elements in range] Character Name
file_name = os.path.join(arguments.srcDir, DERIVEDCOREPROPERTIES_FILE)
if not os.path.exists(file_name):
raise RuntimeError(f"Error in script: Could not find \
'DerivedCoreProperties.txt' at path {arguments.srcDir}!")

with open(file_name, 'r') as file:
for line in file:
if "; XID_Start" in line:
# this is an XID_Start single code point or range
tokens = line.split(';')
code_points = tokens[0].strip()
if ".." in code_points:
# this is a ranged code point
code_point_ranges = code_points.split("..")
start_code_point = int(code_point_ranges[0], 16)
end_code_point = int(code_point_ranges[1], 16)
else:
# this is a single code point
start_code_point = int(code_points, 16)
end_code_point = start_code_point

xid_start_range_pairs.append((start_code_point,
end_code_point))
elif "; XID_Continue" in line:
# this is an XID_Continue single code point or range
tokens = line.split(';')
code_points = tokens[0].strip()
if ".." in code_points:
# this is a ranged code point
code_point_ranges = code_points.split("..")
start_code_point = int(code_point_ranges[0], 16)
end_code_point = int(code_point_ranges[1], 16)
else:
# this is a single code point
start_code_point = int(code_points, 16)
end_code_point = start_code_point

xid_continue_range_pairs.append((start_code_point,
end_code_point))

_write_cpp_file(arguments.srcTemplate, arguments.destDir)
Loading

0 comments on commit c0e2930

Please sign in to comment.