Skip to content

Commit

Permalink
Support for UTF-8 Character Class Processing
Browse files Browse the repository at this point in the history
- Added UTF-8 utility functions to Tf to determine whether
  a code point is in the XID_Start / XID_Continue character classes
- Added static data for holding code point flags for XID_Start /
  XID_Continue
- Added tests for XID_Start / XID_Continue code point validity
- Added pre-processing script to generate character class ranges
  for XID_Start / XID_Continue from source DerivedCoreProperties.txt
  • Loading branch information
erslavin committed Dec 6, 2023
1 parent 0e11364 commit c0e2930
Show file tree
Hide file tree
Showing 8 changed files with 2,842 additions and 1 deletion.
1 change: 1 addition & 0 deletions pxr/base/tf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,7 @@ pxr_library(tf
atomicRenameUtil
debugCodes
noticeRegistry
unicodeCharacterClasses

PYTHON_PRIVATE_CLASSES
pyErrorInternal
Expand Down
163 changes: 162 additions & 1 deletion pxr/base/tf/testenv/unicodeUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,171 @@ TestUtf8CodePointView()
return true;
}

static bool
TestCharacterClasses()
{
    // Validates TfIsUtf8CodePointXidStart / TfIsUtf8CodePointXidContinue
    // against representative code points from every Unicode character class
    // contributing to each derived property, against code points belonging
    // to neither property, and against boundary / overflow values.

    // a mix of code points that should fall into the following
    // character classes that make up XID_Start:
    // Lu | Ll | Lt | Lm | Lo | Nl
    const std::vector<uint32_t> xidStartCodePoints = {
        0x0043u,  // Latin capital letter C (Lu)
        0x006Au,  // Latin small letter j (Ll)
        0x0254u,  // Latin small letter Open o (Ll)
        0x01C6u,  // Latin small letter DZ with Caron (Ll)
        0x01CBu,  // Latin capital letter N with small letter j (Lt)
        0x02B3u,  // Modifier letter small r (Lm)
        0x10464u, // Shavian letter Loll (Lo)
        0x132B5u, // Egyptian hieroglyph R0004 (Lo)
        0x12421u, // Cuneiform numeric sign four geshu (Nl)
        0xFDABu,  // Arabic Ligature seen with Khah
                  // with Alef Maksura Final Form (Lo)
        0x18966u, // Tangut Component-359 (Lo)
        0x10144u, // Greek acrophonic attic fifty (Nl)
        0x037Fu,  // Greek capital letter YOT (Lu)
                  // [test singular code point range]
        0x2F800u, // CJK Compatibility Ideograph-2F800 (Lo)
                  // [test start range]
        0x3134Au, // CJK Ideograph Extension G Last (Lo)
                  // [test end range]
    };

    // a mix of code points that should fall into the following
    // character classes that make up XID_Continue
    // XID_Start | Nd | Mn | Mc | Pc
    const std::vector<uint32_t> xidContinueCodePoints = {
        0x0032u,  // Digit two (Nd)
        0x0668u,  // Arabic-Indic Digit Eight (Nd)
        0x07C0u,  // NKO Digit Zero (Nd)
        0x1E145u, // Nyiakeng Puachue Hmong Digit Five (Nd)
        0x0300u,  // Combining Grave Accent (Mn)
        0x2CEFu,  // Coptic Combining NI Above (Mn)
        0x10A02u, // Kharoshthi Vowel Sign U (Mn)
        0x16F92u, // Miao Tone Below (Mn)
        0x0903u,  // Devanagari Sign Visarga (Mc)
        0x16F55u, // Miao Vowel Sign AA (Mc)
        0x1D172u, // Musical Symbol Combining Flag-5 (Mc)
        0x203Fu,  // Undertie (Pc)
        0x005Fu,  // Low line (underscore) (Pc)
        0xFE4Fu,  // Wavy Low Line (Pc)
        0x05BFu,  // Hebrew Point Rafe (Mn) [test singular code point range]
        0x1E2ECu, // Wancho Tone Tup (Mn) [test start range]
        0xE01EFu, // Variation Selector-256 (Mn) [test end range]
    };

    // code points that shouldn't fall into either XID_Start
    // or XID_Continue
    const std::vector<uint32_t> invalidCodePoints = {
        0x002Du,  // Hyphen-Minus (Pd)
        0x00ABu,  // Left-Pointing Double Angle Quotation Mark (Pi)
        0x2019u,  // Right Single Quotation Mark (Pf)
        0x2021u,  // Double Dagger (Po)
        0x1ECB0u, // Indic Siyaq Rupee Mark (Sc)
        0x0020u,  // Space (Zs)
        0x3000u,  // Ideographic Space (Zs)
        0x000Bu,  // Line tabulation (Cc)
        0xF8FEu,  // Private Use (Co)
    };

    for (const uint32_t codePoint : xidStartCodePoints)
    {
        TF_AXIOM(TfIsUtf8CodePointXidStart(codePoint));

        // XID_Continue sets contain XID_Start
        TF_AXIOM(TfIsUtf8CodePointXidContinue(codePoint));
    }

    for (const uint32_t codePoint : xidContinueCodePoints)
    {
        TF_AXIOM(TfIsUtf8CodePointXidContinue(codePoint));
    }

    for (const uint32_t codePoint : invalidCodePoints)
    {
        TF_AXIOM(!TfIsUtf8CodePointXidStart(codePoint));
        TF_AXIOM(!TfIsUtf8CodePointXidContinue(codePoint));
    }

    // now test some strings with some characters from each of these sets
    // such that we can exercise the iterator converting from UTF-8 char
    // to code point
    std::string s1 = "ⅈ75_hgòð㤻";
    std::string s2 = "㤼01৪∫";
    std::string s3 = "㤻üaf-∫⁇…🔗";
    std::string s3_1 = s3.substr(0, s3.find("-"));
    std::string s3_2 = s3.substr(s3.find("-"));
    std::string_view sv1 {s1};
    std::string_view sv2 {s2};
    std::string_view sv3 {s3_1};
    std::string_view sv4 {s3_2};

    TfUtf8CodePointView view1 {sv1};
    TfUtf8CodePointView view2 {sv2};
    TfUtf8CodePointView view3 {sv3};
    TfUtf8CodePointView view4 {sv4};

    // s1 should start with XID_Start and then have XID_Continue
    bool first = true;
    for (const uint32_t codePoint : view1)
    {
        const bool result = first ? TfIsUtf8CodePointXidStart(codePoint)
                                  : TfIsUtf8CodePointXidContinue(codePoint);
        TF_AXIOM(result);

        first = false;
    }

    // s2 should start with XID_Start, have three characters that are
    // XID_Continue, then one that isn't in either
    size_t count = 0;
    for (const uint32_t codePoint : view2)
    {
        if (count == 0)
        {
            TF_AXIOM(TfIsUtf8CodePointXidStart(codePoint));
        }
        else if (count == 4)
        {
            TF_AXIOM(!TfIsUtf8CodePointXidContinue(codePoint));
        }
        else
        {
            TF_AXIOM(TfIsUtf8CodePointXidContinue(codePoint));
        }

        count++;
    }

    // s3 should have all XID_Start characters in the first set
    // (before the "-") and all invalid characters after
    for (const uint32_t codePoint : view3)
    {
        TF_AXIOM(TfIsUtf8CodePointXidStart(codePoint));
    }
    for (const uint32_t codePoint : view4)
    {
        TF_AXIOM(!TfIsUtf8CodePointXidContinue(codePoint));
    }

    // test uint32_t max, which should overflow the number of code points
    // and make sure it returns invalid
    TF_AXIOM(!TfIsUtf8CodePointXidStart(
        std::numeric_limits<uint32_t>::max()));
    TF_AXIOM(!TfIsUtf8CodePointXidContinue(
        std::numeric_limits<uint32_t>::max()));

    // also TF_MAX_CODE_POINT is our upper limit, and should be an invalid
    // code point due to its being reserved
    TF_AXIOM(!TfIsUtf8CodePointXidStart(TF_MAX_CODE_POINT));
    TF_AXIOM(!TfIsUtf8CodePointXidContinue(TF_MAX_CODE_POINT));

    return true;
}

static bool
Test_TfUnicodeUtils()
{
    // Run all unicode utility sub-tests. The text as scraped contained an
    // unconditional early return that made the second sub-test unreachable
    // dead code; a single combined return runs both.
    return TestUtf8CodePointView() && TestCharacterClasses();
}

TF_ADD_REGTEST(TfUnicodeUtils);
46 changes: 46 additions & 0 deletions pxr/base/tf/unicode/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Generating Character Classes from the Unicode Database

To properly process UTF-8 encoded strings, the system needs to know what code
points fall into what Unicode character class. This is useful for e.g.,
processing identifiers to determine whether the first character is represented
by a Unicode code point that falls into the `XidStart` Unicode character class.
Unicode defines a maximum of 17 * 2^16 code points, and we need an efficient
way of representing character class containment for each of these code points.
The chosen data structures of interest are implemented in the
`pxr/base/tf/unicode/unicodeCharacterClasses.template.cpp` file, but the code
points of interest for each character class must be generated from a source
version of the Unicode database.

This directory contains a script `tfGenCharacterClasses.py` that will read in
character class information from a source version of the Unicode database and
generate the `pxr/base/tf/unicodeCharacterClasses.cpp` file from the provided
`pxr/base/tf/unicode/unicodeCharacterClasses.template.cpp` file. The Unicode
database provides a post-processed file called `DerivedCoreProperties.txt` in
its core collateral. For the script to function, this file must be present
locally on disk (see below for information about where to obtain the source
Unicode character class data). Once the file is present locally, the
character classes can be generated via the following command:

```
# example run from the pxr/base/tf/unicode directory
python tfGenCharacterClasses.py \
    --srcDir <path/to/directory/containing/DerivedCoreProperties.txt> \
    --destDir .. --srcTemplate unicodeCharacterClasses.template.cpp
```

This command will overwrite the current
`pxr/base/tf/unicodeCharacterClasses.cpp` file with the newly generated
version.

**NOTE: This script need only be run once when upgrading to a new**
**Unicode version**

## Source Unicode Database

The Unicode Character Database consists of a set of files representing
Unicode character properties and can be found at https://unicode.org/ucd/
and the `DerivedCoreProperties.txt` file can be obtained in the `ucd`
directory of the collateral at whatever version you are interested in
supporting.

The current version of `pxr/base/tf/unicodeCharacterClasses.cpp`
was generated from `DerivedCoreProperties.txt` for Unicode Version 15.1.0.
164 changes: 164 additions & 0 deletions pxr/base/tf/unicode/tfGenCharacterClasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/usr/bin/env python
#
# Copyright 2023 Pixar
#
# Licensed under the Apache License, Version 2.0 (the "Apache License")
# with the following modification; you may not use this file except in
# compliance with the Apache License and the following modification to it:
# Section 6. Trademarks. is deleted and replaced with:
#
# 6. Trademarks. This License does not grant permission to use the trade
# names, trademarks, service marks, or product names of the Licensor
# and its affiliates, except as required to comply with Section 4(c) of
# the License and to reproduce the content of the NOTICE file.
#
# You may obtain a copy of the Apache License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the Apache License with the above modification is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the Apache License for the specific
# language governing permissions and limitations under the Apache License.
#
# A script for generating the character class sets for XID_Start and
# XID_Continue character classes. This takes a source
# DerivedCoreProperties.txt file from the Unicode standard and generates
# C++ source files that populate data structures with the appropriate
# code points.
'''This script reads the DerivedCoreProperties.txt from a versioned set of
Unicode collateral and generates appropriate data structures for the XID_Start
and XID_Continue character classes used to process UTF-8 encoded strings in
the Tf library.'''

import os

from argparse import ArgumentParser

# Name of the source Unicode data file this script consumes.
DERIVEDCOREPROPERTIES_FILE = "DerivedCoreProperties.txt"
# Default name of the input template file and the name of the generated
# C++ output file, respectively.
TEMPLATE_FILE_NAME = "unicodeCharacterClasses.template.cpp"
CPP_FILE_NAME = "unicodeCharacterClasses.cpp"

# Inclusive (start, end) code point range pairs collected from the
# "; XID_Start" / "; XID_Continue" entries of DerivedCoreProperties.txt.
xid_start_range_pairs = []
xid_continue_range_pairs = []

def _write_cpp_file(source_template_path : str, destination_directory : str):
    """
    Writes the C++ code file that will initialize character class
    sets with the values read by this script.

    The generated file is produced by substituting the markers
    {xid_start_ranges} and {xid_continue_ranges} in the template with
    brace-initializer lists built from the module-level range pair lists.

    Args:
        source_template_path : A string defining the path at which the source
                               template file exists.
        destination_directory: A string defining the path at which the
                               generated cpp file will be written to.
                               If the specified directory does not exist,
                               it will be created (including intermediate
                               directories).

    Raises:
        ValueError: If the source template file does not exist.
    """
    if not os.path.exists(source_template_path):
        # implicit string concatenation keeps the message on one line
        # without embedding source indentation in it
        raise ValueError("Provided source template file "
                         f"{source_template_path} does not exist!")

    with open(source_template_path, 'r') as source_template_file:
        source_template_content = source_template_file.read()

    # makedirs (not mkdir) so that a nested destination path is created
    # in full, as the docstring promises
    os.makedirs(destination_directory, exist_ok=True)

    def _format_ranges(range_pairs):
        # Builds the C++ brace-initializer text for a list of
        # (start, end) code point range pairs.
        expression = "ranges = {\n"
        for start, end in range_pairs:
            range_expression = "{" + str(start) + ", " + str(end) + "}"
            expression += f"        {range_expression},\n"
        expression += "    };"
        return expression

    generated_cpp_file_name = os.path.join(destination_directory,
                                           CPP_FILE_NAME)
    with open(generated_cpp_file_name, 'w') as generated_cpp_file:
        # we need to replace two markers, {xid_start_ranges}
        # and {xid_continue_ranges} with the content we derived
        # from DerivedCoreProperties.txt
        destination_template_content = source_template_content.replace(
            r"{xid_start_ranges}", _format_ranges(xid_start_range_pairs))
        destination_template_content = destination_template_content.replace(
            r"{xid_continue_ranges}", _format_ranges(xid_continue_range_pairs))

        generated_cpp_file.write(destination_template_content)

def _parseArguments():
    """
    Parses the arguments sent to the script.

    Returns:
        An object containing the parsed arguments as accessible fields
        (srcDir, destDir, and srcTemplate).
    """
    parser = ArgumentParser(
        description='Generate character class sets for Unicode characters.')
    # all arguments are optional; by default the script reads
    # DerivedCoreProperties.txt from, and writes the generated file to,
    # the current working directory
    parser.add_argument('--srcDir', required=False, default=os.getcwd(),
        help='The source directory where the DerivedCoreProperties.txt \
file exists.')
    parser.add_argument('--destDir', required=False, default=os.getcwd(),
        help='The destination directory where the processed cpp file will \
be written to.')
    parser.add_argument("--srcTemplate", required=False,
        default=os.path.join(os.getcwd(), TEMPLATE_FILE_NAME),
        help='The full path to the source template file to use.')

    return parser.parse_args()

if __name__ == '__main__':
arguments = _parseArguments()

# parse the DerivedCoreProperties.txt file
# sections of that file contain the derived properties XID_Start
# and XID_Continue based on the allowed character classes and code points
# sourced from UnicodeData.txt each line in the file that we are interested
# in is of one of two forms:
# codePoint ; XID_Start # character class Character Name
# codePointRangeStart..codePointRangeEnd ; XID_Start
# # character class [# of elements in range] Character Name
file_name = os.path.join(arguments.srcDir, DERIVEDCOREPROPERTIES_FILE)
if not os.path.exists(file_name):
raise RuntimeError(f"Error in script: Could not find \
'DerivedCoreProperties.txt' at path {arguments.srcDir}!")

with open(file_name, 'r') as file:
for line in file:
if "; XID_Start" in line:
# this is an XID_Start single code point or range
tokens = line.split(';')
code_points = tokens[0].strip()
if ".." in code_points:
# this is a ranged code point
code_point_ranges = code_points.split("..")
start_code_point = int(code_point_ranges[0], 16)
end_code_point = int(code_point_ranges[1], 16)
else:
# this is a single code point
start_code_point = int(code_points, 16)
end_code_point = start_code_point

xid_start_range_pairs.append((start_code_point,
end_code_point))
elif "; XID_Continue" in line:
# this is an XID_Continue single code point or range
tokens = line.split(';')
code_points = tokens[0].strip()
if ".." in code_points:
# this is a ranged code point
code_point_ranges = code_points.split("..")
start_code_point = int(code_point_ranges[0], 16)
end_code_point = int(code_point_ranges[1], 16)
else:
# this is a single code point
start_code_point = int(code_points, 16)
end_code_point = start_code_point

xid_continue_range_pairs.append((start_code_point,
end_code_point))

_write_cpp_file(arguments.srcTemplate, arguments.destDir)
Loading

0 comments on commit c0e2930

Please sign in to comment.