Skip to content

Commit

Permalink
Add an iterator to tf to decode UTF-8 strings
Browse files Browse the repository at this point in the history
  • Loading branch information
erslavin authored and nvmkuruc committed Nov 9, 2023
1 parent dcbc1ca commit b83e289
Show file tree
Hide file tree
Showing 4 changed files with 612 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pxr/base/tf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ pxr_library(tf
type
typeFunctions
typeNotice
unicodeUtils
warning
weakBase
weakPtr
Expand Down Expand Up @@ -383,6 +384,7 @@ pxr_build_test(testTf
testenv/type.cpp
testenv/typeMultipleInheritance.cpp
testenv/typeInfoMap.cpp
testenv/unicodeUtils.cpp
testenv/weakPtr.cpp
)

Expand Down Expand Up @@ -646,6 +648,9 @@ pxr_register_test(TfTypeInfoMap
pxr_register_test(TfType_MultipleInheritance
COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfType_MultipleInheritance"
)
pxr_register_test(TfUnicodeUtils
COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfUnicodeUtils"
)
pxr_register_test(TfWeakPtr
COMMAND "${CMAKE_INSTALL_PREFIX}/tests/testTf TfWeakPtr"
)
Expand Down
110 changes: 110 additions & 0 deletions pxr/base/tf/testenv/unicodeUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
//
// Copyright 2023 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
// names, trademarks, service marks, or product names of the Licensor
// and its affiliates, except as required to comply with Section 4(c) of
// the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.
//
#include "pxr/pxr.h"
#include "pxr/base/tf/diagnosticLite.h"
#include "pxr/base/tf/regTest.h"
#include "pxr/base/tf/unicodeUtils.h"

#include <string_view>

PXR_NAMESPACE_USING_DIRECTIVE

static bool
TestUtf8CodePointView()
{

{
TF_AXIOM(TfUnicodeUtils::Utf8CodePointView{}.empty());
}

// Exercise the iterator converting from UTF-8 char to code point
{
const std::string_view s1{"ⅈ75_hgòð㤻"};
TfUnicodeUtils::Utf8CodePointView u1{s1};
auto i1 = std::cbegin(u1);
TF_AXIOM(i1.GetBase() == s1.begin());
TF_AXIOM(*i1 == 8520);
std::advance(i1, 9);
TF_AXIOM(i1 == std::cend(u1));

for (const uint32_t codePoint : u1) {
TF_AXIOM(codePoint != TfUnicodeUtils::INVALID_CODE_POINT);
}
}

{
const std::string_view s2{"㤼01৪∫"};
TfUnicodeUtils::Utf8CodePointView u2{s2};
auto i2 = std::cbegin(u2);
TF_AXIOM(i2.GetBase() == s2.begin());
TF_AXIOM(*i2 == 14652);
std::advance(i2, 5);
TF_AXIOM(i2 == std::cend(u2));

for (const uint32_t codePoint : u2) {
TF_AXIOM(codePoint != TfUnicodeUtils::INVALID_CODE_POINT);
}
}

{
const std::string_view s3{"㤻üaf-∫⁇…🔗"};
TfUnicodeUtils::Utf8CodePointView u3{s3};
auto i3a = std::cbegin(u3);
auto i3b = std::cbegin(u3);

// The C++20 ranges version of find_if can be used with sentinels in
// C++20
for (; i3b != std::cend(u3); ++i3b) {
if (*(i3b.GetBase()) == '-') {
break;
}
}
TF_AXIOM(i3b != std::cend(u3));

// i3a should contain all characters before the "-"
TF_AXIOM(*i3a == 14651);
std::advance(i3a, 4);
TF_AXIOM(i3a == i3b);
TF_AXIOM(i3a.GetBase() == i3b.GetBase());

// i3b should include the "-" character
TF_AXIOM(*i3b == 45);
std::advance(i3b, 5);
TF_AXIOM(i3b == std::cend(u3));

for (const uint32_t codePoint : u3) {
TF_AXIOM(codePoint != TfUnicodeUtils::INVALID_CODE_POINT);
}

}
return true;
}

// Entry point of the TfUnicodeUtils regression test: succeeds exactly when
// the Utf8CodePointView checks succeed.
static bool
Test_TfUnicodeUtils()
{
    const bool codePointViewOk = TestUtf8CodePointView();
    return codePointViewOk;
}

// NOTE(review): presumably this macro ties the "TfUnicodeUtils" test name to
// Test_TfUnicodeUtils above -- confirm against pxr/base/tf/regTest.h.
TF_ADD_REGTEST(TfUnicodeUtils);
191 changes: 191 additions & 0 deletions pxr/base/tf/unicodeUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
//
// Copyright 2023 Pixar
//
// Licensed under the Apache License, Version 2.0 (the "Apache License")
// with the following modification; you may not use this file except in
// compliance with the Apache License and the following modification to it:
// Section 6. Trademarks. is deleted and replaced with:
//
// 6. Trademarks. This License does not grant permission to use the trade
// names, trademarks, service marks, or product names of the Licensor
// and its affiliates, except as required to comply with Section 4(c) of
// the License and to reproduce the content of the NOTICE file.
//
// You may obtain a copy of the Apache License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Apache License with the above modification is
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the Apache License for the specific
// language governing permissions and limitations under the Apache License.
//

#include "pxr/base/tf/diagnostic.h"
#include "pxr/base/tf/unicodeUtils.h"

PXR_NAMESPACE_OPEN_SCOPE

namespace TfUnicodeUtils {

uint32_t Utf8CodePointIterator::_GetCodePoint() const
{
// determine what encoding length the character is
_EncodingLength encodingLength = this->_GetEncodingLength();
if (encodingLength > std::distance(_it, _end)) {
// error condition, would read bytes past the end of the range
return INVALID_CODE_POINT;
}
if (encodingLength == 1)
{
return static_cast<uint32_t>(static_cast<unsigned char>(*_it));
}
auto begin = _it;
if (encodingLength == 2)
{
unsigned char byte1 = static_cast<unsigned char>(*begin);
unsigned char byte2 = static_cast<unsigned char>(*(++begin));

// ensure the ranges we expect, or it's not a valid character
if (byte1 < static_cast<unsigned char>('\xc2') ||
byte1 > static_cast<unsigned char>('\xdf'))
{
return INVALID_CODE_POINT;
}
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
}

// the code point is constructed from the last 5 bits of byte1
// and the last 6 bits of byte2
return ((byte1 & 0x1f) << 6) + (byte2 & 0x3f);
}
else if (encodingLength == 3)
{
unsigned char byte1 = static_cast<unsigned char>(*begin);
unsigned char byte2 = static_cast<unsigned char>(*(++begin));
unsigned char byte3 = static_cast<unsigned char>(*(++begin));

// ensure the ranges we expect, or it's not a valid character
if (byte1 == static_cast<unsigned char>('\xe0'))
{
// byte2 must be in range A0..BF
// byte3 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\xa0') ||
byte2 > static_cast<unsigned char>('\xbf') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
}
}
else if ((byte1 >= static_cast<unsigned char>('\xe1') &&
byte1 <= static_cast<unsigned char>('\xec')) ||
byte1 == static_cast<unsigned char>('\xee') ||
byte1 == static_cast<unsigned char>('\xef'))
{
// byte2 must be in range 80..BF
// byte3 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\xbf') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
}
}
else if (byte1 == static_cast<unsigned char>('\xed'))
{
// byte2 must be in range 80..9F
// byte3 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\x9f') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
}
}
else
{
// byte 1 invalid
return INVALID_CODE_POINT;
}

// code point is constructed from the last 4 bits of byte1
// and the last 6 bits of bytes 2 and 3
return ((byte1 & 0xf) << 12) + ((byte2 & 0x3f) << 6) +
(byte3 & 0x3f);
}
else if (encodingLength == 4)
{
unsigned char byte1 = static_cast<unsigned char>(*begin);
unsigned char byte2 = static_cast<unsigned char>(*(++begin));
unsigned char byte3 = static_cast<unsigned char>(*(++begin));
unsigned char byte4 = static_cast<unsigned char>(*(++begin));

if (byte1 == static_cast<unsigned char>('\xf0'))
{
// byte2 must be in range 90..BF
// byte3 must be in range 80..BF
// byte4 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x90') ||
byte2 > static_cast<unsigned char>('\xbf') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf') ||
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
}
}
else if (byte1 >= static_cast<unsigned char>('\xf1') &&
byte1 <= static_cast<unsigned char>('\xf3'))
{
// byte2 must be in range 80..BF
// byte3 must be in range 80..BF
// byte4 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\xbf') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf') ||
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
}
}
else if (byte1 == static_cast<unsigned char>('\xf4'))
{
// byte2 must be in range 80..8F
// byte3 must be in range 80..BF
// byte4 must be in range 80..BF
if (byte2 < static_cast<unsigned char>('\x80') ||
byte2 > static_cast<unsigned char>('\x8f') ||
byte3 < static_cast<unsigned char>('\x80') ||
byte3 > static_cast<unsigned char>('\xbf') ||
byte4 < static_cast<unsigned char>('\x80') ||
byte4 > static_cast<unsigned char>('\xbf'))
{
return INVALID_CODE_POINT;
}
}
else
{
// byte 1 is invalid
return INVALID_CODE_POINT;
}

// code point is constructed from the last 3 bits of byte 1
// and the last 6 bits of bytes 2, 3, and 4
return ((byte1 & 0x7) << 18) + ((byte2 & 0x3f) << 12) +
((byte3 & 0x3f) << 6) + (byte4 & 0x3f);
}
return INVALID_CODE_POINT;
}
} // end TfUnicodeUtils

PXR_NAMESPACE_CLOSE_SCOPE
Loading

0 comments on commit b83e289

Please sign in to comment.