-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add an iterator to tf to decode UTF-8 strings
- Loading branch information
Showing
4 changed files
with
612 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
// | ||
// Copyright 2023 Pixar | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "Apache License") | ||
// with the following modification; you may not use this file except in | ||
// compliance with the Apache License and the following modification to it: | ||
// Section 6. Trademarks. is deleted and replaced with: | ||
// | ||
// 6. Trademarks. This License does not grant permission to use the trade | ||
// names, trademarks, service marks, or product names of the Licensor | ||
// and its affiliates, except as required to comply with Section 4(c) of | ||
// the License and to reproduce the content of the NOTICE file. | ||
// | ||
// You may obtain a copy of the Apache License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the Apache License with the above modification is | ||
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the Apache License for the specific | ||
// language governing permissions and limitations under the Apache License. | ||
// | ||
#include "pxr/pxr.h" | ||
#include "pxr/base/tf/diagnosticLite.h" | ||
#include "pxr/base/tf/regTest.h" | ||
#include "pxr/base/tf/unicodeUtils.h" | ||
|
||
#include <string_view> | ||
|
||
PXR_NAMESPACE_USING_DIRECTIVE | ||
|
||
static bool | ||
TestUtf8CodePointView() | ||
{ | ||
|
||
{ | ||
TF_AXIOM(TfUnicodeUtils::Utf8CodePointView{}.empty()); | ||
} | ||
|
||
// Exercise the iterator converting from UTF-8 char to code point | ||
{ | ||
const std::string_view s1{"ⅈ75_hgòð㤻"}; | ||
TfUnicodeUtils::Utf8CodePointView u1{s1}; | ||
auto i1 = std::cbegin(u1); | ||
TF_AXIOM(i1.GetBase() == s1.begin()); | ||
TF_AXIOM(*i1 == 8520); | ||
std::advance(i1, 9); | ||
TF_AXIOM(i1 == std::cend(u1)); | ||
|
||
for (const uint32_t codePoint : u1) { | ||
TF_AXIOM(codePoint != TfUnicodeUtils::INVALID_CODE_POINT); | ||
} | ||
} | ||
|
||
{ | ||
const std::string_view s2{"㤼01৪∫"}; | ||
TfUnicodeUtils::Utf8CodePointView u2{s2}; | ||
auto i2 = std::cbegin(u2); | ||
TF_AXIOM(i2.GetBase() == s2.begin()); | ||
TF_AXIOM(*i2 == 14652); | ||
std::advance(i2, 5); | ||
TF_AXIOM(i2 == std::cend(u2)); | ||
|
||
for (const uint32_t codePoint : u2) { | ||
TF_AXIOM(codePoint != TfUnicodeUtils::INVALID_CODE_POINT); | ||
} | ||
} | ||
|
||
{ | ||
const std::string_view s3{"㤻üaf-∫⁇…🔗"}; | ||
TfUnicodeUtils::Utf8CodePointView u3{s3}; | ||
auto i3a = std::cbegin(u3); | ||
auto i3b = std::cbegin(u3); | ||
|
||
// The C++20 ranges version of find_if can be used with sentinels in | ||
// C++20 | ||
for (; i3b != std::cend(u3); ++i3b) { | ||
if (*(i3b.GetBase()) == '-') { | ||
break; | ||
} | ||
} | ||
TF_AXIOM(i3b != std::cend(u3)); | ||
|
||
// i3a should contain all characters before the "-" | ||
TF_AXIOM(*i3a == 14651); | ||
std::advance(i3a, 4); | ||
TF_AXIOM(i3a == i3b); | ||
TF_AXIOM(i3a.GetBase() == i3b.GetBase()); | ||
|
||
// i3b should include the "-" character | ||
TF_AXIOM(*i3b == 45); | ||
std::advance(i3b, 5); | ||
TF_AXIOM(i3b == std::cend(u3)); | ||
|
||
for (const uint32_t codePoint : u3) { | ||
TF_AXIOM(codePoint != TfUnicodeUtils::INVALID_CODE_POINT); | ||
} | ||
|
||
} | ||
return true; | ||
} | ||
|
||
static bool | ||
Test_TfUnicodeUtils() | ||
{ | ||
return TestUtf8CodePointView(); | ||
} | ||
|
||
TF_ADD_REGTEST(TfUnicodeUtils); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
// | ||
// Copyright 2023 Pixar | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "Apache License") | ||
// with the following modification; you may not use this file except in | ||
// compliance with the Apache License and the following modification to it: | ||
// Section 6. Trademarks. is deleted and replaced with: | ||
// | ||
// 6. Trademarks. This License does not grant permission to use the trade | ||
// names, trademarks, service marks, or product names of the Licensor | ||
// and its affiliates, except as required to comply with Section 4(c) of | ||
// the License and to reproduce the content of the NOTICE file. | ||
// | ||
// You may obtain a copy of the Apache License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the Apache License with the above modification is | ||
// distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the Apache License for the specific | ||
// language governing permissions and limitations under the Apache License. | ||
// | ||
|
||
#include "pxr/base/tf/diagnostic.h" | ||
#include "pxr/base/tf/unicodeUtils.h" | ||
|
||
PXR_NAMESPACE_OPEN_SCOPE | ||
|
||
namespace TfUnicodeUtils { | ||
|
||
uint32_t Utf8CodePointIterator::_GetCodePoint() const | ||
{ | ||
// determine what encoding length the character is | ||
_EncodingLength encodingLength = this->_GetEncodingLength(); | ||
if (encodingLength > std::distance(_it, _end)) { | ||
// error condition, would read bytes past the end of the range | ||
return INVALID_CODE_POINT; | ||
} | ||
if (encodingLength == 1) | ||
{ | ||
return static_cast<uint32_t>(static_cast<unsigned char>(*_it)); | ||
} | ||
auto begin = _it; | ||
if (encodingLength == 2) | ||
{ | ||
unsigned char byte1 = static_cast<unsigned char>(*begin); | ||
unsigned char byte2 = static_cast<unsigned char>(*(++begin)); | ||
|
||
// ensure the ranges we expect, or it's not a valid character | ||
if (byte1 < static_cast<unsigned char>('\xc2') || | ||
byte1 > static_cast<unsigned char>('\xdf')) | ||
{ | ||
return INVALID_CODE_POINT; | ||
} | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return INVALID_CODE_POINT; | ||
} | ||
|
||
// the code point is constructed from the last 5 bits of byte1 | ||
// and the last 6 bits of byte2 | ||
return ((byte1 & 0x1f) << 6) + (byte2 & 0x3f); | ||
} | ||
else if (encodingLength == 3) | ||
{ | ||
unsigned char byte1 = static_cast<unsigned char>(*begin); | ||
unsigned char byte2 = static_cast<unsigned char>(*(++begin)); | ||
unsigned char byte3 = static_cast<unsigned char>(*(++begin)); | ||
|
||
// ensure the ranges we expect, or it's not a valid character | ||
if (byte1 == static_cast<unsigned char>('\xe0')) | ||
{ | ||
// byte2 must be in range A0..BF | ||
// byte3 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\xa0') || | ||
byte2 > static_cast<unsigned char>('\xbf') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return INVALID_CODE_POINT; | ||
} | ||
} | ||
else if ((byte1 >= static_cast<unsigned char>('\xe1') && | ||
byte1 <= static_cast<unsigned char>('\xec')) || | ||
byte1 == static_cast<unsigned char>('\xee') || | ||
byte1 == static_cast<unsigned char>('\xef')) | ||
{ | ||
// byte2 must be in range 80..BF | ||
// byte3 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\xbf') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return INVALID_CODE_POINT; | ||
} | ||
} | ||
else if (byte1 == static_cast<unsigned char>('\xed')) | ||
{ | ||
// byte2 must be in range 80..9F | ||
// byte3 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\x9f') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return INVALID_CODE_POINT; | ||
} | ||
} | ||
else | ||
{ | ||
// byte 1 invalid | ||
return INVALID_CODE_POINT; | ||
} | ||
|
||
// code point is constructed from the last 4 bits of byte1 | ||
// and the last 6 bits of bytes 2 and 3 | ||
return ((byte1 & 0xf) << 12) + ((byte2 & 0x3f) << 6) + | ||
(byte3 & 0x3f); | ||
} | ||
else if (encodingLength == 4) | ||
{ | ||
unsigned char byte1 = static_cast<unsigned char>(*begin); | ||
unsigned char byte2 = static_cast<unsigned char>(*(++begin)); | ||
unsigned char byte3 = static_cast<unsigned char>(*(++begin)); | ||
unsigned char byte4 = static_cast<unsigned char>(*(++begin)); | ||
|
||
if (byte1 == static_cast<unsigned char>('\xf0')) | ||
{ | ||
// byte2 must be in range 90..BF | ||
// byte3 must be in range 80..BF | ||
// byte4 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x90') || | ||
byte2 > static_cast<unsigned char>('\xbf') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf') || | ||
byte4 < static_cast<unsigned char>('\x80') || | ||
byte4 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return INVALID_CODE_POINT; | ||
} | ||
} | ||
else if (byte1 >= static_cast<unsigned char>('\xf1') && | ||
byte1 <= static_cast<unsigned char>('\xf3')) | ||
{ | ||
// byte2 must be in range 80..BF | ||
// byte3 must be in range 80..BF | ||
// byte4 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\xbf') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf') || | ||
byte4 < static_cast<unsigned char>('\x80') || | ||
byte4 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return INVALID_CODE_POINT; | ||
} | ||
} | ||
else if (byte1 == static_cast<unsigned char>('\xf4')) | ||
{ | ||
// byte2 must be in range 80..8F | ||
// byte3 must be in range 80..BF | ||
// byte4 must be in range 80..BF | ||
if (byte2 < static_cast<unsigned char>('\x80') || | ||
byte2 > static_cast<unsigned char>('\x8f') || | ||
byte3 < static_cast<unsigned char>('\x80') || | ||
byte3 > static_cast<unsigned char>('\xbf') || | ||
byte4 < static_cast<unsigned char>('\x80') || | ||
byte4 > static_cast<unsigned char>('\xbf')) | ||
{ | ||
return INVALID_CODE_POINT; | ||
} | ||
} | ||
else | ||
{ | ||
// byte 1 is invalid | ||
return INVALID_CODE_POINT; | ||
} | ||
|
||
// code point is constructed from the last 3 bits of byte 1 | ||
// and the last 6 bits of bytes 2, 3, and 4 | ||
return ((byte1 & 0x7) << 18) + ((byte2 & 0x3f) << 12) + | ||
((byte3 & 0x3f) << 6) + (byte4 & 0x3f); | ||
} | ||
return INVALID_CODE_POINT; | ||
} | ||
} // end TfUnicodeUtils | ||
|
||
PXR_NAMESPACE_CLOSE_SCOPE |
Oops, something went wrong.