Skip to content

Commit

Permalink
feat(Foundation): Include utf8proc for enhanced Unicode support (#4710)
Browse files Browse the repository at this point in the history
  • Loading branch information
obiltschnig committed Sep 27, 2024
1 parent 6faf907 commit 92e0649
Show file tree
Hide file tree
Showing 10 changed files with 18,726 additions and 4 deletions.
7 changes: 6 additions & 1 deletion Foundation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ POCO_MESSAGES(SRCS Logging src/pocomsg.mc)
if(POCO_UNBUNDLED)
find_package(PCRE2 REQUIRED)
find_package(ZLIB REQUIRED)
find_package(Utf8Proc REQUIRED)

#HACK: Unicode.cpp requires functions from these files. They can't be taken from the library
POCO_SOURCES(SRCS RegExp
Expand Down Expand Up @@ -86,6 +87,10 @@ else()
src/trees.c
src/zutil.c
)

POCO_SOURCES(SRCS utf8proc
src/utf8proc.c
)
endif(POCO_UNBUNDLED)


Expand All @@ -99,7 +104,7 @@ set_target_properties(Foundation
)

if(POCO_UNBUNDLED)
target_link_libraries(Foundation PUBLIC Pcre2::Pcre2 ZLIB::ZLIB)
target_link_libraries(Foundation PUBLIC Pcre2::Pcre2 ZLIB::ZLIB Utf8Proc::Utf8Proc)
target_compile_definitions(Foundation PUBLIC POCO_UNBUNDLED)
endif(POCO_UNBUNDLED)

Expand Down
6 changes: 4 additions & 2 deletions Foundation/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@ pcre_objects = pcre2_auto_possess pcre2_chartables pcre2_compile pcre2_config \

pcre_utf8_objects = pcre2_ucd pcre2_tables

utf8proc_objects = utf8proc

ifdef POCO_UNBUNDLED
SYSLIBS += -lpcre2-8 -lz
SYSLIBS += -lpcre2-8 -lutf8proc -lz
objects += $(pcre_utf8_objects)
else
objects += $(zlib_objects) $(pcre_objects) $(pcre_utf8_objects)
objects += $(zlib_objects) $(pcre_objects) $(pcre_utf8_objects) $(utf8proc_objects)
endif

ifeq ($(findstring MinGW, $(POCO_CONFIG)), MinGW)
Expand Down
17 changes: 17 additions & 0 deletions Foundation/include/Poco/UTF8String.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ struct Foundation_API UTF8
/// removeBOM() removes the UTF-8 Byte Order Mark sequence (0xEF, 0xBB, 0xBF)
/// from the beginning of the given string, if it's there.
{
enum NormalizationForm
/// Unicode normalization form to be applied by normalize().
///
/// The form selects whether canonical or compatibility
/// decomposition is used, and whether the decomposition is
/// followed by canonical composition.
{
NORMALIZATION_FORM_D, /// Canonical Decomposition (commonly abbreviated NFD)
NORMALIZATION_FORM_C, /// Canonical Decomposition, followed by Canonical Composition (commonly abbreviated NFC)
NORMALIZATION_FORM_KD, /// Compatibility Decomposition (commonly abbreviated NFKD)
NORMALIZATION_FORM_KC /// Compatibility Decomposition, followed by Canonical Composition (commonly abbreviated NFKC)
};

static int icompare(const std::string& str, std::string::size_type pos, std::string::size_type n, std::string::const_iterator it2, std::string::const_iterator end2);
static int icompare(const std::string& str1, const std::string& str2);
static int icompare(const std::string& str1, std::string::size_type n1, const std::string& str2, std::string::size_type n2);
Expand Down Expand Up @@ -74,6 +83,14 @@ struct Foundation_API UTF8

static std::string unescape(const std::string::const_iterator& begin, const std::string::const_iterator& end);
/// Creates an UTF8 string from a string that contains escaped characters.

static std::string normalize(const std::string& s, NormalizationForm form);
/// Normalizes the given UTF-8 encoded string according to the given
/// normalization form.
///
/// Returns the normalized UTF-8 encoded string as a new string;
/// the given string is not modified.
///
/// Throws a Poco::RuntimeException if the string cannot be
/// normalized (the underlying decompose or re-encode step
/// reports an error).

static std::string normalize(const std::string::const_iterator& begin, const std::string::const_iterator& end, NormalizationForm form);
/// Normalizes the UTF-8 encoded string range [begin, end) according
/// to the given normalization form.
///
/// Returns the normalized UTF-8 encoded string as a new string.
///
/// Throws a Poco::RuntimeException if the range cannot be
/// normalized (the underlying decompose or re-encode step
/// reports an error).
};


Expand Down
58 changes: 58 additions & 0 deletions Foundation/src/UTF8String.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,17 @@
#include "Poco/UTF8Encoding.h"
#include "Poco/NumberFormatter.h"
#include "Poco/Ascii.h"
#include "Poco/Buffer.h"
#include "Poco/Exception.h"
#include <algorithm>
#include <iterator>


#if defined(POCO_UNBUNDLED)
#include <utf8proc.h>
#else
#include "utf8proc.h"
#endif


#if !defined(POCO_OS_FAMILY_WINDOWS)
Expand Down Expand Up @@ -410,4 +420,52 @@ std::string UTF8::unescape(const std::string::const_iterator& begin, const std::
}


namespace
{
std::string doNormalize(const char* str, std::size_t size, utf8proc_option_t options)
{
utf8proc_ssize_t n = utf8proc_decompose_custom(reinterpret_cast<const utf8proc_uint8_t*>(str), size, NULL, 0, options, NULL, NULL);
if (n < 0) throw Poco::RuntimeException("Normalization decompose failed"s, utf8proc_errmsg(n));

Poco::Buffer<utf8proc_int32_t> buffer(n + 1); // utf8proc_reencode() needs space for terminating NUL
n = utf8proc_decompose_custom(reinterpret_cast<const utf8proc_uint8_t*>(str), size, buffer.begin(), n, options, NULL, NULL);
if (n < 0) throw Poco::RuntimeException("Normalization decompose failed"s, utf8proc_errmsg(n));

n = utf8proc_reencode(buffer.begin(), n, options);
if (n < 0) throw Poco::RuntimeException("Normalization reeencode failed"s, utf8proc_errmsg(n));

return std::string(reinterpret_cast<char*>(buffer.begin()), n);
}

int formToOptions(UTF8::NormalizationForm form)
	/// Maps a UTF8::NormalizationForm to the corresponding combination
	/// of utf8proc option flags.
	///
	/// Returns 0 (no flags) for a value that is not one of the
	/// four known normalization forms.
{
	// Flag sets shared by the canonical and compatibility variants.
	const int decomposeFlags = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE;
	const int composeFlags = UTF8PROC_STABLE | UTF8PROC_COMPOSE;

	if (form == UTF8::NORMALIZATION_FORM_D) return decomposeFlags;
	if (form == UTF8::NORMALIZATION_FORM_C) return composeFlags;

	// The compatibility forms add UTF8PROC_COMPAT to the respective canonical flag set.
	if (form == UTF8::NORMALIZATION_FORM_KD) return decomposeFlags | UTF8PROC_COMPAT;
	if (form == UTF8::NORMALIZATION_FORM_KC) return composeFlags | UTF8PROC_COMPAT;

	return 0;
}
}


std::string UTF8::normalize(const std::string& s, NormalizationForm form)
{
return doNormalize(s.data(), s.size(), static_cast<utf8proc_option_t>(formToOptions(form)));
}


std::string UTF8::normalize(const std::string::const_iterator& begin, const std::string::const_iterator& end, NormalizationForm form)
{
	// An empty range has nothing to normalize. It must be handled up front,
	// because begin may then be a past-the-end iterator, and dereferencing
	// it to obtain a pointer would be undefined behavior.
	if (begin == end) return std::string();

	const std::size_t length = static_cast<std::size_t>(std::distance(begin, end));
	return doNormalize(&*begin, length, static_cast<utf8proc_option_t>(formToOptions(form)));
}


} // namespace Poco
Loading

0 comments on commit 92e0649

Please sign in to comment.