From 6e5efa48c553ab5a10a10d06faf7aba7226b6737 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 25 Oct 2023 11:49:27 +0800 Subject: [PATCH] Fix splitting utf8 string into words (#385) --- CMakeLists.txt | 4 +- cmake/utfcpp.cmake | 45 ----------------------- sherpa-onnx/csrc/text-utils.cc | 67 +++++++++++++--------------------- 3 files changed, 26 insertions(+), 90 deletions(-) delete mode 100644 cmake/utfcpp.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e15bd1a2..d88cd9baa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) project(sherpa-onnx) -set(SHERPA_ONNX_VERSION "1.8.4") +set(SHERPA_ONNX_VERSION "1.8.5") # Disable warning about # @@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET) include(asio) endif() -include(utfcpp) - add_subdirectory(sherpa-onnx) if(SHERPA_ONNX_ENABLE_C_API) diff --git a/cmake/utfcpp.cmake b/cmake/utfcpp.cmake deleted file mode 100644 index 1dc724374..000000000 --- a/cmake/utfcpp.cmake +++ /dev/null @@ -1,45 +0,0 @@ -function(download_utfcpp) - include(FetchContent) - - set(utfcpp_URL "https://github.com/nemtrif/utfcpp/archive/refs/tags/v3.2.5.tar.gz") - set(utfcpp_URL2 "https://huggingface.co/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/utfcpp-3.2.5.tar.gz") - set(utfcpp_HASH "SHA256=14fd1b3c466814cb4c40771b7f207b61d2c7a0aa6a5e620ca05c00df27f25afd") - - # If you don't have access to the Internet, - # please pre-download utfcpp - set(possible_file_locations - $ENV{HOME}/Downloads/utfcpp-3.2.5.tar.gz - ${PROJECT_SOURCE_DIR}/utfcpp-3.2.5.tar.gz - ${PROJECT_BINARY_DIR}/utfcpp-3.2.5.tar.gz - /tmp/utfcpp-3.2.5.tar.gz - /star-fj/fangjun/download/github/utfcpp-3.2.5.tar.gz - ) - - foreach(f IN LISTS possible_file_locations) - if(EXISTS ${f}) - set(utfcpp_URL "${f}") - file(TO_CMAKE_PATH "${utfcpp_URL}" utfcpp_URL) - message(STATUS "Found local downloaded utfcpp: ${utfcpp_URL}") - set(utfcpp_URL2) - break() - endif() - endforeach() - - FetchContent_Declare(utfcpp - URL - ${utfcpp_URL} - ${utfcpp_URL2} - URL_HASH ${utfcpp_HASH} - ) - - FetchContent_GetProperties(utfcpp) - if(NOT utfcpp_POPULATED) - message(STATUS "Downloading utfcpp from ${utfcpp_URL}") - FetchContent_Populate(utfcpp) - endif() - message(STATUS "utfcpp is downloaded to ${utfcpp_SOURCE_DIR}") - # add_subdirectory(${utfcpp_SOURCE_DIR} ${utfcpp_BINARY_DIR} EXCLUDE_FROM_ALL) - include_directories(${utfcpp_SOURCE_DIR}) -endfunction() - -download_utfcpp() diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc index c08e857d9..3aefbf9ea 100644 --- a/sherpa-onnx/csrc/text-utils.cc +++ b/sherpa-onnx/csrc/text-utils.cc @@ -16,7 +16,7 @@ #include #include -#include "source/utf8.h" +#include "sherpa-onnx/csrc/macros.h" // This file is copied/modified from // https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc @@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim, std::vector *out); std::vector SplitUtf8(const std::string &text) { - char *begin = const_cast(text.c_str()); - char *end = begin + text.size(); + const uint8_t *begin = reinterpret_cast(text.c_str()); + const uint8_t *end = begin + text.size(); std::vector ans; - std::string buf; - while (begin < end) { - uint32_t code = utf8::next(begin, end); + auto start = begin; + while (start < end) { + uint8_t c = *start; + uint8_t i = 0x80; + int32_t num_bytes = 0; - // 1. is punctuation - if (std::ispunct(code)) { - if (!buf.empty()) { - ans.push_back(std::move(buf)); - } - - char s[5] = {0}; - utf8::append(code, s); - ans.push_back(s); - continue; - } - - // 2. is space - if (std::isspace(code)) { - if (!buf.empty()) { - ans.push_back(std::move(buf)); - } - continue; - } - - // 3. is alpha - if (std::isalpha(code)) { - buf.push_back(code); - continue; + // see + // https://en.wikipedia.org/wiki/UTF-8 + for (; c & i; i >>= 1) { + ++num_bytes; } - if (!buf.empty()) { - ans.push_back(std::move(buf)); + if (num_bytes == 0) { + // this is an ascii + ans.emplace_back(reinterpret_cast(start), 1); + ++start; + } else if (2 <= num_bytes && num_bytes <= 4) { + ans.emplace_back(reinterpret_cast(start), num_bytes); + start += num_bytes; + } else { + SHERPA_ONNX_LOGE("Invalid byte at position: %d", + static_cast(start - begin)); + // skip this byte + ++start; } - - // for others - - char s[5] = {0}; - utf8::append(code, s); - ans.push_back(s); - } - - if (!buf.empty()) { - ans.push_back(std::move(buf)); } return ans; } + } // namespace sherpa_onnx