Skip to content

Commit

Permalink
Fix splitting utf8 string into words (#385)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Oct 25, 2023
1 parent 1249710 commit 6e5efa4
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 90 deletions.
4 changes: 1 addition & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx)

set(SHERPA_ONNX_VERSION "1.8.4")
set(SHERPA_ONNX_VERSION "1.8.5")

# Disable warning about
#
Expand Down Expand Up @@ -175,8 +175,6 @@ if(SHERPA_ONNX_ENABLE_WEBSOCKET)
include(asio)
endif()

include(utfcpp)

add_subdirectory(sherpa-onnx)

if(SHERPA_ONNX_ENABLE_C_API)
Expand Down
45 changes: 0 additions & 45 deletions cmake/utfcpp.cmake

This file was deleted.

67 changes: 25 additions & 42 deletions sherpa-onnx/csrc/text-utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include <utility>
#include <vector>

#include "source/utf8.h"
#include "sherpa-onnx/csrc/macros.h"

// This file is copied/modified from
// https://github.com/kaldi-asr/kaldi/blob/master/src/util/text-utils.cc
Expand Down Expand Up @@ -163,56 +163,39 @@ template bool SplitStringToFloats(const std::string &full, const char *delim,
std::vector<double> *out);

// Splits a UTF-8 encoded string into its individual characters.
//
// Each element of the returned vector holds the bytes of exactly one
// character: a single byte for ASCII, or a complete 2-, 3-, or 4-byte
// sequence otherwise. Whitespace and punctuation are returned like any other
// character; nothing is merged or dropped.
//
// Malformed input is handled one byte at a time: a byte that cannot start a
// character (a stray continuation byte, or a lead byte announcing more than
// 4 bytes), a sequence that is cut off by the end of the string, or a lead
// byte that is not followed by the required continuation bytes is reported
// via SHERPA_ONNX_LOGE and skipped.
//
// Note: overlong encodings and surrogate code points are not rejected; only
// the byte-level structure of each sequence is checked.
//
// @param text  The input string, expected to be UTF-8 encoded.
// @return The characters of `text` in their original order. Empty if `text`
//         is empty.
std::vector<std::string> SplitUtf8(const std::string &text) {
  const uint8_t *begin = reinterpret_cast<const uint8_t *>(text.c_str());
  const uint8_t *end = begin + text.size();

  std::vector<std::string> ans;

  const uint8_t *start = begin;
  while (start < end) {
    const uint8_t c = *start;

    // Count the leading 1 bits of the first byte, see
    // https://en.wikipedia.org/wiki/UTF-8
    //   0xxxxxxx -> 0  (ASCII, one byte)
    //   10xxxxxx -> 1  (continuation byte, cannot start a character)
    //   110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4  (sequence length)
    int32_t num_bytes = 0;
    for (uint8_t mask = 0x80; c & mask; mask >>= 1) {
      ++num_bytes;
    }

    if (num_bytes == 0) {
      // this is an ascii
      ans.emplace_back(reinterpret_cast<const char *>(start), 1);
      ++start;
      continue;
    }

    // The sequence must have a legal length and must fit entirely inside the
    // string; otherwise constructing the token would read past the buffer.
    bool valid = 2 <= num_bytes && num_bytes <= 4 && num_bytes <= end - start;

    // Every byte after the lead byte must look like 10xxxxxx. Without this
    // check a bad lead byte would swallow the characters that follow it.
    for (int32_t k = 1; valid && k < num_bytes; ++k) {
      valid = (start[k] & 0xC0) == 0x80;
    }

    if (valid) {
      ans.emplace_back(reinterpret_cast<const char *>(start), num_bytes);
      start += num_bytes;
    } else {
      SHERPA_ONNX_LOGE("Invalid byte at position: %d",
                       static_cast<int32_t>(start - begin));
      // skip this byte
      ++start;
    }
  }

  return ans;
}

} // namespace sherpa_onnx

0 comments on commit 6e5efa4

Please sign in to comment.