Skip to content

Commit

Permalink
Fix tokens processing for byte-level BPE (#333)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Sep 22, 2023
1 parent 969fff5 commit 43b2b77
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 15 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
project(sherpa-onnx)

# Project version string. This hunk shows both sides of a one-line change:
# the first assignment ("1.7.17") is the value being replaced, and the second
# ("1.7.18") is the new value. If both lines are evaluated, CMake keeps the
# last assignment.
set(SHERPA_ONNX_VERSION "1.7.17")
set(SHERPA_ONNX_VERSION "1.7.18")

# Disable warning about
#
Expand Down
51 changes: 38 additions & 13 deletions sherpa-onnx/csrc/offline-stream.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@

#include <algorithm>
#include <cmath>
#include <iomanip>

#include "kaldi-native-fbank/csrc/online-feature.h"
#include "nlohmann/json.hpp"
#include "sherpa-onnx/csrc/macros.h"
#include "sherpa-onnx/csrc/offline-recognizer.h"
#include "sherpa-onnx/csrc/resample.h"
Expand Down Expand Up @@ -256,25 +256,50 @@ const OfflineRecognitionResult &OfflineStream::GetResult() const {
return impl_->GetResult();
}
// Escapes `s` so it can be placed between double quotes in a JSON document
// (RFC 8259, section 7): `"` and `\` are backslash-escaped, and control
// characters below 0x20 are written as a short escape or as \u00XX.
// Bytes >= 0x80 are copied through unchanged; no UTF-8 validation is done.
static std::string EscapeJsonString(const std::string &s) {
  static const char kHexDigits[] = "0123456789abcdef";

  std::string ans;
  ans.reserve(s.size());

  for (char ch : s) {
    const auto c = static_cast<unsigned char>(ch);
    switch (c) {
      case '"':
        ans += "\\\"";
        break;
      case '\\':
        ans += "\\\\";
        break;
      case '\b':
        ans += "\\b";
        break;
      case '\f':
        ans += "\\f";
        break;
      case '\n':
        ans += "\\n";
        break;
      case '\r':
        ans += "\\r";
        break;
      case '\t':
        ans += "\\t";
        break;
      default:
        if (c < 0x20) {
          ans += "\\u00";
          ans += kHexDigits[c >> 4];
          ans += kHexDigits[c & 0x0f];
        } else {
          ans += ch;
        }
        break;
    }
  }

  return ans;
}

// Serializes this result as a JSON object with three keys:
//
//   "text":       the recognized text, as a JSON string
//   "timestamps": a JSON *string* holding a bracketed, comma-separated list
//                 of the timestamps, each printed with 2 decimal places,
//                 e.g. "[0.00, 0.32]"
//   "tokens":     a JSON array of strings
//
// A token consisting of a single byte greater than 0x7f is written as
// "<0xXX>" (two upper-case hex digits) instead of the raw byte. All other
// tokens and the text are written with JSON escaping applied.
//
// The JSON is assembled by hand with a string stream rather than through
// nlohmann::json. NOTE(review): presumably this is because a lone high byte
// produced by byte-level BPE is not valid UTF-8 and cannot be dumped by the
// JSON library -- confirm against the commit description.
std::string OfflineRecognitionResult::AsJsonString() const {
  std::ostringstream os;
  os << "{";

  os << "\"text\""
     << ": ";
  os << "\"" << EscapeJsonString(text) << "\""
     << ", ";

  os << "\""
     << "timestamps"
     << "\""
     << ": ";
  // The list is deliberately wrapped in quotes: the value is a string so
  // that the number of printed decimals is fixed.
  os << "\"[";

  std::string sep = "";
  for (auto t : timestamps) {
    os << sep << std::fixed << std::setprecision(2) << t;
    sep = ", ";
  }
  os << "]\", ";

  os << "\""
     << "tokens"
     << "\""
     << ":";
  os << "[";

  sep = "";
  // std::hex and std::uppercase are sticky; remember the current flags so
  // they can be restored after each hex-formatted byte.
  auto oldFlags = os.flags();
  for (const auto &t : tokens) {
    if (t.size() == 1 && static_cast<uint8_t>(t[0]) > 0x7f) {
      const uint8_t *p = reinterpret_cast<const uint8_t *>(t.c_str());
      os << sep << "\""
         << "<0x" << std::hex << std::uppercase << static_cast<uint32_t>(p[0])
         << ">"
         << "\"";
      os.flags(oldFlags);
    } else {
      os << sep << "\"" << EscapeJsonString(t) << "\"";
    }
    sep = ", ";
  }
  os << "]";
  os << "}";

  return os.str();
}
} // namespace sherpa_onnx
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/symbol-table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void SymbolTable::Init(std::istream &is) {
if (id >= 3 && id <= 258 && sym.size() == 6 && sym[0] == '<' &&
sym[1] == '0' && sym[2] == 'x' && sym[5] == '>') {
std::ostringstream os;
os << std::hex << (id - 3);
os << std::hex << std::uppercase << (id - 3);

if (std::string(sym.data() + 3, sym.data() + 5) == os.str()) {
uint8_t i = id - 3;
Expand Down

0 comments on commit 43b2b77

Please sign in to comment.