From 47fdd6c060f36653741649339070525cc42bc06d Mon Sep 17 00:00:00 2001 From: CEisenhofer <56730610+CEisenhofer@users.noreply.github.com> Date: Thu, 9 Sep 2021 11:35:16 +0200 Subject: [PATCH] Added 16 bit string-encoding (#5540) --- src/ast/char_decl_plugin.cpp | 1 - src/ast/char_decl_plugin.h | 6 ++---- src/ast/seq_decl_plugin.h | 2 -- src/params/context_params.cpp | 15 +++++++++++---- src/params/context_params.h | 31 ++++++++++++++++--------------- src/util/zstring.cpp | 22 ++++++++++++++-------- src/util/zstring.h | 32 +++++++++++++++++++++++++++++++- 7 files changed, 74 insertions(+), 35 deletions(-) diff --git a/src/ast/char_decl_plugin.cpp b/src/ast/char_decl_plugin.cpp index 7e0f6d4af4f..774309ed607 100644 --- a/src/ast/char_decl_plugin.cpp +++ b/src/ast/char_decl_plugin.cpp @@ -22,7 +22,6 @@ Module Name: char_decl_plugin::char_decl_plugin(): m_charc_sym("Char") { - m_unicode = gparams::get_value("unicode") != "false"; } char_decl_plugin::~char_decl_plugin() { diff --git a/src/ast/char_decl_plugin.h b/src/ast/char_decl_plugin.h index 50b89306eab..272c09844f5 100644 --- a/src/ast/char_decl_plugin.h +++ b/src/ast/char_decl_plugin.h @@ -40,7 +40,6 @@ enum char_op_kind { class char_decl_plugin : public decl_plugin { sort* m_char { nullptr }; symbol m_charc_sym; - bool m_unicode { true }; void set_manager(ast_manager * m, family_id id) override; @@ -96,8 +95,7 @@ class char_decl_plugin : public decl_plugin { MATCH_UNARY(is_to_int); MATCH_BINARY(is_le); - bool unicode() const { return m_unicode; } - unsigned max_char() const { return m_unicode ? zstring::unicode_max_char() : zstring::ascii_max_char(); } - unsigned num_bits() const { return m_unicode ? zstring::unicode_num_bits() : zstring::ascii_num_bits(); } + static unsigned max_char() { return zstring::max_char(); } + static unsigned num_bits() { return zstring::num_bits(); } }; diff --git a/src/ast/seq_decl_plugin.h b/src/ast/seq_decl_plugin.h index 3c0f28ca60c..6cd1398a956 100644 --- a/src/ast/seq_decl_plugin.h +++ b/src/ast/seq_decl_plugin.h @@ -166,8 +166,6 @@ class seq_decl_plugin : public decl_plugin { void finalize() override; - bool unicode() const { return get_char_plugin().unicode(); } - decl_plugin * mk_fresh() override { return alloc(seq_decl_plugin); } sort * mk_sort(decl_kind k, unsigned num_parameters, parameter const * parameters) override; diff --git a/src/params/context_params.cpp b/src/params/context_params.cpp index 03f49c028f7..4bc37b86b53 100644 --- a/src/params/context_params.cpp +++ b/src/params/context_params.cpp @@ -106,8 +106,15 @@ void context_params::set(char const * param, char const * value) { else if (p == "smtlib2_compliant") { set_bool(m_smtlib2_compliant, param, value); } - else if (p == "unicode") { - set_bool(m_unicode, param, value); + else if (p == "encoding") { + if (strcmp(value, "unicode") == 0 || strcmp(value, "bmp") == 0 || strcmp(value, "ascii") == 0) { + m_encoding = value; + } + else { + std::stringstream strm; + strm << "invalid value '" << value << "' for parameter '" << param << "' (supported: unicode, bmp, ascii)"; + throw default_exception(strm.str()); + } } else { param_descrs d; @@ -140,7 +147,7 @@ void context_params::updt_params(params_ref const & p) { m_debug_ref_count = p.get_bool("debug_ref_count", m_debug_ref_count); m_smtlib2_compliant = p.get_bool("smtlib2_compliant", m_smtlib2_compliant); m_statistics = p.get_bool("stats", m_statistics); - m_unicode = p.get_bool("unicode", m_unicode); + m_encoding = p.get_str("encoding", m_encoding.c_str()); } void context_params::collect_param_descrs(param_descrs & d) { @@ -157,7 +164,7 @@ void context_params::collect_param_descrs(param_descrs & d) { d.insert("debug_ref_count", CPK_BOOL, "debug support for AST reference counting", "false"); d.insert("smtlib2_compliant", CPK_BOOL, "enable/disable SMT-LIB 2.0 compliance", "false"); d.insert("stats", CPK_BOOL, "enable/disable statistics", "false"); - d.insert("unicode", CPK_BOOL, "use unicode strings instead of ASCII strings"); + d.insert("encoding", CPK_STRING, "string encoding used internally: unicode|bmp|ascii", "unicode"); // statistics are hidden as they are controlled by the /st option. collect_solver_param_descrs(d); } diff --git a/src/params/context_params.h b/src/params/context_params.h index b4c6e914cd2..ad4cee31df3 100644 --- a/src/params/context_params.h +++ b/src/params/context_params.h @@ -20,6 +20,7 @@ Module Name: #pragma once #include "util/params.h" +#include "util/zstring.h" class context_params { void set_bool(bool & opt, char const * param, char const * value); @@ -28,21 +29,21 @@ class context_params { unsigned m_rlimit { 0 }; public: - unsigned m_timeout { UINT_MAX } ; - std::string m_dot_proof_file; - std::string m_trace_file_name; - bool m_auto_config { true }; - bool m_proof { false }; - bool m_debug_ref_count { false }; - bool m_trace { false }; - bool m_well_sorted_check { false }; - bool m_model { true }; - bool m_model_validate { false }; - bool m_dump_models { false }; - bool m_unsat_core { false }; - bool m_smtlib2_compliant { false }; // it must be here because it enable/disable the use of coercions in the ast_manager. - bool m_statistics { false }; - bool m_unicode { true }; + unsigned m_timeout { UINT_MAX } ; + std::string m_dot_proof_file; + std::string m_trace_file_name; + bool m_auto_config { true }; + bool m_proof { false }; + bool m_debug_ref_count { false }; + bool m_trace { false }; + bool m_well_sorted_check { false }; + bool m_model { true }; + bool m_model_validate { false }; + bool m_dump_models { false }; + bool m_unsat_core { false }; + bool m_smtlib2_compliant { false }; // it must be here because it enable/disable the use of coercions in the ast_manager. + bool m_statistics { false }; + std::string m_encoding { "unicode" }; unsigned rlimit() const { return m_rlimit; } context_params(); diff --git a/src/util/zstring.cpp b/src/util/zstring.cpp index a0a5764bd35..7352cea5c29 100644 --- a/src/util/zstring.cpp +++ b/src/util/zstring.cpp @@ -42,9 +42,7 @@ bool zstring::is_escape_char(char const *& s, unsigned& result) { result = 16*result + d; } else if (*(s+3+i) == '}') { - if (result > 255 && !uses_unicode()) - return false; - if (result > unicode_max_char()) + if (result > max_char()) return false; s += 4 + i; return true; @@ -65,7 +63,7 @@ bool zstring::is_escape_char(char const *& s, unsigned& result) { result = 16*result + d2; result = 16*result + d3; result = 16*result + d4; - if (result > unicode_max_char()) + if (result > max_char()) return false; s += 6; return true; @@ -87,14 +85,22 @@ zstring::zstring(char const* s) { SASSERT(well_formed()); } - -bool zstring::uses_unicode() const { - return gparams::get_value("unicode") != "false"; +string_encoding zstring::get_encoding() { + if (gparams::get_value("encoding") == "unicode") { + return unicode; + } + if (gparams::get_value("encoding") == "bmp") { + return bmp; + } + if (gparams::get_value("encoding") == "ascii") { + return ascii; + } + return unicode; } bool zstring::well_formed() const { for (unsigned ch : m_buffer) { - if (ch > unicode_max_char()) { + if (ch > max_char()) { IF_VERBOSE(0, verbose_stream() << "large character: " << ch << "\n";); return false; } diff --git a/src/util/zstring.h b/src/util/zstring.h index a15d1b03d67..c5606b26742 100644 --- a/src/util/zstring.h +++ b/src/util/zstring.h @@ -21,17 +21,47 @@ Module Name: #include "util/buffer.h" #include "util/rational.h" +enum string_encoding { + ascii, // exactly 8 bits + unicode, + bmp // basic multilingual plane; exactly 16 bits +}; + class zstring { private: buffer m_buffer; bool well_formed() const; - bool uses_unicode() const; bool is_escape_char(char const *& s, unsigned& result); public: static unsigned unicode_max_char() { return 196607; } static unsigned unicode_num_bits() { return 18; } + static unsigned bmp_max_char() { return 65535; } + static unsigned bmp_num_bits() { return 16; } static unsigned ascii_max_char() { return 255; } static unsigned ascii_num_bits() { return 8; } + static unsigned max_char() { + switch (get_encoding()) { + case unicode: + return unicode_max_char(); + case bmp: + return bmp_max_char(); + case ascii: + return ascii_max_char(); + } + return unicode_max_char(); + } + static unsigned num_bits() { + switch (get_encoding()) { + case unicode: + return unicode_num_bits(); + case bmp: + return bmp_num_bits(); + case ascii: + return ascii_num_bits(); + } + return unicode_num_bits(); + } + static string_encoding get_encoding(); zstring() = default; zstring(char const* s); zstring(const std::string &str) : zstring(str.c_str()) {}