From 0ab17f1885411052f6eeb730e6cba228e9538d60 Mon Sep 17 00:00:00 2001 From: amorynan Date: Tue, 2 Jul 2024 20:49:17 +0800 Subject: [PATCH 1/4] support json string format --- .../data_types/serde/data_type_string_serde.h | 43 ++++++++-- .../data/jsonb_p0/test_jsonb_cast.csv | 4 + .../data/jsonb_p0/test_jsonb_cast.out | 27 +++++++ .../suites/jsonb_p0/test_jsonb_cast.groovy | 79 +++++++++++++++++++ 4 files changed, 148 insertions(+), 5 deletions(-) create mode 100644 regression-test/data/jsonb_p0/test_jsonb_cast.csv create mode 100644 regression-test/data/jsonb_p0/test_jsonb_cast.out create mode 100644 regression-test/suites/jsonb_p0/test_jsonb_cast.groovy diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index b870b61d64f371..d072f1401a4a70 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -73,15 +73,48 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { auto result = check_column_const_set_readability(column, row_num); ColumnPtr ptr = result.first; row_num = result.second; + const auto& value = assert_cast(*ptr).get_data_at(row_num); if (_nesting_level > 1) { + // _nested_level > 1 means string is in a complex type, we add double quotes, and escape + // which should make deal with some special characters in json str bw.write('"'); - } - - const auto& value = assert_cast(*ptr).get_data_at(row_num); - bw.write(value.data, value.size); - if (_nesting_level > 1) { + if constexpr (std::is_same_v) { + // we should make deal with some special characters in json str + StringRef str_ref = value; + for (char it : str_ref) { + switch (it) { + case '\b': + bw.write("\\b", 2); + break; + case '\f': + bw.write("\\f", 2); + break; + case '\n': + bw.write("\\n", 2); + break; + case '\r': + bw.write("\\r", 2); + break; + case '\t': + bw.write("\\t", 2); + break; + case '\\': + bw.write("\\\\", 2); + break; + case '"': + 
bw.write("\\\"", 2); + break; + default: + bw.write(it); + } + } + } else { + bw.write(value.data, value.size); + } bw.write('"'); + } else { + bw.write(value.data, value.size); } return Status::OK(); } diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.csv b/regression-test/data/jsonb_p0/test_jsonb_cast.csv new file mode 100644 index 00000000000000..08b694ddea822f --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.csv @@ -0,0 +1,4 @@ +1 \N +2 ['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', '{"x" : 1}'] +3 ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar'] +4 ['\/some\/cool\/url', '/some/cool/url', 'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e'] \ No newline at end of file diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.out b/regression-test/data/jsonb_p0/test_jsonb_cast.out new file mode 100644 index 00000000000000..0b572943601d90 --- /dev/null +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out @@ -0,0 +1,27 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select_1 -- +1 \N +2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"] +3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"] +4 ["\\/some\\/cool\\/url", "/some/cool/url", "a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] + +-- !select_2 -- +1 \N +2 ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"] +3 ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"] +4 ["\\/some\\/cool\\/url", "/some/cool/url", "a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"] +27 ["{"k1":"v1", "k2": 200}"] +28 ["{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}"] +29 [" \n\r", " \n\r"] +30 ["f\r\n", "f\r\n""] + +-- !select_json -- +1 \N +2 ["{\\\\'x\\\\' : \\\\'{\\"y\\" : 1}\\\\', \\\\'t\\\\' : \\\\'{\\"y\\" : 2}\\\\'}","{\\"x\\" : 1}"] +3 ["foo\\\\'bar', 'foo\\"bar', 'foo\\\\\\\\'bar', 'foo\\\\'\\\\'bar"] +4 
["\\\\some\\\\cool\\\\url","somecoolurl","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"] +27 ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"] +28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\": 300},\\"a\\":\\"niu\\"}"] +29 ["\\f\\n\\r","\\f\\n\\r"] +30 ["f\\b\\r\\n","f\\b\\r\\n\\""] + diff --git a/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy new file mode 100644 index 00000000000000..4d1b2aa7181923 --- /dev/null +++ b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.codehaus.groovy.runtime.IOGroovyMethods + +suite("test_jsonb_cast", "p0") { + + // define a sql table with an array column whose strings contain escape characters and that can also be cast to json + def testTable = "tbl_test_array_text_cast_jsonb" + def dataFile = "test_jsonb_cast.csv" + + sql """ set experimental_enable_nereids_planner = true """ + sql """ set enable_fallback_to_original_planner = true """ + + sql "DROP TABLE IF EXISTS ${testTable}" + + sql """ + CREATE TABLE IF NOT EXISTS ${testTable} ( + id INT, + a ARRAY, + ) + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 3 + PROPERTIES("replication_num" = "1"); + """ + + // load the jsonb data from csv file + streamLoad { + table testTable + + file dataFile // import csv file + time 10000 // limit inflight 10s + set 'strict_mode', 'true' + + // if a check callback is declared, the default check conditions are ignored. + // So you must check all conditions yourself + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals(4, json.NumberTotalRows) + assertEquals(4, json.NumberLoadedRows) + assertTrue(json.LoadBytes > 0) + } + } + + sql """ sync; """ + + // check result + qt_select_1 "SELECT * FROM ${testTable} ORDER BY id" + + + // insert rows holding json strings and escape characters + sql """INSERT INTO ${testTable} VALUES(27, ['{"k1":"v1", "k2": 200}'])""" + sql """INSERT INTO ${testTable} VALUES(28, ['{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}'])""" + sql """INSERT INTO ${testTable} VALUES(29, ['\f\n\r', "\f\n\r"])""" + sql """INSERT INTO ${testTable} VALUES(30, ["\\f\\b\\r\\n", '\\f\\b\\r\\n"'])""" + + // check result + qt_select_2 "SELECT * FROM ${testTable} ORDER BY id" + // check cast as json + qt_select_json "SELECT id, cast(a as JSON) FROM ${testTable} ORDER BY id" +} \ No newline at end of file From e7d516639ccde335f9d5c3b363374675db08a726 Mon Sep 17 00:00:00 2001 From: amorynan Date: Wed, 3 Jul
2024 17:29:41 +0800 Subject: [PATCH 2/4] update with option param for escape char --- .../vec/data_types/serde/data_type_string_serde.h | 13 ++++++++----- be/src/vec/functions/function_cast.h | 1 + regression-test/data/jsonb_p0/test_jsonb_cast.out | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index d072f1401a4a70..6726343079a63b 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -76,11 +76,11 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { const auto& value = assert_cast(*ptr).get_data_at(row_num); if (_nesting_level > 1) { - // _nested_level > 1 means string is in a complex type, we add double quotes, and escape - // which should make deal with some special characters in json str bw.write('"'); - if constexpr (std::is_same_v) { - // we should make deal with some special characters in json str + } + if constexpr (std::is_same_v) { + if (options.escape_char != 0) { + // we should handle some special characters in json str if we have escape_char StringRef str_ref = value; for (char it : str_ref) { switch (it) { @@ -112,10 +112,13 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { } else { bw.write(value.data, value.size); } - bw.write('"'); } else { bw.write(value.data, value.size); } + if (_nesting_level > 1) { + bw.write('"'); + } + return Status::OK(); } diff --git a/be/src/vec/functions/function_cast.h b/be/src/vec/functions/function_cast.h index 17250e10fd77d0..d4b21aacc5c148 100644 --- a/be/src/vec/functions/function_cast.h +++ b/be/src/vec/functions/function_cast.h @@ -766,6 +766,7 @@ struct ConvertImplGenericToJsonb { auto tmp_col = ColumnString::create(); vectorized::DataTypeSerDe::FormatOptions options; + options.escape_char = '\\'; for (size_t i = 0; i < input_rows_count; i++) { // convert to string tmp_col->clear(); diff
--git a/regression-test/data/jsonb_p0/test_jsonb_cast.out b/regression-test/data/jsonb_p0/test_jsonb_cast.out index 0b572943601d90..2ab4174c746d6a 100644 --- a/regression-test/data/jsonb_p0/test_jsonb_cast.out +++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out @@ -19,7 +19,7 @@ 1 \N 2 ["{\\\\'x\\\\' : \\\\'{\\"y\\" : 1}\\\\', \\\\'t\\\\' : \\\\'{\\"y\\" : 2}\\\\'}","{\\"x\\" : 1}"] 3 ["foo\\\\'bar', 'foo\\"bar', 'foo\\\\\\\\'bar', 'foo\\\\'\\\\'bar"] -4 ["\\\\some\\\\cool\\\\url","somecoolurl","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"] +4 ["\\\\/some\\\\/cool\\\\/url","/some/cool/url","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"] 27 ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"] 28 ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\": 300},\\"a\\":\\"niu\\"}"] 29 ["\\f\\n\\r","\\f\\n\\r"] From 0df3c5936d6b7d26f6148af5dd59a7222cfefea4 Mon Sep 17 00:00:00 2001 From: amorynan Date: Thu, 4 Jul 2024 09:57:26 +0800 Subject: [PATCH 3/4] fix comment --- .../data_types/serde/data_type_string_serde.h | 71 ++++++++++--------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index d072f1401a4a70..1019335b0565c2 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -76,49 +76,56 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { const auto& value = assert_cast(*ptr).get_data_at(row_num); if (_nesting_level > 1) { - // _nested_level > 1 means string is in a complex type, we add double quotes, and escape - // which should make deal with some special characters in json str bw.write('"'); - if constexpr (std::is_same_v) { - // we should make deal with some special characters in json str + } + if constexpr (std::is_same_v) { + if (options.escape_char != 0) { + // we should 
handle some special characters in json str if we have escape_char StringRef str_ref = value; - for (char it : str_ref) { - switch (it) { - case '\b': - bw.write("\\b", 2); - break; - case '\f': - bw.write("\\f", 2); - break; - case '\n': - bw.write("\\n", 2); - break; - case '\r': - bw.write("\\r", 2); - break; - case '\t': - bw.write("\\t", 2); - break; - case '\\': - bw.write("\\\\", 2); - break; - case '"': - bw.write("\\\"", 2); - break; - default: - bw.write(it); - } - } + write_with_escaped_char_to_json(str_ref, bw); } else { bw.write(value.data, value.size); } - bw.write('"'); } else { bw.write(value.data, value.size); } + if (_nesting_level > 1) { + bw.write('"'); + } + return Status::OK(); } + inline void write_with_escaped_char_to_json(StringRef value, BufferWritable& bw) { + for (char it : value) { + switch (it) { + case '\b': + bw.write("\\b", 2); + break; + case '\f': + bw.write("\\f", 2); + break; + case '\n': + bw.write("\\n", 2); + break; + case '\r': + bw.write("\\r", 2); + break; + case '\t': + bw.write("\\t", 2); + break; + case '\\': + bw.write("\\\\", 2); + break; + case '"': + bw.write("\\\"", 2); + break; + default: + bw.write(it); + } + } + } + Status serialize_column_to_json(const IColumn& column, int start_idx, int end_idx, BufferWritable& bw, FormatOptions& options) const override { SERIALIZE_COLUMN_TO_JSON(); From efcb1284cf61f12330dd0a354995ce319bb7ab67 Mon Sep 17 00:00:00 2001 From: amorynan Date: Thu, 4 Jul 2024 10:12:48 +0800 Subject: [PATCH 4/4] fix compile --- be/src/vec/data_types/serde/data_type_string_serde.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h b/be/src/vec/data_types/serde/data_type_string_serde.h index 1019335b0565c2..91ed4dbb4135c9 100644 --- a/be/src/vec/data_types/serde/data_type_string_serde.h +++ b/be/src/vec/data_types/serde/data_type_string_serde.h @@ -96,7 +96,7 @@ class DataTypeStringSerDeBase : public DataTypeSerDe { return
Status::OK(); } - inline void write_with_escaped_char_to_json(StringRef value, BufferWritable& bw) { + inline void write_with_escaped_char_to_json(StringRef value, BufferWritable& bw) const { for (char it : value) { switch (it) { case '\b':