Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support cast as json #8333

Merged
merged 65 commits into from
Nov 27, 2023
Merged
Changes from 1 commit
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
c769c97
add appendJsonBinary
SeaRise Nov 7, 2023
6d677d9
tmp save
SeaRise Nov 8, 2023
749bc3c
tmp save
SeaRise Nov 8, 2023
bbb96e6
merge master
SeaRise Nov 8, 2023
a2e6f30
revert some useless change
SeaRise Nov 8, 2023
a5ecd2f
revert
SeaRise Nov 8, 2023
c694cfd
cast json as json
SeaRise Nov 8, 2023
137edc5
cast real as json; cast decimal as json
SeaRise Nov 8, 2023
386b886
cast int as json
SeaRise Nov 8, 2023
900030c
cast string as json step1
SeaRise Nov 8, 2023
dbe7506
tmp save
SeaRise Nov 8, 2023
ef05b96
cast string as json
SeaRise Nov 9, 2023
c7f8c2e
cast time as json
SeaRise Nov 9, 2023
1195c88
cast duration as json
SeaRise Nov 9, 2023
a58de3e
add ut for cast json as json
SeaRise Nov 9, 2023
a9e4d29
update
SeaRise Nov 13, 2023
b72f70a
ut finish
SeaRise Nov 13, 2023
71f8116
fix format
SeaRise Nov 13, 2023
40b24d8
tmp save
SeaRise Nov 13, 2023
c981d29
ut ut ut
SeaRise Nov 14, 2023
b8771b6
fmt
SeaRise Nov 14, 2023
7dbde8c
add it
SeaRise Nov 14, 2023
7e398ba
add it for decimal
SeaRise Nov 14, 2023
c82b923
revert
SeaRise Nov 14, 2023
170874f
fix #NO_UNESCAPE
SeaRise Nov 15, 2023
5c4ac21
Update tests/fullstack-test/expr/cast_as_json.test
SeaRise Nov 15, 2023
3cb5df4
Merge branch 'master' into cast_as_json
SeaRise Nov 15, 2023
4f238f7
introduce simdjson
SeaRise Nov 15, 2023
7b613c1
fix it
SeaRise Nov 15, 2023
b70e571
add ut for simdjson
SeaRise Nov 15, 2023
f4a10e1
use simdjson
SeaRise Nov 15, 2023
112558a
fmt
SeaRise Nov 15, 2023
214e992
fix obj
SeaRise Nov 15, 2023
529e402
fmt
SeaRise Nov 15, 2023
eba1300
fix and check deep
SeaRise Nov 16, 2023
cdb48ca
ut
SeaRise Nov 16, 2023
1c7401b
fmt
SeaRise Nov 16, 2023
84fec10
revert
SeaRise Nov 16, 2023
c27dfd4
revert
SeaRise Nov 16, 2023
fcdff44
Update dbms/src/Functions/tests/gtest_cast_as_json.cpp
SeaRise Nov 16, 2023
ce62b37
Update dbms/src/Functions/tests/gtest_cast_as_json.cpp
SeaRise Nov 16, 2023
9995ccb
prepare for optimize
SeaRise Nov 16, 2023
66c5a7c
optimize
SeaRise Nov 16, 2023
e680b94
add ut
SeaRise Nov 17, 2023
fcc7a6d
fmt
SeaRise Nov 17, 2023
c0e6aa8
update
SeaRise Nov 17, 2023
25b2e93
add more uts
SeaRise Nov 17, 2023
9aa59b8
minor refine
SeaRise Nov 20, 2023
b8c2bfa
Update dbms/src/Functions/FunctionsJson.h
SeaRise Nov 21, 2023
5e7717b
Merge branch 'master' into cast_as_json
SeaRise Nov 21, 2023
04bffbb
minor refine for json_array
SeaRise Nov 21, 2023
f0b1781
minor refine for json_array
SeaRise Nov 21, 2023
25195e6
more refine
SeaRise Nov 21, 2023
bc7e55d
tmp save
SeaRise Nov 22, 2023
35e8031
address comments
SeaRise Nov 22, 2023
1a83a58
fix
SeaRise Nov 22, 2023
90ca9ab
format
SeaRise Nov 23, 2023
2163ca7
Update dbms/src/Functions/FunctionsJson.h
SeaRise Nov 23, 2023
b3600ff
address comments
SeaRise Nov 23, 2023
efdeb11
fmt
SeaRise Nov 23, 2023
f662dad
fmt
SeaRise Nov 23, 2023
ae175d6
address comments
SeaRise Nov 27, 2023
2430e09
update
SeaRise Nov 27, 2023
86b2e6e
update
SeaRise Nov 27, 2023
8b77518
Merge branch 'master' into cast_as_json
ti-chi-bot[bot] Nov 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
address comments
SeaRise committed Nov 23, 2023
commit b3600ff1832b7112838fc6fc198c5fec2be2bde7
19 changes: 9 additions & 10 deletions dbms/src/Common/VectorWriter.h
Original file line number Diff line number Diff line change
@@ -52,16 +52,6 @@ class VectorWriter
pos += n;
}

void alloc(size_t n)
{
reserveForNextSize(n);
pos += n;
}

size_t offset() { return pos - reinterpret_cast<Position>(vector.data()); }

size_t count() { return offset(); }

void setOffset(size_t new_offset)
{
if (new_offset > vector.size())
@@ -72,6 +62,15 @@ class VectorWriter
pos = reinterpret_cast<Position>(vector.data() + new_offset);
}

void advance(size_t n)
{
setOffset(offset() + n);
}

size_t offset() { return pos - reinterpret_cast<Position>(vector.data()); }

size_t count() { return offset(); }

~VectorWriter()
{
vector.resize(count());
File renamed without changes.
4 changes: 2 additions & 2 deletions dbms/src/Common/tests/gtest_vector_writer.cpp
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ try
writer.write('a');
ASSERT_EQ(writer.offset(), 1);

writer.alloc(3);
writer.advance(3);
ASSERT_EQ(writer.offset(), 4);

PaddedPODArray<UInt8> tmp;
@@ -66,7 +66,7 @@ try
VectorWriter writer(vector, 1);
ASSERT_EQ(writer.offset(), 0);

writer.alloc(3);
writer.advance(3);
ASSERT_EQ(writer.offset(), 3);

writer.write('a');
101 changes: 74 additions & 27 deletions dbms/src/Functions/FunctionsJson.h
Original file line number Diff line number Diff line change
@@ -702,6 +702,7 @@ class FunctionCastRealAsJson : public IFunction
const auto * column_from = checkAndGetColumn<ColumnVector<FromType>>(column_ptr_from.get());
RUNTIME_CHECK(column_from);
const auto & data_from = column_from->getData();
// json_type + string end char 0 + value
size_t reserve_size = data_from.size() * (1 + 1 + sizeof(Float64));
JsonBinary::JsonBinaryWriteBuffer write_buffer(data_to, reserve_size);
for (size_t i = 0; i < data_from.size(); ++i)
@@ -779,11 +780,14 @@ class FunctionCastDecimalAsJson : public IFunction
{
const auto * column_from = checkAndGetColumn<ColumnDecimal<FromType>>(column_ptr_from.get());
RUNTIME_CHECK(column_from);
// json_type + string end char 0 + value
size_t reserve_size = column_from->size() * (1 + 1 + sizeof(Float64));
JsonBinary::JsonBinaryWriteBuffer write_buffer(data_to, reserve_size);
for (size_t i = 0; i < column_from->size(); ++i)
{
const auto & field = (*column_from)[i].template safeGet<DecimalField<FromType>>();
// same as https://github.com/pingcap/tidb/blob/90628349860718bb84c94fe7dc1e1f9bd9da4348/pkg/expression/builtin_cast.go#L854-L865
// TODO `select json_type(cast(1111.11 as json))` should return `DECIMAL`, we return `DOUBLE` now.
JsonBinary::appendNumber(write_buffer, static_cast<Float64>(field));
SeaRise marked this conversation as resolved.
Show resolved Hide resolved
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
@@ -877,6 +881,7 @@ class FunctionCastIntAsJson : public IFunction
RUNTIME_CHECK(column_from);
const auto & data_from = column_from->getData();

// json_type + string end char 0 + value
size_t reserve_size = 0;
if constexpr (std::is_same_v<bool, ToType>)
reserve_size = data_from.size() * (1 + 1 + 1);
@@ -951,26 +956,51 @@ class FunctionCastStringAsJson : public IFunction
// In raw function test, input_tidb_tp/output_tidb_tp is nullptr.
if (collator && collator->isBinary())
{
auto tmp_null_map = ColumnUInt8::create(0, 0);
if (unlikely(input_tidb_tp == nullptr))
{
doExecuteForBinary<false>(data_to, offsets_to, input_source, input_tidb_tp->tp(), -1, block.rows());
}
else if (input_tidb_tp->tp() == TiDB::TypeString)
{
doExecuteForBinary<true>(
doExecuteForBinary<false, false>(
data_to,
offsets_to,
input_source,
input_tidb_tp->tp(),
input_tidb_tp->flen(),
tmp_null_map->getData(),
TiDB::TypeVarchar,
-1,
block.rows());
}
else if (input_tidb_tp->tp() == TiDB::TypeString)
{
if (from.column->isColumnNullable())
{
const auto & column_nullable = static_cast<const ColumnNullable &>(*from.column);
doExecuteForBinary<true, true>(
data_to,
offsets_to,
input_source,
column_nullable.getNullMapData(),
input_tidb_tp->tp(),
input_tidb_tp->flen(),
block.rows());
}
else
{
doExecuteForBinary<true, false>(
data_to,
offsets_to,
input_source,
tmp_null_map->getData(),
input_tidb_tp->tp(),
input_tidb_tp->flen(),
block.rows());
}
}
else
{
doExecuteForBinary<false>(
doExecuteForBinary<false, false>(
data_to,
offsets_to,
input_source,
tmp_null_map->getData(),
input_tidb_tp->tp(),
input_tidb_tp->flen(),
block.rows());
@@ -1016,22 +1046,38 @@ class FunctionCastStringAsJson : public IFunction
}

private:
template <bool is_binary_str>
template <bool is_binary_str, bool check_null_for_binary_str>
static void doExecuteForBinary(
ColumnString::Chars_t & data_to,
ColumnString::Offsets & offsets_to,
const std::unique_ptr<IStringSource> & data_from,
const NullMap & null_map_from,
UInt8 from_type_code,
Int32 flen,
size_t size)
{
size_t reserve_size = (size * (1 + 1)) + data_from->getSizeForReserve();
size_t reserve_size = size * (1 + 1 + 1); // json_type + from_type_code + string end char 0
if constexpr (is_binary_str)
reserve_size += reserve_size <= 0 ? data_from->getSizeForReserve() : size * flen;
SeaRise marked this conversation as resolved.
Show resolved Hide resolved
else
reserve_size += data_from->getSizeForReserve();
JsonBinary::JsonBinaryWriteBuffer write_buffer(data_to, reserve_size);
ColumnString::Chars_t tmp_buf;
for (size_t i = 0; i < size; ++i)
{
const auto & slice = data_from->getWhole();
if constexpr (is_binary_str)
{
if constexpr (check_null_for_binary_str)
{
if (null_map_from[i])
{
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
data_from->next();
continue;
}
}
if (unlikely(flen <= 0))
{
JsonBinary::appendOpaque(
@@ -1049,12 +1095,13 @@ class FunctionCastStringAsJson : public IFunction
}
else
{
ColumnString::Chars_t buf;
buf.resize_fill(size_t_flen, 0);
std::memcpy(buf.data(), slice.data, slice.size);
if (tmp_buf.size() < size_t_flen)
tmp_buf.resize(size_t_flen);
std::memcpy(tmp_buf.data(), slice.data, slice.size);
std::fill(tmp_buf.data() + slice.size, tmp_buf.data() + size_t_flen, 0);
JsonBinary::appendOpaque(
write_buffer,
JsonBinary::Opaque{from_type_code, StringRef{buf.data(), size_t_flen}});
JsonBinary::Opaque{from_type_code, StringRef{tmp_buf.data(), size_t_flen}});
}
}
}
@@ -1078,7 +1125,7 @@ class FunctionCastStringAsJson : public IFunction
const NullMap & null_map_from,
size_t size)
{
size_t reserve_size = (size * (1 + 1)) + data_from->getSizeForReserve();
size_t reserve_size = (size * (1 + 1)) + data_from->getSizeForReserve(); // json_type + string end char 0 + value
JsonBinary::JsonBinaryWriteBuffer write_buffer(data_to, reserve_size);
simdjson::dom::parser parser;
for (size_t i = 0; i < size; ++i)
@@ -1171,17 +1218,17 @@ class FunctionCastTimeAsJson : public IFunction

const auto & from = block.getByPosition(arguments[0]);
if (checkDataType<DataTypeMyDateTime>(from.type.get()))
{
doExecute<DataTypeMyDateTime, false>(data_to, offsets_to, from.column);
}
else if (checkDataType<DataTypeMyDate>(from.type.get()))
{
// In raw function test, input_tidb_tp is nullptr.
bool is_timestamp = (unlikely(input_tidb_tp == nullptr)) || input_tidb_tp->tp() == TiDB::TypeTimestamp;
SeaRise marked this conversation as resolved.
Show resolved Hide resolved
if (is_timestamp)
doExecute<DataTypeMyDate, true>(data_to, offsets_to, from.column);
doExecute<DataTypeMyDateTime, true>(data_to, offsets_to, from.column);
else
doExecute<DataTypeMyDate, false>(data_to, offsets_to, from.column);
doExecute<DataTypeMyDateTime, false>(data_to, offsets_to, from.column);
}
else if (checkDataType<DataTypeMyDate>(from.type.get()))
{
doExecute<DataTypeMyDate, false>(data_to, offsets_to, from.column);
}

block.getByPosition(result).column = std::move(col_to);
@@ -1198,7 +1245,7 @@ class FunctionCastTimeAsJson : public IFunction
= checkAndGetColumn<ColumnVector<typename FromDataType::FieldType>>(column_ptr_from.get());
RUNTIME_CHECK(column_from);
const auto & data_from = column_from->getData();
size_t reserve_size = data_from.size() * (1 + 1 + sizeof(UInt64));
size_t reserve_size = data_from.size() * (1 + 1 + sizeof(UInt64)); // json_type + string end char 0 + value
JsonBinary::JsonBinaryWriteBuffer write_buffer(data_to, reserve_size);
for (size_t i = 0; i < data_from.size(); ++i)
{
@@ -1256,17 +1303,17 @@ class FunctionCastDurationAsJson : public IFunction
offsets_to.resize(rows);

const auto & from = block.getByPosition(arguments[0]);
if (const auto * duration_type = checkAndGetDataType<DataTypeMyDuration>(from.type.get());
likely(duration_type))
if (likely(checkDataType<DataTypeMyDuration>(from.type.get())))
{
auto fsp = duration_type->getFsp();
const auto & col_from = checkAndGetColumn<ColumnVector<DataTypeMyDuration::FieldType>>(from.column.get());
const auto & data_from = col_from->getData();
size_t reserve_size = data_from.size() * (1 + 1 + sizeof(UInt64) + sizeof(UInt32));
size_t reserve_size = data_from.size() * (1 + 1 + sizeof(UInt64) + sizeof(UInt32)); // json_type + string end char 0 + value
JsonBinary::JsonBinaryWriteBuffer write_buffer(data_to, reserve_size);
for (size_t i = 0; i < data_from.size(); ++i)
{
JsonBinary::appendDuration(write_buffer, data_from[i], fsp);
// from https://github.com/pingcap/tidb/blob/3543275dcf4b6454eb874c1362c87d31a963da6d/pkg/expression/builtin_cast.go#L921
// fsp always is MaxFsp.
JsonBinary::appendDuration(write_buffer, data_from[i], 6);
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
}
37 changes: 26 additions & 11 deletions dbms/src/Functions/tests/gtest_cast_as_json.cpp
Original file line number Diff line number Diff line change
@@ -351,17 +351,32 @@ CATCH
TEST_F(TestCastAsJson, CastDurationAsJson)
try
{
ColumnWithTypeAndName input(
// 22hour, 22min, 22s, 222ms
createColumn<DataTypeMyDuration::FieldType>({(22 * 3600000 + 22 * 60000 + 22 * 1000 + 222) * 1000000L,
-1 * (22 * 3600000 + 22 * 60000 + 22 * 1000 + 222) * 1000000L})
.column,
std::make_shared<DataTypeMyDuration>(6),
"");

auto res = executeFunctionWithCast("cast_duration_as_json", {input});
auto expect = createColumn<Nullable<String>>({"\"22:22:22.222000\"", "\"-22:22:22.222000\""});
ASSERT_COLUMN_EQ(expect, res);
{
ColumnWithTypeAndName input(
// 22hour, 22min, 22s, 222ms
createColumn<DataTypeMyDuration::FieldType>({(22 * 3600000 + 22 * 60000 + 22 * 1000 + 222) * 1000000L,
-1 * (22 * 3600000 + 22 * 60000 + 22 * 1000 + 222) * 1000000L})
.column,
std::make_shared<DataTypeMyDuration>(6),
"");

auto res = executeFunctionWithCast("cast_duration_as_json", {input});
auto expect = createColumn<Nullable<String>>({"\"22:22:22.222000\"", "\"-22:22:22.222000\""});
ASSERT_COLUMN_EQ(expect, res);
}
{
ColumnWithTypeAndName input(
// 22hour, 22min, 22s, 222ms
createColumn<DataTypeMyDuration::FieldType>({(22 * 3600000 + 22 * 60000 + 22 * 1000 + 222) * 1000000L,
-1 * (22 * 3600000 + 22 * 60000 + 22 * 1000 + 222) * 1000000L})
.column,
std::make_shared<DataTypeMyDuration>(1),
"");

auto res = executeFunctionWithCast("cast_duration_as_json", {input});
auto expect = createColumn<Nullable<String>>({"\"22:22:22.222000\"", "\"-22:22:22.222000\""});
ASSERT_COLUMN_EQ(expect, res);
}
}
CATCH

Loading