Skip to content

Commit

Permalink
vector: Fix ColumnArray does not work well with CHBlockChunkCodec (pi…
Browse files Browse the repository at this point in the history
…ngcap#9477)

close pingcap#9485

vector: Fix ColumnArray does not work well with CHBlockChunkCodec

Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com>
  • Loading branch information
JaySon-Huang and ti-chi-bot[bot] committed Sep 29, 2024
1 parent 84a1207 commit f568261
Show file tree
Hide file tree
Showing 8 changed files with 302 additions and 53 deletions.
39 changes: 31 additions & 8 deletions dbms/src/DataTypes/DataTypeArray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include <Columns/ColumnArray.h>
#include <Common/Exception.h>
#include <Common/typeid_cast.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeFactory.h>
Expand Down Expand Up @@ -111,13 +112,12 @@ void serializeArraySizesPositionIndependent(const IColumn & column, WriteBuffer
{
const ColumnArray & column_array = typeid_cast<const ColumnArray &>(column);
const ColumnArray::Offsets & offset_values = column_array.getOffsets();
size_t size = offset_values.size();

if (!size)
size_t size = offset_values.size();
if (size == 0)
return;

size_t end = limit && (offset + limit < size) ? offset + limit : size;

ColumnArray::Offset prev_offset = offset == 0 ? 0 : offset_values[offset - 1];
for (size_t i = offset; i < end; ++i)
{
Expand Down Expand Up @@ -173,6 +173,12 @@ void DataTypeArray::serializeBinaryBulkWithMultipleStreams(
path.push_back(Substream::ArraySizes);
if (auto stream = getter(path))
{
// `position_independent_encoding == false` indicates that the `column_array.offsets`
// is serialized as is, which can provide better performance but only supports
// deserialization into an empty column. Conversely, when `position_independent_encoding == true`,
// the `column_array.offsets` is encoded into a format that supports deserializing
// and appending data into a column containing existing data.
// If you are unsure, set position_independent_encoding to true.
if (position_independent_encoding)
serializeArraySizesPositionIndependent(column, *stream, offset, limit);
else
Expand Down Expand Up @@ -224,11 +230,23 @@ void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
path.push_back(Substream::ArraySizes);
if (auto stream = getter(path))
{
// `position_independent_encoding == false` indicates that the `column_array.offsets`
// is serialized as is, which can provide better performance but only supports
// deserialization into an empty column. Conversely, when `position_independent_encoding == true`,
// the `column_array.offsets` is encoded into a format that supports deserializing
// and appending data into a column containing existing data.
// If you are unsure, set position_independent_encoding to true.
if (position_independent_encoding)
deserializeArraySizesPositionIndependent(column, *stream, limit);
else
{
RUNTIME_CHECK_MSG(
column_array.getOffsetsColumn().empty(),
"try to deserialize Array type to non-empty column without position idenpendent encoding, type_name={}",
getName());
DataTypeNumber<ColumnArray::Offset>()
.deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0);
}
}

path.back() = Substream::ArrayElements;
Expand All @@ -237,9 +255,13 @@ void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
IColumn & nested_column = column_array.getData();

/// Number of values corresponding with `offset_values` must be read.
size_t last_offset = (offset_values.empty() ? 0 : offset_values.back());
const size_t last_offset = (offset_values.empty() ? 0 : offset_values.back());
if (last_offset < nested_column.size())
throw Exception("Nested column is longer than last offset", ErrorCodes::LOGICAL_ERROR);
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Nested column is longer than last offset, last_offset={} nest_column_size={}",
last_offset,
nested_column.size());
size_t nested_limit = last_offset - nested_column.size();
nested->deserializeBinaryBulkWithMultipleStreams(
nested_column,
Expand All @@ -253,9 +275,10 @@ void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
/// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER.
if (!nested_column.empty() && nested_column.size() != last_offset)
throw Exception(
"Cannot read all array values: read just " + toString(nested_column.size()) + " of "
+ toString(last_offset),
ErrorCodes::CANNOT_READ_ALL_DATA);
ErrorCodes::CANNOT_READ_ALL_DATA,
"Cannot read all array values: read just {} of {}",
nested_column.size(),
last_offset);
}


Expand Down
62 changes: 33 additions & 29 deletions dbms/src/DataTypes/IDataType.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class IDataType : private boost::noncopyable
/// static constexpr bool is_parametric = false;

/// Name of data type (examples: UInt64, Array(String)).
virtual String getName() const { return getFamilyName(); };
virtual String getName() const { return getFamilyName(); }

virtual TypeIndex getTypeId() const = 0;

Expand Down Expand Up @@ -124,6 +124,8 @@ class IDataType : private boost::noncopyable
* offset must be not greater than size of column.
* offset + limit could be greater than size of column
* - in that case, column is serialized till the end.
* `position_independent_encoding` - provide better performance when it is false, but it requires not to be
* deserialized the data into a column with existing data.
*/
virtual void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
Expand All @@ -149,7 +151,9 @@ class IDataType : private boost::noncopyable
}

/** Read no more than limit values and append them into column.
* avg_value_size_hint - if not zero, may be used to avoid reallocations while reading column of String type.
* `avg_value_size_hint` - if not zero, may be used to avoid reallocations while reading column of String type.
* `position_independent_encoding` - provide better performance when it is false, but it requires not to be
* deserialized the data into a column with existing data.
*/
virtual void deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
Expand Down Expand Up @@ -295,89 +299,89 @@ class IDataType : private boost::noncopyable
/** Can appear in table definition.
* Counterexamples: Interval, Nothing.
*/
virtual bool cannotBeStoredInTables() const { return false; };
virtual bool cannotBeStoredInTables() const { return false; }

/** In text formats that render "pretty" tables,
* is it better to align value right in table cell.
* Examples: numbers, even nullable.
*/
virtual bool shouldAlignRightInPrettyFormats() const { return false; };
virtual bool shouldAlignRightInPrettyFormats() const { return false; }

/** Does formatted value in any text format can contain anything but valid UTF8 sequences.
* Example: String (because it can contain arbitary bytes).
* Counterexamples: numbers, Date, DateTime.
* For Enum, it depends.
*/
virtual bool textCanContainOnlyValidUTF8() const { return false; };
virtual bool textCanContainOnlyValidUTF8() const { return false; }

/** Is it possible to compare for less/greater, to calculate min/max?
* Not necessarily totally comparable. For example, floats are comparable despite the fact that NaNs compares to nothing.
* The same for nullable of comparable types: they are comparable (but not totally-comparable).
*/
virtual bool isComparable() const { return false; };
virtual bool isComparable() const { return false; }

/** Does it make sense to use this type with COLLATE modifier in ORDER BY.
* Example: String, but not FixedString.
*/
virtual bool canBeComparedWithCollation() const { return false; };
virtual bool canBeComparedWithCollation() const { return false; }

/** If the type is totally comparable (Ints, Date, DateTime, not nullable, not floats)
* and "simple" enough (not String, FixedString) to be used as version number
* (to select rows with maximum version).
*/
virtual bool canBeUsedAsVersion() const { return false; };
virtual bool canBeUsedAsVersion() const { return false; }

/** Values of data type can be summed (possibly with overflow, within the same data type).
* Example: numbers, even nullable. Not Date/DateTime. Not Enum.
* Enums can be passed to aggregate function 'sum', but the result is Int64, not Enum, so they are not summable.
*/
virtual bool isSummable() const { return false; };
virtual bool isSummable() const { return false; }

/** Can be used in operations like bit and, bit shift, bit not, etc.
*/
virtual bool canBeUsedInBitOperations() const { return false; };
virtual bool canBeUsedInBitOperations() const { return false; }

/** Can be used in boolean context (WHERE, HAVING).
* UInt8, maybe nullable.
*/
virtual bool canBeUsedInBooleanContext() const { return false; };
virtual bool canBeUsedInBooleanContext() const { return false; }

/** Integers, floats, not Nullable. Not Enums. Not Date/DateTime.
*/
virtual bool isNumber() const { return false; };
virtual bool isNumber() const { return false; }

/** Integers. Not Nullable. Not Enums. Not Date/DateTime.
*/
virtual bool isInteger() const { return false; };
virtual bool isUnsignedInteger() const { return false; };
virtual bool isInteger() const { return false; }
virtual bool isUnsignedInteger() const { return false; }

/** Floating point values. Not Nullable. Not Enums. Not Date/DateTime.
*/
virtual bool isFloatingPoint() const { return false; }

/** Date, DateTime, MyDate, MyDateTime. Not Nullable.
*/
virtual bool isDateOrDateTime() const { return false; };
virtual bool isDateOrDateTime() const { return false; }

/** MyDate, MyDateTime. Not Nullable.
*/
virtual bool isMyDateOrMyDateTime() const { return false; };
virtual bool isMyDateOrMyDateTime() const { return false; }

/** MyTime. Not Nullable.
*/
virtual bool isMyTime() const { return false; };
virtual bool isMyTime() const { return false; }

/** Decimal. Not Nullable.
*/
virtual bool isDecimal() const { return false; };
virtual bool isDecimal() const { return false; }

/** Numbers, Enums, Date, DateTime, MyDate, MyDateTime. Not nullable.
*/
virtual bool isValueRepresentedByNumber() const { return false; };
virtual bool isValueRepresentedByNumber() const { return false; }

/** Integers, Enums, Date, DateTime, MyDate, MyDateTime. Not nullable.
*/
virtual bool isValueRepresentedByInteger() const { return false; };
virtual bool isValueRepresentedByInteger() const { return false; }

/** Values are unambiguously identified by contents of contiguous memory region,
* that can be obtained by IColumn::getDataAt method.
Expand All @@ -386,23 +390,23 @@ class IDataType : private boost::noncopyable
* (because Array(String) values became ambiguous if you concatenate Strings).
* Counterexamples: Nullable, Tuple.
*/
virtual bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const { return false; };
virtual bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const { return false; }

virtual bool isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion() const
{
return isValueUnambiguouslyRepresentedInContiguousMemoryRegion()
&& (isValueRepresentedByNumber() || isFixedString());
};
}

virtual bool isString() const { return false; };
virtual bool isFixedString() const { return false; };
virtual bool isStringOrFixedString() const { return isString() || isFixedString(); };
virtual bool isString() const { return false; }
virtual bool isFixedString() const { return false; }
virtual bool isStringOrFixedString() const { return isString() || isFixedString(); }

/** Example: numbers, Date, DateTime, FixedString, Enum... Nullable and Tuple of such types.
* Counterexamples: String, Array.
* It's Ok to return false for AggregateFunction despite the fact that some of them have fixed size state.
*/
virtual bool haveMaximumSizeOfValue() const { return false; };
virtual bool haveMaximumSizeOfValue() const { return false; }

/** Size in amount of bytes in memory. Throws an exception if not haveMaximumSizeOfValue.
*/
Expand All @@ -414,9 +418,9 @@ class IDataType : private boost::noncopyable

/** Integers (not floats), Enum, String, FixedString.
*/
virtual bool isCategorial() const { return false; };
virtual bool isCategorial() const { return false; }

virtual bool isEnum() const { return false; };
virtual bool isEnum() const { return false; }

virtual bool isNullable() const { return false; }
/** Is this type can represent only NULL value? (It also implies isNullable)
Expand All @@ -425,7 +429,7 @@ class IDataType : private boost::noncopyable

/** If this data type cannot be wrapped in Nullable data type.
*/
virtual bool canBeInsideNullable() const { return false; };
virtual bool canBeInsideNullable() const { return false; }

/// Updates avg_value_size_hint for newly read column. Uses to optimize deserialization. Zero expected for first column.
static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint);
Expand Down
16 changes: 14 additions & 2 deletions dbms/src/DataTypes/tests/gtest_data_type_get_common_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@
// limitations under the License.

#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/getMostSubtype.h>
#include <DataTypes/isSupportedDataTypeCast.h>
#include <TestUtils/TiFlashTestBasic.h>

#include <sstream>

namespace DB
{
namespace tests
Expand Down Expand Up @@ -330,6 +329,19 @@ try
// not true for nullable
ASSERT_FALSE(ntype->isDateOrDateTime()) << "type: " + type->getName();
}

{
// array can be wrapped by Nullable
auto type = typeFromString("Array(Float32)");
ASSERT_NE(type, nullptr);
auto ntype = DataTypeNullable(type);
ASSERT_TRUE(ntype.isNullable());
}

{
auto type = typeFromString("Nullable(Array(Float32))");
ASSERT_TRUE(type->isNullable());
}
}
CATCH

Expand Down
16 changes: 14 additions & 2 deletions dbms/src/Flash/Coprocessor/CHBlockChunkCodec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,15 +97,27 @@ void WriteColumnData(const IDataType & type, const ColumnPtr & column, WriteBuff
IDataType::OutputStreamGetter output_stream_getter = [&](const IDataType::SubstreamPath &) {
return &ostr;
};
type.serializeBinaryBulkWithMultipleStreams(*full_column, output_stream_getter, offset, limit, false, {});
type.serializeBinaryBulkWithMultipleStreams(
*full_column,
output_stream_getter,
offset,
limit,
/*position_independent_encoding=*/true,
{});
}

void CHBlockChunkCodec::readData(const IDataType & type, IColumn & column, ReadBuffer & istr, size_t rows)
{
IDataType::InputStreamGetter input_stream_getter = [&](const IDataType::SubstreamPath &) {
return &istr;
};
type.deserializeBinaryBulkWithMultipleStreams(column, input_stream_getter, rows, 0, false, {});
type.deserializeBinaryBulkWithMultipleStreams(
column,
input_stream_getter,
rows,
0,
/*position_independent_encoding=*/true,
{});
}

size_t ApproxBlockBytes(const Block & block)
Expand Down
2 changes: 1 addition & 1 deletion dbms/src/Flash/Coprocessor/CHBlockChunkCodecV1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ static inline void decodeColumnsByBlock(ReadBuffer & istr, Block & res, size_t r
[&](const IDataType::SubstreamPath &) { return &istr; },
sz,
0,
{},
/*position_independent_encoding=*/true,
{});
}
}
Expand Down
Loading

0 comments on commit f568261

Please sign in to comment.