Skip to content

Commit

Permalink
review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
bkietz committed Oct 10, 2023
1 parent 7416749 commit 15e5846
Show file tree
Hide file tree
Showing 21 changed files with 116 additions and 172 deletions.
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/array_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ BinaryViewArray::BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,

std::string_view BinaryViewArray::GetView(int64_t i) const {
const std::shared_ptr<Buffer>* data_buffers = data_->buffers.data() + 2;
return util::FromIndexOffsetBinaryView(raw_values_[i], data_buffers);
return util::FromBinaryView(raw_values_[i], data_buffers);
}

StringViewArray::StringViewArray(std::shared_ptr<ArrayData> data) {
Expand Down
68 changes: 30 additions & 38 deletions cpp/src/arrow/array/array_binary_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -405,61 +405,59 @@ TEST(StringViewArray, Validate) {

// non-inline views are expected to reference only buffers managed by the array
EXPECT_THAT(
MakeBinaryViewArray({buffer_s, buffer_y},
{util::ToIndexOffsetBinaryView(
"supe", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToIndexOffsetBinaryView(
"yyyy", static_cast<int32_t>(buffer_y->size()), 1, 0)}),
MakeBinaryViewArray(
{buffer_s, buffer_y},
{util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", static_cast<int32_t>(buffer_y->size()), 1, 0)}),
Ok());

// views may not reference data buffers not present in the array
EXPECT_THAT(
MakeBinaryViewArray({}, {util::ToIndexOffsetBinaryView(
"supe", static_cast<int32_t>(buffer_s->size()), 0, 0)}),
MakeBinaryViewArray(
{}, {util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0)}),
Raises(StatusCode::IndexError));
// ... or ranges which overflow the referenced data buffer
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s}, {util::ToIndexOffsetBinaryView(
{buffer_s}, {util::ToBinaryView(
"supe", static_cast<int32_t>(buffer_s->size() + 50), 0, 0)}),
Raises(StatusCode::IndexError));

// Additionally, the prefixes of non-inline views must match the data buffer
EXPECT_THAT(
MakeBinaryViewArray({buffer_s, buffer_y},
{util::ToIndexOffsetBinaryView(
"SUPE", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToIndexOffsetBinaryView(
"yyyy", static_cast<int32_t>(buffer_y->size()), 1, 0)}),
MakeBinaryViewArray(
{buffer_s, buffer_y},
{util::ToBinaryView("SUPE", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", static_cast<int32_t>(buffer_y->size()), 1, 0)}),
Raises(StatusCode::Invalid));

// Invalid string views which are masked by a null bit do not cause validation to fail
auto invalid_but_masked =
MakeBinaryViewArray({buffer_s},
{util::ToIndexOffsetBinaryView(
"SUPE", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToIndexOffsetBinaryView("yyyy", 50, 40, 30)},
/*validate=*/false)
MakeBinaryViewArray(
{buffer_s},
{util::ToBinaryView("SUPE", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", 50, 40, 30)},
/*validate=*/false)
.ValueOrDie()
->data();
invalid_but_masked->null_count = 2;
invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2);
EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok());

// overlapping views are allowed
EXPECT_THAT(MakeBinaryViewArray(
{buffer_s},
{
util::ToIndexOffsetBinaryView(
"supe", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToIndexOffsetBinaryView(
"uper", static_cast<int32_t>(buffer_s->size() - 1), 0, 1),
util::ToIndexOffsetBinaryView(
"perc", static_cast<int32_t>(buffer_s->size() - 2), 0, 2),
util::ToIndexOffsetBinaryView(
"erca", static_cast<int32_t>(buffer_s->size() - 3), 0, 3),
}),
Ok());
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s},
{
util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("uper", static_cast<int32_t>(buffer_s->size() - 1), 0,
1),
util::ToBinaryView("perc", static_cast<int32_t>(buffer_s->size() - 2), 0,
2),
util::ToBinaryView("erca", static_cast<int32_t>(buffer_s->size() - 3), 0,
3),
}),
Ok());
}

template <typename T>
Expand Down Expand Up @@ -982,13 +980,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
public:
using TypeClass = T;

void SetUp() override {
if constexpr (is_binary_view_like_type<TypeClass>::value) {
type_ = TypeClass::is_utf8 ? utf8_view() : binary_view();
} else {
type_ = TypeTraits<TypeClass>::type_singleton();
}
}
void SetUp() override { type_ = TypeTraits<TypeClass>::type_singleton(); }

void TestBasics() {
auto array = ArrayFromJSON(
Expand Down
21 changes: 13 additions & 8 deletions cpp/src/arrow/array/builder_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
#include "arrow/visit_data_inline.h"

namespace arrow {

Expand All @@ -51,12 +52,17 @@ Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offse
auto bitmap = array.GetValues<uint8_t>(0, 0);
auto values = array.GetValues<BinaryViewType::c_type>(1) + offset;

int64_t out_of_line_total = 0;
for (int64_t i = 0; i < length; i++) {
if (!values[i].is_inline()) {
out_of_line_total += static_cast<int64_t>(values[i].size());
}
}
int64_t out_of_line_total = 0, i = 0;
VisitNullBitmapInline(
array.buffers[0].data, array.offset, array.length, array.null_count,
[&] {
if (!values[i].is_inline()) {
out_of_line_total += static_cast<int64_t>(values[i].size());
}
++i;
},
[&] { ++i; });

RETURN_NOT_OK(Reserve(length));
RETURN_NOT_OK(ReserveData(out_of_line_total));

Expand All @@ -66,8 +72,7 @@ Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offse
continue;
}

UnsafeAppend(
util::FromIndexOffsetBinaryView(values[i], array.GetVariadicBuffers().data()));
UnsafeAppend(util::FromBinaryView(values[i], array.GetVariadicBuffers().data()));
}
return Status::OK();
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/array/builder_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -500,9 +500,9 @@ class ARROW_EXPORT StringHeapBuilder {
ARROW_RETURN_NOT_OK(Reserve(length));
}

auto v = util::ToIndexOffsetBinaryView(value, static_cast<int32_t>(length),
static_cast<int32_t>(blocks_.size() - 1),
current_offset_);
auto v =
util::ToBinaryView(value, static_cast<int32_t>(length),
static_cast<int32_t>(blocks_.size() - 1), current_offset_);

memcpy(current_out_buffer_, value, static_cast<size_t>(length));
current_out_buffer_ += length;
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/arrow/array/concatenate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -242,16 +242,17 @@ class ConcatenateImpl {
ARROW_ASSIGN_OR_RAISE(auto view_buffers, Buffers(1, BinaryViewType::kSize));
ARROW_ASSIGN_OR_RAISE(auto view_buffer, ConcatenateBuffers(view_buffers, pool_));

auto* s = view_buffer->mutable_data_as<BinaryViewType::c_type>();
auto* views = view_buffer->mutable_data_as<BinaryViewType::c_type>();
size_t preceding_buffer_count = 0;

int64_t i = in_[0]->length;
for (size_t in_index = 1; in_index < in_.size(); ++in_index) {
preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2;

for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) {
if (s[i].is_inline()) continue;
s[i].ref.buffer_index += static_cast<int32_t>(preceding_buffer_count);
if (views[i].is_inline()) continue;
views[i].ref.buffer_index = SafeSignedAdd(
views[i].ref.buffer_index, static_cast<int32_t>(preceding_buffer_count));
}
}

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/array/concatenate_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ class ConcatenateTest : public ::testing::Test {
for (auto slice : slices) {
ASSERT_OK(slice->ValidateFull());
}
ASSERT_OK(expected->ValidateFull());
ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices));
ASSERT_OK(actual->ValidateFull());
AssertArraysEqual(*expected, *actual);
if (actual->data()->buffers[0]) {
CheckTrailingBitsAreZeroed(actual->data()->buffers[0], actual->length());
Expand Down Expand Up @@ -163,7 +163,7 @@ TEST_F(ConcatenateTest, StringType) {

TEST_F(ConcatenateTest, StringViewType) {
Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
*out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/15, null_probability);
*out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability);
ASSERT_OK((**out).ValidateFull());
});
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/array/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) {
return ArraySpan(data).MayHaveLogicalNulls();
}

BufferSpan PackVariadicBuffers(util::span<std::shared_ptr<Buffer> const> buffers) {
BufferSpan PackVariadicBuffers(util::span<const std::shared_ptr<Buffer>> buffers) {
return {const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(buffers.data())),
static_cast<int64_t>(buffers.size() * sizeof(std::shared_ptr<Buffer>))};
}
Expand Down Expand Up @@ -372,7 +372,7 @@ void ArraySpan::FillFromScalar(const Scalar& value) {
static_assert(sizeof(BinaryViewType::c_type) <= sizeof(scalar.scratch_space_));
auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type;
if (scalar.is_valid) {
*view = util::ToIndexOffsetBinaryView(std::string_view{*scalar.value}, 0, 0);
*view = util::ToBinaryView(std::string_view{*scalar.value}, 0, 0);
this->buffers[2] = internal::PackVariadicBuffers({&scalar.value, 1});
} else {
*view = {};
Expand Down Expand Up @@ -565,7 +565,7 @@ std::shared_ptr<ArrayData> ArraySpan::ToArrayData() const {
return result;
}

util::span<std::shared_ptr<Buffer> const> ArraySpan::GetVariadicBuffers() const {
util::span<const std::shared_ptr<Buffer>> ArraySpan::GetVariadicBuffers() const {
DCHECK(HasVariadicBuffers());
return {buffers[2].data_as<std::shared_ptr<Buffer>>(),
buffers[2].size / sizeof(std::shared_ptr<Buffer>)};
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ struct ARROW_EXPORT ArraySpan {
/// sizeof(shared_ptr<Buffer>).
///
/// \see HasVariadicBuffers
util::span<std::shared_ptr<Buffer> const> GetVariadicBuffers() const;
util::span<const std::shared_ptr<Buffer>> GetVariadicBuffers() const;
bool HasVariadicBuffers() const;

private:
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/array/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,7 @@ class RepeatedArrayFactory {
template <typename T>
enable_if_binary_view_like<T, Status> Visit(const T& type) {
std::string_view value{*scalar<T>().value};
auto s = util::ToIndexOffsetBinaryView(value, 0, 0);
auto s = util::ToBinaryView(value, 0, 0);
RETURN_NOT_OK(FinishFixedWidth(&s, sizeof(s)));
if (!s.is_inline()) {
out_->data()->buffers.push_back(scalar<T>().value);
Expand Down
13 changes: 0 additions & 13 deletions cpp/src/arrow/array/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,5 @@ Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
ARROW_EXPORT
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);

/// Convert between index/offset and raw pointer views.
///
/// This function can be used to overwrite a buffer of views if desired,
/// IE it is supported for `in.buffers[1].data == out`.
///
/// Note that calling this function is not necessary if all views happen to be
/// Inline; this is usually efficiently detectable by checking for an absence of any
/// data buffers.
///
/// Will raise IndexError if a view refers to memory outside the data buffers.
ARROW_EXPORT
Status SwapBinaryViewPointers(const ArraySpan& in, BinaryViewType::c_type* out);

} // namespace internal
} // namespace arrow
13 changes: 12 additions & 1 deletion cpp/src/arrow/array/validate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,18 @@ struct ValidateArrayImpl {
views[i].size());
}

if (views[i].is_inline()) continue;
if (views[i].is_inline()) {
auto padding_bytes = util::span(views[i].inlined.data).subspan(views[i].size());
for (auto padding_byte : padding_bytes) {
if (padding_byte != 0) {
return Status::Invalid("View at slot ", i, " was inline with size ",
views[i].size(),
" but its padding bytes were not all zero: ",
HexEncode(padding_bytes.data(), padding_bytes.size()));
}
}
continue;
}

auto [size, prefix, buffer_index, offset] = views[i].ref;

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/arrow/compare.cc
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ class RangeDataEqualsImpl {
auto* right_buffers = right_.buffers.data() + 2;
VisitValidRuns([&](int64_t i, int64_t length) {
for (auto end_i = i + length; i < end_i; ++i) {
if (!util::EqualIndexOffsetBinaryView(left_values[i], right_values[i],
left_buffers, right_buffers)) {
if (!util::EqualBinaryView(left_values[i], right_values[i], left_buffers,
right_buffers)) {
return false;
}
}
Expand Down
34 changes: 18 additions & 16 deletions cpp/src/arrow/integration/json_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,11 @@ std::string GetTimeUnitName(TimeUnit::type unit) {
return "UNKNOWN";
}

std::string_view GetStringView(const rj::Value& str) {
DCHECK(str.IsString());
return {str.GetString(), str.GetStringLength()};
Result<std::string_view> GetStringView(const rj::Value& str) {
if (!str.IsString()) {
return Status::Invalid("field was not a string line ", __LINE__);
}
return std::string_view{str.GetString(), str.GetStringLength()};
}

class SchemaWriter {
Expand Down Expand Up @@ -1413,7 +1415,7 @@ class ArrayReader {
BufferVector buffers;
buffers.resize(json_variadic_bufs.Size() + 2);
for (auto [json_buf, buf] : Zip(json_variadic_bufs, span{buffers}.subspan(2))) {
auto hex_string = GetStringView(json_buf);
ARROW_ASSIGN_OR_RAISE(auto hex_string, GetStringView(json_buf));
ARROW_ASSIGN_OR_RAISE(
buf, AllocateBuffer(static_cast<int64_t>(hex_string.size()) / 2, pool_));
RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data()));
Expand All @@ -1433,9 +1435,9 @@ class ArrayReader {
static_cast<size_t>(length_)};

int64_t null_count = 0;
for (auto [json_view, s, is_valid] : Zip(json_views, views, is_valid_)) {
for (auto [json_view, out_view, is_valid] : Zip(json_views, views, is_valid_)) {
if (!is_valid) {
s = {};
out_view = {};
++null_count;
continue;
}
Expand All @@ -1451,16 +1453,16 @@ class ArrayReader {
if (size <= BinaryViewType::kInlineSize) {
auto json_inlined = json_view_obj.FindMember("INLINED");
RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj);
s.inlined = {size, {}};
out_view.inlined = {size, {}};

if constexpr (ViewType::is_utf8) {
DCHECK_LE(json_inlined->value.GetStringLength(), BinaryViewType::kInlineSize);
memcpy(&s.inlined.data, json_inlined->value.GetString(), size);
memcpy(&out_view.inlined.data, json_inlined->value.GetString(), size);
} else {
DCHECK_LE(json_inlined->value.GetStringLength(),
BinaryViewType::kInlineSize * 2);
RETURN_NOT_OK(
ParseHexValues(GetStringView(json_inlined->value), s.inlined.data.data()));
ARROW_ASSIGN_OR_RAISE(auto inlined, GetStringView(json_inlined->value));
RETURN_NOT_OK(ParseHexValues(inlined, out_view.inlined.data.data()));
}
continue;
}
Expand All @@ -1472,20 +1474,20 @@ class ArrayReader {
RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj);
RETURN_NOT_INT("OFFSET", json_offset, json_view_obj);

s.ref = {
out_view.ref = {
size,
{},
static_cast<int32_t>(json_buffer_index->value.GetInt64()),
static_cast<int32_t>(json_offset->value.GetInt64()),
};

DCHECK_EQ(json_prefix->value.GetStringLength(), BinaryViewType::kPrefixSize * 2);
RETURN_NOT_OK(
ParseHexValues(GetStringView(json_prefix->value), s.ref.prefix.data()));
ARROW_ASSIGN_OR_RAISE(auto prefix, GetStringView(json_prefix->value));
RETURN_NOT_OK(ParseHexValues(prefix, out_view.ref.prefix.data()));

DCHECK_LE(static_cast<size_t>(s.ref.buffer_index), buffers.size() - 2);
DCHECK_LE(static_cast<int64_t>(s.ref.offset) + s.size(),
buffers[s.ref.buffer_index + 2]->size());
DCHECK_LE(static_cast<size_t>(out_view.ref.buffer_index), buffers.size() - 2);
DCHECK_LE(static_cast<int64_t>(out_view.ref.offset) + out_view.size(),
buffers[out_view.ref.buffer_index + 2]->size());
}

data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count);
Expand Down
Loading

0 comments on commit 15e5846

Please sign in to comment.