Skip to content

Commit

Permalink
apacheGH-37710: [C++][Integration] Add C++ Utf8View implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
bkietz committed Sep 20, 2023
1 parent 7084f71 commit 9b867e3
Show file tree
Hide file tree
Showing 68 changed files with 2,561 additions and 343 deletions.
4 changes: 4 additions & 0 deletions cpp/src/arrow/array/array_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ struct ScalarFromArraySlotImpl {
return Finish(a.GetString(index_));
}

// View arrays: GetString(index_) yields an owned std::string for the slot,
// which is handed to Finish.
Status Visit(const BinaryViewArray& a) { return Finish(a.GetString(index_)); }

// Fixed-size binary: forward whatever GetString(index_) returns to Finish.
Status Visit(const FixedSizeBinaryArray& a) {
  return Finish(a.GetString(index_));
}

// Day-time interval: forward the slot's Value(index_) to Finish.
Status Visit(const DayTimeIntervalArray& a) {
  return Finish(a.Value(index_));
}
Expand Down
32 changes: 32 additions & 0 deletions cpp/src/arrow/array/array_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "arrow/array/validate.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/binary_view_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"

Expand Down Expand Up @@ -89,6 +90,37 @@ LargeStringArray::LargeStringArray(int64_t length,

// Delegates to internal::ValidateUTF8, passing this array's ArrayData, and
// returns its Status unchanged.
Status LargeStringArray::ValidateUTF8() const {
  return internal::ValidateUTF8(*data_);
}

// Construct a BinaryViewArray over existing ArrayData.
//
// ARROW_CHECK_EQ asserts that the data's type id is Type::BINARY_VIEW before
// the data is installed through SetData. `data` and `data->type` are
// dereferenced unconditionally, so both must be non-null.
BinaryViewArray::BinaryViewArray(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW);
SetData(data);
}

// Construct a binary_view() array from its constituent buffers.
//
// The PrimitiveArray base constructor receives the headers buffer and the
// null bitmap. Afterwards the buffer list is resized to make room for the
// character buffers, which are moved into slots 2, 3, ... in their given order.
BinaryViewArray::BinaryViewArray(int64_t length, std::shared_ptr<Buffer> headers,
                                 BufferVector char_buffers,
                                 std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
                                 int64_t offset)
    : PrimitiveArray(binary_view(), length, std::move(headers), std::move(null_bitmap),
                     null_count, offset) {
  auto& all_buffers = data_->buffers;
  all_buffers.resize(char_buffers.size() + 2);

  // Character buffers live after the two leading buffers.
  auto destination = all_buffers.begin() + 2;
  for (auto& char_buffer : char_buffers) {
    *destination = std::move(char_buffer);
    ++destination;
  }
}

// Return a view of the bytes of slot `i`.
//
// The slot's header is decoded in one of two ways, chosen by raw_pointers_:
// with util::FromRawPointerBinaryView when the flag is set, otherwise with
// util::FromIndexOffsetBinaryView, which is also given a pointer to the
// buffers that follow the first two entries of data_->buffers.
// No bounds or null check is performed on `i` here.
std::string_view BinaryViewArray::GetView(int64_t i) const {
  const auto& header = raw_values()[i];
  if (!raw_pointers_) {
    const std::shared_ptr<Buffer>* char_buffers = data_->buffers.data() + 2;
    return util::FromIndexOffsetBinaryView(header, char_buffers);
  }
  return util::FromRawPointerBinaryView(header);
}

// Construct a StringViewArray over existing ArrayData.
//
// ARROW_CHECK_EQ asserts that the data's type id is Type::STRING_VIEW before
// the data is installed through SetData. `data` and `data->type` are
// dereferenced unconditionally, so both must be non-null.
StringViewArray::StringViewArray(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW);
SetData(data);
}

// Delegates to internal::ValidateUTF8, passing this array's ArrayData, and
// returns its Status unchanged.
Status StringViewArray::ValidateUTF8() const {
  return internal::ValidateUTF8(*data_);
}

// Construct a FixedSizeBinaryArray over existing ArrayData by handing it to
// SetData. No type-id check is made in this constructor itself.
FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
}
Expand Down
70 changes: 70 additions & 0 deletions cpp/src/arrow/array/array_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
Expand Down Expand Up @@ -217,6 +218,75 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
Status ValidateUTF8() const;
};

// ----------------------------------------------------------------------
// BinaryView and StringView

/// Concrete Array class for variable-size binary view data.
///
/// Every slot is described by a BinaryViewType::c_type struct which references
/// the slot's value either in-line or out-of-line.
class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
 public:
  using TypeClass = BinaryViewType;
  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
  using c_type = BinaryViewType::c_type;

  /// \brief Construct from existing ArrayData
  explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);

  /// \brief Construct from a buffer of view headers plus character buffers
  ///
  /// \param[in] length the number of slots
  /// \param[in] headers buffer of c_type view structs
  /// \param[in] char_buffers additional buffers, stored after the first two
  ///            buffers of the array's data
  /// \param[in] null_bitmap optional validity bitmap
  /// \param[in] null_count number of nulls, or kUnknownNullCount
  /// \param[in] offset logical offset into the buffers
  BinaryViewArray(int64_t length, std::shared_ptr<Buffer> headers,
                  BufferVector char_buffers,
                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
                  int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Pointer to the view structs, already advanced by the array offset
  const c_type* raw_values() const {
    const auto* first_header = reinterpret_cast<const c_type*>(raw_values_);
    return first_header + data_->offset;
  }

  // For API compatibility with BinaryArray etc.

  /// \brief View of slot i's bytes (non-owning)
  std::string_view GetView(int64_t i) const;

  /// \brief Owned copy of slot i's bytes
  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  /// \brief Slot i as produced by dereferencing an iterator positioned at i
  std::optional<std::string_view> operator[](int64_t i) const {
    IteratorType at_slot(*this, i);
    return *at_slot;
  }

  IteratorType begin() const { return IteratorType(*this); }
  IteratorType end() const { return IteratorType(*this, length()); }

  /// \brief Whether the array's type reported has_raw_pointers() when the
  /// data was last set
  bool has_raw_pointers() const { return raw_pointers_; }

 protected:
  using PrimitiveArray::PrimitiveArray;

  /// Install `data` via PrimitiveArray::SetData, then cache the type's
  /// has_raw_pointers() flag so GetView does not have to consult the type.
  void SetData(const std::shared_ptr<ArrayData>& data) {
    PrimitiveArray::SetData(data);
    const auto& view_type = internal::checked_cast<const BinaryViewType&>(*type());
    raw_pointers_ = view_type.has_raw_pointers();
  }

  // Cached copy of BinaryViewType::has_raw_pointers(); refreshed by SetData.
  bool raw_pointers_ = false;
};

/// Concrete Array class for variable-size string view (utf-8) data using
/// BinaryViewType::c_type to reference in-line or out-of-line string values
///
/// Derives from BinaryViewArray and reuses its accessors; TypeClass is
/// StringViewType.
class ARROW_EXPORT StringViewArray : public BinaryViewArray {
public:
using TypeClass = StringViewType;

/// \brief Construct from existing ArrayData
explicit StringViewArray(const std::shared_ptr<ArrayData>& data);

/// \brief Construct from a buffer of view headers plus character buffers
///
/// All arguments are forwarded to the BinaryViewArray constructor (`data` is
/// passed as its `headers` argument). The type stored in data_ is then
/// overwritten with utf8_view().
StringViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector char_buffers,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: BinaryViewArray(length, std::move(data), std::move(char_buffers),
std::move(null_bitmap), null_count, offset) {
// The base constructor builds the data with binary_view(); replace the type.
data_->type = utf8_view();
}

/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};

// ----------------------------------------------------------------------
// Fixed width binary

Expand Down
Loading

0 comments on commit 9b867e3

Please sign in to comment.