Skip to content

Commit

Permalink
Merge branch 'main' into parquet-dataset/use-key-value-metadata-correctly
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Sep 20, 2023
2 parents de4bda7 + 868b9bd commit 3710405
Show file tree
Hide file tree
Showing 12 changed files with 269 additions and 315 deletions.
2 changes: 0 additions & 2 deletions ci/scripts/matlab_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ cmake \
-S ${source_dir} \
-B ${build_dir} \
-G Ninja \
-D MATLAB_BUILD_TESTS=ON \
-D CMAKE_INSTALL_PREFIX=${install_dir} \
-D MATLAB_ADD_INSTALL_DIR_TO_SEARCH_PATH=OFF
cmake --build ${build_dir} --config Release --target install
ctest --test-dir ${build_dir}
12 changes: 6 additions & 6 deletions cpp/src/arrow/array/array_dict.cc
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,9 @@ class DictionaryUnifierImpl : public DictionaryUnifier {
*out_type = arrow::dictionary(index_type, value_type_);

// Build unified dictionary array
std::shared_ptr<ArrayData> data;
RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
0 /* start_offset */, &data));
ARROW_ASSIGN_OR_RAISE(
auto data, DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
0 /* start_offset */));
*out_dict = MakeArray(data);
return Status::OK();
}
Expand All @@ -299,9 +299,9 @@ class DictionaryUnifierImpl : public DictionaryUnifier {
}

// Build unified dictionary array
std::shared_ptr<ArrayData> data;
RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
0 /* start_offset */, &data));
ARROW_ASSIGN_OR_RAISE(
auto data, DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table_,
0 /* start_offset */));
*out_dict = MakeArray(data);
return Status::OK();
}
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/array/builder_dict.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,9 @@ class DictionaryMemoTable::DictionaryMemoTableImpl {
enable_if_memoize<T, Status> Visit(const T&) {
using ConcreteMemoTable = typename DictionaryTraits<T>::MemoTableType;
auto memo_table = checked_cast<ConcreteMemoTable*>(memo_table_);
return DictionaryTraits<T>::GetDictionaryArrayData(pool_, value_type_, *memo_table,
start_offset_, out_);
ARROW_ASSIGN_OR_RAISE(*out_, DictionaryTraits<T>::GetDictionaryArrayData(
pool_, value_type_, *memo_table, start_offset_));
return Status::OK();
}
};

Expand Down
47 changes: 19 additions & 28 deletions cpp/src/arrow/array/dict_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
Expand Down Expand Up @@ -63,11 +64,9 @@ struct DictionaryTraits<BooleanType> {
using T = BooleanType;
using MemoTableType = typename HashTraits<T>::MemoTableType;

static Status GetDictionaryArrayData(MemoryPool* pool,
const std::shared_ptr<DataType>& type,
const MemoTableType& memo_table,
int64_t start_offset,
std::shared_ptr<ArrayData>* out) {
static Result<std::shared_ptr<ArrayData>> GetDictionaryArrayData(
MemoryPool* pool, const std::shared_ptr<DataType>& type,
const MemoTableType& memo_table, int64_t start_offset) {
if (start_offset < 0) {
return Status::Invalid("invalid start_offset ", start_offset);
}
Expand All @@ -82,7 +81,9 @@ struct DictionaryTraits<BooleanType> {
: builder.Append(bool_values[i]));
}

return builder.FinishInternal(out);
std::shared_ptr<ArrayData> out;
RETURN_NOT_OK(builder.FinishInternal(&out));
return out;
}
}; // namespace internal

Expand All @@ -91,11 +92,9 @@ struct DictionaryTraits<T, enable_if_has_c_type<T>> {
using c_type = typename T::c_type;
using MemoTableType = typename HashTraits<T>::MemoTableType;

static Status GetDictionaryArrayData(MemoryPool* pool,
const std::shared_ptr<DataType>& type,
const MemoTableType& memo_table,
int64_t start_offset,
std::shared_ptr<ArrayData>* out) {
static Result<std::shared_ptr<ArrayData>> GetDictionaryArrayData(
MemoryPool* pool, const std::shared_ptr<DataType>& type,
const MemoTableType& memo_table, int64_t start_offset) {
auto dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
// This makes a copy, but we assume a dictionary array is usually small
// compared to the size of the dictionary-using array.
Expand All @@ -112,20 +111,17 @@ struct DictionaryTraits<T, enable_if_has_c_type<T>> {
RETURN_NOT_OK(
ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap));

*out = ArrayData::Make(type, dict_length, {null_bitmap, dict_buffer}, null_count);
return Status::OK();
return ArrayData::Make(type, dict_length, {null_bitmap, dict_buffer}, null_count);
}
};

template <typename T>
struct DictionaryTraits<T, enable_if_base_binary<T>> {
using MemoTableType = typename HashTraits<T>::MemoTableType;

static Status GetDictionaryArrayData(MemoryPool* pool,
const std::shared_ptr<DataType>& type,
const MemoTableType& memo_table,
int64_t start_offset,
std::shared_ptr<ArrayData>* out) {
static Result<std::shared_ptr<ArrayData>> GetDictionaryArrayData(
MemoryPool* pool, const std::shared_ptr<DataType>& type,
const MemoTableType& memo_table, int64_t start_offset) {
using offset_type = typename T::offset_type;

// Create the offsets buffer
Expand All @@ -148,23 +144,19 @@ struct DictionaryTraits<T, enable_if_base_binary<T>> {
RETURN_NOT_OK(
ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap));

*out = ArrayData::Make(type, dict_length,
return ArrayData::Make(type, dict_length,
{null_bitmap, std::move(dict_offsets), std::move(dict_data)},
null_count);

return Status::OK();
}
};

template <typename T>
struct DictionaryTraits<T, enable_if_fixed_size_binary<T>> {
using MemoTableType = typename HashTraits<T>::MemoTableType;

static Status GetDictionaryArrayData(MemoryPool* pool,
const std::shared_ptr<DataType>& type,
const MemoTableType& memo_table,
int64_t start_offset,
std::shared_ptr<ArrayData>* out) {
static Result<std::shared_ptr<ArrayData>> GetDictionaryArrayData(
MemoryPool* pool, const std::shared_ptr<DataType>& type,
const MemoTableType& memo_table, int64_t start_offset) {
const T& concrete_type = internal::checked_cast<const T&>(*type);

// Create the data buffer
Expand All @@ -182,9 +174,8 @@ struct DictionaryTraits<T, enable_if_fixed_size_binary<T>> {
RETURN_NOT_OK(
ComputeNullBitmap(pool, memo_table, start_offset, &null_count, &null_bitmap));

*out = ArrayData::Make(type, dict_length, {null_bitmap, std::move(dict_data)},
return ArrayData::Make(type, dict_length, {null_bitmap, std::move(dict_data)},
null_count);
return Status::OK();
}
};

Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/compute/kernels/vector_hash.cc
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,9 @@ class RegularHashKernel : public HashKernel {
Status FlushFinal(ExecResult* out) override { return action_.FlushFinal(out); }

Status GetDictionary(std::shared_ptr<ArrayData>* out) override {
return DictionaryTraits<Type>::GetDictionaryArrayData(pool_, type_, *memo_table_,
0 /* start_offset */, out);
ARROW_ASSIGN_OR_RAISE(*out, DictionaryTraits<Type>::GetDictionaryArrayData(
pool_, type_, *memo_table_, 0 /* start_offset */));
return Status::OK();
}

std::shared_ptr<DataType> value_type() const override { return type_; }
Expand Down
18 changes: 18 additions & 0 deletions java/vector/src/main/codegen/templates/AbstractFieldWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,24 @@ public void write(${name}Holder holder) {
}
</#if>

<#if minor.class?ends_with("VarBinary")>
public void writeTo${minor.class}(byte[] value) {
fail("${name}");
}

public void writeTo${minor.class}(byte[] value, int offset, int length) {
fail("${name}");
}

public void writeTo${minor.class}(ByteBuffer value) {
fail("${name}");
}

public void writeTo${minor.class}(ByteBuffer value, int offset, int length) {
fail("${name}");
}
</#if>

</#list></#list>

public void writeNull() {
Expand Down
33 changes: 33 additions & 0 deletions java/vector/src/main/codegen/templates/ComplexWriters.java
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,28 @@ public void writeNull() {
vector.setValueCount(idx()+1);
}
</#if>
<#if minor.class?ends_with("VarBinary")>
public void writeTo${minor.class}(byte[] value) {
vector.setSafe(idx(), value);
vector.setValueCount(idx() + 1);
}
public void writeTo${minor.class}(byte[] value, int offset, int length) {
vector.setSafe(idx(), value, offset, length);
vector.setValueCount(idx() + 1);
}
public void writeTo${minor.class}(ByteBuffer value) {
vector.setSafe(idx(), value, 0, value.remaining());
vector.setValueCount(idx() + 1);
}
public void writeTo${minor.class}(ByteBuffer value, int offset, int length) {
vector.setSafe(idx(), value, offset, length);
vector.setValueCount(idx() + 1);
}
</#if>
}
<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/writer/${eName}Writer.java" />
Expand Down Expand Up @@ -223,6 +245,17 @@ public interface ${eName}Writer extends BaseWriter {
@Deprecated
public void writeBigEndianBytesTo${minor.class}(byte[] value);
</#if>
<#if minor.class?ends_with("VarBinary")>
public void writeTo${minor.class}(byte[] value);

public void writeTo${minor.class}(byte[] value, int offset, int length);

public void writeTo${minor.class}(ByteBuffer value);

public void writeTo${minor.class}(ByteBuffer value, int offset, int length);
</#if>

}

</#list>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.vector.complex.writer;

import java.nio.ByteBuffer;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.LargeVarBinaryVector;
import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.complex.impl.LargeVarBinaryWriterImpl;
import org.apache.arrow.vector.complex.impl.VarBinaryWriterImpl;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

public class TestSimpleWriter {

  private BufferAllocator rootAllocator;

  /** Returns a fresh two-byte payload ({0x01, 0x02}) shared by every test case. */
  private static byte[] samplePayload() {
    return new byte[] { 0x01, 0x02 };
  }

  @Before
  public void init() {
    rootAllocator = new RootAllocator(Integer.MAX_VALUE);
  }

  @After
  public void terminate() throws Exception {
    rootAllocator.close();
  }

  /** Writing a whole byte[] to a VarBinary writer round-trips the bytes. */
  @Test
  public void testWriteByteArrayToVarBinary() {
    try (VarBinaryVector vector = new VarBinaryVector("test", rootAllocator);
        VarBinaryWriterImpl writer = new VarBinaryWriterImpl(vector)) {
      byte[] payload = samplePayload();
      writer.writeToVarBinary(payload);
      Assert.assertArrayEquals(payload, vector.get(0));
    }
  }

  /** Writing a byte[] slice (offset/length) stores only that slice. */
  @Test
  public void testWriteByteArrayWithOffsetToVarBinary() {
    try (VarBinaryVector vector = new VarBinaryVector("test", rootAllocator);
        VarBinaryWriterImpl writer = new VarBinaryWriterImpl(vector)) {
      writer.writeToVarBinary(samplePayload(), 1, 1);
      Assert.assertArrayEquals(new byte[] { 0x02 }, vector.get(0));
    }
  }

  /** Writing a whole ByteBuffer round-trips its remaining bytes. */
  @Test
  public void testWriteByteBufferToVarBinary() {
    try (VarBinaryVector vector = new VarBinaryVector("test", rootAllocator);
        VarBinaryWriterImpl writer = new VarBinaryWriterImpl(vector)) {
      byte[] payload = samplePayload();
      writer.writeToVarBinary(ByteBuffer.wrap(payload));
      Assert.assertArrayEquals(payload, vector.get(0));
    }
  }

  /** Writing a ByteBuffer slice (offset/length) stores only that slice. */
  @Test
  public void testWriteByteBufferWithOffsetToVarBinary() {
    try (VarBinaryVector vector = new VarBinaryVector("test", rootAllocator);
        VarBinaryWriterImpl writer = new VarBinaryWriterImpl(vector)) {
      writer.writeToVarBinary(ByteBuffer.wrap(samplePayload()), 1, 1);
      Assert.assertArrayEquals(new byte[] { 0x02 }, vector.get(0));
    }
  }

  /** Writing a whole byte[] to a LargeVarBinary writer round-trips the bytes. */
  @Test
  public void testWriteByteArrayToLargeVarBinary() {
    try (LargeVarBinaryVector vector = new LargeVarBinaryVector("test", rootAllocator);
        LargeVarBinaryWriterImpl writer = new LargeVarBinaryWriterImpl(vector)) {
      byte[] payload = samplePayload();
      writer.writeToLargeVarBinary(payload);
      Assert.assertArrayEquals(payload, vector.get(0));
    }
  }

  /** Writing a byte[] slice (offset/length) stores only that slice. */
  @Test
  public void testWriteByteArrayWithOffsetToLargeVarBinary() {
    try (LargeVarBinaryVector vector = new LargeVarBinaryVector("test", rootAllocator);
        LargeVarBinaryWriterImpl writer = new LargeVarBinaryWriterImpl(vector)) {
      writer.writeToLargeVarBinary(samplePayload(), 1, 1);
      Assert.assertArrayEquals(new byte[] { 0x02 }, vector.get(0));
    }
  }

  /** Writing a whole ByteBuffer round-trips its remaining bytes. */
  @Test
  public void testWriteByteBufferToLargeVarBinary() {
    try (LargeVarBinaryVector vector = new LargeVarBinaryVector("test", rootAllocator);
        LargeVarBinaryWriterImpl writer = new LargeVarBinaryWriterImpl(vector)) {
      byte[] payload = samplePayload();
      writer.writeToLargeVarBinary(ByteBuffer.wrap(payload));
      Assert.assertArrayEquals(payload, vector.get(0));
    }
  }

  /** Writing a ByteBuffer slice (offset/length) stores only that slice. */
  @Test
  public void testWriteByteBufferWithOffsetToLargeVarBinary() {
    try (LargeVarBinaryVector vector = new LargeVarBinaryVector("test", rootAllocator);
        LargeVarBinaryWriterImpl writer = new LargeVarBinaryWriterImpl(vector)) {
      writer.writeToLargeVarBinary(ByteBuffer.wrap(samplePayload()), 1, 1);
      Assert.assertArrayEquals(new byte[] { 0x02 }, vector.get(0));
    }
  }
}
Loading

0 comments on commit 3710405

Please sign in to comment.