Skip to content

Commit

Permalink
GH-37537: [Integration][C++] Add C Data Interface integration testing (
Browse files Browse the repository at this point in the history
…#37769)

### Rationale for this change

Currently there are no systematic integration tests between implementations of the C Data Interface, only a couple of ad-hoc tests.

### What changes are included in this PR?

1. Add Archery infrastructure for integration testing of the C Data Interface
2. Add implementation of this interface for Arrow C++

### Are these changes tested?

Yes, by construction.

### Are there any user-facing changes?

No.
* Closes: #37537

Lead-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Will Jones <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
3 people authored Sep 19, 2023
1 parent 25fa89d commit 3b646ad
Show file tree
Hide file tree
Showing 15 changed files with 913 additions and 124 deletions.
6 changes: 4 additions & 2 deletions ci/scripts/integration_arrow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@ set -ex
arrow_dir=${1}
gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration

pip install -e $arrow_dir/dev/archery
pip install -e $arrow_dir/dev/archery[integration]

# Rust can be enabled by exporting ARCHERY_INTEGRATION_WITH_RUST=1
archery integration \
time archery integration \
--run-c-data \
--run-ipc \
--run-flight \
--with-cpp=1 \
--with-csharp=1 \
Expand Down
6 changes: 5 additions & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,11 @@ endif()
#

if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS)
  # Integration testing helpers (JSON reader/writer and the C Data Interface
  # entry points). Each source file is appended exactly once.
  list(APPEND
       ARROW_SRCS
       integration/c_data_integration_internal.cc
       integration/json_integration.cc
       integration/json_internal.cc)
endif()

if(ARROW_CSV)
Expand Down
145 changes: 145 additions & 0 deletions cpp/src/arrow/integration/c_data_integration_internal.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/integration/c_data_integration_internal.h"

#include <sstream>
#include <utility>

#include "arrow/c/bridge.h"
#include "arrow/integration/json_integration.h"
#include "arrow/io/file.h"
#include "arrow/memory_pool.h"
#include "arrow/pretty_print.h"
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/logging.h"

namespace arrow::internal::integration {
namespace {

template <typename Func>
const char* StatusToErrorString(Func&& func) {
static std::string error;

Status st = func();
if (st.ok()) {
return nullptr;
}
error = st.ToString();
ARROW_CHECK_GT(error.length(), 0);
return error.c_str();
}

// Open the integration JSON file at `json_path` and return the schema exposed by
// the JSON reader. Any failure to open the file or the reader is propagated.
Result<std::shared_ptr<Schema>> ReadSchemaFromJson(const std::string& json_path,
                                                   MemoryPool* pool) {
  ARROW_ASSIGN_OR_RAISE(auto json_file, io::ReadableFile::Open(json_path, pool));
  ARROW_ASSIGN_OR_RAISE(auto json_reader,
                        IntegrationJsonReader::Open(pool, json_file));
  return json_reader->schema();
}

// Open the integration JSON file at `json_path` and read the record batch with
// index `num_batch` from it. Any failure to open the file or the reader, or to
// read the batch, is propagated.
Result<std::shared_ptr<RecordBatch>> ReadBatchFromJson(const std::string& json_path,
                                                       int num_batch,
                                                       MemoryPool* pool) {
  ARROW_ASSIGN_OR_RAISE(auto json_file, io::ReadableFile::Open(json_path, pool));
  ARROW_ASSIGN_OR_RAISE(auto json_reader,
                        IntegrationJsonReader::Open(pool, json_file));
  return json_reader->ReadRecordBatch(num_batch);
}

// XXX ideally, we should allow use of a custom memory pool in the C bridge API,
// but that requires a non-trivial refactor

// Read the schema from the JSON file at `json_path` (using the default memory
// pool) and export it into the C struct `out`.
Status ExportSchemaFromJson(std::string json_path, ArrowSchema* out) {
  ARROW_ASSIGN_OR_RAISE(auto json_schema,
                        ReadSchemaFromJson(json_path, default_memory_pool()));
  return ExportSchema(*json_schema, out);
}

// Import the schema held in `c_schema` and check that it equals the schema read
// from the JSON file at `json_path`, metadata included.
//
// Returns Status::Invalid describing both schemas when they differ.
Status ImportSchemaAndCompareToJson(std::string json_path, ArrowSchema* c_schema) {
  MemoryPool* pool = default_memory_pool();
  // The JSON schema is read first; the C struct is only imported if that succeeds.
  ARROW_ASSIGN_OR_RAISE(auto expected, ReadSchemaFromJson(json_path, pool));
  ARROW_ASSIGN_OR_RAISE(auto actual, ImportSchema(c_schema));
  if (actual->Equals(expected, /*check_metadata=*/true)) {
    return Status::OK();
  }
  return Status::Invalid("Schemas are different:", "\n- Json Schema: ", *expected,
                         "\n- Imported Schema: ", *actual);
}

// Read record batch number `num_batch` from the JSON file at `json_path` (using
// the default memory pool) and export it into the C struct `out`.
Status ExportBatchFromJson(std::string json_path, int num_batch, ArrowArray* out) {
  ARROW_ASSIGN_OR_RAISE(
      auto json_batch, ReadBatchFromJson(json_path, num_batch, default_memory_pool()));
  return ExportRecordBatch(*json_batch, out);
}

// Import the record batch held in `c_batch` and check that it equals record
// batch number `num_batch` of the JSON file at `json_path`.
//
// The C struct is imported with the schema of the JSON batch, fully validated,
// and then compared to the JSON batch (metadata included).
//
// Returns:
//   - Status::OK() when the batches are equal;
//   - Status::Invalid with a pretty-printed rendering of both batches when they
//     differ;
//   - any error raised while reading the JSON file, importing, validating or
//     pretty-printing.
Status ImportBatchAndCompareToJson(std::string json_path, int num_batch,
                                   ArrowArray* c_batch) {
  auto pool = default_memory_pool();
  ARROW_ASSIGN_OR_RAISE(auto batch, ReadBatchFromJson(json_path, num_batch, pool));
  ARROW_ASSIGN_OR_RAISE(auto imported_batch,
                        ImportRecordBatch(c_batch, batch->schema()));
  RETURN_NOT_OK(imported_batch->ValidateFull());
  if (imported_batch->Equals(*batch, /*check_metadata=*/true)) {
    return Status::OK();
  }

  std::stringstream pp_expected;
  std::stringstream pp_actual;
  PrettyPrintOptions options(/*indent=*/2);
  options.window = 50;
  // Propagate pretty-printing failures instead of aborting the process: this
  // function already reports every other failure through its Status.
  RETURN_NOT_OK(PrettyPrint(*batch, options, &pp_expected));
  RETURN_NOT_OK(PrettyPrint(*imported_batch, options, &pp_actual));
  return Status::Invalid("Record Batches are different:", "\n- Json Batch: ",
                         pp_expected.str(), "\n- Imported Batch: ", pp_actual.str());
}

} // namespace
} // namespace arrow::internal::integration

const char* ArrowCpp_CDataIntegration_ExportSchemaFromJson(const char* json_path,
ArrowSchema* out) {
using namespace arrow::internal::integration; // NOLINT(build/namespaces)
return StatusToErrorString([=]() { return ExportSchemaFromJson(json_path, out); });
}

const char* ArrowCpp_CDataIntegration_ImportSchemaAndCompareToJson(const char* json_path,
ArrowSchema* schema) {
using namespace arrow::internal::integration; // NOLINT(build/namespaces)
return StatusToErrorString(
[=]() { return ImportSchemaAndCompareToJson(json_path, schema); });
}

const char* ArrowCpp_CDataIntegration_ExportBatchFromJson(const char* json_path,
int num_batch,
ArrowArray* out) {
using namespace arrow::internal::integration; // NOLINT(build/namespaces)
return StatusToErrorString(
[=]() { return ExportBatchFromJson(json_path, num_batch, out); });
}

const char* ArrowCpp_CDataIntegration_ImportBatchAndCompareToJson(const char* json_path,
int num_batch,
ArrowArray* batch) {
using namespace arrow::internal::integration; // NOLINT(build/namespaces)
return StatusToErrorString(
[=]() { return ImportBatchAndCompareToJson(json_path, num_batch, batch); });
}

// C entry point: report the byte count returned by bytes_allocated() on the
// Arrow C++ default memory pool.
int64_t ArrowCpp_BytesAllocated() {
  return arrow::default_memory_pool()->bytes_allocated();
}
48 changes: 48 additions & 0 deletions cpp/src/arrow/integration/c_data_integration_internal.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/c/abi.h"
#include "arrow/util/visibility.h"

// This file only serves as documentation for the C Data Interface integration
// entrypoints. The actual functions are called by Archery through DLL symbol lookup.

// Common convention for the functions returning `const char*`: the result is
// nullptr on success, otherwise a NUL-terminated error message. The message
// lives in static storage owned by the library (callers must not free it) and
// may be overwritten by a later failing call.
extern "C" {

// Read the schema from the integration JSON file at `json_path` and export it
// into `out`.
ARROW_EXPORT
const char* ArrowCpp_CDataIntegration_ExportSchemaFromJson(const char* json_path,
ArrowSchema* out);

// Import `schema` and compare it (metadata included) to the schema read from
// the integration JSON file at `json_path`. A mismatch is reported as an error.
ARROW_EXPORT
const char* ArrowCpp_CDataIntegration_ImportSchemaAndCompareToJson(const char* json_path,
ArrowSchema* schema);

// Read record batch number `num_batch` from the integration JSON file at
// `json_path` and export it into `out`.
ARROW_EXPORT
const char* ArrowCpp_CDataIntegration_ExportBatchFromJson(const char* json_path,
int num_batch, ArrowArray* out);

// Import `batch` using the schema of record batch number `num_batch` from the
// integration JSON file at `json_path`, validate it, and compare it (metadata
// included) to that JSON batch. A mismatch is reported as an error.
ARROW_EXPORT
const char* ArrowCpp_CDataIntegration_ImportBatchAndCompareToJson(const char* json_path,
int num_batch,
ArrowArray* batch);

// Return the value reported by bytes_allocated() on the Arrow C++ default
// memory pool.
ARROW_EXPORT
int64_t ArrowCpp_BytesAllocated();

} // extern "C"
7 changes: 3 additions & 4 deletions cpp/src/arrow/integration/json_integration.cc
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,9 @@ class IntegrationJsonReader::Impl {
}

// Read the record batch at index `i` of the JSON "batches" array.
//
// Returns Status::IndexError when `i` is negative or not smaller than the
// number of batches in the file. The check is a regular error return rather
// than a debug assertion, so callers passing an externally supplied index get
// a reportable error.
Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(int i) {
  if (i < 0 || i >= static_cast<int>(record_batches_->GetArray().Size())) {
    return Status::IndexError("record batch index ", i, " out of bounds");
  }
  return json::ReadRecordBatch(record_batches_->GetArray()[i], schema_,
                               &dictionary_memo_, pool_);
}
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/symbols.map
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
};
# Also export C-level helpers
arrow_*;
Arrow*;
# ARROW-14771: export Protobuf symbol table
descriptor_table_Flight_2eproto;
descriptor_table_FlightSql_2eproto;
Expand Down
21 changes: 17 additions & 4 deletions dev/archery/archery/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,8 +723,12 @@ def _set_default(opt, default):
envvar="ARCHERY_INTEGRATION_WITH_RUST")
@click.option('--write_generated_json', default="",
help='Generate test JSON to indicated path')
@click.option('--run-ipc', is_flag=True, default=False,
help='Run IPC integration tests')
@click.option('--run-flight', is_flag=True, default=False,
help='Run Flight integration tests')
@click.option('--run-c-data', is_flag=True, default=False,
help='Run C Data Interface integration tests')
@click.option('--debug', is_flag=True, default=False,
help='Run executables in debug mode as relevant')
@click.option('--serial', is_flag=True, default=False,
Expand Down Expand Up @@ -753,24 +757,33 @@ def integration(with_all=False, random_seed=12345, **args):
gen_path = args['write_generated_json']

languages = ['cpp', 'csharp', 'java', 'js', 'go', 'rust']
formats = ['ipc', 'flight', 'c_data']

enabled_languages = 0
for lang in languages:
param = 'with_{}'.format(lang)
param = f'with_{lang}'
if with_all:
args[param] = with_all
enabled_languages += args[param]

if args[param]:
enabled_languages += 1
enabled_formats = 0
for fmt in formats:
param = f'run_{fmt}'
enabled_formats += args[param]

if gen_path:
# XXX See GH-37575: this option is only used by the JS test suite
# and might not be useful anymore.
os.makedirs(gen_path, exist_ok=True)
write_js_test_json(gen_path)
else:
if enabled_formats == 0:
raise click.UsageError(
"Need to enable at least one format to test "
"(IPC, Flight, C Data Interface); try --help")
if enabled_languages == 0:
raise Exception("Must enable at least 1 language to test")
raise click.UsageError(
"Need to enable at least one language to test; try --help")
run_all_tests(**args)


Expand Down
Loading

0 comments on commit 3b646ad

Please sign in to comment.