Skip to content

Commit

Permalink
Merge branch 'main' into add-benchmark-for-binary-view-builder
Browse files Browse the repository at this point in the history
  • Loading branch information
mapleFU committed Jul 30, 2024
2 parents eb9d3e0 + 48782e7 commit 12c5f5b
Show file tree
Hide file tree
Showing 52 changed files with 704 additions and 67 deletions.
12 changes: 6 additions & 6 deletions .github/CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
/matlab/ @kevingurney @kou @sgilmore10
/python/pyarrow/_flight.pyx @lidavidm
/python/pyarrow/**/*gandiva* @wjones127
/r/ @thisisnic
/r/ @jonkeane @thisisnic
/ruby/ @kou
/swift/ @kou

Expand All @@ -53,19 +53,19 @@
# *.txt

# PR CI and repository files
/.github/ @assignUser @kou @raulcd
/.github/ @assignUser @jonkeane @kou @raulcd
.asf.yaml @assignUser @kou @raulcd
.pre-commit-config.yaml @raulcd
.travis.yml @assignUser @kou @raulcd
appveyor.yml @assignUser @kou @raulcd
# .git*

# release scripts, archery etc.
/ci/ @assignUser @kou @raulcd
/dev/ @assignUser @kou @raulcd
/ci/ @assignUser @jonkeane @kou @raulcd
/dev/ @assignUser @jonkeane @kou @raulcd
.dockerignore @raulcd
.env @assignUser @kou @raulcd
docker-compose.yml @assignUser @kou @raulcd
.env @assignUser @jonkeane @kou @raulcd
docker-compose.yml @assignUser @jonkeane @kou @raulcd

# R specific packaging tooling
/r/configure* @assignUser
Expand Down
5 changes: 5 additions & 0 deletions cpp/cmake_modules/BuildUtils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,11 @@ function(ADD_TEST_CASE REL_TEST_NAME)
"${EXECUTABLE_OUTPUT_PATH};$ENV{CONDA_PREFIX}/lib")
endif()

# When Arrow builds its bundled GoogleTest, make sure the test picks up the
# bundled GoogleTest headers. ARROW_GTEST_GTEST_HEADERS is non-empty only
# in that case, so this is a no-op when a system GoogleTest is used.
target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_GTEST_GTEST_HEADERS})

if(ARG_STATIC_LINK_LIBS)
# Customize link libraries
target_link_libraries(${TEST_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS})
Expand Down
6 changes: 6 additions & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2306,6 +2306,10 @@ function(build_gtest)
install(DIRECTORY "${googletest_SOURCE_DIR}/googlemock/include/"
"${googletest_SOURCE_DIR}/googletest/include/"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
add_library(arrow::GTest::gtest_headers INTERFACE IMPORTED)
target_include_directories(arrow::GTest::gtest_headers
INTERFACE "${googletest_SOURCE_DIR}/googlemock/include/"
"${googletest_SOURCE_DIR}/googletest/include/")
install(TARGETS gmock gmock_main gtest gtest_main
EXPORT arrow_testing_targets
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
Expand Down Expand Up @@ -2350,12 +2354,14 @@ if(ARROW_TESTING)

string(APPEND ARROW_TESTING_PC_LIBS " $<TARGET_FILE:GTest::gtest>")
endif()
set(ARROW_GTEST_GTEST_HEADERS)
set(ARROW_GTEST_GMOCK GTest::gmock)
set(ARROW_GTEST_GTEST GTest::gtest)
set(ARROW_GTEST_GTEST_MAIN GTest::gtest_main)
else()
string(APPEND ARROW_TESTING_PC_CFLAGS " -I\${includedir}/arrow-gtest")
string(APPEND ARROW_TESTING_PC_LIBS " -larrow_gtest")
set(ARROW_GTEST_GTEST_HEADERS arrow::GTest::gtest_headers)
set(ARROW_GTEST_GMOCK arrow::GTest::gmock)
set(ARROW_GTEST_GTEST arrow::GTest::gtest)
set(ARROW_GTEST_GTEST_MAIN arrow::GTest::gtest_main)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ ::grpc::Status FlightDataSerialize(const FlightPayload& msg, ByteBuffer* out,
for (const auto& buffer : ipc_msg.body_buffers) {
// Buffer may be null when the row length is zero, or when all
// entries are invalid.
if (!buffer) continue;
if (!buffer || buffer->size() == 0) continue;

::grpc::Slice slice;
auto status = SliceFromBuffer(buffer).Value(&slice);
Expand Down
2 changes: 1 addition & 1 deletion csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<ItemGroup>
<PackageReference Include="Google.Protobuf" Version="3.27.2" />
<PackageReference Include="Grpc.Net.Client" Version="2.64.0" />
<PackageReference Include="Grpc.Net.Client" Version="2.65.0" />
<PackageReference Include="Grpc.Tools" Version="2.65.0" PrivateAssets="All" />
</ItemGroup>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Grpc.AspNetCore" Version="2.63.0" />
<PackageReference Include="Grpc.AspNetCore" Version="2.65.0" />
</ItemGroup>

<ItemGroup>
Expand Down
1 change: 1 addition & 0 deletions dev/tasks/docker-tests/github.linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ jobs:
{% if arrow.is_default_branch() %}
{{ macros.github_login_dockerhub()|indent }}
- name: Push Docker Image
if: {{ push|default("true") }}
shell: bash
run: archery docker push {{ image }}
{% endif %}
7 changes: 0 additions & 7 deletions dev/tasks/java-jars/github.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,6 @@ jobs:
brew uninstall protobuf
brew bundle --file=arrow/java/Brewfile
# We want to use the bundled googletest for static linking. Since
# both BUNDLED and brew options are enabled, it could cause a conflict
# when there is a version mismatch.
# We uninstall googletest to ensure using the bundled googletest.
brew uninstall googletest
- name: Build C++ libraries
env:
{{ macros.github_set_sccache_envvars()|indent(8) }}
Expand Down Expand Up @@ -256,7 +250,6 @@ jobs:
pushd arrow/java
mvn versions:set -DnewVersion={{ arrow.no_rc_snapshot_version }}
mvn versions:set -DnewVersion={{ arrow.no_rc_snapshot_version }} -f bom
mvn versions:set -DnewVersion={{ arrow.no_rc_snapshot_version }} -f maven
popd
arrow/ci/scripts/java_full_build.sh \
$GITHUB_WORKSPACE/arrow \
Expand Down
12 changes: 12 additions & 0 deletions dev/tasks/tasks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@ groups:
- ubuntu-*
- centos-*
- conda-*
# Can be removed after conda recipes are synced: #42114
- ~conda-linux-aarch64-cuda-py3
- ~conda-linux-x64-cpu-py3
- ~conda-win-x64-cpu-py3
- ~conda-win-x64-cuda-py3
- ~conda-linux-ppc64le-cuda-py3
- ~conda-linux-aarch64-cpu-py3
- ~conda-linux-ppc64le-cpu-py3
- ~conda-linux-x64-cuda-py3
- ~conda-osx-arm64-cpu-py3
- conan-*
- java-jars
- homebrew-cpp
Expand Down Expand Up @@ -197,6 +207,7 @@ tasks:
template: docker-tests/github.linux.yml
params:
image: conan
push: false

conan-maximum:
ci: github
Expand All @@ -214,6 +225,7 @@ tasks:
-e ARROW_CONAN_WITH_SNAPPY=True
-e ARROW_CONAN_WITH_ZSTD=True
image: conan
push: false

########################### Python Minimal ############################

Expand Down
2 changes: 1 addition & 1 deletion docs/source/java/cdata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ This application uses JNI to call Java code, but transfers data (zero-copy) via
JavaVMOption options[2];
options[0].optionString = "-Djava.class.path=cpptojava.jar";
options[1].optionString = "-DXcheck:jni:pedantic";
vm_args.version = JNI_VERSION_1_8;
vm_args.version = JNI_VERSION_10;
vm_args.nOptions = 2;
vm_args.options = options;
int status = JNI_CreateJavaVM(jvm, (void **) &env, &vm_args);
Expand Down
1 change: 1 addition & 0 deletions java/.gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.gitattributes export-ignore
.gitignore export-ignore
* text=auto eol=lf
2 changes: 1 addition & 1 deletion java/adapter/orc/src/main/cpp/jni_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ static jmethodID orc_memory_constructor;
static jclass record_batch_class;
static jmethodID record_batch_constructor;

static jint JNI_VERSION = JNI_VERSION_1_6;
static jint JNI_VERSION = JNI_VERSION_10;

using arrow::internal::checked_cast;
using arrow::jni::ConcurrentMap;
Expand Down
3 changes: 2 additions & 1 deletion java/bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ under the License.
<groupId>org.apache</groupId>
<artifactId>apache</artifactId>
<version>33</version>
<relativePath></relativePath>
</parent>

<groupId>org.apache.arrow</groupId>
Expand Down Expand Up @@ -207,7 +208,7 @@ under the License.
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>versions-maven-plugin</artifactId>
<version>2.17.0</version>
<version>2.17.1</version>
</plugin>
</plugins>
</pluginManagement>
Expand Down
2 changes: 1 addition & 1 deletion java/c/src/main/cpp/jni_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jmethodID kPrivateDataGetNextMethod;
jmethodID kPrivateDataGetSchemaMethod;
jmethodID kCDataExceptionConstructor;

jint JNI_VERSION = JNI_VERSION_1_6;
jint JNI_VERSION = JNI_VERSION_10;

class JniPendingException : public std::runtime_error {
public:
Expand Down
2 changes: 1 addition & 1 deletion java/dataset/src/main/cpp/jni_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace arrow {
namespace dataset {
namespace jni {

jint JNI_VERSION = JNI_VERSION_1_6;
jint JNI_VERSION = JNI_VERSION_10;

class ReservationListenableMemoryPool::Impl {
public:
Expand Down
119 changes: 108 additions & 11 deletions java/dataset/src/main/cpp/jni_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
#include "arrow/c/helpers.h"
#include "arrow/dataset/api.h"
#include "arrow/dataset/file_base.h"
#ifdef ARROW_CSV
#include "arrow/dataset/file_csv.h"
#endif
#include "arrow/filesystem/api.h"
#include "arrow/filesystem/path_util.h"
#include "arrow/engine/substrait/util.h"
Expand All @@ -51,7 +54,7 @@ jmethodID unreserve_memory_method;

jlong default_memory_pool_id = -1L;

jint JNI_VERSION = JNI_VERSION_1_6;
jint JNI_VERSION = JNI_VERSION_10;

class JniPendingException : public std::runtime_error {
public:
Expand Down Expand Up @@ -363,6 +366,63 @@ std::shared_ptr<arrow::Buffer> LoadArrowBufferFromByteBuffer(JNIEnv* env, jobjec
return buffer;
}

/// \brief Interpret a config value as a boolean.
///
/// Only the exact, case-sensitive string "true" yields true; every other
/// value (including "True", "1" and the empty string) yields false.
inline bool ParseBool(const std::string& value) {
  return value.compare("true") == 0;
}

/// \brief Construct CSV FragmentScanOptions from a config map
///
/// Recognized keys:
///  - "delimiter": first character of the value becomes the field delimiter
///  - "quoting": "true" enables quoting, anything else disables it
///  - "column_types": decimal address of an ArrowSchema C struct; every field
///    of the imported schema is registered as an explicit column type
///  - "strings_can_be_null": "true" enables it, anything else disables it
///
/// \param configs key/value pairs describing the scan options
/// \return the populated options, or Status::Invalid for an unknown key, an
///         empty delimiter or an unparsable schema address
#ifdef ARROW_CSV
arrow::Result<std::shared_ptr<arrow::dataset::FragmentScanOptions>>
ToCsvFragmentScanOptions(const std::unordered_map<std::string, std::string>& configs) {
  auto options = std::make_shared<arrow::dataset::CsvFragmentScanOptions>();
  for (const auto& [key, value] : configs) {
    if (key == "delimiter") {
      // An empty value would silently install '\0' as the delimiter.
      if (value.empty()) {
        return arrow::Status::Invalid("Config delimiter must not be empty.");
      }
      options->parse_options.delimiter = value.front();
    } else if (key == "quoting") {
      options->parse_options.quoting = ParseBool(value);
    } else if (key == "column_types") {
      // std::stoll (not std::stol): `long` is only 32 bits wide on LLP64
      // platforms such as Windows, which would truncate a 64-bit address.
      // Parsing failures are reported as a Status rather than letting a
      // standard-library exception propagate to the caller.
      int64_t schema_address = 0;
      try {
        schema_address = std::stoll(value);
      } catch (const std::exception&) {
        return arrow::Status::Invalid("Config column_types is not a valid address: " +
                                      value);
      }
      ArrowSchema* c_schema = reinterpret_cast<ArrowSchema*>(schema_address);
      ARROW_ASSIGN_OR_RAISE(auto schema, arrow::ImportSchema(c_schema));
      auto& column_types = options->convert_options.column_types;
      // Bind by const reference to avoid a shared_ptr copy per field.
      for (const auto& field : schema->fields()) {
        column_types[field->name()] = field->type();
      }
    } else if (key == "strings_can_be_null") {
      options->convert_options.strings_can_be_null = ParseBool(value);
    } else {
      return arrow::Status::Invalid("Config " + key + " is not supported.");
    }
  }
  return options;
}
#endif

/// \brief Build the FragmentScanOptions matching a file format id
///
/// Format id 3 is dispatched to the CSV option parser, and only when Arrow
/// was built with ARROW_CSV. Every other id is rejected.
///
/// \param file_format_id numeric file format identifier passed from Java
/// \param configs key/value pairs forwarded to the format-specific parser
/// \return the parsed options, or Status::Invalid for an unsupported id
arrow::Result<std::shared_ptr<arrow::dataset::FragmentScanOptions>>
GetFragmentScanOptions(jint file_format_id,
                       const std::unordered_map<std::string, std::string>& configs) {
#ifdef ARROW_CSV
  const bool is_csv = (file_format_id == 3);
  if (is_csv) {
    return ToCsvFragmentScanOptions(configs);
  }
#endif
  // No format-specific scan options are available for this id.
  return arrow::Status::Invalid("Illegal file format id: ", file_format_id);
}

/// \brief Convert a flattened Java String[] of key/value pairs into a map
///
/// The array is read as [key0, value0, key1, value1, ...]. When a key occurs
/// more than once, the last value wins. A trailing key without a value
/// (odd-length array) is ignored instead of reading past the end of the array.
///
/// \param env JNI environment of the calling thread
/// \param str_array Java String[] holding alternating keys and values
/// \return the decoded key/value map
std::unordered_map<std::string, std::string> ToStringMap(JNIEnv* env,
                                                         jobjectArray& str_array) {
  const jsize length = env->GetArrayLength(str_array);
  std::unordered_map<std::string, std::string> map;
  map.reserve(length / 2);
  // `i + 1 < length` guarantees that both the key and its value are in range.
  for (jsize i = 0; i + 1 < length; i += 2) {
    auto key = static_cast<jstring>(env->GetObjectArrayElement(str_array, i));
    auto value = static_cast<jstring>(env->GetObjectArrayElement(str_array, i + 1));
    map[JStringToCString(env, key)] = JStringToCString(env, value);
    // Release the local references eagerly so that a long option list cannot
    // exhaust the JNI local reference table of the current native frame.
    env->DeleteLocalRef(key);
    env->DeleteLocalRef(value);
  }
  return map;
}

/*
* Class: org_apache_arrow_dataset_jni_NativeMemoryPool
* Method: getDefaultMemoryPool
Expand Down Expand Up @@ -501,12 +561,13 @@ JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_closeDataset
/*
* Class: org_apache_arrow_dataset_jni_JniWrapper
* Method: createScanner
* Signature: (J[Ljava/lang/String;Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;JJ)J
* Signature:
* (J[Ljava/lang/String;Ljava/nio/ByteBuffer;Ljava/nio/ByteBuffer;JI[Ljava/lang/String;J)J
*/
JNIEXPORT jlong JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_createScanner(
JNIEnv* env, jobject, jlong dataset_id, jobjectArray columns,
jobject substrait_projection, jobject substrait_filter,
jlong batch_size, jlong memory_pool_id) {
jobject substrait_projection, jobject substrait_filter, jlong batch_size,
jint file_format_id, jobjectArray options, jlong memory_pool_id) {
JNI_METHOD_START
arrow::MemoryPool* pool = reinterpret_cast<arrow::MemoryPool*>(memory_pool_id);
if (pool == nullptr) {
Expand Down Expand Up @@ -555,6 +616,12 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_createScann
}
JniAssertOkOrThrow(scanner_builder->Filter(*filter_expr));
}
if (file_format_id != -1 && options != nullptr) {
std::unordered_map<std::string, std::string> option_map = ToStringMap(env, options);
std::shared_ptr<arrow::dataset::FragmentScanOptions> scan_options =
JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map));
JniAssertOkOrThrow(scanner_builder->FragmentScanOptions(scan_options));
}
JniAssertOkOrThrow(scanner_builder->BatchSize(batch_size));

auto scanner = JniGetOrThrow(scanner_builder->Finish());
Expand Down Expand Up @@ -668,14 +735,29 @@ JNIEXPORT void JNICALL Java_org_apache_arrow_dataset_jni_JniWrapper_ensureS3Fina
/*
* Class: org_apache_arrow_dataset_file_JniWrapper
* Method: makeFileSystemDatasetFactory
* Signature: (Ljava/lang/String;II)J
* Signature: (Ljava/lang/String;I[Ljava/lang/String;)J
*/
JNIEXPORT jlong JNICALL
Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory__Ljava_lang_String_2I(
JNIEnv* env, jobject, jstring uri, jint file_format_id) {
Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory(
JNIEnv* env, jobject, jstring uri, jint file_format_id, jobjectArray options) {
JNI_METHOD_START
std::shared_ptr<arrow::dataset::FileFormat> file_format =
JniGetOrThrow(GetFileFormat(file_format_id));
if (options != nullptr) {
std::unordered_map<std::string, std::string> option_map = ToStringMap(env, options);
std::shared_ptr<arrow::dataset::FragmentScanOptions> scan_options =
JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map));
file_format->default_fragment_scan_options = scan_options;
#ifdef ARROW_CSV
if (file_format_id == 3) {
std::shared_ptr<arrow::dataset::CsvFileFormat> csv_file_format =
std::dynamic_pointer_cast<arrow::dataset::CsvFileFormat>(file_format);
csv_file_format->parse_options =
std::dynamic_pointer_cast<arrow::dataset::CsvFragmentScanOptions>(scan_options)
->parse_options;
}
#endif
}
arrow::dataset::FileSystemFactoryOptions options;
std::shared_ptr<arrow::dataset::DatasetFactory> d =
JniGetOrThrow(arrow::dataset::FileSystemDatasetFactory::Make(
Expand All @@ -686,16 +768,31 @@ Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory__Ljav

/*
* Class: org_apache_arrow_dataset_file_JniWrapper
* Method: makeFileSystemDatasetFactory
* Signature: ([Ljava/lang/String;II)J
* Method: makeFileSystemDatasetFactoryWithFiles
* Signature: ([Ljava/lang/String;I[Ljava/lang/String;)J
*/
JNIEXPORT jlong JNICALL
Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactory___3Ljava_lang_String_2I(
JNIEnv* env, jobject, jobjectArray uris, jint file_format_id) {
Java_org_apache_arrow_dataset_file_JniWrapper_makeFileSystemDatasetFactoryWithFiles(
JNIEnv* env, jobject, jobjectArray uris, jint file_format_id, jobjectArray options) {
JNI_METHOD_START

std::shared_ptr<arrow::dataset::FileFormat> file_format =
JniGetOrThrow(GetFileFormat(file_format_id));
if (options != nullptr) {
std::unordered_map<std::string, std::string> option_map = ToStringMap(env, options);
std::shared_ptr<arrow::dataset::FragmentScanOptions> scan_options =
JniGetOrThrow(GetFragmentScanOptions(file_format_id, option_map));
file_format->default_fragment_scan_options = scan_options;
#ifdef ARROW_CSV
if (file_format_id == 3) {
std::shared_ptr<arrow::dataset::CsvFileFormat> csv_file_format =
std::dynamic_pointer_cast<arrow::dataset::CsvFileFormat>(file_format);
csv_file_format->parse_options =
std::dynamic_pointer_cast<arrow::dataset::CsvFragmentScanOptions>(scan_options)
->parse_options;
}
#endif
}
arrow::dataset::FileSystemFactoryOptions options;

std::vector<std::string> uri_vec = ToStringVector(env, uris);
Expand Down
Loading

0 comments on commit 12c5f5b

Please sign in to comment.