Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Fix](Outfile) upgrade apache-arrow version to 13.0.0 #35142

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions thirdparty/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,13 @@ Now there will be 2 set of libhdfs, one is without kerberos, the other is with k

## v20211215

## v20240521
- Modified: arrow 7.0.0 -> 13.0.0
- Modified: jemalloc for arrow 5.2.1 -> 5.3.0
- Modified: xsimd aeec9c8 (commit snapshot) -> 9.0.1
- Added: c-ares 1.19.1
- Added: grpc 1.54.3

### Changes

- Added: cyrus-sasl
Expand Down
72 changes: 71 additions & 1 deletion thirdparty/build-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,8 @@ build_arrow() {
export ARROW_ZLIB_URL="${TP_SOURCE_DIR}/${ZLIB_NAME}"
export ARROW_XSIMD_URL="${TP_SOURCE_DIR}/${XSIMD_NAME}"
export ARROW_ORC_URL="${TP_SOURCE_DIR}/${ORC_NAME}"
export ARROW_GRPC_URL="${TP_SOURCE_DIR}/${GRPC_NAME}"
export ARROW_PROTOBUF_URL="${TP_SOURCE_DIR}/${PROTOBUF_NAME}"

if [[ "${KERNEL}" != 'Darwin' ]]; then
ldflags="-L${TP_LIB_DIR} -static-libstdc++ -static-libgcc"
Expand All @@ -973,22 +975,38 @@ build_arrow() {
-DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \
-DCMAKE_INSTALL_LIBDIR=lib64 \
-DARROW_BOOST_USE_SHARED=OFF \
-DARROW_WITH_GRPC=ON \
-DgRPC_SOURCE=SYSTEM \
-DgRPC_ROOT="${TP_INSTALL_DIR}" \
-DARROW_WITH_PROTOBUF=ON \
-DProtobuf_SOURCE=SYSTEM \
-DProtobuf_LIB="${TP_INSTALL_DIR}/lib/libprotoc.a" -DProtobuf_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DARROW_FLIGHT=ON \
-DARROW_FLIGHT_SQL=ON \
-DBoost_USE_STATIC_RUNTIME=ON \
-DARROW_GFLAGS_USE_SHARED=OFF \
-Dgflags_ROOT="${TP_INSTALL_DIR}" \
-DGLOG_ROOT="${TP_INSTALL_DIR}" \
-DRE2_ROOT="${TP_INSTALL_DIR}" \
-DZLIB_SOURCE=SYSTEM \
-DZLIB_LIBRARY="${TP_INSTALL_DIR}/lib/libz.a" -DZLIB_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DRapidJSON_SOURCE=SYSTEM \
-DRapidJSON_ROOT="${TP_INSTALL_DIR}" \
-DORC_ROOT="${TP_INSTALL_DIR}" \
-Dxsimd_SOURCE=BUNDLED \
-DBrotli_SOURCE=BUNDLED \
-DARROW_LZ4_USE_SHARED=OFF \
-DLZ4_LIB="${TP_INSTALL_DIR}/lib/liblz4.a" -DLZ4_INCLUDE_DIR="${TP_INSTALL_DIR}/include/lz4" \
-DLz4_SOURCE=SYSTEM \
-DARROW_ZSTD_USE_SHARED=OFF \
-DZSTD_LIB="${TP_INSTALL_DIR}/lib/libzstd.a" -DZSTD_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-Dzstd_SOURCE=SYSTEM \
-DSnappy_LIB="${TP_INSTALL_DIR}/lib/libsnappy.a" -DSnappy_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \
-DSnappy_SOURCE=SYSTEM \
-DBOOST_ROOT="${TP_INSTALL_DIR}" --no-warn-unused-cli \
-Djemalloc_SOURCE=BUNDLED \
-DARROW_THRIFT_USE_SHARED=OFF \
-DThrift_SOURCE=SYSTEM \
-DThrift_ROOT="${TP_INSTALL_DIR}" ..

"${BUILD_SYSTEM}" -j "${PARALLEL}"
Expand Down Expand Up @@ -1662,6 +1680,56 @@ build_libdeflate() {
"${BUILD_SYSTEM}" install
}

# c-ares
# Configures, builds and installs c-ares from ${TP_SOURCE_DIR}/${CARES_SOURCE}
# into ${TP_INSTALL_DIR}. The CARES_* options request a static-only library
# (CARES_STATIC=ON, CARES_SHARED=OFF) compiled as position-independent code.
# Globals (read): CARES_SOURCE, TP_SOURCE_DIR, TP_INSTALL_DIR, PARALLEL
build_cares() {
    check_if_source_exist "${CARES_SOURCE}"
    cd "${TP_SOURCE_DIR}/${CARES_SOURCE}"

    # Out-of-source build directory inside the unpacked source tree.
    mkdir -p build
    cd build
    cmake -DCMAKE_BUILD_TYPE=Release \
        -DCARES_STATIC=ON \
        -DCARES_SHARED=OFF \
        -DCARES_STATIC_PIC=ON \
        -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" ..

    # Build with the same parallelism as build_grpc instead of a serial make.
    make -j "${PARALLEL}"
    make install
}

# grpc
# Configures, builds and installs gRPC from ${TP_SOURCE_DIR}/${GRPC_SOURCE}
# into ${TP_INSTALL_DIR}. Every dependency provider is set to "package", and
# each lookup hint (c-ares, absl, protobuf, re2, OpenSSL, zlib) points at
# ${TP_INSTALL_DIR}.
# Globals (read): GRPC_SOURCE, TP_SOURCE_DIR, TP_INSTALL_DIR, PARALLEL
build_grpc() {
    check_if_source_exist "${GRPC_SOURCE}"

    local grpc_src_dir="${TP_SOURCE_DIR}/${GRPC_SOURCE}"
    cd "${grpc_src_dir}"

    # Build tree lives two levels below the source root, hence '../..' below.
    mkdir -p cmake/build
    cd cmake/build

    local -a grpc_cmake_opts=(
        # General build/install settings.
        -DgRPC_INSTALL=ON
        -DgRPC_BUILD_TESTS=OFF
        -DCMAKE_BUILD_TYPE=Release
        -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}"
        # Dependency providers, each paired with its lookup hint.
        -DgRPC_CARES_PROVIDER=package
        -Dc-ares_DIR="${TP_INSTALL_DIR}"
        -DgRPC_ABSL_PROVIDER=package
        -Dabsl_DIR="${TP_INSTALL_DIR}"
        -DgRPC_PROTOBUF_PROVIDER=package
        -DProtobuf_DIR="${TP_INSTALL_DIR}"
        -DgRPC_RE2_PROVIDER=package
        -Dre2_DIR:STRING="${TP_INSTALL_DIR}"
        -DgRPC_SSL_PROVIDER=package
        -DOPENSSL_ROOT_DIR="${TP_INSTALL_DIR}"
        -DgRPC_ZLIB_PROVIDER=package
        -DZLIB_ROOT="${TP_INSTALL_DIR}"
        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    )
    cmake "${grpc_cmake_opts[@]}" ../..

    make -j "${PARALLEL}"
    make install

    # Kept for a future upgrade to grpc > v1.55: the installed gRPCConfig.cmake
    # uses find_dependency, which the cmake in use reportedly does not support.
    # NOTE(review): the original comment said "cmake 2.22"; presumably 3.22 was
    # meant -- confirm. Delete this once the cmake version is upgraded.
    # sed -i 's/find_dependency/find_package/g' "${TP_INSTALL_DIR}"/lib64/cmake/grpc/gRPCConfig.cmake
}

if [[ "${#packages[@]}" -eq 0 ]]; then
packages=(
libunixodbc
Expand All @@ -1673,9 +1741,9 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
lzo2
zstd
boost # must before thrift
protobuf
gflags
gtest
protobuf # after gtest
glog
rapidjson
snappy
Expand All @@ -1693,6 +1761,8 @@ if [[ "${#packages[@]}" -eq 0 ]]; then
librdkafka
flatbuffers
orc
cares
grpc # after cares, protobuf
arrow
abseil
s2
Expand Down
4 changes: 2 additions & 2 deletions thirdparty/download-thirdparty.sh
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,10 @@ fi
echo "Finished patching ${OPENTELEMETRY_SOURCE}"

# arrow patch is used to get the raw orc reader for filter prune.
if [[ "${ARROW_SOURCE}" == "apache-arrow-7.0.0" ]]; then
if [[ "${ARROW_SOURCE}" == "arrow-apache-arrow-13.0.0" ]]; then
cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}"
if [[ ! -f "${PATCHED_MARK}" ]]; then
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-7.0.0.patch"
patch -p1 <"${TP_PATCH_DIR}/apache-arrow-13.0.0.patch"
touch "${PATCHED_MARK}"
fi
cd -
Expand Down
Original file line number Diff line number Diff line change
@@ -1,82 +1,81 @@
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 03243e751..1eb9b2c81 100644
index 2466e7433..46b4402d4 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -47,9 +47,6 @@
#include "arrow/util/visibility.h"
#include "orc/Exceptions.hh"

-// alias to not interfere with nested orc namespace
-namespace liborc = orc;
-
#define ORC_THROW_NOT_OK(s) \
do { \
Status _s = (s); \
@@ -198,6 +195,8 @@ class ORCFileReader::Impl {
@@ -202,6 +199,8 @@ class ORCFileReader::Impl {
return Init();
}

+ virtual liborc::Reader* GetRawORCReader() { return reader_.get(); }
+
Status Init() {
int64_t nstripes = reader_->getNumberOfStripes();
stripes_.resize(nstripes);
@@ -504,6 +503,32 @@ class ORCFileReader::Impl {
@@ -479,6 +478,31 @@ class ORCFileReader::Impl {
return Status::OK();
}

+ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ if (current_row_ >= NumberOfRows()) {
+ return nullptr;
+ }
+
+ liborc::RowReaderOptions opts;
+ liborc::RowReaderOptions opts = default_row_reader_options();
+ if (!include_names.empty()) {
+ RETURN_NOT_OK(SelectNames(&opts, include_names));
+ }
+ StripeInformation stripe_info({0, 0, 0, 0});
+ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info));
+ std::shared_ptr<Schema> schema;
+ RETURN_NOT_OK(ReadSchema(opts, &schema));
+ ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts));
+ std::unique_ptr<liborc::RowReader> row_reader;
+
+ ORC_BEGIN_CATCH_NOT_OK
+ row_reader = reader_->createRowReader(opts);
+ row_reader->seekToRow(current_row_);
+ current_row_ = stripe_info.first_row_of_stripe + stripe_info.num_rows;
+ current_row_ = stripe_info.first_row_id + stripe_info.num_rows;
+ ORC_END_CATCH_NOT_OK
+
+ return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size,
+ pool_);
+ }
+
Status NextStripeReader(int64_t batch_size, std::shared_ptr<RecordBatchReader>* out) {
return NextStripeReader(batch_size, {}, out);
}
@@ -531,6 +556,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
int64_t batch_size, const std::vector<int>& include_indices) {
if (current_row_ >= NumberOfRows()) {
@@ -544,6 +568,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open(
return std::move(result);
}

+liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); }
+
Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() {
return impl_->ReadMetadata();
}
@@ -653,6 +680,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
return reader;
@@ -605,6 +631,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
return impl_->NextStripeReader(batch_size, include_indices);
}

+Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader(
+ int64_t batch_size, const std::vector<std::string>& include_names) {
+ return impl_->NextStripeReader(batch_size, include_names);
+}
+
int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }

int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h
index 223efa515..04e6b0612 100644
index 013be7860..7fd06bcb8 100644
--- a/cpp/src/arrow/adapters/orc/adapter.h
+++ b/cpp/src/arrow/adapters/orc/adapter.h
@@ -30,6 +30,10 @@
Expand All @@ -87,23 +86,23 @@ index 223efa515..04e6b0612 100644
+
+// alias to not interfere with nested orc namespace
+namespace liborc = orc;

namespace arrow {
namespace adapters {
@@ -51,6 +55,9 @@ class ARROW_EXPORT ORCFileReader {
static Status Open(const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool,
std::unique_ptr<ORCFileReader>* reader);
@@ -53,6 +57,9 @@ class ARROW_EXPORT ORCFileReader {
public:
~ORCFileReader();

+ /// \brief Get ORC reader from inside.
+ liborc::Reader* GetRawORCReader();
+
/// \brief Creates a new ORC reader
///
/// \param[in] file the data source
@@ -240,6 +247,19 @@ class ARROW_EXPORT ORCFileReader {
Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
int64_t batch_size, const std::vector<int>& include_indices);
@@ -174,6 +181,19 @@ class ARROW_EXPORT ORCFileReader {
Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
int64_t batch_size, const std::vector<std::string>& include_names);

+ /// \brief Get a stripe level record batch iterator with specified row count
+ /// in each record batch. NextStripeReader serves as a fine grain
+ /// alternative to ReadStripe which may cause OOM issue by loading
Expand All @@ -119,4 +118,3 @@ index 223efa515..04e6b0612 100644
+
/// \brief The number of stripes in the file
int64_t NumberOfStripes();

41 changes: 28 additions & 13 deletions thirdparty/vars.sh
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,24 @@ FLATBUFFERS_NAME=flatbuffers-2.0.0.tar.gz
FLATBUFFERS_SOURCE=flatbuffers-2.0.0
FLATBUFFERS_MD5SUM="a27992324c3cbf86dd888268a23d17bd"

# c-ares
CARES_DOWNLOAD="https://github.com/c-ares/c-ares/releases/download/cares-1_19_1/c-ares-1.19.1.tar.gz"
CARES_NAME="c-ares-1.19.1.tar.gz"
CARES_SOURCE=c-ares-1.19.1
CARES_MD5SUM="dafc5825a92dc907e144570e4e75a908"

# grpc
# grpc v1.55 and above require protobuf >= 22
GRPC_DOWNLOAD="https://github.com/grpc/grpc/archive/refs/tags/v1.54.3.tar.gz"
GRPC_NAME="grpc-v1.54.3.tar.gz"
GRPC_SOURCE=grpc-1.54.3
GRPC_MD5SUM="af00a2edeae0f02bb25917cc3473b7de"

# arrow
ARROW_DOWNLOAD="https://archive.apache.org/dist/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz"
ARROW_NAME="apache-arrow-7.0.0.tar.gz"
ARROW_SOURCE="apache-arrow-7.0.0"
ARROW_MD5SUM="316ade159901646849b3b4760fa52816"
ARROW_DOWNLOAD="https://github.com/apache/arrow/archive/refs/tags/apache-arrow-13.0.0.tar.gz"
ARROW_NAME="apache-arrow-13.0.0.tar.gz"
ARROW_SOURCE="arrow-apache-arrow-13.0.0"
ARROW_MD5SUM="8ec1ec6a119514bcaea1cf7aabc9df1f"

# Abseil
ABSEIL_DOWNLOAD="https://github.com/abseil/abseil-cpp/archive/refs/tags/20220623.1.tar.gz"
Expand Down Expand Up @@ -287,10 +300,10 @@ ORC_SOURCE=orc-1.7.2
ORC_MD5SUM="6cab37935eacdec7d078d327746a8578"

# jemalloc for arrow
JEMALLOC_ARROW_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.2.1/jemalloc-5.2.1.tar.bz2"
JEMALLOC_ARROW_NAME="jemalloc-5.2.1.tar.bz2"
JEMALLOC_ARROW_SOURCE="jemalloc-5.2.1"
JEMALLOC_ARROW_MD5SUM="3d41fbf006e6ebffd489bdb304d009ae"
JEMALLOC_ARROW_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2"
JEMALLOC_ARROW_NAME="jemalloc-5.3.0.tar.bz2"
JEMALLOC_ARROW_SOURCE="jemalloc-5.3.0"
JEMALLOC_ARROW_MD5SUM="09a8328574dab22a7df848eae6dbbf53"

# jemalloc for doris
JEMALLOC_DORIS_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2"
Expand Down Expand Up @@ -398,11 +411,11 @@ BENCHMARK_SOURCE=benchmark-1.8.0
BENCHMARK_MD5SUM="8ddf8571d3f6198d37852bcbd964f817"

# xsimd
# for arrow-7.0.0, if arrow upgrade, this version may also need to be changed
XSIMD_DOWNLOAD="https://github.com/xtensor-stack/xsimd/archive/aeec9c872c8b475dedd7781336710f2dd2666cb2.tar.gz"
XSIMD_NAME=xsimd-aeec9c872c8b475dedd7781336710f2dd2666cb2.tar.gz
XSIMD_SOURCE=xsimd-aeec9c872c8b475dedd7781336710f2dd2666cb2
XSIMD_MD5SUM="d024855f71c0a2837a6918c0f8f66245"
# Pinned for arrow-13.0.0; if arrow is upgraded, this version may also need to change.
XSIMD_DOWNLOAD="https://github.com/xtensor-stack/xsimd/archive/refs/tags/9.0.1.tar.gz"
XSIMD_NAME="xsimd-9.0.1.tar.gz"
XSIMD_SOURCE=xsimd-9.0.1
XSIMD_MD5SUM="59f38fe3364acd7ed137771258812d6c"

# simdjson
SIMDJSON_DOWNLOAD="https://github.com/simdjson/simdjson/archive/refs/tags/v3.0.1.tar.gz"
Expand Down Expand Up @@ -505,6 +518,8 @@ export TP_ARCHIVES=(
'CYRUS_SASL'
'LIBRDKAFKA'
'FLATBUFFERS'
'CARES'
'GRPC'
'ARROW'
'BROTLI'
'ZSTD'
Expand Down
Loading