From f44c9c76a97af0bca84f0ecfc3f6e7019f4e3679 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Jul 2019 09:01:43 +0200 Subject: [PATCH 01/14] ARROW-5817: [Python] Use pytest mark for flight tests https://issues.apache.org/jira/browse/ARROW-5817 Author: Joris Van den Bossche Closes #4810 from jorisvandenbossche/ARROW-5817 and squashes the following commits: b6eb5420e fix py2 super syntax e66f7c400 ARROW-5817: use pytest mark for flight tests --- docs/source/developers/python.rst | 1 + python/pyarrow/tests/conftest.py | 13 ++++++-- python/pyarrow/tests/test_flight.py | 50 ++++++++++++++++++++--------- 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/docs/source/developers/python.rst b/docs/source/developers/python.rst index 4691d2e0f0ca7..f25e030fd0e9a 100644 --- a/docs/source/developers/python.rst +++ b/docs/source/developers/python.rst @@ -94,6 +94,7 @@ The test groups currently include: * ``plasma``: Plasma Object Store tests * ``s3``: Tests for Amazon S3 * ``tensorflow``: Tests that involve TensorFlow +* ``flight``: Flight RPC tests Benchmarking ------------ diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 49075575ce592..21e15283f296d 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -47,7 +47,8 @@ 'parquet', 'plasma', 's3', - 'tensorflow' + 'tensorflow', + 'flight' ] @@ -61,7 +62,8 @@ 'parquet': False, 'plasma': False, 's3': False, - 'tensorflow': False + 'tensorflow': False, + 'flight': False, } try: @@ -104,6 +106,13 @@ pass +try: + import pyarrow.flight # noqa + defaults['flight'] = True +except ImportError: + pass + + def pytest_configure(config): for mark in groups: config.addinivalue_line( diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index a8973ab7343d5..ae4bbd7082445 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -32,7 +32,25 @@ from pyarrow.compat import tobytes from pyarrow.util import pathlib -flight = pytest.importorskip("pyarrow.flight") +try: + from pyarrow import flight + from pyarrow.flight import ( + FlightServerBase, ServerAuthHandler, ClientAuthHandler + ) +except ImportError: + flight = None + FlightServerBase = object + ServerAuthHandler, ClientAuthHandler = object, object + + +# Marks all of the tests in this module +# Ignore these with pytest ... -m 'not flight' +pytestmark = pytest.mark.flight + + +def test_import(): + # So we see the ImportError somewhere + import pyarrow.flight # noqa def resource_root(): @@ -92,7 +110,7 @@ def simple_dicts_table(): return pa.Table.from_arrays(data, names=['some_dicts']) -class ConstantFlightServer(flight.FlightServerBase): +class ConstantFlightServer(FlightServerBase): """A Flight server that always returns the same data. 
See ARROW-4796: this server implementation will segfault if Flight @@ -114,7 +132,7 @@ def do_get(self, context, ticket): return flight.RecordBatchStream(table) -class MetadataFlightServer(flight.FlightServerBase): +class MetadataFlightServer(FlightServerBase): """A Flight server that numbers incoming/outgoing data.""" def do_get(self, context, ticket): @@ -151,7 +169,7 @@ def number_batches(table): yield batch, buf -class EchoFlightServer(flight.FlightServerBase): +class EchoFlightServer(FlightServerBase): """A Flight server that returns the last data uploaded.""" def __init__(self, expected_schema=None): @@ -185,7 +203,7 @@ def do_action(self, context, action): raise NotImplementedError -class GetInfoFlightServer(flight.FlightServerBase): +class GetInfoFlightServer(FlightServerBase): """A Flight server that tests GetFlightInfo.""" def get_flight_info(self, context, descriptor): @@ -204,7 +222,7 @@ def get_flight_info(self, context, descriptor): ) -class CheckTicketFlightServer(flight.FlightServerBase): +class CheckTicketFlightServer(FlightServerBase): """A Flight server that compares the given ticket to an expected value.""" def __init__(self, expected_ticket): @@ -221,7 +239,7 @@ def do_put(self, context, descriptor, reader): self.last_message = reader.read_all() -class InvalidStreamFlightServer(flight.FlightServerBase): +class InvalidStreamFlightServer(FlightServerBase): """A Flight server that tries to return messages with differing schemas.""" schema = pa.schema([('a', pa.int32())]) @@ -237,7 +255,7 @@ def do_get(self, context, ticket): return flight.GeneratorStream(self.schema, [table1, table2]) -class SlowFlightServer(flight.FlightServerBase): +class SlowFlightServer(FlightServerBase): """A Flight server that delays its responses to test timeouts.""" def do_get(self, context, ticket): @@ -258,11 +276,11 @@ def slow_stream(): yield pa.Table.from_arrays(data1, names=['a']) -class HttpBasicServerAuthHandler(flight.ServerAuthHandler): +class HttpBasicServerAuthHandler(ServerAuthHandler): """An example implementation of HTTP basic authentication.""" def __init__(self, creds): - super().__init__() + super(HttpBasicServerAuthHandler, self).__init__() self.creds = creds def authenticate(self, outgoing, incoming): @@ -280,11 +298,11 @@ def is_valid(self, token): return username -class HttpBasicClientAuthHandler(flight.ClientAuthHandler): +class HttpBasicClientAuthHandler(ClientAuthHandler): """An example implementation of HTTP basic authentication.""" def __init__(self, username, password): - super().__init__() + super(HttpBasicClientAuthHandler, self).__init__() self.username = tobytes(username) self.password = tobytes(password) @@ -295,11 +313,11 @@ def get_token(self): return base64.b64encode(self.username + b':' + self.password) -class TokenServerAuthHandler(flight.ServerAuthHandler): +class TokenServerAuthHandler(ServerAuthHandler): """An example implementation of authentication via handshake.""" def __init__(self, creds): - super().__init__() + super(TokenServerAuthHandler, self).__init__() self.creds = creds def authenticate(self, outgoing, incoming): @@ -317,11 +335,11 @@ def is_valid(self, token): return token[7:] -class TokenClientAuthHandler(flight.ClientAuthHandler): +class TokenClientAuthHandler(ClientAuthHandler): """An example implementation of authentication via handshake.""" def __init__(self, username, password): - super().__init__() + super(TokenClientAuthHandler, self).__init__() self.username = username self.password = password self.token = b'' From 
ebb8031944fb90f5726ec1d611b0abd8c4fe1075 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Fri, 5 Jul 2019 09:03:04 +0200
Subject: [PATCH 02/14] ARROW-5838: [C++] Delegate OPENSSL_ROOT_DIR to bundled gRPC

Author: Sutou Kouhei

Closes #4795 from kou/cpp-macos-grpc-openssl and squashes the following commits:

6014d4573 Delegate OPENSSL_ROOT_DIR to bundled gRPC
---
 cpp/cmake_modules/ThirdpartyToolchain.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index b91312376fbbb..a8595e96afa39 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -2140,6 +2140,9 @@ macro(build_grpc)
       -DCMAKE_INSTALL_LIBDIR=lib
       "-DProtobuf_PROTOC_LIBRARY=${GRPC_Protobuf_PROTOC_LIBRARY}"
       -DBUILD_SHARED_LIBS=OFF)
+  if(OPENSSL_ROOT_DIR)
+    list(APPEND GRPC_CMAKE_ARGS -DOPENSSL_ROOT_DIR=${OPENSSL_ROOT_DIR})
+  endif()

   # XXX the gRPC git checkout is huge and takes a long time
   # Ideally, we should be able to use the tarballs, but they don't contain

From ad2539d24fd68300dc509be96d75335defc37472 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Fri, 5 Jul 2019 09:05:42 +0200
Subject: [PATCH 03/14] ARROW-5828: [C++] Add required Protocol Buffers versions check

This makes Protobuf_SOURCE=AUTO work well on environments that have old Protocol Buffers.

Author: Sutou Kouhei

Closes #4785 from kou/cpp-protobuf-version-check and squashes the following commits:

b4e9a8859 Add required Protocol Buffers versions check
---
 cpp/cmake_modules/ThirdpartyToolchain.cmake | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index a8595e96afa39..2c4a4971dea87 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -180,6 +180,19 @@ macro(resolve_dependency DEPENDENCY_NAME)
   endif()
 endmacro()

+macro(resolve_dependency_with_version DEPENDENCY_NAME REQUIRED_VERSION)
+  if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO")
+    find_package(${DEPENDENCY_NAME} ${REQUIRED_VERSION} MODULE)
+    if(NOT ${${DEPENDENCY_NAME}_FOUND})
+      build_dependency(${DEPENDENCY_NAME})
+    endif()
+  elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "BUNDLED")
+    build_dependency(${DEPENDENCY_NAME})
+  elseif(${DEPENDENCY_NAME}_SOURCE STREQUAL "SYSTEM")
+    find_package(${DEPENDENCY_NAME} ${REQUIRED_VERSION} REQUIRED)
+  endif()
+endmacro()
+
 # ----------------------------------------------------------------------
 # Thirdparty versions, environment variables, source URLs

@@ -1282,7 +1295,12 @@ macro(build_protobuf)
 endmacro()

 if(ARROW_WITH_PROTOBUF)
-  resolve_dependency(Protobuf)
+  if(ARROW_WITH_GRPC)
+    set(ARROW_PROTOBUF_REQUIRED_VERSION "3.6.0")
+  else()
+    set(ARROW_PROTOBUF_REQUIRED_VERSION "2.6.1")
+  endif()
+  resolve_dependency_with_version(Protobuf ${ARROW_PROTOBUF_REQUIRED_VERSION})

   if(ARROW_PROTOBUF_USE_SHARED AND MSVC)
     add_definitions(-DPROTOBUF_USE_DLLS)

From 0028b2b65a87c07582ce7da9d53970305022dd26 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Fri, 5 Jul 2019 09:08:47 +0200
Subject: [PATCH 04/14] ARROW-5775: [C++] Fix thread-unsafe cached data

Author: Antoine Pitrou

Closes #4791 from pitrou/ARROW-5775-boxed-fields and squashes the following commits:

9c2a33319 Add "inline"
6fe0b860d ARROW-5775: Fix thread-unsafe cached data
---
 cpp/CMakeLists.txt | 1 +
 cpp/src/arrow/array.cc | 24 +++++------
 cpp/src/arrow/array.h | 3 --
 cpp/src/arrow/python/deserialize.cc | 2 +-
cpp/src/arrow/record_batch.cc | 11 +++-- cpp/src/arrow/util/atomic_shared_ptr.h | 57 ++++++++++++++++++++++++++ 6 files changed, 77 insertions(+), 21 deletions(-) create mode 100644 cpp/src/arrow/util/atomic_shared_ptr.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 82585d05d3dbc..93b5f0199c163 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,7 @@ endif() message(STATUS "Arrow version: " "${ARROW_VERSION_MAJOR}.${ARROW_VERSION_MINOR}.${ARROW_VERSION_PATCH} " "(full: '${ARROW_VERSION}')") +message(STATUS "Arrow SO version: ${ARROW_SO_VERSION} (full: ${ARROW_FULL_SO_VERSION})") set(ARROW_SOURCE_DIR ${PROJECT_SOURCE_DIR}) set(ARROW_BINARY_DIR ${PROJECT_BINARY_DIR}) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 0f63aba11d3d1..bc38559b91b17 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -31,6 +31,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/atomic_shared_ptr.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" @@ -530,7 +531,8 @@ const StructType* StructArray::struct_type() const { } std::shared_ptr StructArray::field(int i) const { - if (!boxed_fields_[i]) { + std::shared_ptr result = internal::atomic_load(&boxed_fields_[i]); + if (!result) { std::shared_ptr field_data; if (data_->offset != 0 || data_->child_data[i]->length != data_->length) { field_data = std::make_shared( @@ -538,9 +540,10 @@ std::shared_ptr StructArray::field(int i) const { } else { field_data = data_->child_data[i]; } - boxed_fields_[i] = MakeArray(field_data); + result = MakeArray(field_data); + internal::atomic_store(&boxed_fields_[i], result); } - return boxed_fields_[i]; + return result; } std::shared_ptr StructArray::GetFieldByName(const std::string& name) const { @@ -709,7 +712,8 @@ Status UnionArray::MakeSparse(const Array& type_ids, } std::shared_ptr UnionArray::child(int i) const { - if (!boxed_fields_[i]) { + std::shared_ptr result = internal::atomic_load(&boxed_fields_[i]); + if (!result) { std::shared_ptr child_data = data_->child_data[i]->Copy(); if (mode() == UnionMode::SPARSE) { // Sparse union: need to adjust child if union is sliced @@ -719,16 +723,10 @@ std::shared_ptr UnionArray::child(int i) const { *child_data = child_data->Slice(data_->offset, data_->length); } } - boxed_fields_[i] = MakeArray(child_data); + result = MakeArray(child_data); + internal::atomic_store(&boxed_fields_[i], result); } - return boxed_fields_[i]; -} - -const Array* UnionArray::UnsafeChild(int i) const { - if (!boxed_fields_[i]) { - boxed_fields_[i] = MakeArray(data_->child_data[i]); - } - return boxed_fields_[i].get(); + return result; } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 2a1ce7aae6b04..256bbdc6c67bc 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -1042,9 +1042,6 @@ class ARROW_EXPORT UnionArray : public Array { // For dense unions, the returned array is unchanged. 
std::shared_ptr child(int pos) const; - /// Only use this while the UnionArray is in scope - const Array* UnsafeChild(int pos) const; - protected: void SetData(const std::shared_ptr& data); diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc index 5e6e135890683..45f7d61890e45 100644 --- a/cpp/src/arrow/python/deserialize.cc +++ b/cpp/src/arrow/python/deserialize.cc @@ -235,7 +235,7 @@ Status DeserializeSequence(PyObject* context, const Array& array, int64_t start_ int64_t offset = value_offsets[i]; uint8_t type = type_ids[i]; PyObject* value; - RETURN_NOT_OK(GetValue(context, *data.UnsafeChild(type), offset, + RETURN_NOT_OK(GetValue(context, *data.child(type), offset, python_types[type_ids[i]], base, blobs, &value)); RETURN_NOT_OK(set_item(result.obj(), i - start_idx, value)); } diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 1f266df4a6a63..f83a6cd2771fe 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -18,6 +18,7 @@ #include "arrow/record_batch.h" #include +#include #include #include #include @@ -27,6 +28,7 @@ #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type.h" +#include "arrow/util/atomic_shared_ptr.h" #include "arrow/util/logging.h" #include "arrow/util/stl.h" @@ -85,11 +87,12 @@ class SimpleRecordBatch : public RecordBatch { } std::shared_ptr column(int i) const override { - if (!boxed_columns_[i]) { - boxed_columns_[i] = MakeArray(columns_[i]); + std::shared_ptr result = internal::atomic_load(&boxed_columns_[i]); + if (!result) { + result = MakeArray(columns_[i]); + internal::atomic_store(&boxed_columns_[i], result); } - DCHECK(boxed_columns_[i]); - return boxed_columns_[i]; + return result; } std::shared_ptr column_data(int i) const override { return columns_[i]; } diff --git a/cpp/src/arrow/util/atomic_shared_ptr.h b/cpp/src/arrow/util/atomic_shared_ptr.h new file mode 100644 index 0000000000000..9f3152bafd5ce --- /dev/null +++ b/cpp/src/arrow/util/atomic_shared_ptr.h @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +namespace arrow { +namespace internal { + +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 + +// atomic shared_ptr operations only appeared in gcc 5, +// emulate them with unsafe ops on gcc 4.x. 
+ +template +inline std::shared_ptr atomic_load(const std::shared_ptr* p) { + return *p; +} + +template +inline void atomic_store(std::shared_ptr* p, std::shared_ptr r) { + *p = r; +} + +#else + +template +inline std::shared_ptr atomic_load(const std::shared_ptr* p) { + return std::atomic_load(p); +} + +template +inline void atomic_store(std::shared_ptr* p, std::shared_ptr r) { + std::atomic_store(p, std::move(r)); +} + +#endif + +} // namespace internal +} // namespace arrow From 4319066360d799c7b3632619c32a1c99caec20a2 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 5 Jul 2019 09:16:07 +0200 Subject: [PATCH 05/14] ARROW-5833: [C++] Factor out Status-enriching code Author: Antoine Pitrou Closes #4806 from pitrou/ARROW-5833-status-copying and squashes the following commits: 3db9c3fb7 ARROW-5833: Factor out Status-enriching code --- cpp/src/arrow/compute/kernels/cast.cc | 32 +++++++++++++-------------- cpp/src/arrow/csv/column-builder.cc | 4 ++-- cpp/src/arrow/status.h | 10 +++++++-- python/pyarrow/tests/test_csv.py | 2 +- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 93feb656dd582..ef40f08623115 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -46,26 +45,25 @@ #ifdef ARROW_EXTRA_ERROR_CONTEXT -#define FUNC_RETURN_NOT_OK(s) \ - do { \ - Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - std::stringstream ss; \ - ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \ - ctx->SetStatus(Status(_s.code(), ss.str(), s.detail())); \ - return; \ - } \ +#define FUNC_RETURN_NOT_OK(expr) \ + do { \ + Status _st = (expr); \ + if (ARROW_PREDICT_FALSE(!_st.ok())) { \ + _st.AddContextLine(__FILE__, __LINE__, #expr); \ + ctx->SetStatus(_st); \ + return; \ + } \ } while (0) #else -#define FUNC_RETURN_NOT_OK(s) \ - do { \ - Status _s = (s); \ - if (ARROW_PREDICT_FALSE(!_s.ok())) { \ - ctx->SetStatus(_s); \ - return; \ - } \ +#define FUNC_RETURN_NOT_OK(expr) \ + do { \ + Status _st = (expr); \ + if (ARROW_PREDICT_FALSE(!_st.ok())) { \ + ctx->SetStatus(_st); \ + return; \ + } \ } while (0) #endif // ARROW_EXTRA_ERROR_CONTEXT diff --git a/cpp/src/arrow/csv/column-builder.cc b/cpp/src/arrow/csv/column-builder.cc index 4099507016d3b..cfc36fe610910 100644 --- a/cpp/src/arrow/csv/column-builder.cc +++ b/cpp/src/arrow/csv/column-builder.cc @@ -75,8 +75,8 @@ class TypedColumnBuilder : public ColumnBuilder { return st; } else { std::stringstream ss; - ss << "In column #" << col_index_ << ": " << st.message(); - return Status(st.code(), ss.str(), st.detail()); + ss << "In CSV column #" << col_index_ << ": " << st.message(); + return st.WithMessage(ss.str()); } } diff --git a/cpp/src/arrow/status.h b/cpp/src/arrow/status.h index 7cafc41902df2..b69040949ade9 100644 --- a/cpp/src/arrow/status.h +++ b/cpp/src/arrow/status.h @@ -303,12 +303,18 @@ class ARROW_EXPORT Status { return state_ == NULLPTR ? NULLPTR : state_->detail; } - /// \brief Returns a new Status copying the existing status, but + /// \brief Return a new Status copying the existing status, but /// updating with the existing detail. 
-  Status WithDetail(std::shared_ptr new_detail) {
+  Status WithDetail(std::shared_ptr new_detail) const {
     return Status(code(), message(), std::move(new_detail));
   }

+  /// \brief Return a new Status with changed message, copying the
+  /// existing status code and detail.
+  Status WithMessage(std::string message) const {
+    return Status(code(), std::move(message), detail());
+  }
+
   [[noreturn]] void Abort() const;
   [[noreturn]] void Abort(const std::string& message) const;

diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
index df4d0a5b55c3e..57a023237a42a 100644
--- a/python/pyarrow/tests/test_csv.py
+++ b/python/pyarrow/tests/test_csv.py
@@ -376,7 +376,7 @@ def test_column_types(self):
         with pytest.raises(pa.ArrowInvalid) as exc:
             self.read_bytes(rows, convert_options=opts)
         err = str(exc.value)
-        assert "In column #1: " in err
+        assert "In CSV column #1: " in err
         assert "CSV conversion error to float: invalid value 'XXX'" in err

     def test_no_ending_newline(self):

From 9ffb9cdd46c498a2f46a8d030613e7c046b6e843 Mon Sep 17 00:00:00 2001
From: Yosuke Shiro
Date: Fri, 5 Jul 2019 16:38:14 +0900
Subject: [PATCH 06/14] ARROW-5784: [Release][GLib] Replace c_glib/ after running c_glib/autogen.sh in dev/release/02-source.sh

The c_glib/ source archive is generated by `make dist` because it includes the configure script.
The current `dev/release/02-source.sh` builds Arrow C++ and Arrow GLib to include the artifacts of GTK-Doc and then runs `make dist`. But it is slow.
So this PR runs only `c_glib/autogen.sh` and then replaces c_glib/.

Author: Yosuke Shiro
Author: Sutou Kouhei

Closes #4749 from shiro615/release-replace-c-glib-after-running-autogen and squashes the following commits:

9a69f8edc Remove an unnecessary environment variable
3a2550fbd Remove omit from 02-source-test.rb
501a2dd97 Remove autom4te.cache after running autogen.sh
46a4f8995 Use docker-compose
e357a88b7 Exclude c_glib/autom4te.cache/* from RAT check
70cb4a762 Remove an unnecessary diff
aa786804d Enable test test_glib_configure on Travis CI
e04276e33 Remove libraries for C++ build
56098ae87 Replace c_glib/ by c_glib/ after running autogen.sh
---
 dev/release/02-source-test.rb | 4 ----
 dev/release/02-source.sh | 26 +++++++++++---------------
 dev/release/source/Dockerfile | 25 ++-----------------------
 dev/release/source/build.sh | 33 ++++++---------------------------
 docker-compose.yml | 10 ++++++++++
 5 files changed, 29 insertions(+), 69 deletions(-)

diff --git a/dev/release/02-source-test.rb b/dev/release/02-source-test.rb
index e777c7561c449..cf3c887a1c860 100644
--- a/dev/release/02-source-test.rb
+++ b/dev/release/02-source-test.rb
@@ -55,10 +55,6 @@ def test_symbolic_links
   end

   def test_glib_configure
-    unless ENV["ARROW_TEST_SOURCE_GLIB"] == "yes"
-      omit("This takes a long time. " +
" + - "Set ARROW_TEST_SOURCE_GLIB=yes environment variable to test this.") - end source("GLIB") Dir.chdir("#{@tag_name}/c_glib") do assert_equal([ diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index edaeec9ebe7ea..ca192a8be9a14 100755 --- a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -61,26 +61,22 @@ rm -rf ${tag} git archive ${release_hash} --prefix ${tag}/) | \ tar xf - -# replace c_glib/ by tar.gz generated by "make dist" +# Replace c_glib/ after running c_glib/autogen.sh to create c_gilb/ source archive containing the configure script if [ ${SOURCE_GLIB} -gt 0 ]; then archive_name=tmp-apache-arrow (cd "${SOURCE_TOP_DIR}" && \ git archive ${release_hash} --prefix ${archive_name}/) \ - > ${archive_name}.tar - dist_c_glib_tar_gz=c_glib.tar.gz - docker_image_name=apache-arrow/release-source - DEBUG=yes docker build -t ${docker_image_name} "${SOURCE_DIR}/source" - docker \ - run \ - --rm \ - --interactive \ - --volume "$PWD":/host \ - ${docker_image_name} \ - /build.sh ${archive_name} ${dist_c_glib_tar_gz} - rm -f ${archive_name}.tar + > "${SOURCE_TOP_DIR}/${archive_name}.tar" + c_glib_including_configure_tar_gz=c_glib.tar.gz + "${SOURCE_TOP_DIR}/dev/run_docker_compose.sh" \ + release-source \ + /arrow/dev/release/source/build.sh \ + ${archive_name} \ + ${c_glib_including_configure_tar_gz} + rm -f "${SOURCE_TOP_DIR}/${archive_name}.tar" rm -rf ${tag}/c_glib - tar xf ${dist_c_glib_tar_gz} -C ${tag} - rm -f ${dist_c_glib_tar_gz} + tar xf "${SOURCE_TOP_DIR}/${c_glib_including_configure_tar_gz}" -C ${tag} + rm -f "${SOURCE_TOP_DIR}/${c_glib_including_configure_tar_gz}" fi # Resolve all hard and symbolic links diff --git a/dev/release/source/Dockerfile b/dev/release/source/Dockerfile index 9085cef3327b4..7d5453b80c456 100644 --- a/dev/release/source/Dockerfile +++ b/dev/release/source/Dockerfile @@ -15,38 +15,17 @@ # specific language governing permissions and limitations # under the License. -FROM ubuntu:18.04 +FROM debian:buster ENV DEBIAN_FRONTEND noninteractive -RUN apt-get update -y -q && \ - apt-get install -y -q --no-install-recommends wget software-properties-common gpg-agent && \ - wget --quiet -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ - apt-add-repository -y "deb http://apt.llvm.org/bionic llvm-toolchain-bionic-7 main" && \ - apt-get -y install clang-7 - RUN apt update && \ apt install -y -V \ autoconf-archive \ - bison \ - cmake \ - flex \ - g++ \ - gcc \ gtk-doc-tools \ - libboost-filesystem-dev \ - libboost-regex-dev \ - libboost-system-dev \ libgirepository1.0-dev \ libglib2.0-doc \ - libprotobuf-dev \ - libprotoc-dev \ libtool \ - lsb-release \ - make \ - pkg-config \ - protobuf-compiler && \ + pkg-config && \ apt clean && \ rm -rf /var/lib/apt/lists/* - -COPY build.sh /build.sh diff --git a/dev/release/source/build.sh b/dev/release/source/build.sh index 20c972466b4c0..558600e1fb71b 100755 --- a/dev/release/source/build.sh +++ b/dev/release/source/build.sh @@ -20,35 +20,14 @@ set -e archive_name=$1 -dist_c_glib_tar_gz=$2 +c_glib_including_configure_tar_gz=$2 -tar xf /host/${archive_name}.tar +tar xf /arrow/${archive_name}.tar -# build Apache Arrow C++ before building Apache Arrow GLib because -# Apache Arrow GLib requires Apache Arrow C++. -mkdir -p ${archive_name}/cpp/build -cpp_install_dir=${PWD}/${archive_name}/cpp/install -cd ${archive_name}/cpp/build -cmake .. 
\ - -DCMAKE_INSTALL_PREFIX=${cpp_install_dir} \ - -DCMAKE_INSTALL_LIBDIR=lib \ - -DARROW_PLASMA=yes \ - -DARROW_GANDIVA=yes \ - -DARROW_PARQUET=yes -make -j8 -make install -cd - - -# build source archive for Apache Arrow GLib by "make dist". +# Run autogen.sh to create c_glib/ source archive containing the configure script cd ${archive_name}/c_glib ./autogen.sh -./configure \ - PKG_CONFIG_PATH=${cpp_install_dir}/lib/pkgconfig \ - --enable-gtk-doc -LD_LIBRARY_PATH=${cpp_install_dir}/lib make -j8 -make dist -tar xzf *.tar.gz -rm *.tar.gz +rm -rf autom4te.cache cd - -mv ${archive_name}/c_glib/apache-arrow-glib-* c_glib/ -tar czf /host/${dist_c_glib_tar_gz} c_glib +mv ${archive_name}/c_glib/ c_glib/ +tar czf /arrow/${c_glib_including_configure_tar_gz} c_glib diff --git a/docker-compose.yml b/docker-compose.yml index 94171483c9a30..768c4e6eb72ab 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -609,6 +609,16 @@ services: # TODO(kszucs): hive-integration + ################################# Release ################################### + + release-source: + image: arrow:release-source + build: + context: . + dockerfile: dev/release/source/Dockerfile + volumes: + - .:/arrow:delegated + ######################## Verification Containers ############################ debian-stretch: From e6d033f77b0380807d1335c7a65191d16437d562 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 5 Jul 2019 08:34:09 -0400 Subject: [PATCH 07/14] ARROW-5851: [C++] Fix compilation of reference benchmarks Also fix a warning because of static variables in headers. Author: Antoine Pitrou Closes #4808 from pitrou/ARROW-5851-compile-reference-benchmarks and squashes the following commits: fb6740aea ARROW-5851: Fix compilation of reference benchmarks --- cpp/src/arrow/flight/internal.cc | 2 ++ cpp/src/arrow/flight/internal.h | 3 ++- cpp/src/arrow/flight/types.cc | 5 +++++ cpp/src/arrow/flight/types.h | 12 ++++++++---- cpp/src/arrow/util/compression-benchmark.cc | 1 + 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/flight/internal.cc b/cpp/src/arrow/flight/internal.cc index 55821495e1caa..56fc86234a318 100644 --- a/cpp/src/arrow/flight/internal.cc +++ b/cpp/src/arrow/flight/internal.cc @@ -42,6 +42,8 @@ namespace arrow { namespace flight { namespace internal { +const char* kGrpcAuthHeader = "auth-token-bin"; + Status FromGrpcStatus(const grpc::Status& grpc_status) { if (grpc_status.ok()) { return Status::OK(); diff --git a/cpp/src/arrow/flight/internal.h b/cpp/src/arrow/flight/internal.h index 5283bed21831a..a554e81ff436b 100644 --- a/cpp/src/arrow/flight/internal.h +++ b/cpp/src/arrow/flight/internal.h @@ -64,7 +64,8 @@ namespace flight { namespace internal { /// The name of the header used to pass authentication tokens. 
-static const char* kGrpcAuthHeader = "auth-token-bin"; +ARROW_FLIGHT_EXPORT +extern const char* kGrpcAuthHeader; ARROW_FLIGHT_EXPORT Status SchemaToString(const Schema& schema, std::string* out); diff --git a/cpp/src/arrow/flight/types.cc b/cpp/src/arrow/flight/types.cc index c82e68136484f..86aa2237c763d 100644 --- a/cpp/src/arrow/flight/types.cc +++ b/cpp/src/arrow/flight/types.cc @@ -31,6 +31,11 @@ namespace arrow { namespace flight { +const char* kSchemeGrpc = "grpc"; +const char* kSchemeGrpcTcp = "grpc+tcp"; +const char* kSchemeGrpcUnix = "grpc+unix"; +const char* kSchemeGrpcTls = "grpc+tls"; + bool FlightDescriptor::Equals(const FlightDescriptor& other) const { if (type != other.type) { return false; diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index abf894c88c8aa..b4c4c6ca42dd3 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -135,10 +135,14 @@ struct ARROW_FLIGHT_EXPORT Ticket { class FlightClient; class FlightServerBase; -static const char* kSchemeGrpc = "grpc"; -static const char* kSchemeGrpcTcp = "grpc+tcp"; -static const char* kSchemeGrpcUnix = "grpc+unix"; -static const char* kSchemeGrpcTls = "grpc+tls"; +ARROW_FLIGHT_EXPORT +extern const char* kSchemeGrpc; +ARROW_FLIGHT_EXPORT +extern const char* kSchemeGrpcTcp; +ARROW_FLIGHT_EXPORT +extern const char* kSchemeGrpcUnix; +ARROW_FLIGHT_EXPORT +extern const char* kSchemeGrpcTls; /// \brief A host location (a URI) struct ARROW_FLIGHT_EXPORT Location { diff --git a/cpp/src/arrow/util/compression-benchmark.cc b/cpp/src/arrow/util/compression-benchmark.cc index 28bc1255c4bbb..5700b0dd0e2ea 100644 --- a/cpp/src/arrow/util/compression-benchmark.cc +++ b/cpp/src/arrow/util/compression-benchmark.cc @@ -25,6 +25,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/compression.h" +#include "arrow/util/logging.h" namespace arrow { namespace util { From 9cbc42e48de26d022ab94aa88576179d70ae4521 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 5 Jul 2019 08:54:44 -0400 Subject: [PATCH 08/14] ARROW-4187: [C++] Enable file-benchmark on Windows Also fix take-benchmark compilation. 
Author: Antoine Pitrou Closes #4809 from pitrou/ARROW-4187-file-benchmark-windows and squashes the following commits: 46defe6af ARROW-4187: Enable file-benchmark on Windows --- .../arrow/compute/kernels/take-benchmark.cc | 9 +- cpp/src/arrow/io/file-benchmark.cc | 87 +++++++++++++++++-- 2 files changed, 86 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/take-benchmark.cc b/cpp/src/arrow/compute/kernels/take-benchmark.cc index 139e183b92f7a..d28f7af2abbd7 100644 --- a/cpp/src/arrow/compute/kernels/take-benchmark.cc +++ b/cpp/src/arrow/compute/kernels/take-benchmark.cc @@ -48,7 +48,8 @@ static void TakeInt64(benchmark::State& state) { auto values = rand.Int64(array_size, -100, 100, args.null_proportion); - auto indices = rand.Int32(array_size, 0, array_size - 1, args.null_proportion); + auto indices = rand.Int32(static_cast(array_size), 0, + static_cast(array_size - 1), args.null_proportion); TakeBenchmark(state, values, indices); } @@ -64,7 +65,8 @@ static void TakeFixedSizeList1Int64(benchmark::State& state) { fixed_size_list(int64(), 1), array_size, int_array, int_array->null_bitmap(), int_array->null_count()); - auto indices = rand.Int32(array_size, 0, array_size - 1, args.null_proportion); + auto indices = rand.Int32(static_cast(array_size), 0, + static_cast(array_size - 1), args.null_proportion); TakeBenchmark(state, values, indices); } @@ -110,7 +112,8 @@ static void TakeString(benchmark::State& state) { auto values = std::static_pointer_cast(rand.String( array_size, string_min_length, string_max_length, args.null_proportion)); - auto indices = rand.Int32(array_size, 0, array_size - 1, args.null_proportion); + auto indices = rand.Int32(static_cast(array_size), 0, + static_cast(array_size - 1), args.null_proportion); TakeBenchmark(state, values, indices); } diff --git a/cpp/src/arrow/io/file-benchmark.cc b/cpp/src/arrow/io/file-benchmark.cc index 74b92cbf3d6e6..b0880fdd485f6 100644 --- a/cpp/src/arrow/io/file-benchmark.cc +++ b/cpp/src/arrow/io/file-benchmark.cc @@ -20,6 +20,8 @@ #include "arrow/io/file.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/io-util.h" +#include "arrow/util/logging.h" +#include "arrow/util/windows_compatibility.h" #include "benchmark/benchmark.h" @@ -30,7 +32,11 @@ #include #include -#ifndef _WIN32 +#ifdef _WIN32 + +#include + +#else #include #include @@ -40,17 +46,82 @@ namespace arrow { -#ifndef _WIN32 - -std::string GetNullFile() { return "/dev/null"; } +std::string GetNullFile() { +#ifdef _WIN32 + return "NUL"; +#else + return "/dev/null"; +#endif +} const std::valarray small_sizes = {8, 24, 33, 1, 32, 192, 16, 40}; const std::valarray large_sizes = {8192, 100000}; constexpr int64_t kBufferSize = 4096; +#ifdef _WIN32 + +class BackgroundReader { + // A class that reads data in the background from a file descriptor + // (Windows implementation) + + public: + static std::shared_ptr StartReader(int fd) { + std::shared_ptr reader(new BackgroundReader(fd)); + reader->worker_.reset(new std::thread([=] { reader->LoopReading(); })); + return reader; + } + void Stop() { ARROW_CHECK(SetEvent(event_)); } + void Join() { worker_->join(); } + + ~BackgroundReader() { + ABORT_NOT_OK(internal::FileClose(fd_)); + ARROW_CHECK(CloseHandle(event_)); + } + + protected: + explicit BackgroundReader(int fd) : fd_(fd), total_bytes_(0) { + file_handle_ = reinterpret_cast(_get_osfhandle(fd)); + ARROW_CHECK_NE(file_handle_, INVALID_HANDLE_VALUE); + event_ = + CreateEvent(nullptr, /* bManualReset=*/TRUE, /* bInitialState=*/FALSE, nullptr); + 
ARROW_CHECK_NE(event_, INVALID_HANDLE_VALUE); + } + + void LoopReading() { + const HANDLE handles[] = {file_handle_, event_}; + while (true) { + DWORD ret = WaitForMultipleObjects(2, handles, /* bWaitAll=*/FALSE, INFINITE); + ARROW_CHECK_NE(ret, WAIT_FAILED); + if (ret == WAIT_OBJECT_0 + 1) { + // Got stop request + break; + } else if (ret == WAIT_OBJECT_0) { + // File ready for reading + int64_t bytes_read; + ARROW_CHECK_OK(internal::FileRead(fd_, buffer_, buffer_size_, &bytes_read)); + total_bytes_ += bytes_read; + } else { + ARROW_LOG(FATAL) << "Unexpected WaitForMultipleObjects return value " << ret; + } + } + } + + int fd_; + HANDLE file_handle_, event_; + int64_t total_bytes_; + + static const int64_t buffer_size_ = 16384; + uint8_t buffer_[buffer_size_]; + + std::unique_ptr worker_; +}; + +#else + class BackgroundReader { // A class that reads data in the background from a file descriptor + // (Unix implementation) public: static std::shared_ptr StartReader(int fd) { @@ -116,6 +187,8 @@ class BackgroundReader { std::unique_ptr worker_; }; +#endif + // Set up a pipe with an OutputStream at one end and a BackgroundReader at // the other end. static void SetupPipeWriter(std::shared_ptr* stream, @@ -139,6 +212,9 @@ static void BenchmarkStreamingWrites(benchmark::State& state, ABORT_NOT_OK(stream->Write(data, size)); } } + // For Windows: need to close writer before joining reader thread. + ABORT_NOT_OK(stream->Close()); + const int64_t total_bytes = static_cast(state.iterations()) * sum_sizes; state.SetBytesProcessed(total_bytes); @@ -147,7 +223,6 @@ static void BenchmarkStreamingWrites(benchmark::State& state, reader->Stop(); reader->Join(); } - ABORT_NOT_OK(stream->Close()); } // Benchmark writing to /dev/null @@ -232,6 +307,4 @@ BENCHMARK(BufferedOutputStreamSmallWritesToNull)->UseRealTime(); BENCHMARK(BufferedOutputStreamSmallWritesToPipe)->UseRealTime(); BENCHMARK(BufferedOutputStreamLargeWritesToPipe)->UseRealTime(); -#endif // ifndef _WIN32 - } // namespace arrow From 094ce087dceeb655c554b029203d73fe1ed2918c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Fri, 5 Jul 2019 09:02:01 -0400 Subject: [PATCH 09/14] ARROW-5849: [C++] Fix compiler warnings on mingw32 Author: Antoine Pitrou Closes #4804 from pitrou/ARROW-5849-mingw-warnings and squashes the following commits: ff48a18f0 ARROW-5849: Fix compiler warnings on mingw32 --- cpp/src/arrow/array-test.cc | 3 +-- cpp/src/arrow/compute/kernels/cast-test.cc | 2 +- cpp/src/arrow/io/mman.h | 25 +++++++--------------- cpp/src/arrow/util/decimal.cc | 2 +- cpp/src/arrow/util/io-util.cc | 7 +++--- 5 files changed, 15 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 2005a0db562b5..df8b85262ff49 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1300,7 +1300,6 @@ TEST_F(TestFWBinaryArray, ZeroSize) { ASSERT_OK(builder.Append("")); ASSERT_OK(builder.Append(std::string())); - ASSERT_OK(builder.Append(static_cast(nullptr))); ASSERT_OK(builder.AppendNull()); ASSERT_OK(builder.AppendNull()); ASSERT_OK(builder.AppendNull()); @@ -1314,7 +1313,7 @@ TEST_F(TestFWBinaryArray, ZeroSize) { ASSERT_EQ(fw_array.values()->size(), 0); ASSERT_EQ(0, fw_array.byte_width()); - ASSERT_EQ(6, array->length()); + ASSERT_EQ(5, array->length()); ASSERT_EQ(3, array->null_count()); } diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc index 510b9a48c733b..6bf4f941755c3 100644 --- a/cpp/src/arrow/compute/kernels/cast-test.cc +++ 
b/cpp/src/arrow/compute/kernels/cast-test.cc @@ -263,7 +263,7 @@ std::vector UnsafeVectorCast(const std::vector& v) { for (size_t i = 0; i < v.size(); i++) result[i] = static_cast(v[i]); - return std::move(result); + return result; } TEST_F(TestCast, IntegerSignedToUnsigned) { diff --git a/cpp/src/arrow/io/mman.h b/cpp/src/arrow/io/mman.h index 6125492560967..5826891a60b0a 100644 --- a/cpp/src/arrow/io/mman.h +++ b/cpp/src/arrow/io/mman.h @@ -14,6 +14,8 @@ #include #include +#include + #define PROT_NONE 0 #define PROT_READ 1 #define PROT_WRITE 2 @@ -75,28 +77,17 @@ static inline void* mmap(void* addr, size_t len, int prot, int flags, int fildes HANDLE fm, h; void* map = MAP_FAILED; + const uint64_t off64 = static_cast(off); + const uint64_t maxSize = off64 + len; -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4293) -#endif + const DWORD dwFileOffsetLow = static_cast(off64 & 0xFFFFFFFFUL); + const DWORD dwFileOffsetHigh = static_cast((off64 >> 32) & 0xFFFFFFFFUL); + const DWORD dwMaxSizeLow = static_cast(maxSize & 0xFFFFFFFFUL); + const DWORD dwMaxSizeHigh = static_cast((maxSize >> 32) & 0xFFFFFFFFUL); - const DWORD dwFileOffsetLow = - (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); - const DWORD dwFileOffsetHigh = - (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL); const DWORD protect = __map_mmap_prot_page(prot); const DWORD desiredAccess = __map_mmap_prot_file(prot); - const size_t maxSize = off + len; - - const DWORD dwMaxSizeLow = static_cast(maxSize & 0xFFFFFFFFL); - const DWORD dwMaxSizeHigh = static_cast((maxSize >> 32) & 0xFFFFFFFFL); - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - errno = 0; if (len == 0 diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc index 46928b20e0936..00beffd2450c5 100644 --- a/cpp/src/arrow/util/decimal.cc +++ b/cpp/src/arrow/util/decimal.cc @@ -214,7 +214,7 @@ namespace { struct DecimalComponents { util::string_view whole_digits; util::string_view fractional_digits; - int32_t exponent; + int32_t exponent = 0; char sign = 0; bool has_exponent = false; }; diff --git a/cpp/src/arrow/util/io-util.cc b/cpp/src/arrow/util/io-util.cc index c092769d81fde..58072b613810f 100644 --- a/cpp/src/arrow/util/io-util.cc +++ b/cpp/src/arrow/util/io-util.cc @@ -564,8 +564,9 @@ Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes, return StatusFromErrno("Cannot get file handle: "); } - LONG new_size_low = static_cast(new_size & 0xFFFFFFFFL); - LONG new_size_high = static_cast((new_size >> 32) & 0xFFFFFFFFL); + uint64_t new_size64 = new_size; + LONG new_size_low = static_cast(new_size64 & 0xFFFFFFFFUL); + LONG new_size_high = static_cast((new_size64 >> 32) & 0xFFFFFFFFUL); SetFilePointer(h, new_size_low, &new_size_high, FILE_BEGIN); SetEndOfFile(h); @@ -892,7 +893,7 @@ Status TemporaryDir::Make(const std::string& prefix, std::unique_ptr Date: Fri, 5 Jul 2019 09:56:18 -0700 Subject: [PATCH 10/14] ARROW-5846: [Java] Create Avro adapter module and add dependencies Related to [ARROW-5846](https://issues.apache.org/jira/browse/ARROW-5846). 
Author: tianchen Closes #4800 from tianchen92/ARROW-5846 and squashes the following commits: 810784750 fix 2 2c5ff6622 fix de49b3af8 add test 79c04c879 update maven-compiler-plugin version d60cc8b89 update version 2a060fbf3 ARROW-5846: Create Avro adapter module and add dependencies --- java/adapter/avro/pom.xml | 38 ++++++++ .../apache/arrow/TestWriteReadAvroRecord.java | 94 +++++++++++++++++++ .../avro/src/test/resources/schema/test.avsc | 26 +++++ java/pom.xml | 1 + 4 files changed, 159 insertions(+) create mode 100644 java/adapter/avro/pom.xml create mode 100644 java/adapter/avro/src/test/java/org/apache/arrow/TestWriteReadAvroRecord.java create mode 100644 java/adapter/avro/src/test/resources/schema/test.avsc diff --git a/java/adapter/avro/pom.xml b/java/adapter/avro/pom.xml new file mode 100644 index 0000000000000..4e13d57fb34a1 --- /dev/null +++ b/java/adapter/avro/pom.xml @@ -0,0 +1,38 @@ + + + + + 4.0.0 + + + org.apache.arrow + arrow-java-root + 1.0.0-SNAPSHOT + ../../pom.xml + + + arrow-avro + Arrow AVRO Adapter + http://maven.apache.org + + + + + org.apache.avro + avro + 1.9.0 + + + + diff --git a/java/adapter/avro/src/test/java/org/apache/arrow/TestWriteReadAvroRecord.java b/java/adapter/avro/src/test/java/org/apache/arrow/TestWriteReadAvroRecord.java new file mode 100644 index 0000000000000..aa5c45a52e298 --- /dev/null +++ b/java/adapter/avro/src/test/java/org/apache/arrow/TestWriteReadAvroRecord.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; + +import org.apache.avro.Schema; +import org.apache.avro.file.DataFileReader; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.DatumWriter; + +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + + +public class TestWriteReadAvroRecord { + + @ClassRule + public static final TemporaryFolder TMP = new TemporaryFolder(); + + @Test + public void testWriteAndRead() throws Exception { + + File dataFile = TMP.newFile(); + Path schemaPath = Paths.get(TestWriteReadAvroRecord.class.getResource("/").getPath(), "schema", "test.avsc"); + Schema schema = new Schema.Parser().parse(schemaPath.toFile()); + + //write data to disk + GenericRecord user1 = new GenericData.Record(schema); + user1.put("name", "Alyssa"); + user1.put("favorite_number", 256); + + GenericRecord user2 = new GenericData.Record(schema); + user2.put("name", "Ben"); + user2.put("favorite_number", 7); + user2.put("favorite_color", "red"); + + DatumWriter datumWriter = new GenericDatumWriter(schema); + DataFileWriter dataFileWriter = new DataFileWriter(datumWriter); + dataFileWriter.create(schema, dataFile); + dataFileWriter.append(user1); + dataFileWriter.append(user2); + dataFileWriter.close(); + + //read data from disk + DatumReader datumReader = new GenericDatumReader(schema); + DataFileReader + dataFileReader = new DataFileReader(dataFile, datumReader); + List result = new ArrayList<>(); + while (dataFileReader.hasNext()) { + GenericRecord user = dataFileReader.next(); + result.add(user); + } + + assertEquals(2, result.size()); + GenericRecord deUser1 = result.get(0); + assertEquals("Alyssa", deUser1.get("name").toString()); + assertEquals(256, deUser1.get("favorite_number")); + assertEquals(null, deUser1.get("favorite_color")); + + GenericRecord deUser2 = result.get(1); + assertEquals("Ben", deUser2.get("name").toString()); + assertEquals(7, deUser2.get("favorite_number")); + assertEquals("red", deUser2.get("favorite_color").toString()); + } + +} diff --git a/java/adapter/avro/src/test/resources/schema/test.avsc b/java/adapter/avro/src/test/resources/schema/test.avsc new file mode 100644 index 0000000000000..15fdd76361b92 --- /dev/null +++ b/java/adapter/avro/src/test/resources/schema/test.avsc @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +{"namespace": "org.apache.arrow.avro", + "type": "record", + "name": "User", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "favorite_number", "type": ["int", "null"]}, + {"name": "favorite_color", "type": ["string", "null"]} + ] +} diff --git a/java/pom.xml b/java/pom.xml index 1358f5f21ebee..29d602ddb4088 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -655,6 +655,7 @@ flight performance algorithm + adapter/avro From c037ee2be7c8065032494ba7e052941edaf3ac61 Mon Sep 17 00:00:00 2001 From: tianchen Date: Fri, 5 Jul 2019 10:25:26 -0700 Subject: [PATCH 11/14] ARROW-5834: [Java] Apply new hash map in DictionaryEncoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related to [ARROW-5834](https://issues.apache.org/jira/browse/ARROW-5834). Apply new hash map in DictionaryEncoder to make it work. Meanwhile provide benchmark for DictionaryEncoder: DictionaryEncoderBenchmarks#testEncode: Before: 5 430860.989 ± 14157.675 ns/op After: 5 415703.943 ± 9258.049 ns/op Author: tianchen Closes #4786 from tianchen92/ARROW-5834 and squashes the following commits: dce8b6ae6 fix 43f7695a2 Apply new hash map in DictionaryEncoder --- .../DictionaryEncoderBenchmarks.java | 149 ++++++++++++++++++ .../vector/dictionary/DictionaryEncoder.java | 9 +- 2 files changed, 152 insertions(+), 6 deletions(-) create mode 100644 java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java new file mode 100644 index 0000000000000..047807c2ba195 --- /dev/null +++ b/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.dictionary; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.junit.Test; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +/** + * Benchmarks for {@link DictionaryEncoder}. + */ +@State(Scope.Benchmark) +public class DictionaryEncoderBenchmarks { + + private BufferAllocator allocator; + + private static final int DATA_SIZE = 1000; + private static final int KEY_SIZE = 100; + + + private static final int KEY_LENGTH = 10; + + private List keys = new ArrayList<>(); + + private VarCharVector vector; + + private VarCharVector dictionaryVector; + + /** + * Setup benchmarks. + */ + @Setup + public void prepare() { + + for (int i = 0; i < KEY_SIZE; i++) { + keys.add(generateUniqueKey(KEY_LENGTH)); + } + + allocator = new RootAllocator(10 * 1024 * 1024); + + vector = new VarCharVector("vector", allocator); + dictionaryVector = new VarCharVector("dict", allocator); + + vector.allocateNew(10240, DATA_SIZE); + vector.setValueCount(DATA_SIZE); + for (int i = 0; i < DATA_SIZE; i++) { + byte[] value = keys.get(generateRandomIndex(KEY_SIZE)).getBytes(StandardCharsets.UTF_8); + vector.setSafe(i, value, 0, value.length); + } + + dictionaryVector.allocateNew(1024, 100); + dictionaryVector.setValueCount(100); + for (int i = 0; i < KEY_SIZE; i++) { + byte[] value = keys.get(i).getBytes(StandardCharsets.UTF_8); + dictionaryVector.setSafe(i, value, 0, value.length); + } + + } + + /** + * Tear down benchmarks. + */ + @TearDown + public void tearDown() { + vector.close(); + dictionaryVector.close(); + keys.clear(); + allocator.close(); + } + + /** + * Test encode for {@link DictionaryEncoder}. + * @return useless. To avoid DCE by JIT. 
+ */ + @Benchmark + @BenchmarkMode(Mode.AverageTime) + @OutputTimeUnit(TimeUnit.NANOSECONDS) + public int testEncode() { + Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null)); + final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary); + encoded.close(); + return 0; + } + + private int generateRandomIndex(int max) { + Random random = new Random(); + return random.nextInt(max); + } + + private String generateUniqueKey(int length) { + String str = "abcdefghijklmnopqrstuvwxyz"; + Random random = new Random(); + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < length; i++) { + int number = random.nextInt(26); + sb.append(str.charAt(number)); + } + if (keys.contains(sb.toString())) { + return generateUniqueKey(length); + } + return sb.toString(); + } + + @Test + public void evaluate() throws RunnerException { + Options opt = new OptionsBuilder() + .include(DictionaryEncoderBenchmarks.class.getSimpleName()) + .forks(1) + .build(); + + new Runner(opt).run(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java index b9f547c8fb517..ccd4b55c78d39 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoder.java @@ -17,9 +17,6 @@ package org.apache.arrow.vector.dictionary; -import java.util.HashMap; -import java.util.Map; - import org.apache.arrow.vector.BaseIntVector; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; @@ -47,7 +44,7 @@ public class DictionaryEncoder { public static ValueVector encode(ValueVector vector, Dictionary dictionary) { validateType(vector.getMinorType()); // load dictionary values into a hashmap for lookup - Map lookUps = new HashMap<>(dictionary.getVector().getValueCount()); + DictionaryEncodeHashMap lookUps = new DictionaryEncodeHashMap<>(dictionary.getVector().getValueCount()); for (int i = 0; i < dictionary.getVector().getValueCount(); i++) { // for primitive array types we need a wrapper that implements equals and hashcode appropriately lookUps.put(dictionary.getVector().getObject(i), i); @@ -74,8 +71,8 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) { Object value = vector.getObject(i); if (value != null) { // if it's null leave it null // note: this may fail if value was not included in the dictionary - Integer encoded = lookUps.get(value); - if (encoded == null) { + int encoded = lookUps.get(value); + if (encoded == -1) { throw new IllegalArgumentException("Dictionary encoding not defined for value:" + value); } indices.setWithPossibleTruncate(i, encoded); From 320766d32d50ca231bda1428cd4d24a064649b52 Mon Sep 17 00:00:00 2001 From: liyafan82 Date: Fri, 5 Jul 2019 22:19:39 -0700 Subject: [PATCH 12/14] ARROW-5843: [Java] Improve the readability and performance of BitVectorHelper#getNullCount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improve the implementation by: 1. Count the number of 1 bits by long or int, instead of by byte 2. If the number of value count is a multiple of 8, there is no need to process the last byte separately. This makes the code clearer. 
Performance evaluations show 5+x performance improvements: Before: BitVectorHelperBenchmarks.getNullCountBenchmark avgt 5 284.398 ± 0.782 ns/op After: BitVectorHelperBenchmarks.getNullCountBenchmark avgt 5 55.569 ± 0.205 ns/op Author: liyafan82 Closes #4798 from liyafan82/fly_0704_null and squashes the following commits: 829f9221e Improve the readability and performance of BitVectorHelper#getNullCount --- .../vector/BitVectorHelperBenchmarks.java | 95 +++++++++++++++++++ .../apache/arrow/vector/BitVectorHelper.java | 27 ++++-- 2 files changed, 116 insertions(+), 6 deletions(-) create mode 100644 java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java b/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java new file mode 100644 index 0000000000000..bb48247417c97 --- /dev/null +++ b/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import java.util.concurrent.TimeUnit; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +import io.netty.buffer.ArrowBuf; + +/** + * Benchmarks for {@link BitVectorHelper}. + */ +@State(Scope.Benchmark) +public class BitVectorHelperBenchmarks { + + private static final int VALIDITY_BUFFER_CAPACITY = 1024; + + private static final int ALLOCATOR_CAPACITY = 1024 * 1024; + + private BufferAllocator allocator; + + private ArrowBuf validityBuffer; + + /** + * Setup benchmarks. + */ + @Setup + public void prepare() { + allocator = new RootAllocator(ALLOCATOR_CAPACITY); + validityBuffer = allocator.buffer(VALIDITY_BUFFER_CAPACITY / 8); + + for (int i = 0;i < VALIDITY_BUFFER_CAPACITY; i++) { + if (i % 7 == 0) { + BitVectorHelper.setValidityBit(validityBuffer, i, (byte) 1); + } else { + BitVectorHelper.setValidityBit(validityBuffer, i, (byte) 0); + } + } + } + + /** + * Tear down benchmarks. 
+   */
+  @TearDown
+  public void tearDown() {
+    validityBuffer.close();
+    allocator.close();
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.AverageTime)
+  @OutputTimeUnit(TimeUnit.NANOSECONDS)
+  public int getNullCountBenchmark() {
+    return BitVectorHelper.getNullCount(validityBuffer, VALIDITY_BUFFER_CAPACITY);
+  }
+
+  //@Test
+  public static void main(String[] args) throws RunnerException {
+    Options opt = new OptionsBuilder()
+        .include(BitVectorHelperBenchmarks.class.getSimpleName())
+        .forks(1)
+        .build();
+
+    new Runner(opt).run();
+  }
+}

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java
index d1e99006505bf..329330ed61d61 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java
@@ -145,21 +145,36 @@ public static int getNullCount(final ArrowBuf validityBuffer, final int valueCou
     final int sizeInBytes = getValidityBufferSize(valueCount);
     // If value count is not a multiple of 8, then calculate number of used bits in the last byte
     final int remainder = valueCount % 8;
+    final int fullBytesCount = remainder == 0 ? sizeInBytes : sizeInBytes - 1;
 
-    final int sizeInBytesMinus1 = sizeInBytes - 1;
-    for (int i = 0; i < sizeInBytesMinus1; i++) {
-      byte byteValue = validityBuffer.getByte(i);
+    int index = 0;
+    while (index + 8 <= fullBytesCount) {
+      long longValue = validityBuffer.getLong(index);
+      count += Long.bitCount(longValue);
+      index += 8;
+    }
+
+    while (index + 4 <= fullBytesCount) {
+      int intValue = validityBuffer.getInt(index);
+      count += Integer.bitCount(intValue);
+      index += 4;
+    }
+
+    while (index < fullBytesCount) {
+      byte byteValue = validityBuffer.getByte(index);
       count += Integer.bitCount(byteValue & 0xFF);
+      index += 1;
     }
 
-    // handling with the last byte
-    byte byteValue = validityBuffer.getByte(sizeInBytes - 1);
+    // handle the remaining bits in the last byte
     if (remainder != 0) {
+      byte byteValue = validityBuffer.getByte(sizeInBytes - 1);
+      // set the unused bits to 1 so they are not counted as nulls
      byte mask = (byte) (0xFF << remainder);
      byteValue = (byte) (byteValue | mask);
+      count += Integer.bitCount(byteValue & 0xFF);
    }
-    count += Integer.bitCount(byteValue & 0xFF);
 
     return 8 * sizeInBytes - count;
   }

From e0b84f117f6e76d2b67b70be3eb3a71fea9f2a18 Mon Sep 17 00:00:00 2001
From: Yosuke Shiro
Date: Sun, 7 Jul 2019 06:27:32 +0900
Subject: [PATCH 13/14] ARROW-5866: [C++] Remove duplicate library in cpp/Brewfile

Author: Yosuke Shiro

Closes #4816 from shiro615/cpp-remove-duplicate-library and squashes the following commits:

1a200d4f0 Remove duplicate library in cpp/Brewfile
---
 cpp/Brewfile | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/Brewfile b/cpp/Brewfile
index 4b796db9e1be3..c6588873d321c 100644
--- a/cpp/Brewfile
+++ b/cpp/Brewfile
@@ -35,7 +35,6 @@ brew "python"
 brew "rapidjson"
 brew "re2"
 brew "snappy"
-brew "openssl"
 brew "thrift"
 brew "wget"
 brew "zstd"

From c2c9e99140363545521f957219e3eba71a98c804 Mon Sep 17 00:00:00 2001
From: Micah Kornfield
Date: Sun, 7 Jul 2019 06:29:49 +0900
Subject: [PATCH 14/14] ARROW-5865: [Release] Helper script to rebase PRs on master

This adds a script that generates shell commands to rebase all open pull requests on master and force-push the updated branches.
Author: Micah Kornfield

Closes #4814 from emkornfield/make_script_for_force_push and squashes the following commits:

ce9bf9a41 Address PR feedback
c4a14f5e6 fix flake8
033c8703f ARROW-5865: Helper script to rebase PRs on master
---
 dev/release/generate_force_push_script.py | 61 +++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100755 dev/release/generate_force_push_script.py

diff --git a/dev/release/generate_force_push_script.py b/dev/release/generate_force_push_script.py
new file mode 100755
index 0000000000000..b6cd760bc6002
--- /dev/null
+++ b/dev/release/generate_force_push_script.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+##############################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+##############################################################################
+
+# This script generates a series of shell commands
+# to rebase all open pull requests off of master
+# and force push the updates.
+
+from http.client import HTTPSConnection
+import json
+from collections import defaultdict
+
+client = HTTPSConnection('api.github.com')
+client.request('GET',
+               '/repos/apache/arrow/pulls?state=open&per_page=100',
+               headers={'User-Agent': 'ApacheArrowRebaser'})
+response = client.getresponse()
+json_content = response.read()
+if response.status != 200:
+    error_msg = 'GitHub connection error: {}'.format(json_content)
+    raise Exception(error_msg)
+
+parsed_content = json.loads(json_content)
+if len(parsed_content) == 100:
+    print("# WARNING: Only the most recent 100 PRs will be processed")
+
+repos = defaultdict(list)
+for pr in parsed_content:
+    head = pr['head']
+    repos[head['repo']['full_name']].append(head['label'])
+
+for repo, labels in repos.items():
+    print('git clone git@github.com:{}.git'.format(repo))
+    print('cd arrow')
+    print('git remote add upstream https://github.com/apache/arrow.git')
+    print('git fetch --all --prune --tags --force')
+    for label in labels:
+        # Labels are in the form 'user:branch'
+        owner, branch = label.split(':')
+        print('git checkout {}'.format(branch))
+        print('(git rebase upstream/master && git push --force) || ' +
+              '(echo "Rebase failed for {}" && '.format(label) +
+              'git rebase --abort)')
+    print('cd ..')
+    print('rm -rf arrow')
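For reference, this is roughly the output the script would print for a single hypothetical open PR whose head label is someuser:my-feature (that is, branch my-feature on the fork someuser/arrow); the fork and branch names here are made up, and the commands follow directly from the print statements above:

  git clone git@github.com:someuser/arrow.git
  cd arrow
  git remote add upstream https://github.com/apache/arrow.git
  git fetch --all --prune --tags --force
  git checkout my-feature
  (git rebase upstream/master && git push --force) || (echo "Rebase failed for someuser:my-feature" && git rebase --abort)
  cd ..
  rm -rf arrow

Grouping PRs by source repository lets every branch from the same fork share one clone, and the git rebase --abort fallback leaves the clone in a clean state when a rebase hits conflicts, so the remaining branches can still be processed.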