From 09241242ea93a78a5a72058f40f9a24e0a6a87e4 Mon Sep 17 00:00:00 2001 From: lriggs Date: Wed, 4 Sep 2024 10:42:58 -0700 Subject: [PATCH] Cherry pick fix for empty vectors and some additional build process fixes. (#81) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * GH-30866: [Java] fix SplitAndTransfer throws for (0,0) if vector empty (#41066) This is addresses https://issues.apache.org/jira/browse/ARROW-15382 and is reopening of https://github.com/apache/arrow/pull/12250 (which I asked to be reopened). I tried to address all the comments from the previous discussion, added some more tests and fixed an issue in the old commit. * GitHub Issue: #30866 Authored-by: Finn Völkel Signed-off-by: David Li * GH-43463: [C++][Gandiva] Always use gdv_function_stubs.h in context_helper.cc (#43464) ### Rationale for this change `gdv_function_stubs.h` has declarations of functions in `context_helper.cc`. If we don't include `gdv_function_stubs.h`, it causes attribution mismatch error with unity build. ### What changes are included in this PR? Always include `gdv_function_stubs.h` in `context_helper.cc`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #43463 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei * GH-43119: [CI][Packaging] Update manylinux 2014 CentOS repos that have been deprecated (#43121) ### Rationale for this change Jobs are failing to find mirrorlist.centos.org ### What changes are included in this PR? Updating repos based on solution from: https://github.com/apache/arrow/issues/43119#issuecomment-2203534492 ### Are these changes tested? Via archery ### Are there any user-facing changes? No * GitHub Issue: #43119 Lead-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei Co-authored-by: Sutou Kouhei Signed-off-by: Raúl Cumplido * Update macos deployment target to 12 to match build machine. * GH-43400: [C++] Ensure using bundled GoogleTest when we use bundled GoogleTest (#43465) ### Rationale for this change If we use bundled GoogleTest and system other dependencies such as Boost, our include path options may be: * `-isystem /opt/homebrew/include` (for Boost) * `-isystem build_dir/_deps/googletest-src/googletest` (for bundled GoogleTest) * `-isystem build_dir/_deps/googletest-src/googlemock` (for bundled GoogleTest) With this order, GoogleTest headers in `/opt/homebrew/include/` are used with bundled GoogleTest. It may cause link errors. ### What changes are included in this PR? This change introduces a new CMake target `arrow::GTest::gtest_headers` that has include paths for bundled GoogleTest. And it's always used as the first link library of all test program. With this change, our include path options are: * `-isystem build_dir/_deps/googletest-src/googletest` (for bundled GoogleTest) * `-isystem build_dir/_deps/googletest-src/googlemock` (for bundled GoogleTest) * `-isystem /opt/homebrew/include` (for Boost) With this order, we can always use our bundled GoogleTest. `arrow::GTest::gtest_headers` is defined only when we use bundled GoogleTest. So this doesn't change the system GoogleTest case. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. * GitHub Issue: #43400 Authored-by: Sutou Kouhei Signed-off-by: Jacob Wujciak-Jens --------- Signed-off-by: David Li Signed-off-by: Sutou Kouhei Signed-off-by: Raúl Cumplido Signed-off-by: Jacob Wujciak-Jens Co-authored-by: Finn Völkel Co-authored-by: Sutou Kouhei Co-authored-by: Raúl Cumplido Co-authored-by: Sutou Kouhei --- ci/docker/centos-7-cpp.dockerfile | 14 ++ ci/docker/python-wheel-manylinux.dockerfile | 12 ++ cpp/cmake_modules/BuildUtils.cmake | 5 + cpp/cmake_modules/ThirdpartyToolchain.cmake | 6 + cpp/src/gandiva/context_helper.cc | 6 +- dev/tasks/java-jars/github.yml | 2 +- .../vector/BaseLargeVariableWidthVector.java | 12 +- .../arrow/vector/BaseVariableWidthVector.java | 6 +- .../arrow/vector/complex/ListVector.java | 28 ++-- .../arrow/vector/TestLargeVarCharVector.java | 4 +- .../arrow/vector/TestSplitAndTransfer.java | 121 +++++++++++++++++- 11 files changed, 181 insertions(+), 35 deletions(-) diff --git a/ci/docker/centos-7-cpp.dockerfile b/ci/docker/centos-7-cpp.dockerfile index 8c1893cbbb2ae..1f30eed694e4e 100644 --- a/ci/docker/centos-7-cpp.dockerfile +++ b/ci/docker/centos-7-cpp.dockerfile @@ -17,11 +17,25 @@ FROM centos:centos7 +# Update mirrors to use vault.centos.org as CentOS 7 +# is EOL since 2024-06-30 +RUN sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/*.repo + # devtoolset is required for C++17 RUN \ yum install -y \ centos-release-scl \ epel-release && \ + sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/^# baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/CentOS-SCLo-scl*.repo && \ yum install -y \ cmake3 \ curl \ diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 63fd7b1d46820..81d205076bb92 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -25,6 +25,18 @@ ARG manylinux ENV MANYLINUX_VERSION=${manylinux} # Ensure dnf is installed, especially for the manylinux2014 base +RUN if [ "${MANYLINUX_VERSION}" = "2014" ]; then \ + sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/*.repo; \ + if [ "${arch}" != "amd64" ]; then \ + sed -i \ + -e 's,vault\.centos\.org/centos,vault.centos.org/altarch,' \ + /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo; \ + fi; \ + fi RUN yum install -y dnf # Install basic dependencies diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index e7523add27223..692efa78376f4 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -721,6 +721,11 @@ function(ADD_TEST_CASE REL_TEST_NAME) "${EXECUTABLE_OUTPUT_PATH};$ENV{CONDA_PREFIX}/lib") endif() + # Ensure using bundled GoogleTest when we use bundled GoogleTest. + # ARROW_GTEST_GTEST_HEADERS is defined only when we use bundled + # GoogleTest. + target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_GTEST_GTEST_HEADERS}) + if(ARG_STATIC_LINK_LIBS) # Customize link libraries target_link_libraries(${TEST_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS}) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 7d54ccccf7c19..62c1137bac639 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2298,6 +2298,10 @@ function(build_gtest) install(DIRECTORY "${googletest_SOURCE_DIR}/googlemock/include/" "${googletest_SOURCE_DIR}/googletest/include/" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}") + add_library(arrow::GTest::gtest_headers INTERFACE IMPORTED) + target_include_directories(arrow::GTest::gtest_headers + INTERFACE "${googletest_SOURCE_DIR}/googlemock/include/" + "${googletest_SOURCE_DIR}/googletest/include/") install(TARGETS gmock gmock_main gtest gtest_main EXPORT arrow_testing_targets RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" @@ -2342,12 +2346,14 @@ if(ARROW_TESTING) string(APPEND ARROW_TESTING_PC_LIBS " $") endif() + set(ARROW_GTEST_GTEST_HEADERS) set(ARROW_GTEST_GMOCK GTest::gmock) set(ARROW_GTEST_GTEST GTest::gtest) set(ARROW_GTEST_GTEST_MAIN GTest::gtest_main) else() string(APPEND ARROW_TESTING_PC_CFLAGS " -I\${includedir}/arrow-gtest") string(APPEND ARROW_TESTING_PC_LIBS " -larrow_gtest") + set(ARROW_GTEST_GTEST_HEADERS arrow::GTest::gtest_headers) set(ARROW_GTEST_GMOCK arrow::GTest::gmock) set(ARROW_GTEST_GTEST arrow::GTest::gtest) set(ARROW_GTEST_GTEST_MAIN arrow::GTest::gtest_main) diff --git a/cpp/src/gandiva/context_helper.cc b/cpp/src/gandiva/context_helper.cc index 03bbe1b7a67d9..8edd52b1fb070 100644 --- a/cpp/src/gandiva/context_helper.cc +++ b/cpp/src/gandiva/context_helper.cc @@ -15,11 +15,13 @@ // specific language governing permissions and limitations // under the License. +#include "gandiva/execution_context.h" +#include "gandiva/gdv_function_stubs.h" + // This file is also used in the pre-compiled unit tests, which do include // llvm/engine/.. #ifndef GANDIVA_UNIT_TEST #include "gandiva/exported_funcs.h" -#include "gandiva/gdv_function_stubs.h" #include "gandiva/engine.h" @@ -56,8 +58,6 @@ arrow::Status ExportedContextFunctions::AddMappings(Engine* engine) const { } // namespace gandiva #endif // !GANDIVA_UNIT_TEST -#include "gandiva/execution_context.h" - extern "C" { void gdv_fn_context_set_error_msg(int64_t context_ptr, char const* err_msg) { diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 12ab0e136368e..4533e62bce037 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -95,7 +95,7 @@ jobs: platform: - { runs_on: ["macos-12"], arch: "x86_64"} env: - MACOSX_DEPLOYMENT_TARGET: "10.15" + MACOSX_DEPLOYMENT_TARGET: "12.0" steps: {{ macros.github_checkout_arrow()|indent }} - name: Set up Python diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java index 34c9e73a0b072..2ef6e4bd8b374 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java @@ -763,16 +763,14 @@ public void transferTo(BaseLargeVariableWidthVector target) { */ public void splitAndTransferTo(int startIndex, int length, BaseLargeVariableWidthVector target) { - Preconditions.checkArgument(startIndex >= 0 && startIndex < valueCount, - "Invalid startIndex: %s", startIndex); - Preconditions.checkArgument(startIndex + length <= valueCount, - "Invalid length: %s", length); + Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, + "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); compareTypes(target, "splitAndTransferTo"); target.clear(); - splitAndTransferValidityBuffer(startIndex, length, target); - splitAndTransferOffsetBuffer(startIndex, length, target); - target.setLastSet(length - 1); if (length > 0) { + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); target.setValueCount(length); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index 6b82dd7729a6c..d533629cdd44e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -808,10 +808,10 @@ public void splitAndTransferTo(int startIndex, int length, "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); compareTypes(target, "splitAndTransferTo"); target.clear(); - splitAndTransferValidityBuffer(startIndex, length, target); - splitAndTransferOffsetBuffer(startIndex, length, target); - target.setLastSet(length - 1); if (length > 0) { + splitAndTransferValidityBuffer(startIndex, length, target); + splitAndTransferOffsetBuffer(startIndex, length, target); + target.setLastSet(length - 1); target.setValueCount(length); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index c61a2894aafd9..656c4add04201 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -552,21 +552,23 @@ public void transfer() { public void splitAndTransfer(int startIndex, int length) { Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); - final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); - final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; to.clear(); - to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); - /* splitAndTransfer offset buffer */ - for (int i = 0; i < length + 1; i++) { - final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; - to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset); + if (length > 0) { + final int startPoint = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); + final int sliceLength = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH) - startPoint; + to.offsetBuffer = to.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); + /* splitAndTransfer offset buffer */ + for (int i = 0; i < length + 1; i++) { + final int relativeOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - startPoint; + to.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeOffset); + } + /* splitAndTransfer validity buffer */ + splitAndTransferValidityBuffer(startIndex, length, to); + /* splitAndTransfer data buffer */ + dataTransferPair.splitAndTransfer(startPoint, sliceLength); + to.lastSet = length - 1; + to.setValueCount(length); } - /* splitAndTransfer validity buffer */ - splitAndTransferValidityBuffer(startIndex, length, to); - /* splitAndTransfer data buffer */ - dataTransferPair.splitAndTransfer(startPoint, sliceLength); - to.lastSet = length - 1; - to.setValueCount(length); } /* diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java index 62d09da86d652..06b27a9eba156 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java @@ -166,7 +166,7 @@ public void testInvalidStartIndex() { IllegalArgumentException.class, () -> tp.splitAndTransfer(valueCount, 10)); - assertEquals("Invalid startIndex: 500", e.getMessage()); + assertEquals("Invalid parameters startIndex: 500, length: 10 for valueCount: 500", e.getMessage()); } } @@ -185,7 +185,7 @@ public void testInvalidLength() { IllegalArgumentException.class, () -> tp.splitAndTransfer(0, valueCount * 2)); - assertEquals("Invalid length: 1000", e.getMessage()); + assertEquals("Invalid parameters startIndex: 0, length: 1000 for valueCount: 500", e.getMessage()); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java index 3580a321f01c9..396f5665e0382 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestSplitAndTransfer.java @@ -28,6 +28,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; @@ -49,7 +50,7 @@ public class TestSplitAndTransfer { public void init() { allocator = new RootAllocator(Long.MAX_VALUE); } - + @After public void terminate() throws Exception { allocator.close(); @@ -65,7 +66,116 @@ private void populateVarcharVector(final VarCharVector vector, int valueCount, S } vector.setValueCount(valueCount); } - + + private void populateIntVector(final IntVector vector, int valueCount) { + for (int i = 0; i < valueCount; i++) { + vector.set(i, i); + } + vector.setValueCount(valueCount); + } + + private void populateDenseUnionVector(final DenseUnionVector vector, int valueCount) { + VarCharVector varCharVector = vector.addOrGet("varchar", FieldType.nullable(new ArrowType.Utf8()), + VarCharVector.class); + BigIntVector intVector = vector.addOrGet("int", + FieldType.nullable(new ArrowType.Int(64, true)), BigIntVector.class); + + for (int i = 0; i < valueCount; i++) { + vector.setTypeId(i, (byte) (i % 2)); + if (i % 2 == 0) { + final String s = String.format("%010d", i); + varCharVector.setSafe(i / 2, s.getBytes(StandardCharsets.UTF_8)); + } else { + intVector.setSafe(i / 2, i); + } + } + vector.setValueCount(valueCount); + } + + @Test + public void testWithEmptyVector() { + // MapVector use TransferImpl from ListVector + ListVector listVector = ListVector.empty("", allocator); + TransferPair transferPair = listVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseFixedWidthVector + IntVector intVector = new IntVector("", allocator); + transferPair = intVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseVariableWidthVector + VarCharVector varCharVector = new VarCharVector("", allocator); + transferPair = varCharVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // BaseLargeVariableWidthVector + LargeVarCharVector largeVarCharVector = new LargeVarCharVector("", allocator); + transferPair = largeVarCharVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // StructVector + StructVector structVector = StructVector.empty("", allocator); + transferPair = structVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // FixedSizeListVector + FixedSizeListVector fixedSizeListVector = FixedSizeListVector.empty("", 0, allocator); + transferPair = fixedSizeListVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // FixedSizeBinaryVector + FixedSizeBinaryVector fixedSizeBinaryVector = new FixedSizeBinaryVector("", allocator, 4); + transferPair = fixedSizeBinaryVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // UnionVector + UnionVector unionVector = UnionVector.empty("", allocator); + transferPair = unionVector.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + // DenseUnionVector + DenseUnionVector duv = DenseUnionVector.empty("", allocator); + transferPair = duv.getTransferPair(allocator); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, transferPair.getTo().getValueCount()); + + // non empty from vector + + // BaseFixedWidthVector + IntVector fromIntVector = new IntVector("", allocator); + fromIntVector.allocateNew(100); + populateIntVector(fromIntVector, 100); + transferPair = fromIntVector.getTransferPair(allocator); + IntVector toIntVector = (IntVector) transferPair.getTo(); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, toIntVector.getValueCount()); + + transferPair.splitAndTransfer(50, 0); + assertEquals(0, toIntVector.getValueCount()); + + transferPair.splitAndTransfer(100, 0); + assertEquals(0, toIntVector.getValueCount()); + fromIntVector.clear(); + toIntVector.clear(); + + // DenseUnionVector + DenseUnionVector fromDuv = DenseUnionVector.empty("", allocator); + populateDenseUnionVector(fromDuv, 100); + transferPair = fromDuv.getTransferPair(allocator); + DenseUnionVector toDUV = (DenseUnionVector) transferPair.getTo(); + transferPair.splitAndTransfer(0, 0); + assertEquals(0, toDUV.getValueCount()); + + transferPair.splitAndTransfer(50, 0); + assertEquals(0, toDUV.getValueCount()); + + transferPair.splitAndTransfer(100, 0); + assertEquals(0, toDUV.getValueCount()); + fromDuv.clear(); + toDUV.clear(); + } + @Test /* VarCharVector */ public void test() throws Exception { try (final VarCharVector varCharVector = new VarCharVector("myvector", allocator)) { @@ -73,13 +183,13 @@ public void test() throws Exception { final int valueCount = 500; final String[] compareArray = new String[valueCount]; - + populateVarcharVector(varCharVector, valueCount, compareArray); - + final TransferPair tp = varCharVector.getTransferPair(allocator); final VarCharVector newVarCharVector = (VarCharVector) tp.getTo(); final int[][] startLengths = {{0, 201}, {201, 0}, {201, 200}, {401, 99}}; - + for (final int[] startLength : startLengths) { final int start = startLength[0]; final int length = startLength[1]; @@ -429,5 +539,4 @@ public void testMapVectorZeroStartIndexAndLength() { newMapVector.clear(); } } - }