Merge branch 'branch-24.06' into fix_json_all_empty

rapidsai · Apr 25, 2024 · 1c7bd73 · 1c7bd73
2 parents 35b4812 + 65c2b53
commit 1c7bd73
Show file tree

Hide file tree

Showing 167 changed files with 11,101 additions and 6,286 deletions.
diff --git a/.github/ISSUE_TEMPLATE/pandas_function_request.md b/.github/ISSUE_TEMPLATE/pandas_function_request.md
@@ -2,7 +2,7 @@
 name: Request a Missing Pandas Function
 about: Request GPU support for a function executed on the CPU in pandas accelerator mode.
 title: "[FEA]"
-labels: "? - Needs Triage, feature request"
+labels: "Needs Triage, feature request, cudf.pandas"
 assignees: ''
 
 ---

diff --git a/.gitignore b/.gitignore
@@ -78,6 +78,7 @@ CMakeFiles/
 Debug
 build/
 cpp/build/
+cpp/examples/*/install/
 cpp/include/cudf/ipc_generated/*.h
 cpp/thirdparty/googletest/
 
@@ -160,9 +161,6 @@ ENV/
 # Dask
 dask-worker-space/
 
-# protobuf
-**/*_pb2.py
-
 # Sphinx docs & build artifacts
 docs/cudf/source/api_docs/generated/*
 docs/cudf/source/user_guide/api_docs/api/*

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
@@ -70,7 +70,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE
 sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md
 
 # Libcudf examples update
-sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake
+sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/versions.cmake
 
 # CI files
 for FILE in .github/workflows/*.yaml; do

diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Runs the installed libcudf example binaries under compute-sanitizer memcheck.
+# Exits non-zero if any command fails.
+
+set -uo pipefail
+
+# Record failures without aborting, so every example still runs.
+EXITCODE=0
+trap "EXITCODE=1" ERR
+
+# Support customizing the examples' install location
+cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/" || exit 1
+
+# compute-sanitizer not available before CUDA 11.6.
+# Compare major/minor numerically: a string comparison mis-orders versions
+# such as "9.2" or "11.10" relative to "11.6".
+IFS=. read -r CUDA_MAJOR CUDA_MINOR _ <<< "${RAPIDS_CUDA_VERSION}"
+if (( CUDA_MAJOR < 11 || (CUDA_MAJOR == 11 && ${CUDA_MINOR:-0} < 6) )); then
+  echo "compute-sanitizer unavailable pre 11.6"
+  exit 0
+fi
+
+compute-sanitizer --tool memcheck basic_example
+
+compute-sanitizer --tool memcheck deduplication
+
+compute-sanitizer --tool memcheck custom_optimized names.csv
+compute-sanitizer --tool memcheck custom_prealloc names.csv
+compute-sanitizer --tool memcheck custom_with_malloc names.csv
+
+exit ${EXITCODE}
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
@@ -17,6 +17,12 @@ rapids-logger "Run libcudf gtests"
 ./ci/run_cudf_ctests.sh -j20
 SUITEERROR=$?
 
+if (( ${SUITEERROR} == 0 )); then
+    rapids-logger "Run libcudf examples"
+    ./ci/run_cudf_examples.sh
+    SUITEERROR=$?
+fi
+
 if (( ${SUITEERROR} == 0 )); then
     rapids-logger "Run libcudf_kafka gtests"
     ./ci/run_cudf_kafka_ctests.sh -j20

diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh
@@ -31,7 +31,7 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcudf libcudf_kafka libcudf-tests
+  libcudf libcudf_kafka libcudf-tests libcudf-example
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -8,7 +8,6 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
-- benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
 - breathe>=4.35.0
@@ -34,8 +33,6 @@ dependencies:
 - fmt>=10.1.1,<11
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
-- gmock>=1.13.0
-- gtest>=1.13.0
 - hypothesis
 - identify>=2.5.20
 - ipython
@@ -68,7 +65,6 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- protobuf>=3.20,<5
 - ptxcompiler
 - pyarrow==14.0.2.*
 - pydata-sphinx-theme!=0.14.2

diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -8,7 +8,6 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
-- benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
 - breathe>=4.35.0
@@ -35,8 +34,6 @@ dependencies:
 - fmt>=10.1.1,<11
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
-- gmock>=1.13.0
-- gtest>=1.13.0
 - hypothesis
 - identify>=2.5.20
 - ipython
@@ -66,7 +63,6 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- protobuf>=3.20,<5
 - pyarrow==14.0.2.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
@@ -59,7 +59,6 @@ requirements:
     - cuda-version ={{ cuda_version }}
     - sysroot_{{ target_platform }} {{ sysroot_version }}
   host:
-    - protobuf ==4.24.*
     - python
     - cython >=3.0.3
     - scikit-build-core >=0.7.0
@@ -78,7 +77,6 @@ requirements:
     {% endif %}
     - cuda-version ={{ cuda_version }}
   run:
-    - protobuf >=3.20,<5.0a0
     - python
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.3dev0

diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
@@ -16,12 +16,6 @@ sysroot_version:
 cmake_version:
   - ">=3.26.4"
 
-gbench_version:
-  - "==1.8.0"
-
-gtest_version:
-  - ">=1.13.0"
-
 libarrow_version:
   - "==14.0.2"
 

diff --git a/conda/recipes/libcudf/install_libcudf_example.sh b/conda/recipes/libcudf/install_libcudf_example.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
-./cpp/examples/build.sh
+# build and install libcudf examples
+./cpp/examples/build.sh --install
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
@@ -69,9 +69,6 @@ requirements:
     - librdkafka {{ librdkafka_version }}
     - fmt {{ fmt_version }}
     - spdlog {{ spdlog_version }}
-    - benchmark {{ gbench_version }}
-    - gtest {{ gtest_version }}
-    - gmock {{ gtest_version }}
     - zlib {{ zlib_version }}
 
 outputs:
@@ -108,8 +105,6 @@ outputs:
         - librmm ={{ minor_version }}
         - libkvikio ={{ minor_version }}
         - dlpack {{ dlpack_version }}
-        - gtest {{ gtest_version }}
-        - gmock {{ gtest_version }}
     test:
       commands:
         - test -f $PREFIX/lib/libcudf.so
@@ -195,7 +190,7 @@ outputs:
       license: Apache-2.0
       license_family: APACHE
       license_file: LICENSE
-      summary: libcudf_example library
+      summary: libcudf example executables
   - name: libcudf-tests
     version: {{ version }}
     script: install_libcudf_tests.sh
@@ -221,9 +216,6 @@ outputs:
         {% else %}
         - libcurand-dev
         {% endif %}
-        - benchmark {{ gbench_version }}
-        - gtest {{ gtest_version }}
-        - gmock {{ gtest_version }}
       run:
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
         - {{ pin_subpackage('libcudf', exact=True) }}
@@ -233,9 +225,6 @@ outputs:
         {% else %}
         - libcurand
         {% endif %}
-        - benchmark {{ gbench_version }}
-        - gtest {{ gtest_version }}
-        - gmock {{ gtest_version }}
     about:
       home: https://rapids.ai/
       license: Apache-2.0

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -359,6 +359,9 @@ add_library(
   src/interop/from_arrow.cu
   src/interop/to_arrow.cu
   src/interop/to_arrow_device.cu
+  src/interop/from_arrow_device.cu
+  src/interop/to_arrow_schema.cpp
+  src/interop/to_arrow_utilities.cpp
   src/interop/detail/arrow_allocator.cpp
   src/io/avro/avro.cpp
   src/io/avro/avro_gpu.cu
@@ -584,6 +587,7 @@ add_library(
   src/strings/filling/fill.cu
   src/strings/filter_chars.cu
   src/strings/like.cu
+  src/strings/merge/merge.cu
   src/strings/padding.cu
   src/strings/regex/regcomp.cpp
   src/strings/regex/regexec.cpp
@@ -844,14 +848,12 @@ if(CUDF_BUILD_TESTUTIL)
 
   add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream)
 
-  # Needs to be static so that we support usage of static builds of gtest which doesn't compile with
-  # fPIC enabled and therefore can't be embedded into shared libraries.
   add_library(
-    cudftestutil STATIC
+    cudftestutil SHARED
     tests/io/metadata_utilities.cpp
-    tests/utilities/base_fixture.cpp
     tests/utilities/column_utilities.cu
     tests/utilities/debug_utilities.cu
+    tests/utilities/random_seed.cpp
     tests/utilities/table_utilities.cu
     tests/utilities/tdigest_utilities.cu
   )
@@ -876,8 +878,8 @@ if(CUDF_BUILD_TESTUTIL)
 
   target_link_libraries(
     cudftestutil
-    PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf cudftest_default_stream
-    PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
+    PUBLIC Threads::Threads cudf cudftest_default_stream
+    PRIVATE GTest::gmock GTest::gtest $<TARGET_NAME_IF_EXISTS:conda_env>
   )
 
   target_include_directories(
@@ -956,7 +958,7 @@ endif()
 if(CUDF_BUILD_BENCHMARKS)
   # Find or install GoogleBench
   include(${rapids-cmake-dir}/cpm/gbench.cmake)
-  rapids_cpm_gbench()
+  rapids_cpm_gbench(BUILD_STATIC)
 
   # Find or install nvbench
   include(cmake/thirdparty/get_nvbench.cmake)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -40,7 +40,7 @@ target_include_directories(
 
 # Use an OBJECT library so we only compile these helper source files only once
 add_library(
-  cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp"
+  cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
                                synchronization/synchronization.cpp io/cuio_common.cpp
 )
 target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
@@ -236,7 +236,9 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)
 # ##################################################################################################
 # * merge benchmark -------------------------------------------------------------------------------
 ConfigureBench(MERGE_BENCH merge/merge.cpp)
-ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp)
+ConfigureNVBench(
+  MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp
+)
 
 # ##################################################################################################
 # * null_mask benchmark ---------------------------------------------------------------------------

diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp
@@ -45,6 +45,8 @@ static std::string cuio_host_mem_param{
  * Initializes the default memory resource to use the RMM pool device resource.
  */
 struct nvbench_base_fixture {
+  using host_pooled_mr_t = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
+
   inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
   inline auto make_pool()
@@ -90,12 +92,14 @@ struct nvbench_base_fixture {
 
   inline rmm::host_async_resource_ref make_cuio_host_pinned_pool()
   {
-    using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
-    static std::shared_ptr<host_pooled_mr> mr = std::make_shared<host_pooled_mr>(
-      std::make_shared<rmm::mr::pinned_host_memory_resource>().get(),
-      size_t{1} * 1024 * 1024 * 1024);
+    if (!this->host_pooled_mr) {
+      // Don't store in static, as the CUDA context may be destroyed before static destruction
+      this->host_pooled_mr = std::make_shared<host_pooled_mr_t>(
+        std::make_shared<rmm::mr::pinned_host_memory_resource>().get(),
+        size_t{1} * 1024 * 1024 * 1024);
+    }
 
-    return *mr;
+    return *this->host_pooled_mr;
   }
 
   inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode)
@@ -126,9 +130,16 @@ struct nvbench_base_fixture {
     std::cout << "CUIO host memory resource = " << cuio_host_mode << "\n";
   }
 
+  ~nvbench_base_fixture()
+  {
+    // Ensure the pool is freed before the CUDA context is destroyed:
+    cudf::io::set_host_memory_resource(this->make_cuio_host_pinned());
+  }
+
   std::shared_ptr<rmm::mr::device_memory_resource> mr;
   std::string rmm_mode{"pool"};
 
+  std::shared_ptr<host_pooled_mr_t> host_pooled_mr;
   std::string cuio_host_mode{"pinned"};
 };