rapidsai · rapids-bot · Oct 8, 2024 · Oct 7, 2024 · Oct 7, 2024 · Oct 8, 2024
@@ -87,21 +87,6 @@ if(CUVS_ANN_BENCH_USE_FAISS)
   include(cmake/thirdparty/get_faiss)
 endif()
 
-# ##################################################################################################
-# * Enable NVTX if available
-
-# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx.They track gbench
-# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just
-# raft/cuVS.
-if(TARGET CUDA::nvtx3)
-  set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
-  get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES)
-  unset(NVTX3_HEADERS_FOUND CACHE)
-  # Check the headers explicitly to make sure the cpu-only build succeeds
-  CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
-  set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
-endif()
-
 # ##################################################################################################
 # * Target function -------------------------------------------------------------
 
@@ -127,12 +112,9 @@ function(ConfigureAnnBench)
     add_dependencies(${BENCH_NAME} ANN_BENCH)
   else()
     add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH})
-    target_compile_definitions(
-      ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN
-                            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
-    )
+    target_compile_definitions(${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN)
     target_link_libraries(
-      ${BENCH_NAME} PRIVATE benchmark::benchmark $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+      ${BENCH_NAME} PRIVATE benchmark::benchmark $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3>
     )
   endif()
 
@@ -300,7 +282,7 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
   target_link_libraries(
     ANN_BENCH
     PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only
-            spdlog::spdlog_header_only $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+            spdlog::spdlog_header_only $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3>
   )
   set_target_properties(
     ANN_BENCH
@@ -318,7 +300,6 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
     ANN_BENCH
     PRIVATE
       $<$<BOOL:${CUDAToolkit_FOUND}>:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}">
-      $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
   )
 
   target_link_options(ANN_BENCH PRIVATE -export-dynamic)

@@ -18,7 +18,8 @@
 #include "ann_types.hpp"
 #include "cuda_stub.hpp"  // cuda-related utils
 
-#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND
+#if __has_include(<nvtx3/nvToolsExt.h>)
+#define ANN_BENCH_NVTX3_HEADERS_FOUND
 #include <nvtx3/nvToolsExt.h>
 #endif
 

@@ -15,12 +15,12 @@
  */
 #pragma once
 
+#include "../../core/nvtx.hpp"
 #include "kmeans_common.cuh"
 
 #include <cuvs/cluster/kmeans.hpp>
 #include <cuvs/distance/distance.hpp>
 
-#include <raft/common/nvtx.hpp>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/host_mdarray.hpp>
@@ -71,7 +71,7 @@ void initRandom(raft::resources const& handle,
                 raft::device_matrix_view<const DataT, IndexT> X,
                 raft::device_matrix_view<DataT, IndexT> centroids)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("initRandom");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("initRandom");
   auto n_clusters = params.n_clusters;
   cuvs::cluster::kmeans::detail::shuffleAndGather<DataT, IndexT>(
     handle, X, centroids, n_clusters, params.rng_state.seed);
@@ -98,7 +98,7 @@ void kmeansPlusPlus(raft::resources const& handle,
                     raft::device_matrix_view<DataT, IndexT> centroidsRawData,
                     rmm::device_uvector<char>& workspace)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeansPlusPlus");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeansPlusPlus");
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
@@ -372,7 +372,7 @@ void kmeans_fit_main(raft::resources const& handle,
                      raft::host_scalar_view<IndexT> n_iter,
                      rmm::device_uvector<char>& workspace)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit_main");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_fit_main");
   raft::logger::get(RAFT_NAME).set_level(params.verbosity);
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
@@ -590,7 +590,7 @@ void initScalableKMeansPlusPlus(raft::resources const& handle,
                                 raft::device_matrix_view<DataT, IndexT> centroidsRawData,
                                 rmm::device_uvector<char>& workspace)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "initScalableKMeansPlusPlus");
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
@@ -841,7 +841,7 @@ void kmeans_fit(raft::resources const& handle,
                 raft::host_scalar_view<DataT> inertia,
                 raft::host_scalar_view<IndexT> n_iter)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_fit");
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
   auto n_clusters     = pams.n_clusters;
@@ -1009,7 +1009,7 @@ void kmeans_predict(raft::resources const& handle,
                     bool normalize_weight,
                     raft::host_scalar_view<DataT> inertia)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_predict");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_predict");
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
@@ -1153,7 +1153,7 @@ void kmeans_fit_predict(raft::resources const& handle,
                         raft::host_scalar_view<DataT> inertia,
                         raft::host_scalar_view<IndexT> n_iter)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_fit_predict");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_fit_predict");
   if (!centroids.has_value()) {
     auto n_features = X.extent(1);
     auto centroids_matrix =
@@ -1217,7 +1217,7 @@ void kmeans_transform(raft::resources const& handle,
                       raft::device_matrix_view<const DataT> centroids,
                       raft::device_matrix_view<DataT> X_new)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("kmeans_transform");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_transform");
   raft::logger::get(RAFT_NAME).set_level(pams.verbosity);
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);

@@ -20,10 +20,10 @@
 #include "kmeans_common.cuh"
 #include <cuvs/cluster/kmeans.hpp>
 
+#include "../../core/nvtx.hpp"
 #include "../../distance/distance.cuh"
 
 #include <cuvs/distance/distance.hpp>
-#include <raft/common/nvtx.hpp>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/logger-ext.hpp>
 #include <raft/core/operators.hpp>
@@ -378,7 +378,7 @@ void compute_norm(const raft::resources& handle,
                   FinOpT norm_fin_op,
                   std::optional<rmm::device_async_resource_ref> mr = std::nullopt)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("compute_norm");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("compute_norm");
   auto stream = raft::resource::get_cuda_stream(handle);
   rmm::device_uvector<MathT> mapped_dataset(
     0, stream, mr.value_or(raft::resource::get_workspace_resource(handle)));
@@ -434,7 +434,7 @@ void predict(const raft::resources& handle,
              const MathT* dataset_norm                        = nullptr)
 {
   auto stream = raft::resource::get_cuda_stream(handle);
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
   auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle));
   auto [max_minibatch_size, _mem_per_row] =
@@ -603,7 +603,7 @@ auto adjust_centers(MathT* centers,
                     rmm::cuda_stream_view stream,
                     rmm::device_async_resource_ref device_memory) -> bool
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "adjust_centers(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
   if (n_clusters == 0) { return false; }
   constexpr static std::array kPrimes{29,   71,   113,  173,  229,  281,  349,  409,  463,  541,
@@ -1036,7 +1036,7 @@ void build_hierarchical(const raft::resources& handle,
   auto stream  = raft::resource::get_cuda_stream(handle);
   using LabelT = uint32_t;
 
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "build_hierarchical(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
 
   IdxT n_mesoclusters = std::min(n_clusters, static_cast<IdxT>(std::sqrt(n_clusters) + 0.5));

@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include "../../../core/nvtx.hpp"
 #include "../../vpq_dataset.cuh"
 #include "graph_core.cuh"
 #include <cuvs/neighbors/cagra.hpp>
@@ -130,7 +131,7 @@ void build_knn_graph(
                "Currently only L2Expanded or InnerProduct metric are supported");
 
   uint32_t node_degree = knn_graph.extent(1);
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::build_graph(%zu, %zu, %u)",
     size_t(dataset.extent(0)),
     size_t(dataset.extent(1)),

@@ -16,14 +16,14 @@
 
 #pragma once
 
+#include "../../../core/nvtx.hpp"
 #include "factory.cuh"
 #include "sample_filter_utils.cuh"
 #include "search_plan.cuh"
 #include "search_single_cta_inst.cuh"
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
-#include <raft/core/nvtx.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 
@@ -66,7 +66,7 @@ void search_main_core(raft::resources const& res,
     params.max_queries = std::min<size_t>(queries.extent(0), deviceProp.maxGridSize[1]);
   }
 
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope(
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "cagra::search(max_queries = %u, k = %u, dim = %zu)",
     params.max_queries,
     topk,

@@ -21,10 +21,10 @@
 #include <raft/core/logger-ext.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/mdspan_types.hpp>
-#include <raft/core/nvtx.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/serialize.hpp>
 
+#include "../../../core/nvtx.hpp"
 #include "../dataset_serialize.hpp"
 
 #include <cstddef>
@@ -53,7 +53,7 @@ void serialize(raft::resources const& res,
                const index<T, IdxT>& index_,
                bool include_dataset)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("cagra::serialize");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("cagra::serialize");
 
   RAFT_LOG_DEBUG(
     "Saving CAGRA index, size %zu, dim %u", static_cast<size_t>(index_.size()), index_.dim());
@@ -103,7 +103,7 @@ void serialize_to_hnswlib(raft::resources const& res,
 {
   // static_assert(std::is_same_v<IdxT, int> or std::is_same_v<IdxT, uint32_t>,
   //               "An hnswlib index can only be trained with int32 or uint32 IdxT");
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("cagra::serialize");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("cagra::serialize");
   RAFT_LOG_DEBUG("Saving CAGRA index to hnswlib format, size %zu, dim %u",
                  static_cast<size_t>(index_.size()),
                  index_.dim());
@@ -234,7 +234,7 @@ void serialize_to_hnswlib(raft::resources const& res,
 template <typename T, typename IdxT>
 void deserialize(raft::resources const& res, std::istream& is, index<T, IdxT>* index_)
 {
-  raft::common::nvtx::range<raft::common::nvtx::domain::raft> fun_scope("cagra::deserialize");
+  raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("cagra::deserialize");
 
   char dtype_string[4];
   is.read(dtype_string, 4);

@@ -1729,6 +1729,12 @@ auto build(raft::resources const& handle,
     if constexpr (std::is_same_v<T, float>) {
       raft::matrix::sample_rows<T, int64_t>(handle, random_state, dataset, trainset.view());
     } else {
+      raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
+        "   ivf_pq::build(%zu, %zu)/sample rows with tmp trainset (%zu rows).",
+        size_t(n_rows),
+        size_t(dim),
+        size_t(n_rows_train));
+
       // TODO(tfeher): Enable codebook generation with any type T, and then remove trainset tmp.
       auto trainset_tmp = raft::make_device_mdarray<T>(
         handle, big_memory_resource, raft::make_extents<int64_t>(n_rows_train, dim));