Update the list of algos to benchmark (#2337)

This PR: 1. Updates the way `uniform_neighbor_sample` is imported (it has been removed from experimental). 2. Pins `libraft-headers` and `pyraft` to 22.08. 3. Adds `Triangle count` to the list of algos to benchmark. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Rick Ratzel (https://github.com/rlratzel) URL: #2337
rapidsai · Jun 21, 2022 · 0bcb6e0 · 0bcb6e0
1 parent 78b394c
commit 0bcb6e0
Show file tree

Hide file tree

Showing 21 changed files with 659 additions and 304 deletions.
diff --git a/benchmarks/python_e2e/benchmark.py b/benchmarks/python_e2e/benchmark.py
@@ -86,15 +86,16 @@ def __init__(self,
         # FIXME: need to accept and save individual algo args
         self.construct_graph = benchmark(construct_graph_func)
 
-        #add starting node to algos: BFS and SSSP
+        # add starting node to algos: BFS and SSSP
+        # FIXME: Refactor BenchmarkRun __init__ because all the work
+        # done below should be done elsewhere
         for i, algo in enumerate (algo_func_param_list):
-            if benchmark(algo).name in ["bfs", "sssp", "neighborhood_sampling"]:
+            if benchmark(algo).name in ["bfs", "sssp", "uniform_neighbor_sample"]:
                 param={}
                 param["start"]=self.input_dataframe['src'].head()[0]
-                if benchmark(algo).name in ["neighborhood_sampling"]:
+                if benchmark(algo).name in ["uniform_neighbor_sample"]:
                     start = [param.pop("start")]
-                    labels = [0]
-                    param["start_info_list"] = (start, labels)
+                    param["start_list"] = start
                     param["fanout_vals"] = [1]
                 algo_func_param_list[i]=(algo,)+(param,)
 
@@ -128,32 +129,44 @@ def run(self):
         self.__log("done.")
         G = result.retval
         self.results.append(result)
-
-        #algos with transposed=True : PageRank, Katz
-        #algos with transposed=False: BFS, SSSP, Louvain, HITS, Neighborhood_sampling
-        #algos supporting the legacy_renum_only: HITS, Neighborhood_sampling
+        #
+        # Algos with transposed=True : PageRank, Katz.
+        # Algos with transposed=False: BFS, SSSP, Louvain, HITS,
+        # Neighborhood_sampling.
+        # Algos supporting the legacy_renum_only: HITS, Neighborhood_sampling
+        #
         for i in range(len(self.algos)):
-            if self.algos[i][0].name in ["pagerank", "katz"]: #set transpose=True when renumbering
-                if self.algos[i][0].name == "katz" and self.construct_graph.name == "from_dask_cudf_edgelist":
-                    largest_out_degree = G.out_degree().compute().\
-                    nlargest(n=1, columns="degree") #compute outdegree before renumbering because outdegree has transpose=False
-                    largest_out_degree = largest_out_degree["degree"].iloc[0]
-                    katz_alpha = 1 / (largest_out_degree + 1)
-                    self.algos[i][1]["alpha"] = katz_alpha
-                elif self.algos[i][0].name == "katz" and self.construct_graph.name == "from_cudf_edgelist":
-                    largest_out_degree = G.out_degree().nlargest(n=1, columns="degree")
-                    largest_out_degree = largest_out_degree["degree"].iloc[0]
-                    katz_alpha = 1 / (largest_out_degree + 1)
-                    self.algos[i][1]["alpha"] = katz_alpha
-                if hasattr(G, "compute_renumber_edge_list"):
-                    G.compute_renumber_edge_list(transposed=True)
-            elif self.algos[i][0].name in ["neighborhood_sampling", "hits"]:
-                if hasattr(G, "compute_renumber_edge_list"):
-                    G.compute_renumber_edge_list(transposed=False, legacy_renum_only=True)
+            # set transpose=True when renumbering
+            if self.algos[i][0].name in ["pagerank", "katz"]:
+                if self.algos[i][0].name == "katz":
+                    if self.construct_graph.name == "from_dask_cudf_edgelist":
+                        # compute out_degree before renumbering because out_degree
+                        # has transpose=False
+                        degree_max = G.degree()['degree'].max().compute()
+                        katz_alpha = 1 / (degree_max)
+                        self.algos[i][1]["alpha"] = katz_alpha
+                    elif self.construct_graph.name == "from_cudf_edgelist":
+                        degree_max = G.degree()['degree'].max()
+                        katz_alpha = 1 / (degree_max)
+                        self.algos[i][1]["alpha"] = katz_alpha
+                    if hasattr(G, "compute_renumber_edge_list"):
+                        G.compute_renumber_edge_list(
+                            transposed=True, legacy_renum_only=True)
+                else:
+                    # FIXME: Pagerank still follows the old path. Update this once it
+                    # follows the pylibcugraph/C path
+                    if hasattr(G, "compute_renumber_edge_list"):
+                        G.compute_renumber_edge_list(transposed=True)
             else: #set transpose=False when renumbering
                 self.__log("running compute_renumber_edge_list...", end="")
                 if hasattr(G, "compute_renumber_edge_list"):
-                    G.compute_renumber_edge_list(transposed=False)
+                    if self.algos[i][0].name in ["wcc", "louvain"]:
+                        # FIXME: Pagerank and Louvain still follow the old path.
+                        # Update this once it follows the pylibcugraph/C path
+                        G.compute_renumber_edge_list(transposed=False)
+                    else:
+                        G.compute_renumber_edge_list(
+                            transposed=False, legacy_renum_only=True)
                 self.__log("done.")
         # FIXME: need to handle individual algo args
         for ((algo, params), validator) in zip(self.algos, self.validators):

diff --git a/benchmarks/python_e2e/cugraph_dask_funcs.py b/benchmarks/python_e2e/cugraph_dask_funcs.py
@@ -18,12 +18,10 @@
 from cugraph.structure.symmetrize import symmetrize_ddf
 from cugraph.dask.common.mg_utils import get_visible_devices
 from dask_cuda.initialize import initialize
-from cugraph.experimental.dask import uniform_neighborhood_sampling
 import cudf
 
 import cugraph
 from cugraph.dask.comms import comms as Comms
-from cugraph.dask.common.mg_utils import get_visible_devices
 from cugraph.generators import rmat
 import tempfile
 
@@ -109,10 +107,15 @@ def construct_graph(dask_dataframe, symmetric=False):
     object must be symmetrized and have self loops removed.
     """
 
-    G = cugraph.DiGraph()
+    if symmetric:
+        G = cugraph.Graph(directed=False)
+    else:
+        G = cugraph.Graph(directed=True)
+
     if len(dask_dataframe.columns) > 2:
         if symmetric: #symmetrize dask dataframe
-            dask_dataframe = symmetrize_ddf(dask_dataframe, 'src', 'dst', 'weight')
+            dask_dataframe = symmetrize_ddf(
+                dask_dataframe, 'src', 'dst', 'weight')
 
         G.from_dask_cudf_edgelist(
             dask_dataframe, source="src", destination="dst", edge_attr="weight")
@@ -130,11 +133,12 @@ def construct_graph(dask_dataframe, symmetric=False):
 
 
 def bfs(G, start):
-    return cugraph.dask.bfs(G, start=start, return_distances=True)
+    return cugraph.dask.bfs(
+        G, start=start, return_distances=True, check_start=False)
 
 
 def sssp(G, start):
-    return cugraph.dask.sssp(G, source=start)
+    return cugraph.dask.sssp(G, source=start, check_start=False)
 
 
 def wcc(G):
@@ -156,15 +160,19 @@ def katz(G, alpha=None):
 def hits(G):
     return cugraph.dask.hits(G)
 
-def neighborhood_sampling(G, start_info_list=None, fanout_vals=None):
+def uniform_neighbor_sample(G, start_list=None, fanout_vals=None):
     # convert list to cudf.Series
-    start_info_list = (
-        cudf.Series(start_info_list[0], dtype="int32"),
-        cudf.Series(start_info_list[1], dtype="int32"),
-    )
-
-    return uniform_neighborhood_sampling(
-        G, start_info_list=start_info_list, fanout_vals=fanout_vals)
+    start_list = cudf.Series(start_list, dtype="int32")  
+    return cugraph.dask.uniform_neighbor_sample(
+        G, start_list=start_list, fanout_vals=fanout_vals)
+
+def triangle_count(G):
+    # FIXME: Update this calls once triangle_count is promoted
+    return cugraph.dask.triangle_count(G)
+
+def eigenvector_centrality(G):
+    # FIXME: Update this calls once triangle_count is promoted
+    return cugraph.dask.eigenvector_centrality(G)
 
 ################################################################################
 # Session-wide setup and teardown

diff --git a/benchmarks/python_e2e/cugraph_funcs.py b/benchmarks/python_e2e/cugraph_funcs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -15,6 +15,7 @@
 
 import cugraph
 from cugraph.generators import rmat
+import cudf
 
 
 def generate_edgelist(scale,
@@ -96,9 +97,9 @@ def construct_graph(dataframe, symmetric=False):
     symmetrized and have self loops removed.
     """
     if symmetric:
-        G = cugraph.Graph()
+        G = cugraph.Graph(directed=False)
     else:
-        G = cugraph.DiGraph()
+        G = cugraph.Graph(directed=True)
 
     if len(dataframe.columns) > 2:
         G.from_cudf_edgelist(
@@ -137,6 +138,22 @@ def pagerank(G):
 def katz(G, alpha=None):
     return cugraph.katz_centrality(G, alpha)
 
+def hits(G):
+    return cugraph.hits(G)
+
+def uniform_neighbor_sample(G, start_list=None, fanout_vals=None):
+    # convert list to cudf.Series
+    start_list = cudf.Series(start_list, dtype="int32")  
+    return cugraph.uniform_neighbor_sample(
+        G, start_list=start_list, fanout_vals=fanout_vals)
+
+def triangle_count(G):
+    # FIXME: Update this calls once triangle_count is promoted
+    return cugraph.experimental.triangle_count(G)
+
+def eigenvector_centrality(G):
+    # FIXME: Update this calls once triangle_count is promoted
+    return cugraph.eigenvector_centrality(G)
 
 ################################################################################
 # Session-wide setup and teardown

diff --git a/benchmarks/python_e2e/main.py b/benchmarks/python_e2e/main.py
@@ -88,7 +88,9 @@ def run(algos,
                   "katz": funcs.katz,
                   "wcc": funcs.wcc,
                   "hits": funcs.hits,
-                  "neighborhood_sampling": funcs.neighborhood_sampling,
+                  "uniform_neighbor_sample": funcs.uniform_neighbor_sample,
+                  "triangle_count": funcs.triangle_count,
+                  "eigenvector_centrality": funcs.eigenvector_centrality,
                  }
 
     if algos:

diff --git a/benchmarks/python_e2e/reporting.py b/benchmarks/python_e2e/reporting.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -43,15 +43,15 @@ def generate_console_report(benchmark_result_list):
     # the graph_create run, then a run of each algo.
     r = benchmark_result_list[0]
     name = f"{r.name}({__namify_dict(r.params)})"
-    space = " " * (30 - len(name))
+    space = " " * (70 - len(name))
     retstring += f"{name}{space}{r.runtime:.6}\n"
 
     remaining_results = benchmark_result_list[1:]
 
     for r in remaining_results:
-        retstring += f"{'-'*60}\n"
+        retstring += f"{'-'*80}\n"
         name = f"{r.name}({__namify_dict(r.params)})"
-        space = " " * (30 - len(name))
+        space = " " * (70 - len(name))
         retstring += f"{name}{space}{r.runtime:.6}\n"
 
     return retstring

diff --git a/python/cugraph/cugraph/centrality/katz_centrality.py b/python/cugraph/cugraph/centrality/katz_centrality.py
@@ -23,7 +23,7 @@
 
 
 def katz_centrality(
-    G, alpha=None, beta=None, max_iter=100, tol=1.0e-6,
+    G, alpha=None, beta=1.0, max_iter=100, tol=1.0e-6,
     nstart=None, normalized=True
 ):
     """
@@ -114,11 +114,16 @@ def katz_centrality(
     >>> kc = cugraph.katz_centrality(G)
 
     """
+    G, isNx = ensure_cugraph_obj_for_nx(G)
+
+    if alpha is None:
+        degree_max = G.degree()['degree'].max()
+        alpha = 1 / (degree_max)
+
     if (alpha is not None) and (alpha <= 0.0):
         raise ValueError(f"'alpha' must be a positive float or None, "
                          f"got: {alpha}")
-    if beta is None:
-        beta = 1.0
+
     elif (not isinstance(beta, float)) or (beta <= 0.0):
         raise ValueError(f"'beta' must be a positive float or None, "
                          f"got: {beta}")
@@ -128,8 +133,6 @@ def katz_centrality(
     if (not isinstance(tol, float)) or (tol <= 0.0):
         raise ValueError(f"'tol' must be a positive float, got: {tol}")
 
-    G, isNx = ensure_cugraph_obj_for_nx(G)
-
     srcs = G.edgelist.edgelist_df['src']
     dsts = G.edgelist.edgelist_df['dst']
     if 'weights' in G.edgelist.edgelist_df.columns:
@@ -139,11 +142,6 @@ def katz_centrality(
         # with type hardcoded to float32 is passed into wrapper
         weights = cudf.Series((srcs + 1) / (srcs + 1), dtype="float32")
 
-    if alpha is None:
-        largest_out_degree = G.degrees().nlargest(n=1, columns="out_degree")
-        largest_out_degree = largest_out_degree["out_degree"].iloc[0]
-        alpha = 1 / (largest_out_degree + 1)
-
     if nstart is not None:
         if G.renumbered is True:
             if len(G.renumber_map.implementation.col_names) > 1:

diff --git a/python/cugraph/cugraph/dask/centrality/eigenvector_centrality.py b/python/cugraph/cugraph/dask/centrality/eigenvector_centrality.py
@@ -141,7 +141,8 @@ def eigenvector_centrality(
     """
     client = default_client()
     # Calling renumbering results in data that is sorted by degree
-    input_graph.compute_renumber_edge_list(transposed=False)
+    input_graph.compute_renumber_edge_list(
+        transposed=False, legacy_renum_only=True)
 
     graph_properties = GraphProperties(
         is_multigraph=False)

diff --git a/python/cugraph/cugraph/dask/centrality/katz_centrality.py b/python/cugraph/cugraph/dask/centrality/katz_centrality.py
@@ -174,6 +174,21 @@ def katz_centrality(
     """
     client = default_client()
 
+    if alpha is None:
+        degree_max = input_graph.degree()['degree'].max().compute()
+        alpha = 1 / (degree_max)
+
+    if (alpha is not None) and (alpha <= 0.0):
+        raise ValueError(f"'alpha' must be a positive float or None, "
+                         f"got: {alpha}")
+
+    # FIXME: 'legacy_renum_only' will not trigger the C++ renumbering
+    # In the future, once all the algos follow the C/Pylibcugraph path,
+    # compute_renumber_edge_list will only be used for multicolumn and
+    # string vertices since the renumbering will be done in pylibcugraph
+    input_graph.compute_renumber_edge_list(transposed=True,
+                                           legacy_renum_only=False)
+
     graph_properties = GraphProperties(
         is_multigraph=False)
 
@@ -188,10 +203,6 @@ def katz_centrality(
     num_edges = len(ddf)
     data = get_distributed_data(ddf)
 
-    # FIXME: Incorporate legacy_renum_only=True to only trigger the python
-    # renumbering when more support is added in the C/C++ API
-    input_graph.compute_renumber_edge_list(transposed=True,
-                                           legacy_renum_only=False)
     vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
     num_verts = vertex_partition_offsets.iloc[-1]
 

diff --git a/python/cugraph/cugraph/dask/common/part_utils.py b/python/cugraph/cugraph/dask/common/part_utils.py
@@ -93,14 +93,18 @@ async def _extract_partitions(dask_obj, client=None, batch_enabled=False):
             persisted = [client.persist(
                 dask_obj.get_partition(p), workers=w) for p, w in enumerate(
                     worker_list[:dask_obj.npartitions])]
-            # Persist empty dataframe with the remaining workers if there are
-            # less partitions than workers
+            # Persist empty dataframe/series with the remaining workers if
+            # there are less partitions than workers
             if dask_obj.npartitions < len(worker_list):
                 # The empty df should have the same column names and dtypes as
                 # dask_obj
-                empty_df = cudf.DataFrame(columns=list(dask_obj.columns))
-                empty_df = empty_df.astype(dict(zip(
-                    dask_obj.columns, dask_obj.dtypes)))
+                if isinstance(dask_obj, dask_cudf.DataFrame):
+                    empty_df = cudf.DataFrame(columns=list(dask_obj.columns))
+                    empty_df = empty_df.astype(dict(zip(
+                        dask_obj.columns, dask_obj.dtypes)))
+                else:
+                    empty_df = cudf.Series(dtype=dask_obj.dtype)
+
                 for p, w in enumerate(worker_list[dask_obj.npartitions:]):
                     empty_ddf = dask_cudf.from_cudf(empty_df, npartitions=1)
                     persisted.append(client.persist(empty_ddf, workers=w))