From cff84f5157720fc118e96d06bb1e1fd879d724c5 Mon Sep 17 00:00:00 2001 From: Iroy30 <41401566+Iroy30@users.noreply.github.com> Date: Wed, 28 Apr 2021 14:31:40 -0500 Subject: [PATCH] Add new graph structure (#1404) Authors: - https://github.com/Iroy30 Approvers: - Rick Ratzel (https://github.com/rlratzel) - Brad Rees (https://github.com/BradReesWork) - Joseph Nke (https://github.com/jnke2016) - Alex Fender (https://github.com/afender) URL: https://github.com/rapidsai/cugraph/pull/1404 --- benchmarks/bench_algos.py | 10 +- python/cugraph/__init__.py | 8 +- .../betweenness_centrality_wrapper.pyx | 2 +- .../edge_betweenness_centrality_wrapper.pyx | 2 +- python/cugraph/community/ktruss_subgraph.py | 4 +- python/cugraph/community/leiden.py | 4 +- python/cugraph/community/louvain.py | 4 +- .../cugraph/community/subgraph_extraction.py | 4 +- python/cugraph/community/triangle_count.py | 4 +- .../components/connectivity_wrapper.pyx | 2 +- python/cugraph/cores/k_core.py | 4 +- python/cugraph/dask/link_analysis/pagerank.py | 2 +- python/cugraph/layout/force_atlas2.py | 4 +- python/cugraph/link_analysis/pagerank.py | 2 +- python/cugraph/link_prediction/jaccard.py | 5 +- python/cugraph/link_prediction/overlap.py | 4 +- python/cugraph/link_prediction/wjaccard.py | 5 +- python/cugraph/link_prediction/woverlap.py | 4 +- python/cugraph/structure/__init__.py | 14 +- python/cugraph/structure/convert_matrix.py | 4 +- python/cugraph/structure/graph.py | 1509 ----------------- python/cugraph/structure/graph_classes.py | 743 ++++++++ .../graph_implementation/__init__.py | 17 + .../graph_implementation/npartiteGraph.py | 100 ++ .../simpleDistributedGraph.py | 473 ++++++ .../graph_implementation/simpleGraph.py | 823 +++++++++ python/cugraph/structure/hypergraph.py | 9 +- python/cugraph/structure/number_map.py | 1 - python/cugraph/structure/symmetrize.py | 30 +- python/cugraph/tests/test_graph.py | 11 +- python/cugraph/traversal/bfs.py | 2 +- .../traversal/traveling_salesperson.py | 2 +- python/cugraph/tree/minimum_spanning_tree.py | 2 +- 33 files changed, 2239 insertions(+), 1575 deletions(-) delete mode 100644 python/cugraph/structure/graph.py create mode 100644 python/cugraph/structure/graph_classes.py create mode 100644 python/cugraph/structure/graph_implementation/__init__.py create mode 100644 python/cugraph/structure/graph_implementation/npartiteGraph.py create mode 100644 python/cugraph/structure/graph_implementation/simpleDistributedGraph.py create mode 100644 python/cugraph/structure/graph_implementation/simpleGraph.py diff --git a/benchmarks/bench_algos.py b/benchmarks/bench_algos.py index f9f8bf9cf53..5284ffbd37b 100644 --- a/benchmarks/bench_algos.py +++ b/benchmarks/bench_algos.py @@ -51,9 +51,9 @@ def createGraph(csvFileName, graphType=None): # complexity lower, and assume tests have coverage to verify # correctness for those combinations. if "/directed/" in csvFileName: - graphType = cugraph.structure.graph.DiGraph + graphType = cugraph.structure.graph_classes.DiGraph else: - graphType = cugraph.structure.graph.Graph + graphType = cugraph.structure.graph_classes.Graph return cugraph.from_cudf_edgelist( utils.read_csv_file(csvFileName), @@ -122,7 +122,7 @@ def graphWithAdjListComputed(request): csvFileName = request.param[0] reinitRMM(request.param[1], request.param[2]) - G = createGraph(csvFileName, cugraph.structure.graph.Graph) + G = createGraph(csvFileName, cugraph.structure.graph_classes.Graph) G.view_adj_list() return G @@ -166,7 +166,7 @@ def bench_create_graph(gpubenchmark, edgelistCreated): gpubenchmark(cugraph.from_cudf_edgelist, edgelistCreated, source="0", destination="1", - create_using=cugraph.structure.graph.Graph, + create_using=cugraph.structure.graph_classes.Graph, renumber=False) @@ -183,7 +183,7 @@ def bench_create_digraph(gpubenchmark, edgelistCreated): gpubenchmark(cugraph.from_cudf_edgelist, edgelistCreated, source="0", destination="1", - create_using=cugraph.structure.graph.DiGraph, + create_using=cugraph.structure.graph_classes.DiGraph, renumber=False) diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py index d4632708591..1a113b93d8d 100644 --- a/python/cugraph/__init__.py +++ b/python/cugraph/__init__.py @@ -33,6 +33,8 @@ DiGraph, MultiGraph, MultiDiGraph, + BiPartiteGraph, + BiPartiteDiGraph, from_edgelist, from_cudf_edgelist, from_pandas_edgelist, @@ -48,7 +50,11 @@ symmetrize, symmetrize_df, symmetrize_ddf, -) + is_weighted, + is_directed, + is_multigraph, + is_bipartite, + is_multipartite) from cugraph.centrality import ( betweenness_centrality, diff --git a/python/cugraph/centrality/betweenness_centrality_wrapper.pyx b/python/cugraph/centrality/betweenness_centrality_wrapper.pyx index 855de3327ba..e63b6996816 100644 --- a/python/cugraph/centrality/betweenness_centrality_wrapper.pyx +++ b/python/cugraph/centrality/betweenness_centrality_wrapper.pyx @@ -17,7 +17,7 @@ # cython: language_level = 3 from cugraph.centrality.betweenness_centrality cimport betweenness_centrality as c_betweenness_centrality -from cugraph.structure.graph import DiGraph +from cugraph.structure.graph_classes import DiGraph from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t from libcpp cimport bool diff --git a/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx b/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx index 136bde1b0e3..095d291c45e 100644 --- a/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx +++ b/python/cugraph/centrality/edge_betweenness_centrality_wrapper.pyx @@ -18,7 +18,7 @@ from cugraph.centrality.betweenness_centrality cimport edge_betweenness_centrality as c_edge_betweenness_centrality from cugraph.structure import graph_primtypes_wrapper -from cugraph.structure.graph import DiGraph, Graph +from cugraph.structure.graph_classes import DiGraph, Graph from cugraph.structure.graph_primtypes cimport * from libc.stdint cimport uintptr_t from libcpp cimport bool diff --git a/python/cugraph/community/ktruss_subgraph.py b/python/cugraph/community/ktruss_subgraph.py index 8e4f1471955..f4e4f7fb1cc 100644 --- a/python/cugraph/community/ktruss_subgraph.py +++ b/python/cugraph/community/ktruss_subgraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.community import ktruss_subgraph_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph from cugraph.utilities import cugraph_to_nx diff --git a/python/cugraph/community/leiden.py b/python/cugraph/community/leiden.py index 8c1b79b8b63..641cf552192 100644 --- a/python/cugraph/community/leiden.py +++ b/python/cugraph/community/leiden.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 - 2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.community import leiden_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph from cugraph.utilities import df_score_to_dictionary diff --git a/python/cugraph/community/louvain.py b/python/cugraph/community/louvain.py index d4d56a1100c..a761e060038 100644 --- a/python/cugraph/community/louvain.py +++ b/python/cugraph/community/louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.community import louvain_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph from cugraph.utilities import df_score_to_dictionary diff --git a/python/cugraph/community/subgraph_extraction.py b/python/cugraph/community/subgraph_extraction.py index 8c702c2f58f..7815851d465 100644 --- a/python/cugraph/community/subgraph_extraction.py +++ b/python/cugraph/community/subgraph_extraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.community import subgraph_extraction_wrapper -from cugraph.structure.graph import null_check +from cugraph.structure.graph_classes import null_check from cugraph.utilities import check_nx_graph from cugraph.utilities import cugraph_to_nx diff --git a/python/cugraph/community/triangle_count.py b/python/cugraph/community/triangle_count.py index ff4dc9a5c5f..d28424a513e 100644 --- a/python/cugraph/community/triangle_count.py +++ b/python/cugraph/community/triangle_count.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.community import triangle_count_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph diff --git a/python/cugraph/components/connectivity_wrapper.pyx b/python/cugraph/components/connectivity_wrapper.pyx index 76d279a8116..ac173de3564 100644 --- a/python/cugraph/components/connectivity_wrapper.pyx +++ b/python/cugraph/components/connectivity_wrapper.pyx @@ -22,7 +22,7 @@ from cugraph.structure import utils_wrapper from cugraph.structure import graph_primtypes_wrapper from libc.stdint cimport uintptr_t from cugraph.structure.symmetrize import symmetrize -from cugraph.structure.graph import Graph as type_Graph +from cugraph.structure.graph_classes import Graph as type_Graph import cudf import numpy as np diff --git a/python/cugraph/cores/k_core.py b/python/cugraph/cores/k_core.py index ce67665764b..ca17bdd5c81 100644 --- a/python/cugraph/cores/k_core.py +++ b/python/cugraph/cores/k_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,7 +14,7 @@ from cugraph.cores import k_core_wrapper, core_number_wrapper from cugraph.utilities import cugraph_to_nx from cugraph.utilities import check_nx_graph -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph def k_core(G, k=None, core_number=None): diff --git a/python/cugraph/dask/link_analysis/pagerank.py b/python/cugraph/dask/link_analysis/pagerank.py index fb9f4ad3a25..f90e5c72231 100644 --- a/python/cugraph/dask/link_analysis/pagerank.py +++ b/python/cugraph/dask/link_analysis/pagerank.py @@ -119,7 +119,7 @@ def pagerank(input_graph, edge_attr='value') >>> pr = dcg.pagerank(dg) """ - from cugraph.structure.graph import null_check + from cugraph.structure.graph_classes import null_check nstart = None diff --git a/python/cugraph/layout/force_atlas2.py b/python/cugraph/layout/force_atlas2.py index 4c6859c6c03..0b745d8ca15 100644 --- a/python/cugraph/layout/force_atlas2.py +++ b/python/cugraph/layout/force_atlas2.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.layout import force_atlas2_wrapper -from cugraph.structure.graph import null_check +from cugraph.structure.graph_classes import null_check def force_atlas2( diff --git a/python/cugraph/link_analysis/pagerank.py b/python/cugraph/link_analysis/pagerank.py index 8a03ee077f6..4f5f8f6aae0 100644 --- a/python/cugraph/link_analysis/pagerank.py +++ b/python/cugraph/link_analysis/pagerank.py @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.link_analysis import pagerank_wrapper -from cugraph.structure.graph import null_check +from cugraph.structure.graph_classes import null_check import cugraph diff --git a/python/cugraph/link_prediction/jaccard.py b/python/cugraph/link_prediction/jaccard.py index 71cf0925342..2a9e9625050 100644 --- a/python/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/link_prediction/jaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,9 +13,8 @@ import pandas as pd import cudf -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph, null_check from cugraph.link_prediction import jaccard_wrapper -from cugraph.structure.graph import null_check from cugraph.utilities import check_nx_graph from cugraph.utilities import df_edge_score_to_dictionary diff --git a/python/cugraph/link_prediction/overlap.py b/python/cugraph/link_prediction/overlap.py index a5ca1e22979..077080bda1d 100644 --- a/python/cugraph/link_prediction/overlap.py +++ b/python/cugraph/link_prediction/overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,7 +13,7 @@ import pandas as pd from cugraph.link_prediction import overlap_wrapper -from cugraph.structure.graph import null_check +from cugraph.structure.graph_classes import null_check import cudf from cugraph.utilities import check_nx_graph from cugraph.utilities import df_edge_score_to_dictionary diff --git a/python/cugraph/link_prediction/wjaccard.py b/python/cugraph/link_prediction/wjaccard.py index 2a4e2417102..9679d1ba9cf 100644 --- a/python/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/link_prediction/wjaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,9 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph, null_check from cugraph.link_prediction import jaccard_wrapper -from cugraph.structure.graph import null_check import cudf diff --git a/python/cugraph/link_prediction/woverlap.py b/python/cugraph/link_prediction/woverlap.py index c93ad28ea54..fe64f812957 100644 --- a/python/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/link_prediction/woverlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.link_prediction import overlap_wrapper -from cugraph.structure.graph import null_check +from cugraph.structure.graph_classes import null_check import cudf diff --git a/python/cugraph/structure/__init__.py b/python/cugraph/structure/__init__.py index ad67fe91876..b70854d61ce 100644 --- a/python/cugraph/structure/__init__.py +++ b/python/cugraph/structure/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure.graph import Graph, DiGraph, MultiGraph, MultiDiGraph +from cugraph.structure.graph_classes import (Graph, + DiGraph, + MultiGraph, + MultiDiGraph, + BiPartiteGraph, + BiPartiteDiGraph) +from cugraph.structure.graph_classes import (is_weighted, + is_directed, + is_multigraph, + is_bipartite, + is_multipartite) from cugraph.structure.number_map import NumberMap from cugraph.structure.symmetrize import symmetrize, symmetrize_df , symmetrize_ddf from cugraph.structure.convert_matrix import (from_edgelist, diff --git a/python/cugraph/structure/convert_matrix.py b/python/cugraph/structure/convert_matrix.py index edd1c630185..5b3c375ea9d 100644 --- a/python/cugraph/structure/convert_matrix.py +++ b/python/cugraph/structure/convert_matrix.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -18,7 +18,7 @@ import cudf import dask_cudf -from cugraph.structure.graph import DiGraph, Graph +from cugraph.structure.graph_classes import DiGraph, Graph # optional dependencies used for handling different input types try: diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py deleted file mode 100644 index a3024f9d081..00000000000 --- a/python/cugraph/structure/graph.py +++ /dev/null @@ -1,1509 +0,0 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from cugraph.structure import graph_primtypes_wrapper -from cugraph.structure.symmetrize import symmetrize -from cugraph.structure.number_map import NumberMap -import cugraph.dask.common.mg_utils as mg_utils -import cudf -import dask_cudf -import cugraph.comms.comms as Comms -import pandas as pd -import numpy as np -from cugraph.dask.structure import replication - - -def null_check(col): - if col.null_count != 0: - raise ValueError("Series contains NULL values") - - -class Graph: - class EdgeList: - def __init__(self, *args): - if len(args) == 1: - self.__from_dask_cudf(*args) - else: - self.__from_cudf(*args) - - def __from_cudf(self, source, destination, edge_attr=None): - self.edgelist_df = cudf.DataFrame() - self.edgelist_df["src"] = source - self.edgelist_df["dst"] = destination - self.weights = False - if edge_attr is not None: - self.weights = True - if type(edge_attr) is dict: - for k in edge_attr.keys(): - self.edgelist_df[k] = edge_attr[k] - else: - self.edgelist_df["weights"] = edge_attr - - def __from_dask_cudf(self, ddf): - self.edgelist_df = ddf - self.weights = False - # FIXME: Edge Attribute not handled - - class AdjList: - def __init__(self, offsets, indices, value=None): - self.offsets = offsets - self.indices = indices - self.weights = value # Should be a dataframe for multiple weights - - class transposedAdjList: - def __init__(self, offsets, indices, value=None): - Graph.AdjList.__init__(self, offsets, indices, value) - - """ - cuGraph graph class containing basic graph creation and transformation - operations. - """ - - def __init__( - self, - m_graph=None, - symmetrized=False, - bipartite=False, - multi=False, - dynamic=False, - ): - """ - Returns - ------- - G : cuGraph.Graph. - - Examples - -------- - >>> import cuGraph - >>> G = cuGraph.Graph() - - """ - self.symmetrized = symmetrized - self.renumbered = False - self.renumber_map = None - self.bipartite = False - self.multipartite = False - self._nodes = {} - self.multi = multi - self.distributed = False - self.dynamic = dynamic - self.self_loop = False - self.edgelist = None - self.adjlist = None - self.transposedadjlist = None - self.edge_count = None - self.node_count = None - - # MG - Batch - self.batch_enabled = False - self.batch_edgelists = None - self.batch_adjlists = None - self.batch_transposed_adjlists = None - - if m_graph is not None: - if type(m_graph) is MultiGraph or type(m_graph) is MultiDiGraph: - elist = m_graph.view_edge_list() - if m_graph.edgelist.weights: - weights = "weights" - else: - weights = None - self.from_cudf_edgelist(elist, - source="src", - destination="dst", - edge_attr=weights) - else: - msg = ( - "Graph can only be initialized using MultiGraph " - "or MultiDiGraph" - ) - raise Exception(msg) - - def enable_batch(self): - client = mg_utils.get_client() - comms = Comms.get_comms() - - if client is None or comms is None: - msg = ( - "MG Batch needs a Dask Client and the " - "Communicator needs to be initialized." - ) - raise Exception(msg) - - self.batch_enabled = True - - if self.edgelist is not None: - if self.batch_edgelists is None: - self._replicate_edgelist() - - if self.adjlist is not None: - if self.batch_adjlists is None: - self._replicate_adjlist() - - if self.transposedadjlist is not None: - if self.batch_transposed_adjlists is None: - self._replicate_transposed_adjlist() - - def _replicate_edgelist(self): - client = mg_utils.get_client() - comms = Comms.get_comms() - - # FIXME: There might be a better way to control it - if client is None: - return - work_futures = replication.replicate_cudf_dataframe( - self.edgelist.edgelist_df, client=client, comms=comms - ) - - self.batch_edgelists = work_futures - - def _replicate_adjlist(self): - client = mg_utils.get_client() - comms = Comms.get_comms() - - # FIXME: There might be a better way to control it - if client is None: - return - - weights = None - offsets_futures = replication.replicate_cudf_series( - self.adjlist.offsets, client=client, comms=comms - ) - indices_futures = replication.replicate_cudf_series( - self.adjlist.indices, client=client, comms=comms - ) - - if self.adjlist.weights is not None: - weights = replication.replicate_cudf_series(self.adjlist.weights) - else: - weights = {worker: None for worker in offsets_futures} - - merged_futures = { - worker: [ - offsets_futures[worker], - indices_futures[worker], - weights[worker], - ] - for worker in offsets_futures - } - self.batch_adjlists = merged_futures - - # FIXME: Not implemented yet - def _replicate_transposed_adjlist(self): - self.batch_transposed_adjlists = True - - def clear(self): - """ - Empty this graph. This function is added for NetworkX compatibility. - """ - self.edgelist = None - self.adjlist = None - self.transposedadjlist = None - - self.batch_edgelists = None - self.batch_adjlists = None - self.batch_transposed_adjlists = None - - def add_nodes_from(self, nodes, bipartite=None, multipartite=None): - """ - Add nodes information to the Graph. - - Parameters - ---------- - nodes : list or cudf.Series - The nodes of the graph to be stored. If bipartite and multipartite - arguments are not passed, the nodes are considered to be a list of - all the nodes present in the Graph. - bipartite : str - Sets the Graph as bipartite. The nodes are stored as a set of nodes - of the partition named as bipartite argument. - multipartite : str - Sets the Graph as multipartite. The nodes are stored as a set of - nodes of the partition named as multipartite argument. - """ - if bipartite is None and multipartite is None: - self._nodes["all_nodes"] = cudf.Series(nodes) - else: - set_names = [i for i in self._nodes.keys() if i != "all_nodes"] - if multipartite is not None: - if self.bipartite: - raise Exception( - "The Graph is already set as bipartite. " - "Use bipartite option instead." - ) - self.multipartite = True - elif bipartite is not None: - if self.multipartite: - raise Exception( - "The Graph is set as multipartite. " - "Use multipartite option instead." - ) - self.bipartite = True - multipartite = bipartite - if multipartite not in set_names and len(set_names) == 2: - raise Exception( - "The Graph is set as bipartite and " - "already has two partitions initialized." - ) - self._nodes[multipartite] = cudf.Series(nodes) - - def is_bipartite(self): - """ - Checks if Graph is bipartite. This solely relies on the user call of - add_nodes_from with the bipartite parameter. This does not parse the - graph to check if it is bipartite. - """ - # TO DO: Call coloring algorithm - return self.bipartite - - def is_multipartite(self): - """ - Checks if Graph is multipartite. This solely relies on the user call - of add_nodes_from with the partition parameter. This does not parse - the graph to check if it is multipartite. - """ - # TO DO: Call coloring algorithm - return self.multipartite or self.bipartite - - def is_multigraph(self): - """ - Returns True if the graph is a multigraph. Else returns False. - """ - return self.multi - - def sets(self): - """ - Returns the bipartite set of nodes. This solely relies on the user's - call of add_nodes_from with the bipartite parameter. This does not - parse the graph to compute bipartite sets. If bipartite argument was - not provided during add_nodes_from(), it raise an exception that the - graph is not bipartite. - """ - # TO DO: Call coloring algorithm - set_names = [i for i in self._nodes.keys() if i != "all_nodes"] - if self.bipartite: - top = self._nodes[set_names[0]] - if len(set_names) == 2: - bottom = self._nodes[set_names[1]] - else: - bottom = cudf.Series( - set(self.nodes().values_host) - set(top.values_host) - ) - return top, bottom - else: - return {k: self._nodes[k] for k in set_names} - - def from_cudf_edgelist( - self, - input_df, - source="source", - destination="destination", - edge_attr=None, - renumber=True, - ): - """ - Initialize a graph from the edge list. It is an error to call this - method on an initialized Graph object. The passed input_df argument - wraps gdf_column objects that represent a graph using the edge list - format. source argument is source column name and destination argument - is destination column name. - - By default, renumbering is enabled to map the source and destination - vertices into an index in the range [0, V) where V is the number - of vertices. If the input vertices are a single column of integers - in the range [0, V), renumbering can be disabled and the original - external vertex ids will be used. - - If weights are present, edge_attr argument is the weights column name. - - Parameters - ---------- - input_df : cudf.DataFrame or dask_cudf.DataFrame - A DataFrame that contains edge information - If a dask_cudf.DataFrame is passed it will be reinterpreted as - a cudf.DataFrame. For the distributed path please use - from_dask_cudf_edgelist. - source : str or array-like - source column name or array of column names - destination : str or array-like - destination column name or array of column names - edge_attr : str or None - the weights column name. Default is None - renumber : bool - Indicate whether or not to renumber the source and destination - vertex IDs. Default is True. - - Examples - -------- - >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(df, source='0', destination='1', - edge_attr='2', renumber=False) - - """ - if self.edgelist is not None or self.adjlist is not None: - raise Exception("Graph already has values") - - s_col = source - d_col = destination - if not isinstance(s_col, list): - s_col = [s_col] - if not isinstance(d_col, list): - d_col = [d_col] - if not ( - set(s_col).issubset(set(input_df.columns)) - and set(d_col).issubset(set(input_df.columns)) - ): - raise Exception( - "source column names and/or destination column " - "names not found in input. Recheck the source and " - "destination parameters" - ) - - # FIXME: update for smaller GPUs - # Consolidation - if isinstance(input_df, cudf.DataFrame): - if len(input_df[source]) > 2147483100: - raise Exception( - "cudf dataFrame edge list is too big " - "to fit in a single GPU" - ) - elist = input_df - elif isinstance(input_df, dask_cudf.DataFrame): - if len(input_df[source]) > 2147483100: - raise Exception( - "dask_cudf dataFrame edge list is too big " - "to fit in a single GPU" - ) - elist = input_df.compute().reset_index(drop=True) - else: - raise Exception( - "input should be a cudf.DataFrame or " - "a dask_cudf dataFrame" - ) - - renumber_map = None - if renumber: - # FIXME: Should SG do lazy evaluation like MG? - elist, renumber_map = NumberMap.renumber( - elist, source, destination, store_transposed=False - ) - source = "src" - destination = "dst" - self.renumbered = True - self.renumber_map = renumber_map - else: - if type(source) is list and type(destination) is list: - raise Exception("set renumber to True for multi column ids") - - if (elist[source] == elist[destination]).any(): - self.self_loop = True - source_col = elist[source] - dest_col = elist[destination] - - if edge_attr is not None: - value_col = elist[edge_attr] - else: - value_col = None - - if value_col is not None: - source_col, dest_col, value_col = symmetrize( - source_col, dest_col, value_col, multi=self.multi, - symmetrize=not self.symmetrized) - else: - source_col, dest_col = symmetrize( - source_col, dest_col, multi=self.multi, - symmetrize=not self.symmetrized) - - self.edgelist = Graph.EdgeList(source_col, dest_col, value_col) - - if self.batch_enabled: - self._replicate_edgelist() - - self.renumber_map = renumber_map - - def from_pandas_edgelist( - self, - pdf, - source="source", - destination="destination", - edge_attr=None, - renumber=True, - ): - """ - Initialize a graph from the edge list. It is an error to call this - method on an initialized Graph object. Source argument is source - column name and destination argument is destination column name. - - By default, renumbering is enabled to map the source and destination - vertices into an index in the range [0, V) where V is the number - of vertices. If the input vertices are a single column of integers - in the range [0, V), renumbering can be disabled and the original - external vertex ids will be used. - - If weights are present, edge_attr argument is the weights column name. - - Parameters - ---------- - input_df : pandas.DataFrame - A DataFrame that contains edge information - source : str or array-like - source column name or array of column names - destination : str or array-like - destination column name or array of column names - edge_attr : str or None - the weights column name. Default is None - renumber : bool - Indicate whether or not to renumber the source and destination - vertex IDs. Default is True. - - Examples - -------- - >>> df = pandas.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_pandas_edgelist(df, source='0', destination='1', - edge_attr='2', renumber=False) - - """ - gdf = cudf.DataFrame.from_pandas(pdf) - self.from_cudf_edgelist(gdf, source=source, destination=destination, - edge_attr=edge_attr, renumber=renumber) - - def to_pandas_edgelist(self, source='source', destination='destination'): - """ - Returns the graph edge list as a Pandas DataFrame. - - Parameters - ---------- - source : str or array-like - source column name or array of column names - destination : str or array-like - destination column name or array of column names - - Returns - ------- - df : pandas.DataFrame - """ - - gdf = self.view_edge_list() - return gdf.to_pandas() - - def from_pandas_adjacency(self, pdf): - """ - Initializes the graph from pandas adjacency matrix - """ - np_array = pdf.to_numpy() - columns = pdf.columns - self.from_numpy_array(np_array, columns) - - def to_pandas_adjacency(self): - """ - Returns the graph adjacency matrix as a Pandas DataFrame. - """ - - np_array_data = self.to_numpy_array() - pdf = pd.DataFrame(np_array_data) - if self.renumbered: - nodes = self.renumber_map.implementation.df['0'].\ - values_host.tolist() - pdf.columns = nodes - pdf.index = nodes - return pdf - - def to_numpy_array(self): - """ - Returns the graph adjacency matrix as a NumPy array. - """ - - nlen = self.number_of_nodes() - elen = self.number_of_edges() - df = self.edgelist.edgelist_df - np_array = np.full((nlen, nlen), 0.0) - for i in range(0, elen): - np_array[df['src'].iloc[i], df['dst'].iloc[i]] = df['weights'].\ - iloc[i] - return np_array - - def to_numpy_matrix(self): - """ - Returns the graph adjacency matrix as a NumPy matrix. - """ - np_array = self.to_numpy_array() - return np.asmatrix(np_array) - - def from_numpy_array(self, np_array, nodes=None): - """ - Initializes the graph from numpy array containing adjacency matrix. - """ - src, dst = np_array.nonzero() - weight = np_array[src, dst] - df = cudf.DataFrame() - if nodes is not None: - df['src'] = nodes[src] - df['dst'] = nodes[dst] - else: - df['src'] = src - df['dst'] = dst - df['weight'] = weight - self.from_cudf_edgelist(df, 'src', 'dst', edge_attr='weight') - - def from_numpy_matrix(self, np_matrix): - """ - Initializes the graph from numpy matrix containing adjacency matrix. - """ - np_array = np.asarray(np_matrix) - self.from_numpy_array(np_array) - - def from_dask_cudf_edgelist( - self, - input_ddf, - source="source", - destination="destination", - edge_attr=None, - renumber=True, - ): - """ - Initializes the distributed graph from the dask_cudf.DataFrame - edgelist. Undirected Graphs are not currently supported. - - By default, renumbering is enabled to map the source and destination - vertices into an index in the range [0, V) where V is the number - of vertices. If the input vertices are a single column of integers - in the range [0, V), renumbering can be disabled and the original - external vertex ids will be used. - - Note that the graph object will store a reference to the - dask_cudf.DataFrame provided. - - Parameters - ---------- - input_ddf : dask_cudf.DataFrame - The edgelist as a dask_cudf.DataFrame - source : str or array-like - source column name or array of column names - destination : str - destination column name or array of column names - edge_attr : str - weights column name. - renumber : bool - If source and destination indices are not in range 0 to V where V - is number of vertices, renumber argument should be True. - """ - if self.edgelist is not None or self.adjlist is not None: - raise Exception("Graph already has values") - if not isinstance(input_ddf, dask_cudf.DataFrame): - raise Exception("input should be a dask_cudf dataFrame") - if type(self) is Graph: - raise Exception("Undirected distributed graph not supported") - - s_col = source - d_col = destination - if not isinstance(s_col, list): - s_col = [s_col] - if not isinstance(d_col, list): - d_col = [d_col] - if not ( - set(s_col).issubset(set(input_ddf.columns)) - and set(d_col).issubset(set(input_ddf.columns)) - ): - raise Exception( - "source column names and/or destination column " - "names not found in input. Recheck the source " - "and destination parameters" - ) - ddf_columns = s_col + d_col - if edge_attr is not None: - if not (set([edge_attr]).issubset(set(input_ddf.columns))): - raise Exception( - "edge_attr column name not found in input." - "Recheck the edge_attr parameter") - ddf_columns = ddf_columns + [edge_attr] - input_ddf = input_ddf[ddf_columns] - - if edge_attr is not None: - input_ddf = input_ddf.rename(columns={edge_attr: 'value'}) - - # - # Keep all of the original parameters so we can lazily - # evaluate this function - # - - # FIXME: Edge Attribute not handled - self.distributed = True - self.local_data = None - self.edgelist = None - self.adjlist = None - self.renumbered = renumber - self.input_df = input_ddf - self.source_columns = source - self.destination_columns = destination - self.store_tranposed = None - - def view_edge_list(self): - """ - Display the edge list. Compute it if needed. - - NOTE: If the graph is of type Graph() then the displayed undirected - edges are the same as displayed by networkx Graph(), but the direction - could be different i.e. an edge displayed by cugraph as (src, dst) - could be displayed as (dst, src) by networkx. - - cugraph.Graph stores symmetrized edgelist internally. For displaying - undirected edgelist for a Graph the upper trianglar matrix of the - symmetrized edgelist is returned. - - networkx.Graph renumbers the input and stores the upper triangle of - this renumbered input. Since the internal renumbering of networx and - cugraph is different, the upper triangular matrix of networkx - renumbered input may not be the same as cugraph's upper trianglar - matrix of the symmetrized edgelist. Hence the displayed source and - destination pairs in both will represent the same edge but node values - could be swapped. - - Returns - ------- - df : cudf.DataFrame - This cudf.DataFrame wraps source, destination and weight - - df[src] : cudf.Series - contains the source index for each edge - df[dst] : cudf.Series - contains the destination index for each edge - df[weight] : cusd.Series - Column is only present for weighted Graph, - then containing the weight value for each edge - """ - if self.distributed: - if self.edgelist is None: - raise Exception("Graph has no Edgelist.") - return self.edgelist.edgelist_df - if self.edgelist is None: - src, dst, weights = graph_primtypes_wrapper.view_edge_list(self) - self.edgelist = self.EdgeList(src, dst, weights) - - edgelist_df = self.edgelist.edgelist_df - - if self.renumbered: - edgelist_df = self.unrenumber(edgelist_df, "src") - edgelist_df = self.unrenumber(edgelist_df, "dst") - - if type(self) is Graph or type(self) is MultiGraph: - edgelist_df = edgelist_df[edgelist_df["src"] <= edgelist_df["dst"]] - edgelist_df = edgelist_df.reset_index(drop=True) - self.edge_count = len(edgelist_df) - - return edgelist_df - - def delete_edge_list(self): - """ - Delete the edge list. - """ - # decrease reference count to free memory if the referenced objects are - # no longer used. - self.edgelist = None - - def from_cudf_adjlist(self, offset_col, index_col, value_col=None): - """ - Initialize a graph from the adjacency list. It is an error to call this - method on an initialized Graph object. The passed offset_col and - index_col arguments wrap gdf_column objects that represent a graph - using the adjacency list format. - If value_col is None, an unweighted graph is created. If value_col is - not None, a weighted graph is created. - Undirected edges must be stored as directed edges in both directions. - - Parameters - ---------- - offset_col : cudf.Series - This cudf.Series wraps a gdf_column of size V + 1 (V: number of - vertices). - The gdf column contains the offsets for the vertices in this graph. - Offsets must be in the range [0, E] (E: number of edges). - index_col : cudf.Series - This cudf.Series wraps a gdf_column of size E (E: number of edges). - The gdf column contains the destination index for each edge. - Destination indices must be in the range [0, V) (V: number of - vertices). - value_col : cudf.Series, optional - This pointer can be ``None``. - If not, this cudf.Series wraps a gdf_column of size E (E: number of - edges). - The gdf column contains the weight value for each edge. - The expected type of the gdf_column element is floating point - number. - - Examples - -------- - >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> M = gdf.to_pandas() - >>> M = scipy.sparse.coo_matrix((M['2'],(M['0'],M['1']))) - >>> M = M.tocsr() - >>> offsets = cudf.Series(M.indptr) - >>> indices = cudf.Series(M.indices) - >>> G = cugraph.Graph() - >>> G.from_cudf_adjlist(offsets, indices, None) - - """ - if self.edgelist is not None or self.adjlist is not None: - raise Exception("Graph already has values") - self.adjlist = Graph.AdjList(offset_col, index_col, value_col) - - if self.batch_enabled: - self._replicate_adjlist() - - def compute_renumber_edge_list(self, transposed=False): - """ - Compute a renumbered edge list - - This function works in the MNMG pipeline and will transform - the input dask_cudf.DataFrame into a renumbered edge list - in the prescribed direction. - - This function will be called by the algorithms to ensure - that the graph is renumbered properly. The graph object will - cache the most recent renumbering attempt. For benchmarking - purposes, this function can be called prior to calling a - graph algorithm so we can measure the cost of computing - the renumbering separately from the cost of executing the - algorithm. - - When creating a CSR-like structure, set transposed to False. - When creating a CSC-like structure, set transposed to True. - - Parameters - ---------- - transposed : (optional) bool - If True, renumber with the intent to make a CSC-like - structure. If False, renumber with the intent to make - a CSR-like structure. Defaults to False. - """ - # FIXME: What to do about edge_attr??? - # currently ignored for MNMG - - if not self.distributed: - raise Exception( - "compute_renumber_edge_list should only be used " - "for distributed graphs" - ) - - if not self.renumbered: - self.edgelist = self.EdgeList(self.input_df) - self.renumber_map = None - else: - if self.edgelist is not None: - if type(self) is Graph: - return - - if self.store_transposed == transposed: - return - - del self.edgelist - - renumbered_ddf, number_map = NumberMap.renumber( - self.input_df, - self.source_columns, - self.destination_columns, - store_transposed=transposed, - ) - self.edgelist = self.EdgeList(renumbered_ddf) - self.renumber_map = number_map - self.store_transposed = transposed - - def view_adj_list(self): - """ - Display the adjacency list. Compute it if needed. - - Returns - ------- - offset_col : cudf.Series - This cudf.Series wraps a gdf_column of size V + 1 (V: number of - vertices). - The gdf column contains the offsets for the vertices in this graph. - Offsets are in the range [0, E] (E: number of edges). - index_col : cudf.Series - This cudf.Series wraps a gdf_column of size E (E: number of edges). - The gdf column contains the destination index for each edge. - Destination indices are in the range [0, V) (V: number of - vertices). - value_col : cudf.Series or ``None`` - This pointer is ``None`` for unweighted graphs. - For weighted graphs, this cudf.Series wraps a gdf_column of size E - (E: number of edges). - The gdf column contains the weight value for each edge. - The expected type of the gdf_column element is floating point - number. - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - - if self.adjlist is None: - if self.transposedadjlist is not None and type(self) is Graph: - off, ind, vals = ( - self.transposedadjlist.offsets, - self.transposedadjlist.indices, - self.transposedadjlist.weights, - ) - else: - off, ind, vals = graph_primtypes_wrapper.view_adj_list(self) - self.adjlist = self.AdjList(off, ind, vals) - - if self.batch_enabled: - self._replicate_adjlist() - - return self.adjlist.offsets, self.adjlist.indices, self.adjlist.weights - - def view_transposed_adj_list(self): - """ - Display the transposed adjacency list. Compute it if needed. - - Returns - ------- - offset_col : cudf.Series - This cudf.Series wraps a gdf_column of size V + 1 (V: number of - vertices). - The gdf column contains the offsets for the vertices in this graph. - Offsets are in the range [0, E] (E: number of edges). - index_col : cudf.Series - This cudf.Series wraps a gdf_column of size E (E: number of edges). - The gdf column contains the destination index for each edge. - Destination indices are in the range [0, V) (V: number of - vertices). - value_col : cudf.Series or ``None`` - This pointer is ``None`` for unweighted graphs. - For weighted graphs, this cudf.Series wraps a gdf_column of size E - (E: number of edges). - The gdf column contains the weight value for each edge. - The expected type of the gdf_column element is floating point - number. - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - if self.transposedadjlist is None: - if self.adjlist is not None and type(self) is Graph: - off, ind, vals = ( - self.adjlist.offsets, - self.adjlist.indices, - self.adjlist.weights, - ) - else: - ( - off, - ind, - vals, - ) = graph_primtypes_wrapper.view_transposed_adj_list(self) - self.transposedadjlist = self.transposedAdjList(off, ind, vals) - - if self.batch_enabled: - self._replicate_transposed_adjlist() - - return ( - self.transposedadjlist.offsets, - self.transposedadjlist.indices, - self.transposedadjlist.weights, - ) - - def delete_adj_list(self): - """ - Delete the adjacency list. - """ - self.adjlist = None - - def get_two_hop_neighbors(self): - """ - Compute vertex pairs that are two hops apart. The resulting pairs are - sorted before returning. - - Returns - ------- - df : cudf.DataFrame - df[first] : cudf.Series - the first vertex id of a pair, if an external vertex id - is defined by only one column - df[second] : cudf.Series - the second vertex id of a pair, if an external vertex id - is defined by only one column - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - df = graph_primtypes_wrapper.get_two_hop_neighbors(self) - if self.renumbered is True: - df = self.unrenumber(df, "first") - df = self.unrenumber(df, "second") - - return df - - def number_of_vertices(self): - """ - Get the number of nodes in the graph. - - """ - if self.node_count is None: - if self.distributed: - if self.edgelist is not None: - ddf = self.edgelist.edgelist_df[["src", "dst"]] - self.node_count = ddf.max().max().compute() + 1 - else: - raise Exception("Graph is Empty") - elif self.adjlist is not None: - self.node_count = len(self.adjlist.offsets) - 1 - elif self.transposedadjlist is not None: - self.node_count = len(self.transposedadjlist.offsets) - 1 - elif self.edgelist is not None: - df = self.edgelist.edgelist_df[["src", "dst"]] - self.node_count = df.max().max() + 1 - else: - raise Exception("Graph is Empty") - return self.node_count - - def number_of_nodes(self): - """ - An alias of number_of_vertices(). This function is added for NetworkX - compatibility. - - """ - return self.number_of_vertices() - - def number_of_edges(self, directed_edges=False): - """ - Get the number of edges in the graph. - - """ - if self.distributed: - if self.edgelist is not None: - return len(self.edgelist.edgelist_df) - else: - raise ValueError("Graph is Empty") - if directed_edges and self.edgelist is not None: - return len(self.edgelist.edgelist_df) - if self.edge_count is None: - if self.edgelist is not None: - if type(self) is Graph or type(self) is MultiGraph: - self.edge_count = len( - self.edgelist.edgelist_df[ - self.edgelist.edgelist_df["src"] - >= self.edgelist.edgelist_df["dst"] - ] - ) - else: - self.edge_count = len(self.edgelist.edgelist_df) - elif self.adjlist is not None: - self.edge_count = len(self.adjlist.indices) - elif self.transposedadjlist is not None: - self.edge_count = len(self.transposedadjlist.indices) - else: - raise ValueError("Graph is Empty") - return self.edge_count - - def in_degree(self, vertex_subset=None): - """ - Compute vertex in-degree. Vertex in-degree is the number of edges - pointing into the vertex. By default, this method computes vertex - degrees for the entire set of vertices. If vertex_subset is provided, - this method optionally filters out all but those listed in - vertex_subset. - - Parameters - ---------- - vertex_subset : cudf.Series or iterable container, optional - A container of vertices for displaying corresponding in-degree. - If not set, degrees are computed for the entire set of vertices. - - Returns - ------- - df : cudf.DataFrame - GPU DataFrame of size N (the default) or the size of the given - vertices (vertex_subset) containing the in_degree. The ordering is - relative to the adjacency list, or that given by the specified - vertex_subset. - - df[vertex] : cudf.Series - The vertex IDs (will be identical to vertex_subset if - specified). - df[degree] : cudf.Series - The computed in-degree of the corresponding vertex. - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> df = G.in_degree([0,9,12]) - - """ - return self._degree(vertex_subset, x=1) - - def out_degree(self, vertex_subset=None): - """ - Compute vertex out-degree. Vertex out-degree is the number of edges - pointing out from the vertex. By default, this method computes vertex - degrees for the entire set of vertices. If vertex_subset is provided, - this method optionally filters out all but those listed in - vertex_subset. - - Parameters - ---------- - vertex_subset : cudf.Series or iterable container, optional - A container of vertices for displaying corresponding out-degree. - If not set, degrees are computed for the entire set of vertices. - - Returns - ------- - df : cudf.DataFrame - GPU DataFrame of size N (the default) or the size of the given - vertices (vertex_subset) containing the out_degree. The ordering is - relative to the adjacency list, or that given by the specified - vertex_subset. - - df[vertex] : cudf.Series - The vertex IDs (will be identical to vertex_subset if - specified). - df[degree] : cudf.Series - The computed out-degree of the corresponding vertex. - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> df = G.out_degree([0,9,12]) - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - return self._degree(vertex_subset, x=2) - - def degree(self, vertex_subset=None): - """ - Compute vertex degree, which is the total number of edges incident - to a vertex (both in and out edges). By default, this method computes - degrees for the entire set of vertices. If vertex_subset is provided, - then this method optionally filters out all but those listed in - vertex_subset. - - Parameters - ---------- - vertex_subset : cudf.Series or iterable container, optional - a container of vertices for displaying corresponding degree. If not - set, degrees are computed for the entire set of vertices. - - Returns - ------- - df : cudf.DataFrame - GPU DataFrame of size N (the default) or the size of the given - vertices (vertex_subset) containing the degree. The ordering is - relative to the adjacency list, or that given by the specified - vertex_subset. - - df['vertex'] : cudf.Series - The vertex IDs (will be identical to vertex_subset if - specified). - df['degree'] : cudf.Series - The computed degree of the corresponding vertex. - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> all_df = G.degree() - >>> subset_df = G.degree([0,9,12]) - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - return self._degree(vertex_subset) - - # FIXME: vertex_subset could be a DataFrame for multi-column vertices - def degrees(self, vertex_subset=None): - """ - Compute vertex in-degree and out-degree. By default, this method - computes vertex degrees for the entire set of vertices. If - vertex_subset is provided, this method optionally filters out all but - those listed in vertex_subset. - - Parameters - ---------- - vertex_subset : cudf.Series or iterable container, optional - A container of vertices for displaying corresponding degree. If not - set, degrees are computed for the entire set of vertices. - - Returns - ------- - df : cudf.DataFrame - GPU DataFrame of size N (the default) or the size of the given - vertices (vertex_subset) containing the degrees. The ordering is - relative to the adjacency list, or that given by the specified - vertex_subset. - - df['vertex'] : cudf.Series - The vertex IDs (will be identical to vertex_subset if - specified). - df['in_degree'] : cudf.Series - The in-degree of the vertex. - df['out_degree'] : cudf.Series - The out-degree of the vertex. - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> df = G.degrees([0,9,12]) - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - ( - vertex_col, - in_degree_col, - out_degree_col, - ) = graph_primtypes_wrapper._degrees(self) - - df = cudf.DataFrame() - df["vertex"] = vertex_col - df["in_degree"] = in_degree_col - df["out_degree"] = out_degree_col - - if self.renumbered is True: - df = self.unrenumber(df, "vertex") - - if vertex_subset is not None: - df = df[df['vertex'].isin(vertex_subset)] - - return df - - def _degree(self, vertex_subset, x=0): - vertex_col, degree_col = graph_primtypes_wrapper._degree(self, x) - df = cudf.DataFrame() - df["vertex"] = vertex_col - df["degree"] = degree_col - - if self.renumbered is True: - df = self.unrenumber(df, "vertex") - - if vertex_subset is not None: - df = df[df['vertex'].isin(vertex_subset)] - - return df - - def to_directed(self): - """ - Return a directed representation of the graph. - This function sets the type of graph as DiGraph() and returns the - directed view. - - Returns - ------- - G : DiGraph - A directed graph with the same nodes, and each edge (u,v,weights) - replaced by two directed edges (u,v,weights) and (v,u,weights). - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> G = cugraph.Graph() - >>> G.from_cudf_edgelist(M, '0', '1') - >>> DiG = G.to_directed() - - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - if type(self) is DiGraph: - return self - if type(self) is Graph: - DiG = DiGraph() - DiG.renumbered = self.renumbered - DiG.renumber_map = self.renumber_map - DiG.edgelist = self.edgelist - DiG.adjlist = self.adjlist - DiG.transposedadjlist = self.transposedadjlist - return DiG - - def to_undirected(self): - """ - Return an undirected copy of the graph. - - Returns - ------- - G : Graph - A undirected graph with the same nodes, and each directed edge - (u,v,weights) replaced by an undirected edge (u,v,weights). - - Examples - -------- - >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', - >>> dtype=['int32', 'int32', 'float32'], header=None) - >>> DiG = cugraph.DiGraph() - >>> DiG.from_cudf_edgelist(M, '0', '1') - >>> G = DiG.to_undirected() - - """ - - if type(self) is Graph: - return self - if type(self) is DiGraph: - G = Graph() - df = self.edgelist.edgelist_df - G.renumbered = self.renumbered - G.renumber_map = self.renumber_map - G.multi = self.multi - if self.edgelist.weights: - source_col, dest_col, value_col = symmetrize( - df["src"], df["dst"], df["weights"] - ) - else: - source_col, dest_col = symmetrize(df["src"], df["dst"]) - value_col = None - G.edgelist = Graph.EdgeList(source_col, dest_col, value_col) - - return G - - def is_directed(self): - if type(self) is DiGraph: - return True - else: - return False - - def has_node(self, n): - """ - Returns True if the graph contains the node n. - """ - if self.edgelist is None: - raise Exception("Graph has no Edgelist.") - if self.distributed: - ddf = self.edgelist.edgelist_df[["src", "dst"]] - return (ddf == n).any().any().compute() - if self.renumbered: - tmp = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) - return tmp[0] is not cudf.NA and tmp[0] >= 0 - else: - df = self.edgelist.edgelist_df[["src", "dst"]] - return (df == n).any().any() - - def has_edge(self, u, v): - """ - Returns True if the graph contains the edge (u,v). - """ - if self.edgelist is None: - raise Exception("Graph has no Edgelist.") - if self.renumbered: - tmp = cudf.DataFrame({"src": [u, v]}) - tmp = tmp.astype({"src": "int"}) - tmp = self.add_internal_vertex_id( - tmp, "id", "src", preserve_order=True - ) - - u = tmp["id"][0] - v = tmp["id"][1] - - df = self.edgelist.edgelist_df - if self.distributed: - return ((df["src"] == u) & (df["dst"] == v)).any().compute() - return ((df["src"] == u) & (df["dst"] == v)).any() - - def edges(self): - """ - Returns all the edges in the graph as a cudf.DataFrame containing - sources and destinations. It does not return the edge weights. - For viewing edges with weights use view_edge_list() - """ - return self.view_edge_list()[["src", "dst"]] - - def nodes(self): - """ - Returns all the nodes in the graph as a cudf.Series - """ - if self.distributed: - raise Exception("Not supported for distributed graph") - if self.edgelist is not None: - df = self.edgelist.edgelist_df - if self.renumbered: - # FIXME: If vertices are multicolumn - # this needs to return a dataframe - # FIXME: This relies on current implementation - # of NumberMap, should not really expose - # this, perhaps add a method to NumberMap - return self.renumber_map.implementation.df["0"] - else: - return cudf.concat([df["src"], df["dst"]]).unique() - if self.adjlist is not None: - return cudf.Series(np.arange(0, self.number_of_nodes())) - if "all_nodes" in self._nodes.keys(): - return self._nodes["all_nodes"] - else: - n = cudf.Series(dtype="int") - set_names = [i for i in self._nodes.keys() if i != "all_nodes"] - for k in set_names: - n = n.append(self._nodes[k]) - return n - - def neighbors(self, n): - if self.edgelist is None: - raise Exception("Graph has no Edgelist.") - if self.distributed: - ddf = self.edgelist.edgelist_df - return ddf[ddf["src"] == n]["dst"].reset_index(drop=True) - if self.renumbered: - node = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) - if len(node) == 0: - return cudf.Series(dtype="int") - n = node[0] - - df = self.edgelist.edgelist_df - neighbors = df[df["src"] == n]["dst"].reset_index(drop=True) - if self.renumbered: - # FIXME: Multi-column vertices - return self.renumber_map.from_internal_vertex_id(neighbors)["0"] - else: - return neighbors - - def unrenumber(self, df, column_name, preserve_order=False): - """ - Given a DataFrame containing internal vertex ids in the identified - column, replace this with external vertex ids. If the renumbering - is from a single column, the output dataframe will use the same - name for the external vertex identifiers. If the renumbering is from - a multi-column input, the output columns will be labeled 0 through - n-1 with a suffix of _column_name. - - Note that this function does not guarantee order in single GPU mode, - and does not guarantee order or partitioning in multi-GPU mode. If you - wish to preserve ordering, add an index column to df and sort the - return by that index column. - - Parameters - ---------- - df: cudf.DataFrame or dask_cudf.DataFrame - A DataFrame containing internal vertex identifiers that will be - converted into external vertex identifiers. - - column_name: string - Name of the column containing the internal vertex id. - - preserve_order: (optional) bool - If True, preserve the order of the rows in the output - DataFrame to match the input DataFrame - - Returns - --------- - df : cudf.DataFrame or dask_cudf.DataFrame - The original DataFrame columns exist unmodified. The external - vertex identifiers are added to the DataFrame, the internal - vertex identifier column is removed from the dataframe. - """ - return self.renumber_map.unrenumber(df, column_name, preserve_order) - - def lookup_internal_vertex_id(self, df, column_name=None): - """ - Given a DataFrame containing external vertex ids in the identified - columns, or a Series containing external vertex ids, return a - Series with the internal vertex ids. - - Note that this function does not guarantee order in single GPU mode, - and does not guarantee order or partitioning in multi-GPU mode. - - Parameters - ---------- - df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series - A DataFrame containing external vertex identifiers that will be - converted into internal vertex identifiers. - - column_name: (optional) string - Name of the column containing the external vertex ids - - Returns - --------- - series : cudf.Series or dask_cudf.Series - The internal vertex identifiers - """ - return self.renumber_map.to_internal_vertex_id(df, column_name) - - def add_internal_vertex_id( - self, - df, - internal_column_name, - external_column_name, - drop=True, - preserve_order=False, - ): - """ - Given a DataFrame containing external vertex ids in the identified - columns, return a DataFrame containing the internal vertex ids as the - specified column name. Optionally drop the external vertex id columns. - Optionally preserve the order of the original DataFrame. - - Parameters - ---------- - df: cudf.DataFrame or dask_cudf.DataFrame - A DataFrame containing external vertex identifiers that will be - converted into internal vertex identifiers. - - internal_column_name: string - Name of column to contain the internal vertex id - - external_column_name: string or list of strings - Name of the column(s) containing the external vertex ids - - drop: (optional) bool, defaults to True - Drop the external columns from the returned DataFrame - - preserve_order: (optional) bool, defaults to False - Preserve the order of the data frame (requires an extra sort) - - Returns - --------- - df : cudf.DataFrame or dask_cudf.DataFrame - Original DataFrame with new column containing internal vertex - id - """ - return self.renumber_map.add_internal_vertex_id( - df, - internal_column_name, - external_column_name, - drop, - preserve_order, - ) - - -class DiGraph(Graph): - """ - cuGraph directed graph class. Drops parallel edges. - """ - def __init__(self, m_graph=None): - super().__init__( - m_graph=m_graph, symmetrized=True - ) - - -class MultiGraph(Graph): - """ - cuGraph class to create and store undirected graphs with parallel edges. - """ - def __init__(self, renumbered=True): - super().__init__(multi=True) - - -class MultiDiGraph(Graph): - """ - cuGraph class to create and store directed graphs with parallel edges. - """ - def __init__(self, renumbered=True): - super().__init__(symmetrized=True, multi=True) diff --git a/python/cugraph/structure/graph_classes.py b/python/cugraph/structure/graph_classes.py new file mode 100644 index 00000000000..3cd1863a054 --- /dev/null +++ b/python/cugraph/structure/graph_classes.py @@ -0,0 +1,743 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from .graph_implementation import (simpleGraphImpl, + simpleDistributedGraphImpl, + npartiteGraphImpl) +import cudf +import warnings + + +# TODO: Move to utilities +def null_check(col): + if col.null_count != 0: + raise ValueError("Series contains NULL values") + + +class Graph: + class Properties: + def __init__(self, directed): + self.directed = directed + self.weights = False + + def __init__(self, m_graph=None, directed=False): + self._Impl = None + self.graph_properties = Graph.Properties(directed) + if m_graph is not None: + if m_graph.is_multigraph(): + elist = m_graph.view_edge_list() + if m_graph.is_weighted(): + weights = "weights" + else: + weights = None + self.from_cudf_edgelist(elist, + source="src", + destination="dst", + edge_attr=weights) + else: + msg = ( + "Graph can only be initialized using MultiGraph " + "or MultiDiGraph" + ) + raise Exception(msg) + + def __getattr__(self, name): + if self._Impl is None: + raise AttributeError(name) + if hasattr(self._Impl, name): + return getattr(self._Impl, name) + # FIXME: Remove access to Impl properties + elif hasattr(self._Impl.properties, name): + return getattr(self._Impl.properties, name) + else: + raise AttributeError(name) + + def __dir__(self): + return dir(self._Impl) + + def from_cudf_edgelist( + self, + input_df, + source="source", + destination="destination", + edge_attr=None, + renumber=True + ): + """ + Initialize a graph from the edge list. It is an error to call this + method on an initialized Graph object. The passed input_df argument + wraps gdf_column objects that represent a graph using the edge list + format. source argument is source column name and destination argument + is destination column name. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + If weights are present, edge_attr argument is the weights column name. + Parameters + ---------- + input_df : cudf.DataFrame or dask_cudf.DataFrame + A DataFrame that contains edge information + If a dask_cudf.DataFrame is passed it will be reinterpreted as + a cudf.DataFrame. For the distributed path please use + from_dask_cudf_edgelist. + source : str or array-like + source column name or array of column names + destination : str or array-like + destination column name or array of column names + edge_attr : str or None + the weights column name. Default is None + renumber : bool + Indicate whether or not to renumber the source and destination + vertex IDs. Default is True. + Examples + -------- + >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(df, source='0', destination='1', + edge_attr='2', renumber=False) + """ + if self._Impl is None: + self._Impl = simpleGraphImpl(self.graph_properties) + elif type(self._Impl) is not simpleGraphImpl: + raise Exception("Graph is already initialized") + elif (self._Impl.edgelist is not None or + self._Impl.adjlist is not None): + raise Exception("Graph already has values") + self._Impl._simpleGraphImpl__from_edgelist(input_df, + source=source, + destination=destination, + edge_attr=edge_attr, + renumber=renumber) + + def from_cudf_adjlist(self, offset_col, index_col, value_col=None): + """ + Initialize a graph from the adjacency list. It is an error to call this + method on an initialized Graph object. The passed offset_col and + index_col arguments wrap gdf_column objects that represent a graph + using the adjacency list format. + If value_col is None, an unweighted graph is created. If value_col is + not None, a weighted graph is created. + Undirected edges must be stored as directed edges in both directions. + Parameters + ---------- + offset_col : cudf.Series + This cudf.Series wraps a gdf_column of size V + 1 (V: number of + vertices). + The gdf column contains the offsets for the vertices in this graph. + Offsets must be in the range [0, E] (E: number of edges). + index_col : cudf.Series + This cudf.Series wraps a gdf_column of size E (E: number of edges). + The gdf column contains the destination index for each edge. + Destination indices must be in the range [0, V) (V: number of + vertices). + value_col : cudf.Series, optional + This pointer can be ``None``. + If not, this cudf.Series wraps a gdf_column of size E (E: number of + edges). + The gdf column contains the weight value for each edge. + The expected type of the gdf_column element is floating point + number. + Examples + -------- + >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> M = gdf.to_pandas() + >>> M = scipy.sparse.coo_matrix((M['2'],(M['0'],M['1']))) + >>> M = M.tocsr() + >>> offsets = cudf.Series(M.indptr) + >>> indices = cudf.Series(M.indices) + >>> G = cugraph.Graph() + >>> G.from_cudf_adjlist(offsets, indices, None) + """ + if self._Impl is None: + self._Impl = simpleGraphImpl(self.graph_properties) + elif type(self._Impl) is not simpleGraphImpl: + raise Exception("Graph is already initialized") + elif (self._Impl.edgelist is not None or + self._Impl.adjlist is not None): + raise Exception("Graph already has values") + self._Impl._simpleGraphImpl__from_adjlist(offset_col, + index_col, + value_col) + + def from_dask_cudf_edgelist( + self, + input_ddf, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + ): + """ + Initializes the distributed graph from the dask_cudf.DataFrame + edgelist. Undirected Graphs are not currently supported. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + Note that the graph object will store a reference to the + dask_cudf.DataFrame provided. + Parameters + ---------- + input_ddf : dask_cudf.DataFrame + The edgelist as a dask_cudf.DataFrame + source : str or array-like + source column name or array of column names + destination : str + destination column name or array of column names + edge_attr : str + weights column name. + renumber : bool + If source and destination indices are not in range 0 to V where V + is number of vertices, renumber argument should be True. + """ + if self._Impl is None: + self._Impl = simpleDistributedGraphImpl(self.graph_properties) + elif type(self._Impl) is not simpleDistributedGraphImpl: + raise Exception("Graph is already initialized") + elif (self._Impl.edgelist is not None): + raise Exception("Graph already has values") + self._Impl._simpleDistributedGraphImpl__from_edgelist(input_ddf, + source, + destination, + edge_attr, + renumber) + + # Move to Compat Module + def from_pandas_edgelist( + self, + pdf, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + ): + """ + Initialize a graph from the edge list. It is an error to call this + method on an initialized Graph object. Source argument is source + column name and destination argument is destination column name. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + If weights are present, edge_attr argument is the weights column name. + Parameters + ---------- + input_df : pandas.DataFrame + A DataFrame that contains edge information + source : str or array-like + source column name or array of column names + destination : str or array-like + destination column name or array of column names + edge_attr : str or None + the weights column name. Default is None + renumber : bool + Indicate whether or not to renumber the source and destination + vertex IDs. Default is True. + Examples + -------- + >>> df = pandas.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_pandas_edgelist(df, source='0', destination='1', + edge_attr='2', renumber=False) + """ + gdf = cudf.DataFrame.from_pandas(pdf) + self.from_cudf_edgelist(gdf, source=source, destination=destination, + edge_attr=edge_attr, renumber=renumber) + + def from_pandas_adjacency(self, pdf): + """ + Initializes the graph from pandas adjacency matrix + """ + np_array = pdf.to_numpy() + columns = pdf.columns + self.from_numpy_array(np_array, columns) + + def from_numpy_array(self, np_array, nodes=None): + """ + Initializes the graph from numpy array containing adjacency matrix. + """ + src, dst = np_array.nonzero() + weight = np_array[src, dst] + df = cudf.DataFrame() + if nodes is not None: + df['src'] = nodes[src] + df['dst'] = nodes[dst] + else: + df['src'] = src + df['dst'] = dst + df['weight'] = weight + self.from_cudf_edgelist(df, 'src', 'dst', edge_attr='weight') + + def from_numpy_matrix(self, np_matrix): + """ + Initializes the graph from numpy matrix containing adjacency matrix. + """ + np_array = np.asarray(np_matrix) + self.from_numpy_array(np_array) + + def unrenumber(self, df, column_name, preserve_order=False): + """ + Given a DataFrame containing internal vertex ids in the identified + column, replace this with external vertex ids. If the renumbering + is from a single column, the output dataframe will use the same + name for the external vertex identifiers. If the renumbering is from + a multi-column input, the output columns will be labeled 0 through + n-1 with a suffix of _column_name. + Note that this function does not guarantee order in single GPU mode, + and does not guarantee order or partitioning in multi-GPU mode. If you + wish to preserve ordering, add an index column to df and sort the + return by that index column. + Parameters + ---------- + df: cudf.DataFrame or dask_cudf.DataFrame + A DataFrame containing internal vertex identifiers that will be + converted into external vertex identifiers. + column_name: string + Name of the column containing the internal vertex id. + preserve_order: (optional) bool + If True, preserve the order of the rows in the output + DataFrame to match the input DataFrame + Returns + --------- + df : cudf.DataFrame or dask_cudf.DataFrame + The original DataFrame columns exist unmodified. The external + vertex identifiers are added to the DataFrame, the internal + vertex identifier column is removed from the dataframe. + """ + return self.renumber_map.unrenumber(df, column_name, preserve_order) + + def lookup_internal_vertex_id(self, df, column_name=None): + """ + Given a DataFrame containing external vertex ids in the identified + columns, or a Series containing external vertex ids, return a + Series with the internal vertex ids. + Note that this function does not guarantee order in single GPU mode, + and does not guarantee order or partitioning in multi-GPU mode. + Parameters + ---------- + df: cudf.DataFrame, cudf.Series, dask_cudf.DataFrame, dask_cudf.Series + A DataFrame containing external vertex identifiers that will be + converted into internal vertex identifiers. + column_name: (optional) string + Name of the column containing the external vertex ids + Returns + --------- + series : cudf.Series or dask_cudf.Series + The internal vertex identifiers + """ + return self.renumber_map.to_internal_vertex_id(df, column_name) + + def add_internal_vertex_id( + self, + df, + internal_column_name, + external_column_name, + drop=True, + preserve_order=False, + ): + """ + Given a DataFrame containing external vertex ids in the identified + columns, return a DataFrame containing the internal vertex ids as the + specified column name. Optionally drop the external vertex id columns. + Optionally preserve the order of the original DataFrame. + Parameters + ---------- + df: cudf.DataFrame or dask_cudf.DataFrame + A DataFrame containing external vertex identifiers that will be + converted into internal vertex identifiers. + internal_column_name: string + Name of column to contain the internal vertex id + external_column_name: string or list of strings + Name of the column(s) containing the external vertex ids + drop: (optional) bool, defaults to True + Drop the external columns from the returned DataFrame + preserve_order: (optional) bool, defaults to False + Preserve the order of the data frame (requires an extra sort) + Returns + --------- + df : cudf.DataFrame or dask_cudf.DataFrame + Original DataFrame with new column containing internal vertex + id + """ + return self.renumber_map.add_internal_vertex_id( + df, + internal_column_name, + external_column_name, + drop, + preserve_order, + ) + + def clear(self): + """ + Empty the graph. + """ + self._Impl = None + + def is_bipartite(self): + """ + Checks if Graph is bipartite. This solely relies on the user call of + add_nodes_from with the bipartite parameter. This does not parse the + graph to check if it is bipartite. + """ + # TO DO: Call coloring algorithm + return False + + def is_multipartite(self): + """ + Checks if Graph is multipartite. This solely relies on the user call + of add_nodes_from with the partition parameter. This does not parse + the graph to check if it is multipartite. + """ + # TO DO: Call coloring algorithm + return False + + def is_multigraph(self): + """ + Returns True if the graph is a multigraph. Else returns False. + """ + # TO DO: Call coloring algorithm + return False + + def is_directed(self): + """ + Returns True if the graph is a directed graph. + Returns False if the graph is an undirected graph. + """ + return self.graph_properties.directed + + def is_renumbered(self): + """ + Returns True if the graph is renumbered. + """ + return self.properties.renumbered + + def is_weighted(self): + """ + Returns True if the graph has edge weights. + """ + return self.properties.weighted + + def has_isolated_vertices(self): + """ + Returns True if the graph has isolated vertices. + """ + return self.properties.isolated_vertices + + def to_directed(self): + """ + Return a directed representation of the graph. + This function sets the type of graph as DiGraph() and returns the + directed view. + Returns + ------- + G : DiGraph + A directed graph with the same nodes, and each edge (u,v,weights) + replaced by two directed edges (u,v,weights) and (v,u,weights). + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> DiG = G.to_directed() + """ + directed_graph = type(self)() + directed_graph.graph_properties.directed = True + directed_graph._Impl = type(self._Impl)(directed_graph. + graph_properties) + self._Impl.to_directed(directed_graph._Impl) + return directed_graph + + def to_undirected(self): + """ + Return an undirected copy of the graph. + Returns + ------- + G : Graph + A undirected graph with the same nodes, and each directed edge + (u,v,weights) replaced by an undirected edge (u,v,weights). + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> DiG = cugraph.DiGraph() + >>> DiG.from_cudf_edgelist(M, '0', '1') + >>> G = DiG.to_undirected() + """ + + if self.graph_properties.directed is False: + undirected_graph = type(self)() + elif self.__class__.__bases__[0] == object: + undirected_graph = type(self)() + else: + undirected_graph = self.__class__.__bases__[0]() + undirected_graph._Impl = type(self._Impl)(undirected_graph. + graph_properties) + self._Impl.to_undirected(undirected_graph._Impl) + return undirected_graph + + def add_nodes_from(self, nodes): + """ + Add nodes information to the Graph. + Parameters + ---------- + nodes : list or cudf.Series + The nodes of the graph to be stored. + """ + self._Impl._nodes["all_nodes"] = cudf.Series(nodes) + + # TODO: Add function + # def properties(): + + +class DiGraph(Graph): + def __init__(self, m_graph=None): + warnings.warn( + "DiGraph is deprecated, use Graph(directed=True) instead", + DeprecationWarning + ) + super(DiGraph, self).__init__(m_graph, directed=True) + + +class MultiGraph(Graph): + def __init__(self, directed=False): + super(MultiGraph, self).__init__(directed=directed) + self.graph_properties.multi_edge = True + + def is_multigraph(self): + """ + Returns True if the graph is a multigraph. Else returns False. + """ + # TO DO: Call coloring algorithm + return True + + +class MultiDiGraph(MultiGraph): + def __init__(self): + warnings.warn( + "MultiDiGraph is deprecated,\ + use MultiGraph(directed=True) instead", + DeprecationWarning + ) + super(MultiDiGraph, self).__init__(directed=True) + + +class Tree(Graph): + def __init__(self, directed=False): + super(Tree, self).__init__(directed=directed) + self.graph_properties.tree = True + + +class NPartiteGraph(Graph): + def __init__(self, bipartite=False, directed=False): + super(NPartiteGraph, self).__init__(directed=directed) + self.graph_properties.bipartite = bipartite + self.graph_properties.multipartite = True + + def from_cudf_edgelist( + self, + input_df, + source="source", + destination="destination", + edge_attr=None, + renumber=True + ): + """ + Initialize a graph from the edge list. It is an error to call this + method on an initialized Graph object. The passed input_df argument + wraps gdf_column objects that represent a graph using the edge list + format. source argument is source column name and destination argument + is destination column name. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + If weights are present, edge_attr argument is the weights column name. + Parameters + ---------- + input_df : cudf.DataFrame or dask_cudf.DataFrame + A DataFrame that contains edge information + If a dask_cudf.DataFrame is passed it will be reinterpreted as + a cudf.DataFrame. For the distributed path please use + from_dask_cudf_edgelist. + source : str or array-like + source column name or array of column names + destination : str or array-like + destination column name or array of column names + edge_attr : str or None + the weights column name. Default is None + renumber : bool + Indicate whether or not to renumber the source and destination + vertex IDs. Default is True. + Examples + -------- + >>> df = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.BiPartiteGraph() + >>> G.from_cudf_edgelist(df, source='0', destination='1', + edge_attr='2', renumber=False) + """ + if self._Impl is None: + self._Impl = npartiteGraphImpl(self.graph_properties) + # API may change in future + self._Impl._npartiteGraphImpl__from_edgelist(input_df, + source=source, + destination=destination, + edge_attr=edge_attr, + renumber=renumber) + + def from_dask_cudf_edgelist( + self, + input_ddf, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + ): + """ + Initializes the distributed graph from the dask_cudf.DataFrame + edgelist. Undirected Graphs are not currently supported. + By default, renumbering is enabled to map the source and destination + vertices into an index in the range [0, V) where V is the number + of vertices. If the input vertices are a single column of integers + in the range [0, V), renumbering can be disabled and the original + external vertex ids will be used. + Note that the graph object will store a reference to the + dask_cudf.DataFrame provided. + Parameters + ---------- + input_ddf : dask_cudf.DataFrame + The edgelist as a dask_cudf.DataFrame + source : str or array-like + source column name or array of column names + destination : str + destination column name or array of column names + edge_attr : str + weights column name. + renumber : bool + If source and destination indices are not in range 0 to V where V + is number of vertices, renumber argument should be True. + """ + raise Exception("Distributed N-partite graph not supported") + + def add_nodes_from(self, nodes, bipartite=None, multipartite=None): + """ + Add nodes information to the Graph. + Parameters + ---------- + nodes : list or cudf.Series + The nodes of the graph to be stored. If bipartite and multipartite + arguments are not passed, the nodes are considered to be a list of + all the nodes present in the Graph. + bipartite : str + Sets the Graph as bipartite. The nodes are stored as a set of nodes + of the partition named as bipartite argument. + multipartite : str + Sets the Graph as multipartite. The nodes are stored as a set of + nodes of the partition named as multipartite argument. + """ + if self._Impl is None: + self._Impl = npartiteGraphImpl(self.graph_properties) + if bipartite is None and multipartite is None: + self._Impl._nodes["all_nodes"] = cudf.Series(nodes) + else: + self._Impl.add_nodes_from(nodes, bipartite=bipartite, + multipartite=multipartite) + + def is_multipartite(self): + """ + Checks if Graph is multipartite. This solely relies on the user call + of add_nodes_from with the partition parameter and the Graph created. + This does not parse the graph to check if it is multipartite. + """ + return True + + +class BiPartiteGraph(NPartiteGraph): + def __init__(self, directed=False): + super(BiPartiteGraph, self).__init__(directed=directed, bipartite=True) + + def is_bipartite(self): + """ + Checks if Graph is bipartite. This solely relies on the user call of + add_nodes_from with the bipartite parameter and the Graph created. + This does not parse the graph to check if it is bipartite. + """ + return True + + +class BiPartiteDiGraph(BiPartiteGraph): + def __init__(self): + warnings.warn( + "BiPartiteDiGraph is deprecated,\ + use BiPartiteGraph(directed=True) instead", + DeprecationWarning + ) + super(BiPartiteDiGraph, self).__init__(directed=True) + + +class NPartiteDiGraph(NPartiteGraph): + def __init__(self): + warnings.warn( + "NPartiteDiGraph is deprecated,\ + use NPartiteGraph(directed=True) instead", + DeprecationWarning + ) + super(NPartiteGraph, self).__init__(directed=True) + + +def is_directed(G): + """ + Returns True if the graph is a directed graph. + Returns False if the graph is an undirected graph. + """ + return G.is_directed() + + +def is_multigraph(G): + """ + Returns True if the graph is a multigraph. Else returns False. + """ + return G.is_multigraph() + + +def is_multipartite(G): + """ + Checks if Graph is multipartite. This solely relies on the Graph + type. This does not parse the graph to check if it is multipartite. + """ + return G.is_multipatite() + + +def is_bipartite(G): + """ + Checks if Graph is bipartite. This solely relies on the Graph type. + This does not parse the graph to check if it is bipartite. + """ + return G.is_bipartite() + + +def is_weighted(G): + """ + Returns True if the graph has edge weights. + """ + return G.is_weighted() diff --git a/python/cugraph/structure/graph_implementation/__init__.py b/python/cugraph/structure/graph_implementation/__init__.py new file mode 100644 index 00000000000..eeef73c0f64 --- /dev/null +++ b/python/cugraph/structure/graph_implementation/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .simpleGraph import simpleGraphImpl +from .simpleDistributedGraph import simpleDistributedGraphImpl +from .npartiteGraph import npartiteGraphImpl + diff --git a/python/cugraph/structure/graph_implementation/npartiteGraph.py b/python/cugraph/structure/graph_implementation/npartiteGraph.py new file mode 100644 index 00000000000..111d9f792fa --- /dev/null +++ b/python/cugraph/structure/graph_implementation/npartiteGraph.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .simpleGraph import simpleGraphImpl +import cudf + + +class npartiteGraphImpl(simpleGraphImpl): + def __init__(self, properties): + super(npartiteGraphImpl, self).__init__(properties) + self.properties.bipartite = properties.bipartite + + # API may change in future + def __from_edgelist( + self, + input_df, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + ): + self._simpleGraphImpl__from_edgelist( + input_df, + source=source, + destination=destination, + edge_attr=edge_attr, + renumber=renumber, + ) + + def sets(self): + """ + Returns the bipartite set of nodes. This solely relies on the user's + call of add_nodes_from with the bipartite parameter. This does not + parse the graph to compute bipartite sets. If bipartite argument was + not provided during add_nodes_from(), it raise an exception that the + graph is not bipartite. + """ + # TO DO: Call coloring algorithm + set_names = [i for i in self._nodes.keys() if i != "all_nodes"] + if self.properties.bipartite: + top = self._nodes[set_names[0]] + if len(set_names) == 2: + bottom = self._nodes[set_names[1]] + else: + bottom = cudf.Series( + set(self.nodes().values_host) - set(top.values_host) + ) + return top, bottom + else: + return {k: self._nodes[k] for k in set_names} + + # API may change in future + def add_nodes_from(self, nodes, bipartite=None, multipartite=None): + """ + Add nodes information to the Graph. + Parameters + ---------- + nodes : list or cudf.Series + The nodes of the graph to be stored. If bipartite and multipartite + arguments are not passed, the nodes are considered to be a list of + all the nodes present in the Graph. + bipartite : str + Sets the Graph as bipartite. The nodes are stored as a set of nodes + of the partition named as bipartite argument. + multipartite : str + Sets the Graph as multipartite. The nodes are stored as a set of + nodes of the partition named as multipartite argument. + """ + if bipartite is None and multipartite is None: + raise Exception("Partition not provided") + else: + set_names = [i for i in self._nodes.keys() if i != "all_nodes"] + if multipartite is not None: + if self.properties.bipartite: + raise Exception( + "The Graph is bipartite. " + "Use bipartite option instead." + ) + elif bipartite is not None: + if not self.properties.bipartite: + raise Exception( + "The Graph is set as npartite. " + "Use multipartite option instead.") + multipartite = bipartite + if multipartite not in set_names and len(set_names) == 2: + raise Exception( + "The Graph is set as bipartite and " + "already has two partitions initialized." + ) + self._nodes[multipartite] = cudf.Series(nodes) diff --git a/python/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/structure/graph_implementation/simpleDistributedGraph.py new file mode 100644 index 00000000000..e85f3b6ab6c --- /dev/null +++ b/python/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -0,0 +1,473 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cugraph.structure import graph_primtypes_wrapper +from cugraph.structure.number_map import NumberMap +import cudf +import dask_cudf + + +class simpleDistributedGraphImpl: + class EdgeList: + def __init__(self, ddf): + self.edgelist_df = ddf + self.weights = False + # FIXME: Edge Attribute not handled + + # class AdjList: + # Not Supported + + # class transposedAdjList: + # Not Supported + + class Properties: + def __init__(self, properties): + self.multi_edge = getattr(properties, 'multi_edge', False) + self.directed = properties.directed + self.renumbered = False + self.store_transposed = False + self.self_loop = None + self.isolated_vertices = None + self.node_count = None + self.edge_count = None + self.weighted = False + + def __init__(self, properties): + # Structure + self.edgelist = None + self.renumber_map = None + self.properties = simpleDistributedGraphImpl.Properties(properties) + self.source_columns = None + self.destination_columns = None + + # Functions + def __from_edgelist( + self, + input_ddf, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + store_transposed=False, + ): + if not isinstance(input_ddf, dask_cudf.DataFrame): + raise Exception("input should be a dask_cudf dataFrame") + if self.properties.directed is False: + raise Exception("Undirected distributed graph not supported") + + s_col = source + d_col = destination + if not isinstance(s_col, list): + s_col = [s_col] + if not isinstance(d_col, list): + d_col = [d_col] + if not ( + set(s_col).issubset(set(input_ddf.columns)) + and set(d_col).issubset(set(input_ddf.columns)) + ): + raise Exception( + "source column names and/or destination column " + "names not found in input. Recheck the source " + "and destination parameters" + ) + ddf_columns = s_col + d_col + if edge_attr is not None: + if not (set([edge_attr]).issubset(set(input_ddf.columns))): + raise Exception( + "edge_attr column name not found in input." + "Recheck the edge_attr parameter") + self.weighted = True + ddf_columns = ddf_columns + [edge_attr] + input_ddf = input_ddf[ddf_columns] + + if edge_attr is not None: + input_ddf = input_ddf.rename(columns={edge_attr: 'value'}) + + # + # Keep all of the original parameters so we can lazily + # evaluate this function + # + + # FIXME: Edge Attribute not handled + self.properties.renumbered = renumber + self.input_df = input_ddf + self.source_columns = source + self.destination_columns = destination + + def view_edge_list(self): + """ + Display the edge list. Compute it if needed. + NOTE: If the graph is of type Graph() then the displayed undirected + edges are the same as displayed by networkx Graph(), but the direction + could be different i.e. an edge displayed by cugraph as (src, dst) + could be displayed as (dst, src) by networkx. + cugraph.Graph stores symmetrized edgelist internally. For displaying + undirected edgelist for a Graph the upper trianglar matrix of the + symmetrized edgelist is returned. + networkx.Graph renumbers the input and stores the upper triangle of + this renumbered input. Since the internal renumbering of networx and + cugraph is different, the upper triangular matrix of networkx + renumbered input may not be the same as cugraph's upper trianglar + matrix of the symmetrized edgelist. Hence the displayed source and + destination pairs in both will represent the same edge but node values + could be swapped. + Returns + ------- + df : cudf.DataFrame + This cudf.DataFrame wraps source, destination and weight + df[src] : cudf.Series + contains the source index for each edge + df[dst] : cudf.Series + contains the destination index for each edge + df[weight] : cusd.Series + Column is only present for weighted Graph, + then containing the weight value for each edge + """ + if self.edgelist is None: + raise Exception("Graph has no Edgelist.") + return self.edgelist.edgelist_df + + def delete_edge_list(self): + """ + Delete the edge list. + """ + # decrease reference count to free memory if the referenced objects are + # no longer used. + self.edgelist = None + + def clear(self): + """ + Empty this graph. This function is added for NetworkX compatibility. + """ + self.edgelist = None + + def number_of_vertices(self): + """ + Get the number of nodes in the graph. + """ + if self.properties.node_count is None: + if self.edgelist is not None: + ddf = self.edgelist.edgelist_df[["src", "dst"]] + self.properties.node_count = ddf.max().max().compute() + 1 + else: + raise Exception("Graph is Empty") + return self.properties.node_count + + def number_of_nodes(self): + """ + An alias of number_of_vertices(). This function is added for NetworkX + compatibility. + """ + return self.number_of_vertices() + + def number_of_edges(self, directed_edges=False): + """ + Get the number of edges in the graph. + """ + if self.edgelist is not None: + return len(self.edgelist.edgelist_df) + else: + raise Exception("Graph is Empty") + + def in_degree(self, vertex_subset=None): + """ + Compute vertex in-degree. Vertex in-degree is the number of edges + pointing into the vertex. By default, this method computes vertex + degrees for the entire set of vertices. If vertex_subset is provided, + this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding in-degree. + If not set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the in_degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df[vertex] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df[degree] : cudf.Series + The computed in-degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.in_degree([0,9,12]) + """ + return self._degree(vertex_subset, x=1) + + def out_degree(self, vertex_subset=None): + """ + Compute vertex out-degree. Vertex out-degree is the number of edges + pointing out from the vertex. By default, this method computes vertex + degrees for the entire set of vertices. If vertex_subset is provided, + this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding out-degree. + If not set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the out_degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df[vertex] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df[degree] : cudf.Series + The computed out-degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.out_degree([0,9,12]) + """ + # TODO: Add support + raise Exception("Not supported for distributed graph") + + def degree(self, vertex_subset=None): + """ + Compute vertex degree, which is the total number of edges incident + to a vertex (both in and out edges). By default, this method computes + degrees for the entire set of vertices. If vertex_subset is provided, + then this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + a container of vertices for displaying corresponding degree. If not + set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df['vertex'] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df['degree'] : cudf.Series + The computed degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> all_df = G.degree() + >>> subset_df = G.degree([0,9,12]) + """ + raise Exception("Not supported for distributed graph") + + # FIXME: vertex_subset could be a DataFrame for multi-column vertices + def degrees(self, vertex_subset=None): + """ + Compute vertex in-degree and out-degree. By default, this method + computes vertex degrees for the entire set of vertices. If + vertex_subset is provided, this method optionally filters out all but + those listed in vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding degree. If not + set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degrees. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df['vertex'] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df['in_degree'] : cudf.Series + The in-degree of the vertex. + df['out_degree'] : cudf.Series + The out-degree of the vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.degrees([0,9,12]) + """ + raise Exception("Not supported for distributed graph") + + def _degree(self, vertex_subset, x=0): + vertex_col, degree_col = graph_primtypes_wrapper._degree(self, x) + df = cudf.DataFrame() + df["vertex"] = vertex_col + df["degree"] = degree_col + + if self.renumbered is True: + df = self.unrenumber(df, "vertex") + + if vertex_subset is not None: + df = df[df['vertex'].isin(vertex_subset)] + + return df + + def to_directed(self, DiG): + """ + Return a directed representation of the graph. + This function sets the type of graph as DiGraph() and returns the + directed view. + Returns + ------- + G : DiGraph + A directed graph with the same nodes, and each edge (u,v,weights) + replaced by two directed edges (u,v,weights) and (v,u,weights). + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> DiG = G.to_directed() + """ + # TODO: Add support + raise Exception("Not supported for distributed graph") + + def to_undirected(self, G): + """ + Return an undirected copy of the graph. + Returns + ------- + G : Graph + A undirected graph with the same nodes, and each directed edge + (u,v,weights) replaced by an undirected edge (u,v,weights). + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> DiG = cugraph.DiGraph() + >>> DiG.from_cudf_edgelist(M, '0', '1') + >>> G = DiG.to_undirected() + """ + + # TODO: Add support + raise Exception("Not supported for distributed graph") + + def has_node(self, n): + """ + Returns True if the graph contains the node n. + """ + if self.edgelist is None: + raise Exception("Graph has no Edgelist.") + # FIXME: Check renumber map + ddf = self.edgelist.edgelist_df[["src", "dst"]] + return (ddf == n).any().any().compute() + + def has_edge(self, u, v): + """ + Returns True if the graph contains the edge (u,v). + """ + # TODO: Verify Correctness + if self.properties.renumbered: + tmp = cudf.DataFrame({"src": [u, v]}) + tmp = tmp.astype({"src": "int"}) + tmp = self.add_internal_vertex_id( + tmp, "id", "src", preserve_order=True + ) + + u = tmp["id"][0] + v = tmp["id"][1] + + df = self.edgelist.edgelist_df + return ((df["src"] == u) & (df["dst"] == v)).any().compute() + + def edges(self): + """ + Returns all the edges in the graph as a cudf.DataFrame containing + sources and destinations. It does not return the edge weights. + For viewing edges with weights use view_edge_list() + """ + return self.view_edge_list()[["src", "dst"]] + + def nodes(self): + """ + Returns all the nodes in the graph as a cudf.Series + """ + # FIXME: Return renumber map nodes + raise Exception("Not supported for distributed graph") + + def neighbors(self, n): + if self.edgelist is None: + raise Exception("Graph has no Edgelist.") + # FIXME: Add renumbering of node n + ddf = self.edgelist.edgelist_df + return ddf[ddf["src"] == n]["dst"].reset_index(drop=True) + + def compute_renumber_edge_list(self, transposed=False): + """ + Compute a renumbered edge list + This function works in the MNMG pipeline and will transform + the input dask_cudf.DataFrame into a renumbered edge list + in the prescribed direction. + This function will be called by the algorithms to ensure + that the graph is renumbered properly. The graph object will + cache the most recent renumbering attempt. For benchmarking + purposes, this function can be called prior to calling a + graph algorithm so we can measure the cost of computing + the renumbering separately from the cost of executing the + algorithm. + When creating a CSR-like structure, set transposed to False. + When creating a CSC-like structure, set transposed to True. + Parameters + ---------- + transposed : (optional) bool + If True, renumber with the intent to make a CSC-like + structure. If False, renumber with the intent to make + a CSR-like structure. Defaults to False. + """ + # FIXME: What to do about edge_attr??? + # currently ignored for MNMG + + if not self.properties.renumbered: + self.edgelist = self.EdgeList(self.input_df) + self.renumber_map = None + else: + if self.edgelist is not None: + if self.properties.directed is False: + return + + if self.properties.store_transposed == transposed: + return + + del self.edgelist + + renumbered_ddf, number_map = NumberMap.renumber( + self.input_df, + self.source_columns, + self.destination_columns, + store_transposed=transposed, + ) + self.edgelist = self.EdgeList(renumbered_ddf) + self.renumber_map = number_map + self.properties.store_transposed = transposed diff --git a/python/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/structure/graph_implementation/simpleGraph.py new file mode 100644 index 00000000000..4e632a72231 --- /dev/null +++ b/python/cugraph/structure/graph_implementation/simpleGraph.py @@ -0,0 +1,823 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cugraph.structure import graph_primtypes_wrapper +from cugraph.structure.symmetrize import symmetrize +from cugraph.structure.number_map import NumberMap +import cugraph.dask.common.mg_utils as mg_utils +import cudf +import dask_cudf +import cugraph.comms.comms as Comms +import pandas as pd +import numpy as np +from cugraph.dask.structure import replication + + +# FIXME: Change to consistent camel case naming +class simpleGraphImpl: + + class EdgeList: + def __init__(self, source, destination, edge_attr=None): + self.edgelist_df = cudf.DataFrame() + self.edgelist_df["src"] = source + self.edgelist_df["dst"] = destination + self.weights = False + if edge_attr is not None: + self.weights = True + if type(edge_attr) is dict: + for k in edge_attr.keys(): + self.edgelist_df[k] = edge_attr[k] + else: + self.edgelist_df["weights"] = edge_attr + + class AdjList: + def __init__(self, offsets, indices, value=None): + self.offsets = offsets + self.indices = indices + self.weights = value # Should be a dataframe for multiple weights + + class transposedAdjList: + def __init__(self, offsets, indices, value=None): + simpleGraphImpl.AdjList.__init__(self, offsets, indices, value) + + class Properties: + def __init__(self, properties): + self.multi_edge = getattr(properties, 'multi_edge', False) + self.directed = properties.directed + self.renumbered = False + self.self_loop = None + self.isolated_vertices = None + self.node_count = None + self.edge_count = None + self.weighted = False + + def __init__(self, properties): + # Structure + self.edgelist = None + self.adjlist = None + self.transposedadjlist = None + self.renumber_map = None + self.properties = simpleGraphImpl.Properties(properties) + self._nodes = {} + + # TODO: Move to new batch class + # MG - Batch + self.batch_enabled = False + self.batch_edgelists = None + self.batch_adjlists = None + self.batch_transposed_adjlists = None + + # Functions + # FIXME: Change to public function + # FIXME: Make function more modular + def __from_edgelist( + self, + input_df, + source="source", + destination="destination", + edge_attr=None, + renumber=True, + ): + + # Verify column names present in input DataFrame + s_col = source + d_col = destination + if not isinstance(s_col, list): + s_col = [s_col] + if not isinstance(d_col, list): + d_col = [d_col] + if not ( + set(s_col).issubset(set(input_df.columns)) + and set(d_col).issubset(set(input_df.columns)) + ): + # FIXME: Raise concrete Exceptions + raise Exception( + "source column names and/or destination column " + "names not found in input. Recheck the source and " + "destination parameters" + ) + + # FIXME: check if the consolidated graph fits on the + # device before gathering all the edge lists + + # Consolidation + if isinstance(input_df, cudf.DataFrame): + if len(input_df[source]) > 2147483100: + raise Exception( + "cudf dataFrame edge list is too big " + "to fit in a single GPU" + ) + elist = input_df + elif isinstance(input_df, dask_cudf.DataFrame): + if len(input_df[source]) > 2147483100: + raise Exception( + "dask_cudf dataFrame edge list is too big " + "to fit in a single GPU" + ) + elist = input_df.compute().reset_index(drop=True) + else: + raise Exception( + "input should be a cudf.DataFrame or " + "a dask_cudf dataFrame" + ) + + # Renumbering + self.renumber_map = None + if renumber: + # FIXME: Should SG do lazy evaluation like MG? + elist, renumber_map = NumberMap.renumber( + elist, source, destination, store_transposed=False + ) + source = "src" + destination = "dst" + self.properties.renumbered = True + self.renumber_map = renumber_map + else: + if type(source) is list and type(destination) is list: + raise Exception("set renumber to True for multi column ids") + + # Populate graph edgelist + source_col = elist[source] + dest_col = elist[destination] + + if edge_attr is not None: + self.weighted = True + value_col = elist[edge_attr] + else: + value_col = None + + # TODO: Update Symmetrize to work on Graph and/or DataFrame + if value_col is not None: + source_col, dest_col, value_col = symmetrize( + source_col, dest_col, value_col, + multi=self.properties.multi_edge, + symmetrize=not self.properties.directed) + if isinstance(value_col, cudf.DataFrame): + value_dict = {} + for i in value_col.columns: + value_dict[i] = value_col[i] + value_col = value_dict + else: + source_col, dest_col = symmetrize( + source_col, dest_col, multi=self.properties.multi_edge, + symmetrize=not self.properties.directed) + + self.edgelist = simpleGraphImpl.EdgeList(source_col, dest_col, + value_col) + + if self.batch_enabled: + self._replicate_edgelist() + + def to_pandas_edgelist(self, source='source', destination='destination'): + """ + Returns the graph edge list as a Pandas DataFrame. + Parameters + ---------- + source : str or array-like + source column name or array of column names + destination : str or array-like + destination column name or array of column names + Returns + ------- + df : pandas.DataFrame + """ + + gdf = self.view_edge_list() + return gdf.to_pandas() + + def to_pandas_adjacency(self): + """ + Returns the graph adjacency matrix as a Pandas DataFrame. + """ + + np_array_data = self.to_numpy_array() + pdf = pd.DataFrame(np_array_data) + if self.properties.renumbered: + nodes = self.renumber_map.implementation.df['0'].\ + values_host.tolist() + pdf.columns = nodes + pdf.index = nodes + return pdf + + def to_numpy_array(self): + """ + Returns the graph adjacency matrix as a NumPy array. + """ + + nlen = self.number_of_nodes() + elen = self.number_of_edges() + df = self.edgelist.edgelist_df + np_array = np.full((nlen, nlen), 0.0) + for i in range(0, elen): + np_array[df['src'].iloc[i], df['dst'].iloc[i]] = df['weights'].\ + iloc[i] + return np_array + + def to_numpy_matrix(self): + """ + Returns the graph adjacency matrix as a NumPy matrix. + """ + np_array = self.to_numpy_array() + return np.asmatrix(np_array) + + def view_edge_list(self): + """ + Display the edge list. Compute it if needed. + NOTE: If the graph is of type Graph() then the displayed undirected + edges are the same as displayed by networkx Graph(), but the direction + could be different i.e. an edge displayed by cugraph as (src, dst) + could be displayed as (dst, src) by networkx. + cugraph.Graph stores symmetrized edgelist internally. For displaying + undirected edgelist for a Graph the upper trianglar matrix of the + symmetrized edgelist is returned. + networkx.Graph renumbers the input and stores the upper triangle of + this renumbered input. Since the internal renumbering of networx and + cugraph is different, the upper triangular matrix of networkx + renumbered input may not be the same as cugraph's upper trianglar + matrix of the symmetrized edgelist. Hence the displayed source and + destination pairs in both will represent the same edge but node values + could be swapped. + Returns + ------- + df : cudf.DataFrame + This cudf.DataFrame wraps source, destination and weight + df[src] : cudf.Series + contains the source index for each edge + df[dst] : cudf.Series + contains the destination index for each edge + df[weight] : cusd.Series + Column is only present for weighted Graph, + then containing the weight value for each edge + """ + if self.edgelist is None: + src, dst, weights = graph_primtypes_wrapper.view_edge_list(self) + self.edgelist = self.EdgeList(src, dst, weights) + + edgelist_df = self.edgelist.edgelist_df + + if self.properties.renumbered: + edgelist_df = self.renumber_map.unrenumber(edgelist_df, "src") + edgelist_df = self.renumber_map.unrenumber(edgelist_df, "dst") + + if not self.properties.directed: + edgelist_df = edgelist_df[edgelist_df["src"] <= edgelist_df["dst"]] + edgelist_df = edgelist_df.reset_index(drop=True) + self.properties.edge_count = len(edgelist_df) + + return edgelist_df + + def delete_edge_list(self): + """ + Delete the edge list. + """ + # decrease reference count to free memory if the referenced objects are + # no longer used. + self.edgelist = None + + def __from_adjlist(self, offset_col, index_col, value_col=None): + self.adjlist = simpleGraphImpl.AdjList(offset_col, index_col, + value_col) + + if self.batch_enabled: + self._replicate_adjlist() + + def view_adj_list(self): + """ + Display the adjacency list. Compute it if needed. + Returns + ------- + offset_col : cudf.Series + This cudf.Series wraps a gdf_column of size V + 1 (V: number of + vertices). + The gdf column contains the offsets for the vertices in this graph. + Offsets are in the range [0, E] (E: number of edges). + index_col : cudf.Series + This cudf.Series wraps a gdf_column of size E (E: number of edges). + The gdf column contains the destination index for each edge. + Destination indices are in the range [0, V) (V: number of + vertices). + value_col : cudf.Series or ``None`` + This pointer is ``None`` for unweighted graphs. + For weighted graphs, this cudf.Series wraps a gdf_column of size E + (E: number of edges). + The gdf column contains the weight value for each edge. + The expected type of the gdf_column element is floating point + number. + """ + + if self.adjlist is None: + if self.transposedadjlist is not None and\ + self.properties.directed is False: + off, ind, vals = ( + self.transposedadjlist.offsets, + self.transposedadjlist.indices, + self.transposedadjlist.weights, + ) + else: + off, ind, vals = graph_primtypes_wrapper.view_adj_list(self) + self.adjlist = self.AdjList(off, ind, vals) + + if self.batch_enabled: + self._replicate_adjlist() + + return self.adjlist.offsets, self.adjlist.indices, self.adjlist.weights + + def view_transposed_adj_list(self): + """ + Display the transposed adjacency list. Compute it if needed. + Returns + ------- + offset_col : cudf.Series + This cudf.Series wraps a gdf_column of size V + 1 (V: number of + vertices). + The gdf column contains the offsets for the vertices in this graph. + Offsets are in the range [0, E] (E: number of edges). + index_col : cudf.Series + This cudf.Series wraps a gdf_column of size E (E: number of edges). + The gdf column contains the destination index for each edge. + Destination indices are in the range [0, V) (V: number of + vertices). + value_col : cudf.Series or ``None`` + This pointer is ``None`` for unweighted graphs. + For weighted graphs, this cudf.Series wraps a gdf_column of size E + (E: number of edges). + The gdf column contains the weight value for each edge. + The expected type of the gdf_column element is floating point + number. + """ + + if self.transposedadjlist is None: + if self.adjlist is not None and self.properties.directed is False: + off, ind, vals = ( + self.adjlist.offsets, + self.adjlist.indices, + self.adjlist.weights, + ) + else: + ( + off, + ind, + vals, + ) = graph_primtypes_wrapper.view_transposed_adj_list(self) + self.transposedadjlist = self.transposedAdjList(off, ind, vals) + + if self.batch_enabled: + self._replicate_transposed_adjlist() + + return ( + self.transposedadjlist.offsets, + self.transposedadjlist.indices, + self.transposedadjlist.weights, + ) + + def delete_adj_list(self): + """ + Delete the adjacency list. + """ + self.adjlist = None + + # FIXME: Update batch workflow and refactor to suitable file + def enable_batch(self): + client = mg_utils.get_client() + comms = Comms.get_comms() + + if client is None or comms is None: + msg = ( + "MG Batch needs a Dask Client and the " + "Communicator needs to be initialized." + ) + raise Exception(msg) + + self.batch_enabled = True + + if self.edgelist is not None: + if self.batch_edgelists is None: + self._replicate_edgelist() + + if self.adjlist is not None: + if self.batch_adjlists is None: + self._replicate_adjlist() + + if self.transposedadjlist is not None: + if self.batch_transposed_adjlists is None: + self._replicate_transposed_adjlist() + + def _replicate_edgelist(self): + client = mg_utils.get_client() + comms = Comms.get_comms() + + # FIXME: There might be a better way to control it + if client is None: + return + work_futures = replication.replicate_cudf_dataframe( + self.edgelist.edgelist_df, client=client, comms=comms + ) + + self.batch_edgelists = work_futures + + def _replicate_adjlist(self): + client = mg_utils.get_client() + comms = Comms.get_comms() + + # FIXME: There might be a better way to control it + if client is None: + return + + weights = None + offsets_futures = replication.replicate_cudf_series( + self.adjlist.offsets, client=client, comms=comms + ) + indices_futures = replication.replicate_cudf_series( + self.adjlist.indices, client=client, comms=comms + ) + + if self.adjlist.weights is not None: + weights = replication.replicate_cudf_series(self.adjlist.weights) + else: + weights = {worker: None for worker in offsets_futures} + + merged_futures = { + worker: [ + offsets_futures[worker], + indices_futures[worker], + weights[worker], + ] + for worker in offsets_futures + } + self.batch_adjlists = merged_futures + + # FIXME: Not implemented yet + def _replicate_transposed_adjlist(self): + self.batch_transposed_adjlists = True + + def get_two_hop_neighbors(self): + """ + Compute vertex pairs that are two hops apart. The resulting pairs are + sorted before returning. + Returns + ------- + df : cudf.DataFrame + df[first] : cudf.Series + the first vertex id of a pair, if an external vertex id + is defined by only one column + df[second] : cudf.Series + the second vertex id of a pair, if an external vertex id + is defined by only one column + """ + + df = graph_primtypes_wrapper.get_two_hop_neighbors(self) + + if self.properties.renumbered is True: + df = self.renumber_map.unrenumber(df, "first") + df = self.renumber_map.unrenumber(df, "second") + + return df + + def number_of_vertices(self): + """ + Get the number of nodes in the graph. + """ + if self.properties.node_count is None: + if self.adjlist is not None: + self.properties.node_count = len(self.adjlist.offsets) - 1 + elif self.transposedadjlist is not None: + self.properties.node_count = len( + self.transposedadjlist.offsets) - 1 + elif self.edgelist is not None: + df = self.edgelist.edgelist_df[["src", "dst"]] + self.properties.node_count = df.max().max() + 1 + else: + raise Exception("Graph is Empty") + return self.properties.node_count + + def number_of_nodes(self): + """ + An alias of number_of_vertices(). This function is added for NetworkX + compatibility. + """ + return self.number_of_vertices() + + def number_of_edges(self, directed_edges=False): + """ + Get the number of edges in the graph. + """ + # TODO: Move to Outer graphs? + if directed_edges and self.edgelist is not None: + return len(self.edgelist.edgelist_df) + if self.properties.edge_count is None: + if self.edgelist is not None: + if self.properties.directed is False: + self.properties.edge_count = len( + self.edgelist.edgelist_df[ + self.edgelist.edgelist_df["src"] + >= self.edgelist.edgelist_df["dst"] + ] + ) + else: + self.properties.edge_count = len(self.edgelist.edgelist_df) + elif self.adjlist is not None: + self.properties.edge_count = len(self.adjlist.indices) + elif self.transposedadjlist is not None: + self.properties.edge_count = len( + self.transposedadjlist.indices) + else: + raise ValueError("Graph is Empty") + return self.properties.edge_count + + def in_degree(self, vertex_subset=None): + """ + Compute vertex in-degree. Vertex in-degree is the number of edges + pointing into the vertex. By default, this method computes vertex + degrees for the entire set of vertices. If vertex_subset is provided, + this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding in-degree. + If not set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the in_degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df[vertex] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df[degree] : cudf.Series + The computed in-degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.in_degree([0,9,12]) + """ + return self._degree(vertex_subset, x=1) + + def out_degree(self, vertex_subset=None): + """ + Compute vertex out-degree. Vertex out-degree is the number of edges + pointing out from the vertex. By default, this method computes vertex + degrees for the entire set of vertices. If vertex_subset is provided, + this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding out-degree. + If not set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the out_degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df[vertex] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df[degree] : cudf.Series + The computed out-degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.out_degree([0,9,12]) + """ + return self._degree(vertex_subset, x=2) + + def degree(self, vertex_subset=None): + """ + Compute vertex degree, which is the total number of edges incident + to a vertex (both in and out edges). By default, this method computes + degrees for the entire set of vertices. If vertex_subset is provided, + then this method optionally filters out all but those listed in + vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + a container of vertices for displaying corresponding degree. If not + set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degree. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df['vertex'] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df['degree'] : cudf.Series + The computed degree of the corresponding vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> all_df = G.degree() + >>> subset_df = G.degree([0,9,12]) + """ + return self._degree(vertex_subset) + + # FIXME: vertex_subset could be a DataFrame for multi-column vertices + def degrees(self, vertex_subset=None): + """ + Compute vertex in-degree and out-degree. By default, this method + computes vertex degrees for the entire set of vertices. If + vertex_subset is provided, this method optionally filters out all but + those listed in vertex_subset. + Parameters + ---------- + vertex_subset : cudf.Series or iterable container, optional + A container of vertices for displaying corresponding degree. If not + set, degrees are computed for the entire set of vertices. + Returns + ------- + df : cudf.DataFrame + GPU DataFrame of size N (the default) or the size of the given + vertices (vertex_subset) containing the degrees. The ordering is + relative to the adjacency list, or that given by the specified + vertex_subset. + df['vertex'] : cudf.Series + The vertex IDs (will be identical to vertex_subset if + specified). + df['in_degree'] : cudf.Series + The in-degree of the vertex. + df['out_degree'] : cudf.Series + The out-degree of the vertex. + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G.from_cudf_edgelist(M, '0', '1') + >>> df = G.degrees([0,9,12]) + """ + ( + vertex_col, + in_degree_col, + out_degree_col, + ) = graph_primtypes_wrapper._degrees(self) + + df = cudf.DataFrame() + df["vertex"] = vertex_col + df["in_degree"] = in_degree_col + df["out_degree"] = out_degree_col + + if self.properties.renumbered is True: + df = self.renumber_map.unrenumber(df, "vertex") + + if vertex_subset is not None: + df = df[df['vertex'].isin(vertex_subset)] + + return df + + def _degree(self, vertex_subset, x=0): + vertex_col, degree_col = graph_primtypes_wrapper._degree(self, x) + df = cudf.DataFrame() + df["vertex"] = vertex_col + df["degree"] = degree_col + + if self.properties.renumbered is True: + df = self.renumber_map.unrenumber(df, "vertex") + + if vertex_subset is not None: + df = df[df['vertex'].isin(vertex_subset)] + + return df + + def to_directed(self, DiG): + """ + Return a directed representation of the graph Implementation. + This function copies the internal structures and returns the + directed view. + """ + DiG.properties.renumbered = self.properties.renumbered + DiG.renumber_map = self.renumber_map + DiG.edgelist = self.edgelist + DiG.adjlist = self.adjlist + DiG.transposedadjlist = self.transposedadjlist + + def to_undirected(self, G): + """ + Return an undirected copy of the graph. + """ + G.properties.renumbered = self.properties.renumbered + G.renumber_map = self.renumber_map + if self.properties.directed is False: + G.edgelist = self.edgelist + G.adjlist = self.adjlist + G.transposedadjlist = self.transposedadjlist + else: + df = self.edgelist.edgelist_df + if self.edgelist.weights: + source_col, dest_col, value_col = symmetrize( + df["src"], df["dst"], df["weights"] + ) + else: + source_col, dest_col = symmetrize(df["src"], df["dst"]) + value_col = None + G.edgelist = simpleGraphImpl.EdgeList(source_col, dest_col, + value_col) + + def has_node(self, n): + """ + Returns True if the graph contains the node n. + """ + if self.properties.renumbered: + tmp = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) + return tmp[0] is not cudf.NA and tmp[0] >= 0 + else: + df = self.edgelist.edgelist_df[["src", "dst"]] + return (df == n).any().any() + + def has_edge(self, u, v): + """ + Returns True if the graph contains the edge (u,v). + """ + if self.properties.renumbered: + tmp = cudf.DataFrame({"src": [u, v]}) + tmp = tmp.astype({"src": "int"}) + tmp = self.renumber_map.add_internal_vertex_id( + tmp, "id", "src", preserve_order=True + ) + + u = tmp["id"][0] + v = tmp["id"][1] + + df = self.edgelist.edgelist_df + return ((df["src"] == u) & (df["dst"] == v)).any() + + def has_self_loop(self): + """ + Returns True if the graph has self loop. + """ + # Detect self loop + if self.properties.self_loop is None: + elist = self.edgelist.edgelist_df + if (elist["src"] == elist["dst"]).any(): + self.properties.self_loop = True + else: + self.properties.self_loop = False + return self.properties.self_loop + + def edges(self): + """ + Returns all the edges in the graph as a cudf.DataFrame containing + sources and destinations. It does not return the edge weights. + For viewing edges with weights use view_edge_list() + """ + return self.view_edge_list()[["src", "dst"]] + + def nodes(self): + """ + Returns all the nodes in the graph as a cudf.Series + """ + if self.edgelist is not None: + df = self.edgelist.edgelist_df + if self.properties.renumbered: + # FIXME: If vertices are multicolumn + # this needs to return a dataframe + # FIXME: This relies on current implementation + # of NumberMap, should not really expose + # this, perhaps add a method to NumberMap + return self.renumber_map.implementation.df["0"] + else: + return cudf.concat([df["src"], df["dst"]]).unique() + if self.adjlist is not None: + return cudf.Series(np.arange(0, self.number_of_nodes())) + + def neighbors(self, n): + if self.edgelist is None: + raise Exception("Graph has no Edgelist.") + if self.properties.renumbered: + node = self.renumber_map.to_internal_vertex_id(cudf.Series([n])) + if len(node) == 0: + return cudf.Series(dtype="int") + n = node[0] + + df = self.edgelist.edgelist_df + neighbors = df[df["src"] == n]["dst"].reset_index(drop=True) + if self.properties.renumbered: + # FIXME: Multi-column vertices + return self.renumber_map.from_internal_vertex_id(neighbors)["0"] + else: + return neighbors diff --git a/python/cugraph/structure/hypergraph.py b/python/cugraph/structure/hypergraph.py index a11c937d83d..c5a1ac39e4f 100644 --- a/python/cugraph/structure/hypergraph.py +++ b/python/cugraph/structure/hypergraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -36,7 +36,7 @@ import cudf import numpy as np -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph def hypergraph( @@ -66,24 +66,20 @@ def hypergraph( components as dataframes. The transform reveals relationships between the rows and unique values. This transform is useful for lists of events, samples, relationships, and other structured high-dimensional data. - The transform creates a node for every row, and turns a row's column entries into node attributes. If direct=False (default), every unique value within a column is also turned into a node. Edges are added to connect a row's nodes to each of its column nodes, or if direct=True, to one another. Nodes are given the attribute specified by ``NODETYPE`` that corresponds to the originating column name, or if a row ``EVENTID``. - Consider a list of events. Each row represents a distinct event, and each column some metadata about an event. If multiple events have common metadata, they will be transitively connected through those metadata values. Conversely, if an event has unique metadata, the unique metadata will turn into nodes that only have connections to the event node. - For best results, set ``EVENTID`` to a row's unique ID, ``SKIP`` to all non-categorical columns (or ``columns`` to all categorical columns), and ``categories`` to group columns with the same kinds of values. - Parameters ---------- values : cudf.DataFrame @@ -130,7 +126,6 @@ def hypergraph( The name to use as the node type column in the graph and node DFs. EDGETYPE : str, optional, default "edge_type" The name to use as the edge type column in the graph and edge DF. - Returns ------- result : dict {"nodes", "edges", "graph", "events", "entities"} diff --git a/python/cugraph/structure/number_map.py b/python/cugraph/structure/number_map.py index 73316756ef2..2b7c2b2f296 100644 --- a/python/cugraph/structure/number_map.py +++ b/python/cugraph/structure/number_map.py @@ -182,7 +182,6 @@ def to_internal_vertex_id(self, ddf, col_names): on=self.col_names, how="right", ) - print(x.compute()) return x['global_id'] def from_internal_vertex_id( diff --git a/python/cugraph/structure/symmetrize.py b/python/cugraph/structure/symmetrize.py index 8720f7ad343..442701f6508 100644 --- a/python/cugraph/structure/symmetrize.py +++ b/python/cugraph/structure/symmetrize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.structure import graph as csg +from cugraph.structure import graph_classes as csg import cudf import dask_cudf @@ -201,8 +201,12 @@ def symmetrize(source_col, dest_col, value_col=None, multi=False, csg.null_check(source_col) csg.null_check(dest_col) if value_col is not None: - weight_name = "value" - input_df.insert(len(input_df.columns), "value", value_col) + if isinstance(value_col, cudf.Series): + weight_name = "value" + input_df.insert(len(input_df.columns), "value", value_col) + elif isinstance(value_col, cudf.DataFrame): + input_df = cudf.concat([input_df, value_col], axis=1) + output_df = None if type(source_col) is dask_cudf.Series: output_df = symmetrize_ddf( @@ -211,11 +215,17 @@ def symmetrize(source_col, dest_col, value_col=None, multi=False, else: output_df = symmetrize_df(input_df, "source", "destination", multi, symmetrize) - if value_col is not None: - return ( - output_df["source"], - output_df["destination"], - output_df["value"], - ) + if isinstance(value_col, cudf.Series): + return ( + output_df["source"], + output_df["destination"], + output_df["value"], + ) + elif isinstance(value_col, cudf.DataFrame): + return ( + output_df["source"], + output_df["destination"], + output_df[value_col.columns], + ) return output_df["source"], output_df["destination"] diff --git a/python/cugraph/tests/test_graph.py b/python/cugraph/tests/test_graph.py index 348f7e2e130..933a34aef3c 100644 --- a/python/cugraph/tests/test_graph.py +++ b/python/cugraph/tests/test_graph.py @@ -200,6 +200,7 @@ def test_add_adj_list_to_edge_list(graph_file): # cugraph add_adj_list to_edge_list call G = cugraph.DiGraph() G.from_cudf_adjlist(offsets, indices, None) + edgelist = G.view_edge_list() sources_cu = edgelist["src"] destinations_cu = edgelist["dst"] @@ -535,6 +536,7 @@ def test_to_directed(graph_file): DiG = G.to_directed() DiGnx = Gnx.to_directed() + assert DiG.is_directed() assert DiG.number_of_nodes() == DiGnx.number_of_nodes() assert DiG.number_of_edges() == DiGnx.number_of_edges() @@ -569,6 +571,7 @@ def test_to_undirected(graph_file): G = DiG.to_undirected() Gnx = DiGnx.to_undirected() + assert not G.is_directed() assert G.number_of_nodes() == Gnx.number_of_nodes() assert G.number_of_edges() == Gnx.number_of_edges() @@ -627,17 +630,13 @@ def test_bipartite_api(graph_file): set2_exp = cudf.Series(set(nodes.values_host) - set(set1_exp.values_host)) - G = cugraph.Graph() - assert not G.is_bipartite() + G = cugraph.BiPartiteGraph() + assert G.is_bipartite() # Add a set of nodes present in one partition G.add_nodes_from(set1_exp, bipartite='set1') G.from_cudf_edgelist(cu_M, source='0', destination='1') - # Check if Graph is bipartite. It should return True since we have - # added the partition in add_nodes_from() - assert G.is_bipartite() - # Call sets() to get the bipartite set of nodes. set1, set2 = G.sets() diff --git a/python/cugraph/traversal/bfs.py b/python/cugraph/traversal/bfs.py index 1e6cc42b760..d397b5a4241 100644 --- a/python/cugraph/traversal/bfs.py +++ b/python/cugraph/traversal/bfs.py @@ -14,7 +14,7 @@ import cudf from cugraph.traversal import bfs_wrapper -from cugraph.structure.graph import Graph, DiGraph +from cugraph.structure.graph_classes import Graph, DiGraph from cugraph.utilities import (ensure_cugraph_obj, is_matrix_type, is_cp_matrix_type, diff --git a/python/cugraph/traversal/traveling_salesperson.py b/python/cugraph/traversal/traveling_salesperson.py index 7aea7ae603f..53d411c92ae 100644 --- a/python/cugraph/traversal/traveling_salesperson.py +++ b/python/cugraph/traversal/traveling_salesperson.py @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.traversal import traveling_salesperson_wrapper -from cugraph.structure.graph import null_check +from cugraph.structure.graph_classes import null_check import cudf diff --git a/python/cugraph/tree/minimum_spanning_tree.py b/python/cugraph/tree/minimum_spanning_tree.py index 45e996aa083..6a5f7b5bf38 100644 --- a/python/cugraph/tree/minimum_spanning_tree.py +++ b/python/cugraph/tree/minimum_spanning_tree.py @@ -12,7 +12,7 @@ # limitations under the License. from cugraph.tree import minimum_spanning_tree_wrapper -from cugraph.structure.graph import Graph +from cugraph.structure.graph_classes import Graph from cugraph.utilities import check_nx_graph from cugraph.utilities import cugraph_to_nx