From 77bf63b076ee6f10fe4b599acd30406264851f49 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Fri, 28 Jan 2022 00:55:50 -0600 Subject: [PATCH 1/6] Updates to experimental_warning_wrapper() to better handle classes by wrapping in a class wrapper instead of a function (where the return type is no longer a class type, and isinstance() unexpectedly fails). Added tests. --- python/cugraph/cugraph/utilities/api_tools.py | 77 +++++++++++++------ python/pylibcugraph/pylibcugraph/graphs.pxd | 9 ++- python/pylibcugraph/pylibcugraph/graphs.pyx | 2 +- python/pylibcugraph/pylibcugraph/pagerank.pyx | 4 +- python/pylibcugraph/pylibcugraph/sssp.pyx | 4 +- .../pylibcugraph/tests/test_utils.py | 53 +++++++++++++ .../pylibcugraph/utilities/api_tools.py | 76 ++++++++++++------ 7 files changed, 170 insertions(+), 55 deletions(-) create mode 100644 python/pylibcugraph/pylibcugraph/tests/test_utils.py diff --git a/python/cugraph/cugraph/utilities/api_tools.py b/python/cugraph/cugraph/utilities/api_tools.py index e0281d86e5c..eea347a0b40 100644 --- a/python/cugraph/cugraph/utilities/api_tools.py +++ b/python/cugraph/cugraph/utilities/api_tools.py @@ -14,27 +14,34 @@ import functools import warnings import inspect +import types experimental_prefix = "EXPERIMENTAL" +# FIXME: this utility is copied from pylibcugraph. Remove this copy and have +# cugraph code call the version in pylibcugraph. -def experimental_warning_wrapper(obj, make_public_name=True): +def experimental_warning_wrapper(obj): """ - Return a callable obj wrapped in a callable the prints a warning about it - being "experimental" (an object that is in the public API but subject to - change or removal) prior to calling it and returning its value. - - If make_public_name is False, the object's name used in the warning message - is left unmodified. If True (default), any leading __ and/or EXPERIMENTAL - string are removed from the name used in warning messages. 
This allows an - object to be named with a "private" name in the public API so it can remain - hidden while it is still experimental, but have a public name within the - experimental namespace so it can be easily discovered and used. + Return a callable obj wrapped in a callable the prints a warning about + it being "experimental" (an object that is in the public API but subject + to change or removal) prior to calling it and returning its value. + + The object's name used in the warning message also has any leading __ + and/or EXPERIMENTAL string removed from the name used in warning + messages. This allows an object to be named with a "private" name in the + public API so it can remain hidden while it is still experimental, but + have a public name within the experimental namespace so it can be easily + discovered and used. """ - obj_name = obj.__qualname__ - if make_public_name: - obj_name = obj_name.lstrip(experimental_prefix) - obj_name = obj_name.lstrip("__") + obj_type = type(obj) + if obj_type not in [type, types.FunctionType, types.BuiltinFunctionType]: + raise TypeError("obj must be a class or a function type, got " + f"{obj_type}") + + obj_name = obj.__name__ + obj_name = obj_name.lstrip(experimental_prefix) + obj_name = obj_name.lstrip("__") # Assume the caller of this function is the module containing the # experimental obj and try to get its namespace name. Default to no @@ -42,17 +49,41 @@ def experimental_warning_wrapper(obj, make_public_name=True): call_stack = inspect.stack() calling_frame = call_stack[1].frame ns_name = calling_frame.f_locals.get("__name__") - if ns_name is not None: - ns_name += "." - else: - ns_name = "" + dot = "." 
if ns_name is not None else "" + + warning_msg = (f"{ns_name}{dot}{obj_name} is experimental and will " + "change or be removed in a future release.") + + # If obj is a class, create a wrapper class which 1) inherits from the + # incoming class, and 2) has a ctor that simply prints the warning and + # assigns self to an instance of the incoming class. Ideally a wrapper + # around __init__ would be created and assigned to the class as the new + # __init__, but #2 is necessary since assigning attributes cannot be done to + # a builtin type (such as what a class defined in cython produces). + if obj_type is type: + class WarningWrapperClass(obj): + def __init__(self, *args, **kwargs): + warnings.warn(warning_msg, PendingDeprecationWarning) + # cython classes do not have a standard __init__, but assigning + # to self works instead. + if type(obj.__init__) is types.FunctionType: + super(WarningWrapperClass, self).__init__(*args, **kwargs) + else: + self = obj(*args, **kwargs) + WarningWrapperClass.__module__ = ns_name + WarningWrapperClass.__qualname__ = obj_name + WarningWrapperClass.__name__ = obj_name - warning_msg = (f"{ns_name}{obj_name} is experimental and will change " - "or be removed in a future release.") + return WarningWrapperClass + # If this point is reached, the incoming obj is a function so wrap it and + # return the wrapper (which is also a function type). 
@functools.wraps(obj) - def callable_warning_wrapper(*args, **kwargs): + def warning_wrapper_function(*args, **kwargs): warnings.warn(warning_msg, PendingDeprecationWarning) return obj(*args, **kwargs) + warning_wrapper_function.__module__ = ns_name + warning_wrapper_function.__qualname__ = obj_name + warning_wrapper_function.__name__ = obj_name - return callable_warning_wrapper + return warning_wrapper_function diff --git a/python/pylibcugraph/pylibcugraph/graphs.pxd b/python/pylibcugraph/pylibcugraph/graphs.pxd index 9da256f9928..63cbb01f547 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pxd +++ b/python/pylibcugraph/pylibcugraph/graphs.pxd @@ -19,11 +19,14 @@ from pylibcugraph._cugraph_c.graph cimport ( ) -cdef class EXPERIMENTAL__Graph: +# Base class allowing functions to accept either SGGraph or MGGraph +# This is not visible in python +cdef class _GPUGraph: cdef cugraph_graph_t* c_graph_ptr -cdef class EXPERIMENTAL__SGGraph(EXPERIMENTAL__Graph): +cdef class EXPERIMENTAL__SGGraph(_GPUGraph): pass -# cdef class EXPERIMENTAL__MGGraph(EXPERIMENTAL__Graph): +# Not yet supported +# cdef class EXPERIMENTAL__MGGraph(_GPUGraph): # pass diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index 381191c3e51..e4c60dc125c 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -48,7 +48,7 @@ from pylibcugraph.utils cimport ( ) -cdef class EXPERIMENTAL__SGGraph(EXPERIMENTAL__Graph): +cdef class EXPERIMENTAL__SGGraph(_GPUGraph): """ RAII-stye Graph class for use with single-GPU APIs that manages the individual create/free calls and the corresponding cugraph_graph_t pointer. 
diff --git a/python/pylibcugraph/pylibcugraph/pagerank.pyx b/python/pylibcugraph/pylibcugraph/pagerank.pyx index a1b5a704693..b2aca789c15 100644 --- a/python/pylibcugraph/pylibcugraph/pagerank.pyx +++ b/python/pylibcugraph/pylibcugraph/pagerank.pyx @@ -48,7 +48,7 @@ from pylibcugraph.resource_handle cimport ( EXPERIMENTAL__ResourceHandle, ) from pylibcugraph.graphs cimport ( - EXPERIMENTAL__Graph, + _GPUGraph, ) from pylibcugraph.utils cimport ( assert_success, @@ -58,7 +58,7 @@ from pylibcugraph.utils cimport ( def EXPERIMENTAL__pagerank(EXPERIMENTAL__ResourceHandle resource_handle, - EXPERIMENTAL__Graph graph, + _GPUGraph graph, precomputed_vertex_out_weight_sums, double alpha, double epsilon, diff --git a/python/pylibcugraph/pylibcugraph/sssp.pyx b/python/pylibcugraph/pylibcugraph/sssp.pyx index af3eed36186..1d8aa7e5937 100644 --- a/python/pylibcugraph/pylibcugraph/sssp.pyx +++ b/python/pylibcugraph/pylibcugraph/sssp.pyx @@ -49,7 +49,7 @@ from pylibcugraph.resource_handle cimport ( EXPERIMENTAL__ResourceHandle, ) from pylibcugraph.graphs cimport ( - EXPERIMENTAL__Graph, + _GPUGraph, ) from pylibcugraph.utils cimport ( assert_success, @@ -58,7 +58,7 @@ from pylibcugraph.utils cimport ( def EXPERIMENTAL__sssp(EXPERIMENTAL__ResourceHandle resource_handle, - EXPERIMENTAL__Graph graph, + _GPUGraph graph, size_t source, double cutoff, bool_t compute_predecessors, diff --git a/python/pylibcugraph/pylibcugraph/tests/test_utils.py b/python/pylibcugraph/pylibcugraph/tests/test_utils.py new file mode 100644 index 00000000000..3156daa80ba --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/tests/test_utils.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import types + +import pytest + + +def test_experimental_warning_wrapper_for_funcs(): + from pylibcugraph.utilities.api_tools import experimental_warning_wrapper + + def EXPERIMENTAL__func(a, b): + return a - b + + exp_func = experimental_warning_wrapper(EXPERIMENTAL__func) + + with pytest.warns(PendingDeprecationWarning): + assert 1 == exp_func(3, 2) + + +def test_experimental_warning_wrapper_for_classes(): + from pylibcugraph.utilities.api_tools import experimental_warning_wrapper + + class EXPERIMENTAL__klass: + def __init__(self, a, b): + self.r = a - b + + exp_klass = experimental_warning_wrapper(EXPERIMENTAL__klass) + + with pytest.warns(PendingDeprecationWarning): + k = exp_klass(3, 2) + assert 1 == k.r + assert isinstance(k, exp_klass) + assert k.__class__.__name__ == "klass" + + +def test_experimental_warning_wrapper_for_unsupported_type(): + from pylibcugraph.utilities.api_tools import experimental_warning_wrapper + + # A module type should not be allowed to be wrapped + mod = types.ModuleType("modname") + with pytest.raises(TypeError): + exp_mod = experimental_warning_wrapper(mod) diff --git a/python/pylibcugraph/pylibcugraph/utilities/api_tools.py b/python/pylibcugraph/pylibcugraph/utilities/api_tools.py index e0281d86e5c..f581d95c84b 100644 --- a/python/pylibcugraph/pylibcugraph/utilities/api_tools.py +++ b/python/pylibcugraph/pylibcugraph/utilities/api_tools.py @@ -14,27 +14,31 @@ import functools import warnings import inspect +import types experimental_prefix = "EXPERIMENTAL" - -def experimental_warning_wrapper(obj, 
make_public_name=True): +def experimental_warning_wrapper(obj): """ - Return a callable obj wrapped in a callable the prints a warning about it - being "experimental" (an object that is in the public API but subject to - change or removal) prior to calling it and returning its value. - - If make_public_name is False, the object's name used in the warning message - is left unmodified. If True (default), any leading __ and/or EXPERIMENTAL - string are removed from the name used in warning messages. This allows an - object to be named with a "private" name in the public API so it can remain - hidden while it is still experimental, but have a public name within the - experimental namespace so it can be easily discovered and used. + Return a callable obj wrapped in a callable the prints a warning about + it being "experimental" (an object that is in the public API but subject + to change or removal) prior to calling it and returning its value. + + The object's name used in the warning message also has any leading __ + and/or EXPERIMENTAL string removed from the name used in warning + messages. This allows an object to be named with a "private" name in the + public API so it can remain hidden while it is still experimental, but + have a public name within the experimental namespace so it can be easily + discovered and used. """ - obj_name = obj.__qualname__ - if make_public_name: - obj_name = obj_name.lstrip(experimental_prefix) - obj_name = obj_name.lstrip("__") + obj_type = type(obj) + if obj_type not in [type, types.FunctionType, types.BuiltinFunctionType]: + raise TypeError("obj must be a class or a function type, got " + f"{obj_type}") + + obj_name = obj.__name__ + obj_name = obj_name.lstrip(experimental_prefix) + obj_name = obj_name.lstrip("__") # Assume the caller of this function is the module containing the # experimental obj and try to get its namespace name. 
Default to no @@ -42,17 +46,41 @@ def experimental_warning_wrapper(obj, make_public_name=True): call_stack = inspect.stack() calling_frame = call_stack[1].frame ns_name = calling_frame.f_locals.get("__name__") - if ns_name is not None: - ns_name += "." - else: - ns_name = "" + dot = "." if ns_name is not None else "" + + warning_msg = (f"{ns_name}{dot}{obj_name} is experimental and will " + "change or be removed in a future release.") + + # If obj is a class, create a wrapper class which 1) inherits from the + # incoming class, and 2) has a ctor that simply prints the warning and + # assigns self to an instance of the incoming class. Ideally a wrapper + # around __init__ would be created and assigned to the class as the new + # __init__, but #2 is necessary since assigning attributes cannot be done to + # a builtin type (such as what a class defined in cython produces). + if obj_type is type: + class WarningWrapperClass(obj): + def __init__(self, *args, **kwargs): + warnings.warn(warning_msg, PendingDeprecationWarning) + # cython classes do not have a standard __init__, but assigning + # to self works instead. + if type(obj.__init__) is types.FunctionType: + super(WarningWrapperClass, self).__init__(*args, **kwargs) + else: + self = obj(*args, **kwargs) + WarningWrapperClass.__module__ = ns_name + WarningWrapperClass.__qualname__ = obj_name + WarningWrapperClass.__name__ = obj_name - warning_msg = (f"{ns_name}{obj_name} is experimental and will change " - "or be removed in a future release.") + return WarningWrapperClass + # If this point is reached, the incoming obj is a function so wrap it and + # return the wrapper (which is also a function type). 
@functools.wraps(obj) - def callable_warning_wrapper(*args, **kwargs): + def warning_wrapper_function(*args, **kwargs): warnings.warn(warning_msg, PendingDeprecationWarning) return obj(*args, **kwargs) + warning_wrapper_function.__module__ = ns_name + warning_wrapper_function.__qualname__ = obj_name + warning_wrapper_function.__name__ = obj_name - return callable_warning_wrapper + return warning_wrapper_function From 7c5b5e2acba593222320cabd0f51b6ac0a5c42c1 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Fri, 28 Jan 2022 09:09:50 -0600 Subject: [PATCH 2/6] Updated comments, flake8 fixes. --- python/cugraph/cugraph/utilities/api_tools.py | 22 +++++++++++-------- .../pylibcugraph/tests/test_utils.py | 2 +- .../pylibcugraph/utilities/api_tools.py | 22 +++++++++++-------- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/python/cugraph/cugraph/utilities/api_tools.py b/python/cugraph/cugraph/utilities/api_tools.py index eea347a0b40..26e3b0a1d62 100644 --- a/python/cugraph/cugraph/utilities/api_tools.py +++ b/python/cugraph/cugraph/utilities/api_tools.py @@ -18,6 +18,7 @@ experimental_prefix = "EXPERIMENTAL" + # FIXME: this utility is copied from pylibcugraph. Remove this copy and have # cugraph code call the version in pylibcugraph. @@ -56,17 +57,19 @@ def experimental_warning_wrapper(obj): # If obj is a class, create a wrapper class which 1) inherits from the # incoming class, and 2) has a ctor that simply prints the warning and - # assigns self to an instance of the incoming class. Ideally a wrapper - # around __init__ would be created and assigned to the class as the new - # __init__, but #2 is necessary since assigning attributes cannot be done to - # a builtin type (such as what a class defined in cython produces). + # calls the base class ctor. A wrapper class is needed so the new type + # matches the incoming type. 
+ # Ideally a wrapper function would be created and assigned to the class as + # the new __init__, but #2 is necessary since assigning attributes cannot + # be done to a builtin type (such as a class defined in cython). if obj_type is type: class WarningWrapperClass(obj): def __init__(self, *args, **kwargs): warnings.warn(warning_msg, PendingDeprecationWarning) - # cython classes do not have a standard __init__, but assigning - # to self works instead. - if type(obj.__init__) is types.FunctionType: + # call base class __init__ for python, but cython classes do + # not have a standard callable __init__ and assigning to self + # works instead. + if isinstance(obj.__init__, types.FunctionType): super(WarningWrapperClass, self).__init__(*args, **kwargs) else: self = obj(*args, **kwargs) @@ -76,8 +79,9 @@ def __init__(self, *args, **kwargs): return WarningWrapperClass - # If this point is reached, the incoming obj is a function so wrap it and - # return the wrapper (which is also a function type). + # If this point is reached, the incoming obj is a function so simply wrap + # it and return the wrapper. Since the wrapper is a function type, it will + # match the incoming obj type. 
@functools.wraps(obj) def warning_wrapper_function(*args, **kwargs): warnings.warn(warning_msg, PendingDeprecationWarning) diff --git a/python/pylibcugraph/pylibcugraph/tests/test_utils.py b/python/pylibcugraph/pylibcugraph/tests/test_utils.py index 3156daa80ba..036a62b9c1e 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_utils.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_utils.py @@ -50,4 +50,4 @@ def test_experimental_warning_wrapper_for_unsupported_type(): # A module type should not be allowed to be wrapped mod = types.ModuleType("modname") with pytest.raises(TypeError): - exp_mod = experimental_warning_wrapper(mod) + experimental_warning_wrapper(mod) diff --git a/python/pylibcugraph/pylibcugraph/utilities/api_tools.py b/python/pylibcugraph/pylibcugraph/utilities/api_tools.py index f581d95c84b..e869e766c11 100644 --- a/python/pylibcugraph/pylibcugraph/utilities/api_tools.py +++ b/python/pylibcugraph/pylibcugraph/utilities/api_tools.py @@ -18,6 +18,7 @@ experimental_prefix = "EXPERIMENTAL" + def experimental_warning_wrapper(obj): """ Return a callable obj wrapped in a callable the prints a warning about @@ -53,17 +54,19 @@ def experimental_warning_wrapper(obj): # If obj is a class, create a wrapper class which 1) inherits from the # incoming class, and 2) has a ctor that simply prints the warning and - # assigns self to an instance of the incoming class. Ideally a wrapper - # around __init__ would be created and assigned to the class as the new - # __init__, but #2 is necessary since assigning attributes cannot be done to - # a builtin type (such as what a class defined in cython produces). + # calls the base class ctor. A wrapper class is needed so the new type + # matches the incoming type. + # Ideally a wrapper function would be created and assigned to the class as + # the new __init__, but #2 is necessary since assigning attributes cannot + # be done to a builtin type (such as a class defined in cython). 
if obj_type is type: class WarningWrapperClass(obj): def __init__(self, *args, **kwargs): warnings.warn(warning_msg, PendingDeprecationWarning) - # cython classes do not have a standard __init__, but assigning - # to self works instead. - if type(obj.__init__) is types.FunctionType: + # call base class __init__ for python, but cython classes do + # not have a standard callable __init__ and assigning to self + # works instead. + if isinstance(obj.__init__, types.FunctionType): super(WarningWrapperClass, self).__init__(*args, **kwargs) else: self = obj(*args, **kwargs) @@ -73,8 +76,9 @@ def __init__(self, *args, **kwargs): return WarningWrapperClass - # If this point is reached, the incoming obj is a function so wrap it and - # return the wrapper (which is also a function type). + # If this point is reached, the incoming obj is a function so simply wrap + # it and return the wrapper. Since the wrapper is a function type, it will + # match the incoming obj type. @functools.wraps(obj) def warning_wrapper_function(*args, **kwargs): warnings.warn(warning_msg, PendingDeprecationWarning) From 9c270fb7cf317bc2c2962dac6a81721a40cf14ef Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 2 Feb 2022 13:48:36 -0600 Subject: [PATCH 3/6] Made PropertyGraph column name constants public for use by clients, changed num_vertices property impl for better performance, added get_vertices(), allow a None type_name for add_*_data(), updated docstrings, changed the edge_data added to Graphs to be a DataFrame for efficiency, added tests and benchmarks, refactored and added test fixtures, updated pytest.ini for new benchmarks. 
--- .../cugraph/structure/property_graph.py | 255 +++++----- .../cugraph/tests/test_property_graph.py | 457 +++++++++++++++--- python/cugraph/pytest.ini | 28 ++ 3 files changed, 567 insertions(+), 173 deletions(-) diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index 4712b4f1067..14f18aa8f50 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import cudf import cugraph @@ -59,12 +58,12 @@ class EXPERIMENTAL__PropertyGraph: algorithm results with corresponding properties. """ # column name constants used in internal DataFrames - __vertex_col_name = "_VERTEX_" - __src_col_name = "_SRC_" - __dst_col_name = "_DST_" - __type_col_name = "_TYPE_" - __edge_id_col_name = "_EDGE_ID_" - __vertex_id_col_name = "_VERTEX_ID_" + vertex_col_name = "_VERTEX_" + src_col_name = "_SRC_" + dst_col_name = "_DST_" + type_col_name = "_TYPE_" + edge_id_col_name = "_EDGE_ID_" + vertex_id_col_name = "_VERTEX_ID_" def __init__(self): # The dataframe containing the properties for each vertex. @@ -133,31 +132,24 @@ def __init__(self): # incrementing this counter. self.__last_edge_id = None + # Cached property values + self.__num_vertices = None + # PropertyGraph read-only attributes @property def num_vertices(self): - # Create a Series of the appropriate type (cudf.Series, pandas.Series, - # etc.) based on the type currently in use, then use it to gather all - # unique vertices. 
- vpd = self.__vertex_prop_dataframe - epd = self.__edge_prop_dataframe - if (vpd is None) and (epd is None): - return 0 + if self.__num_vertices is not None: + return self.__num_vertices + + self.__num_vertices = 0 + vert_sers = self.__get_all_vertices_series() + if vert_sers: + if self.__series_type is cudf.Series: + self.__num_vertices = cudf.concat(vert_sers).nunique() + else: + self.__num_vertices = pd.concat(vert_sers).nunique() - # Assume __series_type is set if this point reached! - verts = self.__series_type(dtype="object") - if vpd is not None: - verts = verts.append(vpd[self.__vertex_col_name]) - if epd is not None: - # pandas.Series.unique() can return an ndarray, which cannot be - # appended to a Series. Always construct an appropriate series_type - # from the unique values prior to appending. - verts = verts.append( - self.__series_type(epd[self.__src_col_name].unique())) - verts = verts.append( - self.__series_type(epd[self.__dst_col_name].unique())) - verts = verts.unique() - return len(verts) + return self.__num_vertices @property def num_edges(self): @@ -166,12 +158,19 @@ def num_edges(self): else: return 0 + @property + def edges(self): + if self.__edge_prop_dataframe is not None: + return self.__edge_prop_dataframe[[self.src_col_name, + self.dst_col_name]] + return None + @property def vertex_property_names(self): if self.__vertex_prop_dataframe is not None: props = list(self.__vertex_prop_dataframe.columns) - props.remove(self.__vertex_col_name) - props.remove(self.__type_col_name) # should "type" be removed? + props.remove(self.vertex_col_name) + props.remove(self.type_col_name) # should "type" be removed? 
return props return [] @@ -179,10 +178,10 @@ def vertex_property_names(self): def edge_property_names(self): if self.__edge_prop_dataframe is not None: props = list(self.__edge_prop_dataframe.columns) - props.remove(self.__src_col_name) - props.remove(self.__dst_col_name) - props.remove(self.__edge_id_col_name) - props.remove(self.__type_col_name) # should "type" be removed? + props.remove(self.src_col_name) + props.remove(self.dst_col_name) + props.remove(self.edge_id_col_name) + props.remove(self.type_col_name) # should "type" be removed? return props return [] @@ -195,6 +194,15 @@ def _vertex_prop_dataframe(self): def _edge_prop_dataframe(self): return self.__edge_prop_dataframe + def get_vertices(self, selection=None): + vert_sers = self.__get_all_vertices_series() + if vert_sers: + if self.__series_type is cudf.Series: + return self.__series_type(cudf.concat(vert_sers).unique()) + else: + return self.__series_type(pd.concat(vert_sers).unique()) + return self.__series_type() + def add_vertex_data(self, dataframe, vertex_id_column, @@ -235,7 +243,7 @@ def add_vertex_data(self, if vertex_id_column not in dataframe.columns: raise ValueError(f"{vertex_id_column} is not a column in " f"dataframe: {dataframe.columns}") - if type(type_name) is not str: + if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") if property_columns: @@ -259,9 +267,13 @@ def add_vertex_data(self, "the PropertyGraph was already initialized " f"using type {self.__dataframe_type}") + # Clear the cached value for num_vertices since more could be added in + # this method. + self.__num_vertices = None + # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. 
- default_vertex_columns = [self.__vertex_col_name, self.__type_col_name] + default_vertex_columns = [self.vertex_col_name, self.type_col_name] if self.__vertex_prop_dataframe is None: self.__vertex_prop_dataframe = \ self.__dataframe_type(columns=default_vertex_columns) @@ -271,7 +283,7 @@ def add_vertex_data(self, # https://github.com/rapidsai/cudf/issues/9981) self.__update_dataframe_dtypes( self.__vertex_prop_dataframe, - {self.__vertex_col_name: dataframe[vertex_id_column].dtype}) + {self.vertex_col_name: dataframe[vertex_id_column].dtype}) # Ensure that both the predetermined vertex ID column name and vertex # type column name are present for proper merging. @@ -280,9 +292,9 @@ def add_vertex_data(self, # columns. The copied DataFrame is then merged (another copy) and then # deleted when out-of-scope. tmp_df = dataframe.copy(deep=True) - tmp_df[self.__vertex_col_name] = tmp_df[vertex_id_column] + tmp_df[self.vertex_col_name] = tmp_df[vertex_id_column] # FIXME: handle case of a type_name column already being in tmp_df - tmp_df[self.__type_col_name] = type_name + tmp_df[self.type_col_name] = type_name if property_columns: # all columns @@ -352,7 +364,7 @@ def add_edge_data(self, if invalid_columns: raise ValueError("vertex_id_columns contains column(s) not found " f"in dataframe: {list(invalid_columns)}") - if type(type_name) is not str: + if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") if property_columns: @@ -376,10 +388,14 @@ def add_edge_data(self, "the PropertyGraph was already initialized " f"using type {self.__dataframe_type}") - default_edge_columns = [self.__src_col_name, - self.__dst_col_name, - self.__edge_id_col_name, - self.__type_col_name] + # Clear the cached value for num_vertices since more could be added in + # this method. 
+ self.__num_vertices = None + + default_edge_columns = [self.src_col_name, + self.dst_col_name, + self.edge_id_col_name, + self.type_col_name] if self.__edge_prop_dataframe is None: self.__edge_prop_dataframe = \ self.__dataframe_type(columns=default_edge_columns) @@ -389,18 +405,18 @@ def add_edge_data(self, # https://github.com/rapidsai/cudf/issues/9981) self.__update_dataframe_dtypes( self.__edge_prop_dataframe, - {self.__src_col_name: dataframe[vertex_id_columns[0]].dtype, - self.__dst_col_name: dataframe[vertex_id_columns[1]].dtype, - self.__edge_id_col_name: "Int64"}) + {self.src_col_name: dataframe[vertex_id_columns[0]].dtype, + self.dst_col_name: dataframe[vertex_id_columns[1]].dtype, + self.edge_id_col_name: "Int64"}) # NOTE: This copies the incoming DataFrame in order to add the new # columns. The copied DataFrame is then merged (another copy) and then # deleted when out-of-scope. tmp_df = dataframe.copy(deep=True) - tmp_df[self.__src_col_name] = tmp_df[vertex_id_columns[0]] - tmp_df[self.__dst_col_name] = tmp_df[vertex_id_columns[1]] + tmp_df[self.src_col_name] = tmp_df[vertex_id_columns[0]] + tmp_df[self.dst_col_name] = tmp_df[vertex_id_columns[1]] # FIXME: handle case of a type_name column already being in tmp_df - tmp_df[self.__type_col_name] = type_name + tmp_df[self.type_col_name] = type_name if property_columns: # all columns @@ -463,11 +479,11 @@ def select_vertices(self, expr, from_previous_selection=None): previously_selected_rows = self.__vertex_prop_dataframe[ from_previous_selection.vertex_selections] verts_from_previously_selected_rows = \ - previously_selected_rows[self.__vertex_col_name] + previously_selected_rows[self.vertex_col_name] # get all the rows from the entire __vertex_prop_dataframe that # contain those verts rows_with_verts = \ - self.__vertex_prop_dataframe[self.__vertex_col_name]\ + self.__vertex_prop_dataframe[self.vertex_col_name]\ .isin(verts_from_previously_selected_rows) rows_to_eval = 
self.__vertex_prop_dataframe[rows_with_verts] locals = dict([(n, rows_to_eval[n]) @@ -520,7 +536,7 @@ def select_edges(self, expr): edge_selection_series=selected_col) def extract_subgraph(self, - create_using=None, + create_using=cugraph.Graph, selection=None, edge_weight_property=None, default_edge_weight=None, @@ -555,7 +571,9 @@ def extract_subgraph(self, Returns ------- - None + A Graph instance of the same type as create_using containing only the + vertices and edges resulting from applying the selection to the set of + vertex and edge property data. Examples -------- @@ -570,8 +588,8 @@ def extract_subgraph(self, # vertices assume the original dtypes in the user input have been # preserved. However, merge operations on the DataFrames can change # dtypes (eg. int64 to float64 in order to add NaN entries). This - # should not be a problem since this the conversions do not change - # the values. + # should not be a problem since the conversions do not change the + # values. if (selection is not None) and \ (selection.vertex_selections is not None): selected_vertex_dataframe = \ @@ -592,10 +610,10 @@ def extract_subgraph(self, # selected verts in both src and dst if (selected_vertex_dataframe is not None) and \ not(selected_vertex_dataframe.empty): - selected_verts = selected_vertex_dataframe[self.__vertex_col_name] - has_srcs = selected_edge_dataframe[self.__src_col_name]\ + selected_verts = selected_vertex_dataframe[self.vertex_col_name] + has_srcs = selected_edge_dataframe[self.src_col_name]\ .isin(selected_verts) - has_dsts = selected_edge_dataframe[self.__dst_col_name]\ + has_dsts = selected_edge_dataframe[self.dst_col_name]\ .isin(selected_verts) edges = selected_edge_dataframe[has_srcs & has_dsts] else: @@ -668,13 +686,15 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): raise TypeError(f"df type {df_type} does not match DataFrame type " f"{self.__dataframe_type} used in PropertyGraph") - # FIXME: check that G has edge_data attr - # Add the 
src, dst, edge_id info from the Graph to a DataFrame - edge_info_df = self.__dataframe_type(columns=[self.__src_col_name, - self.__dst_col_name, - self.__edge_id_col_name], - data=G.edge_data) + # edge_info_df = self.__dataframe_type(columns=[self.src_col_name, + # self.dst_col_name, + # self.edge_id_col_name], + # data=G.edge_data) + if hasattr(G, "edge_data"): + edge_info_df = G.edge_data + else: + raise AttributeError("Graph G does not have attribute 'edge_data'") # New result includes only properties from the src/dst edges identified # by edge IDs. All other data in df is merged based on src/dst values. @@ -683,12 +703,12 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): how="inner") # FIXME: also allow edge ID col to be passed in and renamed. - new_df = df.rename(columns={src_col_name: self.__src_col_name, - dst_col_name: self.__dst_col_name}) + new_df = df.rename(columns={src_col_name: self.src_col_name, + dst_col_name: self.dst_col_name}) new_df = new_df.merge(edge_props_df) # restore the original src/dst column names - new_df.rename(columns={self.__src_col_name: src_col_name, - self.__dst_col_name: dst_col_name}, + new_df.rename(columns={self.src_col_name: src_col_name, + self.dst_col_name: dst_col_name}, inplace=True) # restore the original dtypes @@ -700,31 +720,9 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): # columns from edge types not included in the edges in df. return new_df - @classmethod - def get_edge_tuples(cls, edge_prop_df): - """ - Returns a list of (src vertex, dst vertex, edge_id) tuples present in - edge_prop_df. 
- """ - if cls.__src_col_name not in edge_prop_df.columns: - raise ValueError(f"column {cls.__src_col_name} missing from " - "edge_prop_df") - if cls.__dst_col_name not in edge_prop_df.columns: - raise ValueError(f"column {cls.__dst_col_name} missing from " - "edge_prop_df") - if cls.__edge_id_col_name not in edge_prop_df.columns: - raise ValueError(f"column {cls.__edge_id_col_name} missing " - "from edge_prop_df") - src = edge_prop_df[cls.__src_col_name] - dst = edge_prop_df[cls.__dst_col_name] - edge_id = edge_prop_df[cls.__edge_id_col_name] - retlist = [(src.iloc[i], dst.iloc[i], edge_id.iloc[i]) - for i in range(len(src))] - return retlist - - @classmethod - def edge_props_to_graph(cls, edge_prop_df, - create_using=None, + def edge_props_to_graph(self, + edge_prop_df, + create_using, edge_weight_property=None, allow_multi_edges=False): """ @@ -737,12 +735,12 @@ def edge_props_to_graph(cls, edge_prop_df, "edge_prop_df") # Set up the new Graph to return - if create_using is None: - G = cugraph.Graph() - elif isinstance(create_using, cugraph.Graph): + if isinstance(create_using, cugraph.Graph): # FIXME: extract more attrs from the create_using instance attrs = {"directed": create_using.is_directed()} G = type(create_using)(**attrs) + # FIXME: this allows anything to be instantiated does not check that + # the type is a valid Graph type. elif type(create_using) is type(type): G = create_using() else: @@ -754,20 +752,20 @@ def edge_props_to_graph(cls, edge_prop_df, # non-MultiGraphs would result in ambiguous edge properties. 
# FIXME: make allow_multi_edges accept "auto" for use with MultiGraph if (allow_multi_edges is False) and \ - cls.has_duplicate_edges(edge_prop_df): + self.has_duplicate_edges(edge_prop_df): if create_using: if type(create_using) is type: t = create_using.__name__ else: t = type(create_using).__name__ - msg = f"{t} graph type specified by create_using" + msg = f"'{t}' graph type specified by create_using" else: msg = "default Graph graph type" raise RuntimeError("query resulted in duplicate edges which " - f"cannot be represented with a {msg}") + f"cannot be represented with the {msg}") - create_args = {"source": cls.__src_col_name, - "destination": cls.__dst_col_name, + create_args = {"source": self.src_col_name, + "destination": self.dst_col_name, "edge_attr": edge_weight_property, "renumber": True, } @@ -776,15 +774,12 @@ def edge_props_to_graph(cls, edge_prop_df, else: G.from_pandas_edgelist(edge_prop_df, **create_args) - # Set the edge_data on the resulting Graph to the list of edge tuples, - # which includes the unique edge IDs. Edge IDs are needed for future - # calls to annotate_dataframe() in order to apply properties from the - # correct edges. - # FIXME: this could be a very large list of tuples if the number of - # edges in G is large (eg. a large MNMG graph that cannot fit in host - # memory). Consider adding the edge IDs to the edgelist DataFrame in G - # instead. - G.edge_data = cls.get_edge_tuples(edge_prop_df) + # Set the edge_data on the resulting Graph to a DataFrame containing + # the edges and the edge ID for each. Edge IDs are needed for future + # calls to annotate_dataframe() in order to associate edges with their + # properties, since the PG can contain multiple edges between vertices + # with different properties. 
+ G.edge_data = self.__create_property_lookup_table(edge_prop_df) # FIXME: also add vertex_data return G @@ -794,14 +789,27 @@ def has_duplicate_edges(cls, df): """ Return True if df has >1 of the same src, dst pair """ + # FIXME: this can be very expensive for large DataFrames if df.empty: return False def has_duplicate_dst(df): - return df[cls.__dst_col_name].nunique() != \ - df[cls.__dst_col_name].size + return df[cls.dst_col_name].nunique() != \ + df[cls.dst_col_name].size - return df.groupby(cls.__src_col_name).apply(has_duplicate_dst).any() + return df.groupby(cls.src_col_name).apply(has_duplicate_dst).any() + + def __create_property_lookup_table(self, edge_prop_df): + """ + Returns a DataFrame containing the src vertex, dst vertex, and edge_id + values from edge_prop_df. + """ + src = edge_prop_df[self.src_col_name] + dst = edge_prop_df[self.dst_col_name] + edge_id = edge_prop_df[self.edge_id_col_name] + return self.__dataframe_type({self.src_col_name: src, + self.dst_col_name: dst, + self.edge_id_col_name: edge_id}) def __add_edge_ids(self): """ @@ -809,7 +817,7 @@ def __add_edge_ids(self): incremented by 1 for each edge. """ prev_eid = -1 if self.__last_edge_id is None else self.__last_edge_id - nans = self.__edge_prop_dataframe[self.__edge_id_col_name].isna() + nans = self.__edge_prop_dataframe[self.edge_id_col_name].isna() if nans.any(): indices = nans.index[nans] @@ -818,11 +826,26 @@ def __add_edge_ids(self): new_eids = self.__series_type( range(starting_eid, starting_eid + num_indices)) - self.__edge_prop_dataframe[self.__edge_id_col_name]\ + self.__edge_prop_dataframe[self.edge_id_col_name]\ .iloc[indices] = new_eids self.__last_edge_id = starting_eid + num_indices - 1 + def __get_all_vertices_series(self): + """ + Return a list of all Series objects that contain vertices from all + tables. 
+ """ + vpd = self.__vertex_prop_dataframe + epd = self.__edge_prop_dataframe + vert_sers = [] + if vpd is not None: + vert_sers.append(vpd[self.vertex_col_name]) + if epd is not None: + vert_sers.append(epd[self.src_col_name]) + vert_sers.append(epd[self.dst_col_name]) + return vert_sers + @staticmethod def __get_new_column_dtypes(from_df, to_df): """ diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index 0578a7780de..8cced223bf7 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -11,14 +11,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time import gc import pytest import pandas as pd +import numpy as np import cudf from cudf.testing import assert_frame_equal, assert_series_equal +# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" +# fixture will be available automatically. Check that this fixture is available +# by trying to import rapids_pytest_benchmark, and if that fails, set +# "gpubenchmark" to the standard "benchmark" fixture provided by +# pytest-benchmark. +try: + import rapids_pytest_benchmark # noqa: F401 +except ImportError: + import pytest_benchmark + gpubenchmark = pytest_benchmark.plugin.benchmark + import cugraph +from cugraph.generators import rmat from cugraph.tests import utils # ============================================================================= @@ -107,21 +121,26 @@ def setup_function(): df_types = [cudf.DataFrame, pd.DataFrame] -def df_type_id(dft): +def df_type_id(dataframe_type): + """ + Return a string that describes the dataframe_type, used for test output. + """ s = "df_type=" - if dft == cudf.DataFrame: + if dataframe_type == cudf.DataFrame: return s+"cudf.DataFrame" - if dft == pd.DataFrame: + if dataframe_type == pd.DataFrame: return s+"pandas.DataFrame" return s+"?" 
-@pytest.fixture(scope="module", - params=utils.genFixtureParamsProduct((df_types, df_type_id)) - ) -def property_graph_instance(request): +df_types_fixture_params = utils.genFixtureParamsProduct((df_types, df_type_id)) + + +@pytest.fixture(scope="module", params=df_types_fixture_params) +def dataset1_PropertyGraph(request): """ - FIXME: fill this in + Fixture which returns an instance of a PropertyGraph with vertex and edge + data added from dataset1, parameterized for different DataFrame types. """ dataframe_type = request.param[0] from cugraph.experimental import PropertyGraph @@ -177,8 +196,74 @@ def property_graph_instance(request): return pG -############################################################################### +@pytest.fixture(scope="module", params=df_types_fixture_params) +def cyber_PropertyGraph(request): + """ + Fixture which returns an instance of a PropertyGraph with vertex and edge + data added from the cyber.csv dataset, parameterized for different + DataFrame types. + """ + from cugraph.experimental import PropertyGraph + + dataframe_type = request.param[0] + cyber_csv = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"cyber.csv" + source_col_name = "srcip" + dest_col_name = "dstip" + + if dataframe_type is pd.DataFrame: + read_csv = pd.read_csv + else: + read_csv = cudf.read_csv + df = read_csv(cyber_csv, delimiter=",", + dtype={"idx": "int32", + source_col_name: "str", + dest_col_name: "str"}, + header=0) + + pG = PropertyGraph() + pG.add_edge_data(df, (source_col_name, dest_col_name)) + + return pG + + +@pytest.fixture(scope="module", params=df_types_fixture_params) +def rmat_PropertyGraph(): + """ + Fixture which uses the RMAT generator to generate a cuDF DataFrame + edgelist, then uses it to add vertex and edge data to a PropertyGraph + instance, then returns the (PropertyGraph, DataFrame) instances in a tuple. 
+ """ + from cugraph.experimental import PropertyGraph + + source_col_name = "src" + dest_col_name = "dst" + weight_col_name = "weight" + scale = 20 + edgefactor = 16 + seed = 42 + df = rmat(scale, + (2**scale)*edgefactor, + 0.57, # from Graph500 + 0.19, # from Graph500 + 0.19, # from Graph500 + seed, + clip_and_flip=False, + scramble_vertex_ids=True, + create_using=None, # None == return edgelist + mg=False + ) + rng = np.random.default_rng(seed) + df[weight_col_name] = rng.random(size=len(df)) + + pG = PropertyGraph() + pG.add_edge_data(df, (source_col_name, dest_col_name)) + + return (pG, df) + + +# ============================================================================= # Tests +# ============================================================================= @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_add_vertex_data(df_type): """ @@ -202,6 +287,60 @@ def test_add_vertex_data(df_type): assert sorted(pG.vertex_property_names) == sorted(expected_props) +@pytest.mark.parametrize("df_type", df_types, ids=df_type_id) +def test_num_vertices(df_type): + """ + Ensures num_vertices is correct after various additions of specific data. 
+ """ + from cugraph.experimental import PropertyGraph + + merchants = dataset1["merchants"] + merchants_df = df_type(columns=merchants[0], + data=merchants[1]) + + pG = PropertyGraph() + pG.add_vertex_data(merchants_df, + type_name="merchants", + vertex_id_column="merchant_id", + property_columns=None) + + # Test caching - the second retrieval should always be faster + st = time.time() + assert pG.num_vertices == 5 + compute_time = time.time() - st + assert pG.num_edges == 0 + + st = time.time() + assert pG.num_vertices == 5 + cache_retrieval_time = time.time() - st + assert cache_retrieval_time < compute_time + + users = dataset1["users"] + users_df = df_type(columns=users[0], data=users[1]) + + pG.add_vertex_data(users_df, + type_name="users", + vertex_id_column="user_id", + property_columns=None) + + assert pG.num_vertices == 9 + assert pG.num_edges == 0 + + # The taxpayers table does not add new vertices, it only adds properties to + # vertices already present in the merchants and users tables. 
+ taxpayers = dataset1["taxpayers"] + taxpayers_df = df_type(columns=taxpayers[0], + data=taxpayers[1]) + + pG.add_vertex_data(taxpayers_df, + type_name="taxpayers", + vertex_id_column="payer_id", + property_columns=None) + + assert pG.num_vertices == 9 + assert pG.num_edges == 0 + + @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_null_data(df_type): """ @@ -365,9 +504,9 @@ def test_add_edge_data_bad_args(): property_columns="time") -def test_extract_subgraph_vertex_prop_condition_only(property_graph_instance): +def test_extract_subgraph_vertex_prop_condition_only(dataset1_PropertyGraph): - pG = property_graph_instance + pG = dataset1_PropertyGraph selection = pG.select_vertices("(_TYPE_=='taxpayers') & (amount<100)") G = pG.extract_subgraph(selection=selection, @@ -386,12 +525,15 @@ def test_extract_subgraph_vertex_prop_condition_only(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_extract_subgraph_vertex_edge_prop_condition(property_graph_instance): - pG = property_graph_instance +def test_extract_subgraph_vertex_edge_prop_condition(dataset1_PropertyGraph): + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name selection = pG.select_vertices("(user_location==47906) | " "(user_location==78750)") - selection += pG.select_edges("_TYPE_=='referrals'") + selection += pG.select_edges(f"{tcn}=='referrals'") G = pG.extract_subgraph(selection=selection, create_using=DiGraph_inst, edge_weight_property="stars") @@ -407,10 +549,13 @@ def test_extract_subgraph_vertex_edge_prop_condition(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_extract_subgraph_edge_prop_condition_only(property_graph_instance): - pG = property_graph_instance +def test_extract_subgraph_edge_prop_condition_only(dataset1_PropertyGraph): + from cugraph.experimental import PropertyGraph - selection = 
pG.select_edges("_TYPE_=='transactions'") + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + + selection = pG.select_edges(f"{tcn} =='transactions'") G = pG.extract_subgraph(selection=selection, create_using=DiGraph_inst) @@ -431,27 +576,33 @@ def test_extract_subgraph_edge_prop_condition_only(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_extract_subgraph_unweighted(property_graph_instance): +def test_extract_subgraph_unweighted(dataset1_PropertyGraph): """ Ensure a subgraph is unweighted if the edge_weight_property is None. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph - selection = pG.select_edges("_TYPE_=='transactions'") + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + + selection = pG.select_edges(f"{tcn} == 'transactions'") G = pG.extract_subgraph(selection=selection, create_using=DiGraph_inst) assert G.is_weighted() is False -def test_extract_subgraph_specific_query(property_graph_instance): +def test_extract_subgraph_specific_query(dataset1_PropertyGraph): """ Graph of only transactions after time 1639085000 for merchant_id 4 (should be a graph of 2 vertices, 1 edge) """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph - selection = pG.select_edges("(_TYPE_=='transactions') & " + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + + selection = pG.select_edges(f"({tcn}=='transactions') & " "(merchant_id==4) & " "(time>1639085000)") G = pG.extract_subgraph(selection=selection, @@ -469,28 +620,34 @@ def test_extract_subgraph_specific_query(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_edge_props_to_graph(property_graph_instance): +def test_edge_props_to_graph(dataset1_PropertyGraph): """ Access the property DataFrames directly and use them to perform a more complex query, then call edge_props_to_graph() to create the 
corresponding graph. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + vcn = PropertyGraph.vertex_col_name + tcn = PropertyGraph.type_col_name + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name # Select referrals from only taxpayers who are users (should be 1) # Find the list of vertices that are both users and taxpayers def contains_both(df): - return (df["_TYPE_"] == "taxpayers").any() and \ - (df["_TYPE_"] == "users").any() - verts = pG._vertex_prop_dataframe.groupby("_VERTEX_")\ + return (df[tcn] == "taxpayers").any() and \ + (df[tcn] == "users").any() + verts = pG._vertex_prop_dataframe.groupby(vcn)\ .apply(contains_both) verts = verts[verts].keys() # get an array of only verts that have both # Find the "referral" edge_props containing only those verts - referrals = pG._edge_prop_dataframe["_TYPE_"] == "referrals" - srcs = pG._edge_prop_dataframe[referrals]["_SRC_"].isin(verts) - dsts = pG._edge_prop_dataframe[referrals]["_DST_"].isin(verts) + referrals = pG._edge_prop_dataframe[tcn] == "referrals" + srcs = pG._edge_prop_dataframe[referrals][scn].isin(verts) + dsts = pG._edge_prop_dataframe[referrals][dcn].isin(verts) matching_edges = (srcs & dsts) indices = matching_edges.index[matching_edges] edge_props = pG._edge_prop_dataframe.loc[indices] @@ -508,18 +665,21 @@ def contains_both(df): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_select_vertices_from_previous_selection(property_graph_instance): +def test_select_vertices_from_previous_selection(dataset1_PropertyGraph): """ Ensures that the intersection of vertices of multiple types (only vertices that are both type A and type B) can be selected. 
""" - pG = property_graph_instance + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name # Select referrals from only taxpayers who are users (should be 1) - selection = pG.select_vertices("_TYPE_ == 'taxpayers'") - selection = pG.select_vertices("_TYPE_ == 'users'", + selection = pG.select_vertices(f"{tcn} == 'taxpayers'") + selection = pG.select_vertices(f"{tcn} == 'users'", from_previous_selection=selection) - selection += pG.select_edges("_TYPE_ == 'referrals'") + selection += pG.select_edges(f"{tcn} == 'referrals'") G = pG.extract_subgraph(create_using=DiGraph_inst, selection=selection) expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634]}) @@ -555,7 +715,8 @@ def test_extract_subgraph_graph_without_vert_props(): vertex_id_columns=("user_id_1", "user_id_2"), property_columns=None) - G = pG.extract_subgraph(selection=pG.select_edges("_SRC_ == 89216"), + scn = PropertyGraph.src_col_name + G = pG.extract_subgraph(selection=pG.select_edges(f"{scn} == 89216"), create_using=DiGraph_inst, edge_weight_property="relationship_type", default_edge_weight=0) @@ -572,11 +733,11 @@ def test_extract_subgraph_graph_without_vert_props(): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_extract_subgraph_no_edges(property_graph_instance): +def test_extract_subgraph_no_edges(dataset1_PropertyGraph): """ Valid query that only matches a single vertex. """ - pG = property_graph_instance + pG = dataset1_PropertyGraph selection = pG.select_vertices("(_TYPE_=='merchants') & (merchant_id==86)") G = pG.extract_subgraph(selection=selection) @@ -584,11 +745,11 @@ def test_extract_subgraph_no_edges(property_graph_instance): assert len(G.edgelist.edgelist_df) == 0 -def test_extract_subgraph_no_query(property_graph_instance): +def test_extract_subgraph_no_query(dataset1_PropertyGraph): """ Call extract with no args, should result in the entire property graph. 
""" - pG = property_graph_instance + pG = dataset1_PropertyGraph G = pG.extract_subgraph(create_using=DiGraph_inst, allow_multi_edges=True) @@ -602,16 +763,20 @@ def test_extract_subgraph_no_query(property_graph_instance): assert len(G.edgelist.edgelist_df) == num_edges -def test_extract_subgraph_multi_edges(property_graph_instance): +def test_extract_subgraph_multi_edges(dataset1_PropertyGraph): """ Ensure an exception is thrown if a graph is attempted to be extracted with multi edges. NOTE: an option to allow multi edges when create_using is MultiGraph will be provided in the future. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + # referrals has multiple edges - selection = pG.select_edges("_TYPE_ == 'referrals'") + selection = pG.select_edges(f"{tcn} == 'referrals'") # FIXME: use a better exception with pytest.raises(RuntimeError): @@ -619,8 +784,11 @@ def test_extract_subgraph_multi_edges(property_graph_instance): create_using=DiGraph_inst) -def test_extract_subgraph_bad_args(property_graph_instance): - pG = property_graph_instance +def test_extract_subgraph_bad_args(dataset1_PropertyGraph): + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name # non-PropertySelection selection with pytest.raises(TypeError): @@ -629,7 +797,7 @@ def test_extract_subgraph_bad_args(property_graph_instance): edge_weight_property="stars", default_edge_weight=1.0) - selection = pG.select_edges("_TYPE_=='referrals'") + selection = pG.select_edges(f"{tcn}=='referrals'") # bad create_using type with pytest.raises(TypeError): pG.extract_subgraph(selection=selection, @@ -648,14 +816,17 @@ def test_extract_subgraph_bad_args(property_graph_instance): edge_weight_property="card_type") -def test_extract_subgraph_default_edge_weight(property_graph_instance): +def 
test_extract_subgraph_default_edge_weight(dataset1_PropertyGraph): """ Ensure the default_edge_weight value is added to edges with missing properties used for weights. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph - selection = pG.select_edges("_TYPE_=='transactions'") + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + + selection = pG.select_edges(f"{tcn}=='transactions'") G = pG.extract_subgraph(create_using=DiGraph_inst, selection=selection, edge_weight_property="volume", @@ -685,12 +856,15 @@ def test_extract_subgraph_default_edge_weight(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_graph_edge_data_added(property_graph_instance): +def test_graph_edge_data_added(dataset1_PropertyGraph): """ Ensures the subgraph returned from extract_subgraph() has the edge_data attribute added which contains the proper edge IDs. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + eicn = PropertyGraph.edge_id_col_name expected_num_edges = \ len(dataset1["transactions"][-1]) + \ @@ -703,16 +877,15 @@ def test_graph_edge_data_added(property_graph_instance): # meta-data, which includes edge IDs. G = pG.extract_subgraph(create_using=DiGraph_inst, allow_multi_edges=True) - # G.edge_data should be set to a list of tuples of (src, dst, edge_id) for - # each edge in the graph. + # G.edge_data should be set to a DataFrame with rows for each graph edge. 
assert len(G.edge_data) == expected_num_edges - edge_ids = sorted([d[-1] for d in G.edge_data]) + edge_ids = sorted(G.edge_data[eicn].values) assert edge_ids[0] == 0 assert edge_ids[-1] == (expected_num_edges - 1) -def test_annotate_dataframe(property_graph_instance): +def test_annotate_dataframe(dataset1_PropertyGraph): """ FIXME: Add tests for: properties list @@ -720,7 +893,7 @@ def test_annotate_dataframe(property_graph_instance): copy=False invalid args raise correct exceptions """ - pG = property_graph_instance + pG = dataset1_PropertyGraph selection = pG.select_edges("(_TYPE_ == 'referrals') & (stars > 3)") G = pG.extract_subgraph(selection=selection, @@ -800,3 +973,173 @@ def test_different_vertex_edge_input_dataframe_types(): pG.add_edge_data(df, type_name="bar", vertex_id_columns=("a", "b")) with pytest.raises(TypeError): pG.add_edge_data(pdf, type_name="bar", vertex_id_columns=("a", "b")) + + +def test_get_vertices(dataset1_PropertyGraph): + """ + Test that get_vertices() returns the correct set of vertices without + duplicates. + """ + pG = dataset1_PropertyGraph + + (merchants, users, taxpayers, + transactions, relationships, referrals) = dataset1.values() + + expected_vertices = set([t[0] for t in merchants[1]] + + [t[0] for t in users[1]] + + [t[0] for t in taxpayers[1]]) + + assert sorted(pG.get_vertices().values) == sorted(expected_vertices) + + +def test_get_edges(dataset1_PropertyGraph): + """ + Test that get_edges() returns the correct set of edges (as src/dst + columns). 
+ """ + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + + (merchants, users, taxpayers, + transactions, relationships, referrals) = dataset1.values() + + expected_edges = \ + [(src, dst) for (src, dst, _, _, _, _) in transactions[1]] + \ + [(src, dst) for (src, dst, _) in relationships[1]] + \ + [(src, dst) for (src, dst, _, _) in referrals[1]] + + actual_edges = pG.edges + + assert len(expected_edges) == len(actual_edges) + for i in range(len(expected_edges)): + src = actual_edges[PropertyGraph.src_col_name].iloc[i] + dst = actual_edges[PropertyGraph.dst_col_name].iloc[i] + assert (src, dst) in expected_edges + + +@pytest.mark.skip(reason="unfinished") +def test_extract_subgraph_with_vertex_ids(): + """ + FIXME: add a PropertyGraph API that makes it easy to support the common use + case of extracting a subgraph containing only specific vertex IDs. This is + currently done in the bench_extract_subgraph_for_* tests below, but could + be made easier for users to do. 
+ """ + raise NotImplementedError + + +@pytest.mark.skip(reason="unfinished") +def test_dgl_use_case(): + """ + FIXME: add a test demonstrating typical DGL use cases + """ + raise NotImplementedError + + +# ============================================================================= +# Benchmarks +# ============================================================================= +def bench_num_vertices(gpubenchmark, dataset1_PropertyGraph): + pG = dataset1_PropertyGraph + + def get_num_vertices(): + return pG.num_vertices + + assert gpubenchmark(get_num_vertices) == 9 + + +def bench_get_vertices(gpubenchmark, dataset1_PropertyGraph): + pG = dataset1_PropertyGraph + + gpubenchmark(pG.get_vertices) + + +def bench_extract_subgraph_for_cyber(gpubenchmark, cyber_PropertyGraph): + from cugraph.experimental import PropertyGraph + + pG = cyber_PropertyGraph + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name + + # Create a Graph containing only specific src or dst vertices + verts = ["10.40.182.3", "10.40.182.255", "59.166.0.9", "59.166.0.8"] + selected_edges = \ + pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") + gpubenchmark(pG.extract_subgraph, + create_using=cugraph.Graph(directed=True), + selection=selected_edges, + default_edge_weight=1.0, + allow_multi_edges=True) + + +def bench_extract_subgraph_for_cyber_detect_duplicate_edges( + gpubenchmark, cyber_PropertyGraph): + from cugraph.experimental import PropertyGraph + + pG = cyber_PropertyGraph + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name + + # Create a Graph containing only specific src or dst vertices + verts = ["10.40.182.3", "10.40.182.255", "59.166.0.9", "59.166.0.8"] + selected_edges = \ + pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") + + def func(): + with pytest.raises(RuntimeError): + pG.extract_subgraph(create_using=cugraph.Graph(directed=True), + selection=selected_edges, + default_edge_weight=1.0, + allow_multi_edges=False) + + 
gpubenchmark(func) + + +def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph): + from cugraph.experimental import PropertyGraph + + (pG, generated_df) = rmat_PropertyGraph + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name + + verts = [] + for i in range(0, 10000, 10): + verts.append(generated_df["src"].iloc[i]) + + selected_edges = \ + pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") + gpubenchmark(pG.extract_subgraph, + create_using=cugraph.Graph(directed=True), + selection=selected_edges, + default_edge_weight=1.0, + allow_multi_edges=True) + + +# This test runs for *minutes* with the current implementation, and since +# benchmarking can call it multiple times per run, the overall time for this +# test can be ~20 minutes. +@pytest.mark.slow +def bench_extract_subgraph_for_rmat_detect_duplicate_edges( + gpubenchmark, rmat_PropertyGraph): + from cugraph.experimental import PropertyGraph + + (pG, generated_df) = rmat_PropertyGraph + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name + + verts = [] + for i in range(0, 10000, 10): + verts.append(generated_df["src"].iloc[i]) + + selected_edges = \ + pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") + + def func(): + with pytest.raises(RuntimeError): + pG.extract_subgraph(create_using=cugraph.Graph(directed=True), + selection=selected_edges, + default_edge_weight=1.0, + allow_multi_edges=False) + + gpubenchmark(func) diff --git a/python/cugraph/pytest.ini b/python/cugraph/pytest.ini index 046f972801c..0da378d3d13 100644 --- a/python/cugraph/pytest.ini +++ b/python/cugraph/pytest.ini @@ -1,9 +1,24 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + [pytest] addopts = --benchmark-warmup=off --benchmark-max-time=0 --benchmark-min-rounds=1 --benchmark-columns="mean, rounds" + ## do not run the slow tests/benchmarks by default + -m "not slow" ## for use with rapids-pytest-benchmark plugin #--benchmark-gpu-disable ## for use with pytest-cov plugin @@ -24,3 +39,16 @@ markers = cugraph_types: use cuGraph input types nx_types: use NetworkX input types matrix_types: use SciPy/CuPy matrix input types + slow: slow-running tests/benchmarks + +python_classes = + Bench* + Test* + +python_files = + bench_* + test_* + +python_functions = + bench_* + test_* From f9e619a34a4626e93a359f56e61b77a555bae6ae Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 2 Feb 2022 13:54:42 -0600 Subject: [PATCH 4/6] removed dead code, fixed comment. 
--- python/cugraph/cugraph/structure/property_graph.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index 14f18aa8f50..0a0cea61fea 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -678,7 +678,7 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): -------- >>> """ - # FIXME: all check args + # FIXME: check all args (src_col_name, dst_col_name) = edge_vertex_id_columns df_type = type(df) @@ -686,11 +686,6 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): raise TypeError(f"df type {df_type} does not match DataFrame type " f"{self.__dataframe_type} used in PropertyGraph") - # Add the src, dst, edge_id info from the Graph to a DataFrame - # edge_info_df = self.__dataframe_type(columns=[self.src_col_name, - # self.dst_col_name, - # self.edge_id_col_name], - # data=G.edge_data) if hasattr(G, "edge_data"): edge_info_df = G.edge_data else: From ff86defd66e34bc0172059860970db1cff1d58f9 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Sun, 13 Feb 2022 23:01:18 -0600 Subject: [PATCH 5/6] Allowed default_edge_weight to be used to add an edge weight value on extracted Graphs even when a weight property wasn't specified. 
--- .../cugraph/structure/property_graph.py | 74 ++++++++++++------- .../cugraph/tests/test_property_graph.py | 13 ++++ 2 files changed, 61 insertions(+), 26 deletions(-) diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index de941bb940b..14c6b14d299 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -64,6 +64,7 @@ class EXPERIMENTAL__PropertyGraph: type_col_name = "_TYPE_" edge_id_col_name = "_EDGE_ID_" vertex_id_col_name = "_VERTEX_ID_" + weight_col_name = "_WEIGHT_" def __init__(self): # The dataframe containing the properties for each vertex. @@ -605,7 +606,7 @@ def extract_subgraph(self, selected_vertex_dataframe = \ self.__vertex_prop_dataframe[selection.vertex_selections] else: - selected_vertex_dataframe = self.__vertex_prop_dataframe + selected_vertex_dataframe = None if (selection is not None) and \ (selection.edge_selections is not None): @@ -629,25 +630,6 @@ def extract_subgraph(self, else: edges = selected_edge_dataframe - if edge_weight_property: - if edge_weight_property not in edges.columns: - raise ValueError("edge_weight_property " - f'"{edge_weight_property}" was not found in ' - "the properties of the subgraph") - - # Ensure a valid edge_weight_property can be used for applying - # weights to the subgraph, and if a default_edge_weight was - # specified, apply it to all NAs in the weight column. - prop_col = edges[edge_weight_property] - if prop_col.count() != prop_col.size: - if default_edge_weight is None: - raise ValueError("edge_weight_property " - f'"{edge_weight_property}" ' - "contains NA values in the subgraph and " - "default_edge_weight is not set") - else: - prop_col.fillna(default_edge_weight, inplace=True) - # The __*_prop_dataframes have likely been merged several times and # possibly had their dtypes converted in order to accommodate NaN # values. 
Restore the original dtypes in the resulting edges df prior @@ -658,6 +640,7 @@ def extract_subgraph(self, edges, create_using=create_using, edge_weight_property=edge_weight_property, + default_edge_weight=default_edge_weight, allow_multi_edges=allow_multi_edges) def annotate_dataframe(self, df, G, edge_vertex_id_columns): @@ -729,15 +712,42 @@ def edge_props_to_graph(self, edge_prop_df, create_using, edge_weight_property=None, + default_edge_weight=None, allow_multi_edges=False): """ Create and return a Graph from the edges in edge_prop_df. """ - if edge_weight_property and \ - (edge_weight_property not in edge_prop_df.columns): - raise ValueError("edge_weight_property " - f'"{edge_weight_property}" was not found in ' - "edge_prop_df") + # FIXME: check default_edge_weight is valid + + if edge_weight_property: + if edge_weight_property not in edge_prop_df.columns: + raise ValueError("edge_weight_property " + f'"{edge_weight_property}" was not found in ' + "edge_prop_df") + + # Ensure a valid edge_weight_property can be used for applying + # weights to the subgraph, and if a default_edge_weight was + # specified, apply it to all NAs in the weight column. + prop_col = edge_prop_df[edge_weight_property] + if prop_col.count() != prop_col.size: + if default_edge_weight is None: + raise ValueError("edge_weight_property " + f'"{edge_weight_property}" ' + "contains NA values in the subgraph and " + "default_edge_weight is not set") + else: + prop_col.fillna(default_edge_weight, inplace=True) + edge_attr = edge_weight_property + + # If a default_edge_weight was specified but an edge_weight_property was + # not, a new edge weight column must be added. 
+ elif default_edge_weight: + edge_attr = self.__gen_unique_name(edge_prop_df.columns, + prefix=self.weight_col_name) + edge_prop_df[edge_attr] = default_edge_weight + + else: + edge_attr = None # Set up the new Graph to return if isinstance(create_using, cugraph.Graph): @@ -771,7 +781,7 @@ def edge_props_to_graph(self, create_args = {"source": self.src_col_name, "destination": self.dst_col_name, - "edge_attr": edge_weight_property, + "edge_attr": edge_attr, "renumber": True, } if type(edge_prop_df) is cudf.DataFrame: @@ -851,6 +861,18 @@ def __get_all_vertices_series(self): vert_sers.append(epd[self.dst_col_name]) return vert_sers + @staticmethod + def __gen_unique_name(current_names, prefix="col"): + """ + Helper function to generate a currently unused name. + """ + name = prefix + counter = 2 + while name in current_names: + name = f"{prefix}{counter}" + counter += 1 + return name + @staticmethod def __get_new_column_dtypes(from_df, to_df): """ diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index 8cced223bf7..e71cffdf825 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -856,6 +856,19 @@ def test_extract_subgraph_default_edge_weight(dataset1_PropertyGraph): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) +def test_extract_subgraph_default_edge_weight_no_property( + dataset1_PropertyGraph): + """ + Ensure default_edge_weight can be used to provide an edge value when a + property for the edge weight is not specified. 
+ """ + pG = dataset1_PropertyGraph + edge_weight = 99.2 + G = pG.extract_subgraph(allow_multi_edges=True, + default_edge_weight=edge_weight) + assert (G.edgelist.edgelist_df["weights"] == edge_weight).all() + + def test_graph_edge_data_added(dataset1_PropertyGraph): """ Ensures the subgraph returned from extract_subgraph() has the edge_data From f031fc8a79232f6dab70f4143e7356c2b9c46433 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Mon, 14 Feb 2022 11:22:23 -0600 Subject: [PATCH 6/6] flake8 fixes, changed arg name to better indicate that names should be passed in and not column/series objects. --- .../cugraph/structure/property_graph.py | 46 +++++------ .../cugraph/cugraph/tests/test_graph_store.py | 8 +- .../cugraph/tests/test_property_graph.py | 76 +++++++++---------- 3 files changed, 65 insertions(+), 65 deletions(-) diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index 14c6b14d299..4169537efe8 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -216,7 +216,7 @@ def vertices_ids(self): def add_vertex_data(self, dataframe, - vertex_id_column, + vertex_col_name, type_name=None, property_columns=None ): @@ -228,7 +228,7 @@ def add_vertex_data(self, dataframe : DataFrame-compatible instance A DataFrame instance with a compatible Pandas-like DataFrame interface. - vertex_id_column : string + vertex_col_name : string The column name that contains the values to be used as vertex IDs. type_name : string The name to be assigned to the type of property being added. 
For @@ -251,8 +251,8 @@ def add_vertex_data(self, if type(dataframe) not in _dataframe_types: raise TypeError("dataframe must be one of the following types: " f"{_dataframe_types}, got: {type(dataframe)}") - if vertex_id_column not in dataframe.columns: - raise ValueError(f"{vertex_id_column} is not a column in " + if vertex_col_name not in dataframe.columns: + raise ValueError(f"{vertex_col_name} is not a column in " f"dataframe: {dataframe.columns}") if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " @@ -294,7 +294,7 @@ def add_vertex_data(self, # https://github.com/rapidsai/cudf/issues/9981) self.__update_dataframe_dtypes( self.__vertex_prop_dataframe, - {self.vertex_col_name: dataframe[vertex_id_column].dtype}) + {self.vertex_col_name: dataframe[vertex_col_name].dtype}) # Ensure that both the predetermined vertex ID column name and vertex # type column name are present for proper merging. @@ -303,7 +303,7 @@ def add_vertex_data(self, # columns. The copied DataFrame is then merged (another copy) and then # deleted when out-of-scope. tmp_df = dataframe.copy(deep=True) - tmp_df[self.vertex_col_name] = tmp_df[vertex_id_column] + tmp_df[self.vertex_col_name] = tmp_df[vertex_col_name] # FIXME: handle case of a type_name column already being in tmp_df tmp_df[self.type_col_name] = type_name @@ -332,7 +332,7 @@ def add_vertex_data(self, def add_edge_data(self, dataframe, - vertex_id_columns, + vertex_col_names, type_name=None, property_columns=None ): @@ -344,7 +344,7 @@ def add_edge_data(self, dataframe : DataFrame-compatible instance A DataFrame instance with a compatible Pandas-like DataFrame interface. - vertex_id_columns : list of strings + vertex_col_names : list of strings The column names that contain the values to be used as the source and destination vertex IDs for the edges. 
type_name : string @@ -368,12 +368,12 @@ def add_edge_data(self, if type(dataframe) not in _dataframe_types: raise TypeError("dataframe must be one of the following types: " f"{_dataframe_types}, got: {type(dataframe)}") - if type(vertex_id_columns) not in [list, tuple]: - raise TypeError("vertex_id_columns must be a list or tuple, got: " - f"{type(vertex_id_columns)}") - invalid_columns = set(vertex_id_columns).difference(dataframe.columns) + if type(vertex_col_names) not in [list, tuple]: + raise TypeError("vertex_col_names must be a list or tuple, got: " + f"{type(vertex_col_names)}") + invalid_columns = set(vertex_col_names).difference(dataframe.columns) if invalid_columns: - raise ValueError("vertex_id_columns contains column(s) not found " + raise ValueError("vertex_col_names contains column(s) not found " f"in dataframe: {list(invalid_columns)}") if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " @@ -416,16 +416,16 @@ def add_edge_data(self, # https://github.com/rapidsai/cudf/issues/9981) self.__update_dataframe_dtypes( self.__edge_prop_dataframe, - {self.src_col_name: dataframe[vertex_id_columns[0]].dtype, - self.dst_col_name: dataframe[vertex_id_columns[1]].dtype, + {self.src_col_name: dataframe[vertex_col_names[0]].dtype, + self.dst_col_name: dataframe[vertex_col_names[1]].dtype, self.edge_id_col_name: "Int64"}) # NOTE: This copies the incoming DataFrame in order to add the new # columns. The copied DataFrame is then merged (another copy) and then # deleted when out-of-scope. 
tmp_df = dataframe.copy(deep=True) - tmp_df[self.src_col_name] = tmp_df[vertex_id_columns[0]] - tmp_df[self.dst_col_name] = tmp_df[vertex_id_columns[1]] + tmp_df[self.src_col_name] = tmp_df[vertex_col_names[0]] + tmp_df[self.dst_col_name] = tmp_df[vertex_col_names[1]] # FIXME: handle case of a type_name column already being in tmp_df tmp_df[self.type_col_name] = type_name @@ -643,21 +643,21 @@ def extract_subgraph(self, default_edge_weight=default_edge_weight, allow_multi_edges=allow_multi_edges) - def annotate_dataframe(self, df, G, edge_vertex_id_columns): + def annotate_dataframe(self, df, G, edge_vertex_col_names): """ Add properties to df that represent the vertices and edges in graph G. Parameters ---------- df : cudf.DataFrame or pandas.DataFrame - A DataFrame containing edges identified by edge_vertex_id_columns + A DataFrame containing edges identified by edge_vertex_col_names which will have properties for those edges added to it. G : cugraph.Graph (or subclass of) instance. Graph containing the edges specified in df. The Graph instance must have been generated from a prior call to extract_subgraph() in order to have the edge meta-data used to look up the correct properties. - edge_vertex_id_columns : tuple of strings + edge_vertex_col_names : tuple of strings The column names in df that represent the source and destination vertices, used for identifying edges. @@ -672,7 +672,7 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): >>> """ # FIXME: check all args - (src_col_name, dst_col_name) = edge_vertex_id_columns + (src_col_name, dst_col_name) = edge_vertex_col_names df_type = type(df) if df_type is not self.__dataframe_type: @@ -739,8 +739,8 @@ def edge_props_to_graph(self, prop_col.fillna(default_edge_weight, inplace=True) edge_attr = edge_weight_property - # If a default_edge_weight was specified but an edge_weight_property was - # not, a new edge weight column must be added. 
+ # If a default_edge_weight was specified but an edge_weight_property + # was not, a new edge weight column must be added. elif default_edge_weight: edge_attr = self.__gen_unique_name(edge_prop_df.columns, prefix=self.weight_col_name) diff --git a/python/cugraph/cugraph/tests/test_graph_store.py b/python/cugraph/cugraph/tests/test_graph_store.py index 25790662119..5f783c3baa8 100644 --- a/python/cugraph/cugraph/tests/test_graph_store.py +++ b/python/cugraph/cugraph/tests/test_graph_store.py @@ -50,7 +50,7 @@ def test_using_pgraph(graph_file): pG = PropertyGraph() pG.add_edge_data(cu_M, type_name="edge", - vertex_id_columns=("0", "1"), + vertex_col_names=("0", "1"), property_columns=None) gstore = cugraph.gnn.CuGraphStore(graph=pG) @@ -70,7 +70,7 @@ def test_node_data_pg(graph_file): pG = PropertyGraph() pG.add_edge_data(cu_M, type_name="edge", - vertex_id_columns=("0", "1"), + vertex_col_names=("0", "1"), property_columns=None) gstore = cugraph.gnn.CuGraphStore(graph=pG) @@ -91,7 +91,7 @@ def test_egonet(graph_file): pG = PropertyGraph() pG.add_edge_data(cu_M, type_name="edge", - vertex_id_columns=("0", "1"), + vertex_col_names=("0", "1"), property_columns=None) gstore = cugraph.gnn.CuGraphStore(graph=pG) @@ -117,7 +117,7 @@ def test_workflow(graph_file): pg = PropertyGraph() pg.add_edge_data(cu_M, type_name="edge", - vertex_id_columns=("0", "1"), + vertex_col_names=("0", "1"), property_columns=["2"]) gstore = cugraph.gnn.CuGraphStore(graph=pg) diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index e71cffdf825..6643e3c3c46 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -158,39 +158,39 @@ def dataset1_PropertyGraph(request): # relationships, and referrals. # property_columns=None (the default) means all columns except - # vertex_id_column will be used as properties for the vertices/edges. 
+ # vertex_col_name will be used as properties for the vertices/edges. pG.add_vertex_data(dataframe_type(columns=merchants[0], data=merchants[1]), type_name="merchants", - vertex_id_column="merchant_id", + vertex_col_name="merchant_id", property_columns=None) pG.add_vertex_data(dataframe_type(columns=users[0], data=users[1]), type_name="users", - vertex_id_column="user_id", + vertex_col_name="user_id", property_columns=None) pG.add_vertex_data(dataframe_type(columns=taxpayers[0], data=taxpayers[1]), type_name="taxpayers", - vertex_id_column="payer_id", + vertex_col_name="payer_id", property_columns=None) pG.add_edge_data(dataframe_type(columns=transactions[0], data=transactions[1]), type_name="transactions", - vertex_id_columns=("user_id", "merchant_id"), + vertex_col_names=("user_id", "merchant_id"), property_columns=None) pG.add_edge_data(dataframe_type(columns=relationships[0], data=relationships[1]), type_name="relationships", - vertex_id_columns=("user_id_1", "user_id_2"), + vertex_col_names=("user_id_1", "user_id_2"), property_columns=None) pG.add_edge_data(dataframe_type(columns=referrals[0], data=referrals[1]), type_name="referrals", - vertex_id_columns=("user_id_1", - "user_id_2"), + vertex_col_names=("user_id_1", + "user_id_2"), property_columns=None) return pG @@ -278,7 +278,7 @@ def test_add_vertex_data(df_type): pG = PropertyGraph() pG.add_vertex_data(merchants_df, type_name="merchants", - vertex_id_column="merchant_id", + vertex_col_name="merchant_id", property_columns=None) assert pG.num_vertices == 5 @@ -301,7 +301,7 @@ def test_num_vertices(df_type): pG = PropertyGraph() pG.add_vertex_data(merchants_df, type_name="merchants", - vertex_id_column="merchant_id", + vertex_col_name="merchant_id", property_columns=None) # Test caching - the second retrieval should always be faster @@ -320,7 +320,7 @@ def test_num_vertices(df_type): pG.add_vertex_data(users_df, type_name="users", - vertex_id_column="user_id", + vertex_col_name="user_id", 
property_columns=None) assert pG.num_vertices == 9 @@ -334,7 +334,7 @@ def test_num_vertices(df_type): pG.add_vertex_data(taxpayers_df, type_name="taxpayers", - vertex_id_column="payer_id", + vertex_col_name="payer_id", property_columns=None) assert pG.num_vertices == 9 @@ -370,7 +370,7 @@ def test_add_vertex_data_prop_columns(df_type): pG = PropertyGraph() pG.add_vertex_data(merchants_df, type_name="merchants", - vertex_id_column="merchant_id", + vertex_col_name="merchant_id", property_columns=expected_props) assert pG.num_vertices == 5 @@ -393,28 +393,28 @@ def test_add_vertex_data_bad_args(): with pytest.raises(TypeError): pG.add_vertex_data(42, type_name="merchants", - vertex_id_column="merchant_id", + vertex_col_name="merchant_id", property_columns=None) with pytest.raises(TypeError): pG.add_vertex_data(merchants_df, type_name=42, - vertex_id_column="merchant_id", + vertex_col_name="merchant_id", property_columns=None) with pytest.raises(ValueError): pG.add_vertex_data(merchants_df, type_name="merchants", - vertex_id_column="bad_column_name", + vertex_col_name="bad_column_name", property_columns=None) with pytest.raises(ValueError): pG.add_vertex_data(merchants_df, type_name="merchants", - vertex_id_column="merchant_id", + vertex_col_name="merchant_id", property_columns=["bad_column_name", "merchant_name"]) with pytest.raises(TypeError): pG.add_vertex_data(merchants_df, type_name="merchants", - vertex_id_column="merchant_id", + vertex_col_name="merchant_id", property_columns="merchant_name") @@ -432,7 +432,7 @@ def test_add_edge_data(df_type): pG = PropertyGraph() pG.add_edge_data(transactions_df, type_name="transactions", - vertex_id_columns=("user_id", "merchant_id"), + vertex_col_names=("user_id", "merchant_id"), property_columns=None) assert pG.num_vertices == 7 @@ -457,7 +457,7 @@ def test_add_edge_data_prop_columns(df_type): pG = PropertyGraph() pG.add_edge_data(transactions_df, type_name="transactions", - vertex_id_columns=("user_id", "merchant_id"), + 
vertex_col_names=("user_id", "merchant_id"), property_columns=expected_props) assert pG.num_vertices == 7 @@ -480,27 +480,27 @@ def test_add_edge_data_bad_args(): with pytest.raises(TypeError): pG.add_edge_data(42, type_name="transactions", - vertex_id_columns=("user_id", "merchant_id"), + vertex_col_names=("user_id", "merchant_id"), property_columns=None) with pytest.raises(TypeError): pG.add_edge_data(transactions_df, type_name=42, - vertex_id_columns=("user_id", "merchant_id"), + vertex_col_names=("user_id", "merchant_id"), property_columns=None) with pytest.raises(ValueError): pG.add_edge_data(transactions_df, type_name="transactions", - vertex_id_columns=("user_id", "bad_column"), + vertex_col_names=("user_id", "bad_column"), property_columns=None) with pytest.raises(ValueError): pG.add_edge_data(transactions_df, type_name="transactions", - vertex_id_columns=("user_id", "merchant_id"), + vertex_col_names=("user_id", "merchant_id"), property_columns=["bad_column_name", "time"]) with pytest.raises(TypeError): pG.add_edge_data(transactions_df, type_name="transactions", - vertex_id_columns=("user_id", "merchant_id"), + vertex_col_names=("user_id", "merchant_id"), property_columns="time") @@ -707,12 +707,12 @@ def test_extract_subgraph_graph_without_vert_props(): pG.add_edge_data(cudf.DataFrame(columns=transactions[0], data=transactions[1]), type_name="transactions", - vertex_id_columns=("user_id", "merchant_id"), + vertex_col_names=("user_id", "merchant_id"), property_columns=None) pG.add_edge_data(cudf.DataFrame(columns=relationships[0], data=relationships[1]), type_name="relationships", - vertex_id_columns=("user_id_1", "user_id_2"), + vertex_col_names=("user_id_1", "user_id_2"), property_columns=None) scn = PropertyGraph.src_col_name @@ -925,7 +925,7 @@ def test_annotate_dataframe(dataset1_PropertyGraph): inplace=True, ignore_index=True) new_algo_result = pG.annotate_dataframe( - algo_result, G, edge_vertex_id_columns=("from", "to")) + algo_result, G, 
edge_vertex_col_names=("from", "to")) expected_algo_result = df_type({"from": srcs, "to": dsts, "result": range(len(srcs)), "merchant_id": mids, @@ -961,31 +961,31 @@ def test_different_vertex_edge_input_dataframe_types(): from cugraph.experimental import PropertyGraph pG = PropertyGraph() - pG.add_vertex_data(df, type_name="foo", vertex_id_column="a") + pG.add_vertex_data(df, type_name="foo", vertex_col_name="a") with pytest.raises(TypeError): - pG.add_edge_data(pdf, type_name="bar", vertex_id_columns=("a", "b")) + pG.add_edge_data(pdf, type_name="bar", vertex_col_names=("a", "b")) pG = PropertyGraph() - pG.add_vertex_data(pdf, type_name="foo", vertex_id_column="a") + pG.add_vertex_data(pdf, type_name="foo", vertex_col_name="a") with pytest.raises(TypeError): - pG.add_edge_data(df, type_name="bar", vertex_id_columns=("a", "b")) + pG.add_edge_data(df, type_name="bar", vertex_col_names=("a", "b")) # Different order pG = PropertyGraph() - pG.add_edge_data(df, type_name="bar", vertex_id_columns=("a", "b")) + pG.add_edge_data(df, type_name="bar", vertex_col_names=("a", "b")) with pytest.raises(TypeError): - pG.add_vertex_data(pdf, type_name="foo", vertex_id_column="a") + pG.add_vertex_data(pdf, type_name="foo", vertex_col_name="a") # Same API call, different types pG = PropertyGraph() - pG.add_vertex_data(df, type_name="foo", vertex_id_column="a") + pG.add_vertex_data(df, type_name="foo", vertex_col_name="a") with pytest.raises(TypeError): - pG.add_vertex_data(pdf, type_name="foo", vertex_id_column="a") + pG.add_vertex_data(pdf, type_name="foo", vertex_col_name="a") pG = PropertyGraph() - pG.add_edge_data(df, type_name="bar", vertex_id_columns=("a", "b")) + pG.add_edge_data(df, type_name="bar", vertex_col_names=("a", "b")) with pytest.raises(TypeError): - pG.add_edge_data(pdf, type_name="bar", vertex_id_columns=("a", "b")) + pG.add_edge_data(pdf, type_name="bar", vertex_col_names=("a", "b")) def test_get_vertices(dataset1_PropertyGraph):