From 3542708693b45c7024592e08aabcdcce9e7123d9 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Tue, 25 Oct 2022 23:39:13 -0500 Subject: [PATCH 01/41] Added the ability to load/call/unload general extensions in addition to graph creation extensions, changed how unloading extensions work to unload by unique module name, make numpy and cupy optional modules in the client, updated Value and ValueWrapper to handle doubles and lists (recursive), added tests for new extension changes. --- .../cugraph_service_client/client.py | 110 +++++++++-- .../cugraph_service_thrift.py | 15 +- .../cugraph_service_client/types.py | 132 +++++++++++--- .../cugraph_service_server/cugraph_handler.py | 171 ++++++++++++------ python/cugraph_service/tests/conftest.py | 39 +++- .../tests/test_cugraph_handler.py | 63 +++++-- python/cugraph_service/tests/test_e2e.py | 76 ++++++-- .../tests/test_mg_cugraph_handler.py | 4 +- python/cugraph_service/tests/test_mg_e2e.py | 10 +- python/cugraph_service/tests/utils.py | 2 +- 10 files changed, 471 insertions(+), 151 deletions(-) diff --git a/python/cugraph_service/cugraph_service_client/client.py b/python/cugraph_service/cugraph_service_client/client.py index 629a1268a56..b3effb97749 100644 --- a/python/cugraph_service/cugraph_service_client/client.py +++ b/python/cugraph_service/cugraph_service_client/client.py @@ -249,27 +249,57 @@ def load_graph_creation_extensions(self, extension_dir_path): Returns ------- - num_files_read : int - Number of extension files read in the extension_dir_path directory. + extension_modnames : list + List of the module names loaded. These can be used in calls to + unload_extension_module() Examples -------- >>> from cugraph_service_client import CugraphServiceClient >>> client = CugraphServiceClient() - >>> num_files_read = client.load_graph_creation_extensions( + >>> extension_modnames = client.load_graph_creation_extensions( ... "/some/server/side/directory") >>> """ return self.__client.load_graph_creation_extensions(extension_dir_path) @__server_connection - def unload_graph_creation_extensions(self): + def load_extensions(self, extension_dir_path): """ - Removes all extensions for graph creation previously loaded. + Loads the extensions present in the directory specified by extension_dir_path. Parameters ---------- - None + extension_dir_path : string + Path to the directory containing the extension files (.py source + files). This directory must be readable by the server. + + Returns + ------- + extension_modnames : list + List of the module names loaded. These can be used in calls to + unload_extension_module() + + Examples + -------- + >>> from cugraph_service_client import CugraphServiceClient + >>> client = CugraphServiceClient() + >>> extension_modnames = client.load_graph_creation_extensions( + ... "/some/server/side/directory") + >>> + """ + return self.__client.load_extensions(extension_dir_path) + + @__server_connection + def unload_extension_module(self, modname): + """ + Removes all extensions contained in the modname module. + + Parameters + ---------- + modname : string + Name of the module to be unloaded. All extension functions contained in + modname will no longer be callable. Returns ------- @@ -279,10 +309,12 @@ def unload_graph_creation_extensions(self): -------- >>> from cugraph_service_client import CugraphServiceClient >>> client = CugraphServiceClient() - >>> client.unload_graph_creation_extensions() + >>> ext_mod_name = client.load_graph_creation_extensions( + ... 
"/some/server/side/directory") + >>> client.unload_extension_module(ext_mod_name) >>> """ - return self.__client.unload_graph_creation_extensions() + return self.__client.unload_extension_module(modname) @__server_connection def call_graph_creation_extension(self, func_name, *func_args, **func_kwargs): @@ -335,6 +367,58 @@ def call_graph_creation_extension(self, func_name, *func_args, **func_kwargs): func_name, func_args_repr, func_kwargs_repr ) + @__server_connection + def call_extension(self, func_name, *func_args, **func_kwargs): + """ + Calls an extension on the server that was previously + loaded by a prior call to load_extensions(), then + returns the result returned by the extension. + + Parameters + ---------- + func_name : string + The name of the server-side extension function loaded by a prior + call to load_graph_creation_extensions(). All graph creation + extension functions are expected to return a new graph. + + *func_args : string, int, list, dictionary (optional) + The positional args to pass to func_name. Note that func_args are + converted to their string representation using repr() on the + client, then restored to python objects on the server using eval(), + and therefore only objects that can be restored server-side with + eval() are supported. + + **func_kwargs : string, int, list, dictionary + The keyword args to pass to func_name. Note that func_kwargs are + converted to their string representation using repr() on the + client, then restored to python objects on the server using eval(), + and therefore only objects that can be restored server-side with + eval() are supported. + + Returns + ------- + result : python int, float, string, list + The result returned by the extension + + Examples + -------- + >>> from cugraph_service_client import CugraphServiceClient + >>> client = CugraphServiceClient() + >>> # Load the extension file containing "my_serverside_function()" + >>> client.load_extensions("/some/server/side/dir") + >>> result = client.call_extension( + ... "my_serverside_function", 33, 22, "some_string") + >>> + """ + func_args_repr = repr(func_args) + func_kwargs_repr = repr(func_kwargs) + result = self.__client.call_extension( + func_name, func_args_repr, func_kwargs_repr + ) + # FIXME: ValueWrapper ctor and get_py_obj are recursive and could be slow, + # especially if Value is a list. Consider returning the Value obj as-is. 
+ return ValueWrapper(result).get_py_obj() + ########################################################################### # Graph management @__server_connection @@ -883,16 +967,6 @@ def batched_ego_graphs(self, seeds, radius=1, graph_id=defaults.graph_id): seeds, radius, graph_id ) - # FIXME: ensure dtypes are correct for values returned from - # cugraph.batched_ego_graphs() in cugraph_handler.py - # return (numpy.frombuffer(batched_ego_graphs_result.src_verts, - # dtype="int32"), - # numpy.frombuffer(batched_ego_graphs_result.dst_verts, - # dtype="int32"), - # numpy.frombuffer(batched_ego_graphs_result.edge_weights, - # dtype="float64"), - # numpy.frombuffer(batched_ego_graphs_result.seeds_offsets, - # dtype="int64")) return ( batched_ego_graphs_result.src_verts, batched_ego_graphs_result.dst_verts, diff --git a/python/cugraph_service/cugraph_service_client/cugraph_service_thrift.py b/python/cugraph_service/cugraph_service_client/cugraph_service_thrift.py index dbf5d7f6766..1268467b9cf 100644 --- a/python/cugraph_service/cugraph_service_client/cugraph_service_thrift.py +++ b/python/cugraph_service/cugraph_service_client/cugraph_service_thrift.py @@ -77,6 +77,8 @@ 2:i64 int64_value 3:string string_value 4:bool bool_value + 5:double double_value + 6:list list_value } service CugraphService { @@ -87,16 +89,23 @@ map get_server_info() throws (1:CugraphServiceError e), - i32 load_graph_creation_extensions(1:string extension_dir_path - ) throws (1:CugraphServiceError e), + list load_graph_creation_extensions(1:string extension_dir_path + ) throws (1:CugraphServiceError e), - void unload_graph_creation_extensions(), + list load_extensions(1:string extension_dir_path + ) throws (1:CugraphServiceError e), + + void unload_extension_module(1:string modname) throws (1:CugraphServiceError e), i32 call_graph_creation_extension(1:string func_name, 2:string func_args_repr, 3:string func_kwargs_repr ) throws (1:CugraphServiceError e), + Value call_extension(1:string func_name, + 2:string func_args_repr, + 3:string func_kwargs_repr + ) throws (1:CugraphServiceError e), ############################################################################## # Graph management diff --git a/python/cugraph_service/cugraph_service_client/types.py b/python/cugraph_service/cugraph_service_client/types.py index 7b7f9effbf8..c1384ac83ed 100644 --- a/python/cugraph_service/cugraph_service_client/types.py +++ b/python/cugraph_service/cugraph_service_client/types.py @@ -12,7 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy +# Optional modules: additional features are enabled if these are present +try: + import numpy +except ModuleNotFoundError: + numpy = None +try: + import cupy +except ModuleNotFoundError: + cupy = None from cugraph_service_client.cugraph_service_thrift import spec @@ -31,24 +39,7 @@ class UnionWrapper: unions to Thrift unions/py objects. """ - def get_py_obj(self): - """ - Get the python object set in the union. - """ - not_members = set(["default_spec", "thrift_spec", "read", "write"]) - attrs = [ - a - for a in dir(self.union) - if not (a.startswith("_")) and a not in not_members - ] - # Much like a C union, only one field will be set. Return the first - # non-None value encountered. 
- for a in attrs: - val = getattr(self.union, a) - if val is not None: - return val - - return None + non_attrs = set(["default_spec", "thrift_spec", "read", "write"]) class ValueWrapper(UnionWrapper): @@ -58,6 +49,12 @@ class ValueWrapper(UnionWrapper): field. """ + valid_types = ["int", "float", "str", "bool"] + if numpy: + valid_types += ["numpy.int32", "numpy.int64", "numpy.ndarray"] + if cupy: + valid_types += ["cupy.int32", "cupy.int64", "cupy.ndarray"] + def __init__(self, val, val_name="value"): """ Construct with a value supported by the Value "union". See @@ -73,20 +70,93 @@ def __init__(self, val, val_name="value"): self.union = Value(int32_value=val) else: self.union = Value(int64_value=val) - elif isinstance(val, numpy.int32): + elif isinstance(val, float): + self.union = Value(double_value=val) + elif (numpy and isinstance(val, numpy.int32)) or ( + cupy and isinstance(val, cupy.int32) + ): self.union = Value(int32_value=int(val)) - elif isinstance(val, numpy.int64): + elif (numpy and isinstance(val, numpy.int64)) or ( + cupy and isinstance(val, cupy.int64) + ): self.union = Value(int64_value=int(val)) + elif ( + (numpy and isinstance(val, numpy.float32)) + or (cupy and isinstance(val, cupy.float32)) + or (numpy and isinstance(val, numpy.float64)) + or (cupy and isinstance(val, cupy.float64)) + ): + self.union = Value(double_value=float(val)) elif isinstance(val, str): self.union = Value(string_value=val) elif isinstance(val, bool): self.union = Value(bool_value=val) + elif isinstance(val, (list, tuple)): + self.union = Value(list_value=[ValueWrapper(i) for i in val]) + # FIXME: Assume ndarrays contain values Thrift can accept! Otherwise, + # check and possibly convert ndarray dtypes. + elif (numpy and isinstance(val, numpy.ndarray)) or ( + cupy and isinstance(val, cupy.ndarray) + ): + # self.union = Value(list_value=val.tolist()) + self.union = Value(list_value=[ValueWrapper(i) for i in val.tolist()]) else: raise TypeError( f"{val_name} must be one of the " - "following types: [int, str, bool], got " + f"following types: {self.valid_types}, got " f"{type(val)}" ) + """ + # Also add members with values matching the now complete self.union + # Value object. This will essentially duck-type this ValueWrapper + # instance and allow it to be returned to Thrift and treated as a Value + self.int32_value = self.union.int32_value + self.int64_value = self.union.int64_value + self.string_value = self.union.string_value + self.bool_value = self.union.bool_value + self.double_value = self.union.double_value + self.list_value = self.union.list_value + """ + + def __getattr__(self, attr): + """ + Retrieve all other attrs from the underlying Value object. This will + essentially duck-type this ValueWrapper instance and allow it to be + returned to Thrift and treated as a Value. + """ + return getattr(self.union, attr) + + def get_py_obj(self): + """ + Get the python object set in the union. + """ + attrs = [ + a + for a in dir(self.union) + if not (a.startswith("_")) and a not in self.non_attrs + ] + # Much like a C union, only one field will be set. Return the first + # non-None value encountered. + for a in attrs: + val = getattr(self.union, a) + if val is not None: + # Assume all lists are homogeneous. Check the first item to see + # if it is a Value or ValueWrapper obj, and if so recurse. 
+ # FIXME: this might be slow, consider handling lists of numbers + # differently + if isinstance(val, list) and len(val) > 0: + if isinstance(val[0], Value): + return [ValueWrapper(i).get_py_obj() for i in val] + elif isinstance(val[0], ValueWrapper): + return [i.get_py_obj() for i in val] + else: + raise TypeError( + f"expected Value or ValueWrapper, got {type(val)}" + ) + else: + return val + + return None class GraphVertexEdgeIDWrapper(UnionWrapper): @@ -110,3 +180,21 @@ def __init__(self, val, val_name="id"): "following types: [int, list], got " f"{type(val)}" ) + + def get_py_obj(self): + """ + Get the python object set in the union. + """ + attrs = [ + a + for a in dir(self.union) + if not (a.startswith("_")) and a not in self.non_attrs + ] + # Much like a C union, only one field will be set. Return the first + # non-None value encountered. + for a in attrs: + val = getattr(self.union, a) + if val is not None: + return val + + return None diff --git a/python/cugraph_service/cugraph_service_server/cugraph_handler.py b/python/cugraph_service/cugraph_service_server/cugraph_handler.py index 94b82ea3564..d4ee9ca9087 100644 --- a/python/cugraph_service/cugraph_service_server/cugraph_handler.py +++ b/python/cugraph_service/cugraph_service_server/cugraph_handler.py @@ -132,6 +132,7 @@ def __init__(self): self.__next_graph_id = defaults.graph_id + 1 self.__graph_objs = {} self.__graph_creation_extensions = {} + self.__extensions = {} self.__dask_client = None self.__dask_cluster = None self.__start_time = int(time.time()) @@ -173,7 +174,7 @@ def get_server_info(self): # cugraph_service server) num_gpus = 1 - return {"num_gpus": ValueWrapper(num_gpus).union} + return {"num_gpus": ValueWrapper(num_gpus)} def load_graph_creation_extensions(self, extension_dir_path): """ @@ -188,90 +189,89 @@ def load_graph_creation_extensions(self, extension_dir_path): if (not extension_dir.exists()) or (not extension_dir.is_dir()): raise CugraphServiceError(f"bad directory: {extension_dir}") - num_files_read = 0 + modules_loaded = [] for ext_file in extension_dir.glob("*_extension.py"): module_file_path = ext_file.absolute().as_posix() spec = importlib.util.spec_from_file_location(module_file_path, ext_file) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) self.__graph_creation_extensions[module_file_path] = module - num_files_read += 1 + modules_loaded.append(module_file_path) - return num_files_read + return modules_loaded - def unload_graph_creation_extensions(self): + def load_extensions(self, extension_dir_path): """ - Removes all graph creation extensions. + Loads ("imports") all modules matching the pattern *_extension.py in + the directory specified by extension_dir_path. + + The modules are searched and their functions are called (if a match is + found) when call_extension() is called. 
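+
+        Returns a list of the module names (file paths) loaded; these can later
+        be passed to unload_extension_module() to remove them.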
""" - self.__graph_creation_extensions.clear() + extension_dir = Path(extension_dir_path) + + if (not extension_dir.exists()) or (not extension_dir.is_dir()): + raise CugraphServiceError(f"bad directory: {extension_dir}") + + modules_loaded = [] + for ext_file in extension_dir.glob("*_extension.py"): + module_file_path = ext_file.absolute().as_posix() + spec = importlib.util.spec_from_file_location(module_file_path, ext_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + self.__extensions[module_file_path] = module + modules_loaded.append(module_file_path) + + return modules_loaded + + def unload_extension_module(self, modname): + """ + Removes all extension functions in modname. + """ + if (self.__graph_creation_extensions.pop(modname, None) is None) and ( + self.__extensions.pop(modname, None) is None + ): + raise CugraphServiceError(f"bad extension module {modname}") def call_graph_creation_extension( self, func_name, func_args_repr, func_kwargs_repr ): """ Calls the graph creation extension function func_name and passes it the - eval'd func_args_repr and func_kwargs_repr objects. - - The arg/kwarg reprs are eval'd prior to calling in order to pass actual - python objects to func_name (this is needed to allow arbitrary arg - objects to be serialized as part of the RPC call from the - client). + eval'd func_args_repr and func_kwargs_repr objects. If successful, it + associates the graph returned by the extension function with a new graph + ID and returns it. func_name cannot be a private name (name starting with __). + """ + graph_obj = self.__call_extension( + self.__graph_creation_extensions, + func_name, + func_args_repr, + func_kwargs_repr, + ) + # FIXME: ensure graph_obj is a graph obj + return self.__add_graph(graph_obj) - All loaded extension modules are checked when searching for func_name, - and the first extension module that contains it will have its function - called. + def call_extension(self, func_name, func_args_repr, func_kwargs_repr): """ - if not (func_name.startswith("__")): - for module in self.__graph_creation_extensions.values(): - # Ignore private functions - func = getattr(module, func_name, None) - if func is not None: - func_args = eval(func_args_repr) - func_kwargs = eval(func_kwargs_repr) - func_sig = signature(func) - func_params = list(func_sig.parameters.keys()) - facade_param = self.__server_facade_extension_param_name - - # Graph creation extensions that have the last arg named - # self.__server_facade_extension_param_name are passed a - # ExtensionServerFacade instance to allow them to query the - # "server" in a safe way, if needed. - if facade_param in func_params: - if func_params[-1] == facade_param: - func_kwargs[facade_param] = ExtensionServerFacade(self) - else: - raise CugraphServiceError( - f"{facade_param}, if specified, must be the " - "last param." - ) - try: - graph_obj = func(*func_args, **func_kwargs) - except Exception: - # FIXME: raise a more detailed error - raise CugraphServiceError( - f"error running {func_name} : " f"{traceback.format_exc()}" - ) - return self.__add_graph(graph_obj) + Calls the extension function func_name and passes it the eval'd + func_args_repr and func_kwargs_repr objects. If successful, returns a + Value object containing the results returned by the extension function. - raise CugraphServiceError(f"{func_name} is not a graph creation extension") + func_name cannot be a private name (name starting with __). 
+ """ + result = self.__call_extension( + self.__extensions, func_name, func_args_repr, func_kwargs_repr + ) + return ValueWrapper(result) def initialize_dask_client(self, dask_scheduler_file=None): """ Initialize a dask client to be used for MG operations. """ if dask_scheduler_file is not None: - # FIXME: read the config from user options instead of hardcoding here. - # FIXME: for the config below, env var UCX_MAX_RNDV_RAILS=1 must be set too. - dask_initialize( - create_cuda_context=False, - enable_tcp_over_ucx=True, - enable_nvlink=True, - enable_infiniband=True, - enable_rdmacm=True, - # net_devices="mlx5_0:1", - ) + dask_initialize() self.__dask_client = Client(scheduler_file=dask_scheduler_file) else: # FIXME: LocalCUDACluster init. Implement when tests are in place. @@ -375,7 +375,7 @@ def get_graph_info(self, keys, graph_id): except Exception: raise CugraphServiceError(f"{traceback.format_exc()}") - return {key: ValueWrapper(value).union for (key, value) in info.items()} + return {key: ValueWrapper(value) for (key, value) in info.items()} def load_csv_as_vertex_data( self, @@ -932,3 +932,56 @@ def __get_graph_data_as_numpy_bytes(self, dataframe, null_replacement_value): except Exception: raise CugraphServiceError(f"{traceback.format_exc()}") + + def __call_extension( + self, extension_dict, func_name, func_args_repr, func_kwargs_repr + ): + """ + Calls the extension function func_name and passes it the eval'd + func_args_repr and func_kwargs_repr objects. If successful, returns a + Value object containing the results returned by the extension function. + + The arg/kwarg reprs are eval'd prior to calling in order to pass actual + python objects to func_name (this is needed to allow arbitrary arg + objects to be serialized as part of the RPC call from the + client). + + func_name cannot be a private name (name starting with __). + + All loaded extension modules are checked when searching for func_name, + and the first extension module that contains it will have its function + called. + """ + if func_name.startswith("__"): + raise CugraphServiceError(f"Cannot call private function {func_name}") + + for module in extension_dict.values(): + # Ignore private functions + func = getattr(module, func_name, None) + if func is not None: + func_args = eval(func_args_repr) + func_kwargs = eval(func_kwargs_repr) + func_sig = signature(func) + func_params = list(func_sig.parameters.keys()) + facade_param = self.__server_facade_extension_param_name + + # Graph creation extensions that have the last arg named + # self.__server_facade_extension_param_name are passed a + # ExtensionServerFacade instance to allow them to query the + # "server" in a safe way, if needed. + if facade_param in func_params: + if func_params[-1] == facade_param: + func_kwargs[facade_param] = ExtensionServerFacade(self) + else: + raise CugraphServiceError( + f"{facade_param}, if specified, must be the " "last param." 
+ ) + try: + return func(*func_args, **func_kwargs) + except Exception: + # FIXME: raise a more detailed error + raise CugraphServiceError( + f"error running {func_name} : " f"{traceback.format_exc()}" + ) + + raise CugraphServiceError(f"extension {func_name} was not found") diff --git a/python/cugraph_service/tests/conftest.py b/python/cugraph_service/tests/conftest.py index 865cdb3ffbd..870f347f328 100644 --- a/python/cugraph_service/tests/conftest.py +++ b/python/cugraph_service/tests/conftest.py @@ -154,6 +154,19 @@ def graph_creation_extension_large_property_graph(server): return pG """ +extension1_file_contents = """ +import cupy as cp + + +def my_nines_function(array1_size, array1_dtype, array2_size, array2_dtype): + ''' + Returns 2 arrays of size and dtype specified containing only 9s + ''' + array1 = cp.array([9] * array1_size, dtype=array1_dtype) + array2 = cp.array([9] * array2_size, dtype=array2_dtype) + return (array1, array2) +""" + ############################################################################### # module scope fixtures @@ -165,7 +178,7 @@ def graph_creation_extension1(): graph_creation_extension1_file_contents ) - yield tmp_extension_dir + yield tmp_extension_dir.name @pytest.fixture(scope="module") @@ -174,7 +187,7 @@ def graph_creation_extension2(): graph_creation_extension2_file_contents ) - yield tmp_extension_dir + yield tmp_extension_dir.name @pytest.fixture(scope="module") @@ -183,7 +196,7 @@ def graph_creation_extension_long_running(): graph_creation_extension_long_running_file_contents ) - yield tmp_extension_dir + yield tmp_extension_dir.name @pytest.fixture(scope="module") @@ -192,7 +205,7 @@ def graph_creation_extension_no_facade_arg(): graph_creation_extension_no_facade_arg_file_contents ) - yield tmp_extension_dir + yield tmp_extension_dir.name @pytest.fixture(scope="module") @@ -201,7 +214,7 @@ def graph_creation_extension_bad_arg_order(): graph_creation_extension_bad_arg_order_file_contents ) - yield tmp_extension_dir + yield tmp_extension_dir.name @pytest.fixture(scope="module") @@ -210,7 +223,7 @@ def graph_creation_extension_big_vertex_ids(): graph_creation_extension_big_vertex_ids_file_contents ) - yield tmp_extension_dir + yield tmp_extension_dir.name @pytest.fixture(scope="module") @@ -219,7 +232,7 @@ def graph_creation_extension_empty_graph(): graph_creation_extension_empty_graph_file_contents ) - yield tmp_extension_dir + yield tmp_extension_dir.name @pytest.fixture(scope="module") @@ -228,4 +241,14 @@ def graph_creation_extension_large_property_graph(): graph_creation_extension_large_property_graph_file_contents ) - yield tmp_extension_dir + yield tmp_extension_dir.name + + +# General (ie. not graph creation) extension + + +@pytest.fixture(scope="module") +def extension1(): + tmp_extension_dir = utils.create_tmp_extension_dir(extension1_file_contents) + + yield tmp_extension_dir.name diff --git a/python/cugraph_service/tests/test_cugraph_handler.py b/python/cugraph_service/tests/test_cugraph_handler.py index 6bfb761060c..e39a208fe32 100644 --- a/python/cugraph_service/tests/test_cugraph_handler.py +++ b/python/cugraph_service/tests/test_cugraph_handler.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pickle +from pathlib import Path import pytest @@ -47,8 +48,10 @@ def test_load_and_call_graph_creation_extension(graph_creation_extension2): handler.load_graph_creation_extensions(__file__) # Load the extension and call the function defined in it - num_files_read = handler.load_graph_creation_extensions(extension_dir.name) - assert num_files_read == 1 + ext_mod_names = handler.load_graph_creation_extensions(extension_dir) + assert len(ext_mod_names) == 1 + expected_mod_name = (Path(extension_dir) / "graph_creation_extension.py").as_posix() + assert ext_mod_names[0] == expected_mod_name # Private function should not be callable with pytest.raises(CugraphServiceError): @@ -78,32 +81,64 @@ def test_load_and_call_graph_creation_extension(graph_creation_extension2): assert "c" in edge_props -def test_load_and_unload_graph_creation_extension(graph_creation_extension2): +def test_load_and_unload_extensions(graph_creation_extension2, extension1): """ - Ensure extensions can be unloaded. + Ensure extensions can be loaded, run, and unloaded. """ from cugraph_service_server.cugraph_handler import CugraphHandler from cugraph_service_client.exceptions import CugraphServiceError handler = CugraphHandler() - extension_dir = graph_creation_extension2 + graph_creation_extension_dir = graph_creation_extension2 + extension_dir = extension1 - # Load the extensions and ensure it can be called. - handler.load_graph_creation_extensions(extension_dir.name) + # Loading + gc_ext_mod_names = handler.load_graph_creation_extensions( + graph_creation_extension_dir + ) + ext_mod_names = handler.load_extensions(extension_dir) + + # Running new_graph_ID = handler.call_graph_creation_extension( "my_graph_creation_function", "('a', 'b', 'c')", "{}" ) assert new_graph_ID in handler.get_graph_ids() - # Unload then try to run the same call again, which should fail - handler.unload_graph_creation_extensions() + results = handler.call_extension( + "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" + ) + # Check the ValueWrapper object + assert len(results.list_value) == 2 + assert len(results.list_value[0].list_value) == 33 + assert len(results.list_value[1].list_value) == 21 + assert type(results.list_value[0].list_value[0].int32_value) is int + assert type(results.list_value[1].list_value[0].double_value) is float + assert results.list_value[0].list_value[0].int32_value == 9 + assert results.list_value[1].list_value[0].double_value == 9.0 + + # Unloading + with pytest.raises(CugraphServiceError): + handler.unload_extension_module("invalid_module") + + for mod_name in gc_ext_mod_names: + handler.unload_extension_module(mod_name) with pytest.raises(CugraphServiceError): handler.call_graph_creation_extension( "my_graph_creation_function", "('a', 'b', 'c')", "{}" ) + handler.call_extension("my_nines_function", "(33, 'int32', 21, 'float64')", "{}") + + for mod_name in ext_mod_names: + handler.unload_extension_module(mod_name) + + with pytest.raises(CugraphServiceError): + handler.call_extension( + "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" + ) + def test_load_and_unload_graph_creation_extension_no_args(graph_creation_extension1): """ @@ -116,7 +151,7 @@ def test_load_and_unload_graph_creation_extension_no_args(graph_creation_extensi extension_dir = graph_creation_extension1 # Load the extensions and ensure it can be called. 
- handler.load_graph_creation_extensions(extension_dir.name) + handler.load_graph_creation_extensions(extension_dir) new_graph_ID = handler.call_graph_creation_extension( "custom_graph_creation_function", "()", "{}" ) @@ -136,7 +171,7 @@ def test_load_and_unload_graph_creation_extension_no_facade_arg( extension_dir = graph_creation_extension_no_facade_arg # Load the extensions and ensure it can be called. - handler.load_graph_creation_extensions(extension_dir.name) + handler.load_graph_creation_extensions(extension_dir) new_graph_ID = handler.call_graph_creation_extension( "graph_creation_function", "('a')", "{'arg2':33}" ) @@ -157,7 +192,7 @@ def test_load_and_unload_graph_creation_extension_bad_arg_order( extension_dir = graph_creation_extension_bad_arg_order # Load the extensions and ensure it can be called. - handler.load_graph_creation_extensions(extension_dir.name) + handler.load_graph_creation_extensions(extension_dir) with pytest.raises(CugraphServiceError): handler.call_graph_creation_extension( "graph_creation_function", "('a', 'b')", "{}" @@ -175,7 +210,7 @@ def test_get_graph_data_large_vertex_ids(graph_creation_extension_big_vertex_ids extension_dir = graph_creation_extension_big_vertex_ids # Load the extension and ensure it can be called. - handler.load_graph_creation_extensions(extension_dir.name) + handler.load_graph_creation_extensions(extension_dir) new_graph_id = handler.call_graph_creation_extension( "graph_creation_function_vert_and_edge_data_big_vertex_ids", "()", "{}" ) @@ -232,7 +267,7 @@ def test_get_graph_data_empty_graph(graph_creation_extension_empty_graph): extension_dir = graph_creation_extension_empty_graph # Load the extension and ensure it can be called. - handler.load_graph_creation_extensions(extension_dir.name) + handler.load_graph_creation_extensions(extension_dir) new_graph_id = handler.call_graph_creation_extension( "graph_creation_function", "()", "{}" ) diff --git a/python/cugraph_service/tests/test_e2e.py b/python/cugraph_service/tests/test_e2e.py index f9dae36cbc1..e1ee57745f4 100644 --- a/python/cugraph_service/tests/test_e2e.py +++ b/python/cugraph_service/tests/test_e2e.py @@ -13,6 +13,7 @@ # limitations under the License. from collections.abc import Sequence +from pathlib import Path import pytest @@ -84,12 +85,13 @@ def client_with_graph_creation_extension_loaded(client, graph_creation_extension """ server_extension_dir = graph_creation_extension1 - client.load_graph_creation_extensions(server_extension_dir.name) + extension_modnames = client.load_graph_creation_extensions(server_extension_dir) # yield control to the tests, cleanup on return yield client - client.unload_graph_creation_extensions() + for modname in extension_modnames: + client.unload_extension_module(modname) @pytest.fixture(scope="function") @@ -256,6 +258,25 @@ def test_extract_subgraph(client_with_edgelist_csv_loaded): assert Gid in client.get_graph_ids() +def test_call_graph_creation_extension(client_with_graph_creation_extension_loaded): + """ + Ensure the graph creation extension preloaded by the server fixture is + callable. 
+ """ + client = client_with_graph_creation_extension_loaded + + new_graph_id = client.call_graph_creation_extension( + "custom_graph_creation_function" + ) + + assert new_graph_id in client.get_graph_ids() + + # Inspect the PG and ensure it was created from + # custom_graph_creation_function + # FIXME: add client APIs to allow for a more thorough test of the graph + assert client.get_graph_info(["num_edges"], new_graph_id) == 3 + + def test_load_and_call_graph_creation_extension( client_with_graph_creation_extension_loaded, graph_creation_extension2 ): @@ -268,8 +289,10 @@ def test_load_and_call_graph_creation_extension( extension_dir = graph_creation_extension2 client = client_with_graph_creation_extension_loaded - num_files_loaded = client.load_graph_creation_extensions(extension_dir.name) - assert num_files_loaded == 1 + ext_mod_names = client.load_graph_creation_extensions(extension_dir) + assert len(ext_mod_names) == 1 + expected_mod_name = (Path(extension_dir) / "graph_creation_extension.py").as_posix() + assert ext_mod_names[0] == expected_mod_name new_graph_id = client.call_graph_creation_extension( "my_graph_creation_function", "a", "b", "c" @@ -295,15 +318,18 @@ def test_load_and_call_graph_creation_long_running_extension( ): """ Tests calling a user-defined server-side graph creation extension from the - cugraph_service client. + cugraph_service client. This uses a client of a server that already has an + extension loaded to ensure both can properly coexist. """ # The graph_creation_extension returns the tmp dir created which contains # the extension extension_dir = graph_creation_extension_long_running client = client_with_graph_creation_extension_loaded - num_files_loaded = client.load_graph_creation_extensions(extension_dir.name) - assert num_files_loaded == 1 + ext_mod_names = client.load_graph_creation_extensions(extension_dir) + assert len(ext_mod_names) == 1 + expected_mod_name = (Path(extension_dir) / "graph_creation_extension.py").as_posix() + assert ext_mod_names[0] == expected_mod_name new_graph_id = client.call_graph_creation_extension( "long_running_graph_creation_function" @@ -316,23 +342,35 @@ def test_load_and_call_graph_creation_long_running_extension( assert client.get_graph_info(["num_edges"], new_graph_id) == 0 -def test_call_graph_creation_extension(client_with_graph_creation_extension_loaded): +def test_load_call_unload_extension(client, extension1): """ - Ensure the graph creation extension preloaded by the server fixture is - callable. + Ensure extensions can be loaded, run, and unloaded. """ - client = client_with_graph_creation_extension_loaded + from cugraph_service_client.exceptions import CugraphServiceError - new_graph_id = client.call_graph_creation_extension( - "custom_graph_creation_function" - ) + extension_dir = extension1 - assert new_graph_id in client.get_graph_ids() + # Loading + ext_mod_names = client.load_extensions(extension_dir) - # Inspect the PG and ensure it was created from - # custom_graph_creation_function - # FIXME: add client APIs to allow for a more thorough test of the graph - assert client.get_graph_info(["num_edges"], new_graph_id) == 3 + # Running + # my_nines_function in extension1 returns a list of two lists of 9's with + # sizes and dtypes based on args. 
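+    # Args are repr()'d on the client and eval()'d on the server; the returned
+    # Value is converted back into nested Python lists for the caller.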
+ results = client.call_extension("my_nines_function", 33, "int32", 21, "float64") + assert len(results) == 2 + assert len(results[0]) == 33 + assert len(results[1]) == 21 + assert type(results[0][0]) == int + assert type(results[1][0]) == float + assert results[0][0] == 9 + assert results[1][0] == 9.0 + + # Unloading + for mod_name in ext_mod_names: + client.unload_extension_module(mod_name) + + with pytest.raises(CugraphServiceError): + client.call_extension("my_nines_function", 33, "int32", 21, "float64") def test_get_graph_vertex_data(client_with_property_csvs_loaded): diff --git a/python/cugraph_service/tests/test_mg_cugraph_handler.py b/python/cugraph_service/tests/test_mg_cugraph_handler.py index 00da2007ccf..b57995b3207 100644 --- a/python/cugraph_service/tests/test_mg_cugraph_handler.py +++ b/python/cugraph_service/tests/test_mg_cugraph_handler.py @@ -100,7 +100,7 @@ def test_get_graph_data_large_vertex_ids( extension_dir = graph_creation_extension_big_vertex_ids # Load the extension and ensure it can be called. - handler.load_graph_creation_extensions(extension_dir.name) + handler.load_graph_creation_extensions(extension_dir) new_graph_id = handler.call_graph_creation_extension( "graph_creation_function_vert_and_edge_data_big_vertex_ids", "()", "{}" ) @@ -158,7 +158,7 @@ def test_get_graph_data_empty_graph( extension_dir = graph_creation_extension_empty_graph # Load the extension and ensure it can be called. - handler.load_graph_creation_extensions(extension_dir.name) + handler.load_graph_creation_extensions(extension_dir) new_graph_id = handler.call_graph_creation_extension( "graph_creation_function", "()", "{}" ) diff --git a/python/cugraph_service/tests/test_mg_e2e.py b/python/cugraph_service/tests/test_mg_e2e.py index 2c1007514e5..bd36dec10dc 100644 --- a/python/cugraph_service/tests/test_mg_e2e.py +++ b/python/cugraph_service/tests/test_mg_e2e.py @@ -204,7 +204,7 @@ def client_of_sg_server_on_device_1_large_property_graph_loaded( client = client_of_sg_server_on_device_1 server_extension_dir = graph_creation_extension_large_property_graph - client.load_graph_creation_extensions(server_extension_dir.name) + ext_mod_names = client.load_graph_creation_extensions(server_extension_dir) # Assume fixture that starts server on device 1 has the extension loaded # for creating large property graphs. @@ -218,12 +218,12 @@ def client_of_sg_server_on_device_1_large_property_graph_loaded( yield (client, new_graph_id) client.delete_graph(new_graph_id) - client.unload_graph_creation_extensions() + for mod_name in ext_mod_names: + client.unload_extension_module(mod_name) -# Because pytest does not allow mixing fixtures and parametrization decorators -# for test functions, this fixture is parametrized for different device IDs to -# test against, and simply returns the param value to the test using it. +# This fixture is parametrized for different device IDs to test against, and +# simply returns the param value to the test using it. 
@pytest.fixture(scope="module", params=[None, 0], ids=lambda p: f"device={p}") def result_device_id(request): return request.param diff --git a/python/cugraph_service/tests/utils.py b/python/cugraph_service/tests/utils.py index 5e8dad02be6..99bc4d6e6d4 100644 --- a/python/cugraph_service/tests/utils.py +++ b/python/cugraph_service/tests/utils.py @@ -20,7 +20,7 @@ from pathlib import Path -def create_tmp_extension_dir(file_contents, file_name="graph_creation_extension.py"): +def create_tmp_extension_dir(file_contents, file_name="my_extension.py"): """ Create and return a temporary dir to be used as a dir containing extensions to be read by a cugraph_service server. file_contents is a string From 2572ed00917227a91ec71164d998ecfec15ad215 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 26 Oct 2022 00:04:01 -0500 Subject: [PATCH 02/41] Fixed tests broken by refactorings, removed dead code. --- .../cugraph_service_client/types.py | 11 ----------- .../tests/test_cugraph_handler.py | 2 +- python/cugraph_service/tests/test_e2e.py | 4 ++-- .../tests/test_mg_cugraph_handler.py | 16 +++++++--------- 4 files changed, 10 insertions(+), 23 deletions(-) diff --git a/python/cugraph_service/cugraph_service_client/types.py b/python/cugraph_service/cugraph_service_client/types.py index c1384ac83ed..505e499e849 100644 --- a/python/cugraph_service/cugraph_service_client/types.py +++ b/python/cugraph_service/cugraph_service_client/types.py @@ -106,17 +106,6 @@ def __init__(self, val, val_name="value"): f"following types: {self.valid_types}, got " f"{type(val)}" ) - """ - # Also add members with values matching the now complete self.union - # Value object. This will essentially duck-type this ValueWrapper - # instance and allow it to be returned to Thrift and treated as a Value - self.int32_value = self.union.int32_value - self.int64_value = self.union.int64_value - self.string_value = self.union.string_value - self.bool_value = self.union.bool_value - self.double_value = self.union.double_value - self.list_value = self.union.list_value - """ def __getattr__(self, attr): """ diff --git a/python/cugraph_service/tests/test_cugraph_handler.py b/python/cugraph_service/tests/test_cugraph_handler.py index e39a208fe32..4be3ce53226 100644 --- a/python/cugraph_service/tests/test_cugraph_handler.py +++ b/python/cugraph_service/tests/test_cugraph_handler.py @@ -50,7 +50,7 @@ def test_load_and_call_graph_creation_extension(graph_creation_extension2): # Load the extension and call the function defined in it ext_mod_names = handler.load_graph_creation_extensions(extension_dir) assert len(ext_mod_names) == 1 - expected_mod_name = (Path(extension_dir) / "graph_creation_extension.py").as_posix() + expected_mod_name = (Path(extension_dir) / "my_extension.py").as_posix() assert ext_mod_names[0] == expected_mod_name # Private function should not be callable diff --git a/python/cugraph_service/tests/test_e2e.py b/python/cugraph_service/tests/test_e2e.py index e1ee57745f4..e9d4da46e2a 100644 --- a/python/cugraph_service/tests/test_e2e.py +++ b/python/cugraph_service/tests/test_e2e.py @@ -291,7 +291,7 @@ def test_load_and_call_graph_creation_extension( ext_mod_names = client.load_graph_creation_extensions(extension_dir) assert len(ext_mod_names) == 1 - expected_mod_name = (Path(extension_dir) / "graph_creation_extension.py").as_posix() + expected_mod_name = (Path(extension_dir) / "my_extension.py").as_posix() assert ext_mod_names[0] == expected_mod_name new_graph_id = client.call_graph_creation_extension( @@ -328,7 +328,7 @@ def 
test_load_and_call_graph_creation_long_running_extension( ext_mod_names = client.load_graph_creation_extensions(extension_dir) assert len(ext_mod_names) == 1 - expected_mod_name = (Path(extension_dir) / "graph_creation_extension.py").as_posix() + expected_mod_name = (Path(extension_dir) / "my_extension.py").as_posix() assert ext_mod_names[0] == expected_mod_name new_graph_id = client.call_graph_creation_extension( diff --git a/python/cugraph_service/tests/test_mg_cugraph_handler.py b/python/cugraph_service/tests/test_mg_cugraph_handler.py index b57995b3207..5162f5b3c2e 100644 --- a/python/cugraph_service/tests/test_mg_cugraph_handler.py +++ b/python/cugraph_service/tests/test_mg_cugraph_handler.py @@ -214,7 +214,6 @@ def test_get_graph_info(handler_with_karate_edgelist_loaded): get_graph_info() for specific args. """ from cugraph_service_client import defaults - from cugraph_service_client.types import ValueWrapper (handler, test_data) = handler_with_karate_edgelist_loaded @@ -224,11 +223,11 @@ def test_get_graph_info(handler_with_karate_edgelist_loaded): info = handler.get_graph_info( ["num_edges", "num_edge_properties"], defaults.graph_id ) - # info is a dictionary containing cugraph_service_client.types.Value objs, - # so access the int32 member directly for easy comparison. + # info is a dictionary containing cugraph_service_client.types.ValueWrapper + # objs, so access the int32 member directly for easy comparison. shape = ( - ValueWrapper(info["num_edges"]).get_py_obj(), - ValueWrapper(info["num_edge_properties"]).get_py_obj(), + info["num_edges"].get_py_obj(), + info["num_edge_properties"].get_py_obj(), ) assert shape == (156, 1) # The single edge property is the weight @@ -236,8 +235,8 @@ def test_get_graph_info(handler_with_karate_edgelist_loaded): ["num_vertices_from_vertex_data", "num_vertex_properties"], defaults.graph_id ) shape = ( - ValueWrapper(info["num_vertices_from_vertex_data"]).get_py_obj(), - ValueWrapper(info["num_vertex_properties"]).get_py_obj(), + info["num_vertices_from_vertex_data"].get_py_obj(), + info["num_vertex_properties"].get_py_obj(), ) assert shape == (0, 0) @@ -248,7 +247,6 @@ def test_get_graph_info_defaults(mg_handler): keys present for an empty default graph. 
""" from cugraph_service_client import defaults - from cugraph_service_client.types import ValueWrapper handler = mg_handler @@ -261,7 +259,7 @@ def test_get_graph_info_defaults(mg_handler): "num_vertex_properties": 0, "num_edge_properties": 0, } - actual = {key: ValueWrapper(val).get_py_obj() for (key, val) in info.items()} + actual = {key: val.get_py_obj() for (key, val) in info.items()} assert expected == actual From 203da90c1809e7d4f44020447730e96035bdf843 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 13:30:30 -0700 Subject: [PATCH 03/41] Remove remove_pg_dependency_from_cugraph_store.py --- .../gnn/dgl_extensions/cugraph_store.py | 149 ++++++------------ .../gnn/dgl_extensions/utils/sampling.py | 105 +++++++++++- 2 files changed, 148 insertions(+), 106 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 76e60e0d25c..8b1a1bdcc28 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -20,20 +20,14 @@ from functools import cached_property from .utils.add_data import _update_feature_map -from .utils.sampling import sample_multiple_sgs, sample_single_sg -from .utils.sampling import ( - get_subgraph_and_src_range_from_edgelist, - get_underlying_dtype_from_sg, -) -from .utils.sampling import create_dlpack_d +from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg +from .utils.sampling import get_underlying_dtype_from_sg from .feature_storage import CuFeatureStorage src_n = PropertyGraph.src_col_name dst_n = PropertyGraph.dst_col_name type_n = PropertyGraph.type_col_name -eid_n = PropertyGraph.edge_id_col_name -vid_n = PropertyGraph.vertex_col_name class CuGraphStore: @@ -101,7 +95,6 @@ def add_node_data( self.ndata_feat_col_d, feat_name, contains_vector_features, columns ) # Clear properties if set as data has changed - self.__clear_cached_properties() def add_edge_data( @@ -168,7 +161,6 @@ def get_node_storage(self, key, ntype=None, indices_offset=0): ) columns = self.ndata_feat_col_d[key] - return CuFeatureStorage( pg=self.gdata, columns=columns, @@ -214,15 +206,14 @@ def num_edges(self, etype=None): def has_multiple_etypes(self): return len(self.etypes) > 1 - @property + @cached_property def ntypes(self): return sorted(self.gdata.vertex_types) - @property + @cached_property def etypes(self): return sorted(self.gdata.edge_types) - @property def is_mg(self): return isinstance(self.gdata, MGPropertyGraph) @@ -276,76 +267,36 @@ def sample_neighbors( f"edge_dir must be either 'in' or 'out' got {edge_dir} instead" ) - if isinstance(nodes_cap, dict): - nodes = {t: cudf.from_dlpack(n) for t, n in nodes_cap.items()} - else: - nodes = cudf.from_dlpack(nodes_cap) - - if self.is_mg: - sample_f = cugraph.dask.uniform_neighbor_sample - else: - sample_f = cugraph.uniform_neighbor_sample - if self.has_multiple_etypes: # TODO: Convert into a single call when # https://github.com/rapidsai/cugraph/issues/2696 lands if edge_dir == "in": - sgs = self.extracted_reverse_subgraphs_per_type + sgs_obj, sgs_src_range_obj = self.extracted_reverse_subgraphs_per_type else: - sgs = self.extracted_subgraphs_per_type - # Uniform sampling fails when the dtype - # of the seed dtype is not same as the node dtype - - self.set_sg_node_dtype(list(sgs.values())[0][0]) - sampled_df = sample_multiple_sgs( - sgs, - sample_f, - nodes, - self._sg_node_dtype, - edge_dir, - fanout, - replace, - ) + sgs_obj, 
sgs_src_range_obj = self.extracted_subgraphs_per_type + first_sg = list(sgs_obj.values())[0] else: if edge_dir == "in": - sg, start_list_range = self.extracted_reverse_subgraph + sgs_obj, sgs_src_range_obj = self.extracted_reverse_subgraph else: - sg, start_list_range = self.extracted_subgraph - self.set_sg_node_dtype(sg) - sampled_df = sample_single_sg( - sg, - sample_f, - nodes, - self._sg_node_dtype, - start_list_range, - fanout, - replace, - ) - - # we reverse directions when directions=='in' - if edge_dir == "in": - sampled_df = sampled_df.rename( - columns={"destinations": src_n, "sources": dst_n} - ) - else: - sampled_df = sampled_df.rename( - columns={"sources": src_n, "destinations": dst_n} - ) - # Transfer data to client - if isinstance(sampled_df, dask_cudf.DataFrame): - sampled_df = sampled_df.compute() - - if self.has_multiple_etypes: - # Heterogeneous graph case - d = self._get_edgeid_type_d(sampled_df["indices"], self.etypes) - d = create_dlpack_d(d) - return d - else: - return ( - sampled_df[src_n].to_dlpack(), - sampled_df[dst_n].to_dlpack(), - sampled_df["indices"].to_dlpack(), - ) + sgs_obj, sgs_src_range_obj = self.extracted_subgraph + + first_sg = sgs_obj + # Uniform sampling fails when the dtype + # of the seed dtype is not same as the node dtype + self.set_sg_node_dtype(first_sg) + return sample_pg( + self.gdata, + has_multiple_etypes=self.has_multiple_etypes, + etypes=self.etypes, + sgs_obj=sgs_obj, + sgs_src_range_obj=sgs_src_range_obj, + sg_node_dtype=self._sg_node_dtype, + nodes_cap=nodes_cap, + replace=replace, + fanout=fanout, + edge_dir=edge_dir, + ) ###################################### # Utilities @@ -357,55 +308,37 @@ def num_vertices(self): def get_vertex_ids(self): return self.gdata.vertices_ids() - def _get_edgeid_type_d(self, edge_ids, etypes): - if isinstance(edge_ids, cudf.Series): - # Work around for below issue - # https://github.com/rapidsai/cudf/issues/11877 - edge_ids = edge_ids.values_host - df = self.gdata.get_edge_data(edge_ids=edge_ids, columns=[type_n]) - if isinstance(df, dask_cudf.DataFrame): - df = df.compute() - return {etype: df[df[type_n] == etype] for etype in etypes} - @cached_property def extracted_subgraph(self): - edge_list = self.gdata.get_edge_data(columns=[src_n, dst_n, type_n]) - edge_list = edge_list.reset_index(drop=True) - - return get_subgraph_and_src_range_from_edgelist( - edge_list, self.is_mg, reverse_edges=False + return get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=False, etype=None ) @cached_property def extracted_reverse_subgraph(self): - edge_list = self.gdata.get_edge_data(columns=[src_n, dst_n, type_n]) - return get_subgraph_and_src_range_from_edgelist( - edge_list, self.is_mg, reverse_edges=True + return get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=True, etype=None ) @cached_property def extracted_subgraphs_per_type(self): sg_d = {} + sg_src_range_d = {} for etype in self.etypes: - edge_list = self.gdata.get_edge_data( - columns=[src_n, dst_n, type_n], types=[etype] - ) - sg_d[etype] = get_subgraph_and_src_range_from_edgelist( - edge_list, self.is_mg, reverse_edges=False + sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=False, etype=etype ) - return sg_d + return sg_d, sg_src_range_d @cached_property def extracted_reverse_subgraphs_per_type(self): sg_d = {} + sg_src_range_d = {} for etype in self.etypes: - edge_list = self.gdata.get_edge_data( - columns=[src_n, dst_n, type_n], types=[etype] + sg_d[etype], 
sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=True, etype=etype ) - sg_d[etype] = get_subgraph_and_src_range_from_edgelist( - edge_list, self.is_mg, reverse_edges=True - ) - return sg_d + return sg_d, sg_src_range_d @cached_property def num_nodes_dict(self): @@ -489,6 +422,12 @@ def __clear_cached_properties(self): if "has_multiple_etypes" in self.__dict__: del self.has_multiple_etypes + if "etypes" in self.__dict__: + del self.etypes + + if "ntypes" in self.__dict__: + del self.ntypes + if "num_nodes_dict" in self.__dict__: del self.num_nodes_dict diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py index 0e12371271d..eaee2414bb3 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py @@ -18,6 +18,8 @@ import cupy as cp import dask_cudf from cugraph.experimental import PropertyGraph +from cugraph.experimental import MGPropertyGraph + src_n = PropertyGraph.src_col_name dst_n = PropertyGraph.dst_col_name @@ -26,6 +28,20 @@ vid_n = PropertyGraph.vertex_col_name +def get_subgraph_and_src_range_from_pg(pg, reverse_edges, etype=None): + if etype: + edge_list = pg.get_edge_data(columns=[src_n, dst_n, type_n], types=[etype]) + else: + edge_list = pg.get_edge_data(columns=[src_n, dst_n, type_n]) + + edge_list = edge_list.reset_index(drop=True) + + is_mg = isinstance(pg, MGPropertyGraph) + return get_subgraph_and_src_range_from_edgelist( + edge_list, is_mg, reverse_edges=reverse_edges + ) + + def get_subgraph_and_src_range_from_edgelist(edge_list, is_mg, reverse_edges=False): if reverse_edges: edge_list = edge_list.rename(columns={src_n: dst_n, dst_n: src_n}) @@ -63,6 +79,7 @@ def get_subgraph_and_src_range_from_edgelist(edge_list, is_mg, reverse_edges=Fal def sample_multiple_sgs( sgs, + sgs_src_range_obj, sample_f, start_list_d, start_list_dtype, @@ -72,7 +89,8 @@ def sample_multiple_sgs( ): start_list_types = list(start_list_d.keys()) output_dfs = [] - for can_etype, (sg, start_list_range) in sgs.items(): + for can_etype, sg in sgs.items(): + start_list_range = sgs_src_range_obj[can_etype] can_etype = _convert_can_etype_s_to_tup(can_etype) if _edge_types_contains_canonical_etype(can_etype, start_list_types, edge_dir): if edge_dir == "in": @@ -179,3 +197,88 @@ def get_underlying_dtype_from_sg(sg): raise ValueError(f"Source column {src_n} not found in the subgraph") return sg_node_dtype + + +def get_edgeid_type_d(pg, edge_ids, etypes): + if isinstance(edge_ids, cudf.Series): + # Work around for below issue + # https://github.com/rapidsai/cudf/issues/11877 + edge_ids = edge_ids.values_host + df = pg.get_edge_data(edge_ids=edge_ids, columns=[type_n]) + if isinstance(df, dask_cudf.DataFrame): + df = df.compute() + return {etype: df[df[type_n] == etype] for etype in etypes} + + +def sample_pg( + pg, + has_multiple_etypes, + etypes, + sgs_obj, + sgs_src_range_obj, + sg_node_dtype, + nodes_cap, + replace, + fanout, + edge_dir, +): + if isinstance(nodes_cap, dict): + nodes = {t: cudf.from_dlpack(n) for t, n in nodes_cap.items()} + else: + nodes = cudf.from_dlpack(nodes_cap) + + if isinstance(pg, MGPropertyGraph): + sample_f = cugraph.dask.uniform_neighbor_sample + else: + sample_f = cugraph.uniform_neighbor_sample + + if has_multiple_etypes: + # TODO: Convert into a single call when + # https://github.com/rapidsai/cugraph/issues/2696 lands + # Uniform sampling fails when the dtype + # of the seed dtype 
is not same as the node dtype + sampled_df = sample_multiple_sgs( + sgs=sgs_obj, + sgs_src_range_obj=sgs_src_range_obj, + start_list_dtype=sg_node_dtype, + sample_f=sample_f, + start_list_d=nodes, + edge_dir=edge_dir, + fanout=fanout, + with_replacement=replace, + ) + else: + sampled_df = sample_single_sg( + sg=sgs_obj, + start_list_range=sgs_src_range_obj, + start_list_dtype=sg_node_dtype, + sample_f=sample_f, + start_list=nodes, + fanout=fanout, + with_replacement=replace, + ) + + # we reverse directions when directions=='in' + if edge_dir == "in": + sampled_df = sampled_df.rename( + columns={"destinations": src_n, "sources": dst_n} + ) + else: + sampled_df = sampled_df.rename( + columns={"sources": src_n, "destinations": dst_n} + ) + # Transfer data to client + if isinstance(sampled_df, dask_cudf.DataFrame): + sampled_df = sampled_df.compute() + + if has_multiple_etypes: + # Heterogeneous graph case + d = get_edgeid_type_d(pg, sampled_df["indices"], etypes) + d = create_dlpack_d(d) + return d + else: + return ( + sampled_df[src_n].to_dlpack(), + sampled_df[dst_n].to_dlpack(), + sampled_df["indices"].to_dlpack(), + ) From b19fab1eb6936e42234709616b07bb978d2c818e Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 26 Oct 2022 15:43:29 -0500 Subject: [PATCH 04/41] Added tests for extension modules that access the server and return None, added test for extension modules returning results to client GPUs (code not done yet). --- .../cugraph_service_client/types.py | 2 + .../cugraph_service_server/cugraph_handler.py | 9 +++- python/cugraph_service/tests/conftest.py | 44 ++++++++++++++++ .../tests/test_cugraph_handler.py | 42 ++++++++++++++- python/cugraph_service/tests/test_e2e.py | 17 ++++++ python/cugraph_service/tests/test_mg_e2e.py | 52 ++++++++++++++++++- 6 files changed, 162 insertions(+), 4 deletions(-) diff --git a/python/cugraph_service/cugraph_service_client/types.py b/python/cugraph_service/cugraph_service_client/types.py index 505e499e849..613e29e94d1 100644 --- a/python/cugraph_service/cugraph_service_client/types.py +++ b/python/cugraph_service/cugraph_service_client/types.py @@ -100,6 +100,8 @@ def __init__(self, val, val_name="value"): ): # self.union = Value(list_value=val.tolist()) self.union = Value(list_value=[ValueWrapper(i) for i in val.tolist()]) + elif val is None: + self.union = Value() else: raise TypeError( f"{val_name} must be one of the " diff --git a/python/cugraph_service/cugraph_service_server/cugraph_handler.py b/python/cugraph_service/cugraph_service_server/cugraph_handler.py index d4ee9ca9087..6cfe126e2dc 100644 --- a/python/cugraph_service/cugraph_service_server/cugraph_handler.py +++ b/python/cugraph_service/cugraph_service_server/cugraph_handler.py @@ -118,6 +118,12 @@ def get_server_info(self): for (k, v) in self.__handler.get_server_info().items() } + def get_graph_ids(self): + return self.__handler.get_graph_ids() + + def get_graph(self, graph_id): + return self.__handler._get_graph(graph_id) + class CugraphHandler: """ @@ -794,7 +800,8 @@ def get_graph_type(self, graph_id): ########################################################################### # "Protected" interface - used for both implementation and test/debug. Will - # not be exposed to a cugraph_service client. + # not be exposed to a cugraph_service client, but will be used by extensions + # via the ExtensionServerFacade. def _get_graph(self, graph_id): """ Return the cuGraph Graph object associated with graph_id. 
diff --git a/python/cugraph_service/tests/conftest.py b/python/cugraph_service/tests/conftest.py index 870f347f328..519a7d39536 100644 --- a/python/cugraph_service/tests/conftest.py +++ b/python/cugraph_service/tests/conftest.py @@ -168,6 +168,32 @@ def my_nines_function(array1_size, array1_dtype, array2_size, array2_dtype): """ +extension_with_facade_file_contents = """ +import cupy + +def my_extension(arg1, arg2, server): + + # This extension assumes the server already has a single PG loaded via + # calling graph_creation_extension1 + gid = server.get_graph_ids()[0] + pG = server.get_graph(gid) + + edge_df = pG.get_edge_data() + + # Do an arbitrary operation on the PG based on the args, and return the + # result as a cupy array. + + retval = cupy.array(edge_df[pG.edge_id_col_name] + arg1 + arg2) + return retval +""" + +extension_returns_none_file_contents = """ + +def my_extension(): + return None +""" + + ############################################################################### # module scope fixtures @@ -252,3 +278,21 @@ def extension1(): tmp_extension_dir = utils.create_tmp_extension_dir(extension1_file_contents) yield tmp_extension_dir.name + + +@pytest.fixture(scope="module") +def extension_with_facade(): + tmp_extension_dir = utils.create_tmp_extension_dir( + extension_with_facade_file_contents + ) + + yield tmp_extension_dir.name + + +@pytest.fixture(scope="module") +def extension_returns_none(): + tmp_extension_dir = utils.create_tmp_extension_dir( + extension_returns_none_file_contents + ) + + yield tmp_extension_dir.name diff --git a/python/cugraph_service/tests/test_cugraph_handler.py b/python/cugraph_service/tests/test_cugraph_handler.py index 4be3ce53226..ebc26e5a796 100644 --- a/python/cugraph_service/tests/test_cugraph_handler.py +++ b/python/cugraph_service/tests/test_cugraph_handler.py @@ -108,7 +108,8 @@ def test_load_and_unload_extensions(graph_creation_extension2, extension1): results = handler.call_extension( "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" ) - # Check the ValueWrapper object + # results is a ValueWrapper object which Thrift will understand to be a + # Value, which it can serialize. Check the ValueWrapper object here. assert len(results.list_value) == 2 assert len(results.list_value[0].list_value) == 33 assert len(results.list_value[1].list_value) == 21 @@ -140,6 +141,45 @@ def test_load_and_unload_extensions(graph_creation_extension2, extension1): ) +def test_extension_with_facade_graph_access( + graph_creation_extension1, extension_with_facade +): + """ + Creates a Graph then calls an extension that accesses the graph in order to + return data. + """ + from cugraph_service_server.cugraph_handler import CugraphHandler + + handler = CugraphHandler() + gc_extension_dir = graph_creation_extension1 + extension_dir = extension_with_facade + + # Load the extensions - use the graph creation extension to create a known PG + # for use by the extension being tested. + handler.load_graph_creation_extensions(gc_extension_dir) + handler.load_extensions(extension_dir) + + new_graph_ID = handler.call_graph_creation_extension( + "custom_graph_creation_function", "()", "{}" + ) + assert new_graph_ID in handler.get_graph_ids() + + val1 = 33 + val2 = 22.1 + + # Call the extension under test, it will access the PG loaded above to return + # results. This extension just adds val1 + val2 to each edge ID. 
+ results = handler.call_extension("my_extension", f"({val1}, {val2})", "{}") + + # results is a ValueWrapper object which Thrift will understand to be a Value, which + # it can serialize. Check the ValueWrapper object here, it should contain the 3 edge + # IDs starting from 0 with the values added to each. + assert len(results.list_value) == 3 + assert results.list_value[0].double_value == 0 + val1 + val2 + assert results.list_value[1].double_value == 1 + val1 + val2 + assert results.list_value[2].double_value == 2 + val1 + val2 + + def test_load_and_unload_graph_creation_extension_no_args(graph_creation_extension1): """ Test graph_creation_extension1 which contains an extension with no args. diff --git a/python/cugraph_service/tests/test_e2e.py b/python/cugraph_service/tests/test_e2e.py index e9d4da46e2a..a5bead3c4be 100644 --- a/python/cugraph_service/tests/test_e2e.py +++ b/python/cugraph_service/tests/test_e2e.py @@ -373,6 +373,23 @@ def test_load_call_unload_extension(client, extension1): client.call_extension("my_nines_function", 33, "int32", 21, "float64") +def test_extension_returns_none(client, extension_returns_none): + """ + Ensures an extension that returns None is handled + """ + extension_dir = extension_returns_none + + ext_mod_names = client.load_extensions(extension_dir) + + result = client.call_extension("my_extension") + assert result is None + + # FIXME: much of this test could be in a fixture which ensures the extension + # is unloaded from the server before returning + for mod_name in ext_mod_names: + client.unload_extension_module(mod_name) + + def test_get_graph_vertex_data(client_with_property_csvs_loaded): (client, test_data) = client_with_property_csvs_loaded diff --git a/python/cugraph_service/tests/test_mg_e2e.py b/python/cugraph_service/tests/test_mg_e2e.py index bd36dec10dc..7649787dfb7 100644 --- a/python/cugraph_service/tests/test_mg_e2e.py +++ b/python/cugraph_service/tests/test_mg_e2e.py @@ -249,7 +249,6 @@ def test_get_default_graph_info(client_of_mg_server_with_edgelist_csv_loaded): def test_get_edge_IDs_for_vertices(client_of_mg_server_with_edgelist_csv_loaded): - """ """ (client_of_mg_server, test_data) = client_of_mg_server_with_edgelist_csv_loaded # get_graph_type() is a test/debug API which returns a string repr of the @@ -293,7 +292,7 @@ def test_device_transfer( assert bytes_returned.device == device_n -def test_uniform_neighbor_sampling_result_device( +def test_uniform_neighbor_sampling_result_on_device( benchmark, result_device_id, client_of_sg_server_on_device_1_large_property_graph_loaded, @@ -329,3 +328,52 @@ def test_uniform_neighbor_sampling_result_device( assert dtype is cp.ndarray device_n = cp.cuda.Device(result_device_id) assert result.sources.device == device_n + + +def test_call_extension_result_on_device( + benchmark, extension1, result_device_id, client_of_sg_server_on_device_1 +): + client = client_of_sg_server_on_device_1 + extension_dir = extension1 + array1_len = 33 + array2_len = 21 + + # Loading + ext_mod_names = client.load_extensions(extension_dir) + + # Running + # my_nines_function in extension1 returns a list of two lists of 9's with + # sizes and dtypes based on args. 
+ results = client.call_extension( + "my_nines_function", + array1_len, + "int32", + array2_len, + "float64", + result_device=result_device_id, + ) + if result_device_id is None: + assert len(results) == 2 + assert len(results[0]) == array1_len + assert len(results[1]) == array2_len + assert type(results[0][0]) == int + assert type(results[1][0]) == float + assert results[0][0] == 9 + assert results[1][0] == 9.0 + else: + # results will be a n-tuple where n is the number of arrays returned. The + # n-tuple contains each array as a device array on result_device_id. + assert isinstance(results, tuple) + assert len(results) == 2 + + device_n = cp.cuda.Device(result_device_id) + assert isinstance(results[0], cp.ndarray) + assert results[0].device == device_n + assert results[0].tolist() == [9] * array1_len + + assert isinstance(results[1], cp.ndarray) + assert results[1].device == device_n + assert results[1].tolist() == [9.0] * array2_len + + for mod_name in ext_mod_names: + client.unload_extension_module(mod_name) From 51fb38f3677020991f8eddcfb16c0915282a2368 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 14:18:59 -0700 Subject: [PATCH 05/41] Remove remove_pg_dependency_from_cugraph_store.py --- .../gnn/dgl_extensions/cugraph_store.py | 38 ++++--------------- .../gnn/dgl_extensions/utils/find_edges.py | 15 ++++++++ .../gnn/dgl_extensions/utils/node_subgraph.py | 36 ++++++++++++++++++ 3 files changed, 58 insertions(+), 31 deletions(-) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 8b1a1bdcc28..708890c4ad8 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -13,23 +13,15 @@ from collections import defaultdict -import cudf -import dask_cudf -import cugraph -from cugraph.experimental import PropertyGraph, MGPropertyGraph from functools import cached_property - +from .utils.find_edges import find_edges +from .utils.node_subgraph import node_subgraph from .utils.add_data import _update_feature_map from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg from .utils.sampling import get_underlying_dtype_from_sg from .feature_storage import CuFeatureStorage -src_n = PropertyGraph.src_col_name -dst_n = PropertyGraph.dst_col_name -type_n = PropertyGraph.type_col_name - - class CuGraphStore: """ A wrapper around a cuGraph Property Graph that @@ -40,6 +32,8 @@ class CuGraphStore: """ def __init__(self, graph, backend_lib="torch"): + from cugraph.experimental import PropertyGraph, MGPropertyGraph + if isinstance(graph, (PropertyGraph, MGPropertyGraph)): self.__G = graph else: @@ -214,9 +208,6 @@ def ntypes(self): def etypes(self): return sorted(self.gdata.edge_types) - def is_mg(self): - return isinstance(self.gdata, MGPropertyGraph) - @property def gdata(self): return self.__G @@ -375,18 +366,12 @@ def find_edges(self, edge_ids_cap, etype): DLPack capsule The dst nodes for the given ids """ - edge_ids = cudf.from_dlpack(edge_ids_cap) - subset_df = self.gdata.get_edge_data( - edge_ids=edge_ids, columns=type_n, types=[etype] - ) - if isinstance(subset_df, dask_cudf.DataFrame): - subset_df = subset_df.compute() - return subset_df[src_n].to_dlpack(), subset_df[dst_n].to_dlpack() + return find_edges(edge_ids_cap, etype) def 
node_subgraph( self, nodes=None, - create_using=cugraph.MultiGraph, + create_using=None, ): """ Return a subgraph induced on the given nodes. @@ -405,16 +390,7 @@ def node_subgraph( The sampled subgraph with the same node ID space with the original graph. """ - _g = self.gdata.extract_subgraph( - create_using=create_using, check_multi_edges=True - ) - - if nodes is None: - return _g - else: - _n = cudf.Series(nodes) - _subg = cugraph.subgraph(_g, _n) - return _subg + return node_subgraph(self.gdata, nodes, create_using) def __clear_cached_properties(self): # Check for cached properties using self.__dict__ because calling diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py new file mode 100644 index 00000000000..e0279417a45 --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py @@ -0,0 +1,15 @@ +import cudf +import dask_cudf +from cugraph.experimental import PropertyGraph + +src_n = PropertyGraph.src_col_name +dst_n = PropertyGraph.dst_col_name +type_n = PropertyGraph.type_col_name + + +def find_edges(pg, edge_ids_cap, etype): + edge_ids = cudf.from_dlpack(edge_ids_cap) + subset_df = pg.get_edge_data(edge_ids=edge_ids, columns=type_n, types=[etype]) + if isinstance(subset_df, dask_cudf.DataFrame): + subset_df = subset_df.compute() + return subset_df[src_n].to_dlpack(), subset_df[dst_n].to_dlpack() diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py new file mode 100644 index 00000000000..32906bf1e31 --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py @@ -0,0 +1,36 @@ +import cugraph +import cudf + + +def node_subgraph( + pg, + nodes=None, + create_using=cugraph.MultiGraph, +): + """ + Return a subgraph induced on the given nodes. + + A node-induced subgraph is a graph with edges whose endpoints are both + in the specified node set. + + Parameters + ---------- + pg: Property Graph + The graph to create subgraph from + nodes : Tensor + The nodes to form the subgraph. + Returns + ------- + cuGraph + The sampled subgraph with the same node ID space with the original + graph. 
+ """ + + _g = pg.extract_subgraph(create_using=create_using, check_multi_edges=True) + + if nodes is None: + return _g + else: + _n = cudf.Series(nodes) + _subg = cugraph.subgraph(_g, _n) + return _subg From d528c0c22dbdec6cc415452f7c8fbd30d089b07e Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 15:30:00 -0700 Subject: [PATCH 06/41] Move code to helper functions to make them callable from extensions --- .../gnn/dgl_extensions/cugraph_store.py | 34 +++++++++++++++++-- .../gnn/dgl_extensions/feature_storage.py | 5 ++- .../gnn/dgl_extensions/utils/sampling.py | 28 +++++++-------- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 708890c4ad8..f2b7e7d5672 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -32,9 +32,8 @@ class CuGraphStore: """ def __init__(self, graph, backend_lib="torch"): - from cugraph.experimental import PropertyGraph, MGPropertyGraph - if isinstance(graph, (PropertyGraph, MGPropertyGraph)): + if type(graph).__name__ in ["PropertyGraph", "MGPropertyGraph"]: self.__G = graph else: raise ValueError("graph must be a PropertyGraph or MGPropertyGraph") @@ -276,7 +275,9 @@ def sample_neighbors( # Uniform sampling fails when the dtype # of the seed dtype is not same as the node dtype self.set_sg_node_dtype(first_sg) - return sample_pg( + + # Below will be called from dict + sampled_result_arrays = sample_pg( self.gdata, has_multiple_etypes=self.has_multiple_etypes, etypes=self.etypes, @@ -288,6 +289,7 @@ def sample_neighbors( fanout=fanout, edge_dir=edge_dir, ) + return create_dlpack_results_from_arrays(sampled_result_arrays, self.etypes) ###################################### # Utilities @@ -421,3 +423,29 @@ def __clear_cached_properties(self): if "extracted_reverse_subgraphs_per_type" in self.__dict__: del self.extracted_reverse_subgraphs_per_type + + +def create_dlpack_results_from_arrays(sampled_result_arrays, etypes: list[str]): + # TODO: Extend to pytorch/numpy/etc + import cupy as cp + + if len(etypes) <= 1: + s, d, e_id = sampled_result_arrays + # Handle numpy array, cupy array, lists etc + s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) + return s.toDlpack(), d.toDlpack(), e_id.toDlpack() + else: + result_d = {} + array_start_offset = 0 + for etype in etypes: + s = sampled_result_arrays[array_start_offset] + d = sampled_result_arrays[array_start_offset + 1] + e_id = sampled_result_arrays[array_start_offset + 2] + s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) + array_start_offset = array_start_offset + 3 + if s is not None and len(s) >= 0: + s, d, e_id = s.toDlpack(), d.toDlpack(), e_id.toDlpack() + else: + s, d, e_id = None, None, None + result_d[etype] = (s, d, e_id) + return result_d diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index 244dfa8b621..207132748e4 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -15,7 +15,6 @@ import cudf import dask_cudf import cupy as cp -from cugraph.experimental import MGPropertyGraph class CuFeatureStorage: @@ -36,7 +35,7 @@ def __init__( from cupy import from_dlpack else: raise NotImplementedError( - f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy') " + f"Only PyTorch 
('torch'), TensorFlow ('tf'), and CuPy ('cupy') g" f"backends are currently supported, got {backend_lib=}" ) if storage_type not in ["edge", "node"]: @@ -67,7 +66,7 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): # Default implementation uses synchronous fetch. indices = cp.asarray(indices) - if isinstance(self.pg, MGPropertyGraph): + if type(self.pg).__name__ in "MGPropertyGraph": # dask_cudf loc breaks if we provide cudf series/cupy array # https://github.com/rapidsai/cudf/issues/11877 indices = indices.get() diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py index eaee2414bb3..460b44ee3b1 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py @@ -165,19 +165,18 @@ def _convert_can_etype_s_to_tup(canonical_etype_s): return (src_type, etype, dst_type) -def create_dlpack_d(d): - dlpack_d = {} +def create_cp_result_ls(d): + cupy_result_ls = [] for k, df in d.items(): if len(df) == 0: - dlpack_d[k] = (None, None, None) + cupy_result_ls.append(cp.empty(shape=0, dtype=cp.int32)) + cupy_result_ls.append(cp.empty(shape=0, dtype=cp.int32)) + cupy_result_ls.append(cp.empty(shape=0, dtype=cp.int32)) else: - dlpack_d[k] = ( - df[src_n].to_dlpack(), - df[dst_n].to_dlpack(), - df[eid_n].to_dlpack(), - ) - - return dlpack_d + cupy_result_ls.append(df[src_n].values) + cupy_result_ls.append(df[dst_n].values) + cupy_result_ls.append(df[eid_n].values) + return cupy_result_ls def get_underlying_dtype_from_sg(sg): @@ -274,11 +273,10 @@ def sample_pg( if has_multiple_etypes: # Heterogeneous graph case d = get_edgeid_type_d(pg, sampled_df["indices"], etypes) - d = create_dlpack_d(d) - return d + return create_cp_result_ls(d) else: return ( - sampled_df[src_n].to_dlpack(), - sampled_df[dst_n].to_dlpack(), - sampled_df["indices"].to_dlpack(), + sampled_df[src_n].values, + sampled_df[dst_n].values, + sampled_df["indices"].values, ) From a18becd8a7094898f698ba847e3c2579c30becda Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 15:44:46 -0700 Subject: [PATCH 07/41] Fix typos --- python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py | 2 +- python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index f2b7e7d5672..4f3bacbb05a 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -276,7 +276,7 @@ def sample_neighbors( # of the seed dtype is not same as the node dtype self.set_sg_node_dtype(first_sg) - # Below will be called from dict + # Below will be called from remote storage sampled_result_arrays = sample_pg( self.gdata, has_multiple_etypes=self.has_multiple_etypes, diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index 207132748e4..5682a832b2c 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -35,7 +35,7 @@ def __init__( from cupy import from_dlpack else: raise NotImplementedError( - f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy') g" + f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy')" f"backends are currently supported, got 
{backend_lib=}" ) if storage_type not in ["edge", "node"]: From 3b6ef598837738db3e0b44788807052ece742d51 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 15:49:44 -0700 Subject: [PATCH 08/41] Added copyrights --- .../cugraph/gnn/dgl_extensions/utils/find_edges.py | 14 ++++++++++++++ .../gnn/dgl_extensions/utils/node_subgraph.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py index e0279417a45..0e5b7f3f561 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py @@ -1,3 +1,17 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import cudf import dask_cudf from cugraph.experimental import PropertyGraph diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py index 32906bf1e31..f2dd49e8af8 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py @@ -1,3 +1,17 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import cugraph import cudf From 4cfa293e59c50cd2c97148d5f4babcd15589d4c5 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 17:04:15 -0700 Subject: [PATCH 09/41] Moved common stuff to RemoteStorage --- .../gnn/dgl_extensions/base_cugraph_store.py | 78 +++++++++++++++++++ .../gnn/dgl_extensions/cugraph_store.py | 39 ++-------- 2 files changed, 83 insertions(+), 34 deletions(-) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py new file mode 100644 index 00000000000..49607453b6c --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import cached_property + + +class BaseCuGraphStore: + """ + BaseClass for DGL GraphStore and RemoteGraphStore + """ + + def __init__(self, graph): + self.__G = graph + + def get_edge_storage(self, key, etype=None, indices_offset=0): + raise NotImplementedError + + def get_node_storage(self, key, ntype=None, indices_offset=0): + raise NotImplementedError + + @property + def gdata(self): + return self.__G + + def num_nodes(self, ntype=None): + return self.gdata.get_num_vertices(ntype) + + def num_edges(self, etype=None): + return self.gdata.get_num_edges(etype) + + @cached_property + def has_multiple_etypes(self): + return len(self.etypes) > 1 + + @cached_property + def ntypes(self): + return sorted(self.gdata.vertex_types) + + @cached_property + def etypes(self): + return sorted(self.gdata.edge_types) + + ###################################### + # Sampling APIs + ###################################### + + def sample_neighbors( + self, nodes_cap, fanout=-1, edge_dir="in", prob=None, replace=False + ): + raise NotImplementedError + + ###################################### + # Utilities + ###################################### + @property + def extracted_subgraph(self): + raise NotImplementedError + + @cached_property + def num_nodes_dict(self): + """ + Return num_nodes_dict of the graph + """ + return {ntype: self.num_nodes(ntype) for ntype in self.ntypes} + + @cached_property + def num_edges_dict(self): + return {etype: self.num_edges(etype) for etype in self.etypes} diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 4f3bacbb05a..faed0c7086f 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -13,6 +13,8 @@ from collections import defaultdict +from .base_cugraph_store import BaseCuGraphStore + from functools import cached_property from .utils.find_edges import find_edges from .utils.node_subgraph import node_subgraph @@ -22,7 +24,7 @@ from .feature_storage import CuFeatureStorage -class CuGraphStore: +class CuGraphStore(BaseCuGraphStore): """ A wrapper around a cuGraph Property Graph that then adds functions to basically match the DGL GraphStorage API. 
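To make the intent of the new base class concrete, here is an illustrative-only sketch (the class name is hypothetical and not part of this change): a store variant built on BaseCuGraphStore inherits ntypes, etypes, num_nodes_dict, num_edges_dict and the num_nodes/num_edges helpers, and only has to supply the storage and sampling hooks, provided the wrapped graph exposes get_num_vertices/get_num_edges and vertex_types/edge_types.

from cugraph.gnn.dgl_extensions.base_cugraph_store import BaseCuGraphStore


class MinimalStore(BaseCuGraphStore):
    # Hypothetical subclass used only to illustrate the split of
    # responsibilities introduced by this refactor.
    def __init__(self, graph):
        super().__init__(graph)

    def get_node_storage(self, key, ntype=None, indices_offset=0):
        raise NotImplementedError("plug in a feature-storage implementation")

    def get_edge_storage(self, key, etype=None, indices_offset=0):
        raise NotImplementedError("plug in a feature-storage implementation")

This is the same split CuGraphStore adopts below and that the remote store added later in this series builds on.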
@@ -37,6 +39,8 @@ def __init__(self, graph, backend_lib="torch"): self.__G = graph else: raise ValueError("graph must be a PropertyGraph or MGPropertyGraph") + + BaseCuGraphStore.__init__(self, graph) # dict to map column names corresponding to edge features # of each type self.edata_feat_col_d = defaultdict(list) @@ -189,28 +193,6 @@ def get_edge_storage(self, key, etype=None, indices_offset=0): indices_offset=indices_offset, ) - def num_nodes(self, ntype=None): - return self.gdata.get_num_vertices(ntype) - - def num_edges(self, etype=None): - return self.gdata.get_num_edges(etype) - - @cached_property - def has_multiple_etypes(self): - return len(self.etypes) > 1 - - @cached_property - def ntypes(self): - return sorted(self.gdata.vertex_types) - - @cached_property - def etypes(self): - return sorted(self.gdata.edge_types) - - @property - def gdata(self): - return self.__G - ###################################### # Sampling APIs ###################################### @@ -333,17 +315,6 @@ def extracted_reverse_subgraphs_per_type(self): ) return sg_d, sg_src_range_d - @cached_property - def num_nodes_dict(self): - """ - Return num_nodes_dict of the graph - """ - return {ntype: self.num_nodes(ntype) for ntype in self.ntypes} - - @cached_property - def num_edges_dict(self): - return {etype: self.num_edges(etype) for etype in self.etypes} - def set_sg_node_dtype(self, sg): if hasattr(self, "_sg_node_dtype"): return self._sg_node_dtype From f1a3318f7f1d3c938c4ac90dc6f5b1f9fb0cd496 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 17:42:56 -0700 Subject: [PATCH 10/41] Address Reviews --- .../cugraph/gnn/dgl_extensions/base_cugraph_store.py | 4 ++++ .../cugraph/gnn/dgl_extensions/feature_storage.py | 12 +++++------- .../cugraph/gnn/dgl_extensions/utils/find_edges.py | 1 - 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py index 49607453b6c..90a26ca6f7c 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py @@ -32,6 +32,10 @@ def get_node_storage(self, key, ntype=None, indices_offset=0): def gdata(self): return self.__G + @property + def num_vertices(self): + return self.gdata.get_num_vertices() + def num_nodes(self, ntype=None): return self.gdata.get_num_vertices(ntype) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index 5682a832b2c..a518e9015cd 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -10,10 +10,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - -import cudf -import dask_cudf import cupy as cp @@ -35,7 +31,7 @@ def __init__( from cupy import from_dlpack else: raise NotImplementedError( - f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy')" + f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy') " f"backends are currently supported, got {backend_lib=}" ) if storage_type not in ["edge", "node"]: @@ -66,11 +62,13 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): # Default implementation uses synchronous fetch. 
indices = cp.asarray(indices) - if type(self.pg).__name__ in "MGPropertyGraph": + if type(self.pg).__name__ == "MGPropertyGraph": # dask_cudf loc breaks if we provide cudf series/cupy array # https://github.com/rapidsai/cudf/issues/11877 indices = indices.get() else: + import cudf + indices = cudf.Series(indices) indices = indices + self.indices_offset @@ -84,7 +82,7 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): subset_df = subset_df[self.columns] - if isinstance(subset_df, dask_cudf.DataFrame): + if hasattr(subset_df, "compute"): subset_df = subset_df.compute() if len(subset_df) == 0: diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py index 0e5b7f3f561..4f632e9ad56 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import cudf import dask_cudf from cugraph.experimental import PropertyGraph From 067c4e98630dd6d1442deb95e375d594d4ed943a Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 26 Oct 2022 22:09:58 -0500 Subject: [PATCH 11/41] Added ucx-py infra for support result_device param for call_extension(), updated tests, added test for upcoming ability to load extensions via import paths. --- .../cugraph_service_client/__init__.py | 20 +++ .../cugraph_service_client/client.py | 126 +++++++++++++++--- .../cugraph_service_thrift.py | 2 + .../cugraph_service_server/cugraph_handler.py | 96 +++++++++++-- .../tests/test_cugraph_handler.py | 61 +++++++++ python/cugraph_service/tests/test_mg_e2e.py | 2 +- 6 files changed, 272 insertions(+), 35 deletions(-) diff --git a/python/cugraph_service/cugraph_service_client/__init__.py b/python/cugraph_service/cugraph_service_client/__init__.py index c7479163894..dd916adc278 100644 --- a/python/cugraph_service/cugraph_service_client/__init__.py +++ b/python/cugraph_service/cugraph_service_client/__init__.py @@ -12,4 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +# constants used by both client and server +# (the server package depends on the client so server code can share client +# code/utilities/defaults/etc.) +supported_extension_return_dtypes = [ + "NoneType", + "int8", + "int16", + "int32", + "int64", + "float16", + "float32", + "float64", +] +# make a bi-directional mapping between type strings and ints. This is used for +# sending dtype meta-data between client and server. 
+extension_return_dtype_map = dict(enumerate(supported_extension_return_dtypes)) +extension_return_dtype_map.update( + dict(map(reversed, extension_return_dtype_map.items())) +) + from cugraph_service_client.client import CugraphServiceClient diff --git a/python/cugraph_service/cugraph_service_client/client.py b/python/cugraph_service/cugraph_service_client/client.py index b3effb97749..9eb0065c8e8 100644 --- a/python/cugraph_service/cugraph_service_client/client.py +++ b/python/cugraph_service/cugraph_service_client/client.py @@ -22,6 +22,7 @@ import cupy as cp from cugraph_service_client import defaults +from cugraph_service_client import extension_return_dtype_map from cugraph_service_client.types import ( ValueWrapper, GraphVertexEdgeID, @@ -368,11 +369,17 @@ def call_graph_creation_extension(self, func_name, *func_args, **func_kwargs): ) @__server_connection - def call_extension(self, func_name, *func_args, **func_kwargs): + def call_extension( + self, + func_name, + *func_args, + result_device=None, + **func_kwargs, + ): """ - Calls an extension on the server that was previously - loaded by a prior call to load_extensions(), then - returns the result returned by the extension. + Calls an extension on the server that was previously loaded by a prior + call to load_extensions(), then returns the result returned by the + extension. Parameters ---------- @@ -388,12 +395,18 @@ def call_extension(self, func_name, *func_args, **func_kwargs): and therefore only objects that can be restored server-side with eval() are supported. - **func_kwargs : string, int, list, dictionary - The keyword args to pass to func_name. Note that func_kwargs are - converted to their string representation using repr() on the - client, then restored to python objects on the server using eval(), - and therefore only objects that can be restored server-side with - eval() are supported. + **func_kwargs : string, int, list, dictionary The keyword args to pass + to func_name. func_kwargs are converted to their string + representation using repr() on the client, then restored to python + objects on the server using eval(), and therefore only objects that + can be restored server-side with eval() are supported. + + result_device is reserved for use in specifying an optional GPU + device ID to have the server transfer results to. + + result_device : int, default is None + If specified, must be the integer ID of a GPU device to have the + server transfer results to as one or more cupy ndarrays Returns ------- @@ -412,12 +425,26 @@ def call_extension(self, func_name, *func_args, **func_kwargs): """ func_args_repr = repr(func_args) func_kwargs_repr = repr(func_kwargs) - result = self.__client.call_extension( - func_name, func_args_repr, func_kwargs_repr - ) - # FIXME: ValueWrapper ctor and get_py_obj are recursive and could be slow, - # especially if Value is a list. Consider returning the Value obj as-is. 
- return ValueWrapper(result).get_py_obj() + if result_device is not None: + result_obj = asyncio.run( + self.__call_extension_to_device( + func_name, func_args_repr, func_kwargs_repr, result_device + ) + ) + # result_obj is a cupy array or tuple of cupy arrays on result_device + return result_obj + else: + result_obj = self.__client.call_extension( + func_name, + func_args_repr, + func_kwargs_repr, + client_host=None, + client_result_port=None, + ) + # Convert the structure returned from the RPC call to a python type + # FIXME: ValueWrapper ctor and get_py_obj are recursive and could be slow, + # especially if Value is a list. Consider returning the Value obj as-is. + return ValueWrapper(result_obj).get_py_obj() ########################################################################### # Graph management @@ -1026,15 +1053,15 @@ def uniform_neighbor_sample( Samples the graph and returns a UniformNeighborSampleResult instance. Parameters: - start_list: list[int] + start_list : list[int] - fanout_vals: list[int] + fanout_vals : list[int] - with_replacement: bool + with_replacement : bool - graph_id: int, default is defaults.graph_id + graph_id : int, default is defaults.graph_id - result_device: int, default is None + result_device : int, default is None Returns ------- @@ -1193,6 +1220,63 @@ async def receiver(endpoint): uns_thread.join() return result_obj + async def __call_extension_to_device( + self, func_name, func_args_repr, func_kwargs_repr, result_device + ): + """ + Run the server-side extension func_name with the args/kwargs and have the + result sent directly to the device specified by result_device. + """ + # FIXME: there's probably a better way to do this, eg. create a class containing + # both allocator and receiver that maintains results, devices, etc. that's + # callable from the listener + result = [] + + # FIXME: check for valid device + allocator = DeviceArrayAllocator(result_device) + + async def receiver(endpoint): + # Format of data sent is assumed to be: + # 1) a single array of length n describing the dtypes for the n arrays that + # follow + # 2) n arrays + with cp.cuda.Device(result_device): + # First get the array describing the data + # FIXME: meta_data doesn't need to be a cupy array + dtype_meta_data = await endpoint.recv_obj(allocator=allocator) + for dtype_enum in [int(i) for i in dtype_meta_data]: + # FIXME: safe to assume dtype_enum will always be valid? 
+ dtype = extension_return_dtype_map[dtype_enum] + a = await endpoint.recv_obj(allocator=allocator) + result.append(a.view(dtype)) + + await endpoint.close() + listener.close() + + listener = ucp.create_listener(receiver, self.results_port) + + ce_thread = threading.Thread( + target=self.__client.call_extension, + args=( + func_name, + func_args_repr, + func_kwargs_repr, + self.host, + self.results_port, + ), + ) + ce_thread.start() + + while not listener.closed(): + await asyncio.sleep(0.05) + + ce_thread.join() + + # special case, assume a list of len 1 should not be a list + if len(result) == 1: + result = result[0] + return result + @staticmethod def __get_vertex_edge_id_obj(id_or_ids): # FIXME: do not assume all values are int32 diff --git a/python/cugraph_service/cugraph_service_client/cugraph_service_thrift.py b/python/cugraph_service/cugraph_service_client/cugraph_service_thrift.py index 1268467b9cf..fdd85be328c 100644 --- a/python/cugraph_service/cugraph_service_client/cugraph_service_thrift.py +++ b/python/cugraph_service/cugraph_service_client/cugraph_service_thrift.py @@ -105,6 +105,8 @@ Value call_extension(1:string func_name, 2:string func_args_repr, 3:string func_kwargs_repr + 4:string result_host, + 5:i16 result_port ) throws (1:CugraphServiceError e), ############################################################################## diff --git a/python/cugraph_service/cugraph_service_server/cugraph_handler.py b/python/cugraph_service/cugraph_service_server/cugraph_handler.py index 6cfe126e2dc..7f57da37f72 100644 --- a/python/cugraph_service/cugraph_service_server/cugraph_handler.py +++ b/python/cugraph_service/cugraph_service_server/cugraph_handler.py @@ -36,6 +36,10 @@ ) from cugraph_service_client import defaults +from cugraph_service_client import ( + extension_return_dtype_map, + supported_extension_return_dtypes, +) from cugraph_service_client.exceptions import CugraphServiceError from cugraph_service_client.types import ( BatchedEgoGraphsResult, @@ -216,7 +220,10 @@ def load_extensions(self, extension_dir_path): """ extension_dir = Path(extension_dir_path) + # extension_dir_path is either a path on disk or an importable module path + # (eg. import foo.bar.module) if (not extension_dir.exists()) or (not extension_dir.is_dir()): + raise CugraphServiceError(f"bad directory: {extension_dir}") modules_loaded = [] @@ -259,7 +266,14 @@ def call_graph_creation_extension( # FIXME: ensure graph_obj is a graph obj return self.__add_graph(graph_obj) - def call_extension(self, func_name, func_args_repr, func_kwargs_repr): + def call_extension( + self, + func_name, + func_args_repr, + func_kwargs_repr, + result_host=None, + result_port=None, + ): """ Calls the extension function func_name and passes it the eval'd func_args_repr and func_kwargs_repr objects. If successful, returns a @@ -267,10 +281,57 @@ def call_extension(self, func_name, func_args_repr, func_kwargs_repr): func_name cannot be a private name (name starting with __). """ - result = self.__call_extension( - self.__extensions, func_name, func_args_repr, func_kwargs_repr - ) - return ValueWrapper(result) + try: + result = self.__call_extension( + self.__extensions, func_name, func_args_repr, func_kwargs_repr + ) + if self.__check_host_port_args(result_host, result_port): + # Ensure result is in list format for calling __ucx_send_results so it + # sends the contents as individual arrays. 
+ if isinstance(result, (list, tuple)): + result_list = result + else: + result_list = [result] + + # Form the meta-data array to send first. This array contains uint8 + # values which map to dtypes the client uses when converting bytes to + # values. + meta_data = [] + for r in result_list: + if hasattr(r, "dtype"): + dtype_str = str(r.dtype) + else: + dtype_str = type(r).__name__ + + dtype_enum_val = extension_return_dtype_map.get(dtype_str) + if dtype_enum_val is None: + raise TypeError( + f"extension {func_name} returned an invalid type " + f"{dtype_str}, only " + f"{supported_extension_return_dtypes} are supported" + ) + meta_data.append(dtype_enum_val) + # FIXME: meta_data should not need to be a cupy array + meta_data = cp.array(meta_data, dtype="uint8") + + asyncio.run( + self.__ucx_send_results( + result_host, + result_port, + meta_data, + *result_list, + ) + ) + # FIXME: Thrift still expects something of the expected type to + # be returned to be serialized and sent. Look into a separate + # API that uses the Thrift "oneway" modifier when returning + # results via client device. + return ValueWrapper(None) + else: + return ValueWrapper(result) + + except Exception: + raise CugraphServiceError(f"{traceback.format_exc()}") def initialize_dask_client(self, dask_scheduler_file=None): """ @@ -711,13 +772,7 @@ def uniform_neighbor_sample( fanout_vals=fanout_vals, with_replacement=with_replacement, ) - if (result_host is not None) or (result_port is not None): - if (result_host is None) or (result_port is None): - raise ValueError( - "both result_host and result_port must " - "be set if either is set. Got: " - f"{result_host=}, {result_port=}" - ) + if self.__check_host_port_args(result_host, result_port): asyncio.run( self.__ucx_send_results( result_host, @@ -824,6 +879,21 @@ def _get_graph(self, graph_id): ########################################################################### # Private + @staticmethod + def __check_host_port_args(result_host, result_port): + """ + Return True if host and port are set correctly, False if not set, and raise + ValueError if set incorrectly. + """ + if (result_host is not None) or (result_port is not None): + if (result_host is None) or (result_port is None): + raise ValueError( + "both result_host and result_port must be set if either is set. " + f"Got: {result_host=}, {result_port=}" + ) + return True + return False + async def __ucx_send_results(self, result_host, result_port, *results): # The cugraph_service_client should have set up a UCX listener waiting # for the result. Create an endpoint, send results, and close. @@ -833,6 +903,7 @@ async def __ucx_send_results(self, result_host, result_port, *results): await ep.close() def __get_dataframe_from_csv(self, csv_file_name, delimiter, dtypes, header, names): + """ Read a CSV into a DataFrame and return it. 
This will use either a cuDF DataFrame or a dask_cudf DataFrame based on if the handler is @@ -963,7 +1034,6 @@ def __call_extension( raise CugraphServiceError(f"Cannot call private function {func_name}") for module in extension_dict.values(): - # Ignore private functions func = getattr(module, func_name, None) if func is not None: func_args = eval(func_args_repr) diff --git a/python/cugraph_service/tests/test_cugraph_handler.py b/python/cugraph_service/tests/test_cugraph_handler.py index ebc26e5a796..5b0d63d51ce 100644 --- a/python/cugraph_service/tests/test_cugraph_handler.py +++ b/python/cugraph_service/tests/test_cugraph_handler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import pickle from pathlib import Path @@ -180,7 +181,67 @@ def test_extension_with_facade_graph_access( assert results.list_value[2].double_value == 2 + val1 + val2 +def test_load_and_unload_extensions_python_module_path(extension1): + """ + Load, run, unload an extension that was loaded using a python module path + (as would be used by an import statement) instead of a file path. + """ + from cugraph_service_client.exceptions import CugraphServiceError + from cugraph_service_server.cugraph_handler import CugraphHandler + + handler = CugraphHandler() + extension_dir = extension1 + extension_dir_path = Path(extension_dir) + + # Create an __init__py file and add the dir to sys.path so it can be + # imported as a package. + with open(extension_dir_path / "__init__.py", "w") as f: + f.write("") + # FIXME: this should go into a fixture which can unmodify sys.path when done + sys.path.append(extension_dir_path.parent) + + # Load everything in the package + # ext_mod_names is a list of python module paths (eg. "foo.bar.module") + # containing only 1 module + ext_mod_names1 = handler.load_extensions(extension_dir_path.stem) + assert len(ext_mod_names1) == 1 + + results = handler.call_extension( + "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" + ) + assert results.list_value[0].list_value[0].int32_value == 9 + assert results.list_value[1].list_value[0].double_value == 9.0 + + for mod_name in ext_mod_names1: + handler.unload_extension_module(mod_name) + + with pytest.raises(CugraphServiceError): + handler.call_extension( + "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" + ) + + # Load just an individual module in the package + # ext_mod_names should be the same as above + ext_mod_names2 = handler.load_extensions(extension_dir_path.stem) + assert ext_mod_names1 == ext_mod_names2 + + results = handler.call_extension( + "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" + ) + assert results.list_value[0].list_value[0].int32_value == 9 + assert results.list_value[1].list_value[0].double_value == 9.0 + + for mod_name in ext_mod_names2: + handler.unload_extension_module(mod_name) + + with pytest.raises(CugraphServiceError): + handler.call_extension( + "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" + ) + + def test_load_and_unload_graph_creation_extension_no_args(graph_creation_extension1): + """ Test graph_creation_extension1 which contains an extension with no args. 
""" diff --git a/python/cugraph_service/tests/test_mg_e2e.py b/python/cugraph_service/tests/test_mg_e2e.py index 7649787dfb7..182829adea1 100644 --- a/python/cugraph_service/tests/test_mg_e2e.py +++ b/python/cugraph_service/tests/test_mg_e2e.py @@ -363,7 +363,7 @@ def test_call_extension_result_on_device( else: # results will be a n-tuple where n is the number of arrays returned. The # n-tuple contains each array as a device array on result_device_id. - assert isinstance(results, tuple) + assert isinstance(results, list) assert len(results) == 2 device_n = cp.cuda.Device(result_device_id) From ac59971263669e93f1bfa305886c3707f5a4c764 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 26 Oct 2022 23:18:15 -0500 Subject: [PATCH 12/41] Added code to support loading extensions using a python module or package path (eg. import foo.bar.baz), updated test. --- .../cugraph_service_server/cugraph_handler.py | 24 ++++++++--- .../tests/test_cugraph_handler.py | 40 +++++++++++++------ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/python/cugraph_service/cugraph_service_server/cugraph_handler.py b/python/cugraph_service/cugraph_service_server/cugraph_handler.py index 7f57da37f72..a5684aa2b06 100644 --- a/python/cugraph_service/cugraph_service_server/cugraph_handler.py +++ b/python/cugraph_service/cugraph_service_server/cugraph_handler.py @@ -210,7 +210,7 @@ def load_graph_creation_extensions(self, extension_dir_path): return modules_loaded - def load_extensions(self, extension_dir_path): + def load_extensions(self, extension_dir_or_mod_path): """ Loads ("imports") all modules matching the pattern *_extension.py in the directory specified by extension_dir_path. @@ -218,16 +218,28 @@ def load_extensions(self, extension_dir_path): The modules are searched and their functions are called (if a match is found) when call_extension() is called. """ - extension_dir = Path(extension_dir_path) + modules_loaded = [] + extension_path = Path(extension_dir_or_mod_path) # extension_dir_path is either a path on disk or an importable module path # (eg. 
import foo.bar.module) - if (not extension_dir.exists()) or (not extension_dir.is_dir()): + if (not extension_path.exists()) or (not extension_path.is_dir()): + try: + mod = importlib.import_module(str(extension_path)) + except ModuleNotFoundError: + raise CugraphServiceError(f"bad path: {extension_dir_or_mod_path}") - raise CugraphServiceError(f"bad directory: {extension_dir}") + mod_file_path = Path(mod.__file__).absolute() - modules_loaded = [] - for ext_file in extension_dir.glob("*_extension.py"): + # If mod is a package, find all the .py files in it + if mod_file_path.name == "__init__.py": + extension_files = mod_file_path.parent.glob("*.py") + else: + extension_files = [mod_file_path] + else: + extension_files = extension_path.glob("*_extension.py") + + for ext_file in extension_files: module_file_path = ext_file.absolute().as_posix() spec = importlib.util.spec_from_file_location(module_file_path, ext_file) module = importlib.util.module_from_spec(spec) diff --git a/python/cugraph_service/tests/test_cugraph_handler.py b/python/cugraph_service/tests/test_cugraph_handler.py index 5b0d63d51ce..a5637c0c604 100644 --- a/python/cugraph_service/tests/test_cugraph_handler.py +++ b/python/cugraph_service/tests/test_cugraph_handler.py @@ -191,20 +191,29 @@ def test_load_and_unload_extensions_python_module_path(extension1): handler = CugraphHandler() extension_dir = extension1 - extension_dir_path = Path(extension_dir) + extension_dir_path = Path(extension_dir).absolute() + package_name = extension_dir_path.name # last name in the path only # Create an __init__py file and add the dir to sys.path so it can be # imported as a package. with open(extension_dir_path / "__init__.py", "w") as f: f.write("") # FIXME: this should go into a fixture which can unmodify sys.path when done - sys.path.append(extension_dir_path.parent) - - # Load everything in the package - # ext_mod_names is a list of python module paths (eg. "foo.bar.module") - # containing only 1 module - ext_mod_names1 = handler.load_extensions(extension_dir_path.stem) - assert len(ext_mod_names1) == 1 + sys.path.append(extension_dir_path.parent.as_posix()) + + # Create another .py file to test multiple module loading + with open(extension_dir_path / "foo.py", "w") as f: + f.write("def foo_func(): return 33") + + # Load everything in the package, ext_mod_names should be a list of python + # files containing 3 files (2 modules + __init__.py file). 
+ # Assume the .py file in the generated extension dir is named + # "my_extension.py" + ext_mod_names1 = handler.load_extensions(package_name) + assert len(ext_mod_names1) == 3 + assert str(extension_dir_path / "my_extension.py") in ext_mod_names1 + assert str(extension_dir_path / "foo.py") in ext_mod_names1 + assert str(extension_dir_path / "__init__.py") in ext_mod_names1 results = handler.call_extension( "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" @@ -212,6 +221,10 @@ def test_load_and_unload_extensions_python_module_path(extension1): assert results.list_value[0].list_value[0].int32_value == 9 assert results.list_value[1].list_value[0].double_value == 9.0 + result = handler.call_extension("foo_func", "()", "{}") + assert result.int32_value == 33 + + # unload for mod_name in ext_mod_names1: handler.unload_extension_module(mod_name) @@ -219,11 +232,14 @@ def test_load_and_unload_extensions_python_module_path(extension1): handler.call_extension( "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" ) + with pytest.raises(CugraphServiceError): + handler.call_extension("foo_func", "()", "{}") - # Load just an individual module in the package - # ext_mod_names should be the same as above - ext_mod_names2 = handler.load_extensions(extension_dir_path.stem) - assert ext_mod_names1 == ext_mod_names2 + # Load just an individual module in the package, ext_mod_names should only + # contain 1 file. + mod_name = f"{package_name}.my_extension" + ext_mod_names2 = handler.load_extensions(mod_name) + assert ext_mod_names2 == [str(extension_dir_path / "my_extension.py")] results = handler.call_extension( "my_nines_function", "(33, 'int32', 21, 'float64')", "{}" From b4b489e5197eb732adcb8571a9eda6f7faf6d3e4 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 26 Oct 2022 23:29:28 -0500 Subject: [PATCH 13/41] Updated docstring for load_extensions() --- .../cugraph_service_client/client.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/python/cugraph_service/cugraph_service_client/client.py b/python/cugraph_service/cugraph_service_client/client.py index 9eb0065c8e8..401f199e8ca 100644 --- a/python/cugraph_service/cugraph_service_client/client.py +++ b/python/cugraph_service/cugraph_service_client/client.py @@ -265,21 +265,27 @@ def load_graph_creation_extensions(self, extension_dir_path): return self.__client.load_graph_creation_extensions(extension_dir_path) @__server_connection - def load_extensions(self, extension_dir_path): + def load_extensions(self, extension_dir_or_mod_path): """ - Loads the extensions present in the directory specified by extension_dir_path. + Loads the extensions present in the directory (path on disk), or module or + package path (as used in an import statement) specified by + extension_dir_or_mod_path. Parameters ---------- - extension_dir_path : string + extension_dir_or_mod_path : string Path to the directory containing the extension files (.py source - files). This directory must be readable by the server. + files), or an importable module or package path (eg. my.package or + my.package.module). If a directory is specified it must be readable + by the server, and if a module or package path is specified it must + be importable by the server (ie. present in the sys.path of the + running server). Returns ------- extension_modnames : list - List of the module names loaded. These can be used in calls to - unload_extension_module() + List of the module names loaded as paths to files on disk. 
These can + be used in calls to unload_extension_module() Examples -------- @@ -287,9 +293,10 @@ def load_extensions(self, extension_dir_path): >>> client = CugraphServiceClient() >>> extension_modnames = client.load_graph_creation_extensions( ... "/some/server/side/directory") - >>> + >>> more_extension_modnames = client.load_graph_creation_extensions( + ... "my_project.extensions.etl") """ - return self.__client.load_extensions(extension_dir_path) + return self.__client.load_extensions(extension_dir_or_mod_path) @__server_connection def unload_extension_module(self, modname): From 943034f1be2d744c15ea1eac24ac070f82c98e5b Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 21:37:22 -0700 Subject: [PATCH 14/41] Added Remote Storage Support --- .../dgl_extensions/cugraph_service_store.py | 514 ++++++++++++++++++ .../service_extensions/add_data.py | 53 ++ .../gnn/dgl_extensions/utils/add_data.py | 20 + 3 files changed, 587 insertions(+) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py new file mode 100644 index 00000000000..8936a96d6fe --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -0,0 +1,514 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict + +from .base_cugraph_store import BaseCuGraphStore + +from functools import cached_property +from .utils.add_data import _update_feature_map, deserialize_strings_from_char_ars +from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg +from .utils.sampling import get_underlying_dtype_from_sg +from .feature_storage import CuFeatureStorage + + +class CuGraphRemoteStore(BaseCuGraphStore): + """ + A wrapper around a cuGraph Property Graph that + then adds functions to basically match the DGL GraphStorage API. + This is not a full duck-types match to a DGL GraphStore. + + This class return dlpack types and has additional functional arguments. + """ + + def __init__(self, graph, backend_lib="torch"): + + if type(graph).__name__ in ["RemoteGraph"]: + self.__G = graph + self.client = graph._client + else: + raise ValueError("graph must be a RemoteGraph") + + BaseCuGraphStore.__init__(self, graph) + # dict to map column names corresponding to edge features + # of each type + self.edata_feat_col_d = defaultdict(list) + # dict to map column names corresponding to node features + # of each type + self.ndata_feat_col_d = defaultdict(list) + self.backend_lib = backend_lib + + def add_node_data( + self, + df, + node_col_name, + ntype=None, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing node properties to the PropertyGraph. 
+ + Parameters + ---------- + dataframe : DataFrame-compatible instance + A DataFrame instance with a compatible Pandas-like DataFrame + interface. + node_col_name : string + The column name that contains the values to be used as vertex IDs. + ntype : string + The node type to be added. + For example, if dataframe contains data about users, ntype + might be "users". + If not specified, the type of properties will be added as + an empty string. + feat_name : {} or string + A map of feature names under which we should save the added + properties like {"feat_1":[f1, f2], "feat_2":[f3, f4]} + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + raise NotImplementedError( + "Adding Node Data From Local is not yet supported" + "Please Use `add_node_data_from_parquet`" + ) + + def add_edge_data( + self, + df, + node_col_names, + canonical_etype=None, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing edge properties to the PropertyGraph. + + Parameters + ---------- + dataframe : DataFrame-compatible instance + A DataFrame instance with a compatible Pandas-like DataFrame + interface. + node_col_names : string + The column names that contain the values to be used as the source + and destination vertex IDs for the edges. + canonical_etype : string + The edge type to be added. This should follow the string format + '(src_type),(edge_type),(dst_type)' + If not specified, the type of properties will be added as + an empty string. + feat_name : string or dict {} + The feature name under which we should save the added properties + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + raise NotImplementedError( + "Adding Node Data From local is not yet supported for Remote Storage" + "Please Use `add_edge_data_from_parquet`" + ) + + def add_node_data_from_parquet( + self, + file_path, + node_col_name, + ntype=None, + node_offset=0, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing node properties to the PropertyGraph. + + Parameters + ---------- + file_path: string + Path of the files on the server + node_col_name : string + The column name that contains the values to be used as vertex IDs. + ntype : string + The node type to be added. + For example, if dataframe contains data about users, ntype + might be "users". + If not specified, the type of properties will be added as + an empty string. 
+ feat_name : {} or string + A map of feature names under which we should save the added + properties like {"feat_1":[f1, f2], "feat_2":[f3, f4]} + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + # TODO: Use PR 2850: https://github.com/rapidsai/cugraph/pull/2850 + loaded_columns = self.client.call_extension( + func_name="add_node_data_from_parquet", + file_path=file_path, + node_col_name=node_col_name, + node_offset=node_offset, + type=ntype, + graph_id=self.gdata.id, + ) + loaded_columns = deserialize_strings_from_char_ars(loaded_columns) + + columns = [col for col in loaded_columns if col != node_col_name] + _update_feature_map( + self.ndata_feat_col_d, feat_name, contains_vector_features, columns + ) + # Clear properties if set as data has changed + self.__clear_cached_properties() + + def add_edge_data_from_parquet( + self, + file_path, + node_col_names, + src_offset=0, + dst_offset=0, + canonical_etype=None, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing edge properties to the PropertyGraph. + + Parameters + ---------- + file_path : string + Path of file on server + node_col_names : string + The column names that contain the values to be used as the source + and destination vertex IDs for the edges. + canonical_etype : string + The edge type to be added. This should follow the string format + '(src_type),(edge_type),(dst_type)' + If not specified, the type of properties will be added as + an empty string. + feat_name : string or dict {} + The feature name under which we should save the added properties + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + + # TODO: Use PR 2850: https://github.com/rapidsai/cugraph/pull/2850 + loaded_column_ars = self.client.call_extension( + func_name="add_edge_data_from_parquet", + file_path=file_path, + node_col_names=node_col_names, + canonical_etype=canonical_etype, + src_offset=src_offset, + dst_offset=dst_offset, + graph_id=self.gdata, + ) + loaded_columns = deserialize_strings_from_char_ars(loaded_column_ars) + columns = [col for col in loaded_columns if col not in node_col_names] + _update_feature_map( + self.edata_feat_col_d, feat_name, contains_vector_features, columns + ) + + def get_node_storage(self, key, ntype=None, indices_offset=0): + if ntype is None: + ntypes = self.ntypes + if len(self.ntypes) > 1: + raise ValueError( + ( + "Node type name must be specified if there " + "are more than one node types." + ) + ) + ntype = ntypes[0] + if key not in self.ndata_feat_col_d: + raise ValueError( + f"key {key} not found in CuGraphStore node features", + f" {list(self.ndata_feat_col_d.keys())}", + ) + + columns = self.ndata_feat_col_d[key] + return CuFeatureStorage( + pg=self.gdata, + columns=columns, + storage_type="node", + indices_offset=indices_offset, + backend_lib=self.backend_lib, + ) + + def get_edge_storage(self, key, etype=None, indices_offset=0): + if etype is None: + etypes = self.etypes + if len(self.etypes) > 1: + raise ValueError( + ( + "Edge type name must be specified if there " + "are more than one edge types." 
+ ) + ) + + etype = etypes[0] + if key not in self.edata_feat_col_d: + raise ValueError( + f"key {key} not found in CuGraphStore" " edge features", + f" {list(self.edata_feat_col_d.keys())}", + ) + columns = self.edata_feat_col_d[key] + + return CuFeatureStorage( + pg=self.gdata, + columns=columns, + storage_type="edge", + backend_lib=self.backend_lib, + indices_offset=indices_offset, + ) + + ###################################### + # Sampling APIs + ###################################### + + def sample_neighbors( + self, nodes_cap, fanout=-1, edge_dir="in", prob=None, replace=False + ): + """ + Sample neighboring edges of the given nodes and return the subgraph. + + Parameters + ---------- + nodes_cap : Dlpack or dict of Dlpack of Node IDs + to sample neighbors from. + fanout : int + The number of edges to be sampled for each node on each edge type. + If -1 is given all the neighboring edges for each node on + each edge type will be selected. + edge_dir : str {"in" or "out"} + Determines whether to sample inbound or outbound edges. + Can take either in for inbound edges or out for outbound edges. + prob : str + Feature name used as the (unnormalized) probabilities associated + with each neighboring edge of a node. Each feature must be a + scalar. The features must be non-negative floats, and the sum of + the features of inbound/outbound edges for every node must be + positive (though they don't have to sum up to one). Otherwise, + the result will be undefined. If not specified, sample uniformly. + replace : bool + If True, sample with replacement. + + Returns + ------- + DLPack capsule + The src nodes for the sampled bipartite graph. + DLPack capsule + The sampled dst nodes for the sampledbipartite graph. + DLPack capsule + The corresponding eids for the sampled bipartite graph + """ + + if edge_dir not in ["in", "out"]: + raise ValueError( + f"edge_dir must be either 'in' or 'out' got {edge_dir} instead" + ) + + if self.has_multiple_etypes: + # TODO: Convert into a single call when + # https://github.com/rapidsai/cugraph/issues/2696 lands + if edge_dir == "in": + sgs_obj, sgs_src_range_obj = self.extracted_reverse_subgraphs_per_type + else: + sgs_obj, sgs_src_range_obj = self.extracted_subgraphs_per_type + first_sg = list(sgs_obj.values())[0] + else: + if edge_dir == "in": + sgs_obj, sgs_src_range_obj = self.extracted_reverse_subgraph + else: + sgs_obj, sgs_src_range_obj = self.extracted_subgraph + + first_sg = sgs_obj + # Uniform sampling fails when the dtype + # of the seed dtype is not same as the node dtype + # TODO: Update this function + self.set_sg_node_dtype(first_sg) + + # Below will be called from remote storage + # Call via Remote Storage + # TODO: Update this function + sampled_result_arrays = sample_pg( + self.gdata, + has_multiple_etypes=self.has_multiple_etypes, + etypes=self.etypes, + sgs_obj=sgs_obj, + sgs_src_range_obj=sgs_src_range_obj, + sg_node_dtype=self._sg_node_dtype, + nodes_cap=nodes_cap, + replace=replace, + fanout=fanout, + edge_dir=edge_dir, + ) + return create_dlpack_results_from_arrays(sampled_result_arrays, self.etypes) + + ###################################### + # Utilities + ###################################### + @cached_property + def extracted_subgraph(self): + return get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=False, etype=None + ) + + @cached_property + def extracted_reverse_subgraph(self): + return get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=True, etype=None + ) + + @cached_property + def 
extracted_subgraphs_per_type(self): + sg_d = {} + sg_src_range_d = {} + for etype in self.etypes: + sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=False, etype=etype + ) + return sg_d, sg_src_range_d + + @cached_property + def extracted_reverse_subgraphs_per_type(self): + sg_d = {} + sg_src_range_d = {} + for etype in self.etypes: + sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=True, etype=etype + ) + return sg_d, sg_src_range_d + + def set_sg_node_dtype(self, sg): + if hasattr(self, "_sg_node_dtype"): + return self._sg_node_dtype + else: + self._sg_node_dtype = get_underlying_dtype_from_sg(sg) + return self._sg_node_dtype + + def find_edges(self, edge_ids_cap, etype): + """Return the source and destination node IDs given the edge IDs within + the given edge type. + + Parameters + ---------- + edge_ids_cap : Dlpack of Node IDs (single dimension) + The edge ids to find + + Returns + ------- + DLPack capsule + The src nodes for the given ids + + DLPack capsule + The dst nodes for the given ids + """ + raise NotImplementedError + + def node_subgraph( + self, + nodes=None, + create_using=None, + ): + """ + Return a subgraph induced on the given nodes. + + A node-induced subgraph is a graph with edges whose endpoints are both + in the specified node set. + + Parameters + ---------- + nodes : Tensor + The nodes to form the subgraph. + + Returns + ------- + cuGraph + The sampled subgraph with the same node ID space with the original + graph. + """ + raise NotImplementedError + + def __clear_cached_properties(self): + # Check for cached properties using self.__dict__ because calling + # hasattr() accesses the attribute and forces computation + if "has_multiple_etypes" in self.__dict__: + del self.has_multiple_etypes + + if "etypes" in self.__dict__: + del self.etypes + + if "ntypes" in self.__dict__: + del self.ntypes + + if "num_nodes_dict" in self.__dict__: + del self.num_nodes_dict + + if "num_edges_dict" in self.__dict__: + del self.num_edges_dict + + if "extracted_subgraph" in self.__dict__: + del self.extracted_subgraph + + if "extracted_reverse_subgraph" in self.__dict__: + del self.extracted_reverse_subgraph + + if "extracted_subgraphs_per_type" in self.__dict__: + del self.extracted_subgraphs_per_type + + if "extracted_reverse_subgraphs_per_type" in self.__dict__: + del self.extracted_reverse_subgraphs_per_type + + +def create_dlpack_results_from_arrays(sampled_result_arrays, etypes: list[str]): + # TODO: Extend to pytorch/numpy/etc + import cupy as cp + + if len(etypes) <= 1: + s, d, e_id = sampled_result_arrays + # Handle numpy array, cupy array, lists etc + s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) + return s.toDlpack(), d.toDlpack(), e_id.toDlpack() + else: + result_d = {} + array_start_offset = 0 + for etype in etypes: + s = sampled_result_arrays[array_start_offset] + d = sampled_result_arrays[array_start_offset + 1] + e_id = sampled_result_arrays[array_start_offset + 2] + s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) + array_start_offset = array_start_offset + 3 + if s is not None and len(s) >= 0: + s, d, e_id = s.toDlpack(), d.toDlpack(), e_id.toDlpack() + else: + s, d, e_id = None, None, None + result_d[etype] = (s, d, e_id) + return result_d diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py new file mode 100644 index 
00000000000..dd1a33a7115
--- /dev/null
+++ b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py
@@ -0,0 +1,53 @@
+import cudf
+import dask_cudf
+import cupy as cp
+from cugraph.experimental import MGPropertyGraph
+
+
+def add_node_data_from_parquet(
+    file_path, node_col_name, node_offset, ntype, gid, server
+):
+    pG = server.get_graph(gid)
+    if isinstance(pG, MGPropertyGraph):
+        df = dask_cudf.read_parquet(file_path)
+    else:
+        df = cudf.read_parquet(file_path)
+
+    df[node_col_name] = df[node_col_name] + node_offset
+    pG.add_vertex_data(df, vertex_col_name=node_col_name, type_name=ntype)
+
+    columns_list = list(df.columns)
+
+    return serialize_strings_to_array(columns_list)
+
+
+def add_edge_data_from_parquet(
+    file_path, node_col_names, canonical_etype, src_offset, dst_offset, gid, server
+):
+    pG = server.get_graph(gid)
+    if isinstance(pG, MGPropertyGraph):
+        df = dask_cudf.read_parquet(file_path)
+    else:
+        df = cudf.read_parquet(file_path)
+
+    df[node_col_names[0]] = df[node_col_names[0]] + src_offset
+    df[node_col_names[1]] = df[node_col_names[1]] + dst_offset
+    pG.add_edge_data(df, vertex_col_names=node_col_names, type_name=canonical_etype)
+
+    columns_list = list(df.columns)
+
+    return serialize_strings_to_array(columns_list)
+
+
+def convert_to_string_ar(string):
+    return cp.asarray([ord(c) for c in string], cp.int32), len(string)
+
+
+def serialize_strings_to_array(strings_list):
+    ar_ls = []
+    len_ls = []
+    for s in strings_list:
+        ar, s_len = convert_to_string_ar(s)
+        ar_ls.append(ar)
+        len_ls.append(s_len)
+    return cp.concatenate(ar_ls), cp.asarray(len_ls, dtype=cp.int32)
diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py
index 89614606dd3..c05f794f038 100644
--- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py
+++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py
@@ -60,3 +60,23 @@ def _update_feature_map(
             )
         for col in columns:
             pg_feature_map[col] = [col]
+
+
+def deserialize_strings_from_char_ars(char_ar, len_ar):
+    string_start = 0
+    string_list = []
+    for string_offset in len_ar:
+        string_end = string_start + string_offset
+        s = char_ar[string_start:string_end]
+
+        # Check for cupy array
+        if type(s).__module__ == "cupy":
+            s = s.get()
+
+        # Check for numpy
+        if type(s).__module__ == "numpy":
+            s = s.tolist()
+        s = "".join([chr(i) for i in s])
+        string_list.append(s)
+        string_start = string_end
+    return string_list

From 82871c386fa41934a33d3fb2cef343a901e6a897 Mon Sep 17 00:00:00 2001
From: Rick Ratzel
Date: Thu, 27 Oct 2022 01:12:57 -0500
Subject: [PATCH 15/41] Added better error handling for APIs when using device
 transfer options, added tests for error conditions, added benchmark for
 call_extension test.
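The error handling added in this patch relies on Python's threading.excepthook (available since Python 3.8) to capture an exception raised inside a worker thread and re-raise it in the caller. The following is a minimal standalone sketch of that pattern, not the cugraph_service client code itself; the function names (run_and_reraise, _worker) are illustrative only.

import threading


def run_and_reraise(target, *args):
    # Run `target` in a thread and re-raise any exception it raised.
    def excepthook(exc_info):
        # threading.excepthook receives (exc_type, exc_value, exc_traceback, thread);
        # stash the exception on the thread object so the caller can retrieve it.
        if exc_info.thread is not None:
            exc_info.thread.exception = exc_info.exc_type(exc_info.exc_value)

    orig_excepthook = threading.excepthook
    threading.excepthook = excepthook
    try:
        thread = threading.Thread(target=target, args=args)
        thread.start()
        thread.join()
    finally:
        # Restore the original hook; swapping it is process-global, as in the patch.
        threading.excepthook = orig_excepthook

    if hasattr(thread, "exception"):
        raise thread.exception


def _worker(n):
    if n < 0:
        raise ValueError("n must be non-negative")


run_and_reraise(_worker, 3)      # returns normally
# run_and_reraise(_worker, -1)   # would raise ValueError in the caller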
--- .../cugraph_service_client/client.py | 44 +++++++++++-- .../cugraph_service_server/cugraph_handler.py | 1 + python/cugraph_service/tests/test_mg_e2e.py | 64 +++++++++++++++++-- 3 files changed, 98 insertions(+), 11 deletions(-) diff --git a/python/cugraph_service/cugraph_service_client/client.py b/python/cugraph_service/cugraph_service_client/client.py index 401f199e8ca..f5fba965ac4 100644 --- a/python/cugraph_service/cugraph_service_client/client.py +++ b/python/cugraph_service/cugraph_service_client/client.py @@ -1208,7 +1208,16 @@ async def receiver(endpoint): listener = ucp.create_listener(receiver, self.results_port) - uns_thread = threading.Thread( + # Use an excepthook to store an exception on the thread object if one is + # raised in the thread. + def excepthook(exc): + if exc.thread is not None: + exc.thread.exception = exc.exc_type(exc.exc_value) + + orig_excepthook = threading.excepthook + threading.excepthook = excepthook + + thread = threading.Thread( target=self.__client.uniform_neighbor_sample, args=( start_list, @@ -1219,12 +1228,19 @@ async def receiver(endpoint): self.results_port, ), ) - uns_thread.start() + thread.start() + # Poll the listener and the state of the thread. Close the listener if + # the thread died and raise the stored exception. while not listener.closed(): await asyncio.sleep(0.05) + if not thread.is_alive(): + listener.close() + threading.excepthook = orig_excepthook + if hasattr(thread, "exception"): + raise thread.exception - uns_thread.join() + thread.join() return result_obj async def __call_extension_to_device( @@ -1262,7 +1278,16 @@ async def receiver(endpoint): listener = ucp.create_listener(receiver, self.results_port) - ce_thread = threading.Thread( + # Use an excepthook to store an exception on the thread object if one is + # raised in the thread. + def excepthook(exc): + if exc.thread is not None: + exc.thread.exception = exc.exc_type(exc.exc_value) + + orig_excepthook = threading.excepthook + threading.excepthook = excepthook + + thread = threading.Thread( target=self.__client.call_extension, args=( func_name, @@ -1272,12 +1297,19 @@ async def receiver(endpoint): self.results_port, ), ) - ce_thread.start() + thread.start() + # Poll the listener and the state of the thread. Close the listener if + # the thread died and raise the stored exception. 
while not listener.closed(): await asyncio.sleep(0.05) + if not thread.is_alive(): + listener.close() + threading.excepthook = orig_excepthook + if hasattr(thread, "exception"): + raise thread.exception - ce_thread.join() + thread.join() # special case, assume a list of len 1 should not be a list if len(result) == 1: diff --git a/python/cugraph_service/cugraph_service_server/cugraph_handler.py b/python/cugraph_service/cugraph_service_server/cugraph_handler.py index a5684aa2b06..a3f84f5afb9 100644 --- a/python/cugraph_service/cugraph_service_server/cugraph_handler.py +++ b/python/cugraph_service/cugraph_service_server/cugraph_handler.py @@ -1048,6 +1048,7 @@ def __call_extension( for module in extension_dict.values(): func = getattr(module, func_name, None) if func is not None: + # FIXME: look for a way to do this without using eval() func_args = eval(func_args_repr) func_kwargs = eval(func_kwargs_repr) func_sig = signature(func) diff --git a/python/cugraph_service/tests/test_mg_e2e.py b/python/cugraph_service/tests/test_mg_e2e.py index 182829adea1..902cf359204 100644 --- a/python/cugraph_service/tests/test_mg_e2e.py +++ b/python/cugraph_service/tests/test_mg_e2e.py @@ -292,6 +292,31 @@ def test_device_transfer( assert bytes_returned.device == device_n +def test_uniform_neighbor_sampling_result_on_device_error( + client_of_sg_server_on_device_1_large_property_graph_loaded, +): + """ + Ensure errors are handled properly when using device transfer + """ + from cugraph_service_client.exceptions import CugraphServiceError + + (client, graph_id) = client_of_sg_server_on_device_1_large_property_graph_loaded + extracted_graph_id = client.extract_subgraph(graph_id=graph_id) + + start_list = [0, 1, 2] + fanout_vals = [] # should raise an exception + with_replacement = False + + with pytest.raises(CugraphServiceError): + client.uniform_neighbor_sample( + start_list=start_list, + fanout_vals=fanout_vals, + with_replacement=with_replacement, + graph_id=extracted_graph_id, + result_device=0, + ) + + def test_uniform_neighbor_sampling_result_on_device( benchmark, result_device_id, @@ -330,21 +355,49 @@ def test_uniform_neighbor_sampling_result_on_device( assert result.sources.device == device_n +def test_call_extension_result_on_device_error( + extension1, client_of_sg_server_on_device_1 +): + """ + Ensure errors are handled properly when using device transfer + """ + from cugraph_service_client.exceptions import CugraphServiceError + + client = client_of_sg_server_on_device_1 + extension_dir = extension1 + array1_len = 1.23 # should raise an exception + array2_len = 10 + + ext_mod_names = client.load_extensions(extension_dir) + + with pytest.raises(CugraphServiceError): + client.call_extension( + "my_nines_function", + array1_len, + "int32", + array2_len, + "float64", + result_device=0, + ) + + for mod_name in ext_mod_names: + client.unload_extension_module(mod_name) + + def test_call_extension_result_on_device( benchmark, extension1, result_device_id, client_of_sg_server_on_device_1 ): client = client_of_sg_server_on_device_1 extension_dir = extension1 - array1_len = 33 - array2_len = 21 + array1_len = int(1e5) + array2_len = int(1e5) - # Loading ext_mod_names = client.load_extensions(extension_dir) - # Running # my_nines_function in extension1 returns a list of two lists of 9's with # sizes and dtypes based on args. 
- results = client.call_extension( + results = benchmark( + client.call_extension, "my_nines_function", array1_len, "int32", @@ -352,6 +405,7 @@ def test_call_extension_result_on_device( "float64", result_device=result_device_id, ) + if result_device_id is None: assert len(results) == 2 assert len(results[0]) == array1_len From b0ffa456b89d2dc14558322deacb8a5eb8fd6881 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 13:30:30 -0700 Subject: [PATCH 16/41] Remove remove_pg_dependency_from_cugraph_store.py --- .../gnn/dgl_extensions/cugraph_store.py | 149 ++++++------------ .../gnn/dgl_extensions/utils/sampling.py | 105 +++++++++++- 2 files changed, 148 insertions(+), 106 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 76e60e0d25c..8b1a1bdcc28 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -20,20 +20,14 @@ from functools import cached_property from .utils.add_data import _update_feature_map -from .utils.sampling import sample_multiple_sgs, sample_single_sg -from .utils.sampling import ( - get_subgraph_and_src_range_from_edgelist, - get_underlying_dtype_from_sg, -) -from .utils.sampling import create_dlpack_d +from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg +from .utils.sampling import get_underlying_dtype_from_sg from .feature_storage import CuFeatureStorage src_n = PropertyGraph.src_col_name dst_n = PropertyGraph.dst_col_name type_n = PropertyGraph.type_col_name -eid_n = PropertyGraph.edge_id_col_name -vid_n = PropertyGraph.vertex_col_name class CuGraphStore: @@ -101,7 +95,6 @@ def add_node_data( self.ndata_feat_col_d, feat_name, contains_vector_features, columns ) # Clear properties if set as data has changed - self.__clear_cached_properties() def add_edge_data( @@ -168,7 +161,6 @@ def get_node_storage(self, key, ntype=None, indices_offset=0): ) columns = self.ndata_feat_col_d[key] - return CuFeatureStorage( pg=self.gdata, columns=columns, @@ -214,15 +206,14 @@ def num_edges(self, etype=None): def has_multiple_etypes(self): return len(self.etypes) > 1 - @property + @cached_property def ntypes(self): return sorted(self.gdata.vertex_types) - @property + @cached_property def etypes(self): return sorted(self.gdata.edge_types) - @property def is_mg(self): return isinstance(self.gdata, MGPropertyGraph) @@ -276,76 +267,36 @@ def sample_neighbors( f"edge_dir must be either 'in' or 'out' got {edge_dir} instead" ) - if isinstance(nodes_cap, dict): - nodes = {t: cudf.from_dlpack(n) for t, n in nodes_cap.items()} - else: - nodes = cudf.from_dlpack(nodes_cap) - - if self.is_mg: - sample_f = cugraph.dask.uniform_neighbor_sample - else: - sample_f = cugraph.uniform_neighbor_sample - if self.has_multiple_etypes: # TODO: Convert into a single call when # https://github.com/rapidsai/cugraph/issues/2696 lands if edge_dir == "in": - sgs = self.extracted_reverse_subgraphs_per_type + sgs_obj, sgs_src_range_obj = self.extracted_reverse_subgraphs_per_type else: - sgs = self.extracted_subgraphs_per_type - # Uniform sampling fails when the dtype - # of the seed dtype is not same as the node dtype - - self.set_sg_node_dtype(list(sgs.values())[0][0]) - sampled_df = sample_multiple_sgs( - sgs, - sample_f, - nodes, - self._sg_node_dtype, - edge_dir, - fanout, - replace, - ) + sgs_obj, sgs_src_range_obj = self.extracted_subgraphs_per_type + first_sg = list(sgs_obj.values())[0] else: if 
edge_dir == "in": - sg, start_list_range = self.extracted_reverse_subgraph + sgs_obj, sgs_src_range_obj = self.extracted_reverse_subgraph else: - sg, start_list_range = self.extracted_subgraph - self.set_sg_node_dtype(sg) - sampled_df = sample_single_sg( - sg, - sample_f, - nodes, - self._sg_node_dtype, - start_list_range, - fanout, - replace, - ) - - # we reverse directions when directions=='in' - if edge_dir == "in": - sampled_df = sampled_df.rename( - columns={"destinations": src_n, "sources": dst_n} - ) - else: - sampled_df = sampled_df.rename( - columns={"sources": src_n, "destinations": dst_n} - ) - # Transfer data to client - if isinstance(sampled_df, dask_cudf.DataFrame): - sampled_df = sampled_df.compute() - - if self.has_multiple_etypes: - # Heterogeneous graph case - d = self._get_edgeid_type_d(sampled_df["indices"], self.etypes) - d = create_dlpack_d(d) - return d - else: - return ( - sampled_df[src_n].to_dlpack(), - sampled_df[dst_n].to_dlpack(), - sampled_df["indices"].to_dlpack(), - ) + sgs_obj, sgs_src_range_obj = self.extracted_subgraph + + first_sg = sgs_obj + # Uniform sampling fails when the dtype + # of the seed dtype is not same as the node dtype + self.set_sg_node_dtype(first_sg) + return sample_pg( + self.gdata, + has_multiple_etypes=self.has_multiple_etypes, + etypes=self.etypes, + sgs_obj=sgs_obj, + sgs_src_range_obj=sgs_src_range_obj, + sg_node_dtype=self._sg_node_dtype, + nodes_cap=nodes_cap, + replace=replace, + fanout=fanout, + edge_dir=edge_dir, + ) ###################################### # Utilities @@ -357,55 +308,37 @@ def num_vertices(self): def get_vertex_ids(self): return self.gdata.vertices_ids() - def _get_edgeid_type_d(self, edge_ids, etypes): - if isinstance(edge_ids, cudf.Series): - # Work around for below issue - # https://github.com/rapidsai/cudf/issues/11877 - edge_ids = edge_ids.values_host - df = self.gdata.get_edge_data(edge_ids=edge_ids, columns=[type_n]) - if isinstance(df, dask_cudf.DataFrame): - df = df.compute() - return {etype: df[df[type_n] == etype] for etype in etypes} - @cached_property def extracted_subgraph(self): - edge_list = self.gdata.get_edge_data(columns=[src_n, dst_n, type_n]) - edge_list = edge_list.reset_index(drop=True) - - return get_subgraph_and_src_range_from_edgelist( - edge_list, self.is_mg, reverse_edges=False + return get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=False, etype=None ) @cached_property def extracted_reverse_subgraph(self): - edge_list = self.gdata.get_edge_data(columns=[src_n, dst_n, type_n]) - return get_subgraph_and_src_range_from_edgelist( - edge_list, self.is_mg, reverse_edges=True + return get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=True, etype=None ) @cached_property def extracted_subgraphs_per_type(self): sg_d = {} + sg_src_range_d = {} for etype in self.etypes: - edge_list = self.gdata.get_edge_data( - columns=[src_n, dst_n, type_n], types=[etype] - ) - sg_d[etype] = get_subgraph_and_src_range_from_edgelist( - edge_list, self.is_mg, reverse_edges=False + sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=False, etype=etype ) - return sg_d + return sg_d, sg_src_range_d @cached_property def extracted_reverse_subgraphs_per_type(self): sg_d = {} + sg_src_range_d = {} for etype in self.etypes: - edge_list = self.gdata.get_edge_data( - columns=[src_n, dst_n, type_n], types=[etype] + sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=True, etype=etype ) - 
sg_d[etype] = get_subgraph_and_src_range_from_edgelist( - edge_list, self.is_mg, reverse_edges=True - ) - return sg_d + return sg_d, sg_src_range_d @cached_property def num_nodes_dict(self): @@ -489,6 +422,12 @@ def __clear_cached_properties(self): if "has_multiple_etypes" in self.__dict__: del self.has_multiple_etypes + if "etypes" in self.__dict__: + del self.etypes + + if "ntypes" in self.__dict__: + del self.ntypes + if "num_nodes_dict" in self.__dict__: del self.num_nodes_dict diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py index 0e12371271d..eaee2414bb3 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py @@ -18,6 +18,8 @@ import cupy as cp import dask_cudf from cugraph.experimental import PropertyGraph +from cugraph.experimental import MGPropertyGraph + src_n = PropertyGraph.src_col_name dst_n = PropertyGraph.dst_col_name @@ -26,6 +28,20 @@ vid_n = PropertyGraph.vertex_col_name +def get_subgraph_and_src_range_from_pg(pg, reverse_edges, etype=None): + if etype: + edge_list = pg.get_edge_data(columns=[src_n, dst_n, type_n], types=[etype]) + else: + edge_list = pg.get_edge_data(columns=[src_n, dst_n, type_n]) + + edge_list = edge_list.reset_index(drop=True) + + is_mg = isinstance(pg, MGPropertyGraph) + return get_subgraph_and_src_range_from_edgelist( + edge_list, is_mg, reverse_edges=reverse_edges + ) + + def get_subgraph_and_src_range_from_edgelist(edge_list, is_mg, reverse_edges=False): if reverse_edges: edge_list = edge_list.rename(columns={src_n: dst_n, dst_n: src_n}) @@ -63,6 +79,7 @@ def get_subgraph_and_src_range_from_edgelist(edge_list, is_mg, reverse_edges=Fal def sample_multiple_sgs( sgs, + sgs_src_range_obj, sample_f, start_list_d, start_list_dtype, @@ -72,7 +89,8 @@ def sample_multiple_sgs( ): start_list_types = list(start_list_d.keys()) output_dfs = [] - for can_etype, (sg, start_list_range) in sgs.items(): + for can_etype, sg in sgs.items(): + start_list_range = sgs_src_range_obj[can_etype] can_etype = _convert_can_etype_s_to_tup(can_etype) if _edge_types_contains_canonical_etype(can_etype, start_list_types, edge_dir): if edge_dir == "in": @@ -179,3 +197,88 @@ def get_underlying_dtype_from_sg(sg): raise ValueError(f"Source column {src_n} not found in the subgraph") return sg_node_dtype + + +def get_edgeid_type_d(pg, edge_ids, etypes): + if isinstance(edge_ids, cudf.Series): + # Work around for below issue + # https://github.com/rapidsai/cudf/issues/11877 + edge_ids = edge_ids.values_host + df = pg.get_edge_data(edge_ids=edge_ids, columns=[type_n]) + if isinstance(df, dask_cudf.DataFrame): + df = df.compute() + return {etype: df[df[type_n] == etype] for etype in etypes} + + +def sample_pg( + pg, + has_multiple_etypes, + etypes, + sgs_obj, + sgs_src_range_obj, + sg_node_dtype, + nodes_cap, + replace, + fanout, + edge_dir, +): + if isinstance(nodes_cap, dict): + nodes = {t: cudf.from_dlpack(n) for t, n in nodes_cap.items()} + else: + nodes = cudf.from_dlpack(nodes_cap) + + if isinstance(pg, MGPropertyGraph): + sample_f = cugraph.dask.uniform_neighbor_sample + else: + sample_f = cugraph.uniform_neighbor_sample + + if has_multiple_etypes: + # TODO: Convert into a single call when + # https://github.com/rapidsai/cugraph/issues/2696 lands + # Uniform sampling fails when the dtype + # of the seed dtype is not same as the node dtype + sampled_df = sample_multiple_sgs( + sgs=sgs_obj, + 
sgs_src_range_obj=sgs_src_range_obj, + start_list_dtype=sg_node_dtype, + sample_f=sample_f, + start_list_d=nodes, + edge_dir=edge_dir, + fanout=fanout, + with_replacement=replace, + ) + else: + sampled_df = sample_single_sg( + sg=sgs_obj, + start_list_range=sgs_src_range_obj, + start_list_dtype=sg_node_dtype, + sample_f=sample_f, + start_list=nodes, + fanout=fanout, + with_replacement=replace, + ) + + # we reverse directions when directions=='in' + if edge_dir == "in": + sampled_df = sampled_df.rename( + columns={"destinations": src_n, "sources": dst_n} + ) + else: + sampled_df = sampled_df.rename( + columns={"sources": src_n, "destinations": dst_n} + ) + # Transfer data to client + if isinstance(sampled_df, dask_cudf.DataFrame): + sampled_df = sampled_df.compute() + + if has_multiple_etypes: + # Heterogeneous graph case + d = get_edgeid_type_d(pg, sampled_df["indices"], etypes) + d = create_dlpack_d(d) + return d + else: + return ( + sampled_df[src_n].to_dlpack(), + sampled_df[dst_n].to_dlpack(), + sampled_df["indices"].to_dlpack(), + ) From 4df76f0ffdfb4fa1424de42cb154872b7d7418be Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 14:18:59 -0700 Subject: [PATCH 17/41] Remove remove_pg_dependency_from_cugraph_store.py --- .../gnn/dgl_extensions/cugraph_store.py | 38 ++++--------------- .../gnn/dgl_extensions/utils/find_edges.py | 15 ++++++++ .../gnn/dgl_extensions/utils/node_subgraph.py | 36 ++++++++++++++++++ 3 files changed, 58 insertions(+), 31 deletions(-) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 8b1a1bdcc28..708890c4ad8 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -13,23 +13,15 @@ from collections import defaultdict -import cudf -import dask_cudf -import cugraph -from cugraph.experimental import PropertyGraph, MGPropertyGraph from functools import cached_property - +from .utils.find_edges import find_edges +from .utils.node_subgraph import node_subgraph from .utils.add_data import _update_feature_map from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg from .utils.sampling import get_underlying_dtype_from_sg from .feature_storage import CuFeatureStorage -src_n = PropertyGraph.src_col_name -dst_n = PropertyGraph.dst_col_name -type_n = PropertyGraph.type_col_name - - class CuGraphStore: """ A wrapper around a cuGraph Property Graph that @@ -40,6 +32,8 @@ class CuGraphStore: """ def __init__(self, graph, backend_lib="torch"): + from cugraph.experimental import PropertyGraph, MGPropertyGraph + if isinstance(graph, (PropertyGraph, MGPropertyGraph)): self.__G = graph else: @@ -214,9 +208,6 @@ def ntypes(self): def etypes(self): return sorted(self.gdata.edge_types) - def is_mg(self): - return isinstance(self.gdata, MGPropertyGraph) - @property def gdata(self): return self.__G @@ -375,18 +366,12 @@ def find_edges(self, edge_ids_cap, etype): DLPack capsule The dst nodes for the given ids """ - edge_ids = cudf.from_dlpack(edge_ids_cap) - subset_df = self.gdata.get_edge_data( - edge_ids=edge_ids, columns=type_n, types=[etype] - ) - if isinstance(subset_df, dask_cudf.DataFrame): - subset_df = subset_df.compute() - return subset_df[src_n].to_dlpack(), subset_df[dst_n].to_dlpack() + return 
find_edges(self.gdata, edge_ids_cap, etype)
 
     def node_subgraph(
         self,
         nodes=None,
-        create_using=cugraph.MultiGraph,
+        create_using=None,
     ):
         """
         Return a subgraph induced on the given nodes.
 
         A node-induced subgraph is a graph with edges whose endpoints are both
         in the specified node set.
 
         Parameters
         ----------
         nodes : Tensor
             The nodes to form the subgraph.
 
         Returns
         -------
         cuGraph
             The sampled subgraph with the same node ID space with the original
             graph.
         """
-        _g = self.gdata.extract_subgraph(
-            create_using=create_using, check_multi_edges=True
-        )
-
-        if nodes is None:
-            return _g
-        else:
-            _n = cudf.Series(nodes)
-            _subg = cugraph.subgraph(_g, _n)
-            return _subg
+        return node_subgraph(self.gdata, nodes, create_using)
 
     def __clear_cached_properties(self):
         # Check for cached properties using self.__dict__ because calling
diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py
new file mode 100644
index 00000000000..e0279417a45
--- /dev/null
+++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py
@@ -0,0 +1,15 @@
+import cudf
+import dask_cudf
+from cugraph.experimental import PropertyGraph
+
+src_n = PropertyGraph.src_col_name
+dst_n = PropertyGraph.dst_col_name
+type_n = PropertyGraph.type_col_name
+
+
+def find_edges(pg, edge_ids_cap, etype):
+    edge_ids = cudf.from_dlpack(edge_ids_cap)
+    subset_df = pg.get_edge_data(edge_ids=edge_ids, columns=type_n, types=[etype])
+    if isinstance(subset_df, dask_cudf.DataFrame):
+        subset_df = subset_df.compute()
+    return subset_df[src_n].to_dlpack(), subset_df[dst_n].to_dlpack()
diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py
new file mode 100644
index 00000000000..32906bf1e31
--- /dev/null
+++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py
@@ -0,0 +1,36 @@
+import cugraph
+import cudf
+
+
+def node_subgraph(
+    pg,
+    nodes=None,
+    create_using=cugraph.MultiGraph,
+):
+    """
+    Return a subgraph induced on the given nodes.
+
+    A node-induced subgraph is a graph with edges whose endpoints are both
+    in the specified node set.
+
+    Parameters
+    ----------
+    pg: Property Graph
+        The graph to create subgraph from
+    nodes : Tensor
+        The nodes to form the subgraph.
+    Returns
+    -------
+    cuGraph
+        The sampled subgraph with the same node ID space with the original
+        graph.
+ """ + + _g = pg.extract_subgraph(create_using=create_using, check_multi_edges=True) + + if nodes is None: + return _g + else: + _n = cudf.Series(nodes) + _subg = cugraph.subgraph(_g, _n) + return _subg From a941a8a25f50f2ca03878a50cdf0b32619adffc7 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 15:30:00 -0700 Subject: [PATCH 18/41] Move code to helper functions to make them callable from extensions --- .../gnn/dgl_extensions/cugraph_store.py | 34 +++++++++++++++++-- .../gnn/dgl_extensions/feature_storage.py | 5 ++- .../gnn/dgl_extensions/utils/sampling.py | 28 +++++++-------- 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 708890c4ad8..f2b7e7d5672 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -32,9 +32,8 @@ class CuGraphStore: """ def __init__(self, graph, backend_lib="torch"): - from cugraph.experimental import PropertyGraph, MGPropertyGraph - if isinstance(graph, (PropertyGraph, MGPropertyGraph)): + if type(graph).__name__ in ["PropertyGraph", "MGPropertyGraph"]: self.__G = graph else: raise ValueError("graph must be a PropertyGraph or MGPropertyGraph") @@ -276,7 +275,9 @@ def sample_neighbors( # Uniform sampling fails when the dtype # of the seed dtype is not same as the node dtype self.set_sg_node_dtype(first_sg) - return sample_pg( + + # Below will be called from dict + sampled_result_arrays = sample_pg( self.gdata, has_multiple_etypes=self.has_multiple_etypes, etypes=self.etypes, @@ -288,6 +289,7 @@ def sample_neighbors( fanout=fanout, edge_dir=edge_dir, ) + return create_dlpack_results_from_arrays(sampled_result_arrays, self.etypes) ###################################### # Utilities @@ -421,3 +423,29 @@ def __clear_cached_properties(self): if "extracted_reverse_subgraphs_per_type" in self.__dict__: del self.extracted_reverse_subgraphs_per_type + + +def create_dlpack_results_from_arrays(sampled_result_arrays, etypes: list[str]): + # TODO: Extend to pytorch/numpy/etc + import cupy as cp + + if len(etypes) <= 1: + s, d, e_id = sampled_result_arrays + # Handle numpy array, cupy array, lists etc + s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) + return s.toDlpack(), d.toDlpack(), e_id.toDlpack() + else: + result_d = {} + array_start_offset = 0 + for etype in etypes: + s = sampled_result_arrays[array_start_offset] + d = sampled_result_arrays[array_start_offset + 1] + e_id = sampled_result_arrays[array_start_offset + 2] + s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) + array_start_offset = array_start_offset + 3 + if s is not None and len(s) >= 0: + s, d, e_id = s.toDlpack(), d.toDlpack(), e_id.toDlpack() + else: + s, d, e_id = None, None, None + result_d[etype] = (s, d, e_id) + return result_d diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index 244dfa8b621..207132748e4 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -15,7 +15,6 @@ import cudf import dask_cudf import cupy as cp -from cugraph.experimental import MGPropertyGraph class CuFeatureStorage: @@ -36,7 +35,7 @@ def __init__( from cupy import from_dlpack else: raise NotImplementedError( - f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy') " + f"Only PyTorch 
('torch'), TensorFlow ('tf'), and CuPy ('cupy') g" f"backends are currently supported, got {backend_lib=}" ) if storage_type not in ["edge", "node"]: @@ -67,7 +66,7 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): # Default implementation uses synchronous fetch. indices = cp.asarray(indices) - if isinstance(self.pg, MGPropertyGraph): + if type(self.pg).__name__ in "MGPropertyGraph": # dask_cudf loc breaks if we provide cudf series/cupy array # https://github.com/rapidsai/cudf/issues/11877 indices = indices.get() diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py index eaee2414bb3..460b44ee3b1 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py @@ -165,19 +165,18 @@ def _convert_can_etype_s_to_tup(canonical_etype_s): return (src_type, etype, dst_type) -def create_dlpack_d(d): - dlpack_d = {} +def create_cp_result_ls(d): + cupy_result_ls = [] for k, df in d.items(): if len(df) == 0: - dlpack_d[k] = (None, None, None) + cupy_result_ls.append(cp.empty(shape=0, dtype=cp.int32)) + cupy_result_ls.append(cp.empty(shape=0, dtype=cp.int32)) + cupy_result_ls.append(cp.empty(shape=0, dtype=cp.int32)) else: - dlpack_d[k] = ( - df[src_n].to_dlpack(), - df[dst_n].to_dlpack(), - df[eid_n].to_dlpack(), - ) - - return dlpack_d + cupy_result_ls.append(df[src_n].values) + cupy_result_ls.append(df[dst_n].values) + cupy_result_ls.append(df[eid_n].values) + return cupy_result_ls def get_underlying_dtype_from_sg(sg): @@ -274,11 +273,10 @@ def sample_pg( if has_multiple_etypes: # Heterogeneous graph case d = get_edgeid_type_d(pg, sampled_df["indices"], etypes) - d = create_dlpack_d(d) - return d + return create_cp_result_ls(d) else: return ( - sampled_df[src_n].to_dlpack(), - sampled_df[dst_n].to_dlpack(), - sampled_df["indices"].to_dlpack(), + sampled_df[src_n].values, + sampled_df[dst_n].values, + sampled_df["indices"].values, ) From 696fbc7aa41e071b3651d9ea325f74050f485ba2 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 15:44:46 -0700 Subject: [PATCH 19/41] Fix typos --- python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py | 2 +- python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index f2b7e7d5672..4f3bacbb05a 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -276,7 +276,7 @@ def sample_neighbors( # of the seed dtype is not same as the node dtype self.set_sg_node_dtype(first_sg) - # Below will be called from dict + # Below will be called from remote storage sampled_result_arrays = sample_pg( self.gdata, has_multiple_etypes=self.has_multiple_etypes, diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index 207132748e4..5682a832b2c 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -35,7 +35,7 @@ def __init__( from cupy import from_dlpack else: raise NotImplementedError( - f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy') g" + f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy')" f"backends are currently supported, got 
{backend_lib=}" ) if storage_type not in ["edge", "node"]: From 8f9cfab5d2ab53a1ddc4d9ed933763faf9cdd7f4 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 15:49:44 -0700 Subject: [PATCH 20/41] Added copyrights --- .../cugraph/gnn/dgl_extensions/utils/find_edges.py | 14 ++++++++++++++ .../gnn/dgl_extensions/utils/node_subgraph.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py index e0279417a45..0e5b7f3f561 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py @@ -1,3 +1,17 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import cudf import dask_cudf from cugraph.experimental import PropertyGraph diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py index 32906bf1e31..f2dd49e8af8 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/node_subgraph.py @@ -1,3 +1,17 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import cugraph import cudf From d73bef4cdf5a6b899d0654f1a0bc66dea1046bf5 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 17:04:15 -0700 Subject: [PATCH 21/41] Moved common stuff to RemoteStorage --- .../gnn/dgl_extensions/base_cugraph_store.py | 78 +++++++++++++++++++ .../gnn/dgl_extensions/cugraph_store.py | 39 ++-------- 2 files changed, 83 insertions(+), 34 deletions(-) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py new file mode 100644 index 00000000000..49607453b6c --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import cached_property + + +class BaseCuGraphStore: + """ + BaseClass for DGL GraphStore and RemoteGraphStore + """ + + def __init__(self, graph): + self.__G = graph + + def get_edge_storage(self, key, etype=None, indices_offset=0): + raise NotImplementedError + + def get_node_storage(self, key, ntype=None, indices_offset=0): + raise NotImplementedError + + @property + def gdata(self): + return self.__G + + def num_nodes(self, ntype=None): + return self.gdata.get_num_vertices(ntype) + + def num_edges(self, etype=None): + return self.gdata.get_num_edges(etype) + + @cached_property + def has_multiple_etypes(self): + return len(self.etypes) > 1 + + @cached_property + def ntypes(self): + return sorted(self.gdata.vertex_types) + + @cached_property + def etypes(self): + return sorted(self.gdata.edge_types) + + ###################################### + # Sampling APIs + ###################################### + + def sample_neighbors( + self, nodes_cap, fanout=-1, edge_dir="in", prob=None, replace=False + ): + raise NotImplementedError + + ###################################### + # Utilities + ###################################### + @property + def extracted_subgraph(self): + raise NotImplementedError + + @cached_property + def num_nodes_dict(self): + """ + Return num_nodes_dict of the graph + """ + return {ntype: self.num_nodes(ntype) for ntype in self.ntypes} + + @cached_property + def num_edges_dict(self): + return {etype: self.num_edges(etype) for etype in self.etypes} diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 4f3bacbb05a..faed0c7086f 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -13,6 +13,8 @@ from collections import defaultdict +from .base_cugraph_store import BaseCuGraphStore + from functools import cached_property from .utils.find_edges import find_edges from .utils.node_subgraph import node_subgraph @@ -22,7 +24,7 @@ from .feature_storage import CuFeatureStorage -class CuGraphStore: +class CuGraphStore(BaseCuGraphStore): """ A wrapper around a cuGraph Property Graph that then adds functions to basically match the DGL GraphStorage API. 
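For background on the caching used by BaseCuGraphStore and CuGraphStore: functools.cached_property stores its computed value in the instance __dict__, so a cached value can be invalidated simply by deleting that attribute, and testing membership in __dict__ (rather than calling hasattr()) avoids triggering the computation just to check for it. The sketch below is a minimal, self-contained illustration of that idiom; the class and attribute names (Store, etypes) are illustrative only and are not part of the codebase.

from functools import cached_property


class Store:
    def __init__(self, edge_types):
        self._edge_types = edge_types

    @cached_property
    def etypes(self):
        print("computing etypes")   # runs only on the first access
        return sorted(self._edge_types)

    def clear_cached_properties(self):
        # Use __dict__ membership rather than hasattr(): hasattr() would
        # evaluate the property and cache it just to check for it.
        if "etypes" in self.__dict__:
            del self.etypes          # next access recomputes the value


s = Store({"follows", "likes"})
s.etypes                     # prints "computing etypes"
s.etypes                     # served from the cache, no print
s.clear_cached_properties()
s.etypes                     # recomputed after invalidation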
@@ -37,6 +39,8 @@ def __init__(self, graph, backend_lib="torch"): self.__G = graph else: raise ValueError("graph must be a PropertyGraph or MGPropertyGraph") + + BaseCuGraphStore.__init__(self, graph) # dict to map column names corresponding to edge features # of each type self.edata_feat_col_d = defaultdict(list) @@ -189,28 +193,6 @@ def get_edge_storage(self, key, etype=None, indices_offset=0): indices_offset=indices_offset, ) - def num_nodes(self, ntype=None): - return self.gdata.get_num_vertices(ntype) - - def num_edges(self, etype=None): - return self.gdata.get_num_edges(etype) - - @cached_property - def has_multiple_etypes(self): - return len(self.etypes) > 1 - - @cached_property - def ntypes(self): - return sorted(self.gdata.vertex_types) - - @cached_property - def etypes(self): - return sorted(self.gdata.edge_types) - - @property - def gdata(self): - return self.__G - ###################################### # Sampling APIs ###################################### @@ -333,17 +315,6 @@ def extracted_reverse_subgraphs_per_type(self): ) return sg_d, sg_src_range_d - @cached_property - def num_nodes_dict(self): - """ - Return num_nodes_dict of the graph - """ - return {ntype: self.num_nodes(ntype) for ntype in self.ntypes} - - @cached_property - def num_edges_dict(self): - return {etype: self.num_edges(etype) for etype in self.etypes} - def set_sg_node_dtype(self, sg): if hasattr(self, "_sg_node_dtype"): return self._sg_node_dtype From e899c89e96237e2c04d2dbded511e64014d52482 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 17:42:56 -0700 Subject: [PATCH 22/41] Address Reviews --- .../cugraph/gnn/dgl_extensions/base_cugraph_store.py | 4 ++++ .../cugraph/gnn/dgl_extensions/feature_storage.py | 12 +++++------- .../cugraph/gnn/dgl_extensions/utils/find_edges.py | 1 - 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py index 49607453b6c..90a26ca6f7c 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/base_cugraph_store.py @@ -32,6 +32,10 @@ def get_node_storage(self, key, ntype=None, indices_offset=0): def gdata(self): return self.__G + @property + def num_vertices(self): + return self.gdata.get_num_vertices() + def num_nodes(self, ntype=None): return self.gdata.get_num_vertices(ntype) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index 5682a832b2c..a518e9015cd 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -10,10 +10,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - - -import cudf -import dask_cudf import cupy as cp @@ -35,7 +31,7 @@ def __init__( from cupy import from_dlpack else: raise NotImplementedError( - f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy')" + f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy') " f"backends are currently supported, got {backend_lib=}" ) if storage_type not in ["edge", "node"]: @@ -66,11 +62,13 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): # Default implementation uses synchronous fetch. 
indices = cp.asarray(indices) - if type(self.pg).__name__ in "MGPropertyGraph": + if type(self.pg).__name__ == "MGPropertyGraph": # dask_cudf loc breaks if we provide cudf series/cupy array # https://github.com/rapidsai/cudf/issues/11877 indices = indices.get() else: + import cudf + indices = cudf.Series(indices) indices = indices + self.indices_offset @@ -84,7 +82,7 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): subset_df = subset_df[self.columns] - if isinstance(subset_df, dask_cudf.DataFrame): + if hasattr(subset_df, "compute"): subset_df = subset_df.compute() if len(subset_df) == 0: diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py index 0e5b7f3f561..4f632e9ad56 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/find_edges.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import cudf import dask_cudf from cugraph.experimental import PropertyGraph From 206e0beb626cd5315300b4791fd84b69c82905e2 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Wed, 26 Oct 2022 21:37:22 -0700 Subject: [PATCH 23/41] Added Remote Storage Support --- .../dgl_extensions/cugraph_service_store.py | 514 ++++++++++++++++++ .../service_extensions/add_data.py | 53 ++ .../gnn/dgl_extensions/utils/add_data.py | 20 + 3 files changed, 587 insertions(+) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py new file mode 100644 index 00000000000..8936a96d6fe --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -0,0 +1,514 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict + +from .base_cugraph_store import BaseCuGraphStore + +from functools import cached_property +from .utils.add_data import _update_feature_map, deserialize_strings_from_char_ars +from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg +from .utils.sampling import get_underlying_dtype_from_sg +from .feature_storage import CuFeatureStorage + + +class CuGraphRemoteStore(BaseCuGraphStore): + """ + A wrapper around a cuGraph Property Graph that + then adds functions to basically match the DGL GraphStorage API. + This is not a full duck-types match to a DGL GraphStore. + + This class return dlpack types and has additional functional arguments. 
+ """ + + def __init__(self, graph, backend_lib="torch"): + + if type(graph).__name__ in ["RemoteGraph"]: + self.__G = graph + self.client = graph._client + else: + raise ValueError("graph must be a RemoteGraph") + + BaseCuGraphStore.__init__(self, graph) + # dict to map column names corresponding to edge features + # of each type + self.edata_feat_col_d = defaultdict(list) + # dict to map column names corresponding to node features + # of each type + self.ndata_feat_col_d = defaultdict(list) + self.backend_lib = backend_lib + + def add_node_data( + self, + df, + node_col_name, + ntype=None, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing node properties to the PropertyGraph. + + Parameters + ---------- + dataframe : DataFrame-compatible instance + A DataFrame instance with a compatible Pandas-like DataFrame + interface. + node_col_name : string + The column name that contains the values to be used as vertex IDs. + ntype : string + The node type to be added. + For example, if dataframe contains data about users, ntype + might be "users". + If not specified, the type of properties will be added as + an empty string. + feat_name : {} or string + A map of feature names under which we should save the added + properties like {"feat_1":[f1, f2], "feat_2":[f3, f4]} + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + raise NotImplementedError( + "Adding Node Data From Local is not yet supported" + "Please Use `add_node_data_from_parquet`" + ) + + def add_edge_data( + self, + df, + node_col_names, + canonical_etype=None, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing edge properties to the PropertyGraph. + + Parameters + ---------- + dataframe : DataFrame-compatible instance + A DataFrame instance with a compatible Pandas-like DataFrame + interface. + node_col_names : string + The column names that contain the values to be used as the source + and destination vertex IDs for the edges. + canonical_etype : string + The edge type to be added. This should follow the string format + '(src_type),(edge_type),(dst_type)' + If not specified, the type of properties will be added as + an empty string. + feat_name : string or dict {} + The feature name under which we should save the added properties + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + raise NotImplementedError( + "Adding Node Data From local is not yet supported for Remote Storage" + "Please Use `add_edge_data_from_parquet`" + ) + + def add_node_data_from_parquet( + self, + file_path, + node_col_name, + ntype=None, + node_offset=0, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing node properties to the PropertyGraph. + + Parameters + ---------- + file_path: string + Path of the files on the server + node_col_name : string + The column name that contains the values to be used as vertex IDs. + ntype : string + The node type to be added. + For example, if dataframe contains data about users, ntype + might be "users". 
+ If not specified, the type of properties will be added as + an empty string. + feat_name : {} or string + A map of feature names under which we should save the added + properties like {"feat_1":[f1, f2], "feat_2":[f3, f4]} + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + # TODO: Use PR 2850: https://github.com/rapidsai/cugraph/pull/2850 + loaded_columns = self.client.call_extension( + func_name="add_node_data_from_parquet", + file_path=file_path, + node_col_name=node_col_name, + node_offset=node_offset, + type=ntype, + graph_id=self.gdata.id, + ) + loaded_columns = deserialize_strings_from_char_ars(loaded_columns) + + columns = [col for col in loaded_columns if col != node_col_name] + _update_feature_map( + self.ndata_feat_col_d, feat_name, contains_vector_features, columns + ) + # Clear properties if set as data has changed + self.__clear_cached_properties() + + def add_edge_data_from_parquet( + self, + file_path, + node_col_names, + src_offset=0, + dst_offset=0, + canonical_etype=None, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing edge properties to the PropertyGraph. + + Parameters + ---------- + file_path : string + Path of file on server + node_col_names : string + The column names that contain the values to be used as the source + and destination vertex IDs for the edges. + canonical_etype : string + The edge type to be added. This should follow the string format + '(src_type),(edge_type),(dst_type)' + If not specified, the type of properties will be added as + an empty string. + feat_name : string or dict {} + The feature name under which we should save the added properties + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + + # TODO: Use PR 2850: https://github.com/rapidsai/cugraph/pull/2850 + loaded_column_ars = self.client.call_extension( + func_name="add_edge_data_from_parquet", + file_path=file_path, + node_col_names=node_col_names, + canonical_etype=canonical_etype, + src_offset=src_offset, + dst_offset=dst_offset, + graph_id=self.gdata, + ) + loaded_columns = deserialize_strings_from_char_ars(loaded_column_ars) + columns = [col for col in loaded_columns if col not in node_col_names] + _update_feature_map( + self.edata_feat_col_d, feat_name, contains_vector_features, columns + ) + + def get_node_storage(self, key, ntype=None, indices_offset=0): + if ntype is None: + ntypes = self.ntypes + if len(self.ntypes) > 1: + raise ValueError( + ( + "Node type name must be specified if there " + "are more than one node types." 
+ ) + ) + ntype = ntypes[0] + if key not in self.ndata_feat_col_d: + raise ValueError( + f"key {key} not found in CuGraphStore node features", + f" {list(self.ndata_feat_col_d.keys())}", + ) + + columns = self.ndata_feat_col_d[key] + return CuFeatureStorage( + pg=self.gdata, + columns=columns, + storage_type="node", + indices_offset=indices_offset, + backend_lib=self.backend_lib, + ) + + def get_edge_storage(self, key, etype=None, indices_offset=0): + if etype is None: + etypes = self.etypes + if len(self.etypes) > 1: + raise ValueError( + ( + "Edge type name must be specified if there " + "are more than one edge types." + ) + ) + + etype = etypes[0] + if key not in self.edata_feat_col_d: + raise ValueError( + f"key {key} not found in CuGraphStore" " edge features", + f" {list(self.edata_feat_col_d.keys())}", + ) + columns = self.edata_feat_col_d[key] + + return CuFeatureStorage( + pg=self.gdata, + columns=columns, + storage_type="edge", + backend_lib=self.backend_lib, + indices_offset=indices_offset, + ) + + ###################################### + # Sampling APIs + ###################################### + + def sample_neighbors( + self, nodes_cap, fanout=-1, edge_dir="in", prob=None, replace=False + ): + """ + Sample neighboring edges of the given nodes and return the subgraph. + + Parameters + ---------- + nodes_cap : Dlpack or dict of Dlpack of Node IDs + to sample neighbors from. + fanout : int + The number of edges to be sampled for each node on each edge type. + If -1 is given all the neighboring edges for each node on + each edge type will be selected. + edge_dir : str {"in" or "out"} + Determines whether to sample inbound or outbound edges. + Can take either in for inbound edges or out for outbound edges. + prob : str + Feature name used as the (unnormalized) probabilities associated + with each neighboring edge of a node. Each feature must be a + scalar. The features must be non-negative floats, and the sum of + the features of inbound/outbound edges for every node must be + positive (though they don't have to sum up to one). Otherwise, + the result will be undefined. If not specified, sample uniformly. + replace : bool + If True, sample with replacement. + + Returns + ------- + DLPack capsule + The src nodes for the sampled bipartite graph. + DLPack capsule + The sampled dst nodes for the sampledbipartite graph. 
+ DLPack capsule + The corresponding eids for the sampled bipartite graph + """ + + if edge_dir not in ["in", "out"]: + raise ValueError( + f"edge_dir must be either 'in' or 'out' got {edge_dir} instead" + ) + + if self.has_multiple_etypes: + # TODO: Convert into a single call when + # https://github.com/rapidsai/cugraph/issues/2696 lands + if edge_dir == "in": + sgs_obj, sgs_src_range_obj = self.extracted_reverse_subgraphs_per_type + else: + sgs_obj, sgs_src_range_obj = self.extracted_subgraphs_per_type + first_sg = list(sgs_obj.values())[0] + else: + if edge_dir == "in": + sgs_obj, sgs_src_range_obj = self.extracted_reverse_subgraph + else: + sgs_obj, sgs_src_range_obj = self.extracted_subgraph + + first_sg = sgs_obj + # Uniform sampling fails when the dtype + # of the seed dtype is not same as the node dtype + # TODO: Update this function + self.set_sg_node_dtype(first_sg) + + # Below will be called from remote storage + # Call via Remote Storage + # TODO: Update this function + sampled_result_arrays = sample_pg( + self.gdata, + has_multiple_etypes=self.has_multiple_etypes, + etypes=self.etypes, + sgs_obj=sgs_obj, + sgs_src_range_obj=sgs_src_range_obj, + sg_node_dtype=self._sg_node_dtype, + nodes_cap=nodes_cap, + replace=replace, + fanout=fanout, + edge_dir=edge_dir, + ) + return create_dlpack_results_from_arrays(sampled_result_arrays, self.etypes) + + ###################################### + # Utilities + ###################################### + @cached_property + def extracted_subgraph(self): + return get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=False, etype=None + ) + + @cached_property + def extracted_reverse_subgraph(self): + return get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=True, etype=None + ) + + @cached_property + def extracted_subgraphs_per_type(self): + sg_d = {} + sg_src_range_d = {} + for etype in self.etypes: + sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=False, etype=etype + ) + return sg_d, sg_src_range_d + + @cached_property + def extracted_reverse_subgraphs_per_type(self): + sg_d = {} + sg_src_range_d = {} + for etype in self.etypes: + sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( + self.gdata, reverse_edges=True, etype=etype + ) + return sg_d, sg_src_range_d + + def set_sg_node_dtype(self, sg): + if hasattr(self, "_sg_node_dtype"): + return self._sg_node_dtype + else: + self._sg_node_dtype = get_underlying_dtype_from_sg(sg) + return self._sg_node_dtype + + def find_edges(self, edge_ids_cap, etype): + """Return the source and destination node IDs given the edge IDs within + the given edge type. + + Parameters + ---------- + edge_ids_cap : Dlpack of Node IDs (single dimension) + The edge ids to find + + Returns + ------- + DLPack capsule + The src nodes for the given ids + + DLPack capsule + The dst nodes for the given ids + """ + raise NotImplementedError + + def node_subgraph( + self, + nodes=None, + create_using=None, + ): + """ + Return a subgraph induced on the given nodes. + + A node-induced subgraph is a graph with edges whose endpoints are both + in the specified node set. + + Parameters + ---------- + nodes : Tensor + The nodes to form the subgraph. + + Returns + ------- + cuGraph + The sampled subgraph with the same node ID space with the original + graph. 
+ """ + raise NotImplementedError + + def __clear_cached_properties(self): + # Check for cached properties using self.__dict__ because calling + # hasattr() accesses the attribute and forces computation + if "has_multiple_etypes" in self.__dict__: + del self.has_multiple_etypes + + if "etypes" in self.__dict__: + del self.etypes + + if "ntypes" in self.__dict__: + del self.ntypes + + if "num_nodes_dict" in self.__dict__: + del self.num_nodes_dict + + if "num_edges_dict" in self.__dict__: + del self.num_edges_dict + + if "extracted_subgraph" in self.__dict__: + del self.extracted_subgraph + + if "extracted_reverse_subgraph" in self.__dict__: + del self.extracted_reverse_subgraph + + if "extracted_subgraphs_per_type" in self.__dict__: + del self.extracted_subgraphs_per_type + + if "extracted_reverse_subgraphs_per_type" in self.__dict__: + del self.extracted_reverse_subgraphs_per_type + + +def create_dlpack_results_from_arrays(sampled_result_arrays, etypes: list[str]): + # TODO: Extend to pytorch/numpy/etc + import cupy as cp + + if len(etypes) <= 1: + s, d, e_id = sampled_result_arrays + # Handle numpy array, cupy array, lists etc + s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) + return s.toDlpack(), d.toDlpack(), e_id.toDlpack() + else: + result_d = {} + array_start_offset = 0 + for etype in etypes: + s = sampled_result_arrays[array_start_offset] + d = sampled_result_arrays[array_start_offset + 1] + e_id = sampled_result_arrays[array_start_offset + 2] + s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) + array_start_offset = array_start_offset + 3 + if s is not None and len(s) >= 0: + s, d, e_id = s.toDlpack(), d.toDlpack(), e_id.toDlpack() + else: + s, d, e_id = None, None, None + result_d[etype] = (s, d, e_id) + return result_d diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py new file mode 100644 index 00000000000..dd1a33a7115 --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py @@ -0,0 +1,53 @@ +import cudf +import dask_cudf +import cupy as cp +from cugraph.experimental import MGPropertyGraph + + +def add_node_data_from_parquet( + file_path, node_col_name, node_offset, ntype, gid, server +): + pG = server.get_graph(gid) + if isinstance(pG, MGPropertyGraph): + df = dask_cudf.read_parquet(file_path) + else: + df = cudf.read_parquet(file_path) + + df[node_col_name] = df[node_col_name] + node_offset + pG.add_vertex_data(df, vertex_col_name=node_col_name, type_name=ntype) + + columns_list = list(df.columns) + + return serialize_strings_to_array(columns_list) + + +def add_edge_data_from_parquet( + file_path, node_col_names, canonical_etype, src_offset, dst_offset, gid, server +): + pG = server.get_graph(gid) + if isinstance(pG, MGPropertyGraph): + df = dask_cudf.read_parquet(file_path) + else: + df = cudf.read_parquet(file_path) + + df[node_col_names[0]] = df[node_col_names] + src_offset + df[node_col_names[1]] = df[node_col_names] + dst_offset + pG.add_edge_data(df, vertex_col_names=node_col_names, type_name=canonical_etype) + + columns_list = list(df.columns) + + return serialize_strings_to_array(columns_list) + + +def convert_to_string_ar(string): + return cp.asarray([ord(c) for c in string], cp.int32), len(string) + + +def serialize_strings_to_array(strings_list): + ar_ls = [] + len_ls = [] + for s in strings_list: + ar, s_len = convert_to_string_ar(s) + ar_ls.append(ar) + len_ls.append(s_len) + return 
cp.concatenate(ar_ls), cp.asarray(len_ls, dtype=cp.int32) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py index 89614606dd3..c05f794f038 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py @@ -60,3 +60,23 @@ def _update_feature_map( ) for col in columns: pg_feature_map[col] = [col] + + +def deserialize_strings_from_char_ars(char_ar, len_ar): + string_start = 0 + string_list = [] + for string_offset in len_ar: + string_end = string_start + string_offset + s = char_ar[string_start:string_end] + + # Check of cupy array + if type(s).__module__ == "cupy": + s = s.get() + + # Check for numpy + if type(s).__module__ == "numpy": + s = s.tolist() + s = "".join([chr(i) for i in s]) + string_list.append(s) + string_start = string_end + return string_list From 0afbb9bdb15d974141d9af2c7686ed5861a6b19f Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 27 Oct 2022 10:11:21 -0700 Subject: [PATCH 24/41] Add cugraph_service_store --- .../cugraph/gnn/dgl_extensions/cugraph_service_store.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 8936a96d6fe..5ed3bc1aef5 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -31,11 +31,11 @@ class CuGraphRemoteStore(BaseCuGraphStore): This class return dlpack types and has additional functional arguments. """ - def __init__(self, graph, backend_lib="torch"): + def __init__(self, graph, graph_client, backend_lib="torch"): if type(graph).__name__ in ["RemoteGraph"]: self.__G = graph - self.client = graph._client + self.client = graph_client else: raise ValueError("graph must be a RemoteGraph") @@ -172,7 +172,7 @@ def add_node_data_from_parquet( node_col_name=node_col_name, node_offset=node_offset, type=ntype, - graph_id=self.gdata.id, + graph_id=self.gdata._id, ) loaded_columns = deserialize_strings_from_char_ars(loaded_columns) @@ -228,7 +228,7 @@ def add_edge_data_from_parquet( canonical_etype=canonical_etype, src_offset=src_offset, dst_offset=dst_offset, - graph_id=self.gdata, + graph_id=self.gdata._id, ) loaded_columns = deserialize_strings_from_char_ars(loaded_column_ars) columns = [col for col in loaded_columns if col not in node_col_names] From 84266565b302cdd3b0338cdd6fcb6962731c6724 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 27 Oct 2022 15:01:51 -0700 Subject: [PATCH 25/41] Add RemotePropertyGraph and RemoteMGPropertyGraph --- .../cugraph/gnn/dgl_extensions/cugraph_service_store.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 071e1b6be58..454a6be989a 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -16,7 +16,8 @@ from .base_cugraph_store import BaseCuGraphStore from functools import cached_property -from .utils.add_data import _update_feature_map, deserialize_strings_from_char_ars +from .utils.add_data import _update_feature_map +from .utils.add_data import deserialize_strings_from_char_ars from .utils.sampling import sample_pg, 
get_subgraph_and_src_range_from_pg from .utils.sampling import get_underlying_dtype_from_sg from .feature_storage import CuFeatureStorage @@ -32,7 +33,7 @@ class CuGraphRemoteStore(BaseCuGraphStore): """ def __init__(self, graph, graph_client, backend_lib="torch"): - if type(graph).__name__ in ["RemoteGraph"]: + if type(graph).__name__ in ["RemotePropertyGraph", "RemoteMGPropertyGraph"]: self.__G = graph self.client = graph_client else: From 5e902e86abfc77041f58a98cd4b135e8698945e4 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 27 Oct 2022 17:31:26 -0700 Subject: [PATCH 26/41] Added working feature storage and add parquet data --- .../cugraph/gnn/dgl_extensions/__init__.py | 12 ++ .../dgl_extensions/cugraph_service_store.py | 20 ++- .../gnn/dgl_extensions/feature_storage.py | 120 ++++++++++++++---- .../service_extensions/__init__.py | 12 ++ .../service_extensions/add_data.py | 12 +- .../test_dgl_extension_remote_wrappers.py | 108 ++++++++++++++++ .../cugraph_service_client/remote_graph.py | 7 +- 7 files changed, 250 insertions(+), 41 deletions(-) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/__init__.py create mode 100644 python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/__init__.py b/python/cugraph/cugraph/gnn/dgl_extensions/__init__.py index e69de29bb2d..b04c7e4b5f5 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/__init__.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
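A note on the column-name transport used by the parquet extensions above: the loaded column names are encoded server-side as a flat int32 character-code array plus a per-string length array (serialize_strings_to_array) and decoded back into strings on the client side (deserialize_strings_from_char_ars). The following is a minimal CPU-only sketch of that round trip, using numpy stand-ins rather than the shipped cupy-based helpers; it is illustrative only and not part of the patch.

# Illustrative sketch only; the real helpers live in
# service_extensions/add_data.py and utils/add_data.py and operate on cupy arrays.
import numpy as np

def serialize_strings(strings):
    # one flat array of character codes plus a per-string length array
    chars = np.concatenate(
        [np.asarray([ord(c) for c in s], dtype=np.int32) for s in strings]
    )
    lengths = np.asarray([len(s) for s in strings], dtype=np.int32)
    return chars, lengths

def deserialize_strings(char_ar, len_ar):
    out, start = [], 0
    for n in len_ar:
        end = start + int(n)
        out.append("".join(chr(int(i)) for i in char_ar[start:end]))
        start = end
    return out

cols = ["node_id", "node_feat"]
assert deserialize_strings(*serialize_strings(cols)) == cols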
diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 454a6be989a..0383a9e53c1 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -36,6 +36,11 @@ def __init__(self, graph, graph_client, backend_lib="torch"): if type(graph).__name__ in ["RemotePropertyGraph", "RemoteMGPropertyGraph"]: self.__G = graph self.client = graph_client + + add_data_module = "cugraph.gnn.dgl_extensions.service_extensions.add_data" + _ = self.client.load_extensions(add_data_module) + del _ + else: raise ValueError("graph must be a RemoteGraph") @@ -166,15 +171,15 @@ def add_node_data_from_parquet( None """ # TODO: Use PR 2850: https://github.com/rapidsai/cugraph/pull/2850 - loaded_columns = self.client.call_extension( + c_ar, len_ar = self.client.call_extension( func_name="add_node_data_from_parquet", file_path=file_path, node_col_name=node_col_name, node_offset=node_offset, - type=ntype, - graph_id=self.gdata._id, + ntype=ntype, + graph_id=self.gdata._graph_id, ) - loaded_columns = deserialize_strings_from_char_ars(loaded_columns) + loaded_columns = deserialize_strings_from_char_ars(c_ar, len_ar) columns = [col for col in loaded_columns if col != node_col_name] _update_feature_map( @@ -221,20 +226,21 @@ def add_edge_data_from_parquet( """ # TODO: Use PR 2850: https://github.com/rapidsai/cugraph/pull/2850 - loaded_column_ars = self.client.call_extension( + c_ar, len_ar = self.client.call_extension( func_name="add_edge_data_from_parquet", file_path=file_path, node_col_names=node_col_names, canonical_etype=canonical_etype, src_offset=src_offset, dst_offset=dst_offset, - graph_id=self.gdata._id, + graph_id=self.gdata._graph_id, ) - loaded_columns = deserialize_strings_from_char_ars(loaded_column_ars) + loaded_columns = deserialize_strings_from_char_ars(c_ar, len_ar) columns = [col for col in loaded_columns if col not in node_col_names] _update_feature_map( self.edata_feat_col_d, feat_name, contains_vector_features, columns ) + self.__clear_cached_properties() def get_node_storage(self, key, ntype=None, indices_offset=0): if ntype is None: diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index a518e9015cd..b2deda15c87 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -10,7 +10,51 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import cupy as cp + +from importlib import import_module +import numpy as np + + +def _get_backend_lib_ar(ar): + return type(ar).__module__ + + +def _convert_ar_to_numpy(ar): + if isinstance(ar, list): + ar = np.asarray(ar) + else: + lib_name = _get_backend_lib_ar(ar) + if lib_name == "torch": + ar = ar.cpu().numpy() + elif lib_name == "cupy": + ar = ar.get() + elif lib_name == "cudf": + ar = ar.values.get() + elif lib_name == "numpy": + ar = ar + else: + raise NotImplementedError( + f"{lib_name=} not supported yet for conversion to numpy" + ) + return ar + + +def _convert_ar_list_to_dlpack(ar_ls): + lib_name = _get_backend_lib_ar(ar_ls[0]) + lib = import_module(lib_name) + ar_ls = [lib.atleast_2d(ar) for ar in ar_ls] + stacked_ar = lib.hstack(ar_ls) + if lib_name == "torch": + cap = lib.utils.dlpack.to_dlpack(stacked_ar) + elif lib_name == "cupy": + cap = stacked_ar.toDlpack() + elif lib_name == "numpy": + # handle numpy case + cap = stacked_ar + else: + raise NotImplementedError(f"{lib_name=} is not yet supported") + + return cap class CuFeatureStorage: @@ -23,16 +67,20 @@ def __init__( ): self.pg = pg self.columns = columns + if backend_lib == "torch": from torch.utils.dlpack import from_dlpack elif backend_lib == "tf": from tensorflow.experimental.dlpack import from_dlpack elif backend_lib == "cupy": from cupy import from_dlpack + elif backend_lib == "numpy": + pass else: raise NotImplementedError( - f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy') " - f"backends are currently supported, got {backend_lib=}" + f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy')" + f"and numpy ('numpy') backends are currently supported, " + f" got {backend_lib=}" ) if storage_type not in ["edge", "node"]: raise NotImplementedError("Only edge and node storage is supported") @@ -60,38 +108,56 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): Feature data stored in PyTorch Tensor. """ # Default implementation uses synchronous fetch. 
- - indices = cp.asarray(indices) - if type(self.pg).__name__ == "MGPropertyGraph": - # dask_cudf loc breaks if we provide cudf series/cupy array - # https://github.com/rapidsai/cudf/issues/11877 - indices = indices.get() + # Handle remote case + if type(self.pg).__name__ in ["RemotePropertyGraph", "RemoteMGPropertyGraph"]: + indices = _convert_ar_to_numpy(indices) + indices = indices + self.indices_offset + # TODO: Raise Issue + # We dont support numpy arrays in get_vertex_data, get_edge_data + # for Remote Graphs + indices = indices.tolist() else: - import cudf + # For local case + # we rely on cupy to handle various inputs cleanly like GPU Tensor, + # cupy array, cudf Series, cpu tensor etc + import cupy as cp + + indices = cp.asarray(indices) + if type(self.pg).__name__ == "MGPropertyGraph": + # dask_cudf loc breaks if we provide cudf series/cupy array + # https://github.com/rapidsai/cudf/issues/11877 + indices = indices.get() + else: + import cudf - indices = cudf.Series(indices) + indices = cudf.Series(indices) - indices = indices + self.indices_offset + indices = indices + self.indices_offset if self.storage_type == "node": - subset_df = self.pg.get_vertex_data( - vertex_ids=indices, columns=self.columns - ) + result = self.pg.get_vertex_data(vertex_ids=indices, columns=self.columns) else: - subset_df = self.pg.get_edge_data(edge_ids=indices, columns=self.columns) - - subset_df = subset_df[self.columns] + result = self.pg.get_edge_data(edge_ids=indices, columns=self.columns) - if hasattr(subset_df, "compute"): - subset_df = subset_df.compute() + if type(result).__name__ == "DataFrame": + result = result[self.columns] + if hasattr(result, "compute"): + result = result.compute() + if len(result) == 0: + raise ValueError(f"{indices=} not found in FeatureStorage") + cap = result.to_dlpack() + else: + # When backend is not dataframe(pandas, cuDF) we return lists + result = result[-len(self.columns) :] + cap = _convert_ar_to_numpy(result) - if len(subset_df) == 0: - raise ValueError(f"indices = {indices} not found in FeatureStorage") - cap = subset_df.to_dlpack() - tensor = self.from_dlpack(cap) - del cap + if type(cap).__name__ == "PyCapsule": + tensor = self.from_dlpack(cap) + del cap + else: + tensor = cap if device: - if not isinstance(tensor, cp.ndarray): - # Cant transfer to different device for cupy + if type(tensor).__module__ == "torch": + # Can only transfer to different device for pytorch tensor = tensor.to(device) return tensor diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/__init__.py b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/__init__.py new file mode 100644 index 00000000000..b04c7e4b5f5 --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
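The reworked feature_storage.py above infers an array's backend from type(ar).__module__ instead of importing torch, cupy, and cudf unconditionally, and only converts once the owning library is known. Below is a stripped-down sketch of that dispatch, exercising just the list and numpy branches; the shipped _convert_ar_to_numpy also covers torch, cupy, and cudf objects, and this stand-in is not the shipped helper.

# Illustrative sketch only, not the shipped helper.
import numpy as np

def to_numpy(ar):
    if isinstance(ar, list):
        return np.asarray(ar)
    lib = type(ar).__module__
    if lib == "numpy":
        return ar
    if lib == "torch":      # torch.Tensor -> host numpy copy
        return ar.cpu().numpy()
    if lib == "cupy":       # cupy.ndarray -> host copy
        return ar.get()
    raise NotImplementedError(f"no numpy conversion registered for backend {lib!r}")

print(to_numpy([0, 1, 2]))      # plain Python list
print(to_numpy(np.arange(3)))   # already numpy, returned unchanged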
diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py index dd1a33a7115..d2e08922ef6 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py @@ -5,9 +5,9 @@ def add_node_data_from_parquet( - file_path, node_col_name, node_offset, ntype, gid, server + file_path, node_col_name, node_offset, ntype, graph_id, server ): - pG = server.get_graph(gid) + pG = server.get_graph(graph_id) if isinstance(pG, MGPropertyGraph): df = dask_cudf.read_parquet(file_path) else: @@ -22,16 +22,16 @@ def add_node_data_from_parquet( def add_edge_data_from_parquet( - file_path, node_col_names, canonical_etype, src_offset, dst_offset, gid, server + file_path, node_col_names, canonical_etype, src_offset, dst_offset, graph_id, server ): - pG = server.get_graph(gid) + pG = server.get_graph(graph_id) if isinstance(pG, MGPropertyGraph): df = dask_cudf.read_parquet(file_path) else: df = cudf.read_parquet(file_path) - df[node_col_names[0]] = df[node_col_names] + src_offset - df[node_col_names[1]] = df[node_col_names] + dst_offset + df[node_col_names[0]] = df[node_col_names[0]] + src_offset + df[node_col_names[1]] = df[node_col_names[1]] + dst_offset pG.add_edge_data(df, vertex_col_names=node_col_names, type_name=canonical_etype) columns_list = list(df.columns) diff --git a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py new file mode 100644 index 00000000000..37bb7846848 --- /dev/null +++ b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py @@ -0,0 +1,108 @@ +from cugraph.gnn.dgl_extensions.cugraph_service_store import CuGraphRemoteStore +from cugraph_service_client.client import CugraphServiceClient as Client +import numpy as np + + +def create_gs(client): + gs = CuGraphRemoteStore(client.graph(), client) + gs.add_node_data_from_parquet( + file_path="nt.a.parquet", node_col_name="node_id", ntype="nt.a", node_offset=0 + ) + gs.add_node_data_from_parquet( + file_path="nt.b.parquet", + node_col_name="node_id", + ntype="nt.b", + node_offset=gs.num_nodes(), + ) + gs.add_node_data_from_parquet( + file_path="nt.c.parquet", + node_col_name="node_id", + ntype="nt.c", + node_offset=gs.num_nodes(), + ) + + can_etype = "('nt.a', 'connects', 'nt.b')" + gs.add_edge_data_from_parquet( + file_path=f"{can_etype}.parquet", + node_col_names=["src", "dst"], + src_offset=0, + dst_offset=3, + canonical_etype=can_etype, + ) + can_etype = "('nt.a', 'connects', 'nt.c')" + gs.add_edge_data_from_parquet( + file_path=f"{can_etype}.parquet", + node_col_names=["src", "dst"], + src_offset=0, + dst_offset=6, + canonical_etype=can_etype, + ) + can_etype = "('nt.c', 'connects', 'nt.c')" + gs.add_edge_data_from_parquet( + file_path=f"{can_etype}.parquet", + node_col_names=["src", "dst"], + src_offset=6, + dst_offset=6, + canonical_etype=can_etype, + ) + + return gs + + +def assert_correct_gs(gs): + assert gs.etypes[0] == "('nt.a', 'connects', 'nt.b')" + assert gs.ntypes[0] == "nt.a" + assert gs.num_nodes_dict["nt.a"] == 3 + assert gs.num_edges_dict["('nt.a', 'connects', 'nt.b')"] == 3 + assert gs.num_nodes("nt.c") == 5 + + # Test Get Node Storage + result = gs.get_node_storage(key="node_feat", ntype="nt.a", indices_offset=0).fetch( + [0, 1, 2] + ) + result = result.cpu().numpy() + expected_result = np.asarray([0, 10, 20], dtype=np.int32) + 
np.testing.assert_equal(result, expected_result) + + result = gs.get_node_storage(key="node_feat", ntype="nt.b", indices_offset=3).fetch( + [0, 1, 2] + ) + result = result.cpu().numpy() + expected_result = np.asarray([30, 40, 50], dtype=np.int32) + np.testing.assert_equal(result, expected_result) + + result = gs.get_node_storage(key="node_feat", ntype="nt.c", indices_offset=5).fetch( + [1, 2, 3] + ) + result = result.cpu().numpy() + expected_result = np.asarray([60, 70, 80], dtype=np.int32) + np.testing.assert_equal(result, expected_result) + + # Test Get Edge Storage + result = gs.get_edge_storage( + key="edge_feat", etype="('nt.a', 'connects', 'nt.b')", indices_offset=0 + ).fetch([0, 1, 2]) + result = result.cpu().numpy() + expected_result = np.asarray([10, 11, 12], dtype=np.int32) + np.testing.assert_equal(result, expected_result) + + result = gs.get_edge_storage( + key="edge_feat", etype="('nt.a', 'connects', 'nt.c')", indices_offset=0 + ).fetch([4, 5]) + result = result.cpu().numpy() + expected_result = np.asarray([14, 15], dtype=np.int32) + np.testing.assert_equal(result, expected_result) + + result = gs.get_edge_storage( + key="edge_feat", etype="('nt.c', 'connects', 'nt.c')", indices_offset=0 + ).fetch([6, 8]) + result = result.cpu().numpy() + expected_result = np.asarray([16, 18], dtype=np.int32) + np.testing.assert_equal(result, expected_result) + + +def test_remote_wrappers(): + # TODO: Check with rick on how to test it + c = Client() + gs = create_gs(c) + assert_correct_gs(gs) diff --git a/python/cugraph_service/cugraph_service_client/remote_graph.py b/python/cugraph_service/cugraph_service_client/remote_graph.py index 30133533a90..126c0671a1c 100644 --- a/python/cugraph_service/cugraph_service_client/remote_graph.py +++ b/python/cugraph_service/cugraph_service_client/remote_graph.py @@ -479,8 +479,13 @@ def get_edge_data( if columns is None: columns = self.edge_property_names + if edge_ids is None: + ids = -1 + else: + ids = edge_ids + edge_data = self.__client.get_graph_edge_data( - id_or_ids=edge_ids or -1, + id_or_ids=ids, property_keys=columns, types=types, graph_id=self.__graph_id, From 00289a61df984cbddd127caae0576887888db075 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 27 Oct 2022 17:37:54 -0700 Subject: [PATCH 27/41] Add nvidia copyright --- .../tests/test_dgl_extension_remote_wrappers.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py index 37bb7846848..f771dd2aad4 100644 --- a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py +++ b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py @@ -1,3 +1,16 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from cugraph.gnn.dgl_extensions.cugraph_service_store import CuGraphRemoteStore from cugraph_service_client.client import CugraphServiceClient as Client import numpy as np From 5354648724b404179e7bb04903c0fb2583b6310f Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Thu, 27 Oct 2022 17:40:29 -0700 Subject: [PATCH 28/41] Added copyright information --- .../dgl_extensions/service_extensions/add_data.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py index d2e08922ef6..13c8cc55403 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py @@ -1,3 +1,16 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import cudf import dask_cudf import cupy as cp From 5cdd07bcb101c74115f6324abbbd7acca740c686 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Thu, 27 Oct 2022 22:02:06 -0500 Subject: [PATCH 29/41] Added add_graph() to server facade so extensions can also create and register graphs, added support for returning int8 so graph IDs can be returned easier. 
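One consequence of this commit worth illustrating: an extension can now hand back a graph ID as a numpy/cupy int8 scalar (cupy types are needed for UCX-Py result transfers), and ValueWrapper widens it into the existing 32-bit integer slot. The sketch below mimics only that widening step; the dict stands in for the Thrift Value union, so its keys and branching are illustrative assumptions, not the real class.

# Illustrative sketch only; numpy is used here, cupy scalars behave the same way.
import numpy as np

def wrap_scalar(val):
    if isinstance(val, bool):                 # bools before ints (bool is an int subclass)
        return {"bool_value": val}
    if isinstance(val, (np.int8, np.int32)):  # int8 widens losslessly to 32 bits
        return {"int32_value": int(val)}
    if isinstance(val, (int, np.int64)):
        return {"int64_value": int(val)}
    raise TypeError(f"unsupported scalar type: {type(val)}")

print(wrap_scalar(np.int8(3)))   # e.g. a graph ID returned by an extension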
--- .../cugraph_service_client/types.py | 8 ++-- .../cugraph_service_server/cugraph_handler.py | 29 ++++++------ python/cugraph_service/tests/conftest.py | 40 +++++++++++++++++ .../tests/test_cugraph_handler.py | 18 +++++--- .../tests/test_mg_cugraph_handler.py | 18 +++++--- python/cugraph_service/tests/test_mg_e2e.py | 45 +++++++++++++++++++ 6 files changed, 129 insertions(+), 29 deletions(-) diff --git a/python/cugraph_service/cugraph_service_client/types.py b/python/cugraph_service/cugraph_service_client/types.py index 613e29e94d1..8b05186b0bb 100644 --- a/python/cugraph_service/cugraph_service_client/types.py +++ b/python/cugraph_service/cugraph_service_client/types.py @@ -51,9 +51,9 @@ class ValueWrapper(UnionWrapper): valid_types = ["int", "float", "str", "bool"] if numpy: - valid_types += ["numpy.int32", "numpy.int64", "numpy.ndarray"] + valid_types += ["numpy.int8", "numpy.int32", "numpy.int64", "numpy.ndarray"] if cupy: - valid_types += ["cupy.int32", "cupy.int64", "cupy.ndarray"] + valid_types += ["cupy.int8", "cupy.int32", "cupy.int64", "cupy.ndarray"] def __init__(self, val, val_name="value"): """ @@ -72,8 +72,8 @@ def __init__(self, val, val_name="value"): self.union = Value(int64_value=val) elif isinstance(val, float): self.union = Value(double_value=val) - elif (numpy and isinstance(val, numpy.int32)) or ( - cupy and isinstance(val, cupy.int32) + elif (numpy and isinstance(val, (numpy.int8, numpy.int32))) or ( + cupy and isinstance(val, (cupy.int8, cupy.int32)) ): self.union = Value(int32_value=int(val)) elif (numpy and isinstance(val, numpy.int64)) or ( diff --git a/python/cugraph_service/cugraph_service_server/cugraph_handler.py b/python/cugraph_service/cugraph_service_server/cugraph_handler.py index 17e30f53b84..f086342d7c5 100644 --- a/python/cugraph_service/cugraph_service_server/cugraph_handler.py +++ b/python/cugraph_service/cugraph_service_server/cugraph_handler.py @@ -128,6 +128,9 @@ def get_graph_ids(self): def get_graph(self, graph_id): return self.__handler._get_graph(graph_id) + def add_graph(self, G): + return self.__handler._add_graph(G) + class CugraphHandler: """ @@ -276,7 +279,7 @@ def call_graph_creation_extension( func_kwargs_repr, ) # FIXME: ensure graph_obj is a graph obj - return self.__add_graph(graph_obj) + return self._add_graph(graph_obj) def call_extension( self, @@ -381,7 +384,7 @@ def create_graph(self): new graph ID. """ pG = self.__create_graph() - return self.__add_graph(pG) + return self._add_graph(pG) def delete_graph(self, graph_id): """ @@ -615,7 +618,7 @@ def extract_subgraph( except Exception: raise CugraphServiceError(f"{traceback.format_exc()}") - return self.__add_graph(G) + return self._add_graph(G) def get_graph_vertex_data( self, id_or_ids, null_replacement_value, property_keys, types, graph_id @@ -935,6 +938,16 @@ def get_graph_type(self, graph_id): # "Protected" interface - used for both implementation and test/debug. Will # not be exposed to a cugraph_service client, but will be used by extensions # via the ExtensionServerFacade. + def _add_graph(self, G): + """ + Create a new graph ID for G and add G to the internal mapping of + graph ID:graph instance. + """ + gid = self.__next_graph_id + self.__graph_objs[gid] = G + self.__next_graph_id += 1 + return gid + def _get_graph(self, graph_id): """ Return the cuGraph Graph object associated with graph_id. 
@@ -996,16 +1009,6 @@ def __get_dataframe_from_csv(self, csv_file_name, delimiter, dtypes, header, nam return gdf - def __add_graph(self, G): - """ - Create a new graph ID for G and add G to the internal mapping of - graph ID:graph instance. - """ - gid = self.__next_graph_id - self.__graph_objs[gid] = G - self.__next_graph_id += 1 - return gid - def __create_graph(self): """ Instantiate a graph object using a type appropriate for the handler ( diff --git a/python/cugraph_service/tests/conftest.py b/python/cugraph_service/tests/conftest.py index 519a7d39536..6b9d9702eb6 100644 --- a/python/cugraph_service/tests/conftest.py +++ b/python/cugraph_service/tests/conftest.py @@ -187,6 +187,7 @@ def my_extension(arg1, arg2, server): return retval """ + extension_returns_none_file_contents = """ def my_extension(): @@ -194,6 +195,36 @@ def my_extension(): """ +extension_adds_graph_file_contents = """ +import cupy +import cudf +from cugraph.experimental import PropertyGraph + +def my_extension(arg1, arg2, server): + ''' + This extension creates a new graph, registers it with the server, and + returns the new graph ID and some additional data. + ''' + df = cudf.DataFrame({"src": [0, 1, 2], + "dst": [1, 2, 3], + "edge_prop": ["a", "b", "c"], + }) + pG = PropertyGraph() + pG.add_edge_data(df, vertex_col_names=["src", "dst"]) + + pG_gid = server.add_graph(pG) + + edge_df = pG.get_edge_data() + values = cupy.array(edge_df[pG.edge_id_col_name] + arg1 + arg2) + + # UCX-Py transfers require cupy types, and cupy types are converted to host + # for non-UCX-Py transfers. + pG_gid = cupy.int8(pG_gid) + + return (pG_gid, values) +""" + + ############################################################################### # module scope fixtures @@ -296,3 +327,12 @@ def extension_returns_none(): ) yield tmp_extension_dir.name + + +@pytest.fixture(scope="module") +def extension_adds_graph(): + tmp_extension_dir = utils.create_tmp_extension_dir( + extension_adds_graph_file_contents + ) + + yield tmp_extension_dir.name diff --git a/python/cugraph_service/tests/test_cugraph_handler.py b/python/cugraph_service/tests/test_cugraph_handler.py index a5637c0c604..e592b643deb 100644 --- a/python/cugraph_service/tests/test_cugraph_handler.py +++ b/python/cugraph_service/tests/test_cugraph_handler.py @@ -336,8 +336,9 @@ def test_get_graph_data_large_vertex_ids(graph_creation_extension_big_vertex_ids vert_data = handler.get_graph_vertex_data( id_or_ids=invalid_vert_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(vert_data)) == 0 @@ -346,8 +347,9 @@ def test_get_graph_data_large_vertex_ids(graph_creation_extension_big_vertex_ids vert_data = handler.get_graph_vertex_data( id_or_ids=large_vert_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(vert_data)) == 1 @@ -356,8 +358,9 @@ def test_get_graph_data_large_vertex_ids(graph_creation_extension_big_vertex_ids edge_data = handler.get_graph_edge_data( id_or_ids=invalid_edge_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(edge_data)) == 0 @@ -366,8 +369,9 @@ def test_get_graph_data_large_vertex_ids(graph_creation_extension_big_vertex_ids edge_data = handler.get_graph_edge_data( id_or_ids=small_edge_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert 
len(pickle.loads(edge_data)) == 1 @@ -393,8 +397,9 @@ def test_get_graph_data_empty_graph(graph_creation_extension_empty_graph): vert_data = handler.get_graph_vertex_data( id_or_ids=invalid_vert_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(vert_data)) == 0 @@ -403,8 +408,9 @@ def test_get_graph_data_empty_graph(graph_creation_extension_empty_graph): edge_data = handler.get_graph_edge_data( id_or_ids=invalid_edge_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(edge_data)) == 0 diff --git a/python/cugraph_service/tests/test_mg_cugraph_handler.py b/python/cugraph_service/tests/test_mg_cugraph_handler.py index 5162f5b3c2e..6e3b1d750ad 100644 --- a/python/cugraph_service/tests/test_mg_cugraph_handler.py +++ b/python/cugraph_service/tests/test_mg_cugraph_handler.py @@ -109,8 +109,9 @@ def test_get_graph_data_large_vertex_ids( vert_data = handler.get_graph_vertex_data( id_or_ids=invalid_vert_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(vert_data)) == 0 @@ -119,8 +120,9 @@ def test_get_graph_data_large_vertex_ids( vert_data = handler.get_graph_vertex_data( id_or_ids=large_vert_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(vert_data)) == 1 @@ -129,8 +131,9 @@ def test_get_graph_data_large_vertex_ids( edge_data = handler.get_graph_edge_data( id_or_ids=invalid_edge_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(edge_data)) == 0 @@ -139,8 +142,9 @@ def test_get_graph_data_large_vertex_ids( edge_data = handler.get_graph_edge_data( id_or_ids=small_edge_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(edge_data)) == 1 @@ -167,8 +171,9 @@ def test_get_graph_data_empty_graph( vert_data = handler.get_graph_vertex_data( id_or_ids=invalid_vert_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(vert_data)) == 0 @@ -177,8 +182,9 @@ def test_get_graph_data_empty_graph( edge_data = handler.get_graph_edge_data( id_or_ids=invalid_edge_id, null_replacement_value=0, - graph_id=new_graph_id, property_keys=None, + types=None, + graph_id=new_graph_id, ) assert len(pickle.loads(edge_data)) == 0 diff --git a/python/cugraph_service/tests/test_mg_e2e.py b/python/cugraph_service/tests/test_mg_e2e.py index 902cf359204..cf35dae9946 100644 --- a/python/cugraph_service/tests/test_mg_e2e.py +++ b/python/cugraph_service/tests/test_mg_e2e.py @@ -431,3 +431,48 @@ def test_call_extension_result_on_device( for mod_name in ext_mod_names: client.unload_extension_module(mod_name) + + +def test_extension_adds_graph( + extension_adds_graph, result_device_id, client_of_sg_server_on_device_1 +): + """ + Ensures an extension can create and add a graph to the server and return the + new graph ID and other data. + """ + extension_dir = extension_adds_graph + client = client_of_sg_server_on_device_1 + + ext_mod_names = client.load_extensions(extension_dir) + + # The extension will add a graph, compute a value based on the graph data, + # and return the new graph ID and the result. 
+ graph_ids_before = client.get_graph_ids() + + val1 = 22 + val2 = 33.1 + results = client.call_extension( + "my_extension", val1, val2, result_device=result_device_id + ) + + graph_ids_after = client.get_graph_ids() + + assert len(graph_ids_after) - len(graph_ids_before) == 1 + new_gid = (set(graph_ids_after) - set(graph_ids_before)).pop() + assert len(results) == 2 + assert results[0] == new_gid + expected_edge_ids = [0, 1, 2] + expected_val = [n + val1 + val2 for n in expected_edge_ids] + + if result_device_id is None: + assert results[1] == expected_val + else: + device_n = cp.cuda.Device(result_device_id) + assert results[0].device == device_n + assert results[1].device == device_n + assert results[1].tolist() == expected_val + + # FIXME: much of this test could be in a fixture which ensures the extension + # is unloaded from the server before returning + for mod_name in ext_mod_names: + client.unload_extension_module(mod_name) From 133c0d0152415c04b37d027c578267bf739528b6 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Fri, 28 Oct 2022 10:50:24 -0700 Subject: [PATCH 30/41] add_remore_storage_support --- .../dgl_extensions/cugraph_service_store.py | 80 ++++++++++++------- .../gnn/dgl_extensions/cugraph_store.py | 2 +- .../service_extensions/sampling.py | 76 ++++++++++++++++++ .../gnn/dgl_extensions/utils/sampling.py | 15 +++- .../test_dgl_extension_remote_wrappers.py | 26 ++++++ 5 files changed, 167 insertions(+), 32 deletions(-) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/sampling.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 0383a9e53c1..54669ea8cc2 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -12,15 +12,15 @@ # limitations under the License. 
from collections import defaultdict - from .base_cugraph_store import BaseCuGraphStore - from functools import cached_property -from .utils.add_data import _update_feature_map -from .utils.add_data import deserialize_strings_from_char_ars -from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg -from .utils.sampling import get_underlying_dtype_from_sg -from .feature_storage import CuFeatureStorage +from cugraph.gnn.dgl_extensions.utils.add_data import _update_feature_map +from cugraph.gnn.dgl_extensions.utils.add_data import deserialize_strings_from_char_ars +from cugraph.gnn.dgl_extensions.feature_storage import CuFeatureStorage + +# TODO: Make this optional in next release +# Only used cause cant transfer dlpack objects through remote +import cupy as cp class CuGraphRemoteStore(BaseCuGraphStore): @@ -39,6 +39,8 @@ def __init__(self, graph, graph_client, backend_lib="torch"): add_data_module = "cugraph.gnn.dgl_extensions.service_extensions.add_data" _ = self.client.load_extensions(add_data_module) + sampling_module = "cugraph.gnn.dgl_extensions.service_extensions.sampling" + _ = self.client.load_extensions(sampling_module) del _ else: @@ -358,20 +360,27 @@ def sample_neighbors( first_sg = sgs_obj # Uniform sampling fails when the dtype # of the seed dtype is not same as the node dtype - # TODO: Update this function self.set_sg_node_dtype(first_sg) - # Below will be called from remote storage - # Call via Remote Storage - # TODO: Update this function - sampled_result_arrays = sample_pg( - self.gdata, + # TODO: Cant send dlpack or cupy arrays or numpys arrays + # through extensions + # Ask Rick + if isinstance(nodes_cap, dict): + nodes_ar = { + k: cp.from_dlpack(v).get().tolist() for k, v in nodes_cap.items() + } + else: + nodes_ar = cp.from_dlpack(nodes_cap).get().tolist() + + sampled_result_arrays = self.client.call_extension( + "sample_pg_remote", + graph_id=self.gdata._graph_id, has_multiple_etypes=self.has_multiple_etypes, etypes=self.etypes, sgs_obj=sgs_obj, sgs_src_range_obj=sgs_src_range_obj, sg_node_dtype=self._sg_node_dtype, - nodes_cap=nodes_cap, + nodes_ar=nodes_ar, replace=replace, fanout=fanout, edge_dir=edge_dir, @@ -383,14 +392,20 @@ def sample_neighbors( ###################################### @cached_property def extracted_subgraph(self): - return get_subgraph_and_src_range_from_pg( - self.gdata, reverse_edges=False, etype=None + return self.client.call_extension( + "get_subgraph_and_src_range_from_pg_remote", + graph_id=self.gdata._graph_id, + reverse_edges=False, + etype=None, ) @cached_property def extracted_reverse_subgraph(self): - return get_subgraph_and_src_range_from_pg( - self.gdata, reverse_edges=True, etype=None + return self.client.call_extension( + "get_subgraph_and_src_range_from_pg_remote", + graph_id=self.gdata._graph_id, + reverse_edges=True, + etype=None, ) @cached_property @@ -398,8 +413,11 @@ def extracted_subgraphs_per_type(self): sg_d = {} sg_src_range_d = {} for etype in self.etypes: - sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( - self.gdata, reverse_edges=False, etype=etype + sg_d[etype], sg_src_range_d[etype] = self.client.call_extension( + "get_subgraph_and_src_range_from_pg_remote", + graph_id=self.gdata._graph_id, + reverse_edges=False, + etype=etype, ) return sg_d, sg_src_range_d @@ -408,16 +426,26 @@ def extracted_reverse_subgraphs_per_type(self): sg_d = {} sg_src_range_d = {} for etype in self.etypes: - sg_d[etype], sg_src_range_d[etype] = get_subgraph_and_src_range_from_pg( - self.gdata, 
reverse_edges=True, etype=etype + sg_d[etype], sg_src_range_d[etype] = self.client.call_extension( + "get_subgraph_and_src_range_from_pg_remote", + graph_id=self.gdata._graph_id, + reverse_edges=True, + etype=etype, ) return sg_d, sg_src_range_d - def set_sg_node_dtype(self, sg): + def set_sg_node_dtype(self, sg_id): if hasattr(self, "_sg_node_dtype"): return self._sg_node_dtype else: - self._sg_node_dtype = get_underlying_dtype_from_sg(sg) + dtype_nbytes = self.client.call_extension( + "get_underlying_dtype_from_sg_remote", sg_id + ) + if dtype_nbytes == 32: + dtype = "int32" + else: + dtype = "int64" + self._sg_node_dtype = dtype return self._sg_node_dtype def find_edges(self, edge_ids_cap, etype): @@ -496,8 +524,6 @@ def __clear_cached_properties(self): def create_dlpack_results_from_arrays(sampled_result_arrays, etypes: list[str]): # TODO: Extend to pytorch/numpy/etc - import cupy as cp - if len(etypes) <= 1: s, d, e_id = sampled_result_arrays # Handle numpy array, cupy array, lists etc @@ -512,7 +538,7 @@ def create_dlpack_results_from_arrays(sampled_result_arrays, etypes: list[str]): e_id = sampled_result_arrays[array_start_offset + 2] s, d, e_id = cp.asarray(s), cp.asarray(d), cp.asarray(e_id) array_start_offset = array_start_offset + 3 - if s is not None and len(s) >= 0: + if s is not None and len(s) > 0: s, d, e_id = s.toDlpack(), d.toDlpack(), e_id.toDlpack() else: s, d, e_id = None, None, None diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index faed0c7086f..e9f2af36c92 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -266,7 +266,7 @@ def sample_neighbors( sgs_obj=sgs_obj, sgs_src_range_obj=sgs_src_range_obj, sg_node_dtype=self._sg_node_dtype, - nodes_cap=nodes_cap, + nodes_ar=nodes_cap, replace=replace, fanout=fanout, edge_dir=edge_dir, diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/sampling.py b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/sampling.py new file mode 100644 index 00000000000..f3fb8f7584c --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/sampling.py @@ -0,0 +1,76 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from cugraph.gnn.dgl_extensions.utils.sampling import ( + sample_pg, + get_subgraph_and_src_range_from_pg, +) +from cugraph.gnn.dgl_extensions.utils.sampling import get_underlying_dtype_from_sg +import cupy as cp + + +def get_subgraph_and_src_range_from_pg_remote(graph_id, reverse_edges, etype, server): + pG = server.get_graph(graph_id) + subg, src_range = get_subgraph_and_src_range_from_pg(pG, reverse_edges, etype) + g_id = server.add_graph(subg) + g_id = cp.int8(g_id) + return g_id, src_range + + +def get_underlying_dtype_from_sg_remote(graph_id, server): + g = server.get_graph(graph_id) + dtype_name = get_underlying_dtype_from_sg(g).name + if dtype_name == "int32": + return 32 + if dtype_name == "int64": + return 64 + else: + raise NotImplementedError( + "IDS other than int32 and int64 not yet supported" + f"got dtype = {dtype_name}" + ) + + +def sample_pg_remote( + graph_id, + has_multiple_etypes, + etypes, + sgs_obj, + sgs_src_range_obj, + sg_node_dtype, + nodes_ar, + replace, + fanout, + edge_dir, + server, +): + pg = server.get_graph(graph_id) + + if isinstance(sgs_obj, dict): + sgs_obj = {k: server.get_graph(v) for k, v in sgs_obj.items()} + else: + sgs_obj = server.get_graph(sgs_obj) + + sampled_result_arrays = sample_pg( + pg=pg, + has_multiple_etypes=has_multiple_etypes, + etypes=etypes, + sgs_obj=sgs_obj, + sgs_src_range_obj=sgs_src_range_obj, + sg_node_dtype=sg_node_dtype, + nodes_ar=nodes_ar, + replace=replace, + fanout=fanout, + edge_dir=edge_dir, + ) + + return sampled_result_arrays diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py index 460b44ee3b1..cdd93b0fee4 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/sampling.py @@ -216,15 +216,15 @@ def sample_pg( sgs_obj, sgs_src_range_obj, sg_node_dtype, - nodes_cap, + nodes_ar, replace, fanout, edge_dir, ): - if isinstance(nodes_cap, dict): - nodes = {t: cudf.from_dlpack(n) for t, n in nodes_cap.items()} + if isinstance(nodes_ar, dict): + nodes = {t: create_cudf_series_from_node_ar(n) for t, n in nodes_ar.items()} else: - nodes = cudf.from_dlpack(nodes_cap) + nodes = create_cudf_series_from_node_ar(nodes_ar) if isinstance(pg, MGPropertyGraph): sample_f = cugraph.dask.uniform_neighbor_sample @@ -280,3 +280,10 @@ def sample_pg( sampled_df[dst_n].values, sampled_df["indices"].values, ) + + +def create_cudf_series_from_node_ar(node_ar): + if type(node_ar).__name__ == "PyCapsule": + return cudf.from_dlpack(node_ar) + else: + return cudf.Series(node_ar) diff --git a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py index f771dd2aad4..4af95dbaa4f 100644 --- a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py +++ b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py @@ -13,6 +13,7 @@ from cugraph.gnn.dgl_extensions.cugraph_service_store import CuGraphRemoteStore from cugraph_service_client.client import CugraphServiceClient as Client +import cudf import numpy as np @@ -113,6 +114,31 @@ def assert_correct_gs(gs): expected_result = np.asarray([16, 18], dtype=np.int32) np.testing.assert_equal(result, expected_result) + # Verify set_sg_dtype + # verify extracted_reverse_subgraph + subgraph, src_range = gs.extracted_reverse_subgraph + dtype = gs.set_sg_node_dtype(subgraph) + assert dtype == "int32" + + # Verify Sampling Results + nodes_cap = {"nt.c": 
cudf.Series([6]).to_dlpack()} + result = gs.sample_neighbors(nodes_cap) + result = { + k: cudf.DataFrame( + { + "src": cudf.from_dlpack(v[0]), + "dst": cudf.from_dlpack(v[1]), + "eid": cudf.from_dlpack(v[2]), + } + ) + for k, v in result.items() + if v[0] is not None + } + src_vals = result["('nt.c', 'connects', 'nt.c')"]["src"].values.get() + sorted(src_vals) + expected_vals = np.asarray([7, 8, 9], dtype=np.int32) + np.testing.assert_equal(src_vals, expected_vals) + def test_remote_wrappers(): # TODO: Check with rick on how to test it From e91dc1fc19252fe22798232b7903951c083a3688 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Mon, 31 Oct 2022 08:15:56 -0700 Subject: [PATCH 31/41] Add LocalStore --- .../dgl_extensions/cugraph_service_store.py | 33 ++++-- .../gnn/dgl_extensions/cugraph_store.py | 104 +++++++++++++++++- .../service_extensions/add_data.py | 36 ++---- .../gnn/dgl_extensions/utils/add_data.py | 82 ++++---------- .../gnn/dgl_extensions/utils/feature_map.py | 60 ++++++++++ 5 files changed, 224 insertions(+), 91 deletions(-) create mode 100644 python/cugraph/cugraph/gnn/dgl_extensions/utils/feature_map.py diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 54669ea8cc2..90428e727dd 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -14,8 +14,7 @@ from collections import defaultdict from .base_cugraph_store import BaseCuGraphStore from functools import cached_property -from cugraph.gnn.dgl_extensions.utils.add_data import _update_feature_map -from cugraph.gnn.dgl_extensions.utils.add_data import deserialize_strings_from_char_ars +from cugraph.gnn.dgl_extensions.utils.feature_map import _update_feature_map from cugraph.gnn.dgl_extensions.feature_storage import CuFeatureStorage # TODO: Make this optional in next release @@ -172,16 +171,15 @@ def add_node_data_from_parquet( ------- None """ - # TODO: Use PR 2850: https://github.com/rapidsai/cugraph/pull/2850 c_ar, len_ar = self.client.call_extension( - func_name="add_node_data_from_parquet", + func_name="add_node_data_from_parquet_remote", file_path=file_path, node_col_name=node_col_name, node_offset=node_offset, ntype=ntype, graph_id=self.gdata._graph_id, ) - loaded_columns = deserialize_strings_from_char_ars(c_ar, len_ar) + loaded_columns = _deserialize_strings_from_char_ars(c_ar, len_ar) columns = [col for col in loaded_columns if col != node_col_name] _update_feature_map( @@ -227,9 +225,8 @@ def add_edge_data_from_parquet( None """ - # TODO: Use PR 2850: https://github.com/rapidsai/cugraph/pull/2850 c_ar, len_ar = self.client.call_extension( - func_name="add_edge_data_from_parquet", + func_name="add_edge_data_from_parquet_remote", file_path=file_path, node_col_names=node_col_names, canonical_etype=canonical_etype, @@ -237,7 +234,7 @@ def add_edge_data_from_parquet( dst_offset=dst_offset, graph_id=self.gdata._graph_id, ) - loaded_columns = deserialize_strings_from_char_ars(c_ar, len_ar) + loaded_columns = _deserialize_strings_from_char_ars(c_ar, len_ar) columns = [col for col in loaded_columns if col not in node_col_names] _update_feature_map( self.edata_feat_col_d, feat_name, contains_vector_features, columns @@ -544,3 +541,23 @@ def create_dlpack_results_from_arrays(sampled_result_arrays, etypes: list[str]): s, d, e_id = None, None, None result_d[etype] = (s, d, e_id) return result_d + + +def 
_deserialize_strings_from_char_ars(char_ar, len_ar): + string_start = 0 + string_list = [] + for string_offset in len_ar: + string_end = string_start + string_offset + s = char_ar[string_start:string_end] + + # Check of cupy array + if type(s).__module__ == "cupy": + s = s.get() + + # Check for numpy + if type(s).__module__ == "numpy": + s = s.tolist() + s = "".join([chr(i) for i in s]) + string_list.append(s) + string_start = string_end + return string_list diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index e9f2af36c92..2727f557215 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -18,7 +18,8 @@ from functools import cached_property from .utils.find_edges import find_edges from .utils.node_subgraph import node_subgraph -from .utils.add_data import _update_feature_map +from .utils.feature_map import _update_feature_map +from .utils.add_data import add_edge_data_from_parquet, add_node_data_from_parquet from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg from .utils.sampling import get_underlying_dtype_from_sg from .feature_storage import CuFeatureStorage @@ -140,6 +141,107 @@ def add_edge_data( # Clear properties if set as data has changed self.__clear_cached_properties() + def add_node_data_from_parquet( + self, + file_path, + node_col_name, + ntype=None, + node_offset=0, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing node properties to the PropertyGraph. + + Parameters + ---------- + file_path: string + Path of the files on the server + node_col_name : string + The column name that contains the values to be used as vertex IDs. + ntype : string + The node type to be added. + For example, if dataframe contains data about users, ntype + might be "users". + If not specified, the type of properties will be added as + an empty string. + feat_name : {} or string + A map of feature names under which we should save the added + properties like {"feat_1":[f1, f2], "feat_2":[f3, f4]} + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + loaded_columns = add_node_data_from_parquet( + file_path=file_path, + node_col_name=node_col_name, + node_offset=node_offset, + ntype=ntype, + pG=self.gdata, + ) + columns = [col for col in loaded_columns if col != node_col_name] + _update_feature_map( + self.ndata_feat_col_d, feat_name, contains_vector_features, columns + ) + # Clear properties if set as data has changed + self.__clear_cached_properties() + + def add_edge_data_from_parquet( + self, + file_path, + node_col_names, + src_offset=0, + dst_offset=0, + canonical_etype=None, + feat_name=None, + contains_vector_features=False, + ): + """ + Add a dataframe describing edge properties to the PropertyGraph. + + Parameters + ---------- + file_path : string + Path of file on server + node_col_names : string + The column names that contain the values to be used as the source + and destination vertex IDs for the edges. + canonical_etype : string + The edge type to be added. This should follow the string format + '(src_type),(edge_type),(dst_type)' + If not specified, the type of properties will be added as + an empty string. 
+ feat_name : string or dict {} + The feature name under which we should save the added properties + (ignored if contains_vector_features=False and the col names of + the dataframe are treated as corresponding feature names) + contains_vector_features : False + Whether to treat the columns of the dataframe being added as + as 2d features + Returns + ------- + None + """ + + loaded_columns = add_edge_data_from_parquet( + file_path=file_path, + node_col_names=node_col_names, + canonical_etype=canonical_etype, + src_offset=src_offset, + dst_offset=dst_offset, + pG=self.gdata, + ) + columns = [col for col in loaded_columns if col not in node_col_names] + _update_feature_map( + self.edata_feat_col_d, feat_name, contains_vector_features, columns + ) + self.__clear_cached_properties() + def get_node_storage(self, key, ntype=None, indices_offset=0): if ntype is None: ntypes = self.ntypes diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py index 13c8cc55403..918030615af 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/service_extensions/add_data.py @@ -11,44 +11,32 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf -import dask_cudf import cupy as cp -from cugraph.experimental import MGPropertyGraph +from cugraph.gnn.dgl_extensions.utils.add_data import ( + add_edge_data_from_parquet, + add_node_data_from_parquet, +) -def add_node_data_from_parquet( +def add_node_data_from_parquet_remote( file_path, node_col_name, node_offset, ntype, graph_id, server ): pG = server.get_graph(graph_id) - if isinstance(pG, MGPropertyGraph): - df = dask_cudf.read_parquet(file_path) - else: - df = cudf.read_parquet(file_path) - - df[node_col_name] = df[node_col_name] + node_offset - pG.add_vertex_data(df, vertex_col_name=node_col_name, type_name=ntype) - - columns_list = list(df.columns) + columns_list = add_node_data_from_parquet( + file_path, node_col_name, node_offset, ntype, pG + ) return serialize_strings_to_array(columns_list) -def add_edge_data_from_parquet( +def add_edge_data_from_parquet_remote( file_path, node_col_names, canonical_etype, src_offset, dst_offset, graph_id, server ): pG = server.get_graph(graph_id) - if isinstance(pG, MGPropertyGraph): - df = dask_cudf.read_parquet(file_path) - else: - df = cudf.read_parquet(file_path) - - df[node_col_names[0]] = df[node_col_names[0]] + src_offset - df[node_col_names[1]] = df[node_col_names[1]] + dst_offset - pG.add_edge_data(df, vertex_col_names=node_col_names, type_name=canonical_etype) - - columns_list = list(df.columns) + columns_list = add_edge_data_from_parquet( + file_path, node_col_names, canonical_etype, src_offset, dst_offset, pG + ) return serialize_strings_to_array(columns_list) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py index c05f794f038..7364db25f2f 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/add_data.py @@ -12,71 +12,37 @@ # limitations under the License. 
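Because extension results are limited to simple Python values, the *_remote wrappers above do not return column names as strings directly: the names are flattened on the server into one array of character codes plus a per-string length array (serialize_strings_to_array) and rebuilt on the client (_deserialize_strings_from_char_ars, shown earlier). A pure-Python sketch of that round trip, inferred from the deserializer and therefore illustrative only:

    # Encode ["node_id", "feat"] as (char_codes, lengths) -- the presumed
    # counterpart of serialize_strings_to_array (sketch, not the actual code).
    names = ["node_id", "feat"]
    char_codes = [ord(c) for name in names for c in name]  # flat list of code points
    lengths = [len(name) for name in names]                # one length per string

    # Decode, mirroring _deserialize_strings_from_char_ars minus the cupy/numpy handling.
    decoded, start = [], 0
    for n in lengths:
        decoded.append("".join(chr(i) for i in char_codes[start:start + n]))
        start += n
    assert decoded == names
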
# Utils for adding data to cugraph graphstore objects +import dask_cudf +import cudf +from cugraph.experimental import MGPropertyGraph -def _update_feature_map( - pg_feature_map, feat_name_obj, contains_vector_features, columns -): - """ - Update the existing feature map `pg_feature_map` based on `feat_name_obj` - """ - if contains_vector_features: - if feat_name_obj is None: - raise ValueError( - "feature name must be provided when wrapping" - + " multiple columns under a single feature name" - + " or a feature map" - ) +def add_node_data_from_parquet(file_path, node_col_name, node_offset, ntype, pG): + if isinstance(pG, MGPropertyGraph): + df = dask_cudf.read_parquet(file_path) + else: + df = cudf.read_parquet(file_path) - if isinstance(feat_name_obj, str): - pg_feature_map[feat_name_obj] = columns + df[node_col_name] = df[node_col_name] + node_offset + pG.add_vertex_data(df, vertex_col_name=node_col_name, type_name=ntype) - elif isinstance(feat_name_obj, dict): - covered_columns = [] - for col in feat_name_obj.keys(): - current_cols = feat_name_obj[col] - # Handle strings too - if isinstance(current_cols, str): - current_cols = [current_cols] - covered_columns = covered_columns + current_cols + columns_list = list(df.columns) - if set(covered_columns) != set(columns): - raise ValueError( - f"All the columns {columns} not covered in {covered_columns} " - f"Please check the feature_map {feat_name_obj} provided" - ) + return columns_list - for key, cols in feat_name_obj.items(): - if isinstance(cols, str): - cols = [cols] - pg_feature_map[key] = cols - else: - raise ValueError(f"{feat_name_obj} should be str or dict") - else: - if feat_name_obj: - raise ValueError( - f"feat_name {feat_name_obj} is only valid when " - "wrapping multiple columns under feature names" - ) - for col in columns: - pg_feature_map[col] = [col] +def add_edge_data_from_parquet( + file_path, node_col_names, canonical_etype, src_offset, dst_offset, pG +): + if isinstance(pG, MGPropertyGraph): + df = dask_cudf.read_parquet(file_path) + else: + df = cudf.read_parquet(file_path) -def deserialize_strings_from_char_ars(char_ar, len_ar): - string_start = 0 - string_list = [] - for string_offset in len_ar: - string_end = string_start + string_offset - s = char_ar[string_start:string_end] + df[node_col_names[0]] = df[node_col_names[0]] + src_offset + df[node_col_names[1]] = df[node_col_names[1]] + dst_offset + pG.add_edge_data(df, vertex_col_names=node_col_names, type_name=canonical_etype) - # Check of cupy array - if type(s).__module__ == "cupy": - s = s.get() + columns_list = list(df.columns) - # Check for numpy - if type(s).__module__ == "numpy": - s = s.tolist() - s = "".join([chr(i) for i in s]) - string_list.append(s) - string_start = string_end - return string_list + return columns_list diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/utils/feature_map.py b/python/cugraph/cugraph/gnn/dgl_extensions/utils/feature_map.py new file mode 100644 index 00000000000..0716c22e266 --- /dev/null +++ b/python/cugraph/cugraph/gnn/dgl_extensions/utils/feature_map.py @@ -0,0 +1,60 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _update_feature_map( + pg_feature_map, feat_name_obj, contains_vector_features, columns +): + """ + Update the existing feature map `pg_feature_map` based on `feat_name_obj` + """ + if contains_vector_features: + if feat_name_obj is None: + raise ValueError( + "feature name must be provided when wrapping" + + " multiple columns under a single feature name" + + " or a feature map" + ) + + if isinstance(feat_name_obj, str): + pg_feature_map[feat_name_obj] = columns + + elif isinstance(feat_name_obj, dict): + covered_columns = [] + for col in feat_name_obj.keys(): + current_cols = feat_name_obj[col] + # Handle strings too + if isinstance(current_cols, str): + current_cols = [current_cols] + covered_columns = covered_columns + current_cols + + if set(covered_columns) != set(columns): + raise ValueError( + f"All the columns {columns} not covered in {covered_columns} " + f"Please check the feature_map {feat_name_obj} provided" + ) + + for key, cols in feat_name_obj.items(): + if isinstance(cols, str): + cols = [cols] + pg_feature_map[key] = cols + else: + raise ValueError(f"{feat_name_obj} should be str or dict") + else: + if feat_name_obj: + raise ValueError( + f"feat_name {feat_name_obj} is only valid when " + "wrapping multiple columns under feature names" + ) + for col in columns: + pg_feature_map[col] = [col] From c0fab52a979a2eb86242004eb8399845b5c6e9e4 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Mon, 31 Oct 2022 16:45:55 -0700 Subject: [PATCH 32/41] Added working graph service --- python/cugraph/cugraph/gnn/__init__.py | 1 + .../dgl_extensions/cugraph_service_store.py | 27 ++++++-- .../gnn/dgl_extensions/cugraph_store.py | 10 +++ .../test_dgl_extension_remote_wrappers.py | 61 ++++++++++++++----- 4 files changed, 80 insertions(+), 19 deletions(-) diff --git a/python/cugraph/cugraph/gnn/__init__.py b/python/cugraph/cugraph/gnn/__init__.py index 1e6f2e2b140..5fc54befd3f 100644 --- a/python/cugraph/cugraph/gnn/__init__.py +++ b/python/cugraph/cugraph/gnn/__init__.py @@ -12,4 +12,5 @@ # limitations under the License. from .dgl_extensions.cugraph_store import CuGraphStore +from .dgl_extensions.cugraph_service_store import CuGraphRemoteStore from .dgl_extensions.feature_storage import CuFeatureStorage diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 90428e727dd..b24c44cb037 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -31,17 +31,24 @@ class CuGraphRemoteStore(BaseCuGraphStore): This class return dlpack types and has additional functional arguments. 
""" - def __init__(self, graph, graph_client, backend_lib="torch"): + def __init__(self, graph, graph_client, device_id=None, backend_lib="torch"): + if type(graph).__name__ in ["RemotePropertyGraph", "RemoteMGPropertyGraph"]: + if device_id is not None: + import numba.cuda as cuda + + cuda.select_device(device_id) + cp.cuda.runtime.setDevice(device_id) + self.__G = graph self.client = graph_client + self.device_id = device_id add_data_module = "cugraph.gnn.dgl_extensions.service_extensions.add_data" _ = self.client.load_extensions(add_data_module) sampling_module = "cugraph.gnn.dgl_extensions.service_extensions.sampling" _ = self.client.load_extensions(sampling_module) del _ - else: raise ValueError("graph must be a RemoteGraph") @@ -91,7 +98,7 @@ def add_node_data( None """ raise NotImplementedError( - "Adding Node Data From Local is not yet supported" + "Adding Node Data From Local is not yet supported " "Please Use `add_node_data_from_parquet`" ) @@ -159,6 +166,9 @@ def add_node_data_from_parquet( might be "users". If not specified, the type of properties will be added as an empty string. + node_offset: int, + The offset to add for the current node type + defaults to zero feat_name : {} or string A map of feature names under which we should save the added properties like {"feat_1":[f1, f2], "feat_2":[f3, f4]} @@ -171,6 +181,7 @@ def add_node_data_from_parquet( ------- None """ + c_ar, len_ar = self.client.call_extension( func_name="add_node_data_from_parquet_remote", file_path=file_path, @@ -178,6 +189,7 @@ def add_node_data_from_parquet( node_offset=node_offset, ntype=ntype, graph_id=self.gdata._graph_id, + result_device=self.device_id, ) loaded_columns = _deserialize_strings_from_char_ars(c_ar, len_ar) @@ -213,6 +225,12 @@ def add_edge_data_from_parquet( '(src_type),(edge_type),(dst_type)' If not specified, the type of properties will be added as an empty string. + src_offset: int, + The offset to add for the source node type + defaults to zero + dst_offset: int, + The offset to add for the dst node type + defaults to zero feat_name : string or dict {} The feature name under which we should save the added properties (ignored if contains_vector_features=False and the col names of @@ -224,7 +242,6 @@ def add_edge_data_from_parquet( ------- None """ - c_ar, len_ar = self.client.call_extension( func_name="add_edge_data_from_parquet_remote", file_path=file_path, @@ -233,6 +250,7 @@ def add_edge_data_from_parquet( src_offset=src_offset, dst_offset=dst_offset, graph_id=self.gdata._graph_id, + result_device=self.device_id, ) loaded_columns = _deserialize_strings_from_char_ars(c_ar, len_ar) columns = [col for col in loaded_columns if col not in node_col_names] @@ -371,6 +389,7 @@ def sample_neighbors( sampled_result_arrays = self.client.call_extension( "sample_pg_remote", + result_device=self.device_id, graph_id=self.gdata._graph_id, has_multiple_etypes=self.has_multiple_etypes, etypes=self.etypes, diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 2727f557215..b4a0a54f1b6 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -165,6 +165,9 @@ def add_node_data_from_parquet( might be "users". If not specified, the type of properties will be added as an empty string. 
+ node_offset: int, + The offset to add for the particular ntype + defaults to zero feat_name : {} or string A map of feature names under which we should save the added properties like {"feat_1":[f1, f2], "feat_2":[f3, f4]} @@ -220,6 +223,13 @@ def add_edge_data_from_parquet( The feature name under which we should save the added properties (ignored if contains_vector_features=False and the col names of the dataframe are treated as corresponding feature names) + + src_offset: int, + The offset to add for the source node type + defaults to zero + dst_offset: int, + The offset to add for the dst node type + defaults to zero contains_vector_features : False Whether to treat the columns of the dataframe being added as as 2d features diff --git a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py index 4af95dbaa4f..68a9807cd4c 100644 --- a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py +++ b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py @@ -11,14 +11,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.gnn.dgl_extensions.cugraph_service_store import CuGraphRemoteStore -from cugraph_service_client.client import CugraphServiceClient as Client -import cudf import numpy as np +# Add Path for cugraph_service_client +# import sys +# sys.path.append("/home/nfs/vjawa/dgl/cugraph/python/cugraph_service") +from cugraph_service_client.client import CugraphServiceClient as Client + -def create_gs(client): - gs = CuGraphRemoteStore(client.graph(), client) +def create_gs(client, device_id=None): + from cugraph.gnn.dgl_extensions.cugraph_service_store import CuGraphRemoteStore + + gs = CuGraphRemoteStore(client.graph(), client, device_id, backend_lib="cupy") gs.add_node_data_from_parquet( file_path="nt.a.parquet", node_col_name="node_id", ntype="nt.a", node_offset=0 ) @@ -63,32 +67,50 @@ def create_gs(client): return gs -def assert_correct_gs(gs): +def assert_valid_device(cp_ar, device_id): + import cupy as cp + + if device_id is None: + return True + else: + device_n = cp.cuda.Device(device_id) + if cp_ar.device != device_n: + print(f"device = {cp_ar.device}, expected_device = {device_n}") + + +def assert_valid_gs(gs): + import cudf + assert gs.etypes[0] == "('nt.a', 'connects', 'nt.b')" assert gs.ntypes[0] == "nt.a" assert gs.num_nodes_dict["nt.a"] == 3 assert gs.num_edges_dict["('nt.a', 'connects', 'nt.b')"] == 3 assert gs.num_nodes("nt.c") == 5 + print("Verified ntypes, etypes, num_nodes") + # Test Get Node Storage result = gs.get_node_storage(key="node_feat", ntype="nt.a", indices_offset=0).fetch( [0, 1, 2] ) - result = result.cpu().numpy() + assert_valid_device(result, gs.device_id) + result = result.get() expected_result = np.asarray([0, 10, 20], dtype=np.int32) np.testing.assert_equal(result, expected_result) result = gs.get_node_storage(key="node_feat", ntype="nt.b", indices_offset=3).fetch( [0, 1, 2] ) - result = result.cpu().numpy() + assert_valid_device(result, gs.device_id) + result = result.get() expected_result = np.asarray([30, 40, 50], dtype=np.int32) np.testing.assert_equal(result, expected_result) result = gs.get_node_storage(key="node_feat", ntype="nt.c", indices_offset=5).fetch( [1, 2, 3] ) - result = result.cpu().numpy() + assert_valid_device(result, gs.device_id) + result = result.get() expected_result = np.asarray([60, 70, 80], dtype=np.int32) np.testing.assert_equal(result, expected_result) @@ 
-96,31 +118,36 @@ def assert_correct_gs(gs): result = gs.get_edge_storage( key="edge_feat", etype="('nt.a', 'connects', 'nt.b')", indices_offset=0 ).fetch([0, 1, 2]) - result = result.cpu().numpy() + assert_valid_device(result, gs.device_id) + result = result.get() expected_result = np.asarray([10, 11, 12], dtype=np.int32) np.testing.assert_equal(result, expected_result) result = gs.get_edge_storage( key="edge_feat", etype="('nt.a', 'connects', 'nt.c')", indices_offset=0 ).fetch([4, 5]) - result = result.cpu().numpy() + assert_valid_device(result, gs.device_id) + result = result.get() expected_result = np.asarray([14, 15], dtype=np.int32) np.testing.assert_equal(result, expected_result) result = gs.get_edge_storage( key="edge_feat", etype="('nt.c', 'connects', 'nt.c')", indices_offset=0 ).fetch([6, 8]) - result = result.cpu().numpy() + assert_valid_device(result, gs.device_id) + result = result.get() expected_result = np.asarray([16, 18], dtype=np.int32) np.testing.assert_equal(result, expected_result) + print("Verified edge_feat, node_feat") + # Verify set_sg_dtype # verify extracted_reverse_subgraph subgraph, src_range = gs.extracted_reverse_subgraph dtype = gs.set_sg_node_dtype(subgraph) assert dtype == "int32" - # Verify Sampling Results + # Sampling Results nodes_cap = {"nt.c": cudf.Series([6]).to_dlpack()} result = gs.sample_neighbors(nodes_cap) result = { @@ -134,6 +161,7 @@ def assert_correct_gs(gs): for k, v in result.items() if v[0] is not None } + src_vals = result["('nt.c', 'connects', 'nt.c')"]["src"].values.get() sorted(src_vals) expected_vals = np.asarray([7, 8, 9], dtype=np.int32) @@ -142,6 +170,9 @@ def assert_correct_gs(gs): def test_remote_wrappers(): # TODO: Check with rick on how to test it + # Can only be tested after the packages land c = Client() - gs = create_gs(c) - assert_correct_gs(gs) + device_ls = [None, 0, 1] + for d in device_ls: + gs = create_gs(c) + assert_valid_gs(gs) From ce470bb1a814245380be1a65a040fd1418353e35 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Fri, 4 Nov 2022 09:39:55 -0700 Subject: [PATCH 33/41] Style Fixes --- python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index db108ceef54..d84a29b332d 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -57,7 +57,6 @@ def _convert_ar_list_to_dlpack(ar_ls): return cap - class CuFeatureStorage: """ Storage for node/edge feature data. 
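A small note on the assert_valid_device helper added to the remote-wrapper test above: as written it returns True for the default device and only prints a message when the array lands on the wrong device, so a placement mismatch will not actually fail the test. If strict checking is wanted, a variant along these lines could be used instead (a sketch, not part of the patch):

    import cupy as cp

    def assert_valid_device(cp_ar, device_id):
        # Raise instead of print when the array is not on the requested device.
        if device_id is None:
            return
        expected = cp.cuda.Device(device_id)
        assert cp_ar.device == expected, (
            f"device = {cp_ar.device}, expected_device = {expected}"
        )
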
@@ -136,7 +135,6 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): indices = indices + self.indices_offset - if self.storage_type == "node": result = self.pg.get_vertex_data(vertex_ids=indices, columns=self.columns) else: From be2d7cabb02fadcacdfbffc26bde0d38198afe79 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Fri, 4 Nov 2022 11:23:59 -0700 Subject: [PATCH 34/41] Skipped pytest for now --- .../cugraph/tests/test_dgl_extension_remote_wrappers.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py index 68a9807cd4c..aa93509d903 100644 --- a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py +++ b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py @@ -12,11 +12,7 @@ # limitations under the License. import numpy as np - -# Add Path for cugraph_service_client -# import sys -# sys.path.append("/home/nfs/vjawa/dgl/cugraph/python/cugraph_service") -from cugraph_service_client.client import CugraphServiceClient as Client +import pytest def create_gs(client, device_id=None): @@ -168,7 +164,10 @@ def assert_valid_gs(gs): np.testing.assert_equal(src_vals, expected_vals) +@pytest.mark.skip(reason="Enable when cughraph-service lands in the CI") def test_remote_wrappers(): + from cugraph_service_client.client import CugraphServiceClient as Client + # TODO: Check with rick on how to test it # Can only be tested after the packages land c = Client() From 55ff270f92f582e2d1bfdc03d0ca23f3710d826a Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Sun, 6 Nov 2022 16:35:00 -0800 Subject: [PATCH 35/41] Update python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py Co-authored-by: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> --- .../cugraph/gnn/dgl_extensions/cugraph_service_store.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index b24c44cb037..ce3c327ec42 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -45,10 +45,9 @@ def __init__(self, graph, graph_client, device_id=None, backend_lib="torch"): self.device_id = device_id add_data_module = "cugraph.gnn.dgl_extensions.service_extensions.add_data" - _ = self.client.load_extensions(add_data_module) + self.client.load_extensions(add_data_module) sampling_module = "cugraph.gnn.dgl_extensions.service_extensions.sampling" - _ = self.client.load_extensions(sampling_module) - del _ + self.client.load_extensions(sampling_module) else: raise ValueError("graph must be a RemoteGraph") From 56a43507747fe2af22b80ebc1b82bb28651c721a Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Sun, 6 Nov 2022 17:13:44 -0800 Subject: [PATCH 36/41] Address Reviews --- .../gnn/dgl_extensions/cugraph_service_store.py | 10 +++++----- .../cugraph/gnn/dgl_extensions/cugraph_store.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index b24c44cb037..881ba03b888 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -32,7 +32,8 @@ class 
CuGraphRemoteStore(BaseCuGraphStore): """ def __init__(self, graph, graph_client, device_id=None, backend_lib="torch"): - + # not using isinstance to check type to prevent + # on adding dependency of Remote graphs to cugraph if type(graph).__name__ in ["RemotePropertyGraph", "RemoteMGPropertyGraph"]: if device_id is not None: import numba.cuda as cuda @@ -45,10 +46,9 @@ def __init__(self, graph, graph_client, device_id=None, backend_lib="torch"): self.device_id = device_id add_data_module = "cugraph.gnn.dgl_extensions.service_extensions.add_data" - _ = self.client.load_extensions(add_data_module) + self.client.load_extensions(add_data_module) sampling_module = "cugraph.gnn.dgl_extensions.service_extensions.sampling" - _ = self.client.load_extensions(sampling_module) - del _ + self.client.load_extensions(sampling_module) else: raise ValueError("graph must be a RemoteGraph") @@ -299,7 +299,7 @@ def get_edge_storage(self, key, etype=None, indices_offset=0): etype = etypes[0] if key not in self.edata_feat_col_d: raise ValueError( - f"key {key} not found in CuGraphStore" " edge features", + f"key {key} not found in CuGraphStore edge features", f" {list(self.edata_feat_col_d.keys())}", ) columns = self.edata_feat_col_d[key] diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index 9804d41b17e..c639b5cbd82 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -291,7 +291,7 @@ def get_edge_storage(self, key, etype=None, indices_offset=0): etype = etypes[0] if key not in self.edata_feat_col_d: raise ValueError( - f"key {key} not found in CuGraphStore" " edge features", + f"key {key} not found in CuGraphStore edge features", f" {list(self.edata_feat_col_d.keys())}", ) columns = self.edata_feat_col_d[key] From d24ab529b910687b586d358f62da3fe0b3b95783 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Sun, 6 Nov 2022 19:22:28 -0800 Subject: [PATCH 37/41] Addresed using ntype and etype --- .../dgl_extensions/cugraph_service_store.py | 2 ++ .../gnn/dgl_extensions/cugraph_store.py | 4 ++- .../gnn/dgl_extensions/feature_storage.py | 18 +++++++++--- ...e.py => test_dgl_extension_graph_store.py} | 28 +++++++++++++++++-- 4 files changed, 44 insertions(+), 8 deletions(-) rename python/cugraph/cugraph/tests/{test_graph_store.py => test_dgl_extension_graph_store.py} (96%) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 881ba03b888..54d041d7875 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -283,6 +283,7 @@ def get_node_storage(self, key, ntype=None, indices_offset=0): storage_type="node", indices_offset=indices_offset, backend_lib=self.backend_lib, + types_to_fetch=[ntype], ) def get_edge_storage(self, key, etype=None, indices_offset=0): @@ -310,6 +311,7 @@ def get_edge_storage(self, key, etype=None, indices_offset=0): storage_type="edge", backend_lib=self.backend_lib, indices_offset=indices_offset, + types_to_fetch=[etype], ) ###################################### diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index c639b5cbd82..ab713e0d4fe 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ 
b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -275,6 +275,7 @@ def get_node_storage(self, key, ntype=None, indices_offset=0): storage_type="node", indices_offset=indices_offset, backend_lib=self.backend_lib, + types_to_fetch=[ntype], ) def get_edge_storage(self, key, etype=None, indices_offset=0): @@ -302,6 +303,7 @@ def get_edge_storage(self, key, etype=None, indices_offset=0): storage_type="edge", backend_lib=self.backend_lib, indices_offset=indices_offset, + types_to_fetch=[etype], ) ###################################### @@ -377,7 +379,7 @@ def sample_neighbors( sgs_obj=sgs_obj, sgs_src_range_obj=sgs_src_range_obj, sg_node_dtype=self._sg_node_dtype, - nodes_cap=nodes_cap, + nodes_ar=nodes_cap, replace=replace, fanout=fanout, edge_dir=edge_dir, diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py index d84a29b332d..b1f41dee7e4 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/feature_storage.py @@ -63,7 +63,13 @@ class CuFeatureStorage: """ def __init__( - self, pg, columns, storage_type, backend_lib="torch", indices_offset=0 + self, + pg, + columns, + storage_type, + backend_lib="torch", + indices_offset=0, + types_to_fetch=None, ): self.pg = pg self.columns = columns @@ -89,6 +95,7 @@ def __init__( self.from_dlpack = from_dlpack self.indices_offset = indices_offset + self.types_to_fetch = types_to_fetch def fetch(self, indices, device=None, pin_memory=False, **kwargs): """Fetch the features of the given node/edge IDs to the @@ -136,10 +143,13 @@ def fetch(self, indices, device=None, pin_memory=False, **kwargs): indices = indices + self.indices_offset if self.storage_type == "node": - result = self.pg.get_vertex_data(vertex_ids=indices, columns=self.columns) + result = self.pg.get_vertex_data( + vertex_ids=indices, columns=self.columns, types=self.types_to_fetch + ) else: - result = self.pg.get_edge_data(edge_ids=indices, columns=self.columns) - + result = self.pg.get_edge_data( + edge_ids=indices, columns=self.columns, types=self.types_to_fetch + ) if type(result).__name__ == "DataFrame": result = result[self.columns] if hasattr(result, "compute"): diff --git a/python/cugraph/cugraph/tests/test_graph_store.py b/python/cugraph/cugraph/tests/test_dgl_extension_graph_store.py similarity index 96% rename from python/cugraph/cugraph/tests/test_graph_store.py rename to python/cugraph/cugraph/tests/test_dgl_extension_graph_store.py index 1b76744d393..e0d4a388abc 100644 --- a/python/cugraph/cugraph/tests/test_graph_store.py +++ b/python/cugraph/cugraph/tests/test_dgl_extension_graph_store.py @@ -378,8 +378,7 @@ def test_ntypes(dataset1_CuGraphStore): def test_get_node_storage_gs(dataset1_CuGraphStore): fs = dataset1_CuGraphStore.get_node_storage(key="merchant_k", ntype="merchant") - # indices = [11, 4, 21, 316, 11] - indices = [11, 4, 21, 316] + indices = [11, 4, 21, 316, 11] merchant_gs = fs.fetch(indices, device="cuda") merchant_df = create_df_from_dataset( @@ -389,8 +388,31 @@ def test_get_node_storage_gs(dataset1_CuGraphStore): assert cp.allclose(cudf_ar, merchant_gs) +def test_get_node_storage_ntypes(): + node_ser = cudf.Series([1, 2, 3]) + feat_ser = cudf.Series([1.0, 1.0, 1.0]) + df = cudf.DataFrame({"node_ids": node_ser, "feat": feat_ser}) + pg = PropertyGraph() + gs = CuGraphStore(pg, backend_lib="cupy") + gs.add_node_data(df, "node_ids", ntype="nt.a") + + node_ser = cudf.Series([4, 5, 6]) + feat_ser = 
cudf.Series([2.0, 2.0, 2.0]) + df = cudf.DataFrame({"node_ids": node_ser, "feat": feat_ser}) + gs.add_node_data(df, "node_ids", ntype="nt.b") + + # All indices from a single ntype + output_ar = gs.get_node_storage(key="feat", ntype="nt.a").fetch([1, 2, 3]) + cp.testing.assert_array_equal(cp.asarray([1, 1, 1], dtype=cp.float32), output_ar) + + # Indices from other ntype are ignored + output_ar = gs.get_node_storage(key="feat", ntype="nt.b").fetch([1, 2, 5]) + cp.testing.assert_array_equal(cp.asarray([2.0], dtype=cp.float32), output_ar) + + def test_get_edge_storage_gs(dataset1_CuGraphStore): - fs = dataset1_CuGraphStore.get_edge_storage("relationships_k", "relationships") + etype = "('user', 'relationship', 'user')" + fs = dataset1_CuGraphStore.get_edge_storage("relationships_k", etype) relationship_t = fs.fetch([6, 7, 8], device="cuda") relationships_df = create_df_from_dataset( From d7f4ace59282e5715a7b0545142b11353624e142 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Mon, 7 Nov 2022 09:57:00 -0800 Subject: [PATCH 38/41] fix typing issue --- .../cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 54d041d7875..f364212470d 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -540,7 +540,7 @@ def __clear_cached_properties(self): del self.extracted_reverse_subgraphs_per_type -def create_dlpack_results_from_arrays(sampled_result_arrays, etypes: list[str]): +def create_dlpack_results_from_arrays(sampled_result_arrays, etypes): # TODO: Extend to pytorch/numpy/etc if len(etypes) <= 1: s, d, e_id = sampled_result_arrays From 00795b4574aa5fd504f82ab4f13cf6b8b8830fc0 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Tue, 8 Nov 2022 08:56:49 -0800 Subject: [PATCH 39/41] Update python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py Co-authored-by: Alex Barghi <105237337+alexbarghi-nv@users.noreply.github.com> --- .../cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py index aa93509d903..1d202f42787 100644 --- a/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py +++ b/python/cugraph/cugraph/tests/test_dgl_extension_remote_wrappers.py @@ -164,7 +164,7 @@ def assert_valid_gs(gs): np.testing.assert_equal(src_vals, expected_vals) -@pytest.mark.skip(reason="Enable when cughraph-service lands in the CI") +@pytest.mark.skip(reason="Enable when cugraph-service lands in the CI") def test_remote_wrappers(): from cugraph_service_client.client import CugraphServiceClient as Client From fdd39d38f098e956861fa18503ce3ea2cb59fa60 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Tue, 8 Nov 2022 09:11:40 -0800 Subject: [PATCH 40/41] remove relative imports --- .../dgl_extensions/cugraph_service_store.py | 2 +- .../gnn/dgl_extensions/cugraph_store.py | 22 ++++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index f364212470d..02430b9da04 100644 --- 
a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -12,7 +12,7 @@ # limitations under the License. from collections import defaultdict -from .base_cugraph_store import BaseCuGraphStore +from cugraph.gnn.dgl_extensions.base_cugraph_store import BaseCuGraphStore from functools import cached_property from cugraph.gnn.dgl_extensions.utils.feature_map import _update_feature_map from cugraph.gnn.dgl_extensions.feature_storage import CuFeatureStorage diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py index ab713e0d4fe..2144332a55d 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_store.py @@ -13,16 +13,22 @@ from collections import defaultdict -from .base_cugraph_store import BaseCuGraphStore +from cugraph.gnn.dgl_extensions.base_cugraph_store import BaseCuGraphStore from functools import cached_property -from .utils.find_edges import find_edges -from .utils.node_subgraph import node_subgraph -from .utils.feature_map import _update_feature_map -from .utils.add_data import add_edge_data_from_parquet, add_node_data_from_parquet -from .utils.sampling import sample_pg, get_subgraph_and_src_range_from_pg -from .utils.sampling import get_underlying_dtype_from_sg -from .feature_storage import CuFeatureStorage +from cugraph.gnn.dgl_extensions.utils.find_edges import find_edges +from cugraph.gnn.dgl_extensions.utils.node_subgraph import node_subgraph +from cugraph.gnn.dgl_extensions.utils.feature_map import _update_feature_map +from cugraph.gnn.dgl_extensions.utils.add_data import ( + add_edge_data_from_parquet, + add_node_data_from_parquet, +) +from cugraph.gnn.dgl_extensions.utils.sampling import ( + sample_pg, + get_subgraph_and_src_range_from_pg, +) +from cugraph.gnn.dgl_extensions.utils.sampling import get_underlying_dtype_from_sg +from cugraph.gnn.dgl_extensions.feature_storage import CuFeatureStorage class CuGraphStore(BaseCuGraphStore): From bf49417bc19cc65c6cf1e70fc41ae62e66727ca6 Mon Sep 17 00:00:00 2001 From: Vibhu Jawa Date: Tue, 8 Nov 2022 09:15:44 -0800 Subject: [PATCH 41/41] link cugraph/issues/2863 --- .../cugraph/gnn/dgl_extensions/cugraph_service_store.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py index 02430b9da04..f0d060ff853 100644 --- a/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py +++ b/python/cugraph/cugraph/gnn/dgl_extensions/cugraph_service_store.py @@ -379,9 +379,10 @@ def sample_neighbors( # of the seed dtype is not same as the node dtype self.set_sg_node_dtype(first_sg) - # TODO: Cant send dlpack or cupy arrays or numpys arrays + # Cant send dlpack or cupy arrays or numpys arrays # through extensions - # Ask Rick + # See issue: https://github.com/rapidsai/cugraph/issues/2863 + if isinstance(nodes_cap, dict): nodes_ar = { k: cp.from_dlpack(v).get().tolist() for k, v in nodes_cap.items()