diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index 05c44976da7..de941bb940b 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import cudf import cugraph @@ -59,12 +58,12 @@ class EXPERIMENTAL__PropertyGraph: algorithm results with corresponding properties. """ # column name constants used in internal DataFrames - __vertex_col_name = "_VERTEX_" - __src_col_name = "_SRC_" - __dst_col_name = "_DST_" - __type_col_name = "_TYPE_" - __edge_id_col_name = "_EDGE_ID_" - __vertex_id_col_name = "_VERTEX_ID_" + vertex_col_name = "_VERTEX_" + src_col_name = "_SRC_" + dst_col_name = "_DST_" + type_col_name = "_TYPE_" + edge_id_col_name = "_EDGE_ID_" + vertex_id_col_name = "_VERTEX_ID_" def __init__(self): # The dataframe containing the properties for each vertex. @@ -133,39 +132,24 @@ def __init__(self): # incrementing this counter. self.__last_edge_id = None - # PropertyGraph read-only attributes - def vertices_ids(self): - # Create a Series of the appropriate type (cudf.Series, pandas.Series, - # etc.) based on the type currently in use, then use it to gather all - # unique vertices. - vpd = self.__vertex_prop_dataframe - epd = self.__edge_prop_dataframe - if (vpd is None) and (epd is None): - return None - - # Assume __series_type is set if this point reached! - verts = self.__series_type(name=self.__vertex_col_name) - - if vpd is not None: - verts = verts.append(vpd[self.__vertex_col_name]) - if epd is not None: - # pandas.Series.unique() can return an ndarray, which cannot be - # appended to a Series. Always construct an appropriate series_type - # from the unique values prior to appending. 
- verts = verts.append( - self.__series_type(epd[self.__src_col_name].unique())) - verts = verts.append( - self.__series_type(epd[self.__dst_col_name].unique())) - verts = verts.unique() - return verts + # Cached property values + self.__num_vertices = None + # PropertyGraph read-only attributes @property def num_vertices(self): - verts = self.vertices_ids() - if verts is None: - return 0 - else: - return len(verts) + if self.__num_vertices is not None: + return self.__num_vertices + + self.__num_vertices = 0 + vert_sers = self.__get_all_vertices_series() + if vert_sers: + if self.__series_type is cudf.Series: + self.__num_vertices = cudf.concat(vert_sers).nunique() + else: + self.__num_vertices = pd.concat(vert_sers).nunique() + + return self.__num_vertices @property def num_edges(self): @@ -174,12 +158,19 @@ def num_edges(self): else: return 0 + @property + def edges(self): + if self.__edge_prop_dataframe is not None: + return self.__edge_prop_dataframe[[self.src_col_name, + self.dst_col_name]] + return None + @property def vertex_property_names(self): if self.__vertex_prop_dataframe is not None: props = list(self.__vertex_prop_dataframe.columns) - props.remove(self.__vertex_col_name) - props.remove(self.__type_col_name) # should "type" be removed? + props.remove(self.vertex_col_name) + props.remove(self.type_col_name) # should "type" be removed? return props return [] @@ -187,10 +178,10 @@ def vertex_property_names(self): def edge_property_names(self): if self.__edge_prop_dataframe is not None: props = list(self.__edge_prop_dataframe.columns) - props.remove(self.__src_col_name) - props.remove(self.__dst_col_name) - props.remove(self.__edge_id_col_name) - props.remove(self.__type_col_name) # should "type" be removed? + props.remove(self.src_col_name) + props.remove(self.dst_col_name) + props.remove(self.edge_id_col_name) + props.remove(self.type_col_name) # should "type" be removed? 
return props return [] @@ -203,6 +194,25 @@ def _vertex_prop_dataframe(self): def _edge_prop_dataframe(self): return self.__edge_prop_dataframe + def get_vertices(self, selection=None): + """ + Return a Series containing the unique vertex IDs contained in both + the vertex and edge property data. + """ + vert_sers = self.__get_all_vertices_series() + if vert_sers: + if self.__series_type is cudf.Series: + return self.__series_type(cudf.concat(vert_sers).unique()) + else: + return self.__series_type(pd.concat(vert_sers).unique()) + return self.__series_type() + + def vertices_ids(self): + """ + Alias for get_vertices() + """ + return self.get_vertices() + def add_vertex_data(self, dataframe, vertex_id_column, @@ -243,7 +253,7 @@ def add_vertex_data(self, if vertex_id_column not in dataframe.columns: raise ValueError(f"{vertex_id_column} is not a column in " f"dataframe: {dataframe.columns}") - if type(type_name) is not str: + if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") if property_columns: @@ -267,9 +277,13 @@ def add_vertex_data(self, "the PropertyGraph was already initialized " f"using type {self.__dataframe_type}") + # Clear the cached value for num_vertices since more could be added in + # this method. + self.__num_vertices = None + # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. 
- default_vertex_columns = [self.__vertex_col_name, self.__type_col_name] + default_vertex_columns = [self.vertex_col_name, self.type_col_name] if self.__vertex_prop_dataframe is None: self.__vertex_prop_dataframe = \ self.__dataframe_type(columns=default_vertex_columns) @@ -279,7 +293,7 @@ def add_vertex_data(self, # https://github.com/rapidsai/cudf/issues/9981) self.__update_dataframe_dtypes( self.__vertex_prop_dataframe, - {self.__vertex_col_name: dataframe[vertex_id_column].dtype}) + {self.vertex_col_name: dataframe[vertex_id_column].dtype}) # Ensure that both the predetermined vertex ID column name and vertex # type column name are present for proper merging. @@ -288,9 +302,9 @@ def add_vertex_data(self, # columns. The copied DataFrame is then merged (another copy) and then # deleted when out-of-scope. tmp_df = dataframe.copy(deep=True) - tmp_df[self.__vertex_col_name] = tmp_df[vertex_id_column] + tmp_df[self.vertex_col_name] = tmp_df[vertex_id_column] # FIXME: handle case of a type_name column already being in tmp_df - tmp_df[self.__type_col_name] = type_name + tmp_df[self.type_col_name] = type_name if property_columns: # all columns @@ -360,7 +374,7 @@ def add_edge_data(self, if invalid_columns: raise ValueError("vertex_id_columns contains column(s) not found " f"in dataframe: {list(invalid_columns)}") - if type(type_name) is not str: + if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") if property_columns: @@ -384,10 +398,14 @@ def add_edge_data(self, "the PropertyGraph was already initialized " f"using type {self.__dataframe_type}") - default_edge_columns = [self.__src_col_name, - self.__dst_col_name, - self.__edge_id_col_name, - self.__type_col_name] + # Clear the cached value for num_vertices since more could be added in + # this method. 
+ self.__num_vertices = None + + default_edge_columns = [self.src_col_name, + self.dst_col_name, + self.edge_id_col_name, + self.type_col_name] if self.__edge_prop_dataframe is None: self.__edge_prop_dataframe = \ self.__dataframe_type(columns=default_edge_columns) @@ -397,18 +415,18 @@ def add_edge_data(self, # https://github.com/rapidsai/cudf/issues/9981) self.__update_dataframe_dtypes( self.__edge_prop_dataframe, - {self.__src_col_name: dataframe[vertex_id_columns[0]].dtype, - self.__dst_col_name: dataframe[vertex_id_columns[1]].dtype, - self.__edge_id_col_name: "Int64"}) + {self.src_col_name: dataframe[vertex_id_columns[0]].dtype, + self.dst_col_name: dataframe[vertex_id_columns[1]].dtype, + self.edge_id_col_name: "Int64"}) # NOTE: This copies the incoming DataFrame in order to add the new # columns. The copied DataFrame is then merged (another copy) and then # deleted when out-of-scope. tmp_df = dataframe.copy(deep=True) - tmp_df[self.__src_col_name] = tmp_df[vertex_id_columns[0]] - tmp_df[self.__dst_col_name] = tmp_df[vertex_id_columns[1]] + tmp_df[self.src_col_name] = tmp_df[vertex_id_columns[0]] + tmp_df[self.dst_col_name] = tmp_df[vertex_id_columns[1]] # FIXME: handle case of a type_name column already being in tmp_df - tmp_df[self.__type_col_name] = type_name + tmp_df[self.type_col_name] = type_name if property_columns: # all columns @@ -471,11 +489,11 @@ def select_vertices(self, expr, from_previous_selection=None): previously_selected_rows = self.__vertex_prop_dataframe[ from_previous_selection.vertex_selections] verts_from_previously_selected_rows = \ - previously_selected_rows[self.__vertex_col_name] + previously_selected_rows[self.vertex_col_name] # get all the rows from the entire __vertex_prop_dataframe that # contain those verts rows_with_verts = \ - self.__vertex_prop_dataframe[self.__vertex_col_name]\ + self.__vertex_prop_dataframe[self.vertex_col_name]\ .isin(verts_from_previously_selected_rows) rows_to_eval = 
self.__vertex_prop_dataframe[rows_with_verts] locals = dict([(n, rows_to_eval[n]) @@ -528,7 +546,7 @@ def select_edges(self, expr): edge_selection_series=selected_col) def extract_subgraph(self, - create_using=None, + create_using=cugraph.Graph, selection=None, edge_weight_property=None, default_edge_weight=None, @@ -563,7 +581,9 @@ def extract_subgraph(self, Returns ------- - None + A Graph instance of the same type as create_using containing only the + vertices and edges resulting from applying the selection to the set of + vertex and edge property data. Examples -------- @@ -578,8 +598,8 @@ def extract_subgraph(self, # vertices assume the original dtypes in the user input have been # preserved. However, merge operations on the DataFrames can change # dtypes (eg. int64 to float64 in order to add NaN entries). This - # should not be a problem since this the conversions do not change - # the values. + # should not be a problem since the conversions do not change the + # values. if (selection is not None) and \ (selection.vertex_selections is not None): selected_vertex_dataframe = \ @@ -600,10 +620,10 @@ def extract_subgraph(self, # selected verts in both src and dst if (selected_vertex_dataframe is not None) and \ not(selected_vertex_dataframe.empty): - selected_verts = selected_vertex_dataframe[self.__vertex_col_name] - has_srcs = selected_edge_dataframe[self.__src_col_name]\ + selected_verts = selected_vertex_dataframe[self.vertex_col_name] + has_srcs = selected_edge_dataframe[self.src_col_name]\ .isin(selected_verts) - has_dsts = selected_edge_dataframe[self.__dst_col_name]\ + has_dsts = selected_edge_dataframe[self.dst_col_name]\ .isin(selected_verts) edges = selected_edge_dataframe[has_srcs & has_dsts] else: @@ -668,7 +688,7 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): -------- >>> """ - # FIXME: all check args + # FIXME: check all args (src_col_name, dst_col_name) = edge_vertex_id_columns df_type = type(df) @@ -676,13 +696,10 @@ def 
annotate_dataframe(self, df, G, edge_vertex_id_columns): raise TypeError(f"df type {df_type} does not match DataFrame type " f"{self.__dataframe_type} used in PropertyGraph") - # FIXME: check that G has edge_data attr - - # Add the src, dst, edge_id info from the Graph to a DataFrame - edge_info_df = self.__dataframe_type(columns=[self.__src_col_name, - self.__dst_col_name, - self.__edge_id_col_name], - data=G.edge_data) + if hasattr(G, "edge_data"): + edge_info_df = G.edge_data + else: + raise AttributeError("Graph G does not have attribute 'edge_data'") # New result includes only properties from the src/dst edges identified # by edge IDs. All other data in df is merged based on src/dst values. @@ -691,12 +708,12 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): how="inner") # FIXME: also allow edge ID col to be passed in and renamed. - new_df = df.rename(columns={src_col_name: self.__src_col_name, - dst_col_name: self.__dst_col_name}) + new_df = df.rename(columns={src_col_name: self.src_col_name, + dst_col_name: self.dst_col_name}) new_df = new_df.merge(edge_props_df) # restore the original src/dst column names - new_df.rename(columns={self.__src_col_name: src_col_name, - self.__dst_col_name: dst_col_name}, + new_df.rename(columns={self.src_col_name: src_col_name, + self.dst_col_name: dst_col_name}, inplace=True) # restore the original dtypes @@ -708,31 +725,9 @@ def annotate_dataframe(self, df, G, edge_vertex_id_columns): # columns from edge types not included in the edges in df. return new_df - @classmethod - def get_edge_tuples(cls, edge_prop_df): - """ - Returns a list of (src vertex, dst vertex, edge_id) tuples present in - edge_prop_df. 
- """ - if cls.__src_col_name not in edge_prop_df.columns: - raise ValueError(f"column {cls.__src_col_name} missing from " - "edge_prop_df") - if cls.__dst_col_name not in edge_prop_df.columns: - raise ValueError(f"column {cls.__dst_col_name} missing from " - "edge_prop_df") - if cls.__edge_id_col_name not in edge_prop_df.columns: - raise ValueError(f"column {cls.__edge_id_col_name} missing " - "from edge_prop_df") - src = edge_prop_df[cls.__src_col_name] - dst = edge_prop_df[cls.__dst_col_name] - edge_id = edge_prop_df[cls.__edge_id_col_name] - retlist = [(src.iloc[i], dst.iloc[i], edge_id.iloc[i]) - for i in range(len(src))] - return retlist - - @classmethod - def edge_props_to_graph(cls, edge_prop_df, - create_using=None, + def edge_props_to_graph(self, + edge_prop_df, + create_using, edge_weight_property=None, allow_multi_edges=False): """ @@ -745,12 +740,12 @@ def edge_props_to_graph(cls, edge_prop_df, "edge_prop_df") # Set up the new Graph to return - if create_using is None: - G = cugraph.Graph() - elif isinstance(create_using, cugraph.Graph): + if isinstance(create_using, cugraph.Graph): # FIXME: extract more attrs from the create_using instance attrs = {"directed": create_using.is_directed()} G = type(create_using)(**attrs) + # FIXME: this allows anything to be instantiated and does not check + # that the type is a valid Graph type. elif type(create_using) is type(type): G = create_using() else: @@ -762,20 +757,20 @@ def edge_props_to_graph(cls, edge_prop_df, # non-MultiGraphs would result in ambiguous edge properties. 
# FIXME: make allow_multi_edges accept "auto" for use with MultiGraph if (allow_multi_edges is False) and \ - cls.has_duplicate_edges(edge_prop_df): + self.has_duplicate_edges(edge_prop_df): if create_using: if type(create_using) is type: t = create_using.__name__ else: t = type(create_using).__name__ - msg = f"{t} graph type specified by create_using" + msg = f"'{t}' graph type specified by create_using" else: msg = "default Graph graph type" raise RuntimeError("query resulted in duplicate edges which " - f"cannot be represented with a {msg}") + f"cannot be represented with the {msg}") - create_args = {"source": cls.__src_col_name, - "destination": cls.__dst_col_name, + create_args = {"source": self.src_col_name, + "destination": self.dst_col_name, "edge_attr": edge_weight_property, "renumber": True, } @@ -784,15 +779,12 @@ def edge_props_to_graph(cls, edge_prop_df, else: G.from_pandas_edgelist(edge_prop_df, **create_args) - # Set the edge_data on the resulting Graph to the list of edge tuples, - # which includes the unique edge IDs. Edge IDs are needed for future - # calls to annotate_dataframe() in order to apply properties from the - # correct edges. - # FIXME: this could be a very large list of tuples if the number of - # edges in G is large (eg. a large MNMG graph that cannot fit in host - # memory). Consider adding the edge IDs to the edgelist DataFrame in G - # instead. - G.edge_data = cls.get_edge_tuples(edge_prop_df) + # Set the edge_data on the resulting Graph to a DataFrame containing + # the edges and the edge ID for each. Edge IDs are needed for future + # calls to annotate_dataframe() in order to associate edges with their + # properties, since the PG can contain multiple edges between vertices + # with different properties. 
+ G.edge_data = self.__create_property_lookup_table(edge_prop_df) # FIXME: also add vertex_data return G @@ -802,14 +794,27 @@ def has_duplicate_edges(cls, df): """ Return True if df has >1 of the same src, dst pair """ + # FIXME: this can be very expensive for large DataFrames if df.empty: return False def has_duplicate_dst(df): - return df[cls.__dst_col_name].nunique() != \ - df[cls.__dst_col_name].size + return df[cls.dst_col_name].nunique() != \ + df[cls.dst_col_name].size + + return df.groupby(cls.src_col_name).apply(has_duplicate_dst).any() - return df.groupby(cls.__src_col_name).apply(has_duplicate_dst).any() + def __create_property_lookup_table(self, edge_prop_df): + """ + Returns a DataFrame containing the src vertex, dst vertex, and edge_id + values from edge_prop_df. + """ + src = edge_prop_df[self.src_col_name] + dst = edge_prop_df[self.dst_col_name] + edge_id = edge_prop_df[self.edge_id_col_name] + return self.__dataframe_type({self.src_col_name: src, + self.dst_col_name: dst, + self.edge_id_col_name: edge_id}) def __add_edge_ids(self): """ @@ -817,7 +822,7 @@ def __add_edge_ids(self): incremented by 1 for each edge. """ prev_eid = -1 if self.__last_edge_id is None else self.__last_edge_id - nans = self.__edge_prop_dataframe[self.__edge_id_col_name].isna() + nans = self.__edge_prop_dataframe[self.edge_id_col_name].isna() if nans.any(): indices = nans.index[nans] @@ -826,11 +831,26 @@ def __add_edge_ids(self): new_eids = self.__series_type( range(starting_eid, starting_eid + num_indices)) - self.__edge_prop_dataframe[self.__edge_id_col_name]\ + self.__edge_prop_dataframe[self.edge_id_col_name]\ .iloc[indices] = new_eids self.__last_edge_id = starting_eid + num_indices - 1 + def __get_all_vertices_series(self): + """ + Return a list of all Series objects that contain vertices from all + tables. 
+ """ + vpd = self.__vertex_prop_dataframe + epd = self.__edge_prop_dataframe + vert_sers = [] + if vpd is not None: + vert_sers.append(vpd[self.vertex_col_name]) + if epd is not None: + vert_sers.append(epd[self.src_col_name]) + vert_sers.append(epd[self.dst_col_name]) + return vert_sers + @staticmethod def __get_new_column_dtypes(from_df, to_df): """ diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index 0578a7780de..8cced223bf7 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -11,14 +11,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time import gc import pytest import pandas as pd +import numpy as np import cudf from cudf.testing import assert_frame_equal, assert_series_equal +# If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" +# fixture will be available automatically. Check that this fixture is available +# by trying to import rapids_pytest_benchmark, and if that fails, set +# "gpubenchmark" to the standard "benchmark" fixture provided by +# pytest-benchmark. +try: + import rapids_pytest_benchmark # noqa: F401 +except ImportError: + import pytest_benchmark + gpubenchmark = pytest_benchmark.plugin.benchmark + import cugraph +from cugraph.generators import rmat from cugraph.tests import utils # ============================================================================= @@ -107,21 +121,26 @@ def setup_function(): df_types = [cudf.DataFrame, pd.DataFrame] -def df_type_id(dft): +def df_type_id(dataframe_type): + """ + Return a string that describes the dataframe_type, used for test output. + """ s = "df_type=" - if dft == cudf.DataFrame: + if dataframe_type == cudf.DataFrame: return s+"cudf.DataFrame" - if dft == pd.DataFrame: + if dataframe_type == pd.DataFrame: return s+"pandas.DataFrame" return s+"?" 
-@pytest.fixture(scope="module", - params=utils.genFixtureParamsProduct((df_types, df_type_id)) - ) -def property_graph_instance(request): +df_types_fixture_params = utils.genFixtureParamsProduct((df_types, df_type_id)) + + +@pytest.fixture(scope="module", params=df_types_fixture_params) +def dataset1_PropertyGraph(request): """ - FIXME: fill this in + Fixture which returns an instance of a PropertyGraph with vertex and edge + data added from dataset1, parameterized for different DataFrame types. """ dataframe_type = request.param[0] from cugraph.experimental import PropertyGraph @@ -177,8 +196,74 @@ def property_graph_instance(request): return pG -############################################################################### +@pytest.fixture(scope="module", params=df_types_fixture_params) +def cyber_PropertyGraph(request): + """ + Fixture which returns an instance of a PropertyGraph with vertex and edge + data added from the cyber.csv dataset, parameterized for different + DataFrame types. + """ + from cugraph.experimental import PropertyGraph + + dataframe_type = request.param[0] + cyber_csv = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"cyber.csv" + source_col_name = "srcip" + dest_col_name = "dstip" + + if dataframe_type is pd.DataFrame: + read_csv = pd.read_csv + else: + read_csv = cudf.read_csv + df = read_csv(cyber_csv, delimiter=",", + dtype={"idx": "int32", + source_col_name: "str", + dest_col_name: "str"}, + header=0) + + pG = PropertyGraph() + pG.add_edge_data(df, (source_col_name, dest_col_name)) + + return pG + + +@pytest.fixture(scope="module", params=df_types_fixture_params) +def rmat_PropertyGraph(): + """ + Fixture which uses the RMAT generator to generate a cuDF DataFrame + edgelist, then uses it to add vertex and edge data to a PropertyGraph + instance, then returns the (PropertyGraph, DataFrame) instances in a tuple. 
+ """ + from cugraph.experimental import PropertyGraph + + source_col_name = "src" + dest_col_name = "dst" + weight_col_name = "weight" + scale = 20 + edgefactor = 16 + seed = 42 + df = rmat(scale, + (2**scale)*edgefactor, + 0.57, # from Graph500 + 0.19, # from Graph500 + 0.19, # from Graph500 + seed, + clip_and_flip=False, + scramble_vertex_ids=True, + create_using=None, # None == return edgelist + mg=False + ) + rng = np.random.default_rng(seed) + df[weight_col_name] = rng.random(size=len(df)) + + pG = PropertyGraph() + pG.add_edge_data(df, (source_col_name, dest_col_name)) + + return (pG, df) + + +# ============================================================================= # Tests +# ============================================================================= @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_add_vertex_data(df_type): """ @@ -202,6 +287,60 @@ def test_add_vertex_data(df_type): assert sorted(pG.vertex_property_names) == sorted(expected_props) +@pytest.mark.parametrize("df_type", df_types, ids=df_type_id) +def test_num_vertices(df_type): + """ + Ensures num_vertices is correct after various additions of specific data. 
+ """ + from cugraph.experimental import PropertyGraph + + merchants = dataset1["merchants"] + merchants_df = df_type(columns=merchants[0], + data=merchants[1]) + + pG = PropertyGraph() + pG.add_vertex_data(merchants_df, + type_name="merchants", + vertex_id_column="merchant_id", + property_columns=None) + + # Test caching - the second retrieval should always be faster + st = time.time() + assert pG.num_vertices == 5 + compute_time = time.time() - st + assert pG.num_edges == 0 + + st = time.time() + assert pG.num_vertices == 5 + cache_retrieval_time = time.time() - st + assert cache_retrieval_time < compute_time + + users = dataset1["users"] + users_df = df_type(columns=users[0], data=users[1]) + + pG.add_vertex_data(users_df, + type_name="users", + vertex_id_column="user_id", + property_columns=None) + + assert pG.num_vertices == 9 + assert pG.num_edges == 0 + + # The taxpayers table does not add new vertices, it only adds properties to + # vertices already present in the merchants and users tables. 
+ taxpayers = dataset1["taxpayers"] + taxpayers_df = df_type(columns=taxpayers[0], + data=taxpayers[1]) + + pG.add_vertex_data(taxpayers_df, + type_name="taxpayers", + vertex_id_column="payer_id", + property_columns=None) + + assert pG.num_vertices == 9 + assert pG.num_edges == 0 + + @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_null_data(df_type): """ @@ -365,9 +504,9 @@ def test_add_edge_data_bad_args(): property_columns="time") -def test_extract_subgraph_vertex_prop_condition_only(property_graph_instance): +def test_extract_subgraph_vertex_prop_condition_only(dataset1_PropertyGraph): - pG = property_graph_instance + pG = dataset1_PropertyGraph selection = pG.select_vertices("(_TYPE_=='taxpayers') & (amount<100)") G = pG.extract_subgraph(selection=selection, @@ -386,12 +525,15 @@ def test_extract_subgraph_vertex_prop_condition_only(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_extract_subgraph_vertex_edge_prop_condition(property_graph_instance): - pG = property_graph_instance +def test_extract_subgraph_vertex_edge_prop_condition(dataset1_PropertyGraph): + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name selection = pG.select_vertices("(user_location==47906) | " "(user_location==78750)") - selection += pG.select_edges("_TYPE_=='referrals'") + selection += pG.select_edges(f"{tcn}=='referrals'") G = pG.extract_subgraph(selection=selection, create_using=DiGraph_inst, edge_weight_property="stars") @@ -407,10 +549,13 @@ def test_extract_subgraph_vertex_edge_prop_condition(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_extract_subgraph_edge_prop_condition_only(property_graph_instance): - pG = property_graph_instance +def test_extract_subgraph_edge_prop_condition_only(dataset1_PropertyGraph): + from cugraph.experimental import PropertyGraph - selection = 
pG.select_edges("_TYPE_=='transactions'") + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + + selection = pG.select_edges(f"{tcn} =='transactions'") G = pG.extract_subgraph(selection=selection, create_using=DiGraph_inst) @@ -431,27 +576,33 @@ def test_extract_subgraph_edge_prop_condition_only(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_extract_subgraph_unweighted(property_graph_instance): +def test_extract_subgraph_unweighted(dataset1_PropertyGraph): """ Ensure a subgraph is unweighted if the edge_weight_property is None. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph - selection = pG.select_edges("_TYPE_=='transactions'") + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + + selection = pG.select_edges(f"{tcn} == 'transactions'") G = pG.extract_subgraph(selection=selection, create_using=DiGraph_inst) assert G.is_weighted() is False -def test_extract_subgraph_specific_query(property_graph_instance): +def test_extract_subgraph_specific_query(dataset1_PropertyGraph): """ Graph of only transactions after time 1639085000 for merchant_id 4 (should be a graph of 2 vertices, 1 edge) """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph - selection = pG.select_edges("(_TYPE_=='transactions') & " + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + + selection = pG.select_edges(f"({tcn}=='transactions') & " "(merchant_id==4) & " "(time>1639085000)") G = pG.extract_subgraph(selection=selection, @@ -469,28 +620,34 @@ def test_extract_subgraph_specific_query(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_edge_props_to_graph(property_graph_instance): +def test_edge_props_to_graph(dataset1_PropertyGraph): """ Access the property DataFrames directly and use them to perform a more complex query, then call edge_props_to_graph() to create the 
corresponding graph. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + vcn = PropertyGraph.vertex_col_name + tcn = PropertyGraph.type_col_name + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name # Select referrals from only taxpayers who are users (should be 1) # Find the list of vertices that are both users and taxpayers def contains_both(df): - return (df["_TYPE_"] == "taxpayers").any() and \ - (df["_TYPE_"] == "users").any() - verts = pG._vertex_prop_dataframe.groupby("_VERTEX_")\ + return (df[tcn] == "taxpayers").any() and \ + (df[tcn] == "users").any() + verts = pG._vertex_prop_dataframe.groupby(vcn)\ .apply(contains_both) verts = verts[verts].keys() # get an array of only verts that have both # Find the "referral" edge_props containing only those verts - referrals = pG._edge_prop_dataframe["_TYPE_"] == "referrals" - srcs = pG._edge_prop_dataframe[referrals]["_SRC_"].isin(verts) - dsts = pG._edge_prop_dataframe[referrals]["_DST_"].isin(verts) + referrals = pG._edge_prop_dataframe[tcn] == "referrals" + srcs = pG._edge_prop_dataframe[referrals][scn].isin(verts) + dsts = pG._edge_prop_dataframe[referrals][dcn].isin(verts) matching_edges = (srcs & dsts) indices = matching_edges.index[matching_edges] edge_props = pG._edge_prop_dataframe.loc[indices] @@ -508,18 +665,21 @@ def contains_both(df): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_select_vertices_from_previous_selection(property_graph_instance): +def test_select_vertices_from_previous_selection(dataset1_PropertyGraph): """ Ensures that the intersection of vertices of multiple types (only vertices that are both type A and type B) can be selected. 
""" - pG = property_graph_instance + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name # Select referrals from only taxpayers who are users (should be 1) - selection = pG.select_vertices("_TYPE_ == 'taxpayers'") - selection = pG.select_vertices("_TYPE_ == 'users'", + selection = pG.select_vertices(f"{tcn} == 'taxpayers'") + selection = pG.select_vertices(f"{tcn} == 'users'", from_previous_selection=selection) - selection += pG.select_edges("_TYPE_ == 'referrals'") + selection += pG.select_edges(f"{tcn} == 'referrals'") G = pG.extract_subgraph(create_using=DiGraph_inst, selection=selection) expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634]}) @@ -555,7 +715,8 @@ def test_extract_subgraph_graph_without_vert_props(): vertex_id_columns=("user_id_1", "user_id_2"), property_columns=None) - G = pG.extract_subgraph(selection=pG.select_edges("_SRC_ == 89216"), + scn = PropertyGraph.src_col_name + G = pG.extract_subgraph(selection=pG.select_edges(f"{scn} == 89216"), create_using=DiGraph_inst, edge_weight_property="relationship_type", default_edge_weight=0) @@ -572,11 +733,11 @@ def test_extract_subgraph_graph_without_vert_props(): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_extract_subgraph_no_edges(property_graph_instance): +def test_extract_subgraph_no_edges(dataset1_PropertyGraph): """ Valid query that only matches a single vertex. """ - pG = property_graph_instance + pG = dataset1_PropertyGraph selection = pG.select_vertices("(_TYPE_=='merchants') & (merchant_id==86)") G = pG.extract_subgraph(selection=selection) @@ -584,11 +745,11 @@ def test_extract_subgraph_no_edges(property_graph_instance): assert len(G.edgelist.edgelist_df) == 0 -def test_extract_subgraph_no_query(property_graph_instance): +def test_extract_subgraph_no_query(dataset1_PropertyGraph): """ Call extract with no args, should result in the entire property graph. 
""" - pG = property_graph_instance + pG = dataset1_PropertyGraph G = pG.extract_subgraph(create_using=DiGraph_inst, allow_multi_edges=True) @@ -602,16 +763,20 @@ def test_extract_subgraph_no_query(property_graph_instance): assert len(G.edgelist.edgelist_df) == num_edges -def test_extract_subgraph_multi_edges(property_graph_instance): +def test_extract_subgraph_multi_edges(dataset1_PropertyGraph): """ Ensure an exception is thrown if a graph is attempted to be extracted with multi edges. NOTE: an option to allow multi edges when create_using is MultiGraph will be provided in the future. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + # referrals has multiple edges - selection = pG.select_edges("_TYPE_ == 'referrals'") + selection = pG.select_edges(f"{tcn} == 'referrals'") # FIXME: use a better exception with pytest.raises(RuntimeError): @@ -619,8 +784,11 @@ def test_extract_subgraph_multi_edges(property_graph_instance): create_using=DiGraph_inst) -def test_extract_subgraph_bad_args(property_graph_instance): - pG = property_graph_instance +def test_extract_subgraph_bad_args(dataset1_PropertyGraph): + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name # non-PropertySelection selection with pytest.raises(TypeError): @@ -629,7 +797,7 @@ def test_extract_subgraph_bad_args(property_graph_instance): edge_weight_property="stars", default_edge_weight=1.0) - selection = pG.select_edges("_TYPE_=='referrals'") + selection = pG.select_edges(f"{tcn}=='referrals'") # bad create_using type with pytest.raises(TypeError): pG.extract_subgraph(selection=selection, @@ -648,14 +816,17 @@ def test_extract_subgraph_bad_args(property_graph_instance): edge_weight_property="card_type") -def test_extract_subgraph_default_edge_weight(property_graph_instance): +def 
test_extract_subgraph_default_edge_weight(dataset1_PropertyGraph): """ Ensure the default_edge_weight value is added to edges with missing properties used for weights. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph - selection = pG.select_edges("_TYPE_=='transactions'") + pG = dataset1_PropertyGraph + tcn = PropertyGraph.type_col_name + + selection = pG.select_edges(f"{tcn}=='transactions'") G = pG.extract_subgraph(create_using=DiGraph_inst, selection=selection, edge_weight_property="volume", @@ -685,12 +856,15 @@ def test_extract_subgraph_default_edge_weight(property_graph_instance): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_graph_edge_data_added(property_graph_instance): +def test_graph_edge_data_added(dataset1_PropertyGraph): """ Ensures the subgraph returned from extract_subgraph() has the edge_data attribute added which contains the proper edge IDs. """ - pG = property_graph_instance + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + eicn = PropertyGraph.edge_id_col_name expected_num_edges = \ len(dataset1["transactions"][-1]) + \ @@ -703,16 +877,15 @@ def test_graph_edge_data_added(property_graph_instance): # meta-data, which includes edge IDs. G = pG.extract_subgraph(create_using=DiGraph_inst, allow_multi_edges=True) - # G.edge_data should be set to a list of tuples of (src, dst, edge_id) for - # each edge in the graph. + # G.edge_data should be set to a DataFrame with rows for each graph edge. 
assert len(G.edge_data) == expected_num_edges - edge_ids = sorted([d[-1] for d in G.edge_data]) + edge_ids = sorted(G.edge_data[eicn].values) assert edge_ids[0] == 0 assert edge_ids[-1] == (expected_num_edges - 1) -def test_annotate_dataframe(property_graph_instance): +def test_annotate_dataframe(dataset1_PropertyGraph): """ FIXME: Add tests for: properties list @@ -720,7 +893,7 @@ def test_annotate_dataframe(property_graph_instance): copy=False invalid args raise correct exceptions """ - pG = property_graph_instance + pG = dataset1_PropertyGraph selection = pG.select_edges("(_TYPE_ == 'referrals') & (stars > 3)") G = pG.extract_subgraph(selection=selection, @@ -800,3 +973,173 @@ def test_different_vertex_edge_input_dataframe_types(): pG.add_edge_data(df, type_name="bar", vertex_id_columns=("a", "b")) with pytest.raises(TypeError): pG.add_edge_data(pdf, type_name="bar", vertex_id_columns=("a", "b")) + + +def test_get_vertices(dataset1_PropertyGraph): + """ + Test that get_vertices() returns the correct set of vertices without + duplicates. + """ + pG = dataset1_PropertyGraph + + (merchants, users, taxpayers, + transactions, relationships, referrals) = dataset1.values() + + expected_vertices = set([t[0] for t in merchants[1]] + + [t[0] for t in users[1]] + + [t[0] for t in taxpayers[1]]) + + assert sorted(pG.get_vertices().values) == sorted(expected_vertices) + + +def test_get_edges(dataset1_PropertyGraph): + """ + Test that the edges property returns the correct set of edges (as src/dst + columns).
+ """ + from cugraph.experimental import PropertyGraph + + pG = dataset1_PropertyGraph + + (merchants, users, taxpayers, + transactions, relationships, referrals) = dataset1.values() + + expected_edges = \ + [(src, dst) for (src, dst, _, _, _, _) in transactions[1]] + \ + [(src, dst) for (src, dst, _) in relationships[1]] + \ + [(src, dst) for (src, dst, _, _) in referrals[1]] + + actual_edges = pG.edges + + assert len(expected_edges) == len(actual_edges) + for i in range(len(expected_edges)): + src = actual_edges[PropertyGraph.src_col_name].iloc[i] + dst = actual_edges[PropertyGraph.dst_col_name].iloc[i] + assert (src, dst) in expected_edges + + +@pytest.mark.skip(reason="unfinished") +def test_extract_subgraph_with_vertex_ids(): + """ + FIXME: add a PropertyGraph API that makes it easy to support the common use + case of extracting a subgraph containing only specific vertex IDs. This is + currently done in the bench_extract_subgraph_for_* tests below, but could + be made easier for users to do. 
+ """ + raise NotImplementedError + + +@pytest.mark.skip(reason="unfinished") +def test_dgl_use_case(): + """ + FIXME: add a test demonstrating typical DGL use cases + """ + raise NotImplementedError + + +# ============================================================================= +# Benchmarks +# ============================================================================= +def bench_num_vertices(gpubenchmark, dataset1_PropertyGraph): + pG = dataset1_PropertyGraph + + def get_num_vertices(): + return pG.num_vertices + + assert gpubenchmark(get_num_vertices) == 9 + + +def bench_get_vertices(gpubenchmark, dataset1_PropertyGraph): + pG = dataset1_PropertyGraph + + gpubenchmark(pG.get_vertices) + + +def bench_extract_subgraph_for_cyber(gpubenchmark, cyber_PropertyGraph): + from cugraph.experimental import PropertyGraph + + pG = cyber_PropertyGraph + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name + + # Create a Graph containing only specific src or dst vertices + verts = ["10.40.182.3", "10.40.182.255", "59.166.0.9", "59.166.0.8"] + selected_edges = \ + pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") + gpubenchmark(pG.extract_subgraph, + create_using=cugraph.Graph(directed=True), + selection=selected_edges, + default_edge_weight=1.0, + allow_multi_edges=True) + + +def bench_extract_subgraph_for_cyber_detect_duplicate_edges( + gpubenchmark, cyber_PropertyGraph): + from cugraph.experimental import PropertyGraph + + pG = cyber_PropertyGraph + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name + + # Create a Graph containing only specific src or dst vertices + verts = ["10.40.182.3", "10.40.182.255", "59.166.0.9", "59.166.0.8"] + selected_edges = \ + pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") + + def func(): + with pytest.raises(RuntimeError): + pG.extract_subgraph(create_using=cugraph.Graph(directed=True), + selection=selected_edges, + default_edge_weight=1.0, + allow_multi_edges=False) + + 
gpubenchmark(func) + + +def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph): + from cugraph.experimental import PropertyGraph + + (pG, generated_df) = rmat_PropertyGraph + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name + + verts = [] + for i in range(0, 10000, 10): + verts.append(generated_df["src"].iloc[i]) + + selected_edges = \ + pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") + gpubenchmark(pG.extract_subgraph, + create_using=cugraph.Graph(directed=True), + selection=selected_edges, + default_edge_weight=1.0, + allow_multi_edges=True) + + +# This test runs for *minutes* with the current implementation, and since +# benchmarking can call it multiple times per run, the overall time for this +# test can be ~20 minutes. +@pytest.mark.slow +def bench_extract_subgraph_for_rmat_detect_duplicate_edges( + gpubenchmark, rmat_PropertyGraph): + from cugraph.experimental import PropertyGraph + + (pG, generated_df) = rmat_PropertyGraph + scn = PropertyGraph.src_col_name + dcn = PropertyGraph.dst_col_name + + verts = [] + for i in range(0, 10000, 10): + verts.append(generated_df["src"].iloc[i]) + + selected_edges = \ + pG.select_edges(f"{scn}.isin({verts}) | {dcn}.isin({verts})") + + def func(): + with pytest.raises(RuntimeError): + pG.extract_subgraph(create_using=cugraph.Graph(directed=True), + selection=selected_edges, + default_edge_weight=1.0, + allow_multi_edges=False) + + gpubenchmark(func) diff --git a/python/cugraph/pytest.ini b/python/cugraph/pytest.ini index 046f972801c..0da378d3d13 100644 --- a/python/cugraph/pytest.ini +++ b/python/cugraph/pytest.ini @@ -1,9 +1,24 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + [pytest] addopts = --benchmark-warmup=off --benchmark-max-time=0 --benchmark-min-rounds=1 --benchmark-columns="mean, rounds" + ## do not run the slow tests/benchmarks by default + -m "not slow" ## for use with rapids-pytest-benchmark plugin #--benchmark-gpu-disable ## for use with pytest-cov plugin @@ -24,3 +39,16 @@ markers = cugraph_types: use cuGraph input types nx_types: use NetworkX input types matrix_types: use SciPy/CuPy matrix input types + slow: slow-running tests/benchmarks + +python_classes = + Bench* + Test* + +python_files = + bench_* + test_* + +python_functions = + bench_* + test_*