Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] add multi-column support in algorithms - part 2 #1571

Merged
merged 10 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion notebooks/link_prediction/Jaccard-Similarity.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -451,8 +451,9 @@
"metadata": {},
"outputs": [],
"source": [
"pr_df.rename(columns={'pagerank': 'weight'}, inplace=True)",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is a notebook, does this need a comment explaining why it's needed to the readers?

"# Call weighted Jaccard using the Pagerank scores as weights:\n",
"wdf = cugraph.jaccard_w(G, pr_df['pagerank'])"
"wdf = cugraph.jaccard_w(G, pr_df)"
]
},
{
Expand Down
34 changes: 20 additions & 14 deletions python/cugraph/community/spectral_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,8 @@ def analyzeClustering_modularity(G, n_clusters, clustering,
Specifies the number of clusters in the given clustering
clustering : cudf.DataFrame
The cluster assignment to analyze.
vertex_col_name : str
The name of the column in the clustering dataframe identifying
vertex_col_name : str or list of str
The name(s) of the column(s) in the clustering dataframe identifying
the external vertex id
cluster_col_name : str
The name of the column in the clustering dataframe identifying
Expand All @@ -213,8 +213,10 @@ def analyzeClustering_modularity(G, n_clusters, clustering,
>>> df = cugraph.spectralBalancedCutClustering(G, 5)
>>> score = cugraph.analyzeClustering_modularity(G, 5, df)
"""

if type(vertex_col_name) is not str:
if type(vertex_col_name) is list:
if not all(isinstance(name, str) for name in vertex_col_name):
raise Exception("vertex_col_name must be list of string")
elif type(vertex_col_name) is not str:
raise Exception("vertex_col_name must be a string")

if type(cluster_col_name) is not str:
Expand All @@ -224,11 +226,11 @@ def analyzeClustering_modularity(G, n_clusters, clustering,

if G.renumbered:
clustering = G.add_internal_vertex_id(clustering,
vertex_col_name,
'vertex',
vertex_col_name,
drop=True)

clustering = clustering.sort_values(vertex_col_name)
clustering = clustering.sort_values('vertex')

score = spectral_clustering_wrapper.analyzeClustering_modularity(
G, n_clusters, clustering[cluster_col_name]
Expand Down Expand Up @@ -277,8 +279,10 @@ def analyzeClustering_edge_cut(G, n_clusters, clustering,
>>> df = cugraph.spectralBalancedCutClustering(G, 5)
>>> score = cugraph.analyzeClustering_edge_cut(G, 5, df)
"""

if type(vertex_col_name) is not str:
if type(vertex_col_name) is list:
if not all(isinstance(name, str) for name in vertex_col_name):
raise Exception("vertex_col_name must be list of string")
elif type(vertex_col_name) is not str:
raise Exception("vertex_col_name must be a string")

if type(cluster_col_name) is not str:
Expand All @@ -288,11 +292,11 @@ def analyzeClustering_edge_cut(G, n_clusters, clustering,

if G.renumbered:
clustering = G.add_internal_vertex_id(clustering,
vertex_col_name,
'vertex',
vertex_col_name,
drop=True)

clustering = clustering.sort_values(vertex_col_name).reset_index(drop=True)
clustering = clustering.sort_values('vertex').reset_index(drop=True)

score = spectral_clustering_wrapper.analyzeClustering_edge_cut(
G, n_clusters, clustering[cluster_col_name]
Expand Down Expand Up @@ -339,20 +343,22 @@ def analyzeClustering_ratio_cut(G, n_clusters, clustering,
>>> score = cugraph.analyzeClustering_ratio_cut(G, 5, df,
>>> 'vertex', 'cluster')
"""

if type(vertex_col_name) is not str:
if type(vertex_col_name) is list:
if not all(isinstance(name, str) for name in vertex_col_name):
raise Exception("vertex_col_name must be list of string")
elif type(vertex_col_name) is not str:
raise Exception("vertex_col_name must be a string")

if type(cluster_col_name) is not str:
raise Exception("cluster_col_name must be a string")

if G.renumbered:
clustering = G.add_internal_vertex_id(clustering,
vertex_col_name,
'vertex',
vertex_col_name,
drop=True)

clustering = clustering.sort_values(vertex_col_name)
clustering = clustering.sort_values('vertex')

score = spectral_clustering_wrapper.analyzeClustering_ratio_cut(
G, n_clusters, clustering[cluster_col_name]
Expand Down
10 changes: 5 additions & 5 deletions python/cugraph/layout/force_atlas2.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# limitations under the License.

from cugraph.layout import force_atlas2_wrapper
from cugraph.structure.graph_classes import null_check


def force_atlas2(
Expand Down Expand Up @@ -109,13 +108,14 @@ def on_train_end(self, positions):
"""

if pos_list is not None:
null_check(pos_list["vertex"])
null_check(pos_list["x"])
null_check(pos_list["y"])
if input_graph.renumbered is True:
if input_graph.vertex_column_size() > 1:
cols = pos_list.columns[:-2].to_list()
else:
cols = 'vertex'
pos_list = input_graph.add_internal_vertex_id(pos_list,
"vertex",
"vertex")
cols)

if prevent_overlapping:
raise Exception("Feature not supported")
Expand Down
19 changes: 14 additions & 5 deletions python/cugraph/link_analysis/pagerank.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
# limitations under the License.

from cugraph.link_analysis import pagerank_wrapper
from cugraph.structure.graph_classes import null_check
import cugraph


Expand Down Expand Up @@ -67,6 +66,10 @@ def pagerank(
Subset of vertices of graph for initial guess for pagerank values
nstart['values'] : cudf.Series
Pagerank values for vertices
weight : str
The attribute column to be used as edge weights if Graph is a NetworkX
Graph. This parameter is here for NetworkX compatibility and is ignored
in case of a cugraph.Graph.
dangling : dict
This parameter is here for NetworkX compatibility and ignored

Expand Down Expand Up @@ -94,17 +97,23 @@ def pagerank(
G, isNx = cugraph.utilities.check_nx_graph(G, weight)

if personalization is not None:
null_check(personalization["vertex"])
null_check(personalization["values"])
if G.renumbered is True:
if len(G.renumber_map.implementation.col_names) > 1:
cols = personalization.columns[:-1].to_list()
else:
cols = 'vertex'
personalization = G.add_internal_vertex_id(
personalization, "vertex", "vertex"
personalization, "vertex", cols
)

if nstart is not None:
if G.renumbered is True:
if len(G.renumber_map.implementation.col_names) > 1:
cols = nstart.columns[:-1].to_list()
else:
cols = 'vertex'
nstart = G.add_internal_vertex_id(
nstart, "vertex", "vertex"
nstart, "vertex", cols
)

df = pagerank_wrapper.pagerank(
Expand Down
12 changes: 3 additions & 9 deletions python/cugraph/link_prediction/jaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@

import pandas as pd
import cudf
from cugraph.structure.graph_classes import Graph, null_check
from cugraph.structure.graph_classes import Graph
from cugraph.link_prediction import jaccard_wrapper
from cugraph.utilities import check_nx_graph
from cugraph.utilities import df_edge_score_to_dictionary
from cugraph.utilities import renumber_vertex_pair


def jaccard(input_graph, vertex_pair=None):
Expand Down Expand Up @@ -108,15 +109,8 @@ def jaccard(input_graph, vertex_pair=None):
if type(input_graph) is not Graph:
raise Exception("input graph must be undirected")

# FIXME: Add support for multi-column vertices
if type(vertex_pair) == cudf.DataFrame:
for col in vertex_pair.columns:
null_check(vertex_pair[col])
if input_graph.renumbered:
vertex_pair = input_graph.add_internal_vertex_id(
vertex_pair, col, col
)

vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
elif vertex_pair is None:
pass
else:
Expand Down
10 changes: 2 additions & 8 deletions python/cugraph/link_prediction/overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@

import pandas as pd
from cugraph.link_prediction import overlap_wrapper
from cugraph.structure.graph_classes import null_check
import cudf
from cugraph.utilities import check_nx_graph
from cugraph.utilities import df_edge_score_to_dictionary
from cugraph.utilities import renumber_vertex_pair


def overlap_coefficient(G, ebunch=None):
Expand Down Expand Up @@ -91,14 +91,8 @@ def overlap(input_graph, vertex_pair=None):
>>> df = cugraph.overlap(G)
"""

# FIXME: Add support for multi-column vertices
if type(vertex_pair) == cudf.DataFrame:
for col in vertex_pair.columns:
null_check(vertex_pair[col])
if input_graph.renumbered:
vertex_pair = input_graph.add_internal_vertex_id(
vertex_pair, col, col,
)
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
elif vertex_pair is None:
pass
else:
Expand Down
5 changes: 3 additions & 2 deletions python/cugraph/link_prediction/overlap_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,9 @@ def overlap(input_graph, weights_arr=None, vertex_pair=None):
df = cudf.DataFrame()
df['overlap_coeff'] = result

first = vertex_pair['first']
second = vertex_pair['second']
cols = vertex_pair.columns.to_list()
first = vertex_pair[cols[0]]
second = vertex_pair[cols[1]]

# FIXME: multi column support
df['source'] = first
Expand Down
37 changes: 27 additions & 10 deletions python/cugraph/link_prediction/wjaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from cugraph.structure.graph_classes import Graph, null_check
from cugraph.structure.graph_classes import Graph
from cugraph.link_prediction import jaccard_wrapper
import cudf
import numpy as np
from cugraph.utilities import renumber_vertex_pair


def jaccard_w(input_graph, weights, vertex_pair=None):
Expand All @@ -35,8 +37,15 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
as an edge list (edge weights are not used for this algorithm). The
adjacency list will be computed if not already present.

weights : cudf.Series
weights : cudf.DataFrame
Specifies the weights to be used for each vertex.
Vertex should be represented by multiple columns for multi-column
vertices.

weights['vertex'] : cudf.Series
Contains the vertex identifiers
weights['weight'] : cudf.Series
Contains the weights of vertices

vertex_pair : cudf.DataFrame
A GPU dataframe consisting of two columns representing pairs of
Expand Down Expand Up @@ -70,20 +79,28 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
if type(input_graph) is not Graph:
raise Exception("input graph must be undirected")

# FIXME: Add support for multi-column vertices
if type(vertex_pair) == cudf.DataFrame:
for col in vertex_pair.columns:
null_check(vertex_pair[col])
if input_graph.renumbered:
vertex_pair = input_graph.add_internal_vertex_id(
vertex_pair, col, col,
)
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
elif vertex_pair is None:
pass
else:
raise ValueError("vertex_pair must be a cudf dataframe")

df = jaccard_wrapper.jaccard(input_graph, weights, vertex_pair)
if input_graph.renumbered:
vertex_size = input_graph.vertex_column_size()
if vertex_size == 1:
weights = input_graph.add_internal_vertex_id(
weights, 'vertex', 'vertex'
)
else:
cols = weights.columns[:vertex_size].to_list()
weights = input_graph.add_internal_vertex_id(
weights, 'vertex', cols
)
jaccard_weights = cudf.Series(np.ones(len(weights)))
for i in range(len(weights)):
jaccard_weights[weights['vertex'].iloc[i]] = weights['weight'].iloc[i]
df = jaccard_wrapper.jaccard(input_graph, jaccard_weights, vertex_pair)

if input_graph.renumbered:
df = input_graph.unrenumber(df, "source")
Expand Down
32 changes: 23 additions & 9 deletions python/cugraph/link_prediction/woverlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
# limitations under the License.

from cugraph.link_prediction import overlap_wrapper
from cugraph.structure.graph_classes import null_check
import cudf
import numpy as np
from cugraph.utilities import renumber_vertex_pair


def overlap_w(input_graph, weights, vertex_pair=None):
Expand Down Expand Up @@ -67,20 +68,33 @@ def overlap_w(input_graph, weights, vertex_pair=None):
>>> G.from_cudf_edgelist(M, source='0', destination='1')
>>> df = cugraph.overlap_w(G, M[2])
"""
# FIXME: Add support for multi-column vertices

if type(vertex_pair) == cudf.DataFrame:
for col in vertex_pair.columns:
null_check(vertex_pair[col])
if input_graph.renumbered:
vertex_pair = input_graph.add_internal_vertex_id(
vertex_pair, col, col
)
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
elif vertex_pair is None:
pass
else:
raise ValueError("vertex_pair must be a cudf dataframe")

df = overlap_wrapper.overlap(input_graph, weights, vertex_pair)
if input_graph.renumbered:
vertex_size = input_graph.vertex_column_size()
if vertex_size == 1:
weights = input_graph.add_internal_vertex_id(
weights, 'vertex', 'vertex'
)
else:
cols = weights.columns[:vertex_size].to_list()
weights = input_graph.add_internal_vertex_id(
weights, 'vertex', cols
)

overlap_weights = cudf.Series(np.ones(len(weights)))
for i in range(len(weights)):
overlap_weights[weights['vertex'].iloc[i]] = weights['weight'].iloc[i]

overlap_weights = overlap_weights.astype('float32')

df = overlap_wrapper.overlap(input_graph, overlap_weights, vertex_pair)

if input_graph.renumbered:
df = input_graph.unrenumber(df, "source")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,9 @@ def compute_renumber_edge_list(self, transposed=False):
self.edgelist = self.EdgeList(renumbered_ddf)
self.renumber_map = number_map
self.properties.store_transposed = transposed

# Report the vertex column count for this graph.
# When self.properties.renumbered is set, the answer is delegated to
# self.renumber_map.vertex_column_size(); otherwise 1 is returned
# (presumably because a non-renumbered graph always uses a single vertex
# column -- TODO confirm).
def vertex_column_size(self):
if self.properties.renumbered:
return self.renumber_map.vertex_column_size()
else:
return 1
Original file line number Diff line number Diff line change
Expand Up @@ -823,3 +823,9 @@ def neighbors(self, n):
return self.renumber_map.from_internal_vertex_id(neighbors)["0"]
else:
return neighbors

# Report the vertex column count for this graph.
# When self.properties.renumbered is set, the answer is delegated to
# self.renumber_map.vertex_column_size(); otherwise 1 is returned
# (presumably because a non-renumbered graph always uses a single vertex
# column -- TODO confirm).
def vertex_column_size(self):
if self.properties.renumbered:
return self.renumber_map.vertex_column_size()
else:
return 1
3 changes: 3 additions & 0 deletions python/cugraph/structure/number_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,3 +671,6 @@ def unrenumber(self, df, column_name, preserve_order=False,
return df, col_names
else:
return df

# Return the number of entries in self.implementation.col_names
# (presumably one entry per external vertex column, so a value greater
# than 1 indicates multi-column vertices -- TODO confirm).
def vertex_column_size(self):
return len(self.implementation.col_names)
Loading