rapidsai · rapids-bot · Aug 31, 2022 · Jun 30, 2022 · Jul 7, 2022 · Jul 7, 2022
@@ -0,0 +1,368 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PyG+cuGraph Heterogeneous MAG Example\n",
+    "# Skip notebook test\n",
+    "\n",
+    "### Requires installation of PyG"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import rmm\n",
+    "\n",
+    "rmm.reinitialize(pool_allocator=True,initial_pool_size=5e+9, maximum_pool_size=20e+9)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load MAG into CPU Memory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cugraph\n",
+    "import cudf\n",
+    "from ogb.nodeproppred import NodePropPredDataset\n",
+    "\n",
+    "dataset = NodePropPredDataset(name = 'ogbn-mag') \n",
+    "\n",
+    "data = dataset[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create PropertyGraph from MAG Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Partially Load the Vertex Data (just ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cudf\n",
+    "import dask_cudf\n",
+    "import cugraph\n",
+    "from cugraph.experimental import MGPropertyGraph\n",
+    "from cugraph.experimental import PropertyGraph\n",
+    "pG = PropertyGraph()\n",
+    "\n",
+    "vertex_offsets = {}\n",
+    "last_offset = 0\n",
+    "\n",
+    "for node_type, num_nodes in data[0]['num_nodes_dict'].items():\n",
+    "    vertex_offsets[node_type] = last_offset\n",
+    "    last_offset += num_nodes\n",
+    "    \n",
+    "    blank_df = cudf.DataFrame({'id':range(vertex_offsets[node_type], vertex_offsets[node_type] + num_nodes)})\n",
+    "    blank_df.id = blank_df.id.astype('int32')\n",
+    "    if isinstance(pG, MGPropertyGraph):\n",
+    "        blank_df = dask_cudf.from_cudf(blank_df, npartitions=2)\n",
+    "    pG.add_vertex_data(blank_df, vertex_col_name='id', type_name=node_type)\n",
+    "\n",
+    "vertex_offsets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add the Remaining Node Features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i, (node_type, node_features) in enumerate(data[0]['node_feat_dict'].items()):\n",
+    "    vertex_offset = vertex_offsets[node_type]\n",
+    "\n",
+    "    feature_df = cudf.DataFrame(node_features)\n",
+    "    feature_df.columns = [str(c) for c in range(feature_df.shape[1])]\n",
+    "    feature_df['id'] = range(vertex_offset, vertex_offset + node_features.shape[0])\n",
+    "    feature_df.id = feature_df.id.astype('int32')\n",
+    "    if isinstance(pG, MGPropertyGraph):\n",
+    "        feature_df = dask_cudf.from_cudf(feature_df, npartitions=2)\n",
+    "\n",
+    "    pG.add_vertex_data(feature_df, vertex_col_name='id', type_name=node_type)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add the Edges"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i, (edge_key, eidx) in enumerate(data[0]['edge_index_dict'].items()):\n",
+    "    node_type_src, edge_type, node_type_dst = edge_key\n",
+    "    print(node_type_src, edge_type, node_type_dst)\n",
+    "    vertex_offset_src = vertex_offsets[node_type_src]\n",
+    "    vertex_offset_dst = vertex_offsets[node_type_dst]\n",
+    "    eidx = [n + vertex_offset_src for n in eidx[0]], [n + vertex_offset_dst for n in eidx[1]]\n",
+    "\n",
+    "    edge_df = cudf.DataFrame({'src':eidx[0], 'dst':eidx[1]})\n",
+    "    edge_df.src = edge_df.src.astype('int32')\n",
+    "    edge_df.dst = edge_df.dst.astype('int32')\n",
+    "    edge_df['type'] = edge_type\n",
+    "    if isinstance(pG, MGPropertyGraph):\n",
+    "        edge_df = dask_cudf.from_cudf(edge_df, npartitions=2)\n",
+    "\n",
+    "    # Adding backwards edges is currently required in both the cuGraph PG and PyG APIs.\n",
+    "    pG.add_edge_data(edge_df, vertex_col_names=['src','dst'], type_name=edge_type)\n",
+    "    pG.add_edge_data(edge_df, vertex_col_names=['dst','src'], type_name=f'{edge_type}_bw')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add the Target Variable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_df = cudf.DataFrame(data[1]['paper'], columns=['y'])\n",
+    "y_df['id'] = range(vertex_offsets['paper'], vertex_offsets['paper'] + len(y_df))\n",
+    "y_df.id = y_df.id.astype('int32')\n",
+    "if isinstance(pG, MGPropertyGraph):\n",
+    "    y_df = dask_cudf.from_cudf(y_df, npartitions=2)\n",
+    "\n",
+    "pG.add_vertex_data(y_df, vertex_col_name='id', type_name='paper')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Construct a Graph Store, Feature Store, and Loaders"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cugraph.gnn.pyg_extensions.data.cugraph_store import to_pyg\n",
+    "\n",
+    "feature_store, graph_store = to_pyg(pG)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from cugraph.gnn.pyg_extensions import CuGraphLinkNeighborLoader\n",
+    "loader = CuGraphLinkNeighborLoader(\n",
+    "    data=(feature_store, graph_store),\n",
+    "    edge_label_index='writes',\n",
+    "    shuffle=True,\n",
+    "    num_neighbors=[10,25],\n",
+    "    batch_size=50,\n",
+    ")\n",
+    "\n",
+    "test_loader = CuGraphLinkNeighborLoader(\n",
+    "    data=(feature_store, graph_store),\n",
+    "    edge_label_index='writes',\n",
+    "    shuffle=True,\n",
+    "    num_neighbors=[10,25],\n",
+    "    batch_size=50,\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create the Network"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "edge_types = [attr.edge_type for attr in graph_store.get_all_edge_attrs()]\n",
+    "edge_types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_classes = pG.get_vertex_data(columns=['y'])['y'].max() + 1\n",
+    "if isinstance(pG, MGPropertyGraph):\n",
+    "    num_classes = num_classes.compute()\n",
+    "num_classes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "from torch_geometric.nn import HeteroConv, Linear, SAGEConv\n",
+    "\n",
+    "class HeteroGNN(torch.nn.Module):\n",
+    "    def __init__(self, edge_types, hidden_channels, out_channels, num_layers):\n",
+    "        super().__init__()\n",
+    "\n",
+    "        self.convs = torch.nn.ModuleList()\n",
+    "        for _ in range(num_layers):\n",
+    "            conv = HeteroConv({\n",
+    "                edge_type: SAGEConv((-1, -1), hidden_channels)\n",
+    "                for edge_type in edge_types\n",
+    "            })\n",
+    "            self.convs.append(conv)\n",
+    "\n",
+    "        self.lin = Linear(hidden_channels, out_channels)\n",
+    "\n",
+    "    def forward(self, x_dict, edge_index_dict):\n",
+    "        for conv in self.convs:\n",
+    "            x_dict = conv(x_dict, edge_index_dict)\n",
+    "            x_dict = {key: F.leaky_relu(x) for key, x in x_dict.items()}\n",
+    "        print(x_dict, edge_index_dict)\n",
+    "        return self.lin(x_dict['paper'])\n",
+    "\n",
+    "\n",
+    "model = HeteroGNN(edge_types, hidden_channels=64, out_channels=num_classes,\n",
+    "                  num_layers=2).cuda()\n",
+    "\n",
+    "with torch.no_grad():  # Initialize lazy modules.\n",
+    "    data = next(iter(loader))\n",
+    "    out = model(data.x_dict, data.edge_index_dict)\n",
+    "\n",
+    "optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001)\n",
+    "\n",
+    "num_batches = 5\n",
+    "def train():\n",
+    "    model.train()\n",
+    "    optimizer.zero_grad()\n",
+    "    for b_i, data in enumerate(loader):\n",
+    "        if b_i == num_batches:\n",
+    "            break\n",
+    "\n",
+    "        out = model(data.x_dict, data.edge_index_dict)\n",
+    "        loss = F.cross_entropy(out, data.y_dict['paper'])\n",
+    "        loss.backward()\n",
+    "        optimizer.step()\n",
+    "    \n",
+    "    return float(loss) / num_batches\n",
+    "\n",
+    "\n",
+    "@torch.no_grad()\n",
+    "def test():\n",
+    "    model.eval()\n",
+    "    test_iter = iter(test_loader)\n",
+    "\n",
+    "    acc = 0.0\n",
+    "    for _ in range(2*num_batches):\n",
+    "        data = next(test_iter)\n",
+    "        pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)\n",
+    "\n",
+    "        \n",
+    "        acc += (pred == data['paper'].y).sum() / len(data['paper'])\n",
+    "    return acc / (2*num_batches)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Train the Network"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for epoch in range(1, 101):\n",
+    "    loss = train()\n",
+    "    train_acc = test()\n",
+    "    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.7 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "f708a36acfaef0acf74ccd43dfb58100269bf08fb79032a1e0a6f35bd9856f51"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -123,7 +123,10 @@ def uniform_neighbor_sample(input_graph,
         raise TypeError("fanout_vals must be a list, "
                         f"got: {type(fanout_vals)}")
 
-    weight_t = input_graph.edgelist.edgelist_df["value"].dtype
+    if 'value' in input_graph.edgelist.edgelist_df:
+        weight_t = input_graph.edgelist.edgelist_df["value"].dtype
+    else:
+        weight_t = 'float32'
 
     # start_list uses "external" vertex IDs, but if the graph has been
     # renumbered, the start vertex IDs must also be renumbered.

@@ -0,0 +1,14 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cugraph.gnn.pyg_extensions.data import to_pyg