Merge branch 'master' into update-cugraph-relgraphconv

dmlc · Feb 15, 2023 · 474f8d2 · 474f8d2
2 parents a03d0b3 + 40c968b
commit 474f8d2
Show file tree

Hide file tree

Showing 9 changed files with 196 additions and 21 deletions.
diff --git a/python/dgl/nn/pytorch/conv/egatconv.py b/python/dgl/nn/pytorch/conv/egatconv.py
@@ -22,8 +22,7 @@ class EGATConv(nn.Module):
         f_{ij}^{\prime} &= \mathrm{LeakyReLU}\left(A [ h_{i} \| f_{ij} \| h_{j}]\right)
 
     where :math:`f_{ij}^{\prime}` are edge features, :math:`\mathrm{A}` is weight matrix and
-
-    :math: `\vec{F}` is weight vector. After that, resulting node features
+    :math:`\vec{F}` is weight vector. After that, resulting node features
     :math:`h_{i}^{\prime}` are updated in the same way as in regular GAT.
 
     Parameters

diff --git a/python/dgl/transforms/functional.py b/python/dgl/transforms/functional.py
@@ -3508,7 +3508,7 @@ def radius_graph(x, r, p=2, self_loop=False,
     distances = th.cdist(x, x, p=p, compute_mode=compute_mode)
 
     if not self_loop:
-        distances.fill_diagonal_(r + 1e-4)
+        distances.fill_diagonal_(r + 1)
 
     edges = th.nonzero(distances <= r, as_tuple=True)
 

diff --git a/script/create_dev_conda_env.sh b/script/create_dev_conda_env.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+
+readonly CUDA_VERSIONS="10.2,11.3,11.6,11.7"
+readonly TORCH_VERSION="1.12.0"
+
+# Print the command-line help text, including the supported CUDA versions,
+# to stdout.
+usage() {
+cat << EOF
+usage: bash $0 OPTIONS
+examples:
+  bash $0 -c
+  bash $0 -g 11.7
+
+Create a development environment for DGL developers.
+
+OPTIONS:
+  -h           Show this message.
+  -c           Create dev environment in CPU mode.
+  -g           Create dev environment in GPU mode with specified CUDA version,
+               supported: ${CUDA_VERSIONS}.
+EOF
+}
+
+# Check whether a value is one of a comma-separated list of allowed values.
+#   $1: comma-separated list of allowed values.
+#   $2: value to look up.
+# Returns 0 when $2 equals one of the entries, 1 otherwise.
+validate() {
+  local allowed value
+  # Split on commas into an array; no word splitting or globbing is involved.
+  IFS=',' read -r -a allowed <<< "$1"
+  for value in "${allowed[@]}"
+  do
+    # The right-hand side must be quoted: unquoted, [[ == ]] treats it as a
+    # glob pattern, so an input such as "*" would match every allowed value.
+    if [[ "${value}" == "$2" ]]; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+# Ask the user to confirm on stdin. The script exits with status 0 unless the
+# reply is "yes"; an empty reply or end of input counts as a refusal.
+confirm() {
+  local reply
+  echo "Continue? [yes/no]:"
+  # -r keeps backslashes literal, so a reply such as "y\es" is not read as
+  # "yes".
+  read -r reply
+  if [[ "${reply}" != "yes" ]]; then
+    exit 0
+  fi
+}
+
+# Parse flags.
+while getopts "cg:h" flag; do
+  if [[ ${flag} == "c" ]]; then
+    cpu=1
+  elif [[ ${flag} == "g" ]]; then
+    gpu=${OPTARG}
+  elif [[ ${flag} == "h" ]]; then
+    usage
+    exit 0
+  else
+    usage
+    exit 1
+  fi
+done
+
+if [[ -n ${gpu} && ${cpu} -eq 1 ]]; then
+  echo "Only one mode can be specified."
+  exit 1
+fi
+
+if [[ -z ${gpu} && -z ${cpu} ]]; then
+  usage
+  exit 1
+fi
+
+# Set up CPU mode.
+if [[ ${cpu} -eq 1 ]]; then
+  torchversion="${TORCH_VERSION}+cpu"
+  name="dgl-dev-cpu"
+fi
+
+# Set up GPU mode.
+if [[ -n ${gpu} ]]; then
+  # Quote both arguments so that a value containing whitespace is validated as
+  # a whole instead of being split into several words.
+  if ! validate "${CUDA_VERSIONS}" "${gpu}"; then
+    echo "Error: Invalid CUDA version."
+    usage
+    exit 1
+  fi
+
+  echo "Confirm the installed CUDA version matches the specified one."
+  confirm
+
+  # Drop the separators from the CUDA version to form the wheel suffix,
+  # e.g. 11.7 -> cu117.
+  torchversion="${TORCH_VERSION}+cu${gpu//[-._]/}"
+  name="dgl-dev-gpu"
+fi
+
+echo "Confirm you are excuting the script from your DGL root directory."
+echo "Current working directory: ${PWD}"
+confirm
+
+# Prepare the conda environment yaml file.
+rand=$(echo "${RANDOM}" | md5sum | head -c 20)
+mkdir -p /tmp/${rand}
+cp script/dgl_dev.yml.template /tmp/${rand}/dgl_dev.yml
+sed -i "s|__NAME__|${name}|g" /tmp/${rand}/dgl_dev.yml
+sed -i "s|__TORCH_VERSION__|${torchversion}|g" /tmp/${rand}/dgl_dev.yml
+sed -i "s|__DGL_HOME__|${PWD}|g" /tmp/${rand}/dgl_dev.yml
+
+# Ask for final confirmation.
+echo "--------------------------------------------------"
+cat /tmp/${rand}/dgl_dev.yml
+echo "--------------------------------------------------"
+echo "Create a conda enviroment with the config?"
+confirm
+
+# Create conda environment.
+conda env create -f /tmp/${rand}/dgl_dev.yml
+
+# Clean up created tmp conda environment yaml file.
+rm -rf /tmp/${rand}
+exit 0
diff --git a/script/dgl_dev.yml.template b/script/dgl_dev.yml.template
@@ -0,0 +1,29 @@
+# Conda environment template for DGL development.
+# script/create_dev_conda_env.sh copies this file and replaces the
+# double-underscore placeholders (environment name, torch version and DGL
+# home directory) with sed before passing it to `conda env create`.
+name: __NAME__
+dependencies:
+  - python=3.7.0
+  - pip
+  - pip:
+    - --find-links https://download.pytorch.org/whl/torch_stable.html
+    - cython
+    - filelock
+    - matplotlib
+    - networkx
+    - nltk
+    - nose
+    - numpy
+    - ogb
+    - pandas
+    - psutil
+    - pyarrow
+    - pydantic
+    - pytest
+    - pyyaml
+    - rdflib
+    - requests[security]
+    - scikit-learn
+    - scipy
+    - torch==__TORCH_VERSION__
+    - torchmetrics
+    - tqdm
+variables:
+  DGL_HOME: __DGL_HOME__
diff --git a/tests/pytorch/sparse/test_example.py → tests/examples/test_sparse_examples.py b/tests/pytorch/sparse/test_example.py → tests/examples/test_sparse_examples.py
@@ -6,7 +6,6 @@
     os.path.dirname(os.path.relpath(__file__)),
     "..",
     "..",
-    "..",
     "examples",
     "sparse",
 )

diff --git a/tests/scripts/task_example_test.bat b/tests/scripts/task_example_test.bat
@@ -22,6 +22,8 @@ SET DGL_LIBRARY_PATH=!CD!\build
 SET PYTHONPATH=!CD!\python;!PYTHONPATH!
 SET DGL_DOWNLOAD_DIR=!CD!
 
+python -m pytest -v --junitxml=pytest_backend.xml --durations=100 tests\examples || GOTO :FAIL
+
 PUSHD !GCN_EXAMPLE_DIR!
 python pagerank.py || GOTO :FAIL
 python gcn\train.py --dataset cora || GOTO :FAIL

diff --git a/tests/scripts/task_example_test.sh b/tests/scripts/task_example_test.sh
@@ -36,6 +36,8 @@ export DGL_DOWNLOAD_DIR=${PWD}
 
 # test
 
+python3 -m pytest -v --junitxml=pytest_backend.xml --durations=100 tests/examples || fail "sparse examples on $1"
+
 pushd $GCN_EXAMPLE_DIR> /dev/null
 
 python3 pagerank.py || fail "run pagerank.py on $1"

diff --git a/tests/tools/utils.py b/tests/tools/utils.py
@@ -1,10 +1,10 @@
-import os
 import json
 import logging
-import numpy as np
-import torch
+import os
 
 import dgl
+import numpy as np
+import torch
 from distpartitioning import array_readwriter
 from distpartitioning.array_readwriter.parquet import ParquetArrayParser
 from files import setdir
@@ -16,12 +16,16 @@ def _chunk_numpy_array(arr, fmt_meta, chunk_sizes, path_fmt, vector_rows=False):
 
     for j, n in enumerate(chunk_sizes):
         path = os.path.abspath(path_fmt % j)
-        arr_chunk = arr[offset: offset + n]
+        arr_chunk = arr[offset : offset + n]
         shape = arr_chunk.shape
         logging.info("Chunking %d-%d" % (offset, offset + n))
         # If requested we write multi-column arrays as single-column vector Parquet files
         array_parser = array_readwriter.get_array_parser(**fmt_meta)
-        if isinstance(array_parser, ParquetArrayParser) and len(shape) > 1 and shape[1] > 1:
+        if (
+            isinstance(array_parser, ParquetArrayParser)
+            and len(shape) > 1
+            and shape[1] > 1
+        ):
             array_parser.write(path, arr_chunk, vector_rows=vector_rows)
         else:
             array_parser.write(path, arr_chunk)
@@ -83,8 +87,15 @@ def _init(g, num_chunks, key, kwargs=None):
 
 
 def _chunk_graph(
-    g, name, ndata_paths, edata_paths, num_chunks, data_fmt, edges_format,
-    vector_rows=False, **kwargs
+    g,
+    name,
+    ndata_paths,
+    edata_paths,
+    num_chunks,
+    data_fmt,
+    edges_format,
+    vector_rows=False,
+    **kwargs,
 ):
     # First deal with ndata and edata that are homogeneous
     # (i.e. not a dict-of-dict)
@@ -139,16 +150,24 @@ def _chunk_graph(
         k: v for k, v in zip(g.canonical_etypes, num_edges_per_chunk)
     }
 
+    idxes_etypestr = {
+        idx: (etype, etypestrs[etype])
+        for idx, etype in enumerate(g.canonical_etypes)
+    }
+    idxes = np.arange(len(idxes_etypestr))
+
     # Split edge index
     metadata["edges"] = {}
     with setdir("edge_index"):
-        for etype in g.canonical_etypes:
-            etypestr = etypestrs[etype]
+        np.random.shuffle(idxes)
+        for idx in idxes:
+            etype = idxes_etypestr[idx][0]
+            etypestr = idxes_etypestr[idx][1]
             logging.info("Chunking edge index for %s" % etypestr)
             edges_meta = {}
-            if edges_format == 'csv':
+            if edges_format == "csv":
                 fmt_meta = {"name": edges_format, "delimiter": " "}
-            elif edges_format == 'parquet':
+            elif edges_format == "parquet":
                 fmt_meta = {"name": edges_format}
             else:
                 raise RuntimeError(f"Invalid edges_fmt: {edges_format}")
@@ -259,7 +278,7 @@ def chunk_graph(
     num_chunks,
     output_path,
     data_fmt="numpy",
-    edges_fmt='csv',
+    edges_fmt="csv",
     vector_rows=False,
     **kwargs,
 ):
@@ -302,14 +321,26 @@ def chunk_graph(
             edata[key] = os.path.abspath(edata[key])
     with setdir(output_path):
         _chunk_graph(
-            g, name, ndata_paths, edata_paths, num_chunks, data_fmt, edges_fmt,
-            vector_rows, **kwargs
+            g,
+            name,
+            ndata_paths,
+            edata_paths,
+            num_chunks,
+            data_fmt,
+            edges_fmt,
+            vector_rows,
+            **kwargs,
         )
 
 
 def create_chunked_dataset(
-    root_dir, num_chunks, data_fmt="numpy", edges_fmt='csv',
-    vector_rows=False, **kwargs):
+    root_dir,
+    num_chunks,
+    data_fmt="numpy",
+    edges_fmt="csv",
+    vector_rows=False,
+    **kwargs,
+):
     """
     This function creates a sample dataset, based on MAG240 dataset.
 

diff --git a/tools/distpartitioning/dataset_utils.py b/tools/distpartitioning/dataset_utils.py
@@ -529,7 +529,8 @@ def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
     ]:
         edge_datadict[col] = []
 
-    for etype_name, etype_info in edge_data.items():
+    for etype_name, etype_id in etype_name_idmap.items():
+        etype_info = edge_data[etype_name]
         edge_info = etype_info[constants.STR_DATA]
 
         # edgetype strings are in canonical format, src_node_type:edge_type:dst_node_type