diff --git a/MANIFEST.in b/MANIFEST.in index bc73983a9c4..ea12b9342b3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,4 @@ include python/versioneer.py -include python/cugraph/_version.py \ No newline at end of file +include python/cugraph/_version.py +include cugraph/experimental/datasets/*.yaml +include cugraph/experimental/datasets/metadata/*.yaml \ No newline at end of file diff --git a/python/cugraph/MANIFEST.in b/python/cugraph/MANIFEST.in index 51f5db65c13..1f6d9f7a4d0 100644 --- a/python/cugraph/MANIFEST.in +++ b/python/cugraph/MANIFEST.in @@ -1,2 +1,4 @@ include versioneer.py include cugraph/_version.py +include cugraph/experimental/datasets/*.yaml +include cugraph/experimental/datasets/metadata/*.yaml \ No newline at end of file diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index 9f912346990..964cfe369fe 100644 --- a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -39,3 +39,5 @@ find_bicliques = deprecated_warning_wrapper( experimental_warning_wrapper(EXPERIMENTAL__find_bicliques) ) + +from cugraph.experimental.datasets.dataset import Dataset diff --git a/python/cugraph/cugraph/experimental/datasets/__init__.py b/python/cugraph/cugraph/experimental/datasets/__init__.py new file mode 100644 index 00000000000..fedf9d0dbda --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/__init__.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from cugraph.experimental.datasets.dataset import (
+    Dataset,
+    load_all,
+    set_config,
+    set_download_dir,
+    get_download_dir,
+    default_download_dir
+)
+from cugraph.experimental.datasets import metadata
+from pathlib import Path
+
+
+meta_path = Path(__file__).parent / "metadata"
+
+karate = Dataset(meta_path / "karate.yaml")
+karate_undirected = Dataset(meta_path / "karate_undirected.yaml")
+karate_asymmetric = Dataset(meta_path / "karate_asymmetric.yaml")
+dolphins = Dataset(meta_path / "dolphins.yaml")
+polbooks = Dataset(meta_path / "polbooks.yaml")
+netscience = Dataset(meta_path / "netscience.yaml")
+cyber = Dataset(meta_path / "cyber.yaml")
+small_line = Dataset(meta_path / "small_line.yaml")
+small_tree = Dataset(meta_path / "small_tree.yaml")
+
+
+# LARGE DATASETS
+LARGE_DATASETS = [cyber]
+
+# MEDIUM DATASETS (<10,000 lines)
+MEDIUM_DATASETS = [netscience, polbooks]
+
+# SMALL DATASETS (<500 lines)
+SMALL_DATASETS = [karate, small_line, small_tree, dolphins]
+
+# ALL (the karate_undirected / karate_asymmetric variants are not listed)
+ALL_DATASETS = [karate, dolphins, netscience, polbooks, cyber,
+                small_line, small_tree]
\ No newline at end of file
diff --git a/python/cugraph/cugraph/experimental/datasets/dataset.py b/python/cugraph/cugraph/experimental/datasets/dataset.py
new file mode 100644
index 00000000000..3ae904904f6
--- /dev/null
+++ b/python/cugraph/cugraph/experimental/datasets/dataset.py
@@ -0,0 +1,207 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import cugraph
+import cudf
+import yaml
+import os
+from pathlib import Path
+
+
+class DefaultDownloadDir:
+    """
+    Maintains the path to the download directory used by Dataset instances.
+    Instances of this class are typically shared by several Dataset instances
+    in order to allow for the download directory to be defined and updated by
+    a single object.
+    """
+    def __init__(self):
+        self._path = Path(os.environ.get("RAPIDS_DATASET_ROOT_DIR",
+                                         Path.home() / ".cugraph/datasets"))
+
+    @property
+    def path(self):
+        """
+        Return the download directory. If it was cleared, re-resolve it from
+        the RAPIDS_DATASET_ROOT_DIR environment variable, falling back to
+        `.cugraph/datasets` under the user's home directory.
+        """
+        if self._path is None:
+            self._path = Path(os.environ.get("RAPIDS_DATASET_ROOT_DIR",
+                                             Path.home() /
+                                             ".cugraph/datasets"))
+        return self._path
+
+    @path.setter
+    def path(self, new):
+        self._path = Path(new)
+
+    def clear(self):
+        self._path = None
+
+
+default_download_dir = DefaultDownloadDir()
+
+
+class Dataset:
+    """
+    A Dataset Object, used to easily import edgelist data and cuGraph.Graph
+    instances.
+
+    Parameters
+    ----------
+    meta_data_file_name : yaml file
+        The metadata file for the specific graph dataset, which includes
+        information on the name, type, url link, data loading format, graph
+        properties
+
+    """
+    def __init__(self, meta_data_file_name):
+        with open(meta_data_file_name, 'r') as file:
+            self.metadata = yaml.safe_load(file)
+
+        self._dl_path = default_download_dir
+        self._edgelist = None
+        self._graph = None
+        self._path = None
+
+    def __download_csv(self, url):
+        self._dl_path.path.mkdir(parents=True, exist_ok=True)
+
+        filename = self.metadata['name'] + self.metadata['file_type']
+        if self._dl_path.path.is_dir():
+            df = cudf.read_csv(url)
+            df.to_csv(self._dl_path.path / filename, index=False)
+        else:
+            raise RuntimeError(f"The directory {self._dl_path.path.absolute()}"
+                               " does not exist")
+
+    def get_edgelist(self, fetch=False):
+        """
+        Return an Edgelist
+
+        Parameters
+        ----------
+        fetch : Boolean (default=False)
+            Automatically fetch the dataset from the 'url' location within
+            the YAML file.
+        """
+        if self._edgelist is None:
+            full_path = self._dl_path.path / (self.metadata['name'] +
+                                              self.metadata['file_type'])
+
+            if not full_path.is_file():
+                if fetch:
+                    self.__download_csv(self.metadata['url'])
+                else:
+                    raise RuntimeError(f"The datafile {full_path} does not"
+                                       " exist. Try get_edgelist(fetch=True)"
+                                       " to download the datafile")
+
+            self._edgelist = cudf.read_csv(full_path,
+                                           delimiter=self.metadata['delim'],
+                                           names=self.metadata['col_names'],
+                                           dtype=self.metadata['col_types'])
+            self._path = full_path
+
+        return self._edgelist
+
+    def get_graph(self, fetch=False):
+        """
+        Return a Graph object.
+
+        Parameters
+        ----------
+        fetch : Boolean (default=False)
+            Automatically fetch the dataset from the 'url' location within
+            the YAML file.
+        """
+        if self._edgelist is None:
+            self.get_edgelist(fetch)
+
+        self._graph = cugraph.Graph(directed=self.metadata['is_directed'])
+        self._graph.from_cudf_edgelist(self._edgelist, source='src',
+                                       destination='dst')
+
+        return self._graph
+
+    def get_path(self):
+        """
+        Returns the location of the stored dataset file
+        """
+        if self._path is None:
+            raise RuntimeError("Path to datafile has not been set." +
+                               " Call get_edgelist or get_graph first")
+
+        return self._path.absolute()
+
+
+def load_all(force=False):
+    """
+    Looks in `metadata` directory and fetches all datafiles from the URLs
+    provided in each YAML file.
+
+    Parameters
+    ----------
+    force : Boolean (default=False)
+        Overwrite any existing copies of datafiles.
+    """
+    default_download_dir.path.mkdir(parents=True, exist_ok=True)
+
+    meta_path = Path(__file__).parent.absolute() / "metadata"
+    for file in meta_path.iterdir():
+        meta = None
+        if file.suffix == '.yaml':
+            with open(meta_path / file, 'r') as metafile:
+                meta = yaml.safe_load(metafile)
+
+            if 'url' in meta:
+                filename = meta['name'] + meta['file_type']
+                save_to = default_download_dir.path / filename
+                if not save_to.is_file() or force:
+                    df = cudf.read_csv(meta['url'])
+                    df.to_csv(save_to, index=False)
+
+
+def set_config(cfgpath):
+    """
+    Read in a custom config file.
+
+    Parameters
+    ----------
+    cfgpath : String
+        Path to a custom YAML config; its `download_dir` overrides the default
+    """
+    with open(Path(cfgpath), 'r') as file:
+        cfg = yaml.safe_load(file)
+        default_download_dir.path = Path(cfg['download_dir'])
+
+
+def set_download_dir(path):
+    """
+    Set the download directory for fetching datasets
+
+    Parameters
+    ----------
+    path : String
+        Location used to store datafiles; None restores the default
+    """
+    if path is None:
+        default_download_dir.clear()
+    else:
+        default_download_dir.path = path
+
+
+def get_download_dir():
+    """Return the absolute path of the current download directory."""
+    return default_download_dir.path.absolute()
diff --git a/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml b/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml
new file mode 100644
index 00000000000..69a79db9cd9
--- /dev/null
+++ b/python/cugraph/cugraph/experimental/datasets/datasets_config.yaml
@@ -0,0 +1,5 @@
+---
+fetch: "False"
+force: "False"
+# path where datasets will be downloaded to and stored
+download_dir: "datasets"
diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml
new file mode 100644
index 00000000000..36858242ec7
--- /dev/null
+++ b/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml
@@ -0,0 +1,21 @@
+name: cyber
+file_type: .csv
+author: N/A
+url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/cyber.csv
+refs: N/A
+col_names:
+  - idx
+  - src
+  - dst
+col_types:
+  - int32
+  - str
+  - str
+delim: ","
+has_loop: true
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 54
+number_of_nodes: 314
+number_of_lines: 2546576
diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml
new file mode 100644
index 00000000000..ef07def2b97
--- /dev/null
+++ b/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml
@@ -0,0 +1,24 @@
+name: dolphins +file_type: .csv +author: D. Lusseau +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/dolphins.csv +refs: + D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and S. M. Dawson, + The bottlenose dolphin community of Doubtful Sound features a large proportion of + long-lasting associations, Behavioral Ecology and Sociobiology 54, 396-405 (2003). +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +delim: " " +has_loop: false +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 318 +number_of_nodes: 62 +number_of_lines: 318 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml new file mode 100644 index 00000000000..d86c7b1a241 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml @@ -0,0 +1,21 @@ +name: karate-data +file_type: .csv +author: Zachary W. +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-data.csv +refs: + W. W. Zachary, An information flow model for conflict and fission in small groups, + Journal of Anthropological Research 33, 452-473 (1977). +delim: "\t" +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: true +number_of_edges: 156 +number_of_nodes: 34 +number_of_lines: 156 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml new file mode 100644 index 00000000000..b4a81fde29e --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml @@ -0,0 +1,23 @@ +name: karate-asymmetric +file_type: .csv +author: Zachary W. +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-asymmetric.csv +refs: + W. W. 
Zachary, An information flow model for conflict and fission in small groups, + Journal of Anthropological Research 33, 452-473 (1977). +delim: "\t" +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: true +is_directed: false +is_multigraph: false +is_symmetric: false +number_of_edges: 78 +number_of_nodes: 34 +number_of_lines: 78 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml new file mode 100644 index 00000000000..061b3361367 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml @@ -0,0 +1,21 @@ +name: karate_undirected +file_type: .csv +author: Zachary W. +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate_undirected.csv +refs: + W. W. Zachary, An information flow model for conflict and fission in small groups, + Journal of Anthropological Research 33, 452-473 (1977). +delim: "\t" +col_names: + - src + - dst +col_types: + - int32 + - int32 +has_loop: true +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 78 +number_of_nodes: 34 +number_of_lines: 78 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml new file mode 100644 index 00000000000..9c3bd8a6a1d --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml @@ -0,0 +1,21 @@ +name: netscience +file_type: .csv +author: Newman, Mark EJ +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/netscience.csv +refs: Finding community structure in networks using the eigenvectors of matrices. 
+delim: " " +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: false +is_directed: true +is_multigraph: false +is_symmetric: true +number_of_edges: 2742 +number_of_nodes: 1461 +number_of_lines: 5484 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml new file mode 100644 index 00000000000..75e0e69565d --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml @@ -0,0 +1,21 @@ +name: polbooks +file_type: .csv +author: V. Krebs +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/polbooks.csv +refs: null +delim: " " +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +is_directed: true +has_loop: null +is_multigraph: null +is_symmetric: true +number_of_edges: 882 +number_of_nodes: 105 +number_of_lines: 882 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml new file mode 100644 index 00000000000..9831ff11b30 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml @@ -0,0 +1,21 @@ +name: small_line +file_type: .csv +author: null +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/small_line.csv +refs: null +delim: " " +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 9 +number_of_nodes: 10 +number_of_lines: 8 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml new file mode 100644 index 00000000000..942f468c23b --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml @@ -0,0 +1,21 @@ +name: small_tree +file_type: .csv +author: null 
+url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/small_tree.csv
+refs: null
+delim: " "
+col_names:
+  - src
+  - dst
+  - wgt
+col_types:
+  - int32
+  - int32
+  - float32
+has_loop: false
+is_directed: true
+is_multigraph: false
+is_symmetric: true
+number_of_edges: 11
+number_of_nodes: 9
+number_of_lines: 11
diff --git a/python/cugraph/cugraph/tests/test_dataset.py b/python/cugraph/cugraph/tests/test_dataset.py
new file mode 100644
index 00000000000..093f1382bcb
--- /dev/null
+++ b/python/cugraph/cugraph/tests/test_dataset.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+import yaml
+import os
+from pathlib import Path
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+from cugraph.experimental.datasets import (ALL_DATASETS)
+
+
+# =============================================================================
+# Pytest Setup / Teardown - called for each test function
+# =============================================================================
+
+# Yield the datasets API, then reset cached state to mimic a fresh import
+@pytest.fixture
+def datasets():
+    from cugraph.experimental import datasets
+    yield datasets
+    del datasets
+    clear_locals()
+
+
+def clear_locals():
+    for dataset in ALL_DATASETS:
+        dataset._edgelist = None
+        dataset._graph = None
+        dataset._path = None
+
+
+# We use this to create tempfiles that act as config files when we call
+# set_config(). Arguments passed will act as custom download directories
+def create_config(custom_path="custom_storage_location"):
+    config_yaml = """
+    fetch: False
+    force: False
+    download_dir: None
+    """
+    c = yaml.safe_load(config_yaml)
+    c['download_dir'] = custom_path
+
+    outfile = NamedTemporaryFile()
+    with open(outfile.name, 'w') as f:
+        yaml.dump(c, f, sort_keys=False)
+
+    return outfile
+
+
+# Setting download_dir to None effectively re-initializes the default
+def test_env_var(datasets):
+    os.environ['RAPIDS_DATASET_ROOT_DIR'] = 'custom_storage_location'
+    datasets.set_download_dir(None)
+
+    expected_path = Path("custom_storage_location").absolute()
+    assert datasets.get_download_dir() == expected_path
+
+    del os.environ['RAPIDS_DATASET_ROOT_DIR']
+
+
+def test_home_dir(datasets):
+    datasets.set_download_dir(None)
+    expected_path = Path.home() / ".cugraph/datasets"
+
+    assert datasets.get_download_dir() == expected_path
+
+
+def test_set_config(datasets):
+    cfg = create_config()
+    datasets.set_config(cfg.name)
+
+    assert datasets.get_download_dir() == \
+        Path("custom_storage_location").absolute()
+
+    cfg.close()
+
+
+def test_set_download_dir(datasets):
+    tmpd = TemporaryDirectory()
+    datasets.set_download_dir(tmpd.name)
+
+    assert datasets.get_download_dir() == Path(tmpd.name).absolute()
+
+    tmpd.cleanup()
+
+
+def test_load_all(datasets):
+    tmpd = TemporaryDirectory()
+    cfg = create_config(custom_path=tmpd.name)
+    datasets.set_config(cfg.name)
+    datasets.load_all()
+
+    for data in datasets.ALL_DATASETS:
+        file_path = Path(tmpd.name) / (data.metadata['name'] +
+                                       data.metadata['file_type'])
+        assert file_path.is_file()
+
+    tmpd.cleanup()
+
+
+@pytest.mark.parametrize("dataset", ALL_DATASETS)
+def test_fetch(dataset, datasets):
+    tmpd = TemporaryDirectory()
+    cfg = create_config(custom_path=tmpd.name)
+    datasets.set_config(cfg.name)
+
+    E = dataset.get_edgelist(fetch=True)
+
+    assert E is not None
+    assert dataset.get_path().is_file()
+
+    tmpd.cleanup()
+
+
+@pytest.mark.parametrize("dataset", ALL_DATASETS)
+def test_get_edgelist(dataset, datasets):
+    tmpd = TemporaryDirectory()
+    datasets.set_download_dir(tmpd.name)
+    E = dataset.get_edgelist(fetch=True)
+
+    assert E is not None
+
+    tmpd.cleanup()
+
+
+@pytest.mark.parametrize("dataset", ALL_DATASETS)
+def test_get_graph(dataset, datasets):
+    tmpd = TemporaryDirectory()
+    datasets.set_download_dir(tmpd.name)
+    G = dataset.get_graph(fetch=True)
+
+    assert G is not None
+
+    tmpd.cleanup()
+
+
+@pytest.mark.parametrize("dataset", ALL_DATASETS)
+def test_metadata(dataset):
+    M = dataset.metadata
+
+    assert M is not None
+
+
+@pytest.mark.parametrize("dataset", ALL_DATASETS)
+def test_get_path(dataset, datasets):
+    tmpd = TemporaryDirectory()
+    datasets.set_download_dir(tmpd.name)
+    dataset.get_edgelist(fetch=True)
+
+    assert dataset.get_path().is_file()
+    tmpd.cleanup()
+
+
+# Path is None until a dataset initializes its edgelist
+@pytest.mark.parametrize("dataset", ALL_DATASETS)
+def test_get_path_raises(dataset):
+    with pytest.raises(RuntimeError):
+        dataset.get_path()
diff --git a/python/cugraph/setup.py b/python/cugraph/setup.py
index 91bdea4a2bc..c2d994969c0 100644
--- a/python/cugraph/setup.py
+++ b/python/cugraph/setup.py
@@ -191,6 +191,11 @@ def finalize_options(self):
       setup_requires=['Cython>=0.29,<0.30'],
       ext_modules=extensions,
       packages=find_packages(include=['cugraph', 'cugraph.*']),
+      include_package_data=True,
+      package_data={
+          'cugraph.experimental.datasets': ['*.yaml',
+                                            'metadata/*.yaml'],
+      },
       install_requires=INSTALL_REQUIRES,
       license="Apache",
       cmdclass=cmdclass,