diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index 1c24389bafc..3ecb9723887 100644 --- a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -27,4 +27,6 @@ from cugraph.experimental.community.triangle_count import \ EXPERIMENTAL__triangle_count -triangle_count = experimental_warning_wrapper(EXPERIMENTAL__triangle_count) \ No newline at end of file +triangle_count = experimental_warning_wrapper(EXPERIMENTAL__triangle_count) + +from cugraph.experimental.datasets.dataset import Dataset diff --git a/python/cugraph/cugraph/experimental/datasets/__init__.py b/python/cugraph/cugraph/experimental/datasets/__init__.py new file mode 100644 index 00000000000..675961975b1 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Public entry point of the experimental datasets package: exposes the
# Dataset class and one ready-made Dataset instance per bundled metadata file.
from cugraph.experimental.datasets.dataset import Dataset
from cugraph.experimental.datasets import metadata

# ---------------------------------------------------------------------------
# Small datasets
# Each path is a metadata file name handed to the Dataset constructor.
# ---------------------------------------------------------------------------
karate = Dataset("metadata/karate.yaml")
dolphins = Dataset("metadata/dolphins.yaml")
polbooks = Dataset("metadata/polbooks.yaml")
netscience = Dataset("metadata/netscience.yaml")

# ---------------------------------------------------------------------------
# Medium datasets (none defined yet)
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
# Large datasets (none defined yet)
# ---------------------------------------------------------------------------

# ---------------------------------------------------------------------------
# Groups of datasets
# NOTE(review): polbooks and netscience are not part of this group -
# confirm whether that is intentional.
# ---------------------------------------------------------------------------
SMALL_DATASETS = [karate, dolphins]
class Dataset:
    """
    A graph dataset described by a YAML metadata file.

    The metadata file is opened relative to ``dir_path`` and is stored as a
    dict in ``self.metadata``. The keys read by this class are ``url``,
    ``path``, ``delimiter``, ``col_names``, ``col_types`` and
    ``is_directed``.

    Parameters
    ----------
    meta_data_file_name : str
        Metadata file name relative to ``dir_path``,
        e.g. "metadata/karate.yaml".
    """

    def __init__(self, meta_data_file_name):
        # NOTE(review): both directories are relative paths, so they resolve
        # against the current working directory; this looks like it assumes
        # the process runs from the repository root - confirm.
        self.dir_path = "python/cugraph/cugraph/experimental/datasets/"
        self.download_dir = "datasets/"
        # The config file may override the default download_dir set above.
        self.__read_config()
        self.__meta_data_file_name = meta_data_file_name
        self.__read_meta_data_file(self.__meta_data_file_name)
        self.__edgelist = None
        self.__graph = None

    def __read_meta_data_file(self, meta_data_file):
        """Load the YAML metadata file into ``self.metadata``."""
        with open(self.dir_path + meta_data_file, 'r') as file:
            self.metadata = yaml.safe_load(file)

    def __read_config(self):
        """
        Read ``datasets_config.yaml`` and apply its ``download_dir``.

        The current ``download_dir`` is kept when the config file is empty
        or does not define the key.
        """
        config_path = os.path.join(self.dir_path, "datasets_config.yaml")
        with open(config_path, 'r') as file:
            # safe_load returns None for an empty document.
            cfg = yaml.safe_load(file) or {}
        self.download_dir = cfg.get('download_dir', self.download_dir)

    @staticmethod
    def __write_meta(meta_file, meta):
        """Write the metadata dict ``meta`` to ``meta_file`` as YAML."""
        with open(meta_file, 'w') as file:
            yaml.dump(meta, file, sort_keys=False)

    def __fetch_file(self, url, download_dir):
        """
        Download ``url`` into ``download_dir`` and return the local path.

        The payload is written byte-for-byte; it is deliberately not parsed
        and re-serialised, because a CSV round trip with default header and
        delimiter settings is not guaranteed to reproduce the file.
        """
        filename = url.split('/')[-1]
        local_path = os.path.join(download_dir, filename)
        if download_dir:
            os.makedirs(download_dir, exist_ok=True)

        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(local_path, 'wb') as datafile:
            datafile.write(response.content)
        return local_path

    def __download_csv(self, url, default_path):
        """
        Download this dataset's data file and record where it was stored.

        ``metadata['path']`` is updated in memory and written back to this
        dataset's own metadata file.
        """
        self.metadata['path'] = self.__fetch_file(url, default_path)
        self.__write_meta(self.dir_path + self.__meta_data_file_name,
                          self.metadata)

    def get_edgelist(self, fetch=False):
        """
        Return an Edgelist

        The edgelist is read once and cached; later calls return the same
        object.

        Parameters
        ----------
        fetch : Boolean (default=False)
            Automatically fetch for the dataset from the 'url' location
            within the YAML file.

        Raises
        ------
        RuntimeError
            If the data file is missing (or the metadata has no 'path')
            and ``fetch`` is False.
        """
        if self.__edgelist is None:
            path = self.metadata.get('path')
            if not path or not os.path.isfile(path):
                if not fetch:
                    raise RuntimeError("The datafile does not exist. Try get_edgelist(fetch=True) to download the datafile")
                self.__download_csv(self.metadata['url'], self.download_dir)

            self.__edgelist = cudf.read_csv(
                self.metadata['path'],
                delimiter=self.metadata['delimiter'],
                names=self.metadata['col_names'],
                dtype=self.metadata['col_types'])

        return self.__edgelist

    def get_graph(self, fetch=False):
        """
        Return a Graph object.

        A new Graph is built from the cached edgelist on every call, using
        the 'src' and 'dst' columns; no other column is passed to the graph.

        Parameters
        ----------
        fetch : Boolean (default=False)
            Automatically fetch for the dataset from the 'url' location
            within the YAML file.
        """
        if self.__edgelist is None:
            self.get_edgelist(fetch)

        self.__graph = cugraph.Graph(directed=self.metadata['is_directed'])
        self.__graph.from_cudf_edgelist(self.__edgelist,
                                        source='src',
                                        destination='dst')

        return self.__graph

    def load_all(self):
        """
        Looks in `metadata` directory and fetches all datafiles from the web.

        Every ``*.yaml`` file found there is downloaded into
        ``download_dir`` and the resulting location is written to the
        ``path`` key of that dataset's own metadata file. Only this
        instance's own entry changes ``self.metadata``.
        """
        meta_dir = os.path.join(self.dir_path, 'metadata')
        own_meta_file = os.path.normpath(
            os.path.join(self.dir_path, self.__meta_data_file_name))

        for entry in sorted(os.listdir(meta_dir)):
            if not entry.endswith('.yaml'):
                continue

            meta_file = os.path.join(meta_dir, entry)
            if os.path.normpath(meta_file) == own_meta_file:
                # Keep the in-memory metadata of this instance in sync.
                self.__download_csv(self.metadata['url'], self.download_dir)
                continue

            with open(meta_file, 'r') as metafile:
                meta = yaml.safe_load(metafile)
            # Record the path against the dataset it belongs to, never
            # against this instance's metadata.
            meta['path'] = self.__fetch_file(meta['url'], self.download_dir)
            self.__write_meta(meta_file, meta)
b/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml @@ -0,0 +1,22 @@ +name: dolphins +author: D. Lusseau +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/dolphins.csv +refs: D. Lusseau, K. Schneider, O. J. Boisseau, P. Haase, E. Slooten, and S. M. Dawson, + The bottlenose dolphin community of Doubtful Sound features a large proportion of + long-lasting associations, Behavioral Ecology and Sociobiology 54, 396-405 (2003). +col_names: +- src +- dst +- wgt +col_types: +- int32 +- int32 +- float32 +delimiter: ' ' +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 159 +number_of_nodes: 62 +path: python/cugraph/dolphins.csv diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml new file mode 100644 index 00000000000..2740e33c845 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml @@ -0,0 +1,19 @@ +name: karate +author: Zachary W. +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-data.csv +refs: W. W. Zachary, An information flow model for conflict and fission in small groups, + Journal of Anthropological Research 33, 452-473 (1977). 
+delimiter: "\t" +col_names: +- src +- dst +col_types: +- int32 +- int32 +has_loop: true +is_directed: true +is_multigraph: false +is_symmetric: true +number_of_edges: 156 +number_of_nodes: 34 +path: datasets2/dolphins.csv diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml new file mode 100644 index 00000000000..5de8acd232b --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml @@ -0,0 +1,20 @@ +name: netscience +author: Newman, Mark EJ +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/netscience.csv +refs: Finding community structure in networks using the eigenvectors of matrices. +delimiter: " " +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 441 +number_of_nodes: 105 +path: datasets/netscience.csv diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml new file mode 100644 index 00000000000..c0e310f3626 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml @@ -0,0 +1,18 @@ +name: polbooks +author: V. Krebs +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/polbooks.csv +refs: null +delimiter: null +col_names: +- src +- dst +col_types: +- int32 +- int32 +has_loop: null +is_directed: false +is_multigraph: null +is_symmetric: true +number_of_edges: 441 +number_of_nodes: 105 +path: datasets2/polbooks.csv diff --git a/python/cugraph/cugraph/tests/test_dataset.py b/python/cugraph/cugraph/tests/test_dataset.py new file mode 100644 index 00000000000..c55459df2f3 --- /dev/null +++ b/python/cugraph/cugraph/tests/test_dataset.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
# =============================================================================
# Pytest Setup / Teardown - called for each test function
# =============================================================================
def setup_function():
    """Run a garbage-collection pass before a test starts."""
    gc.collect()


# TODO: cover config options and the fetch=False error path.
# TODO: decide whether downloaded datasets should be stored under
#       experimental/datasets/.
# TODO: test that no fetches are redundant, i.e. a dataset that has already
#       been fetched (or already exists) is not fetched again.
@pytest.mark.parametrize("dataset", SMALL_DATASETS)
def test_getters(dataset):
    """
    Check that both getters of a Dataset return usable objects.

    The edgelist must carry the column names declared in the metadata and
    must be served from the cache on a second call.
    """
    edgelist = dataset.get_edgelist(fetch=True)
    assert edgelist is not None
    assert list(edgelist.columns) == dataset.metadata['col_names']
    # A second call returns the cached object rather than re-reading the file.
    assert dataset.get_edgelist(fetch=True) is edgelist

    graph = dataset.get_graph(fetch=True)
    assert graph is not None