diff --git a/.bazelversion b/.bazelversion index 0c89fc92..3bff0591 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -4.0.0 \ No newline at end of file +5.1.1 \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 9190e38a..851016e6 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -14,7 +14,7 @@ Please replace every line in curly brackets ( { like this } ) with appropriate a 1. OS (where TypeDB server runs): { e.g. Mac OS 10, Windows 10, Ubuntu 16.4, etc. } 2. TypeDB version (and platform): { e.g. TypeDB 2.1.0, or TypeDB Cluster 2.1.1 on Google Cloud } -3. TypeDB KGLIB and client-python version: { e.g. KGLIB 0.1 and client-python 1.4 } +3. TypeDB, typedb-ml and client-python version: { e.g. typedb-ml 0.2 and client-python 1.4 } 4. Python version: { e.g. 2.7, 3.6, etc. } 5. Other environment details: diff --git a/.gitignore b/.gitignore index b88ea66e..66309ed5 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,6 @@ tmp/ __pycache__/ # Data input/output directories -kglib/kgcn_tensorflow/examples/diagnosis/events/ +examples/diagnosis/events/ *.egg-info/ \ No newline at end of file diff --git a/.grabl/automation.yml b/.grabl/automation.yml index fd63976c..edb2c377 100644 --- a/.grabl/automation.yml +++ b/.grabl/automation.yml @@ -24,8 +24,6 @@ config: dependencies: dependencies: [build] typedb-client-python: [build, release] - typedb-common: [build, release] - typedb: [build, release] build: quality: @@ -37,8 +35,8 @@ build: build: image: vaticle-ubuntu-20.04 command: | - pyenv install -s 3.6.3 - pyenv global 3.6.3 system + pyenv install -s 3.7.2 + pyenv global 3.7.2 system bazel build //... bazel run @vaticle_dependencies//tool/checkstyle:test-coverage bazel test $(bazel query 'kind(checkstyle_test, //...)') --test_output=errors @@ -46,38 +44,47 @@ build: image: vaticle-ubuntu-20.04 command: | find . 
-name \*.md | xargs -L1 npx markdown-link-check@3.8.0 -v - test-kgcn-data-loader: + test-pytorch-geometric: image: vaticle-ubuntu-20.04 timeout: "10m" command: | - pyenv install -s 3.6.3 - pyenv global 3.6.3 system - bazel test //kglib/utils/... --test_output=streamed --spawn_strategy=standalone --action_env=PATH - test-utils: + pyenv install -s 3.7.2 + pyenv global 3.7.2 system + bazel test //typedb_ml/pytorch_geometric/... --test_output=streamed --spawn_strategy=standalone --action_env=PATH + test-typedb: image: vaticle-ubuntu-20.04 timeout: "10m" command: | - pyenv install -s 3.6.3 - pyenv global 3.6.3 system - bazel test //kglib/kgcn_data_loader/... --test_output=streamed --spawn_strategy=standalone --action_env=PATH - test-kgcn-tensorflow: + pyenv install -s 3.7.2 + pyenv global 3.7.2 system + bazel test //typedb_ml/typedb/... --test_output=streamed --spawn_strategy=standalone --action_env=PATH + test-networkx: image: vaticle-ubuntu-20.04 timeout: "10m" command: | - pyenv install -s 3.6.3 - pyenv global 3.6.3 system - bazel test //kglib/kgcn_tensorflow/... --test_output=streamed --spawn_strategy=standalone --action_env=PATH + pyenv install -s 3.7.2 + pyenv global 3.7.2 system + bazel test //typedb_ml/networkx/... --test_output=streamed --spawn_strategy=standalone --action_env=PATH + test-examples: + image: vaticle-ubuntu-20.04 + timeout: "10m" + command: | + pyenv install -s 3.7.2 + pyenv global 3.7.2 system + bazel test //examples/... 
--test_output=streamed --spawn_strategy=standalone --action_env=PATH test-end-to-end: image: vaticle-ubuntu-20.04 - timeout: "30m" + timeout: "10m" command: | - pyenv install -s 3.6.3 - pyenv global 3.6.3 system - bazel test //kglib/tests/end_to_end:diagnosis --test_output=streamed --spawn_strategy=standalone --action_env=PATH + pyenv install -s 3.7.2 + pyenv global 3.7.2 system + bazel test //tests/end_to_end:diagnosis --test_output=streamed --spawn_strategy=standalone --action_env=PATH deploy-pip-snapshot: image: vaticle-ubuntu-20.04 - dependencies: [build, test-kgcn-data-loader, test-utils, test-kgcn-tensorflow, test-end-to-end] + dependencies: [build, test-pytorch-geometric, test-typedb, test-networkx, test-examples, test-end-to-end] command: | + pyenv install -s 3.7.2 + pyenv global 3.7.2 system export DEPLOY_PIP_USERNAME=$REPO_VATICLE_USERNAME export DEPLOY_PIP_PASSWORD=$REPO_VATICLE_PASSWORD bazel run --define version=$(git rev-parse HEAD) //:deploy-pip -- snapshot @@ -89,15 +96,17 @@ build: branch: master type: foreground command: | - pyenv global 3.6.10 + pyenv install -s 3.7.2 + pyenv global 3.7.2 pip3 install -U pip sudo unlink /usr/bin/python3 sudo ln -s $(which python3) /usr/bin/python3 - sudo ln -s /usr/share/pyshared/lsb_release.py /opt/pyenv/versions/3.6.10/lib/python3.6/site-packages/lsb_release.py - bazel run //test:typedb-extractor -- typedb-all-linux + sudo ln -s /usr/share/pyshared/lsb_release.py /opt/pyenv/versions/3.7.2/lib/python3.7/site-packages/lsb_release.py + bazel run //tests/end_to_end:typedb-extractor-linux -- typedb-all-linux ./typedb-all-linux/typedb server & - pip install --extra-index-url https://repo.vaticle.com/repository/pypi-snapshot/simple typedb-kglib==0.0.0-$GRABL_COMMIT - cd kglib/tests/deployment/ && python -m unittest kgcn.diagnosis && export TEST_SUCCESS=0 || + pip install -r requirements.txt + pip install --extra-index-url https://repo.vaticle.com/repository/pypi-snapshot/simple typedb-ml==0.0.0-$GRABL_COMMIT + python 
-m examples.diagnosis.diagnosis "./typedb-all-linux" && export TEST_SUCCESS=0 || export TEST_SUCCESS=1 kill $(jps | awk '/TypeDBServer/ {print $1}') exit $TEST_SUCCESS @@ -108,27 +117,27 @@ release: validation: validate-dependencies: image: vaticle-ubuntu-20.04 - command: bazel test //:release-validate-deps --test_output=streamed + command: bazel test //:release-validate-python-deps --test_output=streamed deployment: deploy-github: image: vaticle-ubuntu-20.04 command: | - pyenv install -s 3.6.10 - pyenv global 3.6.10 system + pyenv install -s 3.7.2 + pyenv global 3.7.2 system pip3 install -U pip pip install certifi export ARTIFACT_USERNAME=$REPO_VATICLE_USERNAME export ARTIFACT_PASSWORD=$REPO_VATICLE_PASSWORD bazel run @vaticle_dependencies//distribution/artifact:create-netrc - export RELEASE_NOTES_TOKEN=$REPO_GITHUB_TOKEN - bazel run @vaticle_dependencies//tool/release:create-notes -- kglib $(cat VERSION) ./RELEASE_TEMPLATE.md + export NOTES_CREATE_TOKEN=$REPO_GITHUB_TOKEN + bazel run @vaticle_dependencies//tool/release/notes:create -- $GRABL_OWNER $GRABL_REPO $GRABL_COMMIT $(cat VERSION) ./RELEASE_TEMPLATE.md export DEPLOY_GITHUB_TOKEN=$REPO_GITHUB_TOKEN bazel run --define version=$(cat VERSION) //:deploy-github -- $GRABL_COMMIT deploy-pip-release: image: vaticle-ubuntu-20.04 command: | - pyenv install -s 3.6.10 - pyenv global 3.6.10 system + pyenv install -s 3.7.2 + pyenv global 3.7.2 system pip3 install -U pip export ARTIFACT_USERNAME=$REPO_VATICLE_USERNAME export ARTIFACT_PASSWORD=$REPO_VATICLE_PASSWORD diff --git a/BUILD b/BUILD index b4ffda49..1b2c96d3 100644 --- a/BUILD +++ b/BUILD @@ -23,27 +23,25 @@ exports_files(["requirements.txt", "RELEASE_TEMPLATE.md"]) load("@rules_python//python:defs.bzl", "py_library", "py_test") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") +load("@vaticle_typedb_ml_pip//:requirements.bzl", vaticle_typedb_ml_requirement = "requirement") 
load("@vaticle_bazel_distribution//github:rules.bzl", "deploy_github") load("@vaticle_bazel_distribution//pip:rules.bzl", "assemble_pip", "deploy_pip") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") +load("@vaticle_typedb_ml_pip//:requirements.bzl", vaticle_typedb_ml_requirement = "requirement") load("@vaticle_dependencies//distribution:deployment.bzl", "deployment") load("//:deployment.bzl", github_deployment = "deployment") -load("@vaticle_dependencies//tool/release/deps:rules.bzl", "release_validate_deps") +load("@vaticle_dependencies//tool/release/deps:rules.bzl", "release_validate_python_deps") load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") assemble_pip( name = "assemble-pip", - target = "//kglib:kglib", - package_name = "vaticle-kglib", + target = "//typedb_ml:typedb-ml", + package_name = "typedb-ml", classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Intended Audience :: Developers", @@ -54,11 +52,11 @@ assemble_pip( "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules" ], - url = "https://github.com/vaticle/kglib", + url = "https://github.com/vaticle/typedb-ml", author = "Vaticle", author_email = "community@vaticle.com", license = "Apache-2.0", - requirements_file = "//:requirements.txt", + requirements_file = "//:install_requires.txt", keywords = ["machine learning", "logical reasoning", "knowledege graph", "typedb", "database", "graph", "knowledgebase", "knowledge-engineering"], @@ -73,14 +71,12 @@ deploy_pip( release = deployment["pypi.release"], ) -release_validate_deps( - name = "release-validate-deps", - refs = "@vaticle_kglib_workspace_refs//:refs.json", +release_validate_python_deps( + name = "release-validate-python-deps", + requirements = 
"//:requirements.txt", tagged_deps = [ - "@vaticle_typedb", - "@vaticle_typedb_client_python", + "typedb-client", ], - tags = ["manual"] ) checkstyle_test( @@ -89,6 +85,13 @@ checkstyle_test( "*", ".grabl/*", ]), + exclude = glob([ + "*.md" + ]) + [ + ".bazelversion", + "LICENSE", + "VERSION", + ], license_type = "apache-header", ) diff --git a/README.md b/README.md index bbb860a2..6fa9dd6f 100644 --- a/README.md +++ b/README.md @@ -1,68 +1,71 @@ -[![GitHub release](https://img.shields.io/github/release/vaticle/kglib.svg)](https://github.com/vaticle/typedb/releases/latest) +[![GitHub release](https://img.shields.io/github/release/vaticle/typedb-ml.svg)](https://github.com/vaticle/typedb/releases/latest) [![Discord](https://img.shields.io/discord/665254494820368395?color=7389D8&label=chat&logo=discord&logoColor=ffffff)](https://vaticle.com/discord) [![Discussion Forum](https://img.shields.io/discourse/https/forum.vaticle.com/topics.svg)](https://forum.vaticle.com) [![Stack Overflow](https://img.shields.io/badge/stackoverflow-typedb-796de3.svg)](https://stackoverflow.com/questions/tagged/typedb) [![Stack Overflow](https://img.shields.io/badge/stackoverflow-typeql-3dce8c.svg)](https://stackoverflow.com/questions/tagged/typeql) -# TypeDB KGLIB (Knowledge Graph Library) +# TypeDB ML +_Previously known as KGLIB._ -**KGLIB provides tools to enable machine learning with [TypeDB](https://github.com/vaticle/typedb).** +**TypeDB ML provides tools to enable graph algorithms and machine learning with [TypeDB](https://github.com/vaticle/typedb).** -This library is under development and will henceforth be transformed into primarily infrastructure tools and integrations between TypeDB and machine learning libraries. +There are integrations for [NetworkX](https://networkx.org) and for [PyTorch Geometric (PyG)](https://github.com/pyg-team/pytorch_geometric). 
-## Machine Learning Pipeline +[NetworkX](https://networkx.org) integration allows you to use a [large library of algorithms](https://networkx.org/documentation/stable/reference/algorithms/index.html) over graph data exported from TypeDB. -![Flow Diagram](kglib/kgcn_tensorflow/.images/knowledge_graph_machine_learning.png) +[PyTorch Geometric (PyG)](https://github.com/pyg-team/pytorch_geometric) integration gives you a toolbox to build Graph Neural Networks (GNNs) for your TypeDB data, with an example included for link prediction (or: binary relation prediction, in TypeDB terms). The structure of the GNNs are totally customisable, with network components for popular topics such as graph attention and graph transformers built-in. -The pipeline provided helps by allowing us to extract subgraphs from TypeDB. Each subgraph is a training example, which are sent to the learner in batches. Algorithms using this approach are scalable since they do not need to hold the whole graph in memory for training. +## Features -The pipeline is as follows: -1. Extract data from `TypeDB` into Python [NetworkX](https://networkx.org) in-memory subgraphs by specifying multiple [TypeQL](https://github.com/vaticle/typeql) queries. -2. Encode the nodes and edges of the NetworkX graphs -3. Either (a) transform the encoded values into features, ready for input into a graph/geometric learning pipeline (for example the upcoming PyTorch implementation); or (b) Embed the encoded values according to the Types present in your database (TensorFlow only, PyTorch coming soon). This type-centric embedding is crucial to extracting the context explicitly captured in TypeDB's Type System. -4. Feed the features to a learning algorithm (see below) -5. Optionally, store the predictions made by the learner in TypeDB. These predictions can then be queried using TypeQL. This means we can trivially run more learning tasks over the knowledge base, including the newly made predictions. 
This is knowledge graph completion. - -## Learning Algorithms -This repo contains one algorithmic implementation: [*Knowledge Graph Convolutional Network* (KGCN)](kglib/kgcn_tensorflow). This is a generic method for relation predication over any TypeDB database. There is a [full worked example](kglib/kgcn_tensorflow/examples/diagnosis) and an explanation of how the approach works. - -You are encouraged to use the tools available in KGLIB to interface TypeDB to your own algorithmic implementations, or to use/leverage prebuilt implementations available from popular libraries such as [PyTorch Geometric](https://github.com/rusty1s/pytorch_geometric) or [Graph Nets](https://github.com/deepmind/graph_nets) (TensorFlow/Sonnet). +### NetworkX +- Declare the graph structure of your queries, with optional sampling functions. +- Query a TypeDB instance and combine many results across many queries into a single graph (`build_graph_from_queries`). +### PyTorch Geometric +- A `DataSet` object to lazily load graphs from a TypeDB instance. Each graph is converted to a PyG `Data` object. +- It's most natural to work with `HeteroData` objects since all data in TypeDB has a type. This conversion is available by default in PyG, but TypeDB-ML provides `store_concepts_by_type` to map concepts by type so that they can be re-associated after learning is finished. +- A `FeatureEncoder` to orchestrate encoders to generate features for graphs. +- Encoders for Continuous and Categorical values to apply encodings/embedding spaces to the types and attribute values present in TypeDB data. 
+- A [full example for link prediction](examples/diagnosis) +### Other +- Example usage of Tensorboard for PyG `HeteroData` ## Resources -You may find the following resources useful: -- [Strongly Typed Data for Machine Learning](https://www.youtube.com/watch?v=qhUyurWMiSQ) (YouTube) -- [How Can We Complete a Knowledge Graph?](https://www.youtube.com/watch?v=nYDi1_UaFtU) (YouTube) +You may find the following resources useful, particularly to understand why TypeDB-ML started: +- [Strongly Typed Data for Machine Learning](https://www.youtube.com/watch?v=qhUyurWMiSQ) (YouTube, 2021) +- [How Can We Complete a Knowledge Graph?](https://www.youtube.com/watch?v=nYDi1_UaFtU) (YouTube, 2018) ## Quickstart -### Requirements +### Install + +- Python >= 3.7.x -- Python >= 3.6, <= 3.7.x (TensorFlow 1.14.0 doesn't support later Python versions). +- Grab the `requirements.txt` file from [here](requirements.txt) and install the requirements with `pip install requirements.txt`. This is due to some intricacies installing PyG's dependencies, see [here](https://github.com/pyg-team/pytorch_geometric/issues/861) for details. -- KGLIB installed via pip: `pip install typedb-kglib`. +- Installed TypeDB-ML: `pip install typedb-ml`. -- [TypeDB 2.1.1](https://github.com/vaticle/typedb/releases) running in the background. +- [TypeDB 2.11.1](https://github.com/vaticle/typedb/releases) running in the background. -- `typedb-client-python` 2.1.0 ([PyPi](https://pypi.org/project/typedb-client/), [GitHub release](https://github.com/vaticle/typedb-client-python/releases)). This should be installed automatically when you `pip install typedb-kglib`. +- `typedb-client-python` 2.11.x ([PyPi](https://pypi.org/project/typedb-client/), [GitHub release](https://github.com/vaticle/typedb-client-python/releases)). This should be installed automatically when you `pip install typedb-ml`. 
### Run the Example -Take a look at [*Knowledge Graph Convolutional Networks* (KGCNs)](kglib/kgcn_tensorflow) to see a walkthrough of how to use the library. +Take a look at the [PyTorch Geometric heterogeneous link prediction example](examples/diagnosis) to see how to use TypeDB-ML to build a GNN on TypeDB data. ### Building from source -It's expected that you will use Pip to install, but should you need to make your own changes to the library, and import it into your project, you can build from source as follows. +It's expected that you will use Pip to install, but should you need to make your own changes to the library, and import it into your project, you can build from source as follows: -Clone KGLIB: +Clone TypeDB-ML: ``` -git clone git@github.com:vaticle/kglib.git +git clone git@github.com:vaticle/typedb-ml.git ``` Go into the project directory: ``` -cd kglib +cd typedb-ml ``` Build all targets: @@ -71,10 +74,10 @@ Build all targets: bazel build //... ``` -Run all tests. Requires Python 3.6+ on your `PATH`. Test dependencies are for Linux since that is the CI environment: +Run all tests. Requires Python 3.7+ on your `PATH`. Test dependencies are for Linux since that is the CI environment: ``` -bazel test //kglib/... --test_output=streamed --spawn_strategy=standalone --action_env=PATH +bazel test //typedb_ml/... --test_output=streamed --spawn_strategy=standalone --action_env=PATH ``` Build the pip distribution. Outputs to `bazel-bin`: @@ -85,6 +88,7 @@ bazel build //:assemble-pip ## Development -To follow the development conversation, please join the [Vaticle Discord](https://discord.com/invite/grakn), and join the `#kglib` channel. Alternatively, start a new topic on the [Vaticle Discussion Forum](https://forum.vaticle.com). +To follow the development conversation, please join the [Vaticle Discord](https://discord.com/invite/vaticle), and join the `#typedb-ml` channel. 
Alternatively, start a new topic on the [Vaticle Discussion Forum](https://forum.vaticle.com). -KGLIB requires that you have migrated your data into a [TypeDB](https://github.com/vaticle/typedb) or TypeDB Cluster instance. There is an [official examples repo](https://github.com/vaticle/examples) for how to go about this, and information available on [migration in the docs](https://docs.vaticle.com/docs/examples/phone-calls-migration-python). Alternatively, there are fantastic community-led projects growing in the [TypeDB OSI](https://typedb.org) to facilitate fast and easy data loading. +TypeDB-ML requires that you have migrated your data into a [TypeDB](https://github.com/vaticle/typedb) or TypeDB +Cluster instance. There is an [official examples repo](https://github.com/vaticle/examples) for how to go about this, and information available on [migration in the docs](https://docs.vaticle.com/docs/examples/phone-calls-migration-python). Alternatively, there are fantastic community-led projects growing in the [TypeDB OSI](https://typedb.org) to facilitate fast and easy data loading, for example [TypeDB Loader](https://github.com/typedb-osi/typedb-loader). diff --git a/VERSION b/VERSION index ee1372d3..0d91a54c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.2 +0.3.0 diff --git a/WORKSPACE b/WORKSPACE index 9c92ffc5..87c4ebbf 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -19,7 +19,7 @@ # under the License. 
# -workspace(name = "vaticle_kglib") +workspace(name = "vaticle_typedb_ml") ################################ # Load @vaticle_dependencies # @@ -46,13 +46,13 @@ kt_register_toolchains() # Load //builder/python load("@vaticle_dependencies//builder/python:deps.bzl", python_deps = "deps") python_deps() -load("@rules_python//python:pip.bzl", "pip_install") +load("@rules_python//python:pip.bzl", "pip_repositories") +pip_repositories() # Load //builder/grpc load("@vaticle_dependencies//builder/grpc:deps.bzl", grpc_deps = "deps") grpc_deps() -load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", -com_github_grpc_grpc_deps = "grpc_deps") +load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", com_github_grpc_grpc_deps = "grpc_deps") com_github_grpc_grpc_deps() # Load //tool/common @@ -89,23 +89,21 @@ github_deps() # Load @vaticle dependencies # ################################ +load("//dependencies/vaticle:artifacts.bzl", "vaticle_typedb_artifacts") +vaticle_typedb_artifacts() + load("//dependencies/vaticle:repositories.bzl", "vaticle_typedb_client_python") vaticle_typedb_client_python() +load("@rules_python//python:pip.bzl", "pip_install") pip_install( name = "vaticle_typedb_client_python_pip", requirements = "@vaticle_typedb_client_python//:requirements.txt", ) -load("//dependencies/vaticle:repositories.bzl", "vaticle_common") -vaticle_common() - -load("//dependencies/vaticle:artifacts.bzl", "vaticle_typedb_artifacts") -vaticle_typedb_artifacts() - pip_install( - name = "vaticle_kglib_pip", - requirements = "//:requirements-dev.txt", + name = "vaticle_typedb_ml_pip", + requirements = "//:requirements_dev.txt", ) ############################ @@ -121,5 +119,5 @@ maven(vaticle_dependencies_tool_maven_artifacts) load("@vaticle_bazel_distribution//common:rules.bzl", "workspace_refs") workspace_refs( - name = "vaticle_kglib_workspace_refs" + name = "vaticle_typedb_ml_workspace_refs" ) diff --git a/dependencies/vaticle/artifacts.bzl b/dependencies/vaticle/artifacts.bzl index 
05bae53b..955b4566 100644 --- a/dependencies/vaticle/artifacts.bzl +++ b/dependencies/vaticle/artifacts.bzl @@ -29,5 +29,5 @@ def vaticle_typedb_artifacts(): artifact_name = "typedb-all-{platform}-{version}.{ext}", tag_source = deployment["artifact.release"], commit_source = deployment["artifact.snapshot"], - commit = "f91efad669b217805398b0ee05ebd24b9d9b4684", + tag = "2.11.1", ) diff --git a/dependencies/vaticle/repositories.bzl b/dependencies/vaticle/repositories.bzl index 13de3871..b7025ee4 100644 --- a/dependencies/vaticle/repositories.bzl +++ b/dependencies/vaticle/repositories.bzl @@ -32,12 +32,5 @@ def vaticle_typedb_client_python(): git_repository( name = "vaticle_typedb_client_python", remote = "https://github.com/vaticle/typedb-client-python", - commit = "d08a6c36ac4ef20f2df26017cf42e17fba230387" # sync-marker: do not remove this comment, this is used for sync-dependencies by @vaticle_typedb_client_python - ) - -def vaticle_common(): - git_repository( - name = "vaticle_common", - remote = "https://github.com/vaticle/typedb-common", - commit = "f0dd708adaea9fe1fdc3699180797a12166d33e8" # sync-marker: do not remove this comment, this is used for sync-dependencies by @vaticle_common + tag = "2.11.1" # sync-marker: do not remove this comment, this is used for sync-dependencies by @vaticle_typedb_client_python ) diff --git a/deployment.bzl b/deployment.bzl index bde4afa8..e38acea8 100644 --- a/deployment.bzl +++ b/deployment.bzl @@ -21,5 +21,5 @@ deployment = { "github.organisation" : "vaticle", - "github.repository" : "kglib" + "github.repository" : "typedb-ml" } \ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/BUILD b/examples/diagnosis/.images/BUILD similarity index 100% rename from kglib/kgcn_tensorflow/.images/BUILD rename to examples/diagnosis/.images/BUILD diff --git a/examples/diagnosis/.images/diagnosis_data.png b/examples/diagnosis/.images/diagnosis_data.png new file mode 100644 index 00000000..77e05a6a Binary files /dev/null 
and b/examples/diagnosis/.images/diagnosis_data.png differ diff --git a/examples/diagnosis/.images/diagnosis_schema.png b/examples/diagnosis/.images/diagnosis_schema.png new file mode 100644 index 00000000..5af94ad9 Binary files /dev/null and b/examples/diagnosis/.images/diagnosis_schema.png differ diff --git a/kglib/kgcn_tensorflow/.images/knowledge_graph_machine_learning.png b/examples/diagnosis/.images/knowledge_graph_machine_learning.png similarity index 100% rename from kglib/kgcn_tensorflow/.images/knowledge_graph_machine_learning.png rename to examples/diagnosis/.images/knowledge_graph_machine_learning.png diff --git a/examples/diagnosis/.images/successful_prediction.png b/examples/diagnosis/.images/successful_prediction.png new file mode 100644 index 00000000..7a97d41e Binary files /dev/null and b/examples/diagnosis/.images/successful_prediction.png differ diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/BUILD b/examples/diagnosis/BUILD similarity index 66% rename from kglib/kgcn_tensorflow/examples/diagnosis/BUILD rename to examples/diagnosis/BUILD index 6d37c02d..ecd72df3 100644 --- a/kglib/kgcn_tensorflow/examples/diagnosis/BUILD +++ b/examples/diagnosis/BUILD @@ -20,37 +20,26 @@ # load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") +load("@vaticle_typedb_ml_pip//:requirements.bzl", vaticle_typedb_ml_requirement = "requirement") load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") -py_test( - name = "diagnosis_test", - srcs = [ - "diagnosis_test.py" - ], - deps = [ - "diagnosis", - "//kglib/utils/graph/test", - vaticle_kglib_requirement('numpy'), - vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('decorator'), - ] -) - py_library( name = "diagnosis", srcs = [ 'diagnosis.py' ], deps = [ - "//kglib/kgcn_tensorflow/pipeline", - "//kglib/kgcn_tensorflow/learn", - "//kglib/kgcn_tensorflow/plot", - 
"//kglib/kgcn_tensorflow/models", - "//kglib/utils/typedb/synthetic", - "//kglib/utils/typedb/type", + "//examples/diagnosis/dataset", + "//typedb_ml/pytorch_geometric", + "//typedb_ml/networkx", + "//typedb_ml/typedb", "@vaticle_typedb_client_python//:client_python", + vaticle_typedb_ml_requirement("networkx"), + vaticle_typedb_ml_requirement("torch"), + vaticle_typedb_ml_requirement("torch_geometric"), + vaticle_typedb_ml_requirement("torch_sparse"), + vaticle_typedb_ml_requirement("torch_scatter"), + vaticle_typedb_ml_requirement("tensorboard"), ], visibility=['//visibility:public'] ) @@ -58,5 +47,8 @@ py_library( checkstyle_test( name = "checkstyle", include = glob(["*"]), + exclude = glob([ + "*.md" + ]), license_type = "apache-header", ) diff --git a/examples/diagnosis/README.md b/examples/diagnosis/README.md new file mode 100644 index 00000000..8509c2d1 --- /dev/null +++ b/examples/diagnosis/README.md @@ -0,0 +1,74 @@ +![Successful Prediction](.images/successful_prediction.png) + +# Link Prediction with TypeDB and PyTorch Geometric + +This example demonstrates how to build a Graph Neural Network (GNN) machine learning pipeline using TypeDB, TypeDB-ML and PyTorch Geometric for [link prediction](https://en.wikipedia.org/wiki/Link_prediction) over TypeDB data. + +## Run the Example + +Once you have installed TypeDB-ML (see root README for instructions) you can run the example as follows: + +1. Make sure a TypeDB server (version 2.11.1 or later) is running locally + +2. Clone TypeDB-ML (shallow clone: `git clone --depth 1`) and from the project root, run the example: `python -m examples.diagnosis.diagnosis "/path/to/my/typedb/install/directory"` + +3. The database, schema and seed data will be set up and data will be generated synthetically. You should see console output to indicate that the pipeline is running and that the model is learning. Finally, the predicted relations are shown, and they are written back into TypeDB as the type `predicted-diagnosis`. 
+ +## Key steps in the Example + +The process conducted by the example is as follows: + +1. Define the data retrieval + - This requires specifying queries that will retrieve Concepts from TypeDB (in `build_queries`) + - The answers from these queries are merged together into an in-memory NetworkX graph +2. Find the Types and Roles present in the schema. If any are not needed for learning then they should be excluded from the exhaustive list for better accuracy. +3. Configure the `ATTRIBUTE_ENCODERS` for the different types present +4. Run the defined pipeline, including splitting and transformations of the graph(s) and defining and feeding a neural network. In this example we use `Heterogeneous Graph Transformer (HGTConv)` which is effective for our synthetic data. +5. Write the predictions made to TypeDB. Note that the example inserts link predictions made for all valid link locations. These include and overlap with node used in the training set as well as the validation and test set. Therefore the confusion matrix given is a very biased view just for demonstration purposes of how to use predictions. The number of predictions made is `N` x `M` where `N` here is the number of people and `M` is the number of diseases. This step becomes more interesting in graphs where `N x M` is larger, and accommodates making such predictions quite easily. Think for example of protein-protein interaction networks in Life Sciences. + +### Modifying the Example + +As much as possible the example here has been made as a template for link prediction or any other PyTorch Geometric task. To change and experiment with various other neural network configurations should be easy enough by modifying the layers in `LinkPredictionModel`. If you need a different problem formulation then take a look at the [examples in PyG](https://github.com/pyg-team/pytorch_geometric/tree/master/examples/hetero) for inspiration. + +You'll need to change steps 1, 2 and 3 above to suit your own schema and data. 
When specifying encoders for the different types present, if you need to encode full strings as features, take a look at this [sentence transformers](https://pypi.org/project/sentence-transformers/) project, which is used in some PyG examples. + +### How does Link Prediction work? + +The methodology used here for relation prediction is as follows: + +In this example, we aim to predict `diagnosis` relations. Each person in the dataset should either have a diagnosis of `"Multiple Scleerosis"` or of `"Diabetes Type II"`. We have the correct `diagnosis` relations, and we use `RandomLinkSplit` to split these examples into train, validation and test sets, and to create negative samples. + +We collapse these TypeDB `diagnosis` relations from a relation node into a binary edge, since we know this is a binary relation type with only ever two roleplayers. We use this simplification as it makes it trivial to predict all possible (and valid) binary relations between `disease` and `person` types, simply using the matrix dot product of the learned representation of each `disease` and each `person`. You can see this in `decode_all`. It's likely this approach can be extended to handle ternary or N-ary relations, but we don't support functionality for it here yet. + +### Binary Relation Prediction and Extension to N-ary + +This example can predict binary relations only; additional effort is required to predict ternary or N-ary relations. Predicting ternary or N-ary relations ought to be possible using an appropriate operator over the representations of the roleplayers involved (perhaps `mean-pool`, for example). But, this op needs to compute the predictions for `L` x `M` x `N` possible relations for a ternary relation where each roleplayer type has `L`, `M`, and `N` instances. This will also require extra work to manage the ground-truth labels rather than treating them as binary as we do here. 
+ + +## Tensorboard + +The example demonstrates how to plot histograms for the features of each node type in the data such that you can easily debug whether they are properly normalised. It also shows the trend of scalars during learning. Try it with `tensorboard --logdir runs`. + +## Synthetic Data + +This example uses a synthetically generated dataset, which utilises a [Probability Mass Function (PMF)](https://en.wikipedia.org/wiki/Probability_mass_function) to pick the likelihood of each permutation of features being present for a given example. + +Studying the schema for this example (using TypeDB Studio), we have people who present symptoms, with some severity. Separately, we may know that certain symptoms can be caused by a disease. We also know information that contributes to risk-factors for certain diseases. These risk factors are determined by rules defined in the schema. Lastly, people can be diagnosed with a disease. + +Visualise the schema in Studio with this query: +``` +match $x sub thing; +not{ $x type thing;}; +not{ $x type entity;}; +not{ $x type relation;}; +not{ $x type attribute;}; +``` + +![Diagnosis Schema](.images/diagnosis_schema.png) + +Visualise the data with this query: +``` +match $x isa thing; +``` + +![Diagnosis Data](.images/diagnosis_data.png) diff --git a/kglib/utils/typedb/synthetic/examples/BUILD b/examples/diagnosis/dataset/BUILD similarity index 70% rename from kglib/utils/typedb/synthetic/examples/BUILD rename to examples/diagnosis/dataset/BUILD index 2a897558..69da28ca 100644 --- a/kglib/utils/typedb/synthetic/examples/BUILD +++ b/examples/diagnosis/dataset/BUILD @@ -20,36 +20,49 @@ # load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") +load("@vaticle_typedb_ml_pip//:requirements.bzl", vaticle_typedb_ml_requirement = "requirement") load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") +py_library( + 
name = "dataset", + srcs = [ + 'generate.py', + 'pmf.py', + ], + deps = [ + "@vaticle_typedb_client_python//:client_python", + vaticle_typedb_ml_requirement('numpy'), + vaticle_typedb_ml_requirement('pandas'), + vaticle_typedb_ml_requirement('pytz'), + vaticle_typedb_ml_requirement('python-dateutil'), + vaticle_typedb_ml_requirement('six') + ], + visibility=['//visibility:public'] +) + filegroup( name = "diagnosis-example-typeql-files", srcs = [ - "diagnosis/schema.tql", - "diagnosis/seed_data.tql", + "schema.tql", + "seed_data.tql", ], visibility = ["//visibility:public"], ) -py_library( - name = "examples", +py_test( + name = "pmf_test", srcs = [ - 'diagnosis/generate.py', + "pmf_test.py" ], deps = [ - "@vaticle_typedb_client_python//:client_python", - vaticle_kglib_requirement('numpy'), - ], - visibility=['//visibility:public'] + "dataset" + ] ) checkstyle_test( name = "checkstyle", include = glob([ "*", - "diagnosis/*" ]), license_type = "apache-header", ) diff --git a/kglib/utils/typedb/synthetic/examples/diagnosis/generate.py b/examples/diagnosis/dataset/generate.py similarity index 79% rename from kglib/utils/typedb/synthetic/examples/diagnosis/generate.py rename to examples/diagnosis/dataset/generate.py index 625df675..3f362253 100644 --- a/kglib/utils/typedb/synthetic/examples/diagnosis/generate.py +++ b/examples/diagnosis/dataset/generate.py @@ -24,40 +24,37 @@ import numpy as np from typedb.client import * -from kglib.utils.typedb.synthetic.statistics.pmf import PMF +from examples.diagnosis.dataset.pmf import PMF def get_example_queries(pmf, example_id): variable_values = pmf.select() - queries = [f'insert $p isa person, has example-id {example_id};', - f'insert $doc isa person, has example-id {20000 + example_id};'] + queries = [f'insert $p isa person, has person-id {example_id};'] if variable_values['Multiple Sclerosis'] is not False: queries.append(inspect.cleandoc(f''' match $d isa disease, has name "Multiple Sclerosis"; - $p isa person, has 
example-id {example_id}; - $doc isa person, has example-id {20000 + example_id}; + $p isa person, has person-id {example_id}; insert - $diagnosis (patient: $p, diagnosed-disease: $d, doctor: $doc) isa diagnosis; + $diagnosis (patient: $p, diagnosed-disease: $d) isa diagnosis; $p has age {int(variable_values['Multiple Sclerosis']['age']())};''')) if variable_values['Diabetes Type II'] is not False: queries.append(inspect.cleandoc(f''' match - $p isa person, has example-id {example_id}; + $p isa person, has person-id {example_id}; $d isa disease, has name "Diabetes Type II"; - $doc isa person, has example-id {20000 + example_id}; insert - $diagnosis (patient: $p, diagnosed-disease: $d, doctor: $doc) isa diagnosis; + $diagnosis (patient: $p, diagnosed-disease: $d) isa diagnosis; $p has age {int(variable_values['Diabetes Type II']['age']())};''')) if variable_values['Fatigue'] is not False: queries.append(inspect.cleandoc(f''' match - $p isa person, has example-id {example_id}; + $p isa person, has person-id {example_id}; $s isa symptom, has name "Fatigue"; insert $sp (presented-symptom: $s, symptomatic-patient: $p) isa @@ -66,7 +63,7 @@ def get_example_queries(pmf, example_id): if variable_values['Blurred vision'] is not False: queries.append(inspect.cleandoc(f''' match - $p isa person, has example-id {example_id}; + $p isa person, has person-id {example_id}; $s isa symptom, has name "Blurred vision"; insert $sp (presented-symptom: $s, symptomatic-patient: $p) isa @@ -75,7 +72,7 @@ def get_example_queries(pmf, example_id): if variable_values['Drinking'] is not False: queries.append(inspect.cleandoc(f''' match - $p isa person, has example-id {example_id}; + $p isa person, has person-id {example_id}; $s isa substance, has name "Alcohol"; insert $c (consumer: $p, consumed-substance: $s) isa consumption, @@ -84,18 +81,18 @@ def get_example_queries(pmf, example_id): if variable_values['Parent has Diabetes Type II'] is not False: queries.append(inspect.cleandoc(f''' match - 
$p isa person, has example-id {example_id}; + $p isa person, has person-id {example_id}; $d isa disease, has name "Diabetes Type II"; insert (parent: $parent, child: $p) isa parentship; - $parent isa person, has example-id {example_id + 10000}; - $diagnosis (patient: $parent, diagnosed-disease: $d) isa diagnosis; + $parent isa parent; + $diagnosis (patient: $parent, diagnosed-disease: $d) isa familial-diagnosis; ''')) if variable_values['Cigarettes'] is not False: queries.append(inspect.cleandoc(f''' match - $p isa person, has example-id {example_id}; + $p isa person, has person-id {example_id}; $s isa substance, has name "Cigarettes"; insert $c (consumer: $p, consumed-substance: $s) isa consumption, @@ -109,24 +106,22 @@ def generate_example_data(client, num_examples, database="diagnosis"): session = client.session(database, SessionType.DATA) pmf_array = np.zeros([2, 2, 2, 2, 3, 2, 3], dtype=np.float) - pmf_array[0, 1, 0, 1, 0, 0, 0] = 0.1 + # Diabetes Type II pmf_array[1, 0, 1, 0, 0, 0, 0] = 0.05 pmf_array[1, 0, 1, 0, 2, 0, 0] = 0.1 - pmf_array[0, 1, 1, 0, 0, 0, 0] = 0.05 - pmf_array[1, 0, 0, 1, 0, 0, 0] = 0.19 + pmf_array[1, 0, 0, 1, 0, 0, 0] = 0.2 pmf_array[1, 0, 0, 1, 0, 1, 0] = 0.15 - pmf_array[1, 1, 1, 1, 0, 0, 0] = 0.01 + pmf_array[1, 0, 1, 1, 0, 0, 0] = 0.05 + pmf_array[1, 0, 1, 1, 2, 1, 2] = 0.1 + # Multiple Sclerosis + pmf_array[0, 1, 0, 1, 0, 0, 0] = 0.1 + pmf_array[0, 1, 1, 0, 0, 0, 0] = 0.05 pmf_array[0, 1, 1, 1, 0, 0, 0] = 0.05 pmf_array[0, 1, 1, 1, 0, 0, 1] = 0.05 pmf_array[0, 1, 1, 1, 0, 0, 2] = 0.1 - pmf_array[1, 0, 1, 1, 0, 0, 0] = 0.05 - pmf_array[1, 0, 1, 1, 2, 1, 2] = 0.1 - - def normal_dist(mean, var): - return lambda: round(np.random.normal(mean, var, 1)[0], 2) pmf = PMF({ - 'Diabetes Type II': [False, {'age': normal_dist(60, 10)}], + 'Diabetes Type II': [False, {'age': normal_dist(45, 10)}], 'Multiple Sclerosis': [False, {'age': normal_dist(30, 10)}], 'Fatigue': [False, {'severity': normal_dist(0.3, 0.1)}], 'Blurred vision': [False, 
{'severity': normal_dist(0.5, 0.2)}], @@ -135,18 +130,19 @@ def normal_dist(mean, var): 'Cigarettes': [False, {'units-per-week': normal_dist(5, 1)}, {'units-per-week': normal_dist(20, 3)}], }, pmf_array, seed=0) - # print(pmf.to_dataframe()) # TODO Remove pandas if this is not needed now - for example_id in range(0, num_examples): tx = session.transaction(TransactionType.WRITE) for query in get_example_queries(pmf, example_id): - #print(query) tx.query().insert(query) tx.commit() session.close() +def normal_dist(mean, var): + return lambda: round(np.random.normal(mean, var, 1)[0], 2) + + if __name__ == '__main__': with TypeDB.core_client("localhost:1729") as client: client.databases().create("diagnosis") diff --git a/kglib/utils/typedb/synthetic/statistics/pmf.py b/examples/diagnosis/dataset/pmf.py similarity index 100% rename from kglib/utils/typedb/synthetic/statistics/pmf.py rename to examples/diagnosis/dataset/pmf.py diff --git a/kglib/utils/typedb/synthetic/statistics/pmf_test.py b/examples/diagnosis/dataset/pmf_test.py similarity index 97% rename from kglib/utils/typedb/synthetic/statistics/pmf_test.py rename to examples/diagnosis/dataset/pmf_test.py index a0405842..b8cb9012 100644 --- a/kglib/utils/typedb/synthetic/statistics/pmf_test.py +++ b/examples/diagnosis/dataset/pmf_test.py @@ -24,7 +24,7 @@ import numpy as np import pandas as pd -from kglib.utils.typedb.synthetic.statistics.pmf import PMF +from pmf import PMF class TestPMF(unittest.TestCase): diff --git a/kglib/utils/typedb/synthetic/examples/diagnosis/schema.tql b/examples/diagnosis/dataset/schema.tql similarity index 80% rename from kglib/utils/typedb/synthetic/examples/diagnosis/schema.tql rename to examples/diagnosis/dataset/schema.tql index 2de2f633..53ab29a8 100644 --- a/kglib/utils/typedb/synthetic/examples/diagnosis/schema.tql +++ b/examples/diagnosis/dataset/schema.tql @@ -21,7 +21,7 @@ define -example-id sub attribute, +person-id sub attribute, value long; age sub attribute, @@ -37,30 
+37,33 @@ units-per-week sub attribute, value long; person sub entity, - owns example-id @key, + owns person-id @key, owns age, plays diagnosis:patient, - plays diagnosis:doctor, - plays candidate-diagnosis:candidate-patient, + plays predicted-diagnosis:patient, plays symptom-presentation:symptomatic-patient, plays consumption:consumer, plays age-risk-factor:person-at-age-risk, plays hereditary-risk-factor:person-at-hereditary-risk, plays smoking-risk-factor:person-at-smoking-risk, plays alcohol-risk-factor:person-at-alcohol-risk, - plays parentship:parent, plays parentship:child; disease sub entity, owns name @key, plays causality:cause, plays diagnosis:diagnosed-disease, - plays candidate-diagnosis:candidate-diagnosed-disease, + plays familial-diagnosis:diagnosed-disease, + plays predicted-diagnosis:diagnosed-disease, plays age-risk-factor:age-risked-disease, plays hereditary-risk-factor:hereditary-risked-disease, plays smoking-risk-factor:smoking-risked-disease, plays alcohol-risk-factor:alcohol-risked-disease; +parent sub entity, + plays parentship:parent, + plays familial-diagnosis:patient; + substance sub entity, owns name @key, plays consumption:consumed-substance; @@ -71,37 +74,15 @@ parentship sub relation, diagnosis sub relation, relates patient, - relates diagnosed-disease, - relates doctor, - relates diagnoser, - owns probability-exists, - owns probability-non-exists, - owns probability-preexists; - -candidate-diagnosis sub relation, - relates candidate-patient, - relates candidate-diagnosed-disease; - -probability-exists sub attribute, - value double; - -probability-non-exists sub attribute, - value double; - -probability-preexists sub attribute, - value double; + relates diagnosed-disease; -kgcn sub entity, - plays diagnosis:diagnoser; +familial-diagnosis sub relation, + relates patient, + relates diagnosed-disease; -rule where-no-diagnosis-add-candidate-diagnosis: - when { - $p isa person; - $d isa disease; - not{ (patient: $p, diagnosed-disease: $d) 
isa diagnosis; }; - } then { - (candidate-patient: $p, candidate-diagnosed-disease: $d) isa candidate-diagnosis; - }; +predicted-diagnosis sub relation, + relates patient, + relates diagnosed-disease; causality sub relation, relates cause, @@ -150,9 +131,9 @@ when { rule heriditary-risk-of-diabetes: when { $p isa person; - $parent isa person; + $parent isa parent; $cause(parent: $parent, child: $p) isa parentship; - $dia(patient: $parent, diagnosed-disease: $d) isa diagnosis; + $dia(patient: $parent, diagnosed-disease: $d) isa familial-diagnosis; $d isa disease, has name "Diabetes Type II"; } then { (person-at-hereditary-risk: $p, hereditary-risked-disease: $d) isa hereditary-risk-factor; diff --git a/kglib/utils/typedb/synthetic/examples/diagnosis/seed_data.tql b/examples/diagnosis/dataset/seed_data.tql similarity index 96% rename from kglib/utils/typedb/synthetic/examples/diagnosis/seed_data.tql rename to examples/diagnosis/dataset/seed_data.tql index 966a4a55..1a576bb2 100644 --- a/kglib/utils/typedb/synthetic/examples/diagnosis/seed_data.tql +++ b/examples/diagnosis/dataset/seed_data.tql @@ -46,5 +46,4 @@ $vision isa symptom, has name "Blurred vision"; insert $c3(cause: $diabetes, effect: $vision) isa causality; insert $s isa substance, has name "Alcohol"; -insert $s isa substance, has name "Cigarettes"; -insert $kgcn isa kgcn; \ No newline at end of file +insert $s isa substance, has name "Cigarettes"; \ No newline at end of file diff --git a/examples/diagnosis/diagnosis.py b/examples/diagnosis/diagnosis.py new file mode 100644 index 00000000..89671992 --- /dev/null +++ b/examples/diagnosis/diagnosis.py @@ -0,0 +1,455 @@ +# +# Copyright (C) 2022 Vaticle +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +import argparse +import inspect +import os + +import networkx as nx +import torch +import torch.nn.functional as functional +import torch_geometric.transforms as transforms +from torch import as_tensor +from torch.utils.tensorboard import SummaryWriter +from torch_geometric.nn import HGTConv +from typedb.client import * + +from examples.diagnosis.dataset.generate import generate_example_data +from typedb_ml.networkx.query_graph import QueryGraph, Query +from typedb_ml.pytorch_geometric.dataset.dataset import DataSet +from typedb_ml.pytorch_geometric.transform.binary_link_prediction import LinkPredictionLabeller, \ + binary_relations_to_edges, binary_link_prediction_edge_triplets +from typedb_ml.pytorch_geometric.transform.common import clear_unneeded_fields, store_concepts_by_type +from typedb_ml.pytorch_geometric.transform.encode import FeatureEncoder, CategoricalEncoder, ContinuousEncoder +from typedb_ml.typedb.load import load_typeql_file, FileType +from typedb_ml.typedb.type import get_thing_types + +DATABASE = "diagnosis" +ADDRESS = "localhost:1729" + +# Ignore any types that exist in the TypeDB instance but which aren't being used for learning to reduce the +# number of categories to embed +TYPES_TO_IGNORE = {'risk-factor', 'person-id', 'alcohol-risked-disease', 'person-at-alcohol-risk', + 'person-at-hereditary-risk', 'hereditary-risked-disease', 'smoking-risked-disease', + 
'person-at-smoking-risk', 'person-at-age-risk', 'age-risked-disease', 'predicted-diagnosis'} +# Note that this determines the edge direction when converting from a TypeDB relation +RELATION_TYPE_TO_PREDICT = ('person', 'patient', 'diagnosis', 'diagnosed-disease', 'disease') + +TYPE_ENCODING_SIZE = 16 +ATTRIBUTE_ENCODING_SIZE = 16 + +# Attribute encoders encode the value of each attribute into a fixed-length feature vector. The encoders are +# defined on a per-type basis. Easily define your own encoders for specific attribute data in your TypeDB database +ATTRIBUTE_ENCODERS = { + # Categorical Attribute types and the values of their categories + 'name': CategoricalEncoder( + ['Diabetes Type II', 'Multiple Sclerosis', 'Blurred vision', 'Fatigue', 'Cigarettes', 'Alcohol'], + ATTRIBUTE_ENCODING_SIZE + ), + # Continuous Attribute types and their min and max values + 'severity': ContinuousEncoder(0, 1, ATTRIBUTE_ENCODING_SIZE), + 'age': ContinuousEncoder(7, 80, ATTRIBUTE_ENCODING_SIZE), + 'units-per-week': ContinuousEncoder(3, 29, ATTRIBUTE_ENCODING_SIZE) +} + + +def diagnosis_example(typedb_binary_directory, + num_graphs, + database=DATABASE, + address=ADDRESS, + schema_file_path="examples/diagnosis/dataset/schema.tql", + seed_data_file_path="examples/diagnosis/dataset/seed_data.tql"): + """ + Args: + typedb_binary_directory: Location of the TypeDB binary for the purpose of loading initial schema and data + num_graphs: Number of graphs to use for training and testing combined + database: The name of the database to retrieve data from + address: The address of the running TypeDB instance + schema_file_path: Path to the diagnosis schema file + seed_data_file_path: Path to the file containing seed data + + Returns: + Final accuracies for training and for testing + """ + + client = TypeDB.core_client(address) + create_database(client, database) + + load_typeql_file(typedb_binary_directory, database, schema_file_path, FileType.Schema) + 
load_typeql_file(typedb_binary_directory, database, seed_data_file_path, FileType.Data) + generate_example_data(client, num_graphs, database=database) + + session = client.session(database, SessionType.DATA) + + # During the transforms below we convert the relations to predict to simple binary edges, which means the relation + # changes from a node to an edge. We therefore need to update the node_types and edge_types accordingly + + # Remove the relation from the node types, since we will be using it as a binary edge instead + to_ignore = list(TYPES_TO_IGNORE) + [RELATION_TYPE_TO_PREDICT[2]] + node_types = [t for t in get_thing_types(session) if t not in to_ignore] + edge_type_triplets, edge_type_triplets_reversed = binary_link_prediction_edge_triplets( + session, RELATION_TYPE_TO_PREDICT, TYPES_TO_IGNORE + ) + + binary_edge_to_predict = RELATION_TYPE_TO_PREDICT[::2] # Evaluates to: ('person', 'diagnosis', 'disease') + binary_rev_edge_to_predict = edge_type_triplets_reversed[edge_type_triplets.index(RELATION_TYPE_TO_PREDICT[::2])] # Evaluates to: ('disease', 'rev_diagnosis', 'person') + + edge_types = list({triplet[1] for triplet in edge_type_triplets}) + transform = transforms.Compose([ + lambda graph: binary_relations_to_edges(graph, RELATION_TYPE_TO_PREDICT[1:4]), + lambda graph: nx.convert_node_labels_to_integers(graph, label_attribute="concept"), + FeatureEncoder(node_types, edge_types, TYPE_ENCODING_SIZE, ATTRIBUTE_ENCODERS, ATTRIBUTE_ENCODING_SIZE), + LinkPredictionLabeller(RELATION_TYPE_TO_PREDICT[2]), + store_concepts_by_type, + clear_unneeded_fields + ]) + + # Create a Dataset that will load graphs from TypeDB on-demand, based on an ID + dataset = DataSet([0], node_types, edge_type_triplets, build_queries, session, True, transform) + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + data, node_type_indices, edge_type_indices = dataset[0] + data = data.to_heterogeneous( + as_tensor(node_type_indices), as_tensor(edge_type_indices), 
node_types, edge_type_triplets + ).to(device) # Get the first graph object. + + # Reverse edges need to be present for bi-directional message-passing but the labels should not be considered + # for node and edge representations + data = transforms.ToUndirected()(data) + for edge_from, edge, edge_to in edge_type_triplets_reversed: + del data[edge_from, edge, edge_to].edge_label # Remove "reverse" label. + + # Setting the neg_sampling_ratio higher than the number of places that a negative sample can be added causes the + # training set to have too few negative samples! + # Consider using other samplers from Pytorch Geometric in place of this one, depending on your use case + train_data, val_data, test_data = transforms.RandomLinkSplit( + num_val=0.2, + num_test=0.2, + neg_sampling_ratio=1.0, + edge_types=binary_edge_to_predict, + rev_edge_types=binary_rev_edge_to_predict + )(data) + + # Add a new `links` attribute to store the edges for prediction so that they are definitely isolated from training + train_data.link_index = train_data.edge_label_index_dict[binary_edge_to_predict] + train_data.link_labels = train_data.edge_label_dict[binary_edge_to_predict] + val_data.link_index = val_data.edge_label_index_dict[binary_edge_to_predict] + val_data.link_labels = val_data.edge_label_dict[binary_edge_to_predict] + test_data.link_index = test_data.edge_label_index_dict[binary_edge_to_predict] + test_data.link_labels = test_data.edge_label_dict[binary_edge_to_predict] + + # Delete the stores for the predicted edge now that we have stored it elsewhere above + data.links = data[binary_edge_to_predict] + data.rev_links = data[binary_rev_edge_to_predict] + del data[binary_edge_to_predict] + del data[binary_rev_edge_to_predict] + del train_data[binary_edge_to_predict] + del train_data[binary_rev_edge_to_predict] + del val_data[binary_edge_to_predict] + del val_data[binary_rev_edge_to_predict] + del test_data[binary_edge_to_predict] + del test_data[binary_rev_edge_to_predict] + + 
class LinkPredictionModel(torch.nn.Module): + def __init__(self, in_channels: Union[int, Dict[str, int]], hidden_channels=128, heads=8): + super().__init__() + self.conv = HGTConv(in_channels, hidden_channels, heads=heads, metadata=train_data.metadata()) + + def encode(self, x_dict, edge_index_dict): + return self.conv(x_dict, edge_index_dict) + + def decode(self, z, edge_label_index_dict): + row, col = edge_label_index_dict + logits = (z['person'][row] * z['disease'][col]).sum(dim=-1) + return logits + + def decode_all(self, z): + logits = z['person'] @ z['disease'].t() + return logits + + model = LinkPredictionModel(in_channels=-1) + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + data, model = data.to(device), model.to(device) + + optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.001) + + def train() -> float: + model.train() + optimizer.zero_grad() + z = model.encode(train_data.x_dict, train_data.edge_index_dict) + logits = model.decode(z, train_data.link_index) + loss = functional.binary_cross_entropy_with_logits(logits, train_data.link_labels) + loss.backward() + optimizer.step() + return float(loss) + + @torch.no_grad() + def test() -> List[Tuple[float, float, float]]: + model.eval() + results = [] + for split in train_data, val_data, test_data: + # We use `edge_index_dict` and `y_edge` for validation and testing to exclude the negative samples + z = model.encode(split.x_dict, split.edge_index_dict) + link_logits = model.decode(z, split.link_index) + link_probs = link_logits.sigmoid() + tp = ((link_probs > 0.5) * (split.link_labels == 1)).sum() + tn = ((link_probs < 0.5) * (split.link_labels == 0)).sum() + pos = (split.link_labels == 1).sum() + neg = (split.link_labels == 0).sum() + precision = tn / neg + recall = tp / pos + acc = (tp + tn) / (pos + neg) + results.append((float(acc), precision, recall)) + return results + + writer = SummaryWriter() + for edge_type, edge_store in zip(data.edge_types, 
data.edge_stores): + writer.add_histogram('('+', '.join(edge_type) + ')/edge_attr', edge_store["edge_attr"]) + writer.add_histogram('('+', '.join(edge_type) + ')/y_edge', edge_store["y_edge"]) + + for node_type, node_store in zip(data.node_types, data.node_stores): + writer.add_histogram(node_type + '/x', node_store["x"]) + + best_val_acc = 0 + start_patience = patience = 100 + train_results = None + test_results = None + for epoch in range(1, 100): + loss = train() + writer.add_scalar('Loss/train', loss, epoch) + train_results, val_results, test_results = test() + writer.add_scalar('Accuracy/train', train_results[0], epoch) + writer.add_scalar('Accuracy/val', val_results[0], epoch) + writer.add_scalar('Accuracy/test', test_results[0], epoch) + writer.add_scalar('Precision/test', test_results[1], epoch) + writer.add_scalar('Recall/test', test_results[2], epoch) + print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_results[0]:.4f}, ' + f'Val: {val_results[0]:.4f}, Test: {test_results[0]:.4f}, Test Precision: {test_results[1]:.4f}, ' + f'Test Recall: {test_results[2]:.4f}') + + if best_val_acc <= val_results[0]: + patience = start_patience + best_val_acc = val_results[0] + else: + patience -= 1 + + if patience <= 0: + print('Stopping training as validation accuracy did not improve ' + f'for {start_patience} epochs') + break + + z = model.encode(data.x_dict, data.edge_index_dict) + final_edge_index = (model.decode_all(z).sigmoid() > 0.5).nonzero(as_tuple=False).cpu().detach().numpy() + + # Get back the concepts for each of links predicted in order to insert the predictions into TypeDB + predicted_links = [] + for p, d in final_edge_index: + predicted_links.append( + { + 'person': data['concepts_by_type']['person'][p], + 'disease': data['concepts_by_type']['disease'][d] + } + ) + print("The following links have been predicted:") + print(predicted_links) # Bear in mind this is predicted links across *all* data: train, val and test + + with 
session.transaction(TransactionType.WRITE) as tx: + write_predictions_to_typedb(predicted_links, tx) + + # Now we can get the confusion matrix from querying TypeDB! Note that this includes training and validation + # examples, but serves as a demo for seeing the predictions made. + with session.transaction(TransactionType.READ) as tx: + # Also try these queries in TypeDB Studio omitting "count;" to visualise the predicted relations + tp = tx.query().match_aggregate("match $p isa person; $d isa disease; ($p, $d) isa diagnosis; " + "($p, $d) isa predicted-diagnosis; count;").get().as_int() + tn = tx.query().match_aggregate("match $p isa person; $d isa disease; not{($p, $d) isa diagnosis;}; " + "not{($p, $d) isa predicted-diagnosis;}; count;").get().as_int() + fp = tx.query().match_aggregate("match $p isa person; $d isa disease; not{($p, $d) isa diagnosis;}; " + "($p, $d) isa predicted-diagnosis; count;").get().as_int() + fn = tx.query().match_aggregate("match $p isa person; $d isa disease; ($p, $d) isa diagnosis; " + "not{($p, $d) isa predicted-diagnosis;}; count;").get().as_int() + print("Confusion matrix") + print(f"{tp} {fn}\n{fp} {tn}") + + session.close() + client.close() + + return train_results[0], test_results[0] + + +def create_database(client, database): + if client.databases().contains(database): + raise ValueError( + f"There is already a database present with the name {database}. The Diagnosis example expects a clean DB. " + f"Please delete the {database} database, or use another database name") + client.databases().create(database) + + +def build_queries(subgraph_id: int) -> List[Query]: + """ + Creates a tuple of Query objects that contain the information needed to convert query answers into NetworkX graphs. + + Args: + subgraph_id: A uniquely identifiable id used to anchor the results of the queries to a specific subgraph, + designed so that the user can easily query for segmented subgraphs to be used as batches. 
+ + Returns: + List of Query + """ + assert subgraph_id == 0 # In this example the graph is small so we don't use any subgraphs + + # === Hereditary Feature === + hereditary_query = inspect.cleandoc(f'''match + $p isa person; + $par isa parent; + $ps(child: $p, parent: $par) isa parentship; + $diag(patient:$par, diagnosed-disease: $d) isa familial-diagnosis; + $d isa disease, has name $n; + ''') + + vars = p, par, ps, d, diag, n = 'p', 'par', 'ps', 'd', 'diag', 'n' + hereditary_query_graph = (QueryGraph() + .add_vars(vars) + .add_role_edge(ps, p, 'child') + .add_role_edge(ps, par, 'parent') + .add_role_edge(diag, par, 'patient') + .add_role_edge(diag, d, 'diagnosed-disease') + .add_has_edge(d, n)) + + # === Consumption Feature === + consumption_query = inspect.cleandoc(f'''match + $p isa person; + $s isa substance, has name $n; + $c(consumer: $p, consumed-substance: $s) isa consumption, + has units-per-week $u;''') + + vars = p, s, n, c, u = 'p', 's', 'n', 'c', 'u' + consumption_query_graph = (QueryGraph() + .add_vars(vars) + .add_has_edge(s, n) + .add_role_edge(c, p, 'consumer') + .add_role_edge(c, s, 'consumed-substance') + .add_has_edge(c, u)) + + # === Age Feature === + person_age_query = inspect.cleandoc(f'''match + $p isa person, has age $a; + ''') + + vars = p, a = 'p', 'a' + person_age_query_graph = (QueryGraph() + .add_vars(vars) + .add_has_edge(p, a)) + + # === Risk Factors Feature === + risk_factor_query = inspect.cleandoc(f'''match + $d isa disease; + $p isa person; + $r(person-at-risk: $p, risked-disease: $d) isa risk-factor; + ''') + + vars = p, d, r = 'p', 'd', 'r' + risk_factor_query_graph = (QueryGraph() + .add_vars(vars) + .add_role_edge(r, p, 'person-at-risk') + .add_role_edge(r, d, 'risked-disease')) + + # === Symptom === + vars = p, s, sn, d, dn, sp, sev, c = 'p', 's', 'sn', 'd', 'dn', 'sp', 'sev', 'c' + + symptom_query = inspect.cleandoc(f'''match + $p isa person; + $s isa symptom, has name $sn; + $d isa disease, has name $dn; + 
$sp(presented-symptom: $s, symptomatic-patient: $p) isa symptom-presentation, has severity $sev; + $c(cause: $d, effect: $s) isa causality; + ''') + + symptom_query_graph = (QueryGraph() + .add_vars(vars) + .add_has_edge(s, sn) + .add_has_edge(d, dn) + .add_role_edge(sp, s, 'presented-symptom') + .add_has_edge(sp, sev) + .add_role_edge(sp, p, 'symptomatic-patient') + .add_role_edge(c, s, 'effect') + .add_role_edge(c, d, 'cause')) + + # === Diagnosis === + + diag, d, p, dn = 'diag', 'd', 'p', 'dn' + + diagnosis_query = inspect.cleandoc(f'''match + $p isa person; + $d isa disease, has name $dn; + $diag(patient: $p, diagnosed-disease: $d) isa diagnosis; + ''') + + diagnosis_query_graph = (QueryGraph() + .add_vars([diag]) + .add_vars([d, p, dn]) + .add_role_edge(diag, d, 'diagnosed-disease') + .add_role_edge(diag, p, 'patient')) + + return [ + Query(symptom_query_graph, symptom_query), + Query(diagnosis_query_graph, diagnosis_query), + Query(risk_factor_query_graph, risk_factor_query), + Query(person_age_query_graph, person_age_query), + Query(consumption_query_graph, consumption_query), + Query(hereditary_query_graph, hereditary_query) + ] + + +def write_predictions_to_typedb(predicted_links, tx): + """ + Take predictions from the ML model, and insert representations of those predictions back into the graph. 
+ + Args: + predicted_links: pairs of concepts that are predicted links + tx: TypeDB write transaction to use + + Returns: None + + """ + for predicted_link in predicted_links: + person = predicted_link['person'] + disease = predicted_link['disease'] + query = (f'match ' + f'$p iid {person.iid}; ' + f'$d iid {disease.iid}; ' + f'insert ' + f'$pd(patient: $p, diagnosed-disease: $d) isa predicted-diagnosis;') + tx.query().insert(query) + tx.commit() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Just an example", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--graphs", help="num graphs", default=200) + parser.add_argument("typedb", help="TypeDB location") + args = parser.parse_args() + cwd = os.getcwd() + diagnosis_example(args.typedb, args.graphs, + database=DATABASE, + address=ADDRESS, + schema_file_path=cwd + '/' + "examples/diagnosis/dataset/schema.tql", + seed_data_file_path=cwd + '/' + "examples/diagnosis/dataset/seed_data.tql") diff --git a/install_requires.txt b/install_requires.txt new file mode 100644 index 00000000..8dee8cee --- /dev/null +++ b/install_requires.txt @@ -0,0 +1,66 @@ +# +# Copyright (C) 2022 Vaticle +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# + +absl-py==1.2.0 +cachetools==5.2.0 +certifi==2022.6.15 +charset-normalizer==2.1.0 +decorator==5.1.1 +google-auth==2.9.1 +google-auth-oauthlib==0.4.6 +grpcio==1.43.0 +idna==3.3 +importlib-metadata==4.12.0 +Jinja2==3.1.2 +joblib==1.1.0 +Markdown==3.4.1 +MarkupSafe==2.1.1 +networkx==2.5 +numpy==1.21.6 +oauthlib==3.2.0 +pandas==1.3.5 +protobuf==3.15.5 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pyparsing==3.0.9 +python-dateutil==2.8.2 +pytz==2022.1 +requests==2.28.1 +requests-oauthlib==1.3.1 +rsa==4.9 +scikit-learn==1.0.2 +scipy==1.7.3 +six==1.16.0 +tensorboard==2.9.1 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +threadpoolctl==3.1.0 +torch==1.11.0 +torch-geometric==2.0.4 +torch-scatter==2.0.9 +torch-sparse==0.6.14 +tqdm==4.64.0 +typedb-client==2.9.0 +typedb-protocol==2.9.0 +typing-extensions==4.3.0 +urllib3==1.26.10 +Werkzeug==2.1.2 +zipp==3.8.1 diff --git a/kglib/__init__.py b/kglib/__init__.py deleted file mode 100644 index 4e916487..00000000 --- a/kglib/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - diff --git a/kglib/kgcn_data_loader/BUILD b/kglib/kgcn_data_loader/BUILD deleted file mode 100644 index 77257580..00000000 --- a/kglib/kgcn_data_loader/BUILD +++ /dev/null @@ -1,67 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") - -py_library( - name = "kgcn_data_loader", - srcs = glob([ - "*.py", - "**/*.py" - ]), - visibility=['//visibility:public'] -) - -py_test( - name = "standard_encode_test", - srcs = [ - "encoding/standard_encode_test.py" - ], - deps = [ - "kgcn_data_loader", - "//kglib/utils" - ] -) - -py_test( - name = "utils_test", - srcs = [ - "utils_test.py" - ], - deps = [ - "kgcn_data_loader", - "//kglib/utils/graph/test", - "//kglib/utils/typedb", - vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('numpy'), - ] -) - -checkstyle_test( - name = "checkstyle", - include = glob([ - "*", - "**/*" - ]), - license_type = "apache-header", -) diff --git a/kglib/kgcn_data_loader/dataset/typedb_networkx_dataset.py b/kglib/kgcn_data_loader/dataset/typedb_networkx_dataset.py deleted file mode 100644 index 2ec351a8..00000000 --- a/kglib/kgcn_data_loader/dataset/typedb_networkx_dataset.py +++ /dev/null @@ -1,89 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -from typing import Sequence, Callable, Optional - -import networkx as nx -from typedb.client import TypeDB, TypeDBSession, SessionType, TypeDBOptions, TransactionType - -from kglib.utils.graph.thing.queries_to_networkx_graph import build_graph_from_queries - - -class TypeDBNetworkxDataSet: - """ - Loading graphs based on queries from TypeDB. - Note: not dependent on PyTorch or Pytorch Geometric. - """ - - def __init__( - self, - example_indices: Sequence, - get_query_handles_for_id: Callable, - database: Optional[str] = None, - uri: Optional[str] = "localhost:1729", - session: Optional[TypeDBSession] = None, - infer: bool = True, - transform: Optional[Callable[[nx.Graph], nx.Graph]] = None, - ): - assert (database and uri) or session - self._example_indices = example_indices - self.get_query_handles_for_id = get_query_handles_for_id - self._infer = infer - self._transform = transform - self._uri = uri - self._database = database - self._typedb_session = session - - @property - def typedb_session(self): - """ - Did this like this in an attempt to make it - also work when using with a DataLoader with - num_workers > 0. - - TODO: it does not, so look into this. 
- """ - if not self._typedb_session: - print("setting up session") - print(self) - client = TypeDB.core_client(self._uri) - self._typedb_session = client.session(database=self._database, session_type=SessionType.DATA) - return self._typedb_session - - def __len__(self): - return len(self._example_indices) - - def __getitem__(self, idx): - print(type(self._typedb_session)) - example_id = self._example_indices[idx] - print(f"Fetching subgraph for example {example_id}") - graph_query_handles = self.get_query_handles_for_id(example_id) - - options = TypeDBOptions.core() - options.infer = self._infer - - with self.typedb_session.transaction(TransactionType.READ, options=options) as tx: - # Build a graph from the queries, samplers, and query graphs - graph = build_graph_from_queries(graph_query_handles, tx) - graph.name = example_id - if self._transform: - graph = self._transform(graph) - return graph \ No newline at end of file diff --git a/kglib/kgcn_data_loader/encoding/standard_encode.py b/kglib/kgcn_data_loader/encoding/standard_encode.py deleted file mode 100644 index 176419f5..00000000 --- a/kglib/kgcn_data_loader/encoding/standard_encode.py +++ /dev/null @@ -1,111 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import numpy as np - -from kglib.utils.graph.iterate import multidigraph_data_iterator, multidigraph_node_data_iterator, \ - multidigraph_edge_data_iterator - - -def encode_values(graph, categorical_attributes, continuous_attributes): - for node_data in multidigraph_node_data_iterator(graph): - typ = node_data['type'] - - if categorical_attributes is not None and typ in categorical_attributes.keys(): - # Add the integer value of the category for each categorical attribute instance - category_values = categorical_attributes[typ] - node_data['encoded_value'] = category_values.index(node_data['value']) - - elif continuous_attributes is not None and typ in continuous_attributes.keys(): - min_val, max_val = continuous_attributes[typ] - node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val) - - else: - node_data['encoded_value'] = 0 - for edge_data in multidigraph_edge_data_iterator(graph): - edge_data['encoded_value'] = 0 - - return graph - - -def encode_types(graph, iterator_func, types): - """ - Encodes the type found in graph data as an integer according to the index it is found in `all_types` - Args: - graph: The graph to encode - iterator_func: An function to create an iterator of data in the graph (node data, edge data or combined node and edge data) - types: The full list of types to be encoded in this order - - Returns: - The graph, which is also is updated in-place - - """ - iterator = iterator_func(graph) - - for data in iterator: - data['categorical_type'] = types.index(data['type']) - - return graph - - -def create_input_graph(graph): - input_graph = graph.copy() - - for data in multidigraph_data_iterator(input_graph): - if data["solution"] == 0: - preexists = 1 - else: - preexists = 0 - - features = stack_features([preexists, data["categorical_type"], data["encoded_value"]]) - data.clear() - data["features"] = features - 
- input_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32) - return input_graph - - -def create_target_graph(graph): - target_graph = graph.copy() - solution_one_hot_encoding = np.array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], dtype=np.float32) - - for data in multidigraph_data_iterator(target_graph): - features = solution_one_hot_encoding[data["solution"]] - data.clear() - data["features"] = features - - target_graph.graph["features"] = np.array([0.0] * 5, dtype=np.float32) - return target_graph - - -def stack_features(features): - """ - Stacks features together into a single vector - - Args: - features: iterable of features, features can be a single value or iterable - - Returns: - Numpy array (vector) of stacked features - - """ - - return np.hstack([np.array(feature, dtype=np.float32) for feature in features]) diff --git a/kglib/kgcn_data_loader/encoding/standard_encode_test.py b/kglib/kgcn_data_loader/encoding/standard_encode_test.py deleted file mode 100644 index 87b77fa6..00000000 --- a/kglib/kgcn_data_loader/encoding/standard_encode_test.py +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import unittest - -import numpy as np - -from kglib.kgcn_data_loader.encoding.standard_encode import stack_features - - -class TestAugmentDataFields(unittest.TestCase): - - def test_numpy_fields_augmented_as_expected(self): - features = [np.array([0, 1, 0]), np.array([5])] - - stacked = stack_features(features) - - expected = np.array([0, 1, 0, 5]) - - np.testing.assert_equal(expected, stacked) - - def test_augmenting_non_numpy_numeric(self): - data = [np.array([0, 1, 0]), 5] - - stacked = stack_features(data) - - expected = np.array([0, 1, 0, 5]) - - np.testing.assert_equal(stacked, expected) - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/kgcn_data_loader/transform/standard_kgcn_transform.py b/kglib/kgcn_data_loader/transform/standard_kgcn_transform.py deleted file mode 100644 index 255673a8..00000000 --- a/kglib/kgcn_data_loader/transform/standard_kgcn_transform.py +++ /dev/null @@ -1,139 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import networkx as nx -from kglib.utils.graph.iterate import ( - multidigraph_node_data_iterator, - multidigraph_data_iterator, - multidigraph_edge_data_iterator, -) - -from kglib.kgcn_data_loader.encoding.standard_encode import encode_types, encode_values, stack_features -from kglib.kgcn_data_loader.utils import duplicate_edges_in_reverse - - -class StandardKGCNNetworkxTransform: - """Transform of the networkx graph as it comes out of TypeDB - to a networkx graph that pytorch geometric likes to ingest. - Now this is very much geared to pytorch geometric especially - because I set the attribute names to things like "x" and - "edge_attr" which are the standard names in pytorch geometric. - - One thing I encountered when trying to load the graph form the - original kglib example directly in pytorch geometric is that - since in the original example the feature vector on the nodes - and the edges were both called "features", the stock function - from pytorch geometric: torch_geometric.utils.from_networkx - does not deal with this well (it ends up overwriting node - features with edge features). 
- - :arg graph: networkx graph object - :returns: networkx graph object - - """ - - def __init__( - self, - node_types, - edge_types, - target_name, - obfuscate=None, - categorical=None, - continuous=None, - duplicate_in_reverse=True, - label_attribute="concept", - ): - self.node_types = node_types - self.edge_types = edge_types - self.target_name = target_name - self.obfuscate = obfuscate or {} - self.categorical = categorical or {} - self.continuous = continuous or {} - self.duplicate = duplicate_in_reverse - self.label_attribute = label_attribute - - def __call__(self, graph): - if self.obfuscate: - obfuscate_labels(graph, self.obfuscate) - # Encode attribute values as number - graph = encode_values(graph, self.categorical, self.continuous) - graph = nx.convert_node_labels_to_integers( - graph, label_attribute=self.label_attribute - ) - if self.duplicate: - graph = duplicate_edges_in_reverse(graph) - # Node or Edge Type as int - graph = encode_types(graph, multidigraph_node_data_iterator, self.node_types) - graph = encode_types(graph, multidigraph_edge_data_iterator, self.edge_types) - - for data in multidigraph_node_data_iterator(graph): - features = create_feature_vector(data) - target = data[self.target_name] - data.clear() - data["x"] = features - data["y"] = target - - for data in multidigraph_edge_data_iterator(graph): - features = create_feature_vector(data) - target = data[self.target_name] - data.clear() - data["edge_attr"] = features - data["y_edge"] = target - - return graph - - -def create_feature_vector(node_or_edge_data_dict): - """Make a feature 3-dimensional feature vector, - - Factored out of kglib.kgcn_tensorflow.pipeline.encode.create_input_graph. - - Args: - node_or_edge_dict: the dict coming describing a node or edge - obtained from an element of graph.nodes(data=True) or graph.edges(data=True) - of a networkx graph. 
- - Returns: - Numpy array (vector) of stacked features - - """ - if node_or_edge_data_dict["solution"] == -1: - preexists = 1 - else: - preexists = 0 - features = stack_features( - [ - preexists, - node_or_edge_data_dict["categorical_type"], - node_or_edge_data_dict["encoded_value"], - ] - ) - return features - - -def obfuscate_labels(graph, types_and_roles_to_obfuscate): - """Taken directly from diagnosis.py from the kglib example""" - # Remove label leakage - change type labels that indicate candidates into non-candidates - for data in multidigraph_data_iterator(graph): - for label_to_obfuscate, with_label in types_and_roles_to_obfuscate.items(): - if data["type"] == label_to_obfuscate: - data.update(type=with_label) - break diff --git a/kglib/kgcn_data_loader/utils.py b/kglib/kgcn_data_loader/utils.py deleted file mode 100644 index 63561783..00000000 --- a/kglib/kgcn_data_loader/utils.py +++ /dev/null @@ -1,126 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import subprocess as sp -from typing import List - -from typedb.client import * - -from kglib.utils.typedb.type.type import get_thing_types, get_role_types - - -def load_typeql_schema_file(database, typedb_binary_location, typeql_file_path): - """Load a schema from a file""" - _load_typeql_file(database, typeql_file_path, typedb_binary_location, "schema") - - -def load_typeql_data_file(database, typedb_binary_location, typeql_file_path): - """Load data from a file""" - _load_typeql_file(database, typeql_file_path, typedb_binary_location, "data") - - -def _load_typeql_file(database, typeql_file_path, typedb_binary_location, schema_or_data): - sp.check_call([ - 'typedb', - 'console', - f'--command=transaction {database} {schema_or_data} write', - f'--command=source {typeql_file_path}', - f'--command=commit' - ], cwd=typedb_binary_location) - - -def duplicate_edges_in_reverse(graph): - """ - Takes in a directed multi graph, and creates duplicates of all edges, the duplicates having reversed direction to - the originals. This is useful since directed edges constrain the direction of messages passed. We want to permit - omni-directional message passing. - Args: - graph: The graph - - Returns: - The graph with duplicated edges, reversed, with all original edge properties attached to the duplicates - """ - for sender, receiver, keys, data in graph.edges(data=True, keys=True): - graph.add_edge(receiver, sender, keys, **data) - return graph - - -def apply_logits_to_graphs(graph, logits_graph): - """ - Take in a graph that describes the logits of the graph of interest, and store those logits on the graph as the - property 'logits'. 
The graphs must correspond with one another - - Args: - graph: Graph to apply logits to - logits_graph: Graph containing logits - - Returns: - graph with logits added as property 'logits' - """ - - for node, data in logits_graph.nodes(data=True): - graph.nodes[node]['logits'] = list(data['features']) - - # TODO This is the desired implementation, but the graphs are altered by the model to have duplicated reversed - # edges, so this won't work for now - # for sender, receiver, keys, data in logit_graph.edges(keys=True, data=True): - # graph.edges[sender, receiver, keys]['logits'] = list(data['features']) - - for sender, receiver, keys, data in graph.edges(keys=True, data=True): - data['logits'] = list(logits_graph.edges[sender, receiver, keys]['features']) - - return graph - - -def get_node_types_for_training(session: TypeDBSession, types_to_ignore: List[str]) -> List[str]: - """ - Takes in a list of node types to ignore and returns all node types in schema that are not to be ignored. - - Args: - session: TypeDB rpc session of type SessionType.DATA - types_to_ignore: list of strings of schema type labels - - Returns: - list of strings of schema type labels to include in training - """ - with session.transaction(TransactionType.READ) as tx: - node_types = get_thing_types(tx) - [node_types.remove(el) for el in types_to_ignore] - print(f"Found node types: {node_types}") - return node_types - - -def get_edge_types_for_training(session: TypeDBSession, roles_to_ignore: List[str]) -> List[str]: - """ - Takes in a list of role types to ignore and returns all role types in schema that are not to be ignored. 
- - Args: - session: TypeDB rpc session of type SessionType.DATA - roles_to_ignore: list of strings of role type labels - - Returns: - list of strings of role type labels to include in training - """ - with session.transaction(TransactionType.READ) as tx: - edge_types = get_role_types(tx) - [edge_types.remove(el) for el in roles_to_ignore] - print(f"Found edge types: {edge_types}") - return edge_types diff --git a/kglib/kgcn_data_loader/utils_test.py b/kglib/kgcn_data_loader/utils_test.py deleted file mode 100644 index f5ec5cf8..00000000 --- a/kglib/kgcn_data_loader/utils_test.py +++ /dev/null @@ -1,95 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import unittest - -import networkx as nx -import numpy as np - -from kglib.utils.typedb.object.thing import Thing -from kglib.utils.graph.test.case import GraphTestCase - -from kglib.kgcn_data_loader.utils import duplicate_edges_in_reverse, apply_logits_to_graphs - - -class TestDuplicateEdgesInReverse(GraphTestCase): - - def test_edges_are_duplicated_as_expected(self): - graph = nx.MultiDiGraph(name=0) - - p0 = Thing('V123', 'person', 'entity') - p1 = Thing('V456', 'person', 'entity') - par0 = Thing('V789', 'parentship', 'relation') - - # people - graph.add_node(p0, type='person', solution=1) - graph.add_node(p1, type='person', solution=1) - - # parentships - graph.add_node(par0, type='parentship', solution=1) - graph.add_edge(par0, p0, type='parent', solution=1) - graph.add_edge(par0, p1, type='child', solution=1) - - duplicate_edges_in_reverse(graph) - - expected_graph = nx.MultiDiGraph(name=0) - - # people - expected_graph.add_node(p0, type='person', solution=1) - expected_graph.add_node(p1, type='person', solution=1) - - # parentships - expected_graph.add_node(par0, type='parentship', solution=1) - expected_graph.add_edge(par0, p0, type='parent', solution=1) - expected_graph.add_edge(par0, p1, type='child', solution=1) - - # Duplicates - expected_graph.add_edge(p0, par0, type='parent', solution=1) - expected_graph.add_edge(p1, par0, type='child', solution=1) - self.assertGraphsEqual(expected_graph, graph) - - -class TestApplyLogitsToGraphs(GraphTestCase): - def test_logits_applied_as_expected(self): - - graph = nx.MultiDiGraph(name=0) - graph.add_node(0) - graph.add_node(1) - graph.add_edge(0, 1) - - logits_graph = nx.MultiDiGraph(name=0) - logits_graph.add_node(0, features=np.array([0.2, 0.3, 0.01])) - logits_graph.add_node(1, features=np.array([0.56, -0.04, 0.05])) - logits_graph.add_edge(0, 1, features=np.array([0.5, 0.008, -0.1])) - logits_graph.add_edge(1, 0, features=np.array([0.5, 0.008, -0.1])) - - expected_graph = nx.MultiDiGraph(name=0) - 
expected_graph.add_node(0, logits=[0.2, 0.3, 0.01]) - expected_graph.add_node(1, logits=[0.56, -0.04, 0.05]) - expected_graph.add_edge(0, 1, logits=[0.5, 0.008, -0.1]) - - graph_with_logits = apply_logits_to_graphs(graph, logits_graph) - - self.assertGraphsEqual(expected_graph, graph_with_logits) - - -if __name__ == '__main__': - unittest.main() diff --git a/kglib/kgcn_tensorflow/.images/basic_schema.png b/kglib/kgcn_tensorflow/.images/basic_schema.png deleted file mode 100644 index 3d3eca09..00000000 Binary files a/kglib/kgcn_tensorflow/.images/basic_schema.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/basic_schema.svg b/kglib/kgcn_tensorflow/.images/basic_schema.svg deleted file mode 100644 index f88f4f61..00000000 --- a/kglib/kgcn_tensorflow/.images/basic_schema.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
diagnosed-disease
diagnosed-disease
patient
patient
has
has
doctor
doctor
diagnosis
[Not supported by viewer]
person
[Not supported by viewer]
disease
[Not supported by viewer]
name
"Diabetes Type II"
[Not supported by viewer]
diff --git a/kglib/kgcn_tensorflow/.images/edge_update.png b/kglib/kgcn_tensorflow/.images/edge_update.png deleted file mode 100644 index 0006a78c..00000000 Binary files a/kglib/kgcn_tensorflow/.images/edge_update.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/edge_update.svg b/kglib/kgcn_tensorflow/.images/edge_update.svg deleted file mode 100644 index 51c2f43e..00000000 --- a/kglib/kgcn_tensorflow/.images/edge_update.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
diagnosed-disease
diagnosed-disease
patient
patient
has
has
doctor
doctor
diagnosis
[Not supported by viewer]
person
[Not supported by viewer]
disease
[Not supported by viewer]
person
[Not supported by viewer]
name
"Diabetes Type II"
[Not supported by viewer]
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/edge_update_process.png b/kglib/kgcn_tensorflow/.images/edge_update_process.png deleted file mode 100644 index 73347322..00000000 Binary files a/kglib/kgcn_tensorflow/.images/edge_update_process.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/interaction_edge_update.png b/kglib/kgcn_tensorflow/.images/interaction_edge_update.png deleted file mode 100644 index e5a3d93f..00000000 Binary files a/kglib/kgcn_tensorflow/.images/interaction_edge_update.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/interaction_edge_update.svg b/kglib/kgcn_tensorflow/.images/interaction_edge_update.svg deleted file mode 100644 index 4d3710e7..00000000 --- a/kglib/kgcn_tensorflow/.images/interaction_edge_update.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
effect
effect
cause
cause
has
has
has
has
activity
[Not supported by viewer]
chemical
[Not supported by viewer]
gene
[Not supported by viewer]
phosphorylation
[Not supported by viewer]
gene
[Not supported by viewer]
cause
cause
effect
effect
degree
"decreases"
[Not supported by viewer]
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/interaction_node_update.png b/kglib/kgcn_tensorflow/.images/interaction_node_update.png deleted file mode 100644 index 1b77bc1c..00000000 Binary files a/kglib/kgcn_tensorflow/.images/interaction_node_update.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/interaction_node_update.svg b/kglib/kgcn_tensorflow/.images/interaction_node_update.svg deleted file mode 100644 index 0ada519c..00000000 --- a/kglib/kgcn_tensorflow/.images/interaction_node_update.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
effect
effect
cause
cause
has
has
has
has
activity
[Not supported by viewer]
chemical
[Not supported by viewer]
gene
[Not supported by viewer]
phosphorylation
[Not supported by viewer]
gene
[Not supported by viewer]
cause
cause
effect
effect
degree
"decreases"
[Not supported by viewer]
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/learning_pipeline.png b/kglib/kgcn_tensorflow/.images/learning_pipeline.png deleted file mode 100644 index d5988292..00000000 Binary files a/kglib/kgcn_tensorflow/.images/learning_pipeline.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/learning_pipeline.svg b/kglib/kgcn_tensorflow/.images/learning_pipeline.svg deleted file mode 100644 index 201d0c86..00000000 --- a/kglib/kgcn_tensorflow/.images/learning_pipeline.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
disease
[Not supported by viewer]
causality
[Not supported by viewer]
name
"flu"
[Not supported by viewer]
cause
[Not supported by viewer]
has
[Not supported by viewer]
TypeDB Knowledge Graph
[Not supported by viewer]
Encoded Graph
[Not supported by viewer]
Embedded Graph
[Not supported by viewer]
Encoding
Encoding
Embedding
Embedding
Message Passing for N steps
Message Passing for N steps
Graph Node and Edge
Feature Updates
[Not supported by viewer]
Decoding
Decoding
Decoded Graph
Decoded Graph
Feed to KGCN (in TensorFlow)
Feed to KGCN (in TensorFlow)
KGCN Output
KGCN Output
KGCN
[Not supported by viewer]
disease
-
[Not supported by viewer]
causality
-
[Not supported by viewer]
cause
-
[Not supported by viewer]
has
-
[Not supported by viewer]
name
"flu"
[Not supported by viewer]
In-Memory Graph
[Not supported by viewer]
NetworkX
NetworkX
NetworkX
NetworkX
Queries to Graph
Queries to Graph
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/message_passing.png b/kglib/kgcn_tensorflow/.images/message_passing.png deleted file mode 100644 index d3cbe83e..00000000 Binary files a/kglib/kgcn_tensorflow/.images/message_passing.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/node_update.png b/kglib/kgcn_tensorflow/.images/node_update.png deleted file mode 100644 index 73689622..00000000 Binary files a/kglib/kgcn_tensorflow/.images/node_update.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/node_update.svg b/kglib/kgcn_tensorflow/.images/node_update.svg deleted file mode 100644 index aa04db8d..00000000 --- a/kglib/kgcn_tensorflow/.images/node_update.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
diagnosed-disease
diagnosed-disease
patient
patient
has
has
doctor
doctor
diagnosis
[Not supported by viewer]
person
[Not supported by viewer]
disease
[Not supported by viewer]
person
[Not supported by viewer]
name
"Diabetes Type II"
[Not supported by viewer]
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/node_update_process.png b/kglib/kgcn_tensorflow/.images/node_update_process.png deleted file mode 100644 index 5294ed86..00000000 Binary files a/kglib/kgcn_tensorflow/.images/node_update_process.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/reasoning.png b/kglib/kgcn_tensorflow/.images/reasoning.png deleted file mode 100644 index 92d305c8..00000000 Binary files a/kglib/kgcn_tensorflow/.images/reasoning.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/reasoning.svg b/kglib/kgcn_tensorflow/.images/reasoning.svg deleted file mode 100644 index 8a3648fd..00000000 --- a/kglib/kgcn_tensorflow/.images/reasoning.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
diagnosed-disease
diagnosed-disease
patient
patient
has
has
doctor
doctor
diagnosis
[Not supported by viewer]
person
[Not supported by viewer]
disease
[Not supported by viewer]
person
[Not supported by viewer]
name
"Diabetes Type II"
[Not supported by viewer]
diagnosed-disease
diagnosed-disease
patient
patient
diagnosis
[Not supported by viewer]
parent
parent
child
child
parentship
[Not supported by viewer]
person
[Not supported by viewer]
risked-disease
risked-disease
person-at-risk
person-at-risk
hereditary-
risk-factor
[Not supported by viewer]
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/subgraph.png b/kglib/kgcn_tensorflow/.images/subgraph.png deleted file mode 100644 index 38262b4a..00000000 Binary files a/kglib/kgcn_tensorflow/.images/subgraph.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/subgraph.svg b/kglib/kgcn_tensorflow/.images/subgraph.svg deleted file mode 100644 index f5ee96d3..00000000 --- a/kglib/kgcn_tensorflow/.images/subgraph.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
candidate-diagnosis
[Not supported by viewer]
disease
[Not supported by viewer]
symptom
[Not supported by viewer]
symptom-presentation
[Not supported by viewer]
person
[Not supported by viewer]
causality
[Not supported by viewer]
disease
[Not supported by viewer]
name
"flu"
[Not supported by viewer]
name
"meningitis"
[Not supported by viewer]
name
"fever"
[Not supported by viewer]
causality
[Not supported by viewer]
diagnosis
[Not supported by viewer]
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/subgraph_with_predictions.png b/kglib/kgcn_tensorflow/.images/subgraph_with_predictions.png deleted file mode 100644 index 4cc07913..00000000 Binary files a/kglib/kgcn_tensorflow/.images/subgraph_with_predictions.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/subgraph_with_predictions.svg b/kglib/kgcn_tensorflow/.images/subgraph_with_predictions.svg deleted file mode 100644 index a4b84c5f..00000000 --- a/kglib/kgcn_tensorflow/.images/subgraph_with_predictions.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -

<span style="font-size: 18px"><br></span>

<span style="font-size: 18px"><br></span>

<span style="font-size: 18px"><br></span>
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/.images/ternary_diagnosis.png b/kglib/kgcn_tensorflow/.images/ternary_diagnosis.png deleted file mode 100644 index 76a8873c..00000000 Binary files a/kglib/kgcn_tensorflow/.images/ternary_diagnosis.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/.images/ternary_diagnosis.svg b/kglib/kgcn_tensorflow/.images/ternary_diagnosis.svg deleted file mode 100644 index 51756c40..00000000 --- a/kglib/kgcn_tensorflow/.images/ternary_diagnosis.svg +++ /dev/null @@ -1,3 +0,0 @@ - - -
diagnosed-disease
diagnosed-disease
patient
patient
has
has
doctor
doctor
diagnosis
[Not supported by viewer]
person
[Not supported by viewer]
disease
[Not supported by viewer]
person
[Not supported by viewer]
name
"Diabetes Type II"
[Not supported by viewer]
\ No newline at end of file diff --git a/kglib/kgcn_tensorflow/README.md b/kglib/kgcn_tensorflow/README.md deleted file mode 100644 index 2447e290..00000000 --- a/kglib/kgcn_tensorflow/README.md +++ /dev/null @@ -1,153 +0,0 @@ -# KGCNs - Knowledge Graph Convolutional Networks - -This project introduces a novel model: the *Knowledge Graph Convolutional Network* (KGCN). - -### Getting Started - Running the Machine Learning Pipeline - -**Requirements** - -- Python >= 3.6 - -- KGLIB installed via pip: `pip install typedb-kglib`. - -- [TypeDB 2.1.1](https://github.com/vaticle/typedb/releases) running in the background - -- the Python TypeDB client 2.1.0 ([PyPi](https://pypi.org/project/typedb-client/), [GitHub release](https://github.com/vaticle/typedb-client-python/releases)) - -See the [full example](https://github.com/vaticle/kglib/tree/master/kglib/kgcn/examples/diagnosis/diagnosis.py) for how to use a KGCN for [Relation](https://docs.vaticle.com/docs/schema/concepts#relation) prediction. You can use the example as a template to create a KGCN for your own TypeDB data. If you need to customise the learning or model used, you'll need to make changes to your version of the [pipeline](https://github.com/vaticle/kglib/tree/master/kglib/kgcn/pipeline/pipeline.py). - -## How Do We Use Machine Learning over a Knowledge Graph? - -### Relation Prediction - -This KGCN framework is designed to provide a versatile means to perform learning tasks over a knowledge graph in TypeDB. - -Included in the [latest release](https://github.com/vaticle/kglib/releases/latest): - -- Predicting the existence of new [Relations](https://docs.vaticle.com/docs/schema/concepts#relation) between existing [Concepts](https://docs.vaticle.com/docs/concept-api/overview). These relations can be binary, **ternary** (3-way) or [**N-ary**]() (N-way), since Relations in TypeDB are graph [Hyperedges](https://en.wikipedia.org/wiki/Glossary_of_graph_theory_terms#hyperedge). 
- -Understand the full capabilities of KGCNs by examining the methodology outlined below. - -### Supervised Knowledge Graph Machine Learning - -We approach learning over a Knowledge Graph just as we do classical supervised learning. We learn from a ground truth set of training examples, but in this case each example is a subgraph. - -We extract these subgraphs from a TypeDB Knowledge Graph. Extracting subgraphs is performed by making Graql queries to TypeDB (multiple queries per example). - -![Knowledge Graph Machine Learning](.images/knowledge_graph_machine_learning.png) - -We then encode these graphs and feed them to the KGCN. As an output we receive the same graph with predicted node property values. Those predicted properties can be used to regress or classify the Concepts of our subgraphs as we see fit. - -Using this method we can frame Relation prediction as a node existence classification task. - -### Graphs In, Graphs Out - -We can directly ingest a graph into TensorFlow and learn over that graph. This leverages DeepMind's [Graph Nets](https://github.com/deepmind/graph_nets) framework, detailed in [their paper](https://arxiv.org/abs/1806.01261) (built in TensorFlow). This work is a generalisation of graph learning techniques, which offers plenty of ways to structure learning tailored to various knowledge graph problems. - -We extend this work for knowledge graphs in TypeDB, with a graph data flow as follows: - -![Pipeline](.images/learning_pipeline.png) - - - -## How Does Message Passing Work? - -Message passing is an iterative process. On each `superstep` messages are passed simultaneously (such that they don't influence each other) between elements of the graph. These messages are then used to update the state of graph elements. - -A KGCN is a learned graph message-passing algorithm. Neural network components transform the messages that are passed around the graph. This transformation is learned in order to pass useful updates around the graph. 
- -In our case, the algorithm looks like this: - -``` -for step in supersteps: - update all edges; - update all nodes; -``` - -![Message Passing](.images/message_passing.png) - -### Edge Block - -Use as input the current edge features and the features of the nodes it connects. Update the edge's features as the output of some neural network layers. Do this for all edges. - -![Edge Update](.images/edge_update.png) - -### Node Block - -Use as input the node's features and the most up-to-date features of the edges that connect to it. Update the node's features as the output of some neural network layers. Do this for all nodes. - -![Node Update](.images/node_update.png) - -### How is this Convolutional? - -This approach is described as convolutional since the same transformations are re-used across the graph. It may help your understanding to analogise this to convolution over images, where the same transformation is applied over all pixel neighbourhoods. - -## How Do We Frame Relation Prediction? - -In a typical use case, we have a specific Relation Type, `T`, that we want to predict. We want to predict the existence of `T` Relations based on the context that surrounds them in the graph. Our supervised learning approach requires ground truth examples. - -### Creating Ground Truth Examples - -Our approach is to extract subgraphs from a TypeDB knowledge graph to use as ground truth examples. - -Clearly, `T` Relations that are present are treated as fact, and given positive target labels. However, we must also consider *negative* examples of these `T` Relations. - -In our ground truth example subgraphs, we apply a [closed-world assumption](https://en.wikipedia.org/wiki/Closed-world_assumption). This means that for concepts `$a1`, `$a2`, ... ,`$aN` if a Relation does not exist in `($a1, $a2, ... ,$aN)`, this indicates that there is no such Relation in `($a1, $a2, ... ,$aN)`. 
- -Following this closed-world assumption, we use the absence of a `T` Relation as a negative target. - -Note that under an [open-world assumption](https://en.wikipedia.org/wiki/Open-world_assumption) a Relation in `($a1, $a2, ... ,$aN)` could exist but also be absent from the graph. - -### How Do We Represent Negative Examples? - -The KGCN needs to learn by example where it should predict new `T` Relations. Therefore, the learner needs to see all logically possible `T` Relations as candidates. - -To achieve this, wherever in the subgraph a `T` Relation *could* exist, but does not, we create one (see below), giving the new Relation a negative target label. - -The learner's job is then to classify those candidates to indicate their likelihood of true existence. - -Due to TypeDB's enforced schema, `T` Relations can logically only occur between certain Roleplayers. This means that the candidates to be added should be sparse - we don't see the combinatorial explosion of candidates that we would see in a homogenous subgraph. - -### Adding Negative Relations Dynamically - -Naturally, we don't wish to pollute our Knowledge Graph by inserting these `T` Relation candidates. Instead, we can make use of TypeDB's reasoning engine here by defining a logical [Rule](http://docs.vaticle.com/docs/schema/rules) to dynamically create these candidates (see the rule in the [example schema](../utils/typedb/synthetic/examples/diagnosis/schema.tql)). After training our learner we can simply `undefine` the rule to return to an unpolluted state. - -## Architectural Components - -Here we identify the core components used to build a working KGCN pipeline. - -### Application - -e.g. [diagnosis example](https://github.com/vaticle/kglib/tree/master/kglib/kgcn/examples/diagnosis) - -1. 
Fetch subgraphs, each subgraph is used as an *example* - - This requires specifying queries that will retrieve Concepts from TypeDB - - The answers from these queries are used to create subgraphs, stored in-memory as networkx graphs -2. Find the Types and Roles present in the schema. If any are not needed for learning then they should be excluded from the exhaustive list for better accuracy. -3. Run the pipeline -4. Write the predictions made to TypeDB - -### Pipeline - -Can be customised from [pipeline](https://github.com/vaticle/kglib/tree/master/kglib/kgcn/pipeline/pipeline.py). A pipeline performs the following: - -1. Take in graphs and the training/generalistaion split -2. Encode graph values (including type information) into numerics -3. Perform prerequisite graph formatting and manipulation -4. Build the neural network model, providing embedding models for Attribute Types -5. Run the learning process -6. Create visualisations -7. Record the predictions made, and return them in graphs - -### KGCNLearner -Found in [learn.py](https://github.com/vaticle/kglib/tree/master/kglib/kgcn/learn/learn.py). -- Performs the training loop -- Manages the loss function and optimiser -- Manages the TensorFlow session -- Prints results for the training and generalistaion datasets during training - -### KGCN - -Found in [core.py](https://github.com/vaticle/kglib/tree/master/kglib/kgcn/models/core.py). - -Defines the computation graph for a KGCN, including the initial embedding of values and the edge/node/graph feature update strategy during message-passing. This is the core that depends upon [Graph Nets](https://github.com/deepmind/graph_nets). 
diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/.images/BUILD b/kglib/kgcn_tensorflow/examples/diagnosis/.images/BUILD deleted file mode 100644 index 3c5c660f..00000000 --- a/kglib/kgcn_tensorflow/examples/diagnosis/.images/BUILD +++ /dev/null @@ -1,32 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - exclude = glob([ - "*.png", - "*.svg" - ]), - license_type = "apache-header", -) diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/.images/diagnosis_schema.png b/kglib/kgcn_tensorflow/examples/diagnosis/.images/diagnosis_schema.png deleted file mode 100644 index e8e9aefc..00000000 Binary files a/kglib/kgcn_tensorflow/examples/diagnosis/.images/diagnosis_schema.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/.images/graph.png b/kglib/kgcn_tensorflow/examples/diagnosis/.images/graph.png deleted file mode 100644 index f8e6f4ac..00000000 Binary files a/kglib/kgcn_tensorflow/examples/diagnosis/.images/graph.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/.images/learning.png b/kglib/kgcn_tensorflow/examples/diagnosis/.images/learning.png deleted file mode 100644 index eee07187..00000000 Binary files a/kglib/kgcn_tensorflow/examples/diagnosis/.images/learning.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/.images/queried_subgraph.png b/kglib/kgcn_tensorflow/examples/diagnosis/.images/queried_subgraph.png deleted file mode 100644 index dfe3adf2..00000000 Binary files a/kglib/kgcn_tensorflow/examples/diagnosis/.images/queried_subgraph.png and /dev/null differ diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/README.md b/kglib/kgcn_tensorflow/examples/diagnosis/README.md deleted file mode 100644 index a7c3c26a..00000000 --- a/kglib/kgcn_tensorflow/examples/diagnosis/README.md +++ /dev/null @@ -1,146 +0,0 @@ -# KGCN Diagnosis Example - -This example is entirely fabricated as a demonstration for how to construct a KGCN pipeline. Since the data for this example is generated synthetically, it also functions as a test platform for the KGCN model. 
- -Studying the schema for this example (using TypeDB Workbase's Schema Designer), we have people who present symptoms, with some severity. Separately, we may know that certain symptoms can be caused by a disease. We also know information that contributes to risk-factors for certain diseases. These risk factors are determined by rules defined in the schema. Lastly, people can be diagnosed with a disease. - -![Diagnosis Schema](.images/diagnosis_schema.png) - -## Running the Example - -Once you have [installed KGLIB via pip](../../#getting-started---running-the-machine-learning-pipeline) you can run the example as follows: - -1. Make sure a TypeDB server is running - -2. Run the example: `python -m kglib.kgcn_tensorflow.examples.diagnosis.diagnosis` - - The database, schema and seed data will be created automatically. Data is generated synthetically. The whole example should complete in under 10 minutes - -3. You should observe console output to indicate that the pipeline is running and that the model is learning. Afterwards two plots should be created to visualise the training process and examples of the predictions made. - -## Diagnosis Pipeline - -The process conducted by the example is as follows: - -1. Generate synthetic graphs, each graph is used as an *example* - - This requires specifying queries that will retrieve Concepts from TypeDB - - The answers from these queries are used to create subgraphs, stored in-memory as networkx graphs -2. Find the Types and Roles present in the schema. If any are not needed for learning then they should be excluded from the exhaustive list for better accuracy. -3. Run the pipeline -4. Write the predictions made to TypeDB - -## Relation Prediction - -The learner predicts three classes for each graph element. 
These are: - -``` -[ -Element already existed in the graph (we wish to ignore these elements), -Element does not exist in the graph, -Element does exist in the graph -] -``` - -In this way we perform relation prediction by proposing negative candidate relations (TypeDB's rules help us with this). Then we train the learner to classify these negative candidates as **does not exist** and the correct relations as **does exist**. - -## Results Output - -### Console Reporting - -During training, the console will output metrics for the performance on the training and test sets. - -You should see output such as this for the diagnosis example: - -``` -# (iteration number), T (elapsed seconds), Ltr (training loss), Lge (test/generalization loss), Ctr (training fraction nodes/edges labeled correctly), Str (training fraction examples solved correctly), Cge (test/generalization fraction nodes/edges labeled correctly), Sge (test/generalization fraction examples solved correctly) -# 00000, T 4.4, Ltr 0.7928, Lge 0.7518, Ctr 0.4900, Str 0.0000, Cge 0.5000, Sge 0.0000 -# 00020, T 9.8, Ltr 0.7036, Lge 0.6957, Ctr 0.5100, Str 0.0200, Cge 0.5000, Sge 0.0000 -# 00040, T 12.1, Ltr 0.5384, Lge 0.4540, Ctr 0.7900, Str 0.6100, Cge 0.8100, Sge 0.6300 -# 00060, T 14.4, Ltr 0.7434, Lge 0.3631, Ctr 0.7650, Str 0.5400, Cge 0.8850, Sge 0.7900 -# 00080, T 16.7, Ltr 0.3643, Lge 0.2464, Ctr 0.9200, Str 0.8800, Cge 0.9350, Sge 0.8900 -# 00100, T 19.0, Ltr 0.2806, Lge 0.1590, Ctr 0.9600, Str 0.9600, Cge 0.9650, Sge 0.9500 -# 00120, T 21.3, Ltr 0.5488, Lge 0.2577, Ctr 0.9100, Str 0.8400, Cge 0.9300, Sge 0.8800 -# 00140, T 23.5, Ltr 0.2913, Lge 0.2590, Ctr 0.9650, Str 0.9600, Cge 0.9200, Sge 0.8600 -# 00160, T 25.8, Ltr 0.2603, Lge 0.1476, Ctr 0.9650, Str 0.9600, Cge 0.9700, Sge 0.9600 -# 00180, T 28.1, Ltr 0.2656, Lge 0.1411, Ctr 0.9650, Str 0.9600, Cge 0.9700, Sge 0.9600 -... 
-``` - -Take note of the key: - -- \# - iteration number -- T - elapsed seconds -- Ltr - training loss -- Lge - test/generalization loss -- Ctr - training fraction nodes/edges labeled correctly -- Str - training fraction examples solved correctly -- Cge - test/generalization fraction nodes/edges labeled correctly -- Sge - test/generalization fraction examples solved correctly - -The element we are most interested in is `Sge`, the proportion of subgraphs where all elements of the subgraph were classified correctly. This therefore represents an entirely correctly predicted example. - -### Diagrams - -#### Training Metrics - -Upon running the example you will also get plots from matplotlib saved to your working directory. - -You will see plots of metrics for the training process (training iteration on the x-axis) for the training set (solid line), and test set (dotted line). From left to right: - -- The absolute loss across all of the elements in the dataset -- The fraction of all graph elements predicted correctly across the dataset -- The fraction of completely solved examples (subgraphs extracted from TypeDB that are solved in full) - -![learning metrics](.images/learning.png) - -#### Visualise the Predictions - -We also receive a plot of some of the predictions made on the test set. - -![predictions made on test set](.images/graph.png) - -**Blue box:** Ground Truth - -- Preexisting (known) graph elements are shown in blue - -- Relations and role edges that **should be predicted to exist** are shown in green - -- Candidate relations and role edges that **should not be predicted to exist** are shown faintly in red - -**Black boxes**: Model Predictions at certain message-passing steps - -This uses the same colour scheme as above, but opacity indicates a probability given by the model. - -These boxes shows the score assigned to the class **does exist**. 
- -Therefore, for good predictions we want to see no blue elements, and for the red elements to fade out as more messages are passed, the green elements becoming more certain. - -## How does Link Prediction work? - -The methodology used for Relation prediction is as follows: - -In this example, we aim to predict `diagnosis` Relations. We have the correct `diagnosis` relations, and we write a TypeDB rule to insert `candidate-diagnosis` relations as negative targets. They are added wherever a real `diagnosis` Relation could logically exist, but does not. - -We then teach the KGCN to distinguish between the positive and negative targets. - -## Querying for the Train/Test Datasets - -We do this by creating *examples*, where each example is a subgraph extracted from a TypeDB knowledge Graph. These subgraphs contain positive and negative instances of the relation to be predicted. - -A single subgraph is created by making multiple queries to TypeDB. In this example, each subgraph centres around a `person` who is uniquely identifiable. This is important, since we want the results for these queries to return information about the vacinity of an individual. That is, we want information about a subgraph rather than the whole graph. For this example you can find the queries made in [diagnosis.py](diagnosis.py). - -A single subgraph is extracted from TypeDB by making these queries and combining the results into a graph. For your own domain you should find queries that will retrieve the most relevant information for the Relations you are trying to predict. - -We can visualise such a subgraph by running these queries one after the other in TypeDB Workbase: - -![queried subgraph](.images/queried_subgraph.png) - -You can get the relevant version of TypeDB Workbase from the Assets of the [latest Workbase release](https://github.com/vaticle/workbase/releases/latest). 
- -Using Workbase like this is a great way to understand the subgraphs that are actually being delivered to the KGCN -- a great understanding and debugging tool. - -## Modifying the Example - -If you need to customise the learning or model used for your own use case, you'll need to make changes to the [pipeline](https://github.com/vaticle/kglib/tree/master/kglib/kgcn/pipeline/pipeline.py) used. - -Consider tuning parameters and adjusting elements of the pipeline if you need to improve the accuracy that you see. Start by adjusting `num_processing_steps_tr`, `num_processing_steps_ge`, `num_training_iterations`. \ No newline at end of file diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/diagnosis.py b/kglib/kgcn_tensorflow/examples/diagnosis/diagnosis.py deleted file mode 100644 index cba7af28..00000000 --- a/kglib/kgcn_tensorflow/examples/diagnosis/diagnosis.py +++ /dev/null @@ -1,346 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import inspect -import time - -from typedb.client import * - -from kglib.kgcn_data_loader.utils import load_typeql_schema_file, load_typeql_data_file -from kglib.kgcn_tensorflow.pipeline.pipeline import pipeline -from kglib.utils.graph.iterate import multidigraph_data_iterator -from kglib.utils.graph.query.query_graph import QueryGraph -from kglib.utils.graph.thing.queries_to_networkx_graph import build_graph_from_queries -from kglib.utils.typedb.synthetic.examples.diagnosis.generate import generate_example_data -from kglib.utils.typedb.type.type import get_thing_types, get_role_types - -DATABASE = "diagnosis" -ADDRESS = "localhost:1729" - -# Existing elements in the graph are those that pre-exist in the graph, and should be predicted to continue to exist -PREEXISTS = 0 - -# Candidates are neither present in the input nor in the solution, they are negative samples -CANDIDATE = 1 - -# Elements to infer are the graph elements whose existence we want to predict to be true, they are positive samples -TO_INFER = 2 - -# Categorical Attribute types and the values of their categories -CATEGORICAL_ATTRIBUTES = {'name': ['Diabetes Type II', 'Multiple Sclerosis', 'Blurred vision', 'Fatigue', 'Cigarettes', - 'Alcohol']} -# Continuous Attribute types and their min and max values -CONTINUOUS_ATTRIBUTES = {'severity': (0, 1), 'age': (7, 80), 'units-per-week': (3, 29)} - -TYPES_TO_IGNORE = ['candidate-diagnosis', 'example-id', 'probability-exists', 'probability-non-exists', 'probability-preexists'] -ROLES_TO_IGNORE = ['candidate-patient', 'candidate-diagnosed-disease'] - -# The learner should see candidate relations the same as the ground truth relations, so adjust these candidates to -# look like their ground truth counterparts -TYPES_AND_ROLES_TO_OBFUSCATE = {'candidate-diagnosis': 'diagnosis', - 'candidate-patient': 'patient', - 'candidate-diagnosed-disease': 'diagnosed-disease'} - - -def diagnosis_example(typedb_binary_directory, - num_graphs=100, - 
num_processing_steps_tr=3, - num_processing_steps_ge=3, - num_training_iterations=50, - database=DATABASE, - address=ADDRESS, - schema_file_path="kglib/utils/typedb/synthetic/examples/diagnosis/schema.tql", - seed_data_file_path="kglib/utils/typedb/synthetic/examples/diagnosis/seed_data.tql"): - """ - Run the diagnosis example from start to finish, including traceably ingesting predictions back into TypeDB - - Args: - typedb_binary_directory: Location of the typedb binary for the purpose of loading initial schema and data - num_graphs: Number of graphs to use for training and testing combined - num_processing_steps_tr: The number of message-passing steps for training - num_processing_steps_ge: The number of message-passing steps for testing - num_training_iterations: The number of training epochs - database: The name of the database to retrieve example subgraphs from - address: The address of the running TypeDB instance - schema_file_path: Path to the diagnosis schema file - seed_data_file_path: Path to the file containing seed data, that doesn't grow as synthetic data is added - - Returns: - Final accuracies for training and for testing - """ - - tr_ge_split = int(num_graphs*0.5) - - client = TypeDB.core_client(address) - if client.databases().contains(database): - raise ValueError( - f"There is already a database present with the name {database}. The Diagnosis example expects a clean DB. 
" - f"Please delete the {database} database, or use another database name") - client.databases().create(database) - - load_typeql_schema_file(database, typedb_binary_directory, schema_file_path) - load_typeql_data_file(database, typedb_binary_directory, seed_data_file_path) - generate_example_data(client, num_graphs, database=database) - - session = client.session(database, SessionType.DATA) - - print("Create concept graphs") - graphs = create_concept_graphs(list(range(num_graphs)), session, infer=True) - - with session.transaction(TransactionType.READ) as tx: - # Change the terminology here onwards from thing -> node and role -> edge - node_types = get_thing_types(tx) - [node_types.remove(el) for el in TYPES_TO_IGNORE] - - edge_types = get_role_types(tx) - [edge_types.remove(el) for el in ROLES_TO_IGNORE] - print(f'Found node types: {node_types}') - print(f'Found edge types: {edge_types}') - - ge_graphs, solveds_tr, solveds_ge = pipeline(graphs, - tr_ge_split, - node_types, - edge_types, - num_processing_steps_tr=num_processing_steps_tr, - num_processing_steps_ge=num_processing_steps_ge, - num_training_iterations=num_training_iterations, - continuous_attributes=CONTINUOUS_ATTRIBUTES, - categorical_attributes=CATEGORICAL_ATTRIBUTES, - output_dir=f"./events/{time.time()}/") - - with session.transaction(TransactionType.WRITE) as tx: - write_predictions_to_typedb(ge_graphs, tx) - - session.close() - client.close() - - return solveds_tr, solveds_ge - - -def create_concept_graphs(example_indices, typedb_session, infer = True): - """ - Builds an in-memory graph for each example, with an example_id as an anchor for each example subgraph. 
- Args: - example_indices: The values used to anchor the subgraph queries within the entire knowledge graph - typedb_session: TypeDB Session - - Returns: - In-memory graphs of TypeDB subgraphs - """ - - graphs = [] - - options = TypeDBOptions.core() - options.infer = infer - - for example_id in example_indices: - print(f'Creating graph for example {example_id}') - graph_query_handles = get_query_handles(example_id) - - with typedb_session.transaction(TransactionType.READ, options) as tx: - # Build a graph from the queries, samplers, and query graphs - graph = build_graph_from_queries(graph_query_handles, tx) - - obfuscate_labels(graph, TYPES_AND_ROLES_TO_OBFUSCATE) - - graph.name = example_id - graphs.append(graph) - - return graphs - - -def obfuscate_labels(graph, types_and_roles_to_obfuscate): - # Remove label leakage - change type labels that indicate candidates into non-candidates - for data in multidigraph_data_iterator(graph): - for label_to_obfuscate, with_label in types_and_roles_to_obfuscate.items(): - if data['type'] == label_to_obfuscate: - data.update(type=with_label) - break - - -def get_query_handles(example_id): - """ - Creates an iterable, each element containing a Graql query, a function to sample the answers, and a QueryGraph - object which must be the TypeDB graph representation of the query. 
This tuple is termed a "query_handle" - - Args: - example_id: A uniquely identifiable attribute value used to anchor the results of the queries to a specific - subgraph - - Returns: - query handles - """ - - # === Hereditary Feature === - hereditary_query = inspect.cleandoc(f'''match - $p isa person, has example-id {example_id}; - $par isa person; - $ps(child: $p, parent: $par) isa parentship; - $diag(patient:$par, diagnosed-disease: $d) isa diagnosis; - $d isa disease, has name $n; - ''') - - vars = p, par, ps, d, diag, n = 'p', 'par', 'ps', 'd', 'diag', 'n' - hereditary_query_graph = (QueryGraph() - .add_vars(vars, PREEXISTS) - .add_role_edge(ps, p, 'child', PREEXISTS) - .add_role_edge(ps, par, 'parent', PREEXISTS) - .add_role_edge(diag, par, 'patient', PREEXISTS) - .add_role_edge(diag, d, 'diagnosed-disease', PREEXISTS) - .add_has_edge(d, n, PREEXISTS)) - - # === Consumption Feature === - consumption_query = inspect.cleandoc(f'''match - $p isa person, has example-id {example_id}; - $s isa substance, has name $n; - $c(consumer: $p, consumed-substance: $s) isa consumption, - has units-per-week $u;''') - - vars = p, s, n, c, u = 'p', 's', 'n', 'c', 'u' - consumption_query_graph = (QueryGraph() - .add_vars(vars, PREEXISTS) - .add_has_edge(s, n, PREEXISTS) - .add_role_edge(c, p, 'consumer', PREEXISTS) - .add_role_edge(c, s, 'consumed-substance', PREEXISTS) - .add_has_edge(c, u, PREEXISTS)) - - # === Age Feature === - person_age_query = inspect.cleandoc(f'''match - $p isa person, has example-id {example_id}, has age $a; - ''') - - vars = p, a = 'p', 'a' - person_age_query_graph = (QueryGraph() - .add_vars(vars, PREEXISTS) - .add_has_edge(p, a, PREEXISTS)) - - # === Risk Factors Feature === - risk_factor_query = inspect.cleandoc(f'''match - $d isa disease; - $p isa person, has example-id {example_id}; - $r(person-at-risk: $p, risked-disease: $d) isa risk-factor; - ''') - - vars = p, d, r = 'p', 'd', 'r' - risk_factor_query_graph = (QueryGraph() - .add_vars(vars, 
PREEXISTS) - .add_role_edge(r, p, 'person-at-risk', PREEXISTS) - .add_role_edge(r, d, 'risked-disease', PREEXISTS)) - - # === Symptom === - vars = p, s, sn, d, dn, sp, sev, c = 'p', 's', 'sn', 'd', 'dn', 'sp', 'sev', 'c' - - symptom_query = inspect.cleandoc(f'''match - $p isa person, has example-id {example_id}; - $s isa symptom, has name $sn; - $d isa disease, has name $dn; - $sp(presented-symptom: $s, symptomatic-patient: $p) isa symptom-presentation, has severity $sev; - $c(cause: $d, effect: $s) isa causality; - ''') - - symptom_query_graph = (QueryGraph() - .add_vars(vars, PREEXISTS) - .add_has_edge(s, sn, PREEXISTS) - .add_has_edge(d, dn, PREEXISTS) - .add_role_edge(sp, s, 'presented-symptom', PREEXISTS) - .add_has_edge(sp, sev, PREEXISTS) - .add_role_edge(sp, p, 'symptomatic-patient', PREEXISTS) - .add_role_edge(c, s, 'effect', PREEXISTS) - .add_role_edge(c, d, 'cause', PREEXISTS)) - - # === Diagnosis === - - diag, d, p, dn = 'diag', 'd', 'p', 'dn' - - diagnosis_query = inspect.cleandoc(f'''match - $p isa person, has example-id {example_id}; - $d isa disease, has name $dn; - $diag(patient: $p, diagnosed-disease: $d) isa diagnosis; - ''') - - diagnosis_query_graph = (QueryGraph() - .add_vars([diag], TO_INFER) - .add_vars([d, p, dn], PREEXISTS) - .add_role_edge(diag, d, 'diagnosed-disease', TO_INFER) - .add_role_edge(diag, p, 'patient', TO_INFER)) - - # === Candidate Diagnosis === - candidate_diagnosis_query = inspect.cleandoc(f'''match - $p isa person, has example-id {example_id}; - $d isa disease, has name $dn; - $diag(candidate-patient: $p, candidate-diagnosed-disease: $d) isa candidate-diagnosis; - ''') - - candidate_diagnosis_query_graph = (QueryGraph() - .add_vars([diag], CANDIDATE) - .add_vars([d, p, dn], PREEXISTS) - .add_role_edge(diag, d, 'candidate-diagnosed-disease', CANDIDATE) - .add_role_edge(diag, p, 'candidate-patient', CANDIDATE)) - - return [ - (symptom_query, lambda x: x, symptom_query_graph), - (diagnosis_query, lambda x: x, 
diagnosis_query_graph), - (candidate_diagnosis_query, lambda x: x, candidate_diagnosis_query_graph), - (risk_factor_query, lambda x: x, risk_factor_query_graph), - (person_age_query, lambda x: x, person_age_query_graph), - (consumption_query, lambda x: x, consumption_query_graph), - (hereditary_query, lambda x: x, hereditary_query_graph) - ] - - -def write_predictions_to_typedb(graphs, tx): - """ - Take predictions from the ML model, and insert representations of those predictions back into the graph. - - Args: - graphs: graphs containing the concepts, with their class predictions and class probabilities - tx: TypeDB write transaction to use - - Returns: None - - """ - for graph in graphs: - for node, data in graph.nodes(data=True): - if data['prediction'] == 2: - concept = data['concept'] - concept_type = concept.type_label - if concept_type == 'diagnosis' or concept_type == 'candidate-diagnosis': - neighbours = graph.neighbors(node) - - for neighbour in neighbours: - concept = graph.nodes[neighbour]['concept'] - if concept.type_label == 'person': - person = concept - else: - disease = concept - - p = data['probabilities'] - query = (f'match ' - f'$p iid {person.iid};' - f'$d iid {disease.iid};' - f'$kgcn isa kgcn;' - f'insert ' - f'$pd(patient: $p, diagnosed-disease: $d, diagnoser: $kgcn) isa diagnosis,' - f'has probability-exists {p[2]:.3f},' - f'has probability-non-exists {p[1]:.3f},' - f'has probability-preexists {p[0]:.3f};') - tx.query().insert(query) - tx.commit() diff --git a/kglib/kgcn_tensorflow/examples/diagnosis/diagnosis_test.py b/kglib/kgcn_tensorflow/examples/diagnosis/diagnosis_test.py deleted file mode 100644 index ea337e3a..00000000 --- a/kglib/kgcn_tensorflow/examples/diagnosis/diagnosis_test.py +++ /dev/null @@ -1,126 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest -from unittest.mock import MagicMock - -from typedb.api.query.query_manager import QueryManager -from typedb.client import * -import networkx as nx -import numpy as np - -from kglib.kgcn_tensorflow.examples.diagnosis.diagnosis import write_predictions_to_typedb, obfuscate_labels -from kglib.utils.typedb.object.thing import Thing -from kglib.utils.graph.test.case import GraphTestCase - - -class TestWritePredictionsToTypeDB(unittest.TestCase): - def test_query_made_as_expected(self): - graph = nx.MultiDiGraph() - - graph.add_node(0, concept=Thing('V123', 'person', 'entity'), probabilities=np.array([1.0, 0.0, 0.0]), - prediction=0) - graph.add_node(1, concept=Thing('V1235', 'disease', 'entity'), probabilities=np.array([1.0, 0.0, 0.0]), - prediction=0) - graph.add_node(2, concept=Thing('V6543', 'diagnosis', 'relation'), probabilities=np.array([0.0, 0.0071, 0.9927]), - prediction=2) - - graph.add_edge(2, 0) - graph.add_edge(2, 1) - - graphs = [graph] - tx = MagicMock(TypeDBTransaction) - - tx.commit = MagicMock() - tx.query.return_value = query = MagicMock(QueryManager) - - write_predictions_to_typedb(graphs, tx) - - expected_query = (f'match ' - f'$p iid V123;' - f'$d iid V1235;' - f'$kgcn isa kgcn;' - f'insert ' - f'$pd(patient: $p, diagnosed-disease: $d, diagnoser: 
$kgcn) isa diagnosis,' - f'has probability-exists 0.993,' - f'has probability-non-exists 0.007,' - f'has probability-preexists 0.000;') - - query.insert.assert_called_with(expected_query) - - tx.commit.assert_called() - - def test_query_made_only_if_relation_wins(self): - graph = nx.MultiDiGraph() - - graph.add_node(0, concept=Thing('V123', 'person', 'entity'), - probabilities=np.array([1.0, 0.0, 0.0]), prediction=0) - graph.add_node(1, concept=Thing('V1235', 'disease', 'entity'), - probabilities=np.array([1.0, 0.0, 0.0]), prediction=0) - graph.add_node(2, concept=Thing('V6543', 'diagnosis', 'relation'), - probabilities=np.array([0.0, 0.0, 1.0]), prediction=1) - - graph.add_edge(2, 0) - graph.add_edge(2, 1) - - graphs = [graph] - tx = MagicMock(TypeDBTransaction) - - tx.commit = MagicMock() - tx.query = MagicMock(QueryManager) - - write_predictions_to_typedb(graphs, tx) - - tx.query.assert_not_called() - - tx.commit.assert_called() - - -class TestObfuscateLabels(GraphTestCase): - - def test_labels_obfuscated_as_expected(self): - - graph = nx.MultiDiGraph() - - graph.add_node(0, type='person') - graph.add_node(1, type='disease') - graph.add_node(2, type='candidate-diagnosis') - - graph.add_edge(2, 0, type='candidate-patient') - graph.add_edge(2, 1, type='candidate-diagnosed-disease') - - obfuscate_labels(graph, {'candidate-diagnosis': 'diagnosis', - 'candidate-patient': 'patient', - 'candidate-diagnosed-disease': 'diagnosed-disease'}) - - expected_graph = nx.MultiDiGraph() - expected_graph.add_node(0, type='person') - expected_graph.add_node(1, type='disease') - expected_graph.add_node(2, type='diagnosis') - - expected_graph.add_edge(2, 0, type='patient') - expected_graph.add_edge(2, 1, type='diagnosed-disease') - - self.assertGraphsEqual(graph, expected_graph) - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/kgcn_tensorflow/learn/BUILD b/kglib/kgcn_tensorflow/learn/BUILD deleted file mode 100644 index e6c2e103..00000000 --- 
a/kglib/kgcn_tensorflow/learn/BUILD +++ /dev/null @@ -1,103 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_test( - name = "metrics_test", - srcs = [ - "metrics_test.py" - ], - deps = [ - "learn" - ] -) - -py_test( - name = "learn_IT", - srcs = [ - "learn_IT.py" - ], - deps = [ - "learn", - "//kglib/kgcn_tensorflow/models", - "//kglib/kgcn_tensorflow/plot" - ] -) - -py_library( - name = "learn", - srcs = [ - 'feed.py', - 'learn.py', - 'loss.py', - 'metrics.py', - ], - deps = [ - # Networkx deps - vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('decorator'), - - # Graph nets deps - vaticle_kglib_requirement('absl-py'), - vaticle_kglib_requirement('cloudpickle'), - vaticle_kglib_requirement('contextlib2'), - # vaticle_kglib_requirement('decorator'), - vaticle_kglib_requirement('dm-sonnet'), - vaticle_kglib_requirement('future'), - vaticle_kglib_requirement('graph-nets'), - # vaticle_kglib_requirement('networkx'), - 
vaticle_kglib_requirement('numpy'), - vaticle_kglib_requirement('semantic-version'), - vaticle_kglib_requirement('six'), - vaticle_kglib_requirement('tensorflow-probability'), - vaticle_kglib_requirement('wrapt'), - - # Tensorflow deps - # vaticle_kglib_requirement('absl-py'), - vaticle_kglib_requirement('astor'), - vaticle_kglib_requirement('gast'), - vaticle_kglib_requirement('google-pasta'), - vaticle_kglib_requirement('keras-applications'), - vaticle_kglib_requirement('keras-preprocessing'), - # vaticle_kglib_requirement('six'), - vaticle_kglib_requirement('protobuf'), - vaticle_kglib_requirement('tensorboard'), - vaticle_kglib_requirement('tensorflow'), - vaticle_kglib_requirement('tensorflow-estimator'), - vaticle_kglib_requirement('termcolor'), - # vaticle_kglib_requirement('wrapt'), - - # Scipy deps - vaticle_kglib_requirement('scipy') - - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/kgcn_tensorflow/learn/feed.py b/kglib/kgcn_tensorflow/learn/feed.py deleted file mode 100644 index 1ba8a250..00000000 --- a/kglib/kgcn_tensorflow/learn/feed.py +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# - -from graph_nets import utils_tf, utils_np - - -def create_placeholders(input_graphs, target_graphs): - """ - Creates placeholders for the model training and evaluation. - Returns: - input_ph: The input graph's placeholders, as a graph namedtuple. - target_ph: The target graph's placeholders, as a graph namedtuple. - """ - input_ph = utils_tf.placeholders_from_networkxs(input_graphs, name="input_placeholders_from_networksx") - target_ph = utils_tf.placeholders_from_networkxs(target_graphs, name="target_placeholders_from_networkxs") - return input_ph, target_ph - - -def create_feed_dict(input_ph, target_ph, inputs, targets): - """Creates the feed dict for the placeholders for the model training and evaluation. - - Args: - input_ph: The input graph's placeholders, as a graph namedtuple. - target_ph: The target graph's placeholders, as a graph namedtuple. - inputs: The input graphs - targets: The target graphs - - Returns: - feed_dict: The feed `dict` of input and target placeholders and data. - """ - input_graphs = utils_np.networkxs_to_graphs_tuple(inputs) - target_graphs = utils_np.networkxs_to_graphs_tuple(targets) - feed_dict = {input_ph: input_graphs, target_ph: target_graphs} - return feed_dict - - -def make_all_runnable_in_session(*args): - """Lets an iterable of TF graphs be output from a session as NP graphs.""" - return [utils_tf.make_runnable_in_session(a) for a in args] \ No newline at end of file diff --git a/kglib/kgcn_tensorflow/learn/learn.py b/kglib/kgcn_tensorflow/learn/learn.py deleted file mode 100644 index 74de9eee..00000000 --- a/kglib/kgcn_tensorflow/learn/learn.py +++ /dev/null @@ -1,178 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import time - -import tensorflow as tf - -from kglib.kgcn_tensorflow.learn.feed import create_placeholders, create_feed_dict, make_all_runnable_in_session -from kglib.kgcn_tensorflow.learn.loss import loss_ops_preexisting_no_penalty -from kglib.kgcn_tensorflow.learn.metrics import existence_accuracy - - -class KGCNLearner: - """ - Responsible for running a KGCN model - """ - def __init__(self, model, num_processing_steps_tr=10, num_processing_steps_ge=10): - self._model = model - self._num_processing_steps_tr = num_processing_steps_tr - self._num_processing_steps_ge = num_processing_steps_ge - - def __call__(self, - tr_input_graphs, - tr_target_graphs, - ge_input_graphs, - ge_target_graphs, - num_training_iterations=1000, - learning_rate=1e-3, - log_every_epochs=20, - log_dir=None): - """ - Args: - tr_graphs: In-memory graphs of TypeDB concepts for training - ge_graphs: In-memory graphs of TypeDB concepts for generalisation - num_processing_steps_tr: Number of processing (message-passing) steps for training. - num_processing_steps_ge: Number of processing (message-passing) steps for generalization. - num_training_iterations: Number of training iterations - log_every_seconds: The time to wait between logging and printing the next set of results. 
- log_dir: Directory to store TensorFlow events files - - Returns: - - """ - - tf.set_random_seed(1) - - input_ph, target_ph = create_placeholders(tr_input_graphs, tr_target_graphs) - - # A list of outputs, one per processing step. - output_ops_tr = self._model(input_ph, self._num_processing_steps_tr) - output_ops_ge = self._model(input_ph, self._num_processing_steps_ge) - - # Training loss. - loss_ops_tr = loss_ops_preexisting_no_penalty(target_ph, output_ops_tr) - # Loss across processing steps. - loss_op_tr = sum(loss_ops_tr) / self._num_processing_steps_tr - - tf.summary.scalar('loss_op_tr', loss_op_tr) - # Test/generalization loss. - loss_ops_ge = loss_ops_preexisting_no_penalty(target_ph, output_ops_ge) - loss_op_ge = loss_ops_ge[-1] # Loss from final processing step. - tf.summary.scalar('loss_op_ge', loss_op_ge) - - # Optimizer - optimizer = tf.train.AdamOptimizer(learning_rate) - gradients, variables = zip(*optimizer.compute_gradients(loss_op_tr)) - - for grad, var in zip(gradients, variables): - try: - print(var.name) - tf.summary.histogram('gradients/' + var.name, grad) - except: - pass - - gradients, _ = tf.clip_by_global_norm(gradients, 5.0) - step_op = optimizer.apply_gradients(zip(gradients, variables)) - - input_ph, target_ph = make_all_runnable_in_session(input_ph, target_ph) - - sess = tf.Session() - merged_summaries = tf.summary.merge_all() - - train_writer = None - - if log_dir is not None: - train_writer = tf.summary.FileWriter(log_dir, sess.graph) - - sess.run(tf.global_variables_initializer()) - - logged_iterations = [] - losses_tr = [] - corrects_tr = [] - solveds_tr = [] - losses_ge = [] - corrects_ge = [] - solveds_ge = [] - - print("# (iteration number), T (elapsed seconds), " - "Ltr (training loss), Lge (test/generalization loss), " - "Ctr (training fraction nodes/edges labeled correctly), " - "Str (training fraction examples solved correctly), " - "Cge (test/generalization fraction nodes/edges labeled correctly), " - "Sge 
(test/generalization fraction examples solved correctly)") - - start_time = time.time() - for iteration in range(num_training_iterations): - feed_dict = create_feed_dict(input_ph, target_ph, tr_input_graphs, tr_target_graphs) - - if iteration % log_every_epochs == 0: - - train_values = sess.run( - { - "step": step_op, - "target": target_ph, - "loss": loss_op_tr, - "outputs": output_ops_tr, - "summary": merged_summaries - }, - feed_dict=feed_dict) - - if train_writer is not None: - train_writer.add_summary(train_values["summary"], iteration) - - feed_dict = create_feed_dict(input_ph, target_ph, ge_input_graphs, ge_target_graphs) - test_values = sess.run( - { - "target": target_ph, - "loss": loss_op_ge, - "outputs": output_ops_ge - }, - feed_dict=feed_dict) - correct_tr, solved_tr = existence_accuracy( - train_values["target"], train_values["outputs"][-1], use_edges=False) - correct_ge, solved_ge = existence_accuracy( - test_values["target"], test_values["outputs"][-1], use_edges=False) - - elapsed = time.time() - start_time - losses_tr.append(train_values["loss"]) - corrects_tr.append(correct_tr) - solveds_tr.append(solved_tr) - losses_ge.append(test_values["loss"]) - corrects_ge.append(correct_ge) - solveds_ge.append(solved_ge) - logged_iterations.append(iteration) - print("# {:05d}, T {:.1f}, Ltr {:.4f}, Lge {:.4f}, Ctr {:.4f}, Str" - " {:.4f}, Cge {:.4f}, Sge {:.4f}".format( - iteration, elapsed, train_values["loss"], test_values["loss"], - correct_tr, solved_tr, correct_ge, solved_ge)) - else: - train_values = sess.run( - { - "step": step_op, - "target": target_ph, - "loss": loss_op_tr, - "outputs": output_ops_tr - }, - feed_dict=feed_dict) - - training_info = logged_iterations, losses_tr, losses_ge, corrects_tr, corrects_ge, solveds_tr, solveds_ge - return train_values, test_values, training_info diff --git a/kglib/kgcn_tensorflow/learn/learn_IT.py b/kglib/kgcn_tensorflow/learn/learn_IT.py deleted file mode 100644 index 29267c0b..00000000 --- 
a/kglib/kgcn_tensorflow/learn/learn_IT.py +++ /dev/null @@ -1,63 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest - -import networkx as nx -import numpy as np - -from kglib.kgcn_tensorflow.learn.learn import KGCNLearner -from kglib.kgcn_tensorflow.models.core import KGCN -from kglib.kgcn_tensorflow.models.embedding import ThingEmbedder, RoleEmbedder - - -class ITKGCNLearner(unittest.TestCase): - def test_learner_runs(self): - input_graph = nx.MultiDiGraph() - input_graph.add_node(0, features=np.array([0, 1, 2], dtype=np.float32)) - input_graph.add_edge(1, 0, features=np.array([0, 1, 2], dtype=np.float32)) - input_graph.add_node(1, features=np.array([0, 1, 2], dtype=np.float32)) - input_graph.add_edge(1, 2, features=np.array([0, 1, 2], dtype=np.float32)) - input_graph.add_node(2, features=np.array([0, 1, 2], dtype=np.float32)) - input_graph.graph['features'] = np.zeros(5, dtype=np.float32) - - target_graph = nx.MultiDiGraph() - target_graph.add_node(0, features=np.array([0, 1, 0], dtype=np.float32)) - target_graph.add_edge(1, 0, features=np.array([0, 0, 1], dtype=np.float32)) - target_graph.add_node(1, features=np.array([0, 0, 1], dtype=np.float32)) - 
target_graph.add_edge(1, 2, features=np.array([0, 0, 1], dtype=np.float32)) - target_graph.add_node(2, features=np.array([0, 1, 0], dtype=np.float32)) - target_graph.graph['features'] = np.zeros(5, dtype=np.float32) - - thing_embedder = ThingEmbedder(node_types=['a', 'b', 'c'], type_embedding_dim=5, - attr_embedding_dim=6, categorical_attributes={}, continuous_attributes={}) - - role_embedder = RoleEmbedder(num_edge_types=2, type_embedding_dim=5) - - kgcn = KGCN(thing_embedder, role_embedder, edge_output_size=3, node_output_size=3) - - learner = KGCNLearner(kgcn, num_processing_steps_tr=2, num_processing_steps_ge=2) - - learner([input_graph], [target_graph], [input_graph], [target_graph], num_training_iterations=50) - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/kgcn_tensorflow/learn/loss.py b/kglib/kgcn_tensorflow/learn/loss.py deleted file mode 100644 index 24c23ce4..00000000 --- a/kglib/kgcn_tensorflow/learn/loss.py +++ /dev/null @@ -1,66 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import numpy as np -import tensorflow as tf - - -def loss_ops_from_difference(target_op, output_ops): - """ - Loss operation which directly compares the target with the output over all nodes and edges - Args: - target_op: The target of the model - output_ops: A list of the outputs of the model, one for each message-passing step - - Returns: The loss for each message-passing step - - """ - loss_ops = [ - tf.losses.softmax_cross_entropy(target_op.nodes, output_op.nodes) - for output_op in output_ops - ] - return loss_ops - - -def loss_ops_preexisting_no_penalty(target_op, output_ops): - """ - Loss operation which doesn't penalise the output values for pre-existing nodes and edges, treating them as slack - variables - - Args: - target_op: The target of the model - output_ops: A list of the outputs of the model, one for each message-passing step - - Returns: The loss for each message-passing step - - """ - loss_ops = [] - for output_op in output_ops: - node_mask_op = tf.math.reduce_any( - tf.math.not_equal(target_op.nodes, tf.constant(np.array([1., 0., 0.]), dtype=tf.float32)), axis=1) - target_nodes = tf.boolean_mask(target_op.nodes, node_mask_op) - output_nodes = tf.boolean_mask(output_op.nodes, node_mask_op) - - loss_op = tf.losses.softmax_cross_entropy(target_nodes, output_nodes) - - loss_ops.append(loss_op) - - return loss_ops \ No newline at end of file diff --git a/kglib/kgcn_tensorflow/learn/metrics.py b/kglib/kgcn_tensorflow/learn/metrics.py deleted file mode 100644 index f9c9bf14..00000000 --- a/kglib/kgcn_tensorflow/learn/metrics.py +++ /dev/null @@ -1,104 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import numpy as np -from graph_nets import utils_np - -from scipy.special import softmax - - -def compute_accuracy(target, output, use_nodes=True, use_edges=True): - """Calculate model accuracy. - - Returns the number of elements correctly predicted to exist, and the number of completely correct graphs - (100% correct predictions). - - Args: - target: A `graphs.GraphsTuple` that contains the target graph. - output: A `graphs.GraphsTuple` that contains the output graph. - use_nodes: A `bool` indicator of whether to compute node accuracy or not. - use_edges: A `bool` indicator of whether to compute edge accuracy or not. - - Returns: - correct: A `float` fraction of correctly labeled nodes/edges. - solved: A `float` fraction of graphs that are completely correctly labeled. 
- - Raises: - ValueError: Nodes or edges (or both) must be used - """ - if not use_nodes and not use_edges: - raise ValueError("Nodes or edges (or both) must be used") - tdds = utils_np.graphs_tuple_to_data_dicts(target) - odds = utils_np.graphs_tuple_to_data_dicts(output) - cs = [] - ss = [] - for td, od in zip(tdds, odds): - xn = np.argmax(td["nodes"], axis=-1) - yn = np.argmax(od["nodes"], axis=-1) - xe = np.argmax(td["edges"], axis=-1) - ye = np.argmax(od["edges"], axis=-1) - c = [] - if use_nodes: - c.append(xn == yn) - if use_edges: - c.append(xe == ye) - c = np.concatenate(c, axis=0) - s = np.all(c) - cs.append(c) - ss.append(s) - correct = np.mean(np.concatenate(cs, axis=0)) - solved = np.mean(np.stack(ss)) - return correct, solved - - -def existence_accuracy(target, output, use_nodes=True, use_edges=True): - if not use_nodes and not use_edges: - raise ValueError("Nodes or edges (or both) must be used") - tdds = utils_np.graphs_tuple_to_data_dicts(target) - odds = utils_np.graphs_tuple_to_data_dicts(output) - cs = [] - ss = [] - for td, od in zip(tdds, odds): - - nodes_to_predict = td["nodes"][:, 0] == 0 - xn = np.argmax(td["nodes"][:, 1:], axis=-1) - xn = xn[nodes_to_predict] - yn = np.argmax(softmax(od["nodes"][:, 1:], axis=1), axis=-1) - yn = yn[nodes_to_predict] - - edges_to_predict = td["edges"][:, 0] == 0 - xe = np.argmax(td["edges"][:, 1:], axis=-1) - xe = xe[edges_to_predict] - ye = np.argmax(softmax(od["edges"][:, 1:], axis=1), axis=-1) - ye = ye[edges_to_predict] - - c = [] - if use_nodes: - c.append(xn == yn) - if use_edges: - c.append(xe == ye) - c = np.concatenate(c, axis=0) - s = np.all(c) - cs.append(c) - ss.append(s) - correct = np.mean(np.concatenate(cs, axis=0)) - solved = np.mean(np.stack(ss)) - return correct, solved diff --git a/kglib/kgcn_tensorflow/learn/metrics_test.py b/kglib/kgcn_tensorflow/learn/metrics_test.py deleted file mode 100644 index 0bcc9efc..00000000 --- a/kglib/kgcn_tensorflow/learn/metrics_test.py +++ /dev/null @@ 
-1,111 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest - -import numpy as np -from graph_nets.graphs import GraphsTuple - -from kglib.kgcn_tensorflow.learn.metrics import compute_accuracy, existence_accuracy - - -class TestComputeAccuracy(unittest.TestCase): - - def test_compute_accuracy_is_as_expected(self): - - t_nodes = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.float32) - o_nodes = np.array([[0, 1], [1, 0], [1, 0]], dtype=np.float32) - t_edges = np.array([[0, 1], [1, 0]], dtype=np.float32) - o_edges = np.array([[1, 0], [1, 0]], dtype=np.float32) - - globals = None - senders = np.array([0, 1]) - receivers = np.array([1, 2]) - n_node = np.array([3]) - n_edge = np.array([2]) - - target = GraphsTuple(nodes=t_nodes, - edges=t_edges, - globals=globals, - receivers=receivers, - senders=senders, - n_node=n_node, - n_edge=n_edge) - - output = GraphsTuple(nodes=o_nodes, - edges=o_edges, - globals=globals, - receivers=receivers, - senders=senders, - n_node=n_node, - n_edge=n_edge) - - correct, solved = compute_accuracy(target, output) - - expected_correct = 2 / 5 - expected_solved = 0 - - self.assertEqual(expected_correct, correct) - self.assertEqual(expected_solved, 
solved) - - -class TestExistenceAccuracy(unittest.TestCase): - - def test_compute_accuracy_is_as_expected(self): - - t_nodes = np.array([[1, 0, 0], [0, 0, 1], [0, 0, 1]], dtype=np.float32) - o_nodes = np.array([[0, 1, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32) - t_edges = np.array([[0, 1, 0], [1, 0, 0]], dtype=np.float32) - o_edges = np.array([[1, 0, 0], [1, 0, 0]], dtype=np.float32) - - globals = None - senders = np.array([0, 1]) - receivers = np.array([1, 2]) - n_node = np.array([3]) - n_edge = np.array([2]) - - target = GraphsTuple(nodes=t_nodes, - edges=t_edges, - globals=globals, - receivers=receivers, - senders=senders, - n_node=n_node, - n_edge=n_edge) - - output = GraphsTuple(nodes=o_nodes, - edges=o_edges, - globals=globals, - receivers=receivers, - senders=senders, - n_node=n_node, - n_edge=n_edge) - - correct, solved = existence_accuracy(target, output) - - expected_correct = 2/3 - expected_solved = 0.0 - - self.assertEqual(expected_correct, correct) - self.assertEqual(expected_solved, solved) - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/kgcn_tensorflow/models/BUILD b/kglib/kgcn_tensorflow/models/BUILD deleted file mode 100644 index 75007c76..00000000 --- a/kglib/kgcn_tensorflow/models/BUILD +++ /dev/null @@ -1,150 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_test( - name = "attribute_test", - srcs = [ - "attribute_test.py" - ], - deps = [ - "//kglib/utils/test", - "models" - ] -) - -py_test( - name = "attribute_IT", - srcs = [ - "attribute_IT.py" - ], - deps = [ - "models" - ] -) - -py_test( - name = "embedding_test", - srcs = [ - "embedding_test.py" - ], - deps = [ - "//kglib/utils/test", - "models" - ] -) - -py_test( - name = "embedding_IT", - srcs = [ - "embedding_IT.py" - ], - deps = [ - "models" - ] -) - -py_test( - name = "typewise_test", - srcs = [ - "typewise_test.py" - ], - deps = [ - "//kglib/utils/test", - "models" - ] -) - -py_test( - name = "typewise_IT", - srcs = [ - "typewise_IT.py" - ], - deps = [ - "models" - ] -) - -py_test( - name = "core_IT", - srcs = [ - "core_IT.py" - ], - deps = [ - "models" - ] -) - -py_library( - name = "models", - srcs = [ - 'attribute.py', - 'core.py', - 'embedding.py', - 'typewise.py', - ], - deps = [ - # Networkx deps - vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('decorator'), - - # Graph nets deps - vaticle_kglib_requirement('absl-py'), - vaticle_kglib_requirement('cloudpickle'), - vaticle_kglib_requirement('contextlib2'), - # vaticle_kglib_requirement('decorator'), - vaticle_kglib_requirement('dm-sonnet'), - vaticle_kglib_requirement('future'), - vaticle_kglib_requirement('graph-nets'), - # 
vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('numpy'), - vaticle_kglib_requirement('semantic-version'), - vaticle_kglib_requirement('six'), - vaticle_kglib_requirement('tensorflow-probability'), - vaticle_kglib_requirement('wrapt'), - - # Tensorflow deps - # vaticle_kglib_requirement('absl-py'), - vaticle_kglib_requirement('astor'), - vaticle_kglib_requirement('gast'), - vaticle_kglib_requirement('google-pasta'), - vaticle_kglib_requirement('keras-applications'), - vaticle_kglib_requirement('keras-preprocessing'), - # vaticle_kglib_requirement('six'), - vaticle_kglib_requirement('protobuf'), - vaticle_kglib_requirement('tensorboard'), - vaticle_kglib_requirement('tensorflow'), - vaticle_kglib_requirement('tensorflow-estimator'), - vaticle_kglib_requirement('termcolor'), - # vaticle_kglib_requirement('wrapt'), - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/kgcn_tensorflow/models/attribute.py b/kglib/kgcn_tensorflow/models/attribute.py deleted file mode 100644 index 689401e5..00000000 --- a/kglib/kgcn_tensorflow/models/attribute.py +++ /dev/null @@ -1,74 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import abc - -import sonnet as snt -import tensorflow as tf - - -class Attribute(snt.AbstractModule, abc.ABC): - """ - Abstract base class for Attribute value embedding models - """ - def __init__(self, attr_embedding_dim, name='AttributeEmbedder'): - super(Attribute, self).__init__(name=name) - self._attr_embedding_dim = attr_embedding_dim - - -class ContinuousAttribute(Attribute): - def __init__(self, attr_embedding_dim, name='ContinuousAttributeEmbedder'): - super(ContinuousAttribute, self).__init__(attr_embedding_dim, name=name) - - def _build(self, attribute_value): - tf.summary.histogram('cont_attribute_value_histogram', attribute_value) - embedding = snt.Sequential([ - snt.nets.MLP([self._attr_embedding_dim] * 3, activate_final=True, use_dropout=True), - snt.LayerNorm(), - ])(tf.cast(attribute_value, dtype=tf.float32)) - tf.summary.histogram('cont_embedding_histogram', embedding) - return embedding - - -class CategoricalAttribute(Attribute): - def __init__(self, num_categories, attr_embedding_dim, name='CategoricalAttributeEmbedder'): - super(CategoricalAttribute, self).__init__(attr_embedding_dim, name=name) - - self._num_categories = num_categories - - def _build(self, attribute_value): - int_attribute_value = tf.cast(attribute_value, dtype=tf.int32) - tf.summary.histogram('cat_attribute_value_histogram', int_attribute_value) - embedding = snt.Embed(self._num_categories, self._attr_embedding_dim)(int_attribute_value) - tf.summary.histogram('cat_embedding_histogram', embedding) - return tf.squeeze(embedding, axis=1) - - -class BlankAttribute(Attribute): - - def __init__(self, attr_embedding_dim, name='BlankAttributeEmbedder'): - super(BlankAttribute, self).__init__(attr_embedding_dim, name=name) - - def _build(self, attribute_value): - shape = tf.stack([tf.shape(attribute_value)[0], self._attr_embedding_dim]) - - encoded_features = tf.zeros(shape, 
dtype=tf.float32) - return encoded_features diff --git a/kglib/kgcn_tensorflow/models/attribute_IT.py b/kglib/kgcn_tensorflow/models/attribute_IT.py deleted file mode 100644 index 53c6dc52..00000000 --- a/kglib/kgcn_tensorflow/models/attribute_IT.py +++ /dev/null @@ -1,39 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest - -from kglib.kgcn_tensorflow.models.attribute import CategoricalAttribute -import tensorflow as tf -import numpy as np - - -class ITCategoricalAttribute(unittest.TestCase): - def test_output_tensorspec(self): - cat = CategoricalAttribute(2, 5) - inp = tf.zeros((3, 1), dtype=tf.float32) - output = cat(inp) - np.testing.assert_array_equal(tf.TensorShape([3, 5]), output.shape) - np.testing.assert_equal(output.dtype, tf.float32) - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/kgcn_tensorflow/models/attribute_test.py b/kglib/kgcn_tensorflow/models/attribute_test.py deleted file mode 100644 index 2c984173..00000000 --- a/kglib/kgcn_tensorflow/models/attribute_test.py +++ /dev/null @@ -1,67 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest - -from unittest.mock import Mock, patch - -from kglib.kgcn_tensorflow.models.attribute import CategoricalAttribute -import tensorflow as tf - -from kglib.utils.test.utils import get_call_args - - -class TestCategoricalAttribute(tf.test.TestCase): - - def setUp(self): - self._mock_embed_instance = Mock(return_value=tf.zeros((3, 1, 5), dtype=tf.float32)) - self._mock_embed_class = Mock(return_value=self._mock_embed_instance) - self._patcher = patch('kglib.kgcn_tensorflow.models.attribute.snt.Embed', new=self._mock_embed_class, - spec=True) - self._patcher.start() - - def tearDown(self): - self._patcher.stop() - - def test_embed_invoked_correctly(self): - attr_embedding_dim = 5 - cat = CategoricalAttribute(2, 5) - cat(tf.zeros((3, 1), tf.float32)) - self._mock_embed_class.assert_called_once_with(2, attr_embedding_dim) - - def test_output_is_as_expected(self): - inp = tf.zeros((3, 1), dtype=tf.float32) - expected_output = tf.zeros((3, 5), dtype=tf.float32) - cat = CategoricalAttribute(2, 5) - output = cat(inp) - self.assertAllClose(expected_output, output) - self.assertEqual(expected_output.dtype, output.dtype) - - def test_embed_instance_called_correctly(self): - inp = tf.zeros((3, 1), dtype=tf.float32) - cat = CategoricalAttribute(2, 5) - cat(inp) - 
self.assertAllClose(get_call_args(self._mock_embed_instance), [[tf.zeros((3, 1), dtype=tf.int32)]]) - self.assertEqual(get_call_args(self._mock_embed_instance)[0][0].dtype, tf.int32) - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/kgcn_tensorflow/models/core.py b/kglib/kgcn_tensorflow/models/core.py deleted file mode 100644 index a1c309ec..00000000 --- a/kglib/kgcn_tensorflow/models/core.py +++ /dev/null @@ -1,132 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import numpy as np -import sonnet as snt -from graph_nets import modules -from graph_nets import utils_tf -from graph_nets.modules import GraphIndependent - - -def softmax(x): - return np.exp(x) / np.sum(np.exp(x)) - - -def make_mlp_model(latent_size=16, num_layers=2): - """Instantiates a new MLP, followed by LayerNorm. - - The parameters of each new MLP are not shared with others generated by - this function. - - Returns: - A Sonnet module which contains the MLP and LayerNorm. 
- """ - return snt.Sequential([ - snt.nets.MLP([latent_size] * num_layers, activate_final=True), - snt.LayerNorm() - ]) - - -class MLPGraphIndependent(snt.AbstractModule): - """GraphIndependent with MLP edge, node, and global models.""" - - def __init__(self, name="MLPGraphIndependent"): - super(MLPGraphIndependent, self).__init__(name=name) - with self._enter_variable_scope(): - self._network = GraphIndependent( - edge_model_fn=make_mlp_model, - node_model_fn=make_mlp_model) - - def _build(self, inputs): - return self._network(inputs) - - -class MLPInteractionNetwork(snt.AbstractModule): - """InteractionNetwork with MLP edge, node, and global models.""" - - def __init__(self, name="MLPInteractionNetwork"): - super(MLPInteractionNetwork, self).__init__(name=name) - with self._enter_variable_scope(): - self._network = modules.InteractionNetwork(make_mlp_model, make_mlp_model) - - def _build(self, inputs): - return self._network(inputs) - - -class KGCN(snt.AbstractModule): - """ - A KGCN Neural Network with Message Passing. Implemented as a Sonnet Module. - """ - - def __init__(self, - thing_embedder, - role_embedder, - edge_output_size=3, - node_output_size=3, - latent_size=16, - num_layers=2, - name="KGCN"): - super(KGCN, self).__init__(name=name) - - self._thing_embedder = thing_embedder - self._role_embedder = role_embedder - - self._latent_size = latent_size - self._num_layers = num_layers - - # Transforms the outputs into the appropriate shapes. 
- if edge_output_size is None: - edge_fn = None - else: - edge_fn = lambda: snt.Linear(edge_output_size, name="edge_output") - if node_output_size is None: - node_fn = None - else: - node_fn = lambda: snt.Linear(node_output_size, name="node_output") - with self._enter_variable_scope(): - self._encoder = self._kg_encoder() - self._core = MLPInteractionNetwork() - self._decoder = MLPGraphIndependent() - self._output_transform = modules.GraphIndependent(edge_fn, node_fn) - - def _edge_model(self): - return snt.Sequential([self._role_embedder, - snt.nets.MLP([self._latent_size] * self._num_layers, activate_final=True), - snt.LayerNorm()]) - - def _node_model(self): - return snt.Sequential([self._thing_embedder, - snt.nets.MLP([self._latent_size] * self._num_layers, activate_final=True), - snt.LayerNorm()]) - - def _kg_encoder(self): - return GraphIndependent(self._edge_model, self._node_model, name='kg_encoder') - - def _build(self, input_op, num_processing_steps): - latent = self._encoder(input_op) - latent0 = latent - output_ops = [] - for _ in range(num_processing_steps): - core_input = utils_tf.concat([latent0, latent], axis=1) - latent = self._core(core_input) - decoded_op = self._decoder(latent) - output_ops.append(self._output_transform(decoded_op)) - return output_ops diff --git a/kglib/kgcn_tensorflow/models/core_IT.py b/kglib/kgcn_tensorflow/models/core_IT.py deleted file mode 100644 index ff11639c..00000000 --- a/kglib/kgcn_tensorflow/models/core_IT.py +++ /dev/null @@ -1,58 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest - -import numpy as np -import tensorflow as tf -from graph_nets.graphs import GraphsTuple - -from kglib.kgcn_tensorflow.models.core import KGCN -from kglib.kgcn_tensorflow.models.embedding import ThingEmbedder, RoleEmbedder - - -class ITKGCN(unittest.TestCase): - - def test_kgcn_runs(self): - tf.enable_eager_execution() - - graph = GraphsTuple(nodes=tf.convert_to_tensor(np.array([[1, 2, 0], [1, 0, 0], [1, 1, 0]], dtype=np.float32)), - edges=tf.convert_to_tensor(np.array([[1, 0, 0], [1, 0, 0]], dtype=np.float32)), - globals=tf.convert_to_tensor(np.array([[0, 0, 0, 0, 0]], dtype=np.float32)), - receivers=tf.convert_to_tensor(np.array([1, 2], dtype=np.int32)), - senders=tf.convert_to_tensor(np.array([0, 1], dtype=np.int32)), - n_node=tf.convert_to_tensor(np.array([3], dtype=np.int32)), - n_edge=tf.convert_to_tensor(np.array([2], dtype=np.int32))) - - thing_embedder = ThingEmbedder(node_types=['a', 'b', 'c'], type_embedding_dim=5, attr_embedding_dim=6, - categorical_attributes={'a': ['a1', 'a2', 'a3'], 'b': ['b1', 'b2', 'b3']}, - continuous_attributes={'c': (0, 1)}) - - role_embedder = RoleEmbedder(num_edge_types=2, type_embedding_dim=5) - - kgcn = KGCN(thing_embedder, role_embedder, edge_output_size=3, node_output_size=3) - - kgcn(graph, 2) - - -if __name__ == "__main__": - tf.enable_eager_execution() - unittest.main() diff --git a/kglib/kgcn_tensorflow/models/embedding.py b/kglib/kgcn_tensorflow/models/embedding.py deleted file mode 100644 index 693ebe08..00000000 --- a/kglib/kgcn_tensorflow/models/embedding.py +++ /dev/null @@ -1,138 
+0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import tensorflow as tf -import sonnet as snt - -from kglib.kgcn_tensorflow.models.attribute import CategoricalAttribute, ContinuousAttribute, BlankAttribute -from kglib.kgcn_tensorflow.models.typewise import TypewiseEncoder - - -class ThingEmbedder(snt.AbstractModule): - def __init__(self, node_types, type_embedding_dim, attr_embedding_dim, categorical_attributes, - continuous_attributes, name="ThingEmbedder"): - super(ThingEmbedder, self).__init__(name=name) - - self._node_types = node_types - self._type_embedding_dim = type_embedding_dim - self._attr_embedding_dim = attr_embedding_dim - - # Create embedders for the different attribute types - self._attr_embedders = dict() - - if categorical_attributes is not None: - self._attr_embedders.update( - construct_categorical_embedders(node_types, attr_embedding_dim, categorical_attributes)) - - if continuous_attributes is not None: - self._attr_embedders.update( - construct_continuous_embedders(node_types, attr_embedding_dim, continuous_attributes)) - - self._attr_embedders.update( - construct_non_attribute_embedders(node_types, attr_embedding_dim, categorical_attributes, - 
continuous_attributes)) - - def _build(self, features): - return tf.concat([embed_type(features, len(self._node_types), self._type_embedding_dim), - embed_attribute(features, self._attr_embedders, self._attr_embedding_dim)], axis=1) - - -class RoleEmbedder(snt.AbstractModule): - def __init__(self, num_edge_types, type_embedding_dim, name="RoleEmbedder"): - super(RoleEmbedder, self).__init__(name=name) - self._num_edge_types = num_edge_types - self._type_embedding_dim = type_embedding_dim - - def _build(self, features): - return embed_type(features, self._num_edge_types, self._type_embedding_dim) - - -def embed_type(features, num_types, type_embedding_dim): - preexistance_feat = tf.expand_dims(tf.cast(features[:, 0], dtype=tf.float32), axis=1) - type_embedder = snt.Embed(num_types, type_embedding_dim) - norm = snt.LayerNorm() - type_embedding = norm(type_embedder(tf.cast(features[:, 1], tf.int32))) - tf.summary.histogram('type_embedding_histogram', type_embedding) - return tf.concat([preexistance_feat, type_embedding], axis=1) - - -def embed_attribute(features, attr_encoders, attr_embedding_dim): - typewise_attribute_encoder = TypewiseEncoder(attr_encoders, attr_embedding_dim) - attr_embedding = typewise_attribute_encoder(features[:, 1:]) - tf.summary.histogram('attribute_embedding_histogram', attr_embedding) - return attr_embedding - - -def construct_categorical_embedders(node_types, attr_embedding_dim, categorical_attributes): - attr_embedders = dict() - - # Construct attribute embedders - for attribute_type, categories in categorical_attributes.items(): - - attr_typ_index = node_types.index(attribute_type) - - def make_embedder(): - return CategoricalAttribute(len(categories), attr_embedding_dim, - name=attribute_type + '_cat_embedder') - - # Record the embedder, and the index of the type that it should encode - attr_embedders[make_embedder] = [attr_typ_index] - - return attr_embedders - - -def construct_continuous_embedders(node_types, attr_embedding_dim, 
continuous_attributes): - attr_embedders = dict() - - # Construct attribute embedders - for attribute_type in continuous_attributes.keys(): - - attr_typ_index = node_types.index(attribute_type) - - def make_embedder(): - return ContinuousAttribute(attr_embedding_dim, name=attribute_type + '_cat_embedder') - - # Record the embedder, and the index of the type that it should encode - attr_embedders[make_embedder] = [attr_typ_index] - - return attr_embedders - - -def construct_non_attribute_embedders(node_types, attr_embedding_dim, categorical_attributes, continuous_attributes): - - attribute_names = list(categorical_attributes.keys()) - attribute_names.extend(list(continuous_attributes.keys())) - - non_attribute_nodes = [] - for i, type in enumerate(node_types): - if type not in attribute_names: - non_attribute_nodes.append(i) - - # All entities and relations (non-attributes) also need an embedder with matching output dimension, which does - # nothing. This is provided as a list of their indices - def make_blank_embedder(): - return BlankAttribute(attr_embedding_dim) - - attr_embedders = dict() - - if len(non_attribute_nodes) > 0: - attr_embedders[make_blank_embedder] = non_attribute_nodes - return attr_embedders diff --git a/kglib/kgcn_tensorflow/models/embedding_IT.py b/kglib/kgcn_tensorflow/models/embedding_IT.py deleted file mode 100644 index b16bc5ac..00000000 --- a/kglib/kgcn_tensorflow/models/embedding_IT.py +++ /dev/null @@ -1,80 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest - -from kglib.kgcn_tensorflow.models.embedding import construct_categorical_embedders, construct_continuous_embedders, \ - construct_non_attribute_embedders - - -def construct_embedders(node_types, attr_embedding_dim, categorical_attributes, continuous_attributes): - attr_embedders = dict() - - if categorical_attributes is not None: - attr_embedders.update(construct_categorical_embedders(node_types, attr_embedding_dim, categorical_attributes)) - - if continuous_attributes is not None: - attr_embedders.update(construct_continuous_embedders(node_types, attr_embedding_dim, continuous_attributes)) - - attr_embedders.update(construct_non_attribute_embedders(node_types, attr_embedding_dim, categorical_attributes, - continuous_attributes)) - return attr_embedders - - -class TestConstructingEmbedders(unittest.TestCase): - - def test_all_types_encoded(self): - node_types = ['a', 'b', 'c'] - attr_embedding_dim = 5 - categorical_attributes = {'a': ['option1', 'option2']} - continuous_attributes = {'b': (0, 1)} - - attr_embedders = construct_embedders(node_types, attr_embedding_dim, categorical_attributes, - continuous_attributes) - all_types = [l for el in list(attr_embedders.values()) for l in el] - - expected_types = [0, 1, 2] - - self.assertListEqual(expected_types, all_types) - - def test_multiple_categorical_embedders(self): - node_types = ['a', 'b', 'c'] - attr_embedding_dim = 5 - categorical_attributes = {'a': ['option1', 'option2'], 'c': ['option3', 'option4']} - continuous_attributes = {'b': (0, 1)} - - attr_embedders = 
construct_embedders(node_types, attr_embedding_dim, categorical_attributes, - continuous_attributes) - - all_types = [l for el in list(attr_embedders.values()) for l in el] - all_types.sort() - - expected_types = [0, 1, 2] - print(attr_embedders) - - self.assertListEqual(expected_types, all_types) - - for types in attr_embedders.values(): - self.assertNotEqual(types, []) - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/kgcn_tensorflow/models/embedding_test.py b/kglib/kgcn_tensorflow/models/embedding_test.py deleted file mode 100644 index f7e88b7c..00000000 --- a/kglib/kgcn_tensorflow/models/embedding_test.py +++ /dev/null @@ -1,70 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import unittest - -import numpy as np -import tensorflow as tf -from unittest.mock import Mock -from unittest.mock import patch -from kglib.kgcn_tensorflow.models.embedding import embed_type, embed_attribute -from kglib.utils.test.utils import get_call_args - - -class TestTypeEmbedding(unittest.TestCase): - def setUp(self): - tf.enable_eager_execution() - - def test_embedding_output_shape_as_expected(self): - features = np.array([[1, 0, 0.7], [1, 2, 0.7], [0, 1, 0.5]], dtype=np.float32) - type_embedding_dim = 5 - output = embed_type(features, 3, type_embedding_dim) - - np.testing.assert_array_equal(np.array([3, 6]), output.shape) - - -class TestAttributeEmbedding(unittest.TestCase): - def setUp(self): - tf.enable_eager_execution() - - def test_embedding_is_typewise(self): - features = np.array([[1, 0, 0.7], [1, 2, 0.7], [0, 1, 0.5]]) - - mock_instance = Mock(return_value=tf.convert_to_tensor(np.array([[1, 0.7], [1, 0.7], [0, 0.5]]))) - mock = Mock(return_value=mock_instance) - patcher = patch('kglib.kgcn_tensorflow.models.embedding.TypewiseEncoder', spec=True, new=mock) - mock_class = patcher.start() - - attr_encoders = Mock() - attr_embedding_dim = Mock() - - embed_attribute(features, attr_encoders, attr_embedding_dim) # Function under test - - mock_class.assert_called_once_with(attr_encoders, attr_embedding_dim) - call_args = get_call_args(mock_instance) - - np.testing.assert_array_equal([[np.array([[0, 0.7], [2, 0.7], [1, 0.5]])]], call_args) - - patcher.stop() - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/kgcn_tensorflow/models/typewise.py b/kglib/kgcn_tensorflow/models/typewise.py deleted file mode 100644 index 641dd9c0..00000000 --- a/kglib/kgcn_tensorflow/models/typewise.py +++ /dev/null @@ -1,84 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import sonnet as snt -import tensorflow as tf - - -class TypewiseEncoder(snt.AbstractModule): - """ - Orchestrates encoding elements according to their types. Defers encoding of each feature to the appropriate encoder - for the type of that feature. Assumes that the type is given categorically as an integer value in the 0th position - of the provided features Tensor. - """ - def __init__(self, encoders_for_types, feature_length, name="typewise_encoder"): - """ - Args: - encoders_for_types: Dict - keys: encoders; values: a list of type categories the encoder should be used for - feature_length: The length of features to output for matrix initialisation - name: The name for this Module - """ - super(TypewiseEncoder, self).__init__(name=name) - - types_considered = [] - for a in encoders_for_types.values(): - types_considered.extend(a) - types_considered.sort() - - expected_types = list(range(max(types_considered) + 1)) - - if types_considered != expected_types: - raise ValueError( - f'Encoder categories are inconsistent. 
Expected {expected_types}, but got {types_considered}') - - self._feature_length = feature_length - self._encoders_for_types = encoders_for_types - - def _build(self, features): - - tf.summary.histogram('typewise_encoder_features_histogram', features) - - shape = tf.stack([tf.shape(features)[0], self._feature_length]) - - encoded_features = tf.zeros(shape, dtype=tf.float32) - - for encoder, types in self._encoders_for_types.items(): - - feat_types = tf.cast(features[:, 0], tf.int32) # The types for each feature, as integers - - # Expand dimensions ready for element-wise equality comparison - exp_types = tf.expand_dims(types, axis=0) - exp_feat_types = tf.expand_dims(feat_types, axis=1) - - elementwise_equality = tf.equal(exp_feat_types, exp_types) - - # Use this encoder when the feat_type matches any of the types - applicable_types_mask = tf.reduce_any(elementwise_equality, axis=1) - indices_to_encode = tf.where(applicable_types_mask) - - feats_to_encode = tf.squeeze(tf.gather(features[:, 1:], indices_to_encode), axis=1) - encoded_feats = encoder()(feats_to_encode) - - encoded_features += tf.scatter_nd(tf.cast(indices_to_encode, dtype=tf.int32), encoded_feats, shape) - - tf.summary.histogram('typewise_encoder_encoded_features_histogram', encoded_features) - - return encoded_features diff --git a/kglib/kgcn_tensorflow/models/typewise_IT.py b/kglib/kgcn_tensorflow/models/typewise_IT.py deleted file mode 100644 index a37247c6..00000000 --- a/kglib/kgcn_tensorflow/models/typewise_IT.py +++ /dev/null @@ -1,55 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest - -import numpy as np -import tensorflow as tf -from tensorflow.python.framework.ops import EagerTensor - -from kglib.kgcn_tensorflow.models.typewise import TypewiseEncoder - - -class ITTypewiseEncoder(unittest.TestCase): - - def setUp(self): - tf.enable_eager_execution() - - def test_with_tensors(self): - tf.reset_default_graph() - tf.set_random_seed(1) - - things = tf.convert_to_tensor(np.array([[0, 0], [1, 0], [2, 0.5673]], dtype=np.float32)) - - entity_relation = lambda x: x - continuous_attribute = lambda x: x - - encoders_for_types = {lambda: entity_relation: [0, 1], lambda: continuous_attribute: [2]} - - tm = TypewiseEncoder(encoders_for_types, 1) - encoded_things = tm(things) # The function under test - - # Check that tensorflow was actually used - self.assertEqual(EagerTensor, type(encoded_things)) - - -if __name__ == '__main__': - unittest.main() diff --git a/kglib/kgcn_tensorflow/models/typewise_test.py b/kglib/kgcn_tensorflow/models/typewise_test.py deleted file mode 100644 index ee973deb..00000000 --- a/kglib/kgcn_tensorflow/models/typewise_test.py +++ /dev/null @@ -1,82 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import unittest - -import numpy as np -import tensorflow as tf -from unittest.mock import Mock - -from kglib.utils.test.utils import get_call_args -from kglib.kgcn_tensorflow.models.typewise import TypewiseEncoder - - -class TestTypewiseEncoder(unittest.TestCase): - def setUp(self): - tf.enable_eager_execution() - - def test_types_encoded_by_expected_functions(self): - things = np.array([[0, 0], [1, 0], [2, 0.5673]], dtype=np.float32) - - mock_entity_relation_encoder = Mock(return_value=np.array([[0, 0, 0], [0, 0, 0]], dtype=np.float32)) - - mock_attribute_encoder = Mock(return_value=np.array([[0.9527, 0.2367, 0.7582]], dtype=np.float32)) - - encoders_for_types = {lambda: mock_entity_relation_encoder: [0, 1], lambda: mock_attribute_encoder: [2]} - - tm = TypewiseEncoder(encoders_for_types, 3) - encoding = tm(things) # The function under test - - np.testing.assert_array_equal([[np.array([[0], [0]], dtype=np.float32)]], - get_call_args(mock_entity_relation_encoder)) - - np.testing.assert_array_equal([[np.array([[0.5673]], dtype=np.float32)]], get_call_args(mock_attribute_encoder)) - - expected_encoding = np.array([[0, 0, 0], [0, 0, 0], [0.9527, 0.2367, 0.7582]], dtype=np.float32) - np.testing.assert_array_equal(expected_encoding, encoding.numpy()) - - def test_basic_encoding(self): - things = np.array([[0], [1], [2]], dtype=np.float32) - - mock_entity_relation_encoder = Mock(return_value=np.array([[0.1, 0, 0], [0.1, 0, 0], [0.1, 0, 0]], dtype=np.float32)) - - encoders_for_types = {lambda: mock_entity_relation_encoder: [0, 1, 2]} - - tm = 
TypewiseEncoder(encoders_for_types, 3) - encoding = tm(things) # The function under test - - expected_encoding = np.array([[0.1, 0, 0], [0.1, 0, 0], [0.1, 0, 0]], dtype=np.float32) - np.testing.assert_array_equal(expected_encoding, encoding.numpy()) - - def test_encoders_do_not_fulfil_classes(self): - mock_entity_relation_encoder = Mock() - - encoders_for_types = {lambda: mock_entity_relation_encoder: [0, 2]} - - with self.assertRaises(ValueError) as context: - TypewiseEncoder(encoders_for_types, 3) - - self.assertEqual('Encoder categories are inconsistent. Expected [0, 1, 2], but got [0, 2]', - str(context.exception)) - - -if __name__ == '__main__': - unittest.main() diff --git a/kglib/kgcn_tensorflow/pipeline/BUILD b/kglib/kgcn_tensorflow/pipeline/BUILD deleted file mode 100644 index 17fe83bd..00000000 --- a/kglib/kgcn_tensorflow/pipeline/BUILD +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_library( - name = "pipeline", - srcs = [ - 'pipeline.py', - ], - deps = [ - vaticle_kglib_requirement('graph_nets'), - vaticle_kglib_requirement('numpy'), - vaticle_kglib_requirement('dm-sonnet'), - vaticle_kglib_requirement('tensorflow'), - vaticle_kglib_requirement('tensorflow-probability'), - vaticle_kglib_requirement('semantic-version'), - vaticle_kglib_requirement('contextlib2'), - vaticle_kglib_requirement('wrapt'), - "//kglib/utils/graph", - "//kglib/kgcn_data_loader", - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/kgcn_tensorflow/pipeline/pipeline.py b/kglib/kgcn_tensorflow/pipeline/pipeline.py deleted file mode 100644 index e7261be0..00000000 --- a/kglib/kgcn_tensorflow/pipeline/pipeline.py +++ /dev/null @@ -1,112 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import networkx as nx -import numpy as np -from graph_nets.utils_np import graphs_tuple_to_networkxs - -from kglib.kgcn_tensorflow.learn.learn import KGCNLearner -from kglib.kgcn_tensorflow.models.core import softmax, KGCN -from kglib.kgcn_tensorflow.models.embedding import ThingEmbedder, RoleEmbedder -from kglib.kgcn_tensorflow.plot.plotting import plot_across_training, plot_predictions - -from kglib.kgcn_data_loader.encoding.standard_encode import encode_types, create_input_graph, create_target_graph, encode_values -from kglib.kgcn_data_loader.utils import apply_logits_to_graphs, duplicate_edges_in_reverse -from kglib.utils.graph.iterate import multidigraph_node_data_iterator, multidigraph_data_iterator, \ - multidigraph_edge_data_iterator - -def pipeline(graphs, - tr_ge_split, - node_types, - edge_types, - num_processing_steps_tr=10, - num_processing_steps_ge=10, - num_training_iterations=10000, - continuous_attributes=None, - categorical_attributes=None, - type_embedding_dim=5, - attr_embedding_dim=6, - edge_output_size=3, - node_output_size=3, - output_dir=None): - - ############################################################ - # Manipulate the graph data - ############################################################ - - # Encode attribute values - graphs = [encode_values(graph, categorical_attributes, continuous_attributes) for graph in graphs] - - indexed_graphs = [nx.convert_node_labels_to_integers(graph, label_attribute='concept') for graph in graphs] - graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs] - - graphs = [encode_types(graph, multidigraph_node_data_iterator, node_types) for graph in graphs] - graphs = [encode_types(graph, multidigraph_edge_data_iterator, edge_types) for graph in graphs] - - input_graphs = [create_input_graph(graph) for graph in graphs] - target_graphs = [create_target_graph(graph) for graph in graphs] - - tr_input_graphs = input_graphs[:tr_ge_split] - tr_target_graphs = target_graphs[:tr_ge_split] - 
ge_input_graphs = input_graphs[tr_ge_split:] - ge_target_graphs = target_graphs[tr_ge_split:] - - ############################################################ - # Build and run the KGCN - ############################################################ - - thing_embedder = ThingEmbedder(node_types, type_embedding_dim, attr_embedding_dim, categorical_attributes, - continuous_attributes) - - role_embedder = RoleEmbedder(len(edge_types), type_embedding_dim) - - kgcn = KGCN(thing_embedder, - role_embedder, - edge_output_size=edge_output_size, - node_output_size=node_output_size) - - learner = KGCNLearner(kgcn, - num_processing_steps_tr=num_processing_steps_tr, - num_processing_steps_ge=num_processing_steps_ge) - - train_values, test_values, tr_info = learner(tr_input_graphs, - tr_target_graphs, - ge_input_graphs, - ge_target_graphs, - num_training_iterations=num_training_iterations, - log_dir=output_dir) - - plot_across_training(*tr_info, output_file=f'{output_dir}learning.png') - plot_predictions(graphs[tr_ge_split:], test_values, num_processing_steps_ge, output_file=f'{output_dir}graph.png') - - logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1]) - - indexed_ge_graphs = indexed_graphs[tr_ge_split:] - ge_graphs = [apply_logits_to_graphs(graph, logit_graph) for graph, logit_graph in - zip(indexed_ge_graphs, logit_graphs)] - - for ge_graph in ge_graphs: - for data in multidigraph_data_iterator(ge_graph): - data['probabilities'] = softmax(data['logits']) - data['prediction'] = int(np.argmax(data['probabilities'])) - - _, _, _, _, _, solveds_tr, solveds_ge = tr_info - return ge_graphs, solveds_tr, solveds_ge diff --git a/kglib/kgcn_tensorflow/plot/BUILD b/kglib/kgcn_tensorflow/plot/BUILD deleted file mode 100644 index 0c1c24e9..00000000 --- a/kglib/kgcn_tensorflow/plot/BUILD +++ /dev/null @@ -1,93 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_test( - name = "plotting_test", - srcs = [ - "plotting_test.py" - ], - deps = [ - "plot" - ] -) - -py_library( - name = "plot", - srcs = [ - 'draw.py', - 'plotting.py', - ], - deps = [ - # Matplotlib deps - vaticle_kglib_requirement('cycler'), - vaticle_kglib_requirement('kiwisolver'), - vaticle_kglib_requirement('matplotlib'), - vaticle_kglib_requirement('pyparsing'), - vaticle_kglib_requirement('python-dateutil'), - - # Networkx deps - vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('decorator'), - - # Graph nets deps - vaticle_kglib_requirement('absl-py'), - vaticle_kglib_requirement('cloudpickle'), - vaticle_kglib_requirement('contextlib2'), - # vaticle_kglib_requirement('decorator'), - vaticle_kglib_requirement('dm-sonnet'), - vaticle_kglib_requirement('future'), - vaticle_kglib_requirement('graph-nets'), - # vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('numpy'), - vaticle_kglib_requirement('semantic-version'), - vaticle_kglib_requirement('six'), - vaticle_kglib_requirement('tensorflow-probability'), - 
vaticle_kglib_requirement('wrapt'), - - # Tensorflow deps - # vaticle_kglib_requirement('absl-py'), - vaticle_kglib_requirement('astor'), - vaticle_kglib_requirement('gast'), - vaticle_kglib_requirement('google-pasta'), - vaticle_kglib_requirement('keras-applications'), - vaticle_kglib_requirement('keras-preprocessing'), - # vaticle_kglib_requirement('six'), - vaticle_kglib_requirement('protobuf'), - vaticle_kglib_requirement('tensorboard'), - vaticle_kglib_requirement('tensorflow'), - vaticle_kglib_requirement('tensorflow-estimator'), - vaticle_kglib_requirement('termcolor'), - # vaticle_kglib_requirement('wrapt'), - - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/kgcn_tensorflow/plot/draw.py b/kglib/kgcn_tensorflow/plot/draw.py deleted file mode 100644 index 472ae918..00000000 --- a/kglib/kgcn_tensorflow/plot/draw.py +++ /dev/null @@ -1,290 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import networkx.utils - - -def draw_networkx_labels(G, pos, - labels=None, - font_size=12, - font_color=None, - font_family='sans-serif', - font_weight='normal', - alpha=None, - bbox=None, - ax=None, - **kwds): - """Draw node labels on the graph G. - - Parameters - ---------- - G : graph - A networkx graph - - pos : dictionary - A dictionary with nodes as keys and positions as values. - Positions should be sequences of length 2. - - labels : dictionary, optional (default=None) - Node labels in a dictionary keyed by node of text labels - Node-keys in labels should appear as keys in `pos`. - If needed use: `{n:lab for n,lab in labels.items() if n in pos}` - - font_size : int - Font size for text labels (default=12) - - font_color : string - Font color string (default='k' black) - - font_family : string - Font family (default='sans-serif') - - font_weight : string - Font weight (default='normal') - - alpha : float - The text transparency (default=1.0) - - ax : Matplotlib Axes object, optional - Draw the graph in the specified Matplotlib axes. 
- - Returns - ------- - dict - `dict` of labels keyed on the nodes - - Examples - -------- - >>> G = nx.dodecahedral_graph() - >>> labels = nx.draw_networkx_labels(G, pos=nx.spring_layout(G)) - - Also see the NetworkX drawing examples at - https://networkx.github.io/documentation/latest/auto_examples/index.html - - See Also - -------- - draw() - draw_networkx() - draw_networkx_nodes() - draw_networkx_edges() - draw_networkx_edge_labels() - """ - try: - import matplotlib.pyplot as plt - import matplotlib.cbook as cb - except ImportError: - raise ImportError("Matplotlib required for draw()") - except RuntimeError: - print("Matplotlib unable to open display") - raise - - if ax is None: - ax = plt.gca() - - if labels is None: - labels = dict((n, n) for n in G.nodes()) - - # set optional alignment - horizontalalignment = kwds.get('horizontalalignment', 'center') - verticalalignment = kwds.get('verticalalignment', 'center') - - text_items = {} # there is no text collection so we'll fake one - for n, label in labels.items(): - (x, y) = pos[n] - if not networkx.utils.is_string_like(label): - label = str(label) # this makes "1" and 1 labeled the same - t = ax.text(x, y, - label, - size=font_size, - color=font_color[n], - family=font_family, - weight=font_weight, - alpha=alpha[n], - horizontalalignment=horizontalalignment, - verticalalignment=verticalalignment, - transform=ax.transData, - bbox=bbox, - clip_on=True, - ) - text_items[n] = t - - plt.tick_params( - axis='both', - which='both', - bottom=False, - left=False, - labelbottom=False, - labelleft=False) - - return text_items - - -def draw_networkx_edge_labels(G, pos, - edge_labels=None, - label_pos=0.5, - font_size=10, - font_color=None, - font_family='sans-serif', - font_weight='normal', - alpha=None, - bbox=None, - ax=None, - rotate=True, - **kwds): - """Draw edge labels. - - Parameters - ---------- - G : graph - A networkx graph - - pos : dictionary - A dictionary with nodes as keys and positions as values. 
- Positions should be sequences of length 2. - - ax : Matplotlib Axes object, optional - Draw the graph in the specified Matplotlib axes. - - alpha : float - The text transparency (default=1.0) - - edge_labels : dictionary - Edge labels in a dictionary keyed by edge two-tuple of text - labels (default=None). Only labels for the keys in the dictionary - are drawn. - - label_pos : float - Position of edge label along edge (0=head, 0.5=center, 1=tail) - - font_size : int - Font size for text labels (default=12) - - font_color : string - Font color string (default='k' black) - - font_weight : string - Font weight (default='normal') - - font_family : string - Font family (default='sans-serif') - - bbox : Matplotlib bbox - Specify text box shape and colors. - - clip_on : bool - Turn on clipping at axis boundaries (default=True) - - Returns - ------- - dict - `dict` of labels keyed on the edges - - Examples - -------- - >>> G = nx.dodecahedral_graph() - >>> edge_labels = nx.draw_networkx_edge_labels(G, pos=nx.spring_layout(G)) - - Also see the NetworkX drawing examples at - https://networkx.github.io/documentation/latest/auto_examples/index.html - - See Also - -------- - draw() - draw_networkx() - draw_networkx_nodes() - draw_networkx_edges() - draw_networkx_labels() - """ - try: - import matplotlib.pyplot as plt - import numpy as np - except ImportError: - raise ImportError("Matplotlib required for draw()") - except RuntimeError: - print("Matplotlib unable to open display") - raise - - if ax is None: - ax = plt.gca() - if edge_labels is None: - labels = {(u, v): d for u, v, d in G.edges(data=True)} - else: - labels = edge_labels - text_items = {} - for (n1, n2), label in labels.items(): - (x1, y1) = pos[n1] - (x2, y2) = pos[n2] - (x, y) = (x1 * label_pos + x2 * (1.0 - label_pos), - y1 * label_pos + y2 * (1.0 - label_pos)) - - if rotate: - # in degrees - angle = np.arctan2(y2 - y1, x2 - x1) / (2.0 * np.pi) * 360 - # make label orientation "right-side-up" - if angle > 90: 
- angle -= 180 - if angle < - 90: - angle += 180 - # transform data coordinate angle to screen coordinate angle - xy = np.array((x, y)) - trans_angle = ax.transData.transform_angles(np.array((angle,)), - xy.reshape((1, 2)))[0] - else: - trans_angle = 0.0 - # use default box of white with white border - if bbox is None: - bbox = dict(boxstyle='round', - ec=(1.0, 1.0, 1.0), - fc=(1.0, 1.0, 1.0), - ) - if not networkx.utils.is_string_like(label): - label = str(label) # this makes "1" and 1 labeled the same - - # set optional alignment - horizontalalignment = kwds.get('horizontalalignment', 'center') - verticalalignment = kwds.get('verticalalignment', 'center') - - t = ax.text(x, y, - label, - size=font_size, - color=font_color[(n1, n2)], - family=font_family, - weight=font_weight, - alpha=alpha[(n1, n2)], - horizontalalignment=horizontalalignment, - verticalalignment=verticalalignment, - rotation=trans_angle, - transform=ax.transData, - bbox=bbox, - zorder=1, - clip_on=True, - ) - text_items[(n1, n2)] = t - - plt.tick_params( - axis='both', - which='both', - bottom=False, - left=False, - labelbottom=False, - labelleft=False) - - return text_items diff --git a/kglib/kgcn_tensorflow/plot/plotting.py b/kglib/kgcn_tensorflow/plot/plotting.py deleted file mode 100644 index 0a233f3a..00000000 --- a/kglib/kgcn_tensorflow/plot/plotting.py +++ /dev/null @@ -1,289 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import math - -import graph_nets.utils_np as utils_np -import matplotlib.pyplot as plt -import networkx as nx -import numpy as np - -import kglib.kgcn_tensorflow.plot.draw as custom_nx - - -def plot_across_training(logged_iterations, losses_tr, losses_ge, corrects_tr, corrects_ge, solveds_tr, solveds_ge, - output_file='./learning.png'): - # Plot results curves. - fig = plt.figure(1, figsize=(18, 3)) - fig.clf() - x = np.array(logged_iterations) - # Loss. - y_tr = losses_tr - y_ge = losses_ge - ax = fig.add_subplot(1, 3, 1) - ax.plot(x, y_tr, "k", label="Training") - ax.plot(x, y_ge, "k--", label="Test/generalization") - ax.set_title("Loss across training") - ax.set_xlabel("Training iteration") - ax.set_ylabel("Loss (binary cross-entropy)") - ax.legend() - # Correct. - y_tr = corrects_tr - y_ge = corrects_ge - ax = fig.add_subplot(1, 3, 2) - ax.plot(x, y_tr, "k", label="Training") - ax.plot(x, y_ge, "k--", label="Test/generalization") - ax.set_title("Fraction correct across training") - ax.set_xlabel("Training iteration") - ax.set_ylabel("Fraction nodes/edges correct") - # Solved. 
- y_tr = solveds_tr - y_ge = solveds_ge - ax = fig.add_subplot(1, 3, 3) - ax.plot(x, y_tr, "k", label="Training") - ax.plot(x, y_ge, "k--", label="Test/generalization") - ax.set_title("Fraction solved across training") - ax.set_xlabel("Training iteration") - ax.set_ylabel("Fraction examples solved") - - plt.savefig(output_file, bbox_inches='tight') - - -def plot_predictions(raw_graphs, test_values, num_processing_steps_ge, solution_weights=(-0.5, 0.5, 0.5), - output_file='./graph.png'): - - # # Plot graphs and results after each processing step. - # The white node is the start, and the black is the end. Other nodes are colored - # from red to purple to blue, where red means the model is confident the node is - # off the shortest path, blue means the model is confident the node is on the - # shortest path, and purplish colors mean the model isn't sure. - - max_graphs_to_plot = 10 - num_steps_to_plot = 3 - node_size = 120 - - num_graphs = len(raw_graphs) - targets = utils_np.graphs_tuple_to_data_dicts(test_values["target"]) - step_indices = np.floor( - np.linspace(0, num_processing_steps_ge - 1, - num_steps_to_plot)).astype(int).tolist() - outputs = list( - zip(*(utils_np.graphs_tuple_to_data_dicts(test_values["outputs"][i]) - for i in step_indices))) - h = min(num_graphs, max_graphs_to_plot) - w = num_steps_to_plot + 2 - fig = plt.figure(101, figsize=(18, h * 3)) - fig.clf() - for j, (graph, target, output) in enumerate(zip(raw_graphs, targets, outputs)): - if j >= h: - break - for s, r, d in graph.edges(data=True): - d['weight'] = solution_weights[d['solution']] # Looks good with high k - pos = nx.spring_layout(graph, k=3 / math.sqrt(graph.number_of_nodes()), seed=1, weight='weight', iterations=50) - # pos = nx.circular_layout(graph, scale=2) - ground_truth_node_prob = target["nodes"][:, -1] - ground_truth_edge_prob = target["edges"][:, -1] - - non_preexist_node_mask = mask_preexists(target["nodes"]) - non_preexist_edge_mask = mask_preexists(target["edges"]) - - # 
Ground truth. - iax = j * (2 + num_steps_to_plot) + 1 - ax = draw_subplot(graph, fig, pos, node_size, h, w, iax, ground_truth_node_prob, ground_truth_edge_prob, True) - - # Format the ground truth plot axes - ax.set_axis_on() - ax.set_xticks([]) - ax.set_yticks([]) - ax.spines['bottom'].set_color('blue') - ax.spines['top'].set_color('blue') - ax.spines['right'].set_color('blue') - ax.spines['left'].set_color('blue') - ax.grid(None) - ax.set_title("Ground truth") - - # Prediction. - for k, outp in enumerate(output): - iax = j * (2 + num_steps_to_plot) + 2 + k - node_prob = softmax_prob_last_dim(outp["nodes"]) * non_preexist_node_mask - edge_prob = softmax_prob_last_dim(outp["edges"]) * non_preexist_edge_mask - ax = draw_subplot(graph, fig, pos, node_size, h, w, iax, node_prob, edge_prob, False) - ax.set_title("Model-predicted\nStep {:02d} / {:02d}".format( - step_indices[k] + 1, step_indices[-1] + 1)) - - # Class Winners - # Displays whether the class represented by the last dimension was the winner - node_prob = last_dim_was_class_winner(output[-1]["nodes"]) * non_preexist_node_mask - edge_prob = last_dim_was_class_winner(output[-1]["edges"]) * non_preexist_edge_mask - - iax = j * (2 + num_steps_to_plot) + 2 + len(output) - ax = draw_subplot(graph, fig, pos, node_size, h, w, iax, node_prob, edge_prob, False) - - # Format the class winners plot axes - ax.set_axis_on() - ax.set_xticks([]) - ax.set_yticks([]) - ax.spines['bottom'].set_color('green') - ax.spines['top'].set_color('green') - ax.spines['right'].set_color('green') - ax.spines['left'].set_color('green') - ax.grid(None) - ax.set_title("Model-predicted winners") - - plt.savefig(output_file, bbox_inches='tight') - - -def mask_preexists(arr): - return (arr[:, 0] == 0) * 1 - - -def softmax_prob_last_dim(x): - e = np.exp(x) - return e[:, -1] / np.sum(e, axis=-1) - - -def last_dim_was_class_winner(x): - return (np.argmax(x, axis=-1) == 2) * 1 - - -def element_color(gt_plot, probability, element_props): - """ - 
Determine the color values to use for a node/edge and its label - gt plot: - blue for existing elements, green for those to infer, red candidates - - output plot: - blue for existing elements, green for those to infer, red for candidates, all with transparency - """ - - existing = 0 - candidate = 1 - to_infer = 2 - - solution = element_props.get('solution') - - color_config = { - to_infer: {'color': [0.0, 1.0, 0.0], 'gt_opacity': 1.0}, - candidate: {'color': [1.0, 0.0, 0.0], 'gt_opacity': 1.0}, - existing: {'color': [0.0, 0.0, 1.0], 'gt_opacity': 0.2} - } - - chosen_config = color_config[solution] - - if gt_plot: - opacity = chosen_config['gt_opacity'] - else: - opacity = probability - - label = np.array([0.0, 0.0, 0.0] + [opacity]) - color = np.array(chosen_config['color'] + [opacity]) - - return dict(element=color, label=label) - - -def draw_subplot(graph, fig, pos, node_size, h, w, iax, node_prob, edge_prob, gt_plot): - ax = fig.add_subplot(h, w, iax) - node_color = {} - node_label_color = {} - edge_color = {} - edge_label_color = {} - - for i, (n, props) in enumerate(graph.nodes(data=True)): - colors = element_color(gt_plot, node_prob[n], props) - - node_color[n] = colors['element'] - node_label_color[n] = colors['label'] - - for n, (sender, receiver, props) in enumerate(graph.edges(data=True)): - colors = element_color(gt_plot, edge_prob[n], props) - - edge_color[(sender, receiver)] = colors['element'] - edge_label_color[(sender, receiver)] = colors['label'] - - draw_graph(graph, pos, ax, node_size=node_size, node_color=node_color, node_label_color=node_label_color, - edge_color=edge_color, edge_label_color=edge_label_color) - return ax - - -def draw_graph(graph, - pos, - ax, - node_size=200, - node_color=(0.4, 0.8, 0.4), - node_label_color=None, - edge_color=(0.0, 0.0, 0.0), - edge_label_color=None, - node_linewidth=1.0, - edge_width=1.0, - font_size=6): - - def _draw(draw_function, zorder=None, **kwargs): - # draw_kwargs = self._make_draw_kwargs(**kwargs) - 
_base_draw_kwargs = dict(G=graph, pos=pos, ax=ax) - kwargs.update(_base_draw_kwargs) - collection = draw_function(**kwargs) - if collection is not None and zorder is not None: - try: - # This is for compatibility with older matplotlib. - collection.set_zorder(zorder) - except AttributeError: - # This is for compatibility with newer matplotlib. - collection[0].set_zorder(zorder) - return collection - - # Plot nodes. - c = [node_color[n] for n in graph.nodes()] - _draw(nx.draw_networkx_nodes, - node_size=node_size, - node_color=c, - linewidths=node_linewidth, - alpha=[node_color[n][-1] for n in graph.nodes()], - zorder=-10) - - # Plot edges. - e = [edge_color[(s, r)] for s, r, k in graph.edges] - _draw(nx.draw_networkx_edges, - edgelist=graph.edges, - width=edge_width, - zorder=-20, - edge_color=e - ) - - bbox_props = dict(boxstyle="square,pad=0.0", fc="none", ec="none", lw=1) - labels_dict = {node_id: graph.nodes[node_id]['type'] for node_id in graph.nodes} - edge_labels_dict = {(edge_id[0], edge_id[1]): graph.edges[edge_id]['type'] for edge_id in graph.edges} - - custom_nx.draw_networkx_labels(graph, - pos, - labels=labels_dict, - font_size=font_size, - font_color=node_label_color, - alpha=[node_label_color[n][-1] for n in graph.nodes()]) - - custom_nx.draw_networkx_edge_labels(graph, - pos, - edge_labels=edge_labels_dict, - font_size=font_size, - # font_color=np.array([0.0, 0.5, 0.0, 0.1]), - font_color=edge_label_color, - # alpha=0.2, - alpha={n: edge_label_color[n][-1] for n in graph.edges()}, - bbox=bbox_props) diff --git a/kglib/kgcn_tensorflow/plot/plotting_test.py b/kglib/kgcn_tensorflow/plot/plotting_test.py deleted file mode 100644 index c3b8860c..00000000 --- a/kglib/kgcn_tensorflow/plot/plotting_test.py +++ /dev/null @@ -1,84 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import datetime -import os -import unittest - -import networkx as nx -import numpy as np -from graph_nets.graphs import GraphsTuple - -from kglib.kgcn_tensorflow.plot.plotting import plot_predictions - - -class TestPlotPredictions(unittest.TestCase): - def test_plot_is_created(self): - num_processing_steps_ge = 6 - - graph = nx.MultiDiGraph(name=0) - - existing = dict(solution=0) - to_infer = dict(solution=2) - candidate = dict(solution=1) - - # people - graph.add_node(0, type='person', **existing) - graph.add_node(1, type='person', **candidate) - - # parentships - graph.add_node(2, type='parentship', **to_infer) - graph.add_edge(2, 0, type='parent', **to_infer) - graph.add_edge(2, 1, type='child', **candidate) - - graph_tuple_target = GraphsTuple(nodes=np.array([[1., 0., 0.], - [0., 1., 0.], - [0., 0., 1.]]), - edges=np.array([[0., 0., 1.], - [0., 1., 0.]]), - receivers=np.array([1, 2], dtype=np.int32), - senders=np.array([0, 1], dtype=np.int32), - globals=np.array([[0., 0., 0., 0., 0.]], dtype=np.float32), - n_node=np.array([3], dtype=np.int32), - n_edge=np.array([2], dtype=np.int32)) - - graph_tuple_output = GraphsTuple(nodes=np.array([[1., 0., 0.], - [1., 1., 0.], - [1., 0., 1.]]), - edges=np.array([[1., 0., 0.], - [1., 1., 0.]]), - receivers=np.array([1, 2], dtype=np.int32), - 
senders=np.array([0, 1], dtype=np.int32), - globals=np.array([[0., 0., 0., 0., 0.]], dtype=np.float32), - n_node=np.array([3], dtype=np.int32), - n_edge=np.array([2], dtype=np.int32)) - - test_values = {"target": graph_tuple_target, "outputs": [graph_tuple_output for _ in range(6)]} - - filename = f'./graph_{datetime.datetime.now()}.png' - - plot_predictions([graph], test_values, num_processing_steps_ge, output_file=filename) - - self.assertTrue(os.path.isfile(filename)) - - -if __name__ == "__main__": - unittest.main() diff --git a/kglib/tests/deployment/BUILD b/kglib/tests/deployment/BUILD deleted file mode 100644 index 1a4bf662..00000000 --- a/kglib/tests/deployment/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -checkstyle_test( - name = "checkstyle", - include = glob([ - "*", - "**/*" - ]), - license_type = "apache-header", -) diff --git a/kglib/utils/BUILD b/kglib/utils/BUILD deleted file mode 100644 index f0bc0193..00000000 --- a/kglib/utils/BUILD +++ /dev/null @@ -1,39 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -load("@rules_python//python:defs.bzl", "py_library") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_library( - name = "utils", - deps = [ - '//kglib/utils/typedb', - '//kglib/utils/graph', - '//kglib/utils/test' - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/__init__.py b/kglib/utils/__init__.py deleted file mode 100644 index 4e916487..00000000 --- a/kglib/utils/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - diff --git a/kglib/utils/graph/BUILD b/kglib/utils/graph/BUILD deleted file mode 100644 index 519134a9..00000000 --- a/kglib/utils/graph/BUILD +++ /dev/null @@ -1,39 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_library") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_library( - name = "graph", - srcs = glob(['**/*.py']), - deps = [ - '//kglib/utils/graph/query', - '//kglib/utils/graph/thing', - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/graph/query/BUILD b/kglib/utils/graph/query/BUILD deleted file mode 100644 index bb705dc0..00000000 --- a/kglib/utils/graph/query/BUILD +++ /dev/null @@ -1,54 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - - -py_test( - name = "query_graph_test", - srcs = [ - "query_graph_test.py" - ], - deps = [ - "query", - ], -) - -py_library( - name = "query", - srcs = [ - 'query_graph.py', - ], - deps = [ - vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('decorator'), - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/graph/test/BUILD b/kglib/utils/graph/test/BUILD deleted file mode 100644 index 781dcce6..00000000 --- a/kglib/utils/graph/test/BUILD +++ /dev/null @@ -1,37 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_library( - name = "test", - srcs = [ - 'case.py', - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/graph/thing/BUILD b/kglib/utils/graph/thing/BUILD deleted file mode 100644 index 4b795846..00000000 --- a/kglib/utils/graph/thing/BUILD +++ /dev/null @@ -1,85 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - - -py_test( - name = "queries_to_networkx_graph_test", - srcs = [ - "queries_to_networkx_graph_test.py" - ], - deps = [ - "thing", - "//kglib/utils/typedb/test", - "//kglib/utils/graph/test", - ] -) - -py_test( - name = "queries_to_networkx_graph_it", - srcs = [ - "queries_to_networkx_graph_it.py" - ], - deps = [ - "thing", - "//kglib/utils/typedb/test", - "//kglib/utils/graph/test", - "@vaticle_typedb_client_python//:client_python", - ], - data = ["@vaticle_typedb_artifact_linux//file"], - args = ["$(location @vaticle_typedb_artifact_linux//file)"], -) - - -py_test( - name = "concept_dict_to_graph_test", - srcs = [ - "concept_dict_to_graph_test.py" - ], - deps = [ - "thing", - "//kglib/utils/graph/test", - ], -) - -py_library( - name = "thing", - srcs = [ - 'concept_dict_to_networkx_graph.py', - 'queries_to_networkx_graph.py' - ], - deps = [ - "//kglib/utils/typedb/object", - vaticle_kglib_requirement('networkx'), - vaticle_kglib_requirement('decorator'), - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/test/BUILD b/kglib/utils/test/BUILD deleted file mode 100644 index 13238d0a..00000000 --- a/kglib/utils/test/BUILD +++ /dev/null @@ -1,35 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -load("@rules_python//python:defs.bzl", "py_library") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_library( - name = "test", - srcs = glob(['**/*.py']), - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/test/utils.py b/kglib/utils/test/utils.py deleted file mode 100644 index 58126508..00000000 --- a/kglib/utils/test/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -def get_call_args(mock): - """ - Get the arguments used to call a mock for each call to the mock. Necessary since `.assert_has_calls` won't work - for numpy arrays - Args: - mock: the mock - - Returns: - A list of lists. 
The outer list is the calls made, the inner list is the arguments given for that call - """ - flat_args = [] - args_list = mock.call_args_list - for call in args_list: - args, kwargs = call - flat_args.append(args) - return flat_args diff --git a/kglib/utils/typedb/BUILD b/kglib/utils/typedb/BUILD deleted file mode 100644 index 4c7f65fe..00000000 --- a/kglib/utils/typedb/BUILD +++ /dev/null @@ -1,42 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_library( - name = "typedb", - deps = [ - '//kglib/utils/typedb/test', - '//kglib/utils/typedb/object', - '//kglib/utils/typedb/synthetic', - '//kglib/utils/typedb/type', - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/typedb/synthetic/BUILD b/kglib/utils/typedb/synthetic/BUILD deleted file mode 100644 index 29329537..00000000 --- a/kglib/utils/typedb/synthetic/BUILD +++ /dev/null @@ -1,41 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - - -py_library( - name = "synthetic", - deps = [ - '//kglib/utils/typedb/synthetic/statistics', - '//kglib/utils/typedb/synthetic/examples', - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/typedb/synthetic/statistics/BUILD b/kglib/utils/typedb/synthetic/statistics/BUILD deleted file mode 100644 index 16bcae3b..00000000 --- a/kglib/utils/typedb/synthetic/statistics/BUILD +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_test", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_test( - name = "pmf_test", - srcs = [ - "pmf_test.py" - ], - deps = [ - "statistics" - ] -) - -py_library( - name = "statistics", - srcs = [ - 'pmf.py', - ], - deps = [ - vaticle_kglib_requirement('numpy'), - vaticle_kglib_requirement('pandas'), - vaticle_kglib_requirement('pytz'), - vaticle_kglib_requirement('python-dateutil'), - vaticle_kglib_requirement('six') - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/typedb/type/BUILD b/kglib/utils/typedb/type/BUILD deleted file mode 100644 index 71428367..00000000 --- a/kglib/utils/typedb/type/BUILD +++ /dev/null @@ -1,39 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@rules_python//python:defs.bzl", "py_library") -load("@vaticle_kglib_pip//:requirements.bzl", - vaticle_kglib_requirement = "requirement") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -py_library( - name = "type", - srcs = [ - 'type.py', - ], - visibility=['//visibility:public'] -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/utils/typedb/type/type.py b/kglib/utils/typedb/type/type.py deleted file mode 100644 index a43fe89a..00000000 --- a/kglib/utils/typedb/type/type.py +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -def get_thing_types(tx): - """ - Get all schema types, excluding those for implicit attribute relations and base types - Args: - tx: TypeDB transaction - - Returns: - TypeDB types - """ - schema_concepts = tx.query().match("match $x sub thing;") - thing_types = [schema_concept.get('x').get_label().name() for schema_concept in schema_concepts] - [thing_types.remove(el) for el in ['thing', 'relation', 'entity', 'attribute']] - return thing_types - - -def get_role_types(tx): - """ - Get all schema roles, excluding those for implicit attribute relations, the base role type - Args: - tx: TypeDB transaction - - Returns: - TypeDB roles - """ - schema_concepts = tx.query().match("match $rel sub relation, relates $r;") - role_types = ['has'] + [role.get('r').get_label().name() for role in schema_concepts] - role_types.remove('role') - return role_types diff --git a/requirements.txt b/requirements.txt index 1533b947..c2e9cf8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,48 +19,51 @@ # under the License. 
# -absl-py==0.11.0 -astor==0.8.1 -behave==1.2.6 -cached-property==1.5.2 -cloudpickle==1.1.1 -contextlib2==0.6.0.post1 -cycler==0.10.0 -decorator==4.4.2 -dm-sonnet==1.35 -dm-tree==0.1.5 -future==0.18.2 -gast==0.2.2 -google-pasta==0.2.0 -typedb-client==2.1.0 -graph-nets==1.0.4 -grpcio==1.35.0 -h5py==3.1.0 -importlib-metadata==3.3.0 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.2 -kiwisolver==1.3.1 -Markdown==3.3.3 -matplotlib==3.1.1 +--find-links https://download.pytorch.org/whl/cpu/torch_stable.html +--find-links https://data.pyg.org/whl/torch-1.11.0+cpu.html + +absl-py==1.2.0 +cachetools==5.2.0 +certifi==2022.6.15 +charset-normalizer==2.1.0 +decorator==5.1.1 +google-auth==2.9.1 +google-auth-oauthlib==0.4.6 +grpcio==1.43.0 +idna==3.3 +importlib-metadata==4.12.0 +Jinja2==3.1.2 +joblib==1.1.0 +Markdown==3.4.1 +MarkupSafe==2.1.1 networkx==2.5 -numpy==1.19.5 -pandas==0.25.1 -parse==1.18.0 -parse-type==0.5.2 -protobuf==3.14.0 -PyHamcrest==2.0.2 -pyparsing==2.4.7 -python-dateutil==2.8.1 -pytz==2020.5 -scipy==1.3.1 -semantic-version==2.8.5 -six==1.15.0 -tensorboard==1.14.0 -tensorflow==1.14.0 -tensorflow-estimator==1.14.0 -tensorflow-probability==0.7.0 -termcolor==1.1.0 -typing-extensions==3.7.4.3 -Werkzeug==1.0.1 -wrapt==1.12.1 -zipp==3.4.0 +numpy==1.21.6 +oauthlib==3.2.0 +pandas==1.3.5 +protobuf==3.15.5 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pyparsing==3.0.9 +python-dateutil==2.8.2 +pytz==2022.1 +requests==2.28.1 +requests-oauthlib==1.3.1 +rsa==4.9 +scikit-learn==1.0.2 +scipy==1.7.3 +six==1.16.0 +tensorboard==2.9.1 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +threadpoolctl==3.1.0 +torch==1.11.0 +torch-geometric==2.0.4 +torch-scatter==2.0.9 +torch-sparse==0.6.14 +tqdm==4.64.0 +typedb-client==2.9.0 +typedb-protocol==2.9.0 +typing-extensions==4.3.0 +urllib3==1.26.10 +Werkzeug==2.1.2 +zipp==3.8.1 diff --git a/requirements-dev.txt b/requirements_dev.txt similarity index 50% rename from requirements-dev.txt rename to requirements_dev.txt index 
a9b3a4f8..58faed4a 100644 --- a/requirements-dev.txt +++ b/requirements_dev.txt @@ -20,48 +20,49 @@ # --extra-index-url https://repo.vaticle.com/repository/pypi-snapshot/simple +--find-links https://download.pytorch.org/whl/cpu/torch_stable.html +--find-links https://data.pyg.org/whl/torch-1.11.0+cpu.html -absl-py==0.11.0 -astor==0.8.1 -behave==1.2.6 -cached-property==1.5.2 -cloudpickle==1.1.1 -contextlib2==0.6.0.post1 -cycler==0.10.0 -decorator==4.4.2 -dm-sonnet==1.35 -dm-tree==0.1.5 -future==0.18.2 -gast==0.2.2 -google-pasta==0.2.0 -graph-nets==1.0.4 -grpcio==1.35.0 -h5py==3.1.0 -importlib-metadata==3.3.0 -Keras-Applications==1.0.8 -Keras-Preprocessing==1.1.2 -kiwisolver==1.3.1 -Markdown==3.3.3 -matplotlib==3.1.1 +absl-py==1.2.0 +cachetools==5.2.0 +certifi==2022.6.15 +charset-normalizer==2.1.0 +decorator==5.1.1 +google-auth==2.9.1 +google-auth-oauthlib==0.4.6 +grpcio==1.43.0 +idna==3.3 +importlib-metadata==4.12.0 +Jinja2==3.1.2 +joblib==1.1.0 +Markdown==3.4.1 +MarkupSafe==2.1.1 networkx==2.5 -numpy==1.19.5 -pandas==0.25.1 -parse==1.18.0 -parse-type==0.5.2 -protobuf==3.14.0 -PyHamcrest==2.0.2 -pyparsing==2.4.7 -python-dateutil==2.8.1 -pytz==2020.5 -scipy==1.3.1 -semantic-version==2.8.5 -six==1.15.0 -tensorboard==1.14.0 -tensorflow==1.14.0 -tensorflow-estimator==1.14.0 -tensorflow-probability==0.7.0 -termcolor==1.1.0 -typing-extensions==3.7.4.3 -Werkzeug==1.0.1 -wrapt==1.12.1 -zipp==3.4.0 +numpy==1.21.6 +oauthlib==3.2.0 +pandas==1.3.5 +protobuf==3.15.5 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pyparsing==3.0.9 +python-dateutil==2.8.2 +pytz==2022.1 +requests==2.28.1 +requests-oauthlib==1.3.1 +rsa==4.9 +scikit-learn==1.0.2 +scipy==1.7.3 +six==1.16.0 +tensorboard==2.9.1 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +threadpoolctl==3.1.0 +torch==1.11.0 +torch-geometric==2.0.4 +torch-scatter==2.0.9 +torch-sparse==0.6.14 +tqdm==4.64.0 +typing-extensions==4.3.0 +urllib3==1.26.10 +Werkzeug==2.1.2 +zipp==3.8.1 diff --git a/setup.py b/setup.py index 
76af24a0..1b3cc2d4 100644 --- a/setup.py +++ b/setup.py @@ -22,12 +22,12 @@ from setuptools import setup setup( - name='kglib', + name='typedb_ml', version='', - packages=['kglib', 'kglib.utils', 'kglib.kgcn_tensorflow', 'kglib.kgcn_data_loader'], + packages=['typedb_ml', 'typedb_ml.networkx', 'typedb_ml.pytorch_geometric', 'typedb_ml.typedb'], url='', license='', - author='jms_fltchr', + author='jmsfltchr', author_email='', description='' ) diff --git a/test/BUILD b/test/BUILD deleted file mode 100644 index d9cc3eab..00000000 --- a/test/BUILD +++ /dev/null @@ -1,34 +0,0 @@ -# -# Copyright (C) 2022 Vaticle -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -load("@vaticle_bazel_distribution//artifact:rules.bzl", "artifact_extractor") -load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") - -artifact_extractor( - name = "typedb-extractor-linux", - artifact = "@vaticle_typedb_artifact_linux//file", -) - -checkstyle_test( - name = "checkstyle", - include = glob(["*"]), - license_type = "apache-header", -) diff --git a/kglib/tests/end_to_end/BUILD b/tests/end_to_end/BUILD similarity index 68% rename from kglib/tests/end_to_end/BUILD rename to tests/end_to_end/BUILD index 1a037d54..8695ccc4 100644 --- a/kglib/tests/end_to_end/BUILD +++ b/tests/end_to_end/BUILD @@ -20,24 +20,25 @@ # load("@rules_python//python:defs.bzl", "py_test") +load("@vaticle_bazel_distribution//artifact:rules.bzl", "artifact_extractor") load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") py_test( name = "diagnosis", size = "enormous", srcs = [ - "kgcn/diagnosis.py" + "diagnosis.py" ], deps = [ - "//kglib/kgcn_tensorflow/examples/diagnosis", - "//kglib/utils/typedb/test", + "//examples/diagnosis", + "//typedb_ml/typedb/test", ], data = [ - "//kglib/utils/typedb/synthetic/examples:diagnosis-example-typeql-files", + "//examples/diagnosis/dataset:diagnosis-example-typeql-files", "@vaticle_typedb_artifact_linux//file" ], args = [ - "$(locations //kglib/utils/typedb/synthetic/examples:diagnosis-example-typeql-files)", + "$(locations //examples/diagnosis/dataset:diagnosis-example-typeql-files)", "$(location @vaticle_typedb_artifact_linux//file)", ], ) @@ -46,25 +47,29 @@ py_test( name = "diagnosis-debug", size = "enormous", srcs = [ - "kgcn/diagnosis_debug.py" + "diagnosis_debug.py" ], - main = "kgcn/diagnosis_debug.py", + main = "diagnosis_debug.py", deps = [ - "//kglib/kgcn_tensorflow/examples/diagnosis", + "//examples/diagnosis", ], data = [ - "//kglib/utils/typedb/synthetic/examples:diagnosis-example-typeql-files", + "//examples/diagnosis/dataset:diagnosis-example-typeql-files", ], args = [ - 
"$(locations //kglib/utils/typedb/synthetic/examples:diagnosis-example-typeql-files)", + "$(locations //examples/diagnosis/dataset:diagnosis-example-typeql-files)", ], ) +artifact_extractor( + name = "typedb-extractor-linux", + artifact = "@vaticle_typedb_artifact_linux//file", +) + checkstyle_test( name = "checkstyle", include = glob([ "*", - "**/*" ]), license_type = "apache-header", ) diff --git a/kglib/tests/end_to_end/kgcn/diagnosis.py b/tests/end_to_end/diagnosis.py similarity index 65% rename from kglib/tests/end_to_end/kgcn/diagnosis.py rename to tests/end_to_end/diagnosis.py index 771fd879..46af1ec1 100644 --- a/kglib/tests/end_to_end/kgcn/diagnosis.py +++ b/tests/end_to_end/diagnosis.py @@ -20,30 +20,35 @@ # import sys import unittest +import os -from kglib.kgcn_tensorflow.examples.diagnosis.diagnosis import diagnosis_example -from kglib.utils.typedb.test.base import TypeDBServer +from examples.diagnosis.diagnosis import diagnosis_example +from typedb_ml.typedb.test.base import TypeDBServer class TestDiagnosisExample(unittest.TestCase): def setUp(self): + cwd = os.getcwd() self._tdb = TypeDBServer(sys.argv.pop()) self._tdb.start() self._typedb_binary_location = self._tdb.typedb_binary_location - self._data_file_location = sys.argv.pop() - self._schema_file_location = sys.argv.pop() + self._data_file_location = cwd + '/' + sys.argv.pop() + self._schema_file_location = cwd + '/' + sys.argv.pop() def tearDown(self): self._tdb.stop() def test_learning_is_done(self): - solveds_tr, solveds_ge = diagnosis_example(self._typedb_binary_location, - schema_file_path=self._schema_file_location, - seed_data_file_path=self._data_file_location) - self.assertGreaterEqual(solveds_tr[-1], 0.7) - self.assertLessEqual(solveds_tr[-1], 0.99) - self.assertGreaterEqual(solveds_ge[-1], 0.7) - self.assertLessEqual(solveds_ge[-1], 0.99) + train_accuracy, test_accuracy = diagnosis_example( + self._typedb_binary_location, + 200, + schema_file_path=self._schema_file_location, + 
seed_data_file_path=self._data_file_location + ) + self.assertGreaterEqual(train_accuracy, 0.75) + self.assertLessEqual(train_accuracy, 0.99) + self.assertGreaterEqual(test_accuracy, 0.75) + self.assertLessEqual(test_accuracy, 0.99) if __name__ == "__main__": diff --git a/kglib/tests/end_to_end/kgcn/diagnosis_debug.py b/tests/end_to_end/diagnosis_debug.py similarity index 68% rename from kglib/tests/end_to_end/kgcn/diagnosis_debug.py rename to tests/end_to_end/diagnosis_debug.py index daa18995..941069b8 100644 --- a/kglib/tests/end_to_end/kgcn/diagnosis_debug.py +++ b/tests/end_to_end/diagnosis_debug.py @@ -22,14 +22,14 @@ import sys import unittest -from kglib.kgcn_tensorflow.examples.diagnosis.diagnosis import diagnosis_example +from examples.diagnosis.diagnosis import diagnosis_example class TestDiagnosisExampleDebug(unittest.TestCase): """ A copy of the end-to-end test for local debugging. Requires a TypeDB server to be started in the background manually. Run with: - bazel test //kglib/tests/end_to_end:diagnosis --test_output=streamed --spawn_strategy=standalone --action_env=PATH --test_arg=-- + bazel test //tests/end_to_end:diagnosis --test_output=streamed --spawn_strategy=standalone --action_env=PATH --test_arg=-- """ def setUp(self): @@ -39,13 +39,16 @@ def setUp(self): self._schema_file_location = base_dir + sys.argv.pop() def test_learning_is_done(self): - solveds_tr, solveds_ge = diagnosis_example(self._typedb_binary_location, - schema_file_path=self._schema_file_location, - seed_data_file_path=self._data_file_location) - self.assertGreaterEqual(solveds_tr[-1], 0.7) - self.assertLessEqual(solveds_tr[-1], 0.99) - self.assertGreaterEqual(solveds_ge[-1], 0.7) - self.assertLessEqual(solveds_ge[-1], 0.99) + train_accuracy, test_accuracy = diagnosis_example( + self._typedb_binary_location, + 200, + schema_file_path=self._schema_file_location, + seed_data_file_path=self._data_file_location + ) + self.assertGreaterEqual(train_accuracy, 0.75) + 
self.assertLessEqual(train_accuracy, 0.99) + self.assertGreaterEqual(test_accuracy, 0.75) + self.assertLessEqual(test_accuracy, 0.99) if __name__ == "__main__": diff --git a/kglib/BUILD b/typedb_ml/BUILD similarity index 91% rename from kglib/BUILD rename to typedb_ml/BUILD index 56040a2e..d39af225 100644 --- a/kglib/BUILD +++ b/typedb_ml/BUILD @@ -24,13 +24,14 @@ load("@vaticle_bazel_distribution_pip//:requirements.bzl", deployment_requiremen load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") py_library( - name = "kglib", + name = "typedb-ml", srcs = [ '__init__.py', ], deps = [ - '//kglib/kgcn_tensorflow:kgcn', - '//kglib/utils', + '//typedb_ml/typedb', + '//typedb_ml/networkx', + '//typedb_ml/pytorch_geometric', ], visibility=['//visibility:public'] ) diff --git a/kglib/tests/deployment/requirements.txt b/typedb_ml/__init__.py similarity index 85% rename from kglib/tests/deployment/requirements.txt rename to typedb_ml/__init__.py index 6e0a5917..b9326ce9 100644 --- a/kglib/tests/deployment/requirements.txt +++ b/typedb_ml/__init__.py @@ -18,5 +18,3 @@ # specific language governing permissions and limitations # under the License. # - -https://repo.vaticle.com/repository/pypi-snapshot-group/packages/typedb-kglib/KGLIB_VERSION_MARKER/typedb-kglib-KGLIB_VERSION_MARKER.tar.gz \ No newline at end of file diff --git a/typedb_ml/networkx/BUILD b/typedb_ml/networkx/BUILD new file mode 100644 index 00000000..9d0c57fc --- /dev/null +++ b/typedb_ml/networkx/BUILD @@ -0,0 +1,105 @@ +# +# Copyright (C) 2022 Vaticle +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +load("@rules_python//python:defs.bzl", "py_library") +load("@vaticle_typedb_ml_pip//:requirements.bzl", vaticle_typedb_ml_requirement = "requirement") +load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") + +py_library( + name = "networkx", + srcs = [ + 'query_graph.py', + 'concept_dict_to_networkx.py', + 'queries_to_networkx.py', + 'iterate.py', + ], + deps = [ + "//typedb_ml/typedb", + vaticle_typedb_ml_requirement('networkx'), + vaticle_typedb_ml_requirement('decorator'), + ], + visibility=['//visibility:public'] +) + +py_test( + name = "query_graph_test", + srcs = [ + "query_graph_test.py", + "query_graph.py" + ], + deps = [ + vaticle_typedb_ml_requirement('networkx'), + ] +) + +py_test( + name = "queries_to_networkx_test", + srcs = [ + "queries_to_networkx_test.py", + "queries_to_networkx.py", + "concept_dict_to_networkx.py", + "query_graph.py", + "graph_test_case.py" + ], + deps = [ + "//typedb_ml/typedb", + vaticle_typedb_ml_requirement('networkx'), + ] +) + +py_test( + name = "queries_to_networkx_it", + srcs = [ + "queries_to_networkx_it.py", + "queries_to_networkx.py", + "concept_dict_to_networkx.py", + "query_graph.py", + "graph_test_case.py" + ], + deps = [ + "//typedb_ml/typedb", + "//typedb_ml/typedb/test", + "@vaticle_typedb_client_python//:client_python", + vaticle_typedb_ml_requirement('networkx'), + ], + data = ["@vaticle_typedb_artifact_linux//file"], + args = ["$(location @vaticle_typedb_artifact_linux//file)"], +) + + +py_test( + name = "concept_dict_to_networkx_test", + srcs = [ + 
"concept_dict_to_networkx_test.py", + "concept_dict_to_networkx.py", + "graph_test_case.py" + ], + deps = [ + "//typedb_ml/typedb", + vaticle_typedb_ml_requirement('networkx'), + ], +) + +checkstyle_test( + name = "checkstyle", + include = glob(["*"]), + license_type = "apache-header", +) diff --git a/kglib/utils/graph/thing/concept_dict_to_networkx_graph.py b/typedb_ml/networkx/concept_dict_to_networkx.py similarity index 98% rename from kglib/utils/graph/thing/concept_dict_to_networkx_graph.py rename to typedb_ml/networkx/concept_dict_to_networkx.py index 14ce059e..a8631f62 100644 --- a/kglib/utils/graph/thing/concept_dict_to_networkx_graph.py +++ b/typedb_ml/networkx/concept_dict_to_networkx.py @@ -22,7 +22,7 @@ import networkx as nx -def concept_dict_to_graph(concept_dict, variable_graph): +def concept_dict_to_networkx(concept_dict, variable_graph): """ Create a new graph from a concept_dict, based on a `variable_graph` that describes the interactions between the variables in the `concept_dict` diff --git a/kglib/utils/graph/thing/concept_dict_to_graph_test.py b/typedb_ml/networkx/concept_dict_to_networkx_test.py similarity index 86% rename from kglib/utils/graph/thing/concept_dict_to_graph_test.py rename to typedb_ml/networkx/concept_dict_to_networkx_test.py index fe7e5710..4811fa9f 100644 --- a/kglib/utils/graph/thing/concept_dict_to_graph_test.py +++ b/typedb_ml/networkx/concept_dict_to_networkx_test.py @@ -22,9 +22,9 @@ import unittest import networkx as nx -from kglib.utils.typedb.object.thing import Thing -from kglib.utils.graph.thing.concept_dict_to_networkx_graph import concept_dict_to_graph -from kglib.utils.graph.test.case import GraphTestCase +from typedb_ml.typedb.thing import Thing +from typedb_ml.networkx.concept_dict_to_networkx import concept_dict_to_networkx +from typedb_ml.networkx.graph_test_case import GraphTestCase class TestConceptDictToTypeDBGraph(GraphTestCase): @@ -35,7 +35,7 @@ def test_single_entity_graph_is_as_expected(self): person 
= Thing('V123', 'person', 'entity') concept_dict = {'x': person} - typedb_graph = concept_dict_to_graph(concept_dict, variable_graph) + typedb_graph = concept_dict_to_networkx(concept_dict, variable_graph) expected_typedb_graph = nx.MultiDiGraph() expected_typedb_graph.add_node(person, type='person') @@ -48,7 +48,7 @@ def test_single_attribute_graph_is_as_expected(self): name = Thing('V123', 'name', 'attribute', value_type='string', value='Bob') concept_dict = {'x': name} - typedb_graph = concept_dict_to_graph(concept_dict, variable_graph) + typedb_graph = concept_dict_to_networkx(concept_dict, variable_graph) expected_typedb_graph = nx.MultiDiGraph() expected_typedb_graph.add_node(name, type='name', value_type='string', value='Bob') @@ -64,7 +64,7 @@ def test_single_entity_single_relation_graph_is_as_expected(self): employment = Thing('V456', 'employment', 'relation') concept_dict = {'x': person, 'y': employment} - typedb_graph = concept_dict_to_graph(concept_dict, variable_graph) + typedb_graph = concept_dict_to_networkx(concept_dict, variable_graph) expected_typedb_graph = nx.MultiDiGraph() expected_typedb_graph.add_node(person, type='person') expected_typedb_graph.add_node(employment, type='employment') @@ -85,7 +85,7 @@ def test_two_entity_single_relation_graph_is_as_expected(self): employment = Thing('V12345', 'employment', 'relation') concept_dict = {'x': person, 'y': company, 'r': employment} - typedb_graph = concept_dict_to_graph(concept_dict, variable_graph) + typedb_graph = concept_dict_to_networkx(concept_dict, variable_graph) expected_typedb_graph = nx.MultiDiGraph() expected_typedb_graph.add_node(person, type='person') @@ -106,7 +106,7 @@ def test_same_thing_occurs_in_two_different_variables(self): concept_dict = {'x': person, 'y': person2} - typedb_graph = concept_dict_to_graph(concept_dict, variable_graph) + typedb_graph = concept_dict_to_networkx(concept_dict, variable_graph) expected_typedb_graph = nx.MultiDiGraph() 
expected_typedb_graph.add_node(person, type='person') @@ -123,7 +123,7 @@ def test_edge_starting_from_entity_throws_exception(self): concept_dict = {'x': person, 'y': employment} with self.assertRaises(ValueError) as context: - _ = concept_dict_to_graph(concept_dict, variable_graph) + _ = concept_dict_to_networkx(concept_dict, variable_graph) self.assertEqual('An edge in the variable_graph originates from a non-relation, check the variable_graph!', str(context.exception)) @@ -139,7 +139,7 @@ def test_edge_starting_from_attribute_throws_exception(self): concept_dict = {'x': name, 'y': employment} with self.assertRaises(ValueError) as context: - _ = concept_dict_to_graph(concept_dict, variable_graph) + _ = concept_dict_to_networkx(concept_dict, variable_graph) self.assertEqual('An edge in the variable_graph originates from a non-relation, check the variable_graph!', str(context.exception)) @@ -156,7 +156,7 @@ def test_exception_if_sets_of_variables_differ(self): 'a': thing} with self.assertRaises(ValueError) as context: - _ = concept_dict_to_graph(concept_dict, variable_graph) + _ = concept_dict_to_networkx(concept_dict, variable_graph) self.assertEqual( 'The variables in the variable_graph must match those in the concept_dict\n' @@ -166,19 +166,19 @@ def test_exception_if_sets_of_variables_differ(self): def test_variable_graph_properties_are_transferred_to_graph(self): variable_graph = nx.MultiDiGraph() - variable_graph.add_node('x', input=1, solution=1) - variable_graph.add_node('y', input=1, solution=1) - variable_graph.add_edge('y', 'x', type='employee', input=0, solution=1) + variable_graph.add_node('x', input=1) + variable_graph.add_node('y', input=1) + variable_graph.add_edge('y', 'x', type='employee', input=0) person = Thing('V123', 'person', 'entity') employment = Thing('V456', 'employment', 'relation') concept_dict = {'x': person, 'y': employment} - typedb_graph = concept_dict_to_graph(concept_dict, variable_graph) + typedb_graph = 
concept_dict_to_networkx(concept_dict, variable_graph) expected_typedb_graph = nx.MultiDiGraph() - expected_typedb_graph.add_node(person, type='person', input=1, solution=1) - expected_typedb_graph.add_node(employment, type='employment', input=1, solution=1) - expected_typedb_graph.add_edge(employment, person, type='employee', input=0, solution=1) + expected_typedb_graph.add_node(person, type='person', input=1) + expected_typedb_graph.add_node(employment, type='employment', input=1) + expected_typedb_graph.add_edge(employment, person, type='employee', input=0) self.assertGraphsEqual(expected_typedb_graph, typedb_graph) diff --git a/kglib/utils/graph/test/case.py b/typedb_ml/networkx/graph_test_case.py similarity index 100% rename from kglib/utils/graph/test/case.py rename to typedb_ml/networkx/graph_test_case.py diff --git a/kglib/utils/graph/iterate.py b/typedb_ml/networkx/iterate.py similarity index 100% rename from kglib/utils/graph/iterate.py rename to typedb_ml/networkx/iterate.py diff --git a/kglib/utils/graph/thing/queries_to_networkx_graph.py b/typedb_ml/networkx/queries_to_networkx.py similarity index 76% rename from kglib/utils/graph/thing/queries_to_networkx_graph.py rename to typedb_ml/networkx/queries_to_networkx.py index 1acf1904..8cd65bb8 100644 --- a/kglib/utils/graph/thing/queries_to_networkx_graph.py +++ b/typedb_ml/networkx/queries_to_networkx.py @@ -24,8 +24,62 @@ from functools import reduce import networkx as nx -from kglib.utils.typedb.object.thing import build_thing -from kglib.utils.graph.thing.concept_dict_to_networkx_graph import concept_dict_to_graph +from typedb_ml.typedb.thing import build_thing +from typedb_ml.networkx.concept_dict_to_networkx import concept_dict_to_networkx + + +def build_graph_from_queries(queries, transaction, concept_dict_converter=concept_dict_to_networkx): + """ + Builds a graph of Things, interconnected by _roles_ and _has_ edges, from a set of Query objects + + Args: + queries: An iterable of Query objects + 
transaction: A TypeDB transaction + concept_dict_converter: The function to use to convert from concept_dicts to a TypeDB model. This could be + a typical model or a mathematical model + + Returns: + A networkx graph + """ + + query_concept_graphs = [] + + for query in queries: + + print("Working on query: " + query.string) + concept_maps = transaction.query().match(query.string) + print("Query completed") + + concept_dicts = [concept_dict_from_concept_map(concept_map) for concept_map in concept_maps] + print("Constructed concept_dicts") + + answer_concept_graphs = [] + for concept_dict in concept_dicts: + try: + answer_concept_graphs.append(concept_dict_converter(concept_dict, query.graph)) + except ValueError as e: + raise ValueError(str(e) + f'Encountered processing query:\n \"{query.string}\"') + + if len(answer_concept_graphs) > 1: + query_concept_graph = combine_n_graphs(answer_concept_graphs) + query_concept_graphs.append(query_concept_graph) + else: + if len(answer_concept_graphs) > 0: + query_concept_graphs.append(answer_concept_graphs[0]) + else: + warnings.warn(f'There were no results for query: \n\"{query}\"\nand so nothing will be added to the ' + f'graph for this query') + + if len(query_concept_graphs) == 0: + # Raise exception when none of the queries returned any results + raise RuntimeError( + f'The graph from queries: ' + f'{[query.string for query in queries]}\n' + f'could not be created, since none of these queries returned results' + ) + + concept_graph = combine_n_graphs(query_concept_graphs) + return concept_graph def concept_dict_from_concept_map(concept_map): @@ -83,58 +137,3 @@ def combine_n_graphs(graphs_list): """ return reduce(lambda x, y: combine_2_graphs(x, y), graphs_list) - - -def build_graph_from_queries(query_sampler_variable_graph_tuples, typedb_transaction, - concept_dict_converter=concept_dict_to_graph): - """ - Builds a graph of Things, interconnected by roles (and *has*), from a set of queries and graphs representing those - 
queries (variable graphs)of those queries, over a TypeDB transaction - - Args: - infer: whether to use TypeDB's inference engine - query_sampler_variable_graph_tuples: A list of tuples, each tuple containing a query, a sampling function, - and a variable_graph - typedb_transaction: A TypeDB transaction - concept_dict_converter: The function to use to convert from concept_dicts to a TypeDB model. This could be - a typical model or a mathematical model - - Returns: - A networkx graph - """ - - query_concept_graphs = [] - - for query, sampler, variable_graph in query_sampler_variable_graph_tuples: - - print("working on query: " + query) - concept_maps = sampler(typedb_transaction.query().match(query)) - print("query completed") - - concept_dicts = [concept_dict_from_concept_map(concept_map) for concept_map in concept_maps] - print("constructed concept_dicts") - - answer_concept_graphs = [] - for concept_dict in concept_dicts: - try: - answer_concept_graphs.append(concept_dict_converter(concept_dict, variable_graph)) - except ValueError as e: - raise ValueError(str(e) + f'Encountered processing query:\n \"{query}\"') - - if len(answer_concept_graphs) > 1: - query_concept_graph = combine_n_graphs(answer_concept_graphs) - query_concept_graphs.append(query_concept_graph) - else: - if len(answer_concept_graphs) > 0: - query_concept_graphs.append(answer_concept_graphs[0]) - else: - warnings.warn(f'There were no results for query: \n\"{query}\"\nand so nothing will be added to the ' - f'graph for this query') - - if len(query_concept_graphs) == 0: - # Raise exception when none of the queries returned any results - raise RuntimeError(f'The graph from queries: {[query_sampler_variable_graph_tuple[0] for query_sampler_variable_graph_tuple in query_sampler_variable_graph_tuples]}\n' - f'could not be created, since none of these queries returned results') - - concept_graph = combine_n_graphs(query_concept_graphs) - return concept_graph diff --git 
a/kglib/utils/graph/thing/queries_to_networkx_graph_it.py b/typedb_ml/networkx/queries_to_networkx_it.py similarity index 79% rename from kglib/utils/graph/thing/queries_to_networkx_graph_it.py rename to typedb_ml/networkx/queries_to_networkx_it.py index 770605cc..fa3dca2d 100644 --- a/kglib/utils/graph/thing/queries_to_networkx_graph_it.py +++ b/typedb_ml/networkx/queries_to_networkx_it.py @@ -23,13 +23,13 @@ import unittest import networkx as nx -from typedb.api.concept.type.attribute_type import AttributeType from typedb.client import * -from kglib.utils.graph.test.case import GraphTestCase -from kglib.utils.graph.thing.queries_to_networkx_graph import build_graph_from_queries -from kglib.utils.typedb.object.thing import build_thing -from kglib.utils.typedb.test.base import TypeDBServer +from typedb_ml.networkx.graph_test_case import GraphTestCase +from typedb_ml.networkx.queries_to_networkx import build_graph_from_queries +from typedb_ml.networkx.query_graph import Query +from typedb_ml.typedb.test.base import TypeDBServer +from typedb_ml.typedb.thing import build_thing class ITBuildGraphFromQueriesWithRealTypeDB(GraphTestCase): @@ -71,13 +71,11 @@ def test_graph_is_built_from_typedb_as_expected(self): g3.add_edge('r', 'x', type='child') g3.add_edge('r', 'y', type='parent') - query_sampler_variable_graph_tuples = [('match $x isa person;', mock_sampler, g1), - ('match $x isa person, has name $n;', mock_sampler, g2), - ('match $x isa person; $r(child: $x, parent: $y);', mock_sampler, g3), - # TODO Add functionality for loading schema at a later date - # ('match $x sub person; $x sub $type;', g4), - # ('match $x sub $y;', g5), - ] + queries = [ + Query(g1, 'match $x isa person;'), + Query(g2, 'match $x isa person, has name $n;'), + Query(g3, 'match $x isa person; $r(child: $x, parent: $y);'), + ] with self._client.session(self._database, SessionType.SCHEMA) as session: @@ -91,7 +89,7 @@ def test_graph_is_built_from_typedb_as_expected(self): 
tx.query().insert(ITBuildGraphFromQueriesWithRealTypeDB.DATA) tx.commit() with session.transaction(TransactionType.READ) as tx: - combined_graph = build_graph_from_queries(query_sampler_variable_graph_tuples, tx) + combined_graph = build_graph_from_queries(queries, tx) person_exp = build_thing(next(tx.query().match('match $x isa person;')).get('x')) name_exp = build_thing(next(tx.query().match('match $x isa name;')).get('x')) diff --git a/kglib/utils/graph/thing/queries_to_networkx_graph_test.py b/typedb_ml/networkx/queries_to_networkx_test.py similarity index 78% rename from kglib/utils/graph/thing/queries_to_networkx_graph_test.py rename to typedb_ml/networkx/queries_to_networkx_test.py index 0576e8e5..ea89a857 100644 --- a/kglib/utils/graph/thing/queries_to_networkx_graph_test.py +++ b/typedb_ml/networkx/queries_to_networkx_test.py @@ -24,16 +24,13 @@ import networkx as nx from typedb.api.concept.type.attribute_type import AttributeType -from kglib.utils.graph.test.case import GraphTestCase -from kglib.utils.graph.thing.queries_to_networkx_graph import concept_dict_from_concept_map, \ +from typedb_ml.networkx.graph_test_case import GraphTestCase +from typedb_ml.networkx.queries_to_networkx import concept_dict_from_concept_map, \ combine_n_graphs, build_graph_from_queries -from kglib.utils.typedb.object.thing import Thing -from kglib.utils.typedb.test.mock.answer import MockConceptMap -from kglib.utils.typedb.test.mock.concept import MockType, MockAttributeType, MockThing, MockAttribute - - -def mock_sampler(input_iter): - return input_iter +from typedb_ml.networkx.query_graph import Query +from typedb_ml.typedb.test.mock.answer import MockConceptMap +from typedb_ml.typedb.test.mock.concept import MockType, MockAttributeType, MockThing, MockAttribute +from typedb_ml.typedb.thing import Thing class TestBuildGraphFromQueries(GraphTestCase): @@ -53,13 +50,11 @@ def test_graph_is_built_as_expected(self): g3.add_edge('r', 'x', type='child') g3.add_edge('r', 'y', 
type='parent') - query_sampler_variable_graph_tuples = [('match $x iid V123;', mock_sampler, g1), - ('match $x iid V123, has name $n;', mock_sampler, g2), - ('match $x iid V123; $r(child: $x, parent: $y);', mock_sampler, g3), - # TODO Add functionality for loading schema at a later date - # ('match $x sub person; $x sub $type;', g4), - # ('match $x sub $y;', g5), - ] + queries = [ + Query(g1, 'match $x iid V123;'), + Query(g2, 'match $x iid V123, has name $n;'), + Query(g3, 'match $x iid V123; $r(child: $x, parent: $y);'), + ] class MockTransaction: @@ -90,7 +85,7 @@ def match(self, query): mock_tx = MockTransaction() - combined_graph = build_graph_from_queries(query_sampler_variable_graph_tuples, mock_tx) + combined_graph = build_graph_from_queries(queries, mock_tx) person_exp = Thing('V123', 'person', 'entity') parentship_exp = Thing('V567', 'parentship', 'relation') @@ -110,8 +105,8 @@ def test_warning_given_when_one_query_gives_no_results(self): g1.add_node('x') g2 = nx.MultiDiGraph() g2.add_node('y') - query_sampler_variable_graph_tuples = [('match $x iid V123;', mock_sampler, g1), - ('match $y iid V123;', mock_sampler, g2)] + queries = [Query(g1, 'match $x iid V123;'), + Query(g2, 'match $y iid V123;')] class MockTransaction: def query(self): @@ -127,14 +122,14 @@ def match(self, query): mock_tx = MockTransaction() with self.assertWarns(UserWarning) as context: - build_graph_from_queries(query_sampler_variable_graph_tuples, mock_tx) + build_graph_from_queries(queries, mock_tx) self.assertEqual(f'There were no results for query: \n\"match $y iid V123;\"\nand so nothing will be added to the graph for this query', str(context.warning)) def test_exception_is_raised_when_there_are_no_results_for_any_query(self): g1 = nx.MultiDiGraph() g1.add_node('x') - query_sampler_variable_graph_tuples = [('match $x iid V123;', mock_sampler, g1)] + queries = [Query(g1, 'match $x iid V123;')] class QueryManagerEmpty: def match(self, query): @@ -147,9 +142,9 @@ def query(self): 
mock_tx = MockTransactionEmpty() with self.assertRaises(RuntimeError) as context: - build_graph_from_queries(query_sampler_variable_graph_tuples, mock_tx) + build_graph_from_queries(queries, mock_tx) - self.assertEqual(f'The graph from queries: {[query_sampler_variable_graph_tuple[0] for query_sampler_variable_graph_tuple in query_sampler_variable_graph_tuples]}\n' + self.assertEqual(f'The graph from queries: {[query.string for query in queries]}\n' f'could not be created, since none of these queries returned results', str(context.exception)) @@ -214,47 +209,47 @@ def test_when_graph_node_properties_are_mismatched_exception_is_raised(self): person_a = Thing('V123', 'person', 'entity') name_a = Thing('V1234', 'name', 'attribute', value_type='string', value='Bob') typedb_graph_a = nx.MultiDiGraph(name='a') - typedb_graph_a.add_node(person_a, input=1, solution=1) - typedb_graph_a.add_node(name_a, input=1, solution=1) - typedb_graph_a.add_edge(person_a, name_a, type='has', input=0, solution=1) + typedb_graph_a.add_node(person_a, input=1) + typedb_graph_a.add_node(name_a, input=1) + typedb_graph_a.add_edge(person_a, name_a, type='has', input=0) person_b = Thing('V123', 'person', 'entity') name_b = Thing('V1234', 'name', 'attribute', value_type='string', value='Bob') typedb_graph_b = nx.MultiDiGraph(name='b') - typedb_graph_b.add_node(person_b, input=1, solution=1) - typedb_graph_b.add_node(name_b, input=0, solution=1) - typedb_graph_b.add_edge(person_b, name_b, type='has', input=0, solution=1) + typedb_graph_b.add_node(person_b, input=1) + typedb_graph_b.add_node(name_b, input=0) + typedb_graph_b.add_edge(person_b, name_b, type='has', input=0) with self.assertRaises(ValueError) as context: combine_n_graphs([typedb_graph_a, typedb_graph_b]) self.assertEqual(('Found non-matching node properties for node ' 'between graphs a and b:\n' - 'In graph a: {\'input\': 1, \'solution\': 1}\n' - 'In graph b: {\'input\': 0, \'solution\': 1}'), str(context.exception)) + 'In graph a: 
class QueryGraph(nx.MultiDiGraph):
    """
    A custom graph to represent a query. Has additional helper methods specific to adding TypeQL patterns.
    """

    def add_vars(self, vars):
        """
        Add variables, stored as nodes in the graph.

        Args:
            vars: String variables

        Returns:
            self, so that calls can be chained
        """
        for variable in vars:
            self.add_node(variable)
        return self

    def add_has_edge(self, owner_var, attribute_var):
        """
        Add a "has" edge to represent ownership of an attribute.

        Args:
            owner_var: The variable of the owner
            attribute_var: The variable of the owned attribute

        Returns:
            self, so that calls can be chained
        """
        self.add_edge(owner_var, attribute_var, type='has')
        return self

    def add_role_edge(self, relation_var, roleplayer_var, role_label):
        """
        Add an edge to represent the role a variable plays in a relation.

        Args:
            relation_var: The variable of the relation
            roleplayer_var: The variable of the roleplayer in the relation
            role_label: The role the roleplayer plays in the relation

        Returns:
            self, so that calls can be chained
        """
        self.add_edge(relation_var, roleplayer_var, type=role_label)
        return self


class Query:
    """Pairs a TypeQL query string with the graph that describes its variables."""

    def __init__(self, graph: nx.MultiDiGraph, string: str):
        self.graph = graph
        self.string = string

    def __str__(self):
        return self.string
class TestQueryGraph(unittest.TestCase):
    """Unit tests for the QueryGraph helper methods."""

    def test_add_single_var_adds_variable_node_as_expected(self):
        graph = QueryGraph().add_vars(['a'])
        # A freshly-added variable node carries no data
        self.assertDictEqual({}, graph.nodes['a'])

    def test_add_vars_adds_variable_nodes_as_expected(self):
        graph = QueryGraph().add_vars(['a', 'b'])
        self.assertSetEqual({'a', 'b'}, set(graph.nodes))

    def test_add_has_edge_adds_edge_as_expected(self):
        graph = QueryGraph().add_vars('a').add_has_edge('a', 'b')
        self.assertEqual(1, len(list(graph.edges)))
        self.assertDictEqual({'type': 'has'}, graph.edges['a', 'b', 0])

    def test_add_role_edge_adds_role_as_expected(self):
        graph = QueryGraph().add_vars('a').add_role_edge('a', 'b', 'role_label')
        self.assertEqual(1, len(list(graph.edges)))
        self.assertDictEqual({'type': 'role_label'}, graph.edges['a', 'b', 0])
a/kglib/kgcn_tensorflow/BUILD +++ b/typedb_ml/pytorch_geometric/BUILD @@ -19,23 +19,30 @@ # under the License. # -load("@rules_python//python:defs.bzl", "py_library") load("@vaticle_dependencies//tool/checkstyle:rules.bzl", "checkstyle_test") +load("@vaticle_typedb_ml_pip//:requirements.bzl", vaticle_typedb_ml_requirement = "requirement") py_library( - name = "kgcn", + name = "pytorch_geometric", + srcs = glob([ + "*.py", + "**/*.py" + ]), deps = [ - '//kglib/kgcn_tensorflow/examples/diagnosis', - '//kglib/kgcn_tensorflow/learn', - '//kglib/kgcn_tensorflow/models', - '//kglib/kgcn_tensorflow/pipeline', - '//kglib/kgcn_tensorflow/plot', + vaticle_typedb_ml_requirement("networkx"), + vaticle_typedb_ml_requirement("torch"), + vaticle_typedb_ml_requirement("torch_geometric"), + vaticle_typedb_ml_requirement("torch_sparse"), + vaticle_typedb_ml_requirement("torch_scatter"), ], visibility=['//visibility:public'] ) checkstyle_test( name = "checkstyle", - include = glob(["*"]), + include = glob([ + "*", + "**/*" + ]), license_type = "apache-header", ) diff --git a/typedb_ml/pytorch_geometric/dataset/dataset.py b/typedb_ml/pytorch_geometric/dataset/dataset.py new file mode 100644 index 00000000..36eaeead --- /dev/null +++ b/typedb_ml/pytorch_geometric/dataset/dataset.py @@ -0,0 +1,85 @@ +# +# Copyright (C) 2022 Vaticle +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
class DataSet:
    """
    A DataSet to lazily load graphs based on queries from TypeDB and some arbitrary id.

    Each element is built by running the TypeQL queries produced by ``queries_for_id``
    in a read transaction, combining the answers into a NetworkX graph, optionally
    transforming it, and converting it to a PyTorch Geometric ``Data`` object.
    """

    def __init__(
            self,
            indices: Sequence,
            node_types,
            edge_type_triplets,
            queries_for_id: Callable,
            session: "Optional[TypeDBSession]" = None,
            infer: bool = True,
            transform: "Optional[Callable[[nx.Graph], nx.Graph]]" = None,
    ):
        """
        Args:
            indices: Identifiers, one per graph; each is passed to `queries_for_id`.
            node_types: Ordered node type labels; a node's feature index is its position here.
            edge_type_triplets: Ordered `(src_type, edge_type, dst_type)` triples for edges.
            queries_for_id: Callable mapping one identifier to the queries that build its graph.
            session: TypeDB session used to open read transactions.
            infer: Whether rule inference is enabled in the read transactions.
            transform: Optional transform applied to each NetworkX graph before conversion.
        """
        # NOTE: the non-stdlib annotations above are forward references (strings) so the
        # class can be defined without importing TypeDB/NetworkX at annotation-eval time.
        self._indices = indices
        self._node_types = node_types
        self._edge_type_triplets = edge_type_triplets
        self.queries_for_id = queries_for_id
        self._infer = infer
        self._transform = transform
        self.session = session

    def __len__(self):
        """Number of graphs in the dataset."""
        return len(self._indices)

    def __getitem__(self, idx):
        """
        Build the graph for ``self._indices[idx]``.

        Returns:
            Tuple of (PyG `Data` object, node type indices, edge type triplet indices)
        """
        graph_id = self._indices[idx]  # renamed from `id` to avoid shadowing the builtin
        print(f"Fetching graph for id: {graph_id}")
        queries = self.queries_for_id(graph_id)

        options = TypeDBOptions.core()
        options.infer = self._infer

        with self.session.transaction(TransactionType.READ, options=options) as tx:
            # Build a graph from the queries
            graph = build_graph_from_queries(queries, tx)
            graph.name = graph_id
            if self._transform:
                graph = self._transform(graph)
            data = from_networkx(graph)
            # Carry the TypeDB concepts along so predictions can be mapped back to them
            data.concepts_by_type = graph.concepts_by_type
            return data, self.node_type_indices(graph), self.edge_type_indices(graph)

    def node_type_indices(self, graph):
        """Map each node's `type` label to its index in `self._node_types`, in node order."""
        indices = []
        for _, data in graph.nodes(data=True):
            indices.append(self._node_types.index(data['type']))
        return indices

    def edge_type_indices(self, graph):
        """Map each edge's (src type, edge type, dst type) to its index in `self._edge_type_triplets`."""
        indices = []
        for src, dst, data in graph.edges(data=True):
            indices.append(self._edge_type_triplets.index((graph.nodes[src]["type"], data["type"], graph.nodes[dst]["type"])))
        return indices
class LinkPredictionLabeller:
    """
    Graph transform that adds a binary target `y_edge` to every edge:
    1 where the edge `type` equals `edge_type_to_predict`, else 0.
    """

    def __init__(self, edge_type_to_predict):
        self.edge_type_to_predict = edge_type_to_predict

    def __call__(self, graph):
        """Label the edges of `graph` in-place and return it."""
        self.label_edges(graph)
        return graph

    def label_edges(self, graph):
        """Write a 0/1 `y_edge` field onto each edge's data dict."""
        for data in multidigraph_edge_data_iterator(graph):
            data["y_edge"] = 1 if data["type"] == self.edge_type_to_predict else 0


def binary_relations_to_edges(graph: "nx.MultiDiGraph", binary_relation_type):
    """
    A function to convert a TypeDB relation (a hyperedge) to a directed binary edge.

    The replaced relations may not be:
    - a roleplayer in any other relations
    - own any attributes
    - have more than two roles
    and must:
    - have exactly two outgoing role edges (which can be of the same role)

    This is useful because edges are often stored as an adjacency matrix. In PyTorch Geometric, for example,
    this adjacency matrix expects binary edges. To be compatible we convert to binary edges so that when creating
    negative samples we can simply change the values of the adjacency matrix. In the case of a hyperedge we cannot
    simply do this to add negative edges.

    Args:
        graph: The graph to modify in-place
        binary_relation_type: A triple of the `(role1, relation, role2)` types to convert to a single edge labelled with `relation`

    Returns:
        The graph, modified in-place, with each matching relation node replaced by one binary edge
    """
    # Maps each newly-added edge to the relation node it replaced, so the nodes can
    # be removed after iteration over graph.nodes has finished.
    # NOTE(review): two relations over the same node pair would collide on this key — TODO confirm inputs exclude that.
    replacement_edges = {}
    for node, node_data in graph.nodes(data=True):
        if node_data["type"] == binary_relation_type[1]:
            if len(list(graph.in_edges(node))) > 0:
                raise ValueError(
                    "The given binary relation can't be transformed into an edge because it plays a role in another "
                    "relation."
                )
            role_edges = list(graph.edges(node, data=True))  # equivalent to out_edges()
            if len(role_edges) != 2:
                # Input validation: was an `assert`, which disappears under `python -O`
                raise ValueError(
                    f"Expected exactly 2 role edges for relation {binary_relation_type[1]}, got {len(role_edges)}."
                )
            new_edge_start = None
            new_edge_end = None
            for relation_node, roleplayer, role_data in role_edges:
                if role_data["type"] == binary_relation_type[0]:
                    if new_edge_start is None:
                        new_edge_start = roleplayer
                    else:
                        # In this case the given relation type is symmetric. Therefore direction is arbitrary (but
                        # expect to add a reverse edge elsewhere).
                        assert binary_relation_type[0] == binary_relation_type[2]
                        new_edge_end = roleplayer
                elif role_data["type"] == binary_relation_type[2]:
                    new_edge_end = roleplayer
                else:
                    raise ValueError(
                        f"Unexpected role in relation {binary_relation_type[1]}. Expected \""
                        f"{binary_relation_type[0]}\" or \"{binary_relation_type[2]}\" but got \"{role_data['type']}\"."
                    )
                graph.remove_edge(relation_node, roleplayer)
            graph.add_edge(new_edge_start, new_edge_end, type=binary_relation_type[1])
            replacement_edges[(new_edge_start, new_edge_end)] = node
    for node in replacement_edges.values():
        graph.remove_node(node)
    return graph


def binary_link_prediction_edge_triplets(session, relation_type_to_predict, types_to_ignore):
    """
    Build the forward and reversed edge type triplets for a link prediction task.

    Args:
        session: TypeDB session used to read the schema
        relation_type_to_predict: 5-tuple `(player1, role1, relation, role2, player2)` of the relation to predict
        types_to_ignore: Type labels to exclude from the triplets entirely

    Returns:
        Tuple of (edge type triplets, reversed edge type triplets)
    """
    edge_type_triplets = get_edge_type_triplets(session)
    edge_type_triplets = [e for e in edge_type_triplets if not (set(e).intersection(types_to_ignore))]
    replace_relation_with_binary_edge(edge_type_triplets, relation_type_to_predict)
    edge_type_triplets_reversed = reverse_edge_type_triplets(edge_type_triplets)
    return edge_type_triplets, edge_type_triplets_reversed


def replace_relation_with_binary_edge(edge_type_triplets, relation_type_to_predict):
    """
    Remove the two role triplets of the relation to predict and replace them, in-place, with a
    single binary edge triplet named after the relation.

    Args:
        edge_type_triplets: List of `(from_type, edge_type, to_type)` triplets; modified in-place
        relation_type_to_predict: 5-tuple `(player1, role1, relation, role2, player2)`
    """
    # Role triplets are stored as (relation, role, player), so the two triplets to drop are
    # (relation, role2, player2) and (relation, role1, player1).
    edge_type_triplets.remove(relation_type_to_predict[2:])
    if relation_type_to_predict[1] != relation_type_to_predict[3]:
        # Only remove the second role triplet if the relation is not symmetric
        edge_type_triplets.remove(tuple(reversed(relation_type_to_predict[:3])))
    edge_type_triplets.append((relation_type_to_predict[0], relation_type_to_predict[2], relation_type_to_predict[4]))


def clear_unneeded_fields(graph):
    """
    Strip node and edge data dicts down to the fields needed for conversion to
    PyTorch Geometric: `x`/`type` on nodes and `edge_attr`/`y_edge`/`type` on edges.

    Args:
        graph: NetworkX graph to modify in-place
    Returns:
        The same graph
    """
    for node_data in multidigraph_node_data_iterator(graph):
        retained = (node_data["x"], node_data["type"])
        node_data.clear()
        node_data["x"], node_data["type"] = retained

    for edge_data in multidigraph_edge_data_iterator(graph):
        retained = (edge_data["edge_attr"], edge_data["y_edge"], edge_data["type"])
        edge_data.clear()
        edge_data["edge_attr"], edge_data["y_edge"], edge_data["type"] = retained
    return graph


def store_concepts_by_type(graph):
    """
    Organises concepts by type the same way `data.to_heterogeneous()` organises nodes by type. This is necessary to
    be able to map back from a `HeteroData` object to the TypeDB concepts that the nodes refer to.
    Args:
        graph: NetworkX graph to operate on
    Returns:
        The same graph, with a field `concepts_by_type` holding concepts organised by type
    """
    concepts_by_type = {}
    for node_data in multidigraph_node_data_iterator(graph):
        concepts_by_type.setdefault(node_data['type'], []).append(node_data['concept'])
    graph.concepts_by_type = concepts_by_type
    return graph


class FeatureEncoder:
    """
    Feature encoder for NetworkX representations of TypeDB data. Type data is assumed always present for each node
    and edge, consistent with the TypeDB knowledge model. Therefore, this encoder provides a de facto method to
    embed that type information. Supply attribute-specific encoders to handle encoding values of attributes,
    since the meaning of these values is domain-dependent.
    """

    def __init__(self, node_types, edge_types, type_encoding_size, attribute_encoders, attribute_encoding_size):
        self.node_types = node_types
        self.edge_types = edge_types
        self.attribute_encoders = attribute_encoders
        self.attribute_encoding_size = attribute_encoding_size
        self.node_type_embedding = Embedding(len(self.node_types), type_encoding_size)
        self.edge_type_embedding = Embedding(len(self.edge_types), type_encoding_size)

    def __call__(self, graph):
        """Encode node and edge features in-place and return the graph."""
        self.encode_node_features(graph)
        self.encode_edge_features(graph)
        return graph

    def encode_node_features(self, graph):
        """Set `x` on each node: type embedding concatenated with the encoded attribute value (or zeros)."""
        for node_data in multidigraph_node_data_iterator(graph):
            typ = node_data['type']
            if typ in self.attribute_encoders.keys():
                # Add the integer value of the category for each categorical attribute instance
                encoded_value = self.attribute_encoders[typ](node_data['value'])
            else:
                encoded_value = [0] * self.attribute_encoding_size

            type_embedding = self.node_type_embedding(torch.as_tensor(self.node_types.index(node_data['type'])))
            node_data['x'] = torch.hstack([type_embedding, torch.as_tensor(encoded_value)])\
                .cpu().detach().numpy()  # Conversion to numpy array, otherwise the graph representation breaks

    def encode_edge_features(self, graph):
        """Set `edge_attr` on each edge: edge type embedding concatenated with zero attribute padding."""
        for edge_data in multidigraph_edge_data_iterator(graph):
            # BUG FIX: previously used self.node_type_embedding while indexing with edge types,
            # which is semantically wrong and fails when there are more edge types than node types
            type_embedding = self.edge_type_embedding(torch.as_tensor(self.edge_types.index(edge_data['type'])))
            edge_data['edge_attr'] = torch.hstack([type_embedding, torch.as_tensor([0] * self.attribute_encoding_size)])\
                .cpu().detach().numpy()  # Conversion to numpy array, otherwise the graph representation breaks


class CategoricalEncoder:
    """Encodes a categorical attribute value as a learned embedding of its category index."""

    def __init__(self, categories: List, attribute_encoding_size):
        self.categories = categories
        self.embedding = Embedding(len(self.categories), attribute_encoding_size)

    def __call__(self, value):
        return self.embedding(torch.as_tensor(self.categories.index(value)))


class ContinuousEncoder:
    """Min-max normalises a continuous value, broadcast to `attribute_encoding_size` elements."""

    def __init__(self, min_val, max_val, attribute_encoding_size):
        self.attribute_encoding_size = attribute_encoding_size
        self.max_val = max_val
        self.min_val = min_val

    def __call__(self, value):
        return [(value - self.min_val) / (self.max_val - self.min_val)] * self.attribute_encoding_size
class FileType(Enum):
    """The content of a TypeQL file, which determines the transaction type used to load it."""
    Schema = "schema"
    Data = "data"


def load_typeql_file(typedb_binary_location: str, database_name: str, typeql_file_path: str,
                     file_type: FileType):
    """
    Load a file into a TypeDB database

    Args:
        typedb_binary_location: the location of TypeDB
        database_name: The name of the TypeDB database to load into
        typeql_file_path: The path to the file to load
        file_type: The content of the file and therefore the transaction type to use, either schema or data
    Returns:
        None
    """
    # Drives the bundled TypeDB console non-interactively; arguments are passed as a
    # list (shell=False), so no shell quoting/injection concerns for the interpolated values.
    sp.check_call([
        './typedb',
        'console',
        f'--command=transaction {database_name} {file_type.value} write',
        f'--command=source {typeql_file_path}',
        '--command=commit',  # was an f-string with no placeholders
    ], cwd=typedb_binary_location)
b/typedb_ml/typedb/thing.py similarity index 100% rename from kglib/utils/typedb/object/thing.py rename to typedb_ml/typedb/thing.py diff --git a/typedb_ml/typedb/type.py b/typedb_ml/typedb/type.py new file mode 100644 index 00000000..22fc043b --- /dev/null +++ b/typedb_ml/typedb/type.py @@ -0,0 +1,89 @@ +# +# Copyright (C) 2022 Vaticle +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
def get_thing_types(session):
    """
    Get all schema types, excluding those for implicit attribute relations and base types
    Args:
        session: TypeDB session

    Returns:
        List of TypeDB type labels
    """
    with session.transaction(TransactionType.READ) as tx:
        schema_concepts = tx.query().match("match $x sub thing;")
        thing_types = [schema_concept.get('x').get_label().name() for schema_concept in schema_concepts]
        # Drop the root types; a plain loop replaces the original list comprehension
        # that was used purely for its side effect
        for root_type in ('thing', 'relation', 'entity', 'attribute'):
            thing_types.remove(root_type)
        return thing_types


def get_role_triplets(tx):
    """
    Get triplets of all schema roles and the relation and roleplayer they connect
    Args:
        tx: TypeDB transaction

    Returns:
        TypeDB (relation, role, player) triplets
    """
    role_triplets = []
    schema_concepts = tx.query().match("match $rel sub relation; $rel relates $r; $rp plays $r;")
    for answer in schema_concepts:
        relation = answer.get('rel').get_label().name()
        # Include inherited (super) roles, but never the root 'role' type
        roles = [r.get_label().name() for r in answer.get('r').as_remote(tx).get_supertypes()]
        roles.remove('role')
        player = answer.get("rp").get_label().name()
        for role in roles:
            role_triplets.append((relation, role, player))
    return role_triplets


def get_has_triplets(tx):
    """
    Get triplets of all ownerships: the owner type and owned attribute
    Args:
        tx: TypeDB transaction

    Returns:
        TypeDB (owner, "has", owned) triplets
    """
    has_triplets = []
    schema_concepts = tx.query().match("match $owner sub thing, owns $owned;")
    for answer in schema_concepts:
        owner = answer.get('owner').get_label().name()
        owned = answer.get('owned').get_label().name()
        has_triplets.append((owner, "has", owned))
    return has_triplets


def get_edge_type_triplets(session):
    """Get all edge type triplets in the schema: role edges plus ownership ("has") edges."""
    with session.transaction(TransactionType.READ) as tx:
        return get_role_triplets(tx) + get_has_triplets(tx)


def reverse_edge_type_triplets(edge_types):
    """
    Produce the reverse of each `(from, edge, to)` triplet, prefixing the edge label with `rev_`.
    Args:
        edge_types: Iterable of `(from_type, edge_type, to_type)` triplets

    Returns:
        List of reversed triplets
    """
    reversed_edge_triplets = []
    for edge_from, edge, edge_to in edge_types:
        reversed_edge_triplets.append((edge_to, f"rev_{edge}", edge_from))
    return reversed_edge_triplets