From ec242f424a8b89c14057360e100824da0b286224 Mon Sep 17 00:00:00 2001 From: Alkid Date: Tue, 5 Mar 2024 13:26:47 +0100 Subject: [PATCH 1/3] documentation workflow --- .github/workflows/sphinx.yml | 78 ++-- docs/_static/documentation_options_patch.js | 1 + docs/_static/js/theme.js | 243 +++++++++++ docs/_static/theme_tweak.css | 9 + docs/_templates/search.html | 2 + docs/conf.py | 40 +- docs/index.rst | 44 +- docs/usage/main.md | 399 ++++++++++++++++++ .../Training.md => usage/training.md} | 0 requirements.txt | 17 + 10 files changed, 761 insertions(+), 72 deletions(-) create mode 100644 docs/_static/documentation_options_patch.js create mode 100644 docs/_static/js/theme.js create mode 100644 docs/_static/theme_tweak.css create mode 100644 docs/_templates/search.html create mode 100644 docs/usage/main.md rename docs/{using_dice_embedding_framework/Training.md => usage/training.md} (100%) create mode 100644 requirements.txt diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml index 70d087b3..94b03356 100644 --- a/.github/workflows/sphinx.yml +++ b/.github/workflows/sphinx.yml @@ -1,33 +1,55 @@ -# This is a basic workflow to help you get started with Actions +name: Build docs -name: Build-sphinx-docs +on: + push: + branches: + - main + - develop + - documentation # just for testing + pull_request: -on: [push,pull_request] - -# A workflow run is made up of one or more jobs that can run sequentially or in parallel -jobs: - # This workflow contains a single job called "build" - build: - # The type of runner that the job will run on +jobs: + docs: runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ "3.10.11" ] + max-parallel: 5 - # Steps represent a sequence of tasks that will be executed as part of the job steps: - # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v2 - - - name: Set up Python 3.9.18 - uses: actions/setup-python@v2 - with: - python-version: "3.9.18" - - - name: 
Install dependencies - run: | - python -m pip install --upgrade pip - # # pip install -r requirements.txt - - - name: Build HTML and import - run: | - # sphinx-apidoc -o docs dicee/ && make -C docs/ html && ghp-import -n -p -f docs/_build/html - - + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Prepare required software + run: | + # epstopdf & dot & noto-fonts + sudo apt update && sudo apt install texlive-font-utils graphviz fonts-noto\ + + - name: Build docs + run: | + sphinx-build -M html docs/ docs/_build/ + + - name: Build LaTeX docs + run: | + sphinx-build -M latex docs/ docs/_build/ + + - name: Compile LaTeX document + uses: docker://texlive/texlive:latest + with: + args: make -C docs/_build/latex + - run: | + cp docs/_build/latex/dicee.pdf docs/_build/html/ + + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: 'docs/_build/html' \ No newline at end of file diff --git a/docs/_static/documentation_options_patch.js b/docs/_static/documentation_options_patch.js new file mode 100644 index 00000000..c43a72bb --- /dev/null +++ b/docs/_static/documentation_options_patch.js @@ -0,0 +1 @@ +DOCUMENTATION_OPTIONS.LINK_SUFFIX = DOCUMENTATION_OPTIONS.FILE_SUFFIX diff --git a/docs/_static/js/theme.js b/docs/_static/js/theme.js new file mode 100644 index 00000000..2f013342 --- /dev/null +++ b/docs/_static/js/theme.js @@ -0,0 +1,243 @@ +var jQuery = (typeof(window) != 'undefined') ? 
window.jQuery : require('jquery'); + +// Sphinx theme nav state +function ThemeNav () { + + var nav = { + navBar: null, + win: null, + winScroll: false, + winResize: false, + linkScroll: false, + winPosition: 0, + winHeight: null, + docHeight: null, + isRunning: false + }; + + nav.enable = function (withStickyNav) { + var self = this; + + // TODO this can likely be removed once the theme javascript is broken + // out from the RTD assets. This just ensures old projects that are + // calling `enable()` get the sticky menu on by default. All other cals + // to `enable` should include an argument for enabling the sticky menu. + if (typeof(withStickyNav) == 'undefined') { + withStickyNav = true; + } + + if (self.isRunning) { + // Only allow enabling nav logic once + return; + } + + self.isRunning = true; + jQuery(function ($) { + self.init($); + + self.reset(); + self.win.on('hashchange', self.reset); + + if (withStickyNav) { + // Set scroll monitor + self.win.on('scroll', function () { + if (!self.linkScroll) { + if (!self.winScroll) { + self.winScroll = true; + requestAnimationFrame(function() { self.onScroll(); }); + } + } + }); + } + + // Set resize monitor + self.win.on('resize', function () { + if (!self.winResize) { + self.winResize = true; + requestAnimationFrame(function() { self.onResize(); }); + } + }); + + self.onResize(); + }); + + }; + + // TODO remove this with a split in theme and Read the Docs JS logic as + // well, it's only here to support 0.3.0 installs of our theme. + nav.enableSticky = function() { + this.enable(true); + }; + + nav.init = function ($) { + var doc = $(document), + self = this; + + this.navBar = $('div.wy-side-scroll:first'); + this.win = $(window); + + // Set up javascript UX bits + $(document) + // Shift nav in mobile when clicking the menu. 
+ .on('click', "[data-toggle='wy-nav-top']", function() { + $("[data-toggle='wy-nav-shift']").toggleClass("shift"); + $("[data-toggle='rst-versions']").toggleClass("shift"); + }) + + // Nav menu link click operations + .on('click', ".wy-menu-vertical .current ul li a", function() { + var target = $(this); + // Close menu when you click a link. + $("[data-toggle='wy-nav-shift']").removeClass("shift"); + $("[data-toggle='rst-versions']").toggleClass("shift"); + // Handle dynamic display of l3 and l4 nav lists + self.toggleCurrent(target); + self.hashChange(); + }) + .on('click', "[data-toggle='rst-current-version']", function() { + $("[data-toggle='rst-versions']").toggleClass("shift-up"); + }) + + // Make tables responsive + $("table.docutils:not(.field-list,.footnote,.citation)") + .wrap("
"); + + // Add extra class to responsive tables that contain + // footnotes or citations so that we can target them for styling + $("table.docutils.footnote") + .wrap("
"); + $("table.docutils.citation") + .wrap("
"); + + // Add expand links to all parents of nested ul + $('.wy-menu-vertical ul').not('.simple').siblings('a').each(function () { + var link = $(this); + expand = $(''); + expand.on('click', function (ev) { + self.toggleCurrent(link); + ev.stopPropagation(); + return false; + }); + link.prepend(expand); + }); + }; + + nav.reset = function () { + // Get anchor from URL and open up nested nav + var anchor = encodeURI(window.location.hash) || '#'; + + try { + var vmenu = $('.wy-menu-vertical'); + var link = vmenu.find('[href="' + anchor + '"]'); + if (link.length === 0) { + // this link was not found in the sidebar. + // Find associated id element, then its closest section + // in the document and try with that one. + var id_elt = $('.document [id="' + anchor.substring(1) + '"]'); + var closest_section = id_elt.closest('div.section'); + link = vmenu.find('[href="#' + closest_section.attr("id") + '"]'); + if (link.length === 0) { + // still not found in the sidebar. fall back to main section + link = vmenu.find('[href="#"]'); + } + } + // If we found a matching link then reset current and re-apply + // otherwise retain the existing match + if (link.length > 0) { + $('.wy-menu-vertical .current').removeClass('current'); + link.addClass('current'); + link.closest('li.toctree-l1').parent().addClass('current'); + for (let i = 1; i <= 10; i++) { + link.closest('li.toctree-l' + i).addClass('current'); + } + link[0].scrollIntoView(); + } + } + catch (err) { + console.log("Error expanding nav for anchor", err); + } + + }; + + nav.onScroll = function () { + this.winScroll = false; + var newWinPosition = this.win.scrollTop(), + winBottom = newWinPosition + this.winHeight, + navPosition = this.navBar.scrollTop(), + newNavPosition = navPosition + (newWinPosition - this.winPosition); + if (newWinPosition < 0 || winBottom > this.docHeight) { + return; + } + this.navBar.scrollTop(newNavPosition); + this.winPosition = newWinPosition; + }; + + nav.onResize = function () { + 
this.winResize = false; + this.winHeight = this.win.height(); + this.docHeight = $(document).height(); + }; + + nav.hashChange = function () { + this.linkScroll = true; + this.win.one('hashchange', function () { + this.linkScroll = false; + }); + }; + + nav.toggleCurrent = function (elem) { + var parent_li = elem.closest('li'); + parent_li.siblings('li.current').removeClass('current'); + parent_li.siblings().find('li.current').removeClass('current'); + var children = parent_li.find('> ul li'); + // Don't toggle terminal elements. + if (children.length) { + children.removeClass('current'); + parent_li.toggleClass('current'); + } + } + + return nav; +}; + +module.exports.ThemeNav = ThemeNav(); + +if (typeof(window) != 'undefined') { + window.SphinxRtdTheme = { + Navigation: module.exports.ThemeNav, + // TODO remove this once static assets are split up between the theme + // and Read the Docs. For now, this patches 0.3.0 to be backwards + // compatible with a pre-0.3.0 layout.html + StickyNav: module.exports.ThemeNav, + }; +} + + +// requestAnimationFrame polyfill by Erik Möller. 
fixes from Paul Irish and Tino Zijdel +// https://gist.github.com/paulirish/1579671 +// MIT license + +(function() { + var lastTime = 0; + var vendors = ['ms', 'moz', 'webkit', 'o']; + for(var x = 0; x < vendors.length && !window.requestAnimationFrame; ++x) { + window.requestAnimationFrame = window[vendors[x]+'RequestAnimationFrame']; + window.cancelAnimationFrame = window[vendors[x]+'CancelAnimationFrame'] + || window[vendors[x]+'CancelRequestAnimationFrame']; + } + + if (!window.requestAnimationFrame) + window.requestAnimationFrame = function(callback, element) { + var currTime = new Date().getTime(); + var timeToCall = Math.max(0, 16 - (currTime - lastTime)); + var id = window.setTimeout(function() { callback(currTime + timeToCall); }, + timeToCall); + lastTime = currTime + timeToCall; + return id; + }; + + if (!window.cancelAnimationFrame) + window.cancelAnimationFrame = function(id) { + clearTimeout(id); + }; +}()); diff --git a/docs/_static/theme_tweak.css b/docs/_static/theme_tweak.css new file mode 100644 index 00000000..aeedc7fa --- /dev/null +++ b/docs/_static/theme_tweak.css @@ -0,0 +1,9 @@ +.rst-content dl:not(.docutils) dl dl dt { + width: auto; +} +.rst-content dl:not(.docutils) dl dt { + width: 96%; +} +.rst-content dl:not(.docutils) dt { + width: 100%; +} diff --git a/docs/_templates/search.html b/docs/_templates/search.html new file mode 100644 index 00000000..07a4b787 --- /dev/null +++ b/docs/_templates/search.html @@ -0,0 +1,2 @@ +{% extends "!search.html" %} +{% set script_files = [ '_static/documentation_options_patch.js', '_static/language_data.js' ] + script_files %} diff --git a/docs/conf.py b/docs/conf.py index 4201590b..1849de94 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,12 +14,26 @@ project = 'DICE Embeddings' copyright = '2023, Caglar Demir' author = 'Caglar Demir' -release = '0.1.2' +release = '0.1.3.2' # -- General configuration --------------------------------------------------- # 
https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = ["sphinx.ext.todo", "sphinx.ext.viewcode", "sphinx.ext.autodoc"] +extensions = ["autoapi.extension", + "sphinx.ext.githubpages", + "sphinx.ext.todo", + "sphinx.ext.viewcode", + "sphinx.ext.autodoc"] + +# autoapi for dicee. +autoapi_dirs = ['../dicee'] + +# by default all are included but had to reinitialize this to remove private members from showing +autoapi_options = ['members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'special-members', + 'imported-members'] + +# this is set to false, so we can add it manually in index.rst together with the other .md files of the documentation. +autoapi_add_toctree_entry = False templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] @@ -27,5 +41,25 @@ # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'furo' html_static_path = ['_static'] + +# -- Options for LaTeX output ------------------------------------------------ + +latex_engine = 'xelatex' +latex_show_urls = 'footnote' +latex_theme = 'howto' + +latex_elements = { + 'preamble': r''' +\renewcommand{\pysiglinewithargsret}[3]{% + \item[{% + \parbox[t]{\linewidth}{\setlength{\hangindent}{12ex}% + \raggedright#1\sphinxcode{(}\linebreak[0]{\renewcommand{\emph}[1]{\mbox{\textit{##1}}}#2}\sphinxcode{)}\linebreak[0]\mbox{#3}}}]} +''', + 'printindex': '\\def\\twocolumn[#1]{#1}\\footnotesize\\raggedright\\printindex', +} + + +def setup(app): + # -- Options for HTML output --------------------------------------------- + app.add_css_file('theme_tweak.css') diff --git a/docs/index.rst b/docs/index.rst index 991fc294..09b9bec0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,56 +3,18 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. 
-Welcome to DICE Embeddings! +Welcome to DICE Embeddings documentation! =========================================== `DICE Embeddings `_: Hardware-agnostic Framework for Large-scale Knowledge Graph Embeddings: ======= - - -.. warning:: - - Train embedding models in multi-node, multi-GPUs, distributed data parallel or model parallel without expert knowledge! - - - .. code-block:: bash - - // 1 CPU - (dicee) $ dicee --dataset_dir KGs/UMLS - // 10 CPU - (dicee) $ dicee --dataset_dir KGs/UMLS --num_core 10 - // Distributed Data Parallel (DDP) with all GPUs - (dicee) $ dicee --trainer PL --accelerator gpu --strategy ddp --dataset_dir KGs/UMLS - // Model Parallel with all GPUs and low precision - (dicee) $ dicee --trainer PL --accelerator gpu --strategy deepspeed_stage_3 --dataset_dir KGs/UMLS --precision 16 - // DDP with all GPUs on two nodes (felis and nebula): - (dicee) cdemir@felis $ torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 0 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS - (dicee) cdemir@nebula $ torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 1 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula -m dicee.run --trainer torchDDP --dataset_dir KGs/UMLS - .. toctree:: :maxdepth: 2 :caption: Contents: -Usage -------- - -.. code-block:: console - - $ git clone https://github.com/dice-group/dice-embeddings.git - - $ conda create -n dice python=3.9.18 --no-default-packages && conda activate dice - - (dice) $ pip3 install -r requirements.txt - -or - -.. code-block:: console - - (dice) $ pip install dicee - - - + usage/main + autoapi/dicee/index Indices and tables ------- diff --git a/docs/usage/main.md b/docs/usage/main.md new file mode 100644 index 00000000..de2fbefc --- /dev/null +++ b/docs/usage/main.md @@ -0,0 +1,399 @@ +## Dice Embeddings! 
+ +**Version:** dicee 0.1.3.2 + +**GitHub repository:** [https://github.com/dice-group/dice-embeddings](https://github.com/dice-group/dice-embeddings) + +**Publisher and maintainer:** [Caglar Demir](https://github.com/Demirrr) + +**Contact**: [caglar.demir@upb.de](mailto:caglar.demir@upb.de) + +**License:** OSI Approved :: MIT License + +-------------------------------------------- + +Dicee is a hardware-agnostic framework for large-scale knowledge graph embeddings. + + +Knowledge graph embedding research has mainly focused on learning continuous +representations of knowledge graphs towards the link prediction problem. +Recently developed frameworks can be effectively applied in a wide range +of research-related applications. Yet, using these frameworks in real-world +applications becomes more challenging as the size of the knowledge graph +grows. + +We developed the DICE Embeddings framework (dicee) to compute embeddings for large-scale knowledge graphs in a hardware-agnostic manner. +To achieve this goal, we rely on +1. **[Pandas](https://pandas.pydata.org/) & Co.** to use parallelism at preprocessing a large knowledge graph, +2. **[PyTorch](https://pytorch.org/) & Co.** to learn knowledge graph embeddings via multi-CPUs, GPUs, TPUs or computing cluster, and +3. **[Huggingface](https://huggingface.co/)** to ease the deployment of pre-trained models. + +**Why [Pandas](https://pandas.pydata.org/) & Co. ?** +A large knowledge graph can be read and preprocessed (e.g. removing literals) by pandas, modin, or polars in parallel. +Through polars, a knowledge graph having more than 1 billion triples can be read in a parallel fashion. +Importantly, using these frameworks allows us to perform all necessary computations on a single CPU as well as a cluster of computers. + +**Why [PyTorch](https://pytorch.org/) & Co. ?** +PyTorch is one of the most popular machine learning frameworks available at the time of writing. 
+PytorchLightning facilitates scaling the training procedure of PyTorch without boilerplate. +In our framework, we combine [PyTorch](https://pytorch.org/) & [PytorchLightning](https://www.pytorchlightning.ai/). +Users can choose the trainer class (e.g., DDP by Pytorch) to train large knowledge graph embedding models with billions of parameters. +PytorchLightning allows us to use state-of-the-art model parallelism techniques (e.g. Fully Sharded Training, FairScale, or DeepSpeed) +without extra effort. +With our framework, practitioners can directly use PytorchLightning for model parallelism to train gigantic embedding models. + +**Why [Hugging-face Gradio](https://huggingface.co/gradio)?** +Deploy a pre-trained embedding model without writing a single line of code. + + +## Installation + +### Installation from Source +``` bash +git clone https://github.com/dice-group/dice-embeddings.git +conda create -n dice python=3.10.13 --no-default-packages && conda activate dice && cd dice-embeddings && +pip3 install -e . +``` +or +```bash +pip install dicee +``` +## Download Knowledge Graphs +```bash +wget https://files.dice-research.org/datasets/dice-embeddings/KGs.zip --no-check-certificate && unzip KGs.zip +``` +To test the Installation +```bash +python -m pytest -p no:warnings -x # Runs >114 tests leading to > 15 mins +python -m pytest -p no:warnings --lf # run only the last failed test +python -m pytest -p no:warnings --ff # to run the failures first and then the rest of the tests. +``` + + + +## Knowledge Graph Embedding Models + +1. TransE, DistMult, ComplEx, ConEx, QMult, OMult, ConvO, ConvQ, Keci +2. All 44 models available in https://github.com/pykeen/pykeen#models + +> For more, please refer to `examples`. + + +## How to Train + +To Train a KGE model (KECI) and evaluate it on the train, validation, and test sets of the UMLS benchmark dataset. 
+```python +from dicee.executer import Execute +from dicee.config import Namespace +args = Namespace() +args.model = 'Keci' +args.scoring_technique = "KvsAll" # 1vsAll, or AllvsAll, or NegSample +args.dataset_dir = "KGs/UMLS" +args.path_to_store_single_run = "Keci_UMLS" +args.num_epochs = 100 +args.embedding_dim = 32 +args.batch_size = 1024 +reports = Execute(args).start() +print(reports["Train"]["MRR"]) # => 0.9912 +print(reports["Test"]["MRR"]) # => 0.8155 +# See the Keci_UMLS folder embeddings and all other files +``` +where the data is in the following form +```bash +$ head -3 KGs/UMLS/train.txt +acquired_abnormality location_of experimental_model_of_disease +anatomical_abnormality manifestation_of physiologic_function +alga isa entity +``` +A KGE model can also be trained from the command line +```bash +dicee --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test" +``` +dicee automatically detects available GPUs and trains a model with the distributed data parallel technique. Under the hood, dicee uses lightning as a default trainer. 
+```bash +# Train a model by only using the GPU-0 +CUDA_VISIBLE_DEVICES=0 dicee --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test" +# Train a model by only using GPU-1 +CUDA_VISIBLE_DEVICES=1 dicee --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test" +NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 python dicee/scripts/run.py --trainer PL --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test" +``` +Under the hood, dicee executes the run.py script and uses lightning as a default trainer. +```bash +# Two equivalent executions +# (1) +dicee --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test" +# Evaluate Keci on Train set: Evaluate Keci on Train set +# {'H@1': 0.9518788343558282, 'H@3': 0.9988496932515337, 'H@10': 1.0, 'MRR': 0.9753123402351737} +# Evaluate Keci on Validation set: Evaluate Keci on Validation set +# {'H@1': 0.6932515337423313, 'H@3': 0.9041411042944786, 'H@10': 0.9754601226993865, 'MRR': 0.8072362996241839} +# Evaluate Keci on Test set: Evaluate Keci on Test set +# {'H@1': 0.6951588502269289, 'H@3': 0.9039334341906202, 'H@10': 0.9750378214826021, 'MRR': 0.8064032293278861} + +# (2) +CUDA_VISIBLE_DEVICES=0,1 python dicee/scripts/run.py --trainer PL --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test" +# Evaluate Keci on Train set: Evaluate Keci on Train set +# {'H@1': 0.9518788343558282, 'H@3': 0.9988496932515337, 'H@10': 1.0, 'MRR': 0.9753123402351737} +# Evaluate Keci on Train set: Evaluate Keci on Train set +# Evaluate Keci on Validation set: Evaluate Keci on Validation set +# {'H@1': 0.6932515337423313, 'H@3': 0.9041411042944786, 'H@10': 0.9754601226993865, 'MRR': 0.8072362996241839} +# Evaluate Keci on Test set: Evaluate Keci on Test set +# {'H@1': 0.6951588502269289, 'H@3': 0.9039334341906202, 'H@10': 0.9750378214826021, 'MRR': 0.8064032293278861} +``` +Similarly, models can be easily trained with torchrun +```bash +torchrun --standalone --nnodes=1 --nproc_per_node=gpu 
dicee/scripts/run.py --trainer torchDDP --dataset_dir "KGs/UMLS" --model Keci --eval_model "train_val_test" +# Evaluate Keci on Train set: Evaluate Keci on Train set: Evaluate Keci on Train set +# {'H@1': 0.9518788343558282, 'H@3': 0.9988496932515337, 'H@10': 1.0, 'MRR': 0.9753123402351737} +# Evaluate Keci on Validation set: Evaluate Keci on Validation set +# {'H@1': 0.6932515337423313, 'H@3': 0.9041411042944786, 'H@10': 0.9754601226993865, 'MRR': 0.8072499937521418} +# Evaluate Keci on Test set: Evaluate Keci on Test set +{'H@1': 0.6951588502269289, 'H@3': 0.9039334341906202, 'H@10': 0.9750378214826021, 'MRR': 0.8064032293278861} +``` +You can also train a model in a multi-node, multi-GPU setting. +```bash +torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 0 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula dicee/scripts/run.py --trainer torchDDP --dataset_dir KGs/UMLS +torchrun --nnodes 2 --nproc_per_node=gpu --node_rank 1 --rdzv_id 455 --rdzv_backend c10d --rdzv_endpoint=nebula dicee/scripts/run.py --trainer torchDDP --dataset_dir KGs/UMLS +``` +Train a KGE model by providing the path of a single file and store all parameters under a newly created directory +called `KeciFamilyRun`. +```bash +dicee --path_single_kg "KGs/Family/family-benchmark_rich_background.owl" --model Keci --path_to_store_single_run KeciFamilyRun --backend rdflib +``` +where the data is in the following form +```bash +$ head -3 KGs/Family/train.txt +_:1 . + . + . +``` +**Apart from n-triples or standard link prediction dataset formats, we support ["owl", "nt", "turtle", "rdf/xml", "n3"]**. +Moreover, a KGE model can also be trained by providing **an endpoint of a triple store**. +```bash +dicee --sparql_endpoint "http://localhost:3030/mutagenesis/" --model Keci +``` +For more, please refer to `examples`. 
+ + +## Creating an Embedding Vector Database + +##### Learning Embeddings +```bash +# Train an embedding model +dicee --dataset_dir KGs/Countries-S1 --path_to_store_single_run CountryEmbeddings --model Keci --p 0 --q 1 --embedding_dim 32 --adaptive_swa +``` +#### Loading Embeddings into Qdrant Vector Database +```bash +# Ensure that Qdrant available +# docker pull qdrant/qdrant && docker run -p 6333:6333 -p 6334:6334 -v $(pwd)/qdrant_storage:/qdrant/storage:z qdrant/qdrant +diceeindex --path_model "CountryEmbeddings" --collection_name "dummy" --location "localhost" +``` +#### Launching Webservice +```bash +diceeserve --path_model "CountryEmbeddings" --collection_name "dummy" --collection_location "localhost" +``` +##### Retrieve and Search + +Get embedding of germany +```bash +curl -X 'GET' 'http://0.0.0.0:8000/api/get?q=germany' -H 'accept: application/json' +``` + +Get most similar things to europe +```bash +curl -X 'GET' 'http://0.0.0.0:8000/api/search?q=europe' -H 'accept: application/json' +{"result":[{"hit":"europe","score":1.0}, +{"hit":"northern_europe","score":0.67126536}, +{"hit":"western_europe","score":0.6010134}, +{"hit":"puerto_rico","score":0.5051694}, +{"hit":"southern_europe","score":0.4829831}]} +``` + + + + +## Answering Complex Queries + +```python +# pip install dicee +# wget https://files.dice-research.org/datasets/dice-embeddings/KGs.zip --no-check-certificate & unzip KGs.zip +from dicee.executer import Execute +from dicee.config import Namespace +from dicee.knowledge_graph_embeddings import KGE +# (1) Train a KGE model +args = Namespace() +args.model = 'Keci' +args.p=0 +args.q=1 +args.optim = 'Adam' +args.scoring_technique = "AllvsAll" +args.path_single_kg = "KGs/Family/family-benchmark_rich_background.owl" +args.backend = "rdflib" +args.num_epochs = 200 +args.batch_size = 1024 +args.lr = 0.1 +args.embedding_dim = 512 +result = Execute(args).start() +# (2) Load the pre-trained model +pre_trained_kge = 
KGE(path=result['path_experiment_folder']) +# (3) Single-hop query answering +# Query: ?E : \exist E.hasSibling(E, F9M167) +# Question: Who are the siblings of F9M167? +# Answer: [F9M157, F9F141], as (F9M167, hasSibling, F9M157) and (F9M167, hasSibling, F9F141) +predictions = pre_trained_kge.answer_multi_hop_query(query_type="1p", + query=('http://www.benchmark.org/family#F9M167', + ('http://www.benchmark.org/family#hasSibling',)), + tnorm="min", k=3) +top_entities = [topk_entity for topk_entity, query_score in predictions] +assert "http://www.benchmark.org/family#F9F141" in top_entities +assert "http://www.benchmark.org/family#F9M157" in top_entities +# (2) Two-hop query answering +# Query: ?D : \exist E.Married(D, E) \land hasSibling(E, F9M167) +# Question: To whom a sibling of F9M167 is married to? +# Answer: [F9F158, F9M142] as (F9M157 #married F9F158) and (F9F141 #married F9M142) +predictions = pre_trained_kge.answer_multi_hop_query(query_type="2p", + query=("http://www.benchmark.org/family#F9M167", + ("http://www.benchmark.org/family#hasSibling", + "http://www.benchmark.org/family#married")), + tnorm="min", k=3) +top_entities = [topk_entity for topk_entity, query_score in predictions] +assert "http://www.benchmark.org/family#F9M142" in top_entities +assert "http://www.benchmark.org/family#F9F158" in top_entities +# (3) Three-hop query answering +# Query: ?T : \exist D.type(D,T) \land Married(D,E) \land hasSibling(E, F9M167) +# Question: What are the type of people who are married to a sibling of F9M167? 
+# (3) Answer: [Person, Male, Father] since F9M157 is [Brother Father Grandfather Male] and F9M142 is [Male Grandfather Father] + +predictions = pre_trained_kge.answer_multi_hop_query(query_type="3p", query=("http://www.benchmark.org/family#F9M167", + ("http://www.benchmark.org/family#hasSibling", + "http://www.benchmark.org/family#married", + "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")), + tnorm="min", k=5) +top_entities = [topk_entity for topk_entity, query_score in predictions] +print(top_entities) +assert "http://www.benchmark.org/family#Person" in top_entities +assert "http://www.benchmark.org/family#Father" in top_entities +assert "http://www.benchmark.org/family#Male" in top_entities +``` +For more, please refer to `examples/multi_hop_query_answering`. + + +## Predicting Missing Links + +```python +from dicee import KGE +# (1) Train a knowledge graph embedding model.. +# (2) Load a pretrained model +pre_trained_kge = KGE(path='..') +# (3) Predict missing links through head entity rankings +pre_trained_kge.predict_topk(h=[".."],r=[".."],topk=10) +# (4) Predict missing links through relation rankings +pre_trained_kge.predict_topk(h=[".."],t=[".."],topk=10) +# (5) Predict missing links through tail entity rankings +pre_trained_kge.predict_topk(r=[".."],t=[".."],topk=10) +``` + + + +## Downloading Pretrained Models + +```python +from dicee import KGE +# (1) Load a pretrained ConEx on DBpedia +model = KGE(url="https://files.dice-research.org/projects/DiceEmbeddings/KINSHIP-Keci-dim128-epoch256-KvsAll") +``` + +- For more please look at [dice-research.org/projects/DiceEmbeddings/](https://files.dice-research.org/projects/DiceEmbeddings/) + + + +## How to Deploy + +```python +from dicee import KGE +KGE(path='...').deploy(share=True,top_k=10) +``` + +Italian Trulli + +## Docker +To build the Docker image: +``` +docker build -t dice-embeddings . 
+``` + +To test the Docker image: +``` +docker run --rm -v ~/.local/share/dicee/KGs:/dicee/KGs dice-embeddings ./main.py --model AConEx --embedding_dim 16 +``` + + +## How to cite +Currently, we are working on our manuscript describing our framework. +If you really like our work and want to cite it now, feel free to choose one :) +``` +# Keci +@inproceedings{demir2023clifford, + title={Clifford Embeddings--A Generalized Approach for Embedding in Normed Algebras}, + author={Demir, Caglar and Ngonga Ngomo, Axel-Cyrille}, + booktitle={Joint European Conference on Machine Learning and Knowledge Discovery in Databases}, + pages={567--582}, + year={2023}, + organization={Springer} +} +# LitCQD +@inproceedings{demir2023litcqd, + title={LitCQD: Multi-Hop Reasoning in Incomplete Knowledge Graphs with Numeric Literals}, + author={Demir, Caglar and Wiebesiek, Michel and Lu, Renzhong and Ngonga Ngomo, Axel-Cyrille and Heindorf, Stefan}, + booktitle={Joint European Conference on Machine Learning and Knowledge Discovery in Databases}, + pages={617--633}, + year={2023}, + organization={Springer} +} +# DICE Embedding Framework +@article{demir2022hardware, + title={Hardware-agnostic computation for large-scale knowledge graph embeddings}, + author={Demir, Caglar and Ngomo, Axel-Cyrille Ngonga}, + journal={Software Impacts}, + year={2022}, + publisher={Elsevier} +} +# KronE +@inproceedings{demir2022kronecker, + title={Kronecker decomposition for knowledge graph embeddings}, + author={Demir, Caglar and Lienen, Julian and Ngonga Ngomo, Axel-Cyrille}, + booktitle={Proceedings of the 33rd ACM Conference on Hypertext and Social Media}, + pages={1--10}, + year={2022} +} +# QMult, OMult, ConvQ, ConvO +@InProceedings{pmlr-v157-demir21a, + title = {Convolutional Hypercomplex Embeddings for Link Prediction}, + author = {Demir, Caglar and Moussallem, Diego and Heindorf, Stefan and Ngonga Ngomo, Axel-Cyrille}, + booktitle = {Proceedings of The 13th Asian Conference on Machine Learning}, + pages 
= {656--671}, + year = {2021}, + editor = {Balasubramanian, Vineeth N. and Tsang, Ivor}, + volume = {157}, + series = {Proceedings of Machine Learning Research}, + month = {17--19 Nov}, + publisher = {PMLR}, + pdf = {https://proceedings.mlr.press/v157/demir21a/demir21a.pdf}, + url = {https://proceedings.mlr.press/v157/demir21a.html}, +} +# ConEx +@inproceedings{demir2021convolutional, +title={Convolutional Complex Knowledge Graph Embeddings}, +author={Caglar Demir and Axel-Cyrille Ngonga Ngomo}, +booktitle={Eighteenth Extended Semantic Web Conference - Research Track}, +year={2021}, +url={https://openreview.net/forum?id=6T45-4TFqaX}} +# Shallom +@inproceedings{demir2021shallow, + title={A shallow neural model for relation prediction}, + author={Demir, Caglar and Moussallem, Diego and Ngomo, Axel-Cyrille Ngonga}, + booktitle={2021 IEEE 15th International Conference on Semantic Computing (ICSC)}, + pages={179--182}, + year={2021}, + organization={IEEE} +``` +For any questions or wishes, please contact: ```caglar.demir@upb.de``` diff --git a/docs/using_dice_embedding_framework/Training.md b/docs/usage/training.md similarity index 100% rename from docs/using_dice_embedding_framework/Training.md rename to docs/usage/training.md diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..0ca198b2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,17 @@ +torch>=2.2.0 +lightning>=2.1.3 +pandas>=2.1.0 +polars>=0.16.14 +scikit-learn>=1.2.2 +pyarrow>=11.0.0 +pykeen>=1.10.2 +zstandard>=0.21.0 +pytest>=7.2.2 +psutil>=5.9.4 +ruff>=0.0.284 +gradio>=3.23.0 +rdflib>=7.0.0 +tiktoken>=0.5.1 +matplotlib>=3.8.2 +sphinx-autoapi>=3.0.0 +sphinx>=7.2.6 \ No newline at end of file From 4b4df01b6ac8c7919139f3f5943217b62aae3de9 Mon Sep 17 00:00:00 2001 From: Alkid Date: Tue, 5 Mar 2024 13:41:35 +0100 Subject: [PATCH 2/3] matched latex naming --- .github/workflows/sphinx.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml index 94b03356..25b8fb62 100644 --- a/.github/workflows/sphinx.yml +++ b/.github/workflows/sphinx.yml @@ -45,8 +45,9 @@ jobs: uses: docker://texlive/texlive:latest with: args: make -C docs/_build/latex - - run: | - cp docs/_build/latex/dicee.pdf docs/_build/html/ + - name: Copy Latex pdf to ./html + run: | + cp docs/_build/latex/diceembeddings.pdf docs/_build/html/ - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 From 62d0036df66c3b12c148482c318e566a44ef4bc8 Mon Sep 17 00:00:00 2001 From: Alkid Date: Tue, 5 Mar 2024 15:41:20 +0100 Subject: [PATCH 3/3] added sphinx extensions --- docs/conf.py | 52 +++++++++++++++++++++++++++++++++++++++++++++- docs/index.rst | 10 ++------- docs/usage/main.md | 4 +--- requirements.txt | 7 ++++++- 4 files changed, 60 insertions(+), 13 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 1849de94..86bd1d69 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,8 +22,13 @@ extensions = ["autoapi.extension", "sphinx.ext.githubpages", "sphinx.ext.todo", + "sphinx.ext.napoleon", "sphinx.ext.viewcode", - "sphinx.ext.autodoc"] + "sphinx.ext.autodoc", + "sphinxcontrib.plantuml", + "myst_parser", + "sphinx_rtd_theme", + ] # autoapi for dicee. autoapi_dirs = ['../dicee'] @@ -35,14 +40,56 @@ # this is set to false, so we can add it manually in index.rst together with the other .md files of the documentation. 
autoapi_add_toctree_entry = False +inheritance_graph_attrs = dict(rankdir="TB") + +myst_enable_extensions = [ + 'colon_fence', + 'deflist', +] + +myst_heading_anchors = 3 + templates_path = ['_templates'] exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output +pygments_style = 'rainbow_dash' + +plantuml_output_format = 'svg_img' +plantuml_latex_output_format = 'pdf' + +stanford_theme_mod = True +html_theme_options = { + 'navigation_depth': 6, +} + html_static_path = ['_static'] +if stanford_theme_mod: + html_theme = 'sphinx_rtd_theme' + + def _import_theme(): + import os + import shutil + import sphinx_theme + html_theme = 'stanford_theme' + for _type in ['fonts']: + shutil.copytree( + os.path.join(sphinx_theme.get_html_theme_path(html_theme), + html_theme, 'static', _type), + os.path.join('_static_gen', _type), + dirs_exist_ok=True) + shutil.copy2( + os.path.join(sphinx_theme.get_html_theme_path(html_theme), + html_theme, 'static', 'css', 'theme.css'), + os.path.join('_static_gen', 'theme.css'), + ) + + _import_theme() + html_static_path = ['_static_gen'] + html_static_path + # -- Options for LaTeX output ------------------------------------------------ latex_engine = 'xelatex' @@ -62,4 +109,7 @@ def setup(app): # -- Options for HTML output --------------------------------------------- + if stanford_theme_mod: + app.add_css_file('theme.css') app.add_css_file('theme_tweak.css') + app.add_css_file('pygments.css') diff --git a/docs/index.rst b/docs/index.rst index 09b9bec0..8a67a6f0 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,11 +3,11 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to DICE Embeddings documentation! +Welcome to DICE Embeddings! 
=========================================== `DICE Embeddings `_: Hardware-agnostic Framework for Large-scale Knowledge Graph Embeddings: -======= + .. toctree:: :maxdepth: 2 @@ -16,9 +16,3 @@ Welcome to DICE Embeddings documentation! usage/main autoapi/dicee/index -Indices and tables -------- - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/usage/main.md b/docs/usage/main.md index de2fbefc..9f89c45d 100644 --- a/docs/usage/main.md +++ b/docs/usage/main.md @@ -1,4 +1,4 @@ -## Dice Embeddings! +## Dicee Manual **Version:** dicee 0.1.3.2 @@ -313,8 +313,6 @@ from dicee import KGE KGE(path='...').deploy(share=True,top_k=10) ``` -Italian Trulli - ## Docker To build the Docker image: ``` diff --git a/requirements.txt b/requirements.txt index 0ca198b2..c267a2aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,9 @@ rdflib>=7.0.0 tiktoken>=0.5.1 matplotlib>=3.8.2 sphinx-autoapi>=3.0.0 -sphinx>=7.2.6 \ No newline at end of file +sphinx>=7.2.6 +myst-parser>=2.0.0 +sphinx_rtd_theme>=2.0.0 +sphinx-theme>=1.0 +sphinxcontrib-plantuml>=0.27 +plantuml-local-client>=1.2022.6 \ No newline at end of file