diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ba9208d4bd..a04cd49a50 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,16 +28,26 @@ jobs: runs-on: ubuntu-latest steps: - name: Fetch the code - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/checkout@v4 + - uses: conda-incubator/setup-miniconda@v3 with: - python-version: "3.8" - - name: Install dependencies + python-version: 3.9 + mamba-version: "*" + channels: conda-forge + channel-priority: true + activate-environment: monodocs-env + environment-file: monodocs-environment.yaml + - shell: bash -el {0} run: | - python -m pip install --upgrade pip - if [ -f doc-requirements.txt ]; then pip install -r doc-requirements.txt; fi + conda info + conda list + conda config --show-sources + conda config --show + printenv | sort + - name: Setup Graphviz + uses: ts-graphviz/setup-graphviz@v1 - name: Build the documentation + shell: bash -el {0} run: make docs generate_kustomize: diff --git a/.gitignore b/.gitignore index 646cb4bec1..a8e78b52d6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,10 +10,6 @@ _repos/ _rsts/ rsts_tmp/ .doctrees/ -docs/_sources/ -docs/flytekit/flytekit.interfaces.html -docs/searchindex.js -docs/ !flyteidl/protos/docs release/ __pycache__/ @@ -33,5 +29,14 @@ dist *.db vendor/ /docker/sandbox-bundled/images/tar -rsts/_tags/ -**/bin/ \ No newline at end of file +**/bin/ +docs/_tags/ +docs/flytectl +docs/protos +docs/flytekit +docs/flytesnacks +docs/examples +docs/_src +docs/_projects +docs/api +docs/tests diff --git a/.readthedocs.yml b/.readthedocs.yml index de76f3976a..b63d11acfc 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -16,4 +16,4 @@ sphinx: # Optionally set the version of Python and requirements required to build your docs python: install: - - requirements: doc-requirements.txt + - requirements: doc-requirements.in diff --git a/CHANGELOG/CHANGELOG-v0.13.0.md b/CHANGELOG/CHANGELOG-v0.13.0.md index 8e834c660a..a06c79c5c8 100644 --- a/CHANGELOG/CHANGELOG-v0.13.0.md +++ b/CHANGELOG/CHANGELOG-v0.13.0.md @@ -2,8 +2,8 @@ ## Platform - Oauth2 support with SingleSignOn and configuration examples for popular IDPs now available in Flyte. Please see the updated [information and description of the feature](https://github.com/flyteorg/flyte/blob/master/rsts/howto/authentication/index.rst), and the [setup information](https://github.com/flyteorg/flyte/blob/master/rsts/howto/authentication/setup.rst) **Attention: If using Auth already - this is a BREAKING change**. refer to the [migration guide](https://github.com/flyteorg/flyte/blob/master/rsts/howto/authentication/migration.rst) to update configuration to ensure Admin continues to work. (No migration needed if auth is not turned on.) + Please see the updated [information and description of the feature](https://github.com/flyteorg/flyte/blob/master/docs/howto/authentication/index.rst), and the [setup information](https://github.com/flyteorg/flyte/blob/master/docs/howto/authentication/setup.rst) + **Attention: If using Auth already - this is a BREAKING change**. Refer to the [migration guide](https://github.com/flyteorg/flyte/blob/master/docs/howto/authentication/migration.rst) to update configuration to ensure Admin continues to work. (No migration needed if auth is not turned on.) * Backend improvements to support dynamic workflow visualization (in future releases). * Lots of features added to [flytectl](https://flytectl.readthedocs.io/en/latest/).
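The workflow change above swaps the pip-based docs build for a conda environment plus a separate Graphviz setup step. To reproduce the CI docs build locally, here is a minimal sketch; it assumes a working conda/mamba installation and is run from the repo root, where this change adds the ``monodocs-environment.yaml`` file:

.. code:: shell

   # Create and activate the same environment the workflow uses
   mamba env create -n monodocs-env -f monodocs-environment.yaml
   conda activate monodocs-env

   # CI installs Graphviz via the ts-graphviz/setup-graphviz action; locally,
   # install it with your system package manager (e.g. apt-get install graphviz)

   # Build the docs the same way the workflow does
   make docs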
diff --git a/CHANGELOG/CHANGELOG-v1.3.0-b5.md b/CHANGELOG/CHANGELOG-v1.3.0-b5.md index 73858bf058..81ae2e1233 100644 --- a/CHANGELOG/CHANGELOG-v1.3.0-b5.md +++ b/CHANGELOG/CHANGELOG-v1.3.0-b5.md @@ -99,7 +99,7 @@ flytectl demo start --image ghcr.io/flyteorg/flyte-sandbox-bundled:sha-e240038be ``` ### Databricks Code -You'll need to upload an [entrypoint](https://gist.github.com/pingsutw/482e7f0134414dac437500344bac5134) file to your dbfs (or S3). This is the referenced gist from the primary [Databricks plugin documentation](https://github.com/flyteorg/flyte/blob/master/rsts/deployment/plugin_setup/webapi/databricks.rst) as well, which currently only covers the `flyte-core` Helm chart installation. +You'll need to upload an [entrypoint](https://gist.github.com/pingsutw/482e7f0134414dac437500344bac5134) file to your dbfs (or S3). This is the referenced gist from the primary [Databricks plugin documentation](https://github.com/flyteorg/flyte/blob/master/docs/deployment/plugin_setup/webapi/databricks.rst) as well, which currently only covers the `flyte-core` Helm chart installation. ### User Code diff --git a/Makefile b/Makefile index 84ed86f0f8..3400f627bb 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ include boilerplate/flyte/end2end/Makefile include boilerplate/flyte/golang_test_targets/Makefile define PIP_COMPILE -pip-compile $(1) --upgrade --verbose --resolver=backtracking +pip-compile $(1) --upgrade --verbose --resolver=backtracking --annotation-style=line endef GIT_VERSION := $(shell git describe --always --tags) @@ -81,7 +81,7 @@ helm_upgrade: ## Upgrade helm charts .PHONY: docs docs: - make -C rsts clean html SPHINXOPTS=-W + make -C docs clean html SPHINXOPTS=-W .PHONY: help help: SHELL := /bin/sh diff --git a/RELEASE.md b/RELEASE.md index d4c21f95da..acd72b3c94 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -7,7 +7,7 @@ 1. Open [issues](https://github.com/flyteorg/flyte/issues) and filter by milestone and make sure they are either closed or moved over to the next milestone. ## Start a release PR 1. Run [Generate Flyte Manifests workflow](https://github.com/flyteorg/flyte/actions/workflows/generate_flyte_manifest.yml). It'll create a PR ([example](https://github.com/flyteorg/flyte/pull/888)) -1. Update [docs version](https://github.com/flyteorg/flyte/blob/master/rsts/conf.py#L33) to match the milestone version. +1. Update [docs version](https://github.com/flyteorg/flyte/blob/master/docs/conf.py#L33) to match the milestone version. 1. Create a CHANGELOG file ([example](https://github.com/flyteorg/flyte/pull/888/files#diff-0c33dda4ecbd7e1116ddce683b5e143d85b22e43223ca258ecc571fb3b240a57)) 1. Wait for end-to-end tests to finish, then merge the PR.
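The ``PIP_COMPILE`` change above adds ``--annotation-style=line``, which is what produces the single-line ``# via`` annotations in the regenerated ``doc-requirements.txt`` below. As a sketch, assuming ``pip-tools`` is installed, the pins can be regenerated by hand with:

.. code:: shell

   # Regenerate doc-requirements.txt from doc-requirements.in using the
   # same flags as the Makefile's PIP_COMPILE macro
   pip install pip-tools
   pip-compile doc-requirements.in --upgrade --verbose --resolver=backtracking --annotation-style=line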
## Create a release diff --git a/doc-requirements.txt b/doc-requirements.txt index 289144e3ad..2bde883f6a 100644 --- a/doc-requirements.txt +++ b/doc-requirements.txt @@ -1,153 +1,61 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile doc-requirements.in +# pip-compile --annotation-style=line doc-requirements.in # -alabaster==0.7.13 - # via sphinx -astroid==2.15.6 - # via sphinx-autoapi -babel==2.12.1 - # via sphinx -beautifulsoup4==4.12.2 - # via - # furo - # sphinx-code-include -certifi==2023.7.22 - # via requests -cfgv==3.4.0 - # via pre-commit -charset-normalizer==3.2.0 - # via requests -codespell==2.2.6 - # via -r doc-requirements.in -distlib==0.3.7 - # via virtualenv -docutils==0.17.1 - # via - # sphinx - # sphinx-panels - # sphinx-tabs -filelock==3.12.4 - # via virtualenv -furo @ git+https://github.com/flyteorg/furo@main - # via -r doc-requirements.in -identify==2.5.29 - # via pre-commit -idna==3.4 - # via requests -imagesize==1.4.1 - # via sphinx -importlib-metadata==6.8.0 - # via sphinx -jinja2==3.0.3 - # via - # sphinx - # sphinx-autoapi - # sphinx-tabs -lazy-object-proxy==1.9.0 - # via astroid -markupsafe==2.1.3 - # via jinja2 -nodeenv==1.8.0 - # via pre-commit -packaging==23.1 - # via sphinx -platformdirs==3.10.0 - # via virtualenv -pre-commit==3.4.0 - # via sphinx-tags -pygments==2.16.1 - # via - # furo - # sphinx - # sphinx-prompt - # sphinx-tabs -pyyaml==6.0.1 - # via - # pre-commit - # sphinx-autoapi -requests==2.31.0 - # via - # sphinx - # sphinxcontrib-youtube -six==1.16.0 - # via - # sphinx-code-include - # sphinxext-remoteliteralinclude -snowballstemmer==2.2.0 - # via sphinx -soupsieve==2.5 - # via beautifulsoup4 -sphinx==4.5.0 - # via - # -r doc-requirements.in - # furo - # sphinx-autoapi - # sphinx-basic-ng - # sphinx-code-include - # sphinx-copybutton - # sphinx-fontawesome - # sphinx-issues - # sphinx-panels - # sphinx-prompt - # sphinx-tabs - # sphinx-tags - # sphinxcontrib-video - # sphinxcontrib-youtube -sphinx-autoapi==2.0.1 - # via -r doc-requirements.in -sphinx-basic-ng==1.0.0b2 - # via furo -sphinx-code-include==1.1.1 - # via -r doc-requirements.in -sphinx-copybutton==0.5.2 - # via -r doc-requirements.in -sphinx-fontawesome==0.0.6 - # via -r doc-requirements.in -sphinx-issues==3.0.1 - # via -r doc-requirements.in -sphinx-panels==0.6.0 - # via -r doc-requirements.in -sphinx-prompt==1.5.0 - # via -r doc-requirements.in -sphinx-tabs==3.4.0 - # via -r doc-requirements.in -sphinx-tags==0.2.1 - # via -r doc-requirements.in -sphinxcontrib-applehelp==1.0.4 - # via sphinx -sphinxcontrib-devhelp==1.0.2 - # via sphinx -sphinxcontrib-htmlhelp==2.0.1 - # via sphinx -sphinxcontrib-jsmath==1.0.1 - # via sphinx -sphinxcontrib-mermaid==0.9.2 - # via -r doc-requirements.in -sphinxcontrib-qthelp==1.0.3 - # via sphinx -sphinxcontrib-serializinghtml==1.1.5 - # via sphinx -sphinxcontrib-video==0.2.0 - # via -r doc-requirements.in -sphinxcontrib-youtube==1.2.0 - # via -r doc-requirements.in -sphinxext-remoteliteralinclude==0.4.0 - # via -r doc-requirements.in -typing-extensions==4.8.0 - # via astroid -unidecode==1.3.6 - # via sphinx-autoapi -urllib3==2.0.6 - # via requests -virtualenv==20.24.5 - # via pre-commit -wrapt==1.15.0 - # via astroid -zipp==3.17.0 - # via importlib-metadata +alabaster==0.7.13 # via sphinx +astroid==3.0.1 # via sphinx-autoapi +babel==2.13.1 # via sphinx +beautifulsoup4==4.12.2 # via furo, sphinx-code-include +certifi==2023.11.17 # via 
requests +cfgv==3.4.0 # via pre-commit +charset-normalizer==3.3.2 # via requests +codespell==2.2.6 # via -r doc-requirements.in +distlib==0.3.7 # via virtualenv +docutils==0.17.1 # via sphinx, sphinx-panels, sphinx-tabs +filelock==3.13.1 # via virtualenv +furo @ git+https://github.com/flyteorg/furo@main # via -r doc-requirements.in +identify==2.5.32 # via pre-commit +idna==3.6 # via requests +imagesize==1.4.1 # via sphinx +jinja2==3.0.3 # via sphinx, sphinx-autoapi, sphinx-tabs +markupsafe==2.1.3 # via jinja2 +nodeenv==1.8.0 # via pre-commit +packaging==23.2 # via sphinx +platformdirs==4.1.0 # via virtualenv +pre-commit==3.5.0 # via sphinx-tags +pygments==2.17.2 # via furo, sphinx, sphinx-prompt, sphinx-tabs +pyyaml==6.0.1 # via pre-commit, sphinx-autoapi +requests==2.31.0 # via sphinx, sphinxcontrib-youtube +six==1.16.0 # via sphinx-code-include, sphinxext-remoteliteralinclude +snowballstemmer==2.2.0 # via sphinx +soupsieve==2.5 # via beautifulsoup4 +sphinx==4.5.0 # via -r doc-requirements.in, furo, sphinx-autoapi, sphinx-basic-ng, sphinx-code-include, sphinx-copybutton, sphinx-fontawesome, sphinx-issues, sphinx-panels, sphinx-prompt, sphinx-tabs, sphinx-tags, sphinxcontrib-video, sphinxcontrib-youtube, sphinxext-remoteliteralinclude +sphinx-autoapi==2.0.1 # via -r doc-requirements.in +sphinx-basic-ng==1.0.0b2 # via furo +sphinx-code-include==1.1.1 # via -r doc-requirements.in +sphinx-copybutton==0.5.2 # via -r doc-requirements.in +sphinx-fontawesome==0.0.6 # via -r doc-requirements.in +sphinx-issues==3.0.1 # via -r doc-requirements.in +sphinx-panels==0.6.0 # via -r doc-requirements.in +sphinx-prompt==1.5.0 # via -r doc-requirements.in +sphinx-tabs==3.4.0 # via -r doc-requirements.in +sphinx-tags==0.2.1 # via -r doc-requirements.in +sphinxcontrib-applehelp==1.0.4 # via sphinx +sphinxcontrib-devhelp==1.0.2 # via sphinx +sphinxcontrib-htmlhelp==2.0.1 # via sphinx +sphinxcontrib-jsmath==1.0.1 # via sphinx +sphinxcontrib-mermaid==0.9.2 # via -r doc-requirements.in +sphinxcontrib-qthelp==1.0.3 # via sphinx +sphinxcontrib-serializinghtml==1.1.5 # via sphinx +sphinxcontrib-video==0.2.0 # via -r doc-requirements.in +sphinxcontrib-youtube==1.2.0 # via -r doc-requirements.in +sphinxext-remoteliteralinclude==0.4.0 # via -r doc-requirements.in +typing-extensions==4.8.0 # via astroid +unidecode==1.3.7 # via sphinx-autoapi +urllib3==2.1.0 # via requests +virtualenv==20.25.0 # via pre-commit # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/docs/.readthedocs.yaml b/docs/.readthedocs.yaml new file mode 100644 index 0000000000..9e9f236bc3 --- /dev/null +++ b/docs/.readthedocs.yaml @@ -0,0 +1,17 @@ +# .readthedocs.yml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 +build: + os: "ubuntu-22.04" + tools: + python: "mambaforge-22.9" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +conda: + environment: monodocs-environment.yaml diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000..0bcb4e2f47 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,23 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = flyte +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +clean: + rm -rf _build _src api flytectl flytekit flytesnacks protos examples diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/_ext/auto_examples.py b/docs/_ext/auto_examples.py new file mode 100644 index 0000000000..6442c52fe1 --- /dev/null +++ b/docs/_ext/auto_examples.py @@ -0,0 +1,153 @@ +"""Custom extension to auto-generate example docs from example directory.""" + +import inspect +import shutil +from pathlib import Path + +import jupytext +from docutils import nodes +from docutils.statemachine import StringList, string2lines +from sphinx.application import Sphinx +from sphinx.config import Config +from sphinx.util.docutils import SphinxDirective + +__version__ = "0.0.0" + + +TOC_TEMPLATE = """ +```{{toctree}} +:maxdepth: 1 +:hidden: +{toc} +``` +""" + +TABLE_TEMPLATE = """ +```{{list-table}} +:header-rows: 0 +:widths: 100 +{rows} +``` +""" + + +class AutoExamplesTOC(SphinxDirective): + """Custom directive to convert examples into table of contents.""" + + has_content = True + + def run(self) -> list: + return [self.parse()] + + def get_root_fp(self) -> str: + index_fp, _ = self.get_source_info() + index_fp = Path(index_fp) + example_fp = str(index_fp).split(f"{self.config.auto_examples_dir_dest}/")[-1] + return str(Path(self.config.auto_examples_dir_dest) / Path(example_fp).parent) + + def parse(self): + """Parses the directive""" + + root_fp = self.get_root_fp() + toc, rows = "", "" + for filename in self.content: + toc += f"\n{filename}" + rows += f"\n* - {{fa}}`file` {{doc}}`/{root_fp}/{filename}`" + + container = nodes.container("") + toc = inspect.cleandoc(TOC_TEMPLATE.format(toc=toc)) + table = inspect.cleandoc(TABLE_TEMPLATE.format(rows=rows)) + content = f"{toc}\n\n{table}" + + self.state.nested_parse(StringList(string2lines(content)), 0, container) + return container + + +# This allows the sphinx myst parser to recognize markdown files as something +# that it can potentially execute +MYST_NOTEBOOK_METADATA = { + "jupytext": { + "notebook_metadata_filter": "all", + "cell_metadata_filter": "all", + "formats": "md:myst", + "text_representation": { + "extension": ".md", + "format_name": "myst", + }, + }, + "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, +} + + +def convert_to_mdmyst(file: Path, dest_dir: Path, from_format: str): + notebook = jupytext.read(file, fmt=from_format) + jupytext.header.recursive_update(notebook.metadata, MYST_NOTEBOOK_METADATA) + jupytext.write( + notebook, + dest_dir / f"{file.stem}.md", + fmt="md:myst", + ) + + +def convert_py_example(file: Path, dest_dir: Path): + """ + Converts a python file in the specified auto examples directory. + + Converting sphinx-gallery format python files to .rst is only supported + for backwards compatibility. The py:percent format conversion to myst + markdown is the strongly encouraged format.
+ """ + convert_to_mdmyst(file, dest_dir, "py:percent") + + +def generate_auto_examples(app: Sphinx, config: Config): + """Converts all example files into myst markdown format.""" + # copy files over to docs directory + if config.auto_examples_refresh: + shutil.rmtree(config.auto_examples_dir_dest) + + if Path(config.auto_examples_dir_dest).exists(): + return + + for source_dir in (x for x in Path(config.auto_examples_dir_root).glob("*") if x.is_dir()): + source_dir = Path(source_dir) + dest_dir = Path(config.auto_examples_dir_dest, *source_dir.parts[-1:]) + + dest_dir.mkdir(exist_ok=True, parents=True) + + # copy README.md file for root project content and table of contents + shutil.copy(source_dir / "README.md", dest_dir / "index.md") + + # assume that the python source files are in a directory with the same + # name as the project directory + project_name = source_dir.name + assert (source_dir / project_name).exists(), ( + "Python example files must be in a directory with the same " f"name as the project directory {project_name}" + ) + + for f in (x for x in source_dir.glob(f"{project_name}/*.py") if x.name != "__init__.py"): + # converts py:percent python files to myst markdown + convert_to_mdmyst(f, dest_dir, "py:percent") + + for f in (x for x in source_dir.glob(f"{project_name}/*.ipynb")): + convert_to_mdmyst(f, dest_dir, from_format="ipynb") + + for f in (x for x in source_dir.glob(f"{project_name}/*.md")): + convert_to_mdmyst(f, dest_dir, from_format="md") + + for f in (x for x in source_dir.glob("**/Dockerfile")): + shutil.copy(f, dest_dir) + + +def setup(app: Sphinx) -> dict: + app.add_config_value("auto_examples_refresh", None, False) + app.add_config_value("auto_examples_dir_root", None, False) + app.add_config_value("auto_examples_dir_dest", None, False) + app.connect("config-inited", generate_auto_examples, priority=2) + app.add_directive("auto-examples-toc", AutoExamplesTOC) + return { + "version": __version__, + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/docs/_ext/import_projects.py b/docs/_ext/import_projects.py new file mode 100644 index 0000000000..b4cb011012 --- /dev/null +++ b/docs/_ext/import_projects.py @@ -0,0 +1,139 @@ +import os +import re +import shutil +import subprocess +import sys +from dataclasses import dataclass, field +from functools import partial +from pathlib import Path +from typing import Optional, List, Union + +from git import Repo +from sphinx.application import Sphinx +from sphinx.config import Config + +__version__ = "0.0.0" + + +@dataclass +class ImportProjectsConfig: + clone_dir: str + flytekit_api_dir: str + source_regex_mapping: dict = field(default_factory=dict) + + +@dataclass +class Project: + source: str + dest: str + local: bool = False + cmd: Optional[List[Union[str, List[str]]]] = None + docs_path: Optional[str] = None + refresh: bool = False + + +def update_sys_path_for_flytekit(import_project_config: ImportProjectsConfig): + # add flytekit to python path + flytekit_dir = os.path.abspath(import_project_config.flytekit_api_dir) + flytekit_src_dir = os.path.abspath(os.path.join(flytekit_dir, "flytekit")) + plugins_dir = os.path.abspath(os.path.join(flytekit_dir, "plugins")) + + sys.path.insert(0, flytekit_src_dir) + sys.path.insert(0, flytekit_dir) + + # add plugins to python path + for possible_plugin_dir in os.listdir(plugins_dir): + dir_path = os.path.abspath((os.path.join(plugins_dir, possible_plugin_dir))) + plugin_path = os.path.abspath(os.path.join(dir_path, "flytekitplugins")) + if os.path.isdir(dir_path) and
os.path.exists(plugin_path): + sys.path.insert(0, dir_path) + + +def import_projects(app: Sphinx, config: Config): + """Clone projects from git or copy from local directory.""" + projects = [Project(**p) for p in config.import_projects] + import_projects_config = ImportProjectsConfig(**config.import_projects_config) + + srcdir = Path(app.srcdir) + + for _dir in ( + import_projects_config.clone_dir, + import_projects_config.flytekit_api_dir, + ): + (srcdir / _dir).mkdir(parents=True, exist_ok=True) + + for project in projects: + if project.local: + local_dir = srcdir / project.source + else: + local_dir = srcdir / import_projects_config.clone_dir / project.dest + shutil.rmtree(local_dir, ignore_errors=True) + Repo.clone_from(project.source, local_dir) + + local_docs_path = local_dir / project.docs_path + dest_docs_dir = srcdir / project.dest + + if project.refresh or not dest_docs_dir.exists(): + shutil.rmtree(dest_docs_dir, ignore_errors=True) + shutil.copytree(local_docs_path, dest_docs_dir, dirs_exist_ok=True) + + if project.cmd: + if isinstance(project.cmd[0], list): + for c in project.cmd: + subprocess.run(c) + else: + subprocess.run(project.cmd) + + # remove cloned directories + shutil.rmtree(import_projects_config.clone_dir) + + # add flytekit and plugins to path so that API reference can build + update_sys_path_for_flytekit(import_projects_config) + + # add functions to clean up source and docstring refs + for i, (patt, repl) in enumerate(import_projects_config.source_regex_mapping.items()): + app.connect( + "source-read", + partial(replace_refs_in_files, patt, repl), + priority=i, + ) + app.connect( + "autodoc-process-docstring", + partial(replace_refs_in_docstrings, patt, repl), + priority=i, + ) + + +def replace_refs_in_files(patt: str, repl: str, app: Sphinx, docname: str, source: List[str]): + text = source[0] + + if re.search(patt, text): + text = re.sub(patt, repl, text) + + # replace source file + source[0] = text + + +def replace_refs_in_docstrings( + patt: str, repl: str, app: Sphinx, what: str, name: str, obj: str, options: dict, lines: List[str], +): + replace = {} + for i, text in enumerate(lines): + if re.search(patt, text): + text = re.sub(patt, repl, text) + replace[i] = text + + for i, text in replace.items(): + lines[i] = text + + +def setup(app: Sphinx) -> dict: + app.add_config_value("import_projects_config", None, False) + app.add_config_value("import_projects", None, False) + app.connect("config-inited", import_projects, priority=0) + + return { + "version": __version__, + "parallel_read_safe": True, + "parallel_write_safe": True, + } diff --git a/docs/_static/custom.css b/docs/_static/custom.css new file mode 100644 index 0000000000..f9687a7b15 --- /dev/null +++ b/docs/_static/custom.css @@ -0,0 +1,38 @@ +.getting-started-panels div.card-body { + padding: 0; +} + +.getting-started-panels a.btn-primary { + color: white !important; + background-color: var(--color-link); + border-color: var(--color-link); +} + +.getting-started-panels a.btn-outline-primary { + color: white !important; + background-color: var(--color-link); + opacity: 0.5; + border-color: var(--color-link); +} + +html .tabbed-set > label { + color: var(--color-foreground-border); +} + +html .tabbed-set > input:checked + label { + border-color: var(--color-link); + color: var(--color-link); +} + +html .tabbed-set > label:hover { + color: var(--color-link); +} + +html .tabbed-content { + box-shadow: 0 -.0625rem var(--color-background-border),0 .0625rem var(--color-background-border); +} + +table { + 
width: 100%; + box-shadow: none !important; +} diff --git a/docs/_static/flyte.css b/docs/_static/flyte.css new file mode 100644 index 0000000000..a0e5225c25 --- /dev/null +++ b/docs/_static/flyte.css @@ -0,0 +1,534 @@ +h1, +h2, +h3, +h4, +h5, +h6 { + font-weight: 400; + margin-top: 1.75rem; +} + +.sidebar-scroll { + scroll-behavior: auto; +} + +.sidebar-tree ul:first-child li:not(:last-child) { + margin-bottom: 5px; +} + +.sidebar-tree .caption { + text-transform: none; + margin-bottom: 12px; + margin-left: 16px; + margin-top: 30px; +} + +.sidebar-tree li.current-page > a.current { + background: #9d68e41f; + font-weight: unset; +} + +.sidebar-tree li.current-page.toctree-l2 > a.current { + background: var(--color-sidebar-item-background--hover); + font-weight: unset; +} + +/* Algolia Docs Search Style */ +.docsearch { + width: 100% !important; +} + +.DocSearch-Button { + height: 60px !important; + width: 100% !important; + margin: 0px !important; + border-radius: 0 !important; + border-bottom: 1px solid var(--color-header-border) !important; + background: var(--color-sidebar-background) !important; + padding: 0 15px !important; +} + +.DocSearch-Button:hover, .DocSearch-Button:active { + box-shadow: none !important; + background: var(--docsearch-searchbox-background) !important; +} + +.sidebar-search-container::before { + content: none; +} + +.tabbed-set > input:checked + label { + border-color: var(--color-link); + color: var(--color-link); +} + +.tabbed-set > input:checked + label:hover { + color: var(--color-link); +} + +@media only screen and (min-width: 1200px) { + .sidebar-drawer { + width: 20em; + } +} + +.sidebar-brand { + flex-direction: row; + align-items: center; + justify-content: center; +} + +.sidebar-brand-text { + font-size: 2.5em; + line-height: 2.5em; +} + +.sidebar-container { + width: 20em; +} + +.sidebar-logo { + max-width: 60px; + margin: 0 15px 0 0; + float: right; +} + +.sidebar-tree i.fa { + padding-right: 15px; + width: 1.75em; + text-align: center; +} + +.sidebar-tree .caption { + padding: 0; +} + +.caption-text { + font-size: 15px; + font-weight: 600; + color: #696969; +} + +/* .sidebar-tree .reference.external:after { + content: none; +} */ + +.sphx-glr-thumbnails { + display: block; +} + +.sphx-glr-thumbcontainer { + background-color: var(--color-background-secondary); + border: 1px solid var(--color-background-border); + box-shadow: 0 0.2rem 0.5rem rgba(0, 0, 0, 0.05), + 0 0 0.0625rem rgba(0, 0, 0, 0.1) !important; + min-height: 60px; + width: 100%; + padding-top: 0px; + margin: 10px 0; + display: block; +} + +.sphx-glr-thumbcontainer:hover { + border: 1px solid #cca9ff; + background-color: #f2e9ff; + box-shadow: none !important; +} + +body:not([data-theme="light"]) .sphx-glr-thumbcontainer:hover { + border: 1px solid #2a144a; + background-color: #2a144a; +} + +.sphx-glr-thumbcontainer img { + float: left; + width: 30px; + margin-top: 14px; + margin-left: 14px; + opacity: 0.5; +} + +.sphx-glr-thumbnail-title { + margin-top: 18px; + margin-left: 65px; + color: #696969; + font-size: 15px; +} + +div.sphx-glr-download-link-note, +p.sphx-glr-signature { + height: 0px; + visibility: hidden; + padding: 0; + margin: 0; +} + +div.sphx-glr-footer { + width: 100%; + display: flex; + border-top: 1px solid var(--color-background-border); + border-bottom: 1px solid var(--color-background-border); + margin: 40px 0px 10px 0px; +} + +div.sphx-glr-download { + margin: 0; + width: 50%; +} + +.sphx-glr-thumbcontainer a.internal { + position: absolute; + padding: 10px; + 
padding-left: 75px; + font-size: 0.9rem; + text-decoration: none; + color: var(--color-sidebar-link-text); + /* center text in example cards */ + display: flex; + justify-content: left; + align-items: center; +} + +.sphx-glr-thumbcontainer:hover a.internal { + text-decoration: none; + color: var(--color-link); +} + +/* hide download thumbnail captions and sphinx gallery script timing */ +.sphx-glr-thumbcontainer[tooltip]:hover:before, +.sphx-glr-thumbcontainer[tooltip]:hover:after, +.sphx-glr-timing { + display: none; +} + +div.sphx-glr-download a { + color: var(--color-sidebar-link-text); + background: transparent; + background-image: none; + border: none; + border-radius: 0; + font-family: inherit; + font-weight: lighter; + font-size: 0.9em; + padding: 30px 0px; + width: 100%; +} + +div.sphx-glr-download a:hover { + box-shadow: none; + background: var(--color-sidebar-item-background--hover); + border: none; + border-radius: 0; +} + +div.sphx-glr-footer div.sphx-glr-download p { + margin: 0; +} + +div.sphx-glr-footer div.sphx-glr-download:first-child p { + border-right: 1px solid var(--color-background-border); +} + +div.sphx-glr-thumbcontainer a.headerlink { + display: none; +} + +.sphx-glr-script-out .highlight pre { + background-color: #f8f8f8; +} + +p.sphx-glr-script-out { + padding-top: 0em; +} + +.search__outer::-webkit-scrollbar-track { + border-radius: 0px; +} + +@media (prefers-color-scheme: dark) { + .search__outer { + background-color: #131416 !important; + border: 1px solid #131416 !important; + } + .search__outer__input { + background-color: #1a1c1e !important; + } + .search__result__single { + border-bottom-color: #303335 !important; + } + .outer_div_page_results:hover { + background-color: black; + } + .search__result__title, + .rtd_ui_search_subtitle { + color: #9d68e4 !important; + border-bottom: 1px solid #9d68e4 !important; + } + .search__outer .search__result__title span, + .search__outer .search__result__content span { + background-color: #9d68e454; + } + .search__result__subheading, + .search__result__content { + color: #ffffffd9 !important; + } + .search__outer::-webkit-scrollbar-track { + background-color: #131416 !important; + } + .rtd__search__credits { + background-color: #1a1c1e !important; + border: 1px solid #1a1c1e !important; + color: #81868d !important; + } + .rtd__search__credits a, + .search__error__box { + color: #9ca0a5 !important; + } + details.sphinx-bs.dropdown { + background: var(--color-background-primary); + border: 1px solid var(--color-background-border); + } + details.sphinx-bs.dropdown .card-header { + color: var(--color-link); + } +} + +div.sphinx-bs .card { + flex-direction: row; +} + +/* sphinx-panels custom styles */ +div.sphinx-bs .card-header { + border-bottom: none; + background-color: var(--color-background-primary); + display: flex; + align-items: center; + justify-content: left; + width: 35%; + float: left; +} + +.sphinx-bs .card-header:first-child { + border-radius: calc(0.25rem - 1px) 0 0 calc(0.25rem - 1px); +} + +div.sphinx-bs .card-header .sphinx-bs.btn, +div.sphinx-bs .card-body .sphinx-bs.btn, +div.sphinx-bs .card-header p.card-text { + font-size: 1rem; + text-decoration: none; + word-spacing: 2.5px; + color: var(--color-sidebar-link-text); +} + +div.sphinx-bs .card-header p.card-text a { + text-align: left; +} + +.sphinx-bs.btn:focus { + box-shadow: none; +} + +div.sphinx-bs .card-body { + width: 65%; + float: left; +} + +.sphinx-bs .card-body .fa { + color: var(--color-sidebar-link-text); +} + +.sphinx-bs .card-body:hover .fa {
color: var(--color-link--hover); +} + +.sphinx-bs .card-body .fa { + font-size: 2rem; +} + +div.sphinx-bs .card:hover { + box-shadow: none !important; + border-color: #cca9ff; +} + +div.sphinx-bs .card:hover .card-header { + background-color: #f2e9ff; + color: #fff; +} + +body[data-theme="dark"] div.sphinx-bs .card:hover { + border-color: #2a144a; +} + +body[data-theme="dark"] div.sphinx-bs .card:hover .card-header { + background-color: #2a144a; + color: #fff; +} + +/* make sure hover style is consistent if user prefers dark theme at OS level */ +@media (prefers-color-scheme: dark) { + body:not([data-theme="light"]) div.sphinx-bs .card:hover { + border-color: #2a144a; + } + body:not([data-theme="light"]) div.sphinx-bs .card:hover .card-header { + background-color: #2a144a; + color: #fff; + } +} + +div.sphinx-bs .card:hover .sphinx-bs.btn { + color: var(--color-link); +} + +div.sphinx-bs .card:hover .card-body .sphinx-bs.btn { + color: var(--color-link--hover); +} + +.getting-started-panels div.sphinx-bs .sphinx-bs.btn:hover { + border-color: var(--color-link); + background-color: #9d68e4; + color: #ffffff; +} + +div.sphinx-bs .card { + background-color: var(--color-background-secondary); + border: 1px solid var(--color-background-border); +} + +.center-card-content p { + margin: auto !important; +} + +.sphinx-tabs { + padding-top: 10px; +} + +.sphinx-tabs-tab { + color: var(--color-link); +} + +/* sphinx tabs */ +.sphinx-tabs-tab[aria-selected="true"] { + background-color: var(--color-background-primary); + border: 1px solid var(--color-background-border); + border-bottom: 1px solid var(--color-background-primary); +} + +.sphinx-tabs-panel { + border: 1px solid var(--color-background-border); + background: var(--color-background-primary); + border-top: 0; +} + +[role="tablist"] { + border-bottom: 1px solid var(--color-background-border); +} + +/* mermaid diagrams */ +div.mermaid { + /* hide the div if the raw data hasn't been processed */ + display: none; +} + +div.mermaid[data-processed="true"] { + display: block; + background: var(--color-background-secondary); + padding: 20px 0; +} + +div.mermaid svg { + max-height: 800px; +} + +div.mermaid .messageText { + fill: var(--color-foreground-primary) !important; + stroke: var(--color-foreground-primary) !important; +} + +div.mermaid line { + stroke: var(--color-foreground-secondary) !important; +} + +/* code cell styles */ +div.highlight a { + text-decoration-color: var(--color-content-foreground); +} + +div.sphx-glr-script-out .highlight { + margin-left: 0; + margin-top: 0; +} + +.sphx-glr-script-out .highlight pre { + background-color: var(--color-background-secondary); +} + +/* Make sure autosummary table cells have left text alignment */ +.article .align-center, +article .align-default { + text-align: left; +} + +/* rate-the-docs custom styles */ +.ratd-widget { + z-index: 10000 !important; + right: 0 !important; + font-size: 0.85em !important; + background-color: var(--color-sidebar-background) !important; + border: 1px solid var(--color-sidebar-background-border) !important; + color: var(--color-sidebar-link-text) !important; +} + +.ratd-widget .btn { + color: var(--color-sidebar-link-text) !important; +} + +.ratd-toggle.ratd-toggle-close { + margin: 12px 0 !important; +} + +.ratd-panel-thanks { + padding-top: 10px !important; +} + +.ratd-panel-thanks[aria-hidden="true"] { + padding-top: 0px !important; +} + +.ratd-widget .ratd-panel-suggestion .btn { + background-color: var(--color-link); + border-color: var(--color-link); + color: 
var(--color-content-foreground) !important; +} + +.ratd-widget i { + color: var(--color-foreground-muted) !important; +} + +/* readthedocs custom styling */ +.rst-versions.rst-badge { + bottom: 80px !important; + max-width: none !important; +} + +.rst-versions.rst-badge.shift-up .rst-current-version .fa-book { + padding-top: 8px !important; +} + +/* tag styles */ +p.tags span { + font-weight: bold +} + +p.tags a { + background: var(--color-background-secondary); + border: 1px solid var(--color-background-border); + padding: 5px 10px; + border-radius: 10px; + font-size: 0.9em; + margin-right: 5px; +} + +p.tags span:not(:first-child) { + display: none; +} diff --git a/docs/_templates/custom.rst b/docs/_templates/custom.rst new file mode 100644 index 0000000000..17c9b00963 --- /dev/null +++ b/docs/_templates/custom.rst @@ -0,0 +1,42 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +{% if objtype == 'class' %} + +.. autoclass:: {{ objname }} + + {% block methods %} + {% if methods %} + + .. rubric:: {{ _('Methods') }} + {% for item in methods %} + + {% if item != '__init__' %} + .. automethod:: {{ item }} + {% endif %} + + {%- endfor %} + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + + .. rubric:: {{ _('Attributes') }} + {% for item in attributes %} + .. autoattribute:: {{ item }} + :noindex: + {%- endfor %} + + {% endif %} + {% endblock %} + + +{% endif %} + +{% if objtype == 'function' %} + +.. autofunction:: {{ objname }} + +{% endif %} diff --git a/docs/_templates/file_types.rst b/docs/_templates/file_types.rst new file mode 100644 index 0000000000..e7629ea363 --- /dev/null +++ b/docs/_templates/file_types.rst @@ -0,0 +1,39 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +{% if objname == 'FlyteFile' %} + +.. autoclass:: {{ objname }} + + {% block methods %} + {% if methods %} + + .. rubric:: {{ _('Methods') }} + {% for item in methods %} + + {% if item != '__init__' %} + .. automethod:: {{ item }} + {% endif %} + + {%- endfor %} + {% endif %} + {% endblock %} + + {% block attributes %} + {% if attributes %} + + .. rubric:: {{ _('Attributes') }} + {% for item in attributes %} + .. autoattribute:: {{ item }} + {%- endfor %} + + {% endif %} + {% endblock %} + + +{% else %} + +.. autodata:: {{ objname }} + +{% endif %} diff --git a/docs/community/contribute.rst b/docs/community/contribute.rst new file mode 100644 index 0000000000..b914769f93 --- /dev/null +++ b/docs/community/contribute.rst @@ -0,0 +1,737 @@ +.. _contribute_Flyte: + +##################### +Contributing to Flyte +##################### + +.. tags:: Contribute, Basic + +Thank you for taking the time to contribute to Flyte! +Please read our `Code of Conduct `__ before contributing to Flyte. + +Here are some guidelines for you to follow, which will make your first and follow-up contributions easier. + +TL;DR: Find the repo-specific contribution guidelines in the `Component Reference <#component-reference>`__ section. + +💻 Becoming a contributor +========================= + +An issue tagged with `good first issue `__ is the best place to start for first-time contributors. + +**Appetizer for every repo: Fork and clone the relevant repository. Create a new branch on your fork and make the required changes. Create a pull request once your work is ready for review.** + +.. note:: + To open a pull request, refer to `GitHub's guide `__ for detailed instructions. + +Example PR for your reference: `GitHub PR `__.
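Concretely, the fork-and-branch flow looks roughly like this; a sketch where ``<your-username>`` and ``<feature-branch>`` are placeholders for your own fork and branch:

.. code:: shell

   # Clone your fork and create a working branch
   git clone https://github.com/<your-username>/flyte.git
   cd flyte
   git checkout -b <feature-branch>

   # Make your changes, then commit with a DCO sign-off (see below) and push
   git commit -s -m "Describe your change"
   git push origin <feature-branch>

   # Finally, open a pull request from your fork against flyteorg/flyte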
+A couple of checks are introduced to help maintain the robustness of the project. + +#. To get through DCO, sign off on every commit (`Reference `__) +#. To improve code coverage, write unit tests to test your code +#. Make sure all the tests pass. If you face any issues, please let us know + +On a side note, format your Go code with ``golangci-lint`` followed by ``goimports`` (use ``make lint`` and ``make goimports``), and Python code with ``black`` and ``isort`` (use ``make fmt``). +If make targets are not available, you can manually format the code. +Refer to `Effective Go `__, `Black `__, and `Isort `__ for full coding standards. + +As you become more involved with the project, you may be able to be added as a contributor to the repos you're working on, +but there is a medium-term effort to move all development to forks. + +📃 Documentation +================ + +Flyte uses Sphinx for documentation. ``protoc-gen-doc`` is used to generate the documentation from ``.proto`` files. + +Sphinx spans multiple repositories under `flyteorg `__. It uses reStructuredText (RST) files to store the documentation content. +For API- and code-related content, it extracts docstrings from the code files. + +To get started, refer to the `reStructuredText reference `__. + +For minor edits that don't require a local setup, you can edit the page directly on GitHub to propose improvements. + +Intersphinx +*********** + +`Intersphinx `__ can generate automatic links to the documentation of objects in other projects. + +To establish a reference to any other documentation from Flyte or within it, use Intersphinx. + +To do so, create an ``intersphinx_mapping`` in the ``conf.py`` file which should be present in the respective ``docs`` repository. +For example, ``docs`` is the docs directory for the ``flyte`` repo. + +For example: + +.. code-block:: python + + intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "flytekit": ("https://flyte.readthedocs.io/projects/flytekit/en/master/", None), + } + +The key refers to the name used to refer to the file (while referencing the documentation), and the URL denotes the precise location. + +Here is an example using ``:std:doc``: + +* Direct reference + + .. code-block:: text + + Task: :std:doc:`/api/flytekit/generated/flytekit.task` + + Output: + + Task: :std:doc:`/api/flytekit/generated/flytekit.task` + +* Custom name + + .. code-block:: text + + :std:doc:`Using custom words ` + + Output: + + :std:doc:`Using custom words ` + +| + +You can cross-reference multiple Python objects. Check out this `section `__ to learn more. + +| + +For instance, the ``task`` decorator in flytekit uses the ``func`` role. + +.. code-block:: text + + Link to flytekit code :py:func:`flytekit:flytekit.task` + +Output: + +Link to flytekit code :py:func:`flytekit:flytekit.task` + +| + +Here are a couple more examples. + +.. code-block:: text + + :py:mod:`Module ` + :py:class:`Class ` + :py:data:`Data ` + :py:func:`Function ` + :py:meth:`Method ` + +Output: + +:py:mod:`Module ` + +:py:class:`Class ` + +:py:data:`Data ` + +:py:func:`Function ` + +:py:meth:`Method ` + +🧱 Component reference +====================== + +To understand how the below components interact with each other, refer to :ref:`Understand the lifecycle of a workflow `. + +..
figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/contribution_guide/dependency_graph.png + :alt: Dependency graph between various flyteorg repos + :align: center + :figclass: align-center + + The dependency graph between various flyte repos + +``flyte`` +********* + +.. list-table:: + + * - `Repo `__ + * - **Purpose**: Deployment, Documentation, and Issues + * - **Languages**: Kustomize & RST + +.. note:: + For the ``flyte`` repo, run the following command in the repo's root to generate documentation locally. + + .. code-block:: console + + make -C docs html + +``flyteidl`` +************ + +.. list-table:: + + * - `Repo `__ + * - **Purpose**: Flyte workflow specification is in `protocol buffers `__ which forms the core of Flyte + * - **Language**: Protobuf + * - **Guidelines**: Refer to the `README `__ + +``flytepropeller`` +****************** + +.. list-table:: + + * - `Repo `__ | `Code Reference `__ + * - **Purpose**: Kubernetes-native operator + * - **Language**: Go + * - **Guidelines:** + + * Check for Makefile in the root repo + * Run the following commands: + * ``make generate`` + * ``make test_unit`` + * ``make lint`` + * To compile, run ``make compile`` + +``flyteadmin`` +************** + +.. list-table:: + + * - `Repo `__ | `Code Reference `__ + * - **Purpose**: Control Plane + * - **Language**: Go + * - **Guidelines**: + + * Check for Makefile in the root repo + * If the service code has to be tested, run it locally: + * ``make compile`` + * ``make server`` + * To seed data locally: + * ``make compile`` + * ``make seed_projects`` + * ``make migrate`` + * To run integration tests locally: + * ``make integration`` + * (or to run them in a containerized Kubernetes cluster): ``make k8s_integration`` + +``flytekit`` +************ + +.. list-table:: + + * - `Repo `__ + * - **Purpose**: Python SDK & Tools + * - **Language**: Python + * - **Guidelines**: Refer to the `Flytekit Contribution Guide `__ + +``flyteconsole`` +**************** + +.. list-table:: + + * - `Repo `__ + * - **Purpose**: Admin Console + * - **Language**: Typescript + * - **Guidelines**: Refer to the `README `__ + +``datacatalog`` +*************** + +.. list-table:: + + * - `Repo `__ | `Code Reference `__ + * - **Purpose**: Manage Input & Output Artifacts + * - **Language**: Go + +``flyteplugins`` +**************** + +.. list-table:: + + * - `Repo `__ | `Code Reference `__ + * - **Purpose**: Flyte Plugins + * - **Language**: Go + * - **Guidelines**: + + * Check for Makefile in the root repo + * Run the following commands: + * ``make generate`` + * ``make test_unit`` + * ``make lint`` + +``flytestdlib`` +*************** + +.. list-table:: + + * - `Repo `__ + * - **Purpose**: Standard Library for Shared Components + * - **Language**: Go + +``flytesnacks`` +*************** + +.. list-table:: + + * - `Repo `__ + * - **Purpose**: Examples, Tips, and Tricks to use Flytekit SDKs + * - **Language**: Python (In the future, Java examples will be added) + * - **Guidelines**: Refer to the `Flytesnacks Contribution Guide `__ + +``flytectl`` +************ + +..
list-table:: + + * - `Repo `__ + * - **Purpose**: A standalone Flyte CLI + * - **Language**: Go + * - **Guidelines**: Refer to the `FlyteCTL Contribution Guide `__ + + +🔮 Development Environment Setup Guide +====================================== + +This guide provides a step-by-step approach to setting up a local development environment for +`flyteidl `_, `flyteadmin `_, +`flyteplugins `_, `flytepropeller `_, +`flytekit `_ , `flyteconsole `_, +`datacatalog `_, and `flytestdlib `_. + +The video below is a tutorial on how to set up a local development environment for Flyte. + +.. youtube:: V-KlVQmQAjE + +Requirements +************ + +This guide has been tested and used on AWS EC2 with an Ubuntu 22.04 +image. The following tools are required: + +- `Docker `__ +- `Kubectl `__ +- `Go `__ + +Content +******* + +- `How to setup dev environment for flyteidl, flyteadmin, flyteplugins, + flytepropeller, datacatalog and flytestdlib? <#how-to-setup-dev-environment-for-flyteidl-flyteadmin-flyteplugins-flytepropeller-datacatalog-and-flytestdlib>`__ + +- `How to setup dev environment for + flytekit? <#how-to-setup-dev-environment-for-flytekit>`__ + +- `How to setup dev environment for + flyteconsole? <#how-to-setup-dev-environment-for-flyteconsole>`__ + +- `How to access Flyte UI, minio, postgres, k3s, and endpoints? + <#how-to-access-flyte-ui-minio-postgres-k3s-and-endpoints>`__ + +How to setup dev environment for flyteidl, flyteadmin, flyteplugins, flytepropeller, datacatalog and flytestdlib? +****************************************************************************************************************************** + +**1. Install flytectl** + + +`Flytectl `__ is a portable and lightweight command-line interface to work with Flyte. + +.. code:: shell + + # Step1: Install the latest version of flytectl + curl -sL https://ctl.flyte.org/install | bash + # flyteorg/flytectl info checking GitHub for latest tag + # flyteorg/flytectl info found version: 0.6.39 for v0.6.39/Linux/x86_64 + # flyteorg/flytectl info installed ./bin/flytectl + + # Step2: Export flytectl path based on the previous log "flyteorg/flytectl info installed ./bin/flytectl" + export PATH=$PATH:/home/ubuntu/bin # replace with your path + +**2. Build a k3s cluster that runs minio and postgres Pods.** + + +| `Minio `__ is an S3-compatible object store that will be used later to store task output, input, etc. +| `Postgres `__ is an open-source object-relational database that will later be used by flyteadmin/dataCatalog to + store all Flyte information. + +.. code:: shell + + # Step1: Start k3s cluster, create Pods for postgres and minio. Note: we cannot access the Flyte UI yet, but we can access the minio console now. + flytectl demo start --dev + # 👨‍💻 Flyte is ready! Flyte UI is available at http://localhost:30080/console 🚀 🚀 🎉 + # ❇️ Run the following command to export demo environment variables for accessing flytectl + # export FLYTECTL_CONFIG=/home/ubuntu/.flyte/config-sandbox.yaml + # 🐋 Flyte sandbox ships with a Docker registry. Tag and push custom workflow images to localhost:30000 + # 📂 The Minio API is hosted on localhost:30002. Use http://localhost:30080/minio/login for Minio console + + # Step2: Export FLYTECTL_CONFIG as the previous log indicated. + export FLYTECTL_CONFIG=/home/ubuntu/.flyte/config-sandbox.yaml + + # Step3: The kubeconfig will be automatically copied to the user's main kubeconfig (default is `~/.kube/config`) with "flyte-sandbox" as the context name.
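+ # (Optional) Confirm that the "flyte-sandbox" context exists; this assumes kubectl is installed, as listed in the requirements above. + kubectl config get-contexts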
+ # Check that we can access the K3s cluster. Verify that postgres and minio are running. + kubectl get pod -n flyte + # NAME READY STATUS RESTARTS AGE + # flyte-sandbox-docker-registry-85745c899d-dns8q 1/1 Running 0 5m + # flyte-sandbox-kubernetes-dashboard-6757db879c-wl4wd 1/1 Running 0 5m + # flyte-sandbox-proxy-d95874857-2wc5n 1/1 Running 0 5m + # flyte-sandbox-minio-645c8ddf7c-sp6cc 1/1 Running 0 5m + # flyte-sandbox-postgresql-0 1/1 Running 0 5m + + +**3. Run all Flyte components (flyteadmin, flytepropeller, datacatalog, flyteconsole, etc.) in a single binary.** + +The `Flyte repository `__ includes Go code +that integrates all Flyte components into a single binary. + +.. code:: shell + + # Step1: Clone flyte repo + git clone https://github.com/flyteorg/flyte.git + cd flyte + + # Step2: Build a single binary that bundles all the Flyte components. + # The version of each component/library used to build the single binary is defined in `go.mod`. + sudo apt-get -y install jq # You may need to install jq + go mod tidy + make compile + + # Step3: Edit the config file: ./flyte-single-binary-local.yaml. + # Replace occurrences of $HOME with the actual path of your home directory. + sedi=(-i) + case "$(uname)" in + Darwin*) sedi=(-i "") + esac + sed "${sedi[@]}" -e "s|\$HOME|${HOME}|g" flyte-single-binary-local.yaml + + # Step4: Prepare a namespace template for the cluster resource controller. + # The configuration file "flyte-single-binary-local.yaml" has an entry named cluster_resources.templatePath. + # This entry needs to direct to a directory containing the templates for the cluster resource controller to use. + # We will now create a simple template that allows the automatic creation of required namespaces for projects. + # For example, with Flyte's default project "flytesnacks", the controller will auto-create the following namespaces: + # flytesnacks-staging, flytesnacks-development, and flytesnacks-production. + mkdir $HOME/.flyte/cluster-resource-templates/ + echo "apiVersion: v1 + kind: Namespace + metadata: + name: '{{ namespace }}'" > $HOME/.flyte/cluster-resource-templates/namespace.yaml + + # Step5: Run the single binary. + # The POD_NAMESPACE environment variable is necessary for the webhook to function correctly. + # You may encounter an error due to `ERROR: duplicate key value violates unique constraint`. Running the command again will solve the problem. + POD_NAMESPACE=flyte ./flyte start --config flyte-single-binary-local.yaml + # All logs from flyteadmin, flyteplugins, flytepropeller, etc. will appear in the terminal. + + +**4. Build the single binary with your own code.** + + +The following instructions provide guidance on how to build the single binary with your customized code, using ``flyteadmin`` as an example. + + +- **Note** Although we'll use ``flyteadmin`` as an example, these steps can be applied to other Flyte components or libraries as well. + ``{flyteadmin}`` below can be substituted with other Flyte components/libraries: ``flyteidl``, ``flyteplugins``, ``flytepropeller``, ``datacatalog``, or ``flytestdlib``. +- **Note** If you want to learn how Flyte compiles those components and how to replace the repositories, you can study how ``go mod edit`` works. + +.. code:: shell + + # Step1: Install Go. Flyte uses Go 1.19, so make sure to switch to Go 1.19.
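+ # (Optional) Check which Go toolchain is currently on your PATH before switching. + go version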
+ export PATH=$PATH:$(go env GOPATH)/bin + go install golang.org/dl/go1.19@latest + go1.19 download + export GOROOT=$(go1.19 env GOROOT) + export PATH="$GOROOT/bin:$PATH" + + # You may need to install goimports to fix lint errors. + # Refer to https://pkg.go.dev/golang.org/x/tools/cmd/goimports + go install golang.org/x/tools/cmd/goimports@latest + export PATH=$(go env GOPATH)/bin:$PATH + + # Step2: Go to the {flyteadmin} repository, modify the source code accordingly. + cd flyte/flyteadmin + + # Step3: Now, you can build the single binary. Go back to Flyte directory. + go mod tidy + make compile + POD_NAMESPACE=flyte ./flyte start --config flyte-single-binary-local.yaml + +**5. Test by running a hello world workflow.** + + +.. code:: shell + + # Step1: Install flytekit + pip install flytekit && export PATH=$PATH:/home/ubuntu/.local/bin + + # Step2: Run a hello world example + pyflyte run --remote https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/basics/basics/hello_world.py hello_world_wf + # Go to http://localhost:30080/console/projects/flytesnacks/domains/development/executions/fd63f88a55fed4bba846 to see execution in the console. + # You can go to the [flytesnacks repository](https://github.com/flyteorg/flytesnacks) to see more useful examples. + +**6. Tear down the k3s cluster after finishing developing.** + + +.. code:: shell + + flytectl demo teardown + # context removed for "flyte-sandbox". + # 🧹 🧹 Sandbox cluster is removed successfully. + # ❇️ Run the following command to unset sandbox environment variables for accessing flytectl + # unset FLYTECTL_CONFIG + +How to setup dev environment for flytekit? +******************************************* + +**1. Set up a local Flyte cluster.** + + +If you are also modifying the code for flyteidl, flyteadmin, flyteplugins, flytepropeller, datacatalog, or flytestdlib, +refer to the instructions in the `previous section <#how-to-setup-dev-environment-for-flyteidl-flyteadmin-flyteplugins-flytepropeller-datacatalog-and-flytestdlib>`__ to set up a local Flyte cluster. + +If not, you can start all the backends with a single command. + +.. code:: shell + + # Step1: Install the latest version of flytectl, a portable and lightweight command-line interface to work with Flyte. + curl -sL https://ctl.flyte.org/install | bash + # flyteorg/flytectl info checking GitHub for latest tag + # flyteorg/flytectl info found version: 0.6.39 for v0.6.39/Linux/x86_64 + # flyteorg/flytectl info installed ./bin/flytectl + + # Step2: Export flytectl path based on the previous log "flyteorg/flytectl info installed ./bin/flytectl" + export PATH=$PATH:/home/ubuntu/bin # replace with your path + + # Step3: Start the Flyte demo cluster. This will set up a k3s cluster running minio and postgres Pods, and all Flyte components: flyteadmin, flyteplugins, flytepropeller, etc. + # See https://docs.flyte.org/projects/flytectl/en/latest/gen/flytectl_demo_start.html for more details. + flytectl demo start + # 👨‍💻 Flyte is ready! Flyte UI is available at http://localhost:30080/console 🚀 🚀 🎉 + # ❇️ Run the following command to export demo environment variables for accessing flytectl + # export FLYTECTL_CONFIG=/home/ubuntu/.flyte/config-sandbox.yaml + # 🐋 Flyte sandbox ships with a Docker registry. Tag and push custom workflow images to localhost:30000 + # 📂 The Minio API is hosted on localhost:30002. Use http://localhost:30080/minio/login for Minio console + +**2. Run workflow locally.** + + +..
code:: shell + + # Step1: Build a virtual environment for developing Flytekit. This will allow your local changes to take effect when the same Python interpreter runs `import flytekit`. + git clone https://github.com/flyteorg/flytekit.git # replace with your own repo + cd flytekit + virtualenv ~/.virtualenvs/flytekit + source ~/.virtualenvs/flytekit/bin/activate + make setup + pip install -e . + + # If you are also developing the plugins, consider the following: + + # Installing Specific Plugins: + # If you wish to use only a few plugins, you can install them individually. + # Take [Flytekit BigQuery Plugin](https://github.com/flyteorg/flytekit/tree/master/plugins/flytekit-bigquery#flytekit-bigquery-plugin) for example: + # You have to go to the bigquery plugin folder and install it. + cd plugins/flytekit-bigquery/ + pip install -e . + # Now you can use the bigquery plugin, and the performance is fast. + + # (Optional) Installing All Plugins: + # If you wish to install all available plugins, you can execute the command below. + # However, it's not typically recommended because the current version of plugins does not support + # lazy loading. This can lead to a slowdown in the performance of your Python engine. + cd plugins + pip install -e . + # Now you can use all plugins, but the performance is slow. + + # Step2: Modify the source code for flytekit, then run unit tests and lint. + make lint + make test + + # Step3: Run a hello world sample to test locally + pyflyte run https://raw.githubusercontent.com/flyteorg/flytesnacks/master/examples/basics/basics/hello_world.py hello_world_wf + # Running hello_world_wf() hello world + +**3. Run workflow in sandbox.** + + +Before running your workflow in the sandbox, make sure you're able to successfully run it locally. +To deploy the workflow in the sandbox, you'll need to build a Flytekit image. +Create a Dockerfile in your Flytekit directory with the minimum required configuration to run a task, as shown below. +If your task requires additional components, such as plugins, you may find it useful to refer to the construction of the `official flytekit image `__. + +.. code:: Dockerfile + + FROM python:3.9-slim-buster + USER root + WORKDIR /root + ENV PYTHONPATH /root + RUN apt-get update && apt-get install build-essential -y + RUN apt-get install git -y + # The following line is an example of how to install your modified plugins. In this case, it demonstrates how to install the 'deck' plugin. + # RUN pip install -U git+https://github.com/Yicheng-Lu-llll/flytekit.git@"demo#egg=flytekitplugins-deck-standard&subdirectory=plugins/flytekit-deck-standard" # replace with your own repo and branch + RUN pip install -U git+https://github.com/Yicheng-Lu-llll/flytekit.git@demo # replace with your own repo and branch + ENV FLYTE_INTERNAL_IMAGE "localhost:30000/flytekit:demo" # replace with your own image name and tag + +The instructions below explain how to build the image, push the image to +the Flyte cluster, and finally submit the workflow. + +.. code:: shell + + # Step1: Ensure you have pushed your changes to the remote repo + # In the flytekit folder + git add . && git commit -s -m "develop" && git push + + # Step2: Build the image + # In the flytekit folder + export FLYTE_INTERNAL_IMAGE="localhost:30000/flytekit:demo" # replace with your own image name and tag + docker build --no-cache -t "${FLYTE_INTERNAL_IMAGE}" -f ./Dockerfile .
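+ + # (Optional) Sanity-check that the image was built and tagged as expected before pushing. + docker image inspect "${FLYTE_INTERNAL_IMAGE}" > /dev/null && echo "image OK"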
+
+How to setup dev environment for flyteconsole?
+**********************************************
+
+**1. Set up local Flyte cluster.**
+
+Depending on your needs, refer to one of the following guides to set up the Flyte cluster:
+
+- If you do not need to change the backend code, refer to the section on `How to Set Up a Dev Environment for Flytekit? <#how-to-setup-dev-environment-for-flytekit>`__
+- If you need to change the backend code, refer to the section on `How to setup dev environment for flyteidl, flyteadmin, flyteplugins, flytepropeller, datacatalog and flytestdlib? <#how-to-setup-dev-environment-for-flyteidl-flyteadmin-flyteplugins-flytepropeller-datacatalog-and-flytestdlib>`__
+
+
+**2. Start flyteconsole.**
+
+
+.. code:: shell
+
+   # Step1: Clone the repo and navigate to the Flyteconsole folder
+   git clone https://github.com/flyteorg/flyteconsole.git
+   cd flyteconsole
+
+   # Step2: Install Node.js 18. Refer to https://github.com/nodesource/distributions/blob/master/README.md#using-ubuntu-2.
+   curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash - &&\
+   sudo apt-get install -y nodejs
+
+   # Step3: Install yarn. Refer to https://classic.yarnpkg.com/lang/en/docs/install/#debian-stable.
+   curl -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
+   echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
+   sudo apt update && sudo apt install yarn
+
+   # Step4: Add environment variables
+   export BASE_URL=/console
+   export ADMIN_API_URL=http://localhost:30080
+   export DISABLE_AUTH=1
+   export ADMIN_API_USE_SSL="http"
+
+   # Step5: Generate an SSL certificate
+   # Note: since we will use HTTP, SSL is not required. However, a missing SSL certificate will cause an error when starting Flyteconsole.
+   make generate_ssl
+
+   # Step6: Install node packages
+   yarn install
+   yarn build:types # It is fine if you see the error `Property 'at' does not exist on type 'string[]'`
+   yarn run build:prod
+
+   # Step7: Start flyteconsole
+   yarn start
+
+**3. Install the Chrome plugin:** `Moesif Origin & CORS Changer `__.
+
+
+We need to disable `CORS `__ to load resources.
+
+::
+
+   1. Activate the plugin (toggle to "on")
+   2. Open 'Advanced Settings'
+   3. Set Access-Control-Allow-Credentials to true
+
+**4. Go to** http://localhost:3000/console/.
+
+
+How to access Flyte UI, minio, postgres, k3s, and endpoints?
+*************************************************************************
+
+
+This section presumes a local Flyte cluster is already set up. If it isn't, refer to either:
+
+- `How to setup dev environment for flytekit? <#how-to-setup-dev-environment-for-flytekit>`__
+- `How to setup dev environment for flyteidl, flyteadmin, flyteplugins, flytepropeller, datacatalog and flytestdlib? <#how-to-setup-dev-environment-for-flyteidl-flyteadmin-flyteplugins-flytepropeller-datacatalog-and-flytestdlib>`__
+
+
+**1. Access the Flyte UI.**
+
+
+`Flyte UI `__ is a web-based user interface for Flyte that lets you interact with Flyte objects and build directed acyclic graphs (DAGs) for your workflows.
+
+You can access it via http://localhost:30080/console.
+
+**2. Access the minio console.**
+
+
+Core Flyte components, such as admin, propeller, and datacatalog, as well as user runtime containers rely on an object store (in this case, minio) to hold files.
+During development, you might need to examine files such as `input.pb/output.pb `__, or `deck.html `__ stored in minio.
+
+Access the minio console at: http://localhost:30080/minio/login. The default credentials are:
+
+- Username: ``minio``
+- Password: ``miniostorage``
+
+
+**3. Access the Postgres database.**
+
+
+FlyteAdmin and datacatalog use Postgres to store persistent records, and you can interact with Postgres on port ``30001``. Here is an example of using ``psql`` to connect:
+
+.. code:: shell
+
+   # Step1: Install the PostgreSQL client.
+   sudo apt-get update
+   sudo apt-get install postgresql-client
+
+   # Step2: Connect to the PostgreSQL server. The password is "postgres".
+   psql -h localhost -p 30001 -U postgres -d flyte
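+
+Once connected, you can inspect the schema directly; for example, listing FlyteAdmin's tables (the exact table set may vary across Flyte versions):
+
+.. code:: shell
+
+   # List all tables in the flyte database (password: postgres)
+   psql -h localhost -p 30001 -U postgres -d flyte -c '\dt'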
+
+
+**4. Access the k3s dashboard.**
+
+
+Access the k3s dashboard at: http://localhost:30080/kubernetes-dashboard.
+
+**5. Access the endpoints.**
+
+
+Service endpoints are defined in the ``flyteidl`` repository under the ``service`` directory. You can browse them `here `__.
+
+For example, the endpoint for the `ListTaskExecutions `__ API is:
+
+.. code:: shell
+
+   /api/v1/task_executions/{node_execution_id.execution_id.project}/{node_execution_id.execution_id.domain}/{node_execution_id.execution_id.name}/{node_execution_id.node_id}
+
+You can access this endpoint at:
+
+.. code:: shell
+
+   # replace with your specific task execution parameters
+   http://localhost:30080/api/v1/task_executions/flytesnacks/development/fe92c0a8cbf684ad19a8/n0?limit=10000
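+
+The same endpoint can be exercised from the command line, for example (``jq`` is optional, for readable output):
+
+.. code:: shell
+
+   # Fetch the task executions for one node of a workflow execution
+   curl -s "http://localhost:30080/api/v1/task_executions/flytesnacks/development/fe92c0a8cbf684ad19a8/n0?limit=10000" | jq .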
+
+
+๐Ÿž File an issue
+================
+
+We use `GitHub Issues `__ for issue tracking. The following issue types are available:
+
+* `Plugin Request `__
+* `Bug Report `__
+* `Documentation Bug/Update Request `__
+* `Core Feature Request `__
+* `Flytectl Feature Request `__
+* `Housekeeping `__
+* `UI Feature Request `__
+
+If none of the above fit your requirements, file a `blank `__ issue.
+Also, add relevant labels to your issue. For example, if you are filing a Flytekit plugin request, add the ``flytekit`` label.
+
+For feedback at any point in the contribution process, feel free to reach out to us on `Slack `__.
diff --git a/docs/community/index.rst b/docs/community/index.rst
new file mode 100644
index 0000000000..c2ee55ae23
--- /dev/null
+++ b/docs/community/index.rst
@@ -0,0 +1,127 @@
+.. _community:
+
+##########
+Community
+##########
+
+Flyte is an ambitious open source project and would not be possible without an
+amazing community. We are a completely open community and strive to treat
+every member with respect. You will find the community welcoming and responsive!
+
+Please join us on:
+
+.. image:: https://img.shields.io/badge/Slack-Chat-pink?style=for-the-badge
+   :target: https://slack.flyte.org
+   :alt: Flyte Slack
+
+.. image:: https://img.shields.io/badge/Github-Discussion-green?style=for-the-badge
+   :target: https://github.com/flyteorg/flyte/discussions
+   :alt: Github Discussion
+
+.. image:: https://img.shields.io/badge/Twitter-Social-blue?style=for-the-badge
+   :target: https://twitter.com/flyteorg
+   :alt: Twitter
+
+.. image:: https://img.shields.io/badge/LinkedIn-Social-lightblue?style=for-the-badge
+   :target: https://www.linkedin.com/groups/13962256
+   :alt: LinkedIn
+
+
+Open Source Community Meeting
+-----------------------------
+
+When: every other Tuesday, 9:00 AM Pacific Time.
+You're welcome to join and learn from other community members sharing their experiences with Flyte or any other technology from the AI ecosystem.
+Check out the event details and add it to your `calendar `_, or just pop in!
+
+.. image:: https://img.shields.io/badge/Join-Zoom-blue?style=for-the-badge
+   :target: https://www.addevent.com/event/EA7823958
+   :alt: Zoom Link
+
+Office Hours
+------------
+
+`Book a 30-minute session `_ with a Flyte maintainer and get your questions answered!
+
+Schedule your session depending on the topic to secure the availability of a maintainer with expertise in the area:
+
+- **7:00a.m. PT**:
+  - Anything flytekit-related
+  - Flyte releases
+  - flytepropeller features
+  - Plugin implementation
+  - Platform configuration
+- **1:00p.m. PT**:
+  - Flyte deployment, auth
+- **9:00p.m. PT**:
+  - Flytekit-related
+  - Use cases
+  - Getting started (workflow onboarding)
+  - Integrations
+
+
+Newsletter
+----------
+
+`Join the Flyte mailing list `_ to receive the monthly newsletter.
+
+
+Slack guidelines
+-----------------
+
+Flyte strives to build and maintain an open, inclusive, productive and self-governing open source community. Consequently,
+we expect all community members to respect the following guidelines:
+
+Abide by the `LF's Code of Conduct `__
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+As a Linux Foundation project, we must enforce the rules that govern professional and positive open source communities.
+
+Avoid using DMs and @mentions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Whenever possible, post your questions and responses in public channels so other Community Members can benefit from the conversation and outcomes.
+The exception is when you need to share private or sensitive information. In such a case, the outcome should still be shared publicly.
+Limit the use of @mentions of other Community Members to be considerate of notification noise.
+
+Make use of threads
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Threads help us keep conversations contained and organized, reducing the time it takes to give you the support you need.
+
+Thread best practices:
+
+- Don't break your question into multiple messages. Put everything in one.
+- For long questions, write a few sentences in the first message, and put the rest in a thread.
+- If there's a code snippet (more than 5 lines of code), put it inside the thread.
+- Avoid using the โ€œAlso send to channelโ€ feature unless it's really necessary.
+- If you have multiple distinct questions, break them into separate messages, so each can be answered in its own thread.
+
+
+Do not post the same question across multiple channels
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If you think a question needs to be shared on other channels, ask it once and then indicate explicitly that you're cross-posting.
+
+If you're having a tough time getting the support you need (or aren't sure where to go!), please DM @David Espejo (he/him) or @Samhita Alla for support.
+
+Do not solicit members of our Slack
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The Flyte Community exists to collaborate with, learn from, and support one another.
It is not a space to pitch your products or services directly to our members via public channels, private channels, or direct messages. + +We are excited to have a growing presence from vendors to help answer questions from Community Members as they may arise, but we have a strict 3-strike policy against solicitation: + +- First occurrence: We'll give you a friendly but public reminder that the behavior is inappropriate according to our guidelines. +- Second occurrence: We'll send you a DM warning that any additional violations will result in removal from the community. +- Third occurrence: We'll delete or ban your account. + +We reserve the right to ban users without notice if they are clearly spamming our Community Members. + +If you want to promote a product or service, go to the #shameless-promotion channel and make sure to follow these rules: + +- Don't post more than two promotional posts per week +- Non-relevant topics aren't allowed + +Messages that don't follow these rules will be deleted. + + diff --git a/docs/community/roadmap.rst b/docs/community/roadmap.rst new file mode 100644 index 0000000000..3e6bc3f5ae --- /dev/null +++ b/docs/community/roadmap.rst @@ -0,0 +1,145 @@ +.. _community_roadmap: + +############### +Roadmap +############### + +How the Community Works +======================= +Flyte is actively used in production at multiple companies. We pride ourselves on being extremely customer-focused, and care about providing a high quality customer experience. We therefore always +prioritize stability, reliability, observability and maintainability over raw feature development. + +Features are usually developed in response to specific use cases and user scenarios. That being said, we are proactively thinking about the evolution of the system and how we want to keep adapting to changing requirements. Thus most of our changes reflect future development scenarios, and in +cases where we feel rapid prototyping would enable us to discover potential pitfalls or uncover hidden use cases, we would proactively develop features behind feature flags. + +It is extremely important to let the community know about your use cases, so that we adapt parts of Flyte to meet those requirements. We welcome collaboration and contributions, but please follow our `Contribution Guidelines `_. The quarterly planning meeting is also hosted publicly, please see more below. + + +Milestones and Release Processes +================================ +Flyte consists of many components and services. Each service is independently iterated and coordinated by maintaining backwards compatible contracts using Protobuf messages defined in `FlyteIDL `__. + +Release Cadence +--------------- +We aim to release Flyte quarterly, with the understanding that rather than being tied strictly to the calendar, we aim to have substantial features, improvements, and bug fixes at each quarter. If features slated for a given release are delayed, then the release will be delayed as well. The increased time will also give the Flyte development team more time to beta test each feature and release. + +Versioning Scheme +----------------- +*Please keep in mind the CI work to implement this scheme is still in progress* + +At each quarterly release, major components of Flyte and the Flyte repository itself will be released with an incremented minor version number and the version number will be aligned across those components. The major version number will remain ``1`` for the foreseeable future. 
That is, if the current version of Flyte is ``1.2.x``, the next release will be ``1.3.0`` for Flyte and the major components.
+
+After each version is released, merges to master will be assigned beta releases of the next release version. That is, if ``flytepropeller`` version ``v1.2.0`` was just released, the next merge to master will be tagged ``v1.3.0b0``.
+
+Not strictly forcing a time constraint on the Flyte release cycle means that if a substantial number of changes are merged, perhaps due to a security issue or just a rapid pace of feature development, we can always move the release timeline up.
+
+Components with versions aligned
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+* Propeller
+* Admin
+* Console
+* datacatalog
+* flytectl
+* flytesnacks
+* Flytekit
+* flytekit-java
+
+The last two are tied together for now, though we may want to unpin them in the future.
+
+Components versioned independently
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+* flyteidl
+* flytestdlib
+* flyteplugins
+* flytecopilot
+
+Helm Charts
+^^^^^^^^^^^
+Helm charts deserve a special mention here. Unlike the other components, which will have patch versions that differ, the Flyte release version and the Helm chart version will always be identical down to the patch. That is, a Flyte release is a Helm release and vice-versa.
+
+Release Branches and Patching
+-----------------------------
+After each minor release, a release branch will be created. There will be no alignment of patch versions across the components. That is, by the end of the ``1.3.x`` release cycle, ``flyteadmin`` may be on ``1.3.8`` and ``flytepropeller`` may be on ``1.3.2``.
+
+When developing bug fixes, by default we will continue to develop off of master, which will not be the stable branch. After such bug fixes are merged, it will be the responsibility of the developer to ensure that the patches are also applied to prior releases. At the current time, we propose only supporting one release back (two for security patches). That is, if ``flytepropeller`` has a bug fix that results in ``v1.3.0b0``, that patch will be applied to the ``v1.2.x`` release, but not the ``v1.1.x`` release.
+
+Beta Patch Releases
+^^^^^^^^^^^^^^^^^^^
+We also propose that beta patch versions be merged into the release branch when patching prior releases. For example, assuming no patches have yet been made to the ``v1.2.0`` release, when porting a bug fix that resulted in ``v1.3.0b0`` onto the ``release-v1.2`` branch, the developer can first release ``v1.2.1b0`` for testing into ``release-v1.2`` before releasing the ``v1.2.1`` release. Such beta releases should be made at the discretion of the developer.
+
+Whether or not a patch version of any of the Flyte components also creates a Flyte patch release is likewise left to the discretion of the developer.
+
+Documentation Versioning
+------------------------
+We also currently have an issue with our documentation versioning. While our readthedocs page does have versioning enabled and we publish the `docs version <https://github.com/flyteorg/flyte/blob/80c098f10334b1c916d1e4274ab9f204152d9d80/rsts/conf.py#L33>`__, all the `intersphinx mappings <https://github.com/flyteorg/flyte/blob/80c098f10334b1c916d1e4274ab9f204152d9d80/rsts/conf.py#L219>`__ just point to ``latest``. Keep in mind that this mapping not only exists in this ``flyte`` repo, but also in all the other repos that the mapping points to.
That is, to maintain an accurate mapping of different versions of documentation, we'll need to update the mapping in all the repos.
+
+To remediate this, we propose the following:
+
+* Documentation should be pinned only to Major.Minor on all the repos that have their versions "aligned".
+
+  * This means that if we're on v1.1, for instance, then as we release patch versions of Admin, Propeller, etc., the v1.1 listing on readthedocs will automatically pick up changes to Admin code and auto-generated documentation.
+* Repos that are not aligned will just default to the "latest" documentation version.
+
+Planning Process
+================
+
+Quarterly Planning
+------------------
+Members of the community should feel free to join these! Core members of the Flyte team will come prepared with general initiatives in mind. We will use these meetings to prioritize these ideas, assess community interest and impact, and decide what goes into the GitHub milestone for the next release. Members of the community looking to contribute should also join. Please look for this meeting invite on the calendar - it may not be set up as a recurring meeting simply because it will likely change by a few days each quarter.
+
+Change Management
+------------------
+To ensure that changes are trackable and the history is explainable, we use a slightly cumbersome but helpful process, with the following immediate goals:
+
+- Every PR is associated with an issue (automatic searchable documentation)
+- Large PRs are associated with Proposals
+- Every major change is associated with documentation
+- Owner files exist for all repositories
+
+Issue Lifecycle
+---------------
+- Incoming issues are tagged automatically as untriaged.
+- Periodically, members of the Flyte community will meet to triage incoming issues. We aim to do this on a weekly basis.
+- During this meeting we'll attempt to assign each issue to a milestone. Some issues, however, will need to be investigated before we can fully assess them.
+- Once an issue is assigned to a milestone, we are committed to delivering it in that release. This means the burden for adding something to the milestone is relatively high. Issues that slip should only slip for good reason.
+
+Browse Features and Issues
+============================
+
+Issues by Theme
+----------------
+
++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+
+| Theme       | Description                                                    | Open Issues                                                                           | Comment                                                                                                      |
++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+
+| Bugs        | Currently known and open bugs.                                 | `Bugs `_                                                                              | We are always working on bugs. Open a new one `here `_.                                                      |
++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+
+| Security    | Issues related to security enhancements.
| `Security issues `_ | | ++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ +| Docs | All issues open with our documentation | `Docs issues `_ | Starting Feb 2021, we will be completely overhauling our docs. Feedback appreciated! | ++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ +| Features | All new features in development | `Features issues `_ | | ++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ +| Plugins | New capabilities and plugins that are built into Flyte. | `Plugins issues `_ | This is one of the best places to get started contributing to Flyte. Issues with both | +| | These could be hosted services, K8s native execution, etc. | | `plugins` and `flytekit` labels refer to purely client-side plugins and are the fastest to contribute to. | ++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ +| Scale | These issues deal with performance, reliability and | `Scale issues `_ | We are always working on these issues and we would love to hear feedback about what you | +| | scalability of Flyte | | would want to change or what we should prioritize. | ++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ +| Contribute | If you are looking to contribute and want a great first issue, | `Contribute issues `_ | These are the best issues to get started with. | +| | check out these issues | | | ++-------------+----------------------------------------------------------------+---------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------+ + + +Issues by Components +--------------------- + ++---------------+---------------------------------------+------------------------------------------------------------------------+ +| Theme | Description | Open Issues | ++===============+=======================================+========================================================================+ +| Flyte Console | Issues concerning our web UI. | `Flyte Console issues `_ | ++---------------+---------------------------------------+------------------------------------------------------------------------+ +| Flytectl | Issues concerning our standalone CLI. 
| `Flytectl issues `_                                                    |
++---------------+---------------------------------------+------------------------------------------------------------------------+
+
+For an overview of what we're currently working on, check out our `live roadmap `__.
+
diff --git a/docs/community/troubleshoot.rst b/docs/community/troubleshoot.rst
new file mode 100644
index 0000000000..b4f6c271d4
--- /dev/null
+++ b/docs/community/troubleshoot.rst
@@ -0,0 +1,135 @@
+.. _troubleshoot:
+
+=====================
+Troubleshooting Guide
+=====================
+
+.. tags:: Troubleshoot, Basic
+
+The content in this section will help Flyte users isolate the most probable causes for some of the common issues that could arise while getting started with the project.
+
+Before getting started, collect the following information from the underlying infrastructure:
+
+- Capture the ``Status`` column from the output of:
+
+.. prompt:: bash $
+
+   $ kubectl describe pod <pod-name> -n <namespace>
+
+Where ``<pod-name>`` will typically correspond to the node execution string that you can find in the UI.
+
+- Pay close attention to the ``Events`` section in the output.
+- Also, collect the logs from the Pod:
+
+.. prompt:: bash $
+
+   $ kubectl logs pods <pod-name> -n <namespace>
+
+Where ``<namespace>`` will typically correspond to the Flyte ``<project>-<domain>``, e.g. flytesnacks-development.
+
+Depending on the contents of the logs or the ``Events``, you can try different things:
+
+Debugging common execution errors
+----------------------------------
+
+``message: '0/1 nodes are available: 1 Insufficient cpu. preemption: 0/1 nodes are available: 1 No preemption victims found for incoming pod.'``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This issue is more common on macOS devices. Make sure that your Docker daemon has allocated a minimum of 4 CPU cores and 3GB of RAM.
+
+``terminated with exit code (137). Reason [OOMKilled]``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- For a single binary environment deployed with the Helm chart, make sure you are using `the most recent charts `_.
+
+- For EKS deployments, you can adjust resource limits and requests in the `inline `_ section of the ``eks-production.yaml`` file. Example:
+
+.. code-block:: yaml
+
+   inline:
+     task_resources:
+       defaults:
+         cpu: 100m
+         memory: 100Mi
+         storage: 100Mi
+       limits:
+         memory: 1Gi
+
+- Also, the default container resource limits can be overridden from the task itself:
+
+.. code-block:: python
+
+   from flytekit import Resources, task
+
+   @task(limits=Resources(mem="256Mi"))
+   def your_task() -> None:
+       ...
+
+``Error: ImagePullBackOff``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- If your environment requires the use of a network proxy, use the ``--env`` option when starting the sandbox and pass the proxy configuration:
+
+.. prompt:: bash $
+
+   $ flytectl demo start --env HTTP_PROXY=<your-proxy-address>
+
+- If you're building a custom Docker image, make sure to use a tag other than ``latest``. Otherwise, the Kubernetes default pull policy will be changed from ``IfNotPresent`` to ``Always``, forcing an image pull with every Pod deployment.
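+
+For example, when building a custom image for the sandbox, give it an explicit tag and push it to the bundled registry (the image name and tag here are illustrative):
+
+.. prompt:: bash $
+
+   $ # Any tag other than "latest" keeps the default pull policy at IfNotPresent
+   $ docker build -t localhost:30000/myapp:v1 .
+   $ docker push localhost:30000/myapp:v1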
+
+Issues running workloads
+-------------------------
+
+``OPENSSL_internal:WRONG_VERSION_NUMBER``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- For ``flyte-binary``: make sure that the endpoint name you have set in your ``config.yaml`` file is included in the DNS names of the installed SSL certificate (whether self-signed or issued by a Certificate Authority).
+- For ``sandbox``: verify that the ``FLYTECTL_CONFIG`` environment variable has the correct value by running:
+
+.. prompt:: bash $
+
+   $ export FLYTECTL_CONFIG=~/.flyte/config-sandbox.yaml
+
+``ModuleNotFoundError``
+^^^^^^^^^^^^^^^^^^^^^^^
+
+- If you're using a custom container image and using Docker, make sure your ``Dockerfile`` is located at the same level as the ``flyte`` directory and that there is an empty ``__init__.py`` file in your project's folders:
+
+.. code-block:: text
+
+   myflyteapp
+   โ”œโ”€โ”€ Dockerfile
+   โ”œโ”€โ”€ docker_build_and_tag.sh
+   โ”œโ”€โ”€ flyte
+   โ”‚   โ”œโ”€โ”€ __init__.py
+   โ”‚   โ””โ”€โ”€ workflows
+   โ”‚       โ”œโ”€โ”€ __init__.py
+   โ”‚       โ””โ”€โ”€ example.py
+   โ””โ”€โ”€ requirements.txt
+
+``An error occurred (AccessDenied) when calling the PutObject operation`` in an EKS deployment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Make sure that the Kubernetes service account Flyte is using has the annotation that refers to the IAM role it is connected to:
+
+.. prompt:: bash $
+
+   $ kubectl describe sa <service-account-name> -n <namespace>
+
+Example output:
+
+.. code-block:: text
+
+   Name:                <service-account-name>
+   Namespace:           flyte
+   Labels:              app.kubernetes.io/managed-by=eksctl
+   Annotations:         eks.amazonaws.com/role-arn: arn:aws:iam::<account-id>:role/flyte-system-role
+   Image pull secrets:  <none>
+   Mountable secrets:   <none>
+   Tokens:              <none>
+   Events:              <none>
+
+- Otherwise, obtain your IAM role's ARN and manually annotate the service account:
+
+.. prompt:: bash $
+
+   $ kubectl annotate serviceaccount <service-account-name> -n <namespace> eks.amazonaws.com/role-arn=arn:aws:iam::xxxx:role/<role-name>
+
+- Refer to this community-maintained `guide `_ for further information about Flyte deployment on EKS.
diff --git a/docs/concepts/admin.rst b/docs/concepts/admin.rst
new file mode 100644
index 0000000000..4e6ee67a8e
--- /dev/null
+++ b/docs/concepts/admin.rst
@@ -0,0 +1,500 @@
+.. _divedeep-admin:
+
+##########
+FlyteAdmin
+##########
+
+.. tags:: Advanced, Design
+
+Admin Structure
+===============
+
+FlyteAdmin serves as the main Flyte API to process all client requests to the system. Clients include:
+
+1. FlyteConsole, which calls FlyteAdmin to list the workflows, get execution details, etc.
+2. Flytekit, which in turn calls FlyteAdmin to register, launch workflows, etc.
+
+Below, we'll dive into each component defined in admin in more detail.
+
+RPC
+---
+
+FlyteAdmin uses the `grpc-gateway `__ library to serve incoming gRPC and HTTP requests with identical handlers.
+Refer to the admin service :std:ref:`definition ` for a detailed API overview, including request and response entities.
+The RPC handlers are thin shims that enforce request structure validation and call out to the appropriate :ref:`manager ` methods to process requests.
+
+You can find a detailed explanation of the service in the :ref:`admin service ` page.
+
+.. _divedeep-admin-manager:
+
+Managers
+--------
+
+The Admin API is broken up into entities:
+
+- Executions
+- Launch plans
+- Node Executions
+- Projects (and their respective domains)
+- Task Executions
+- Tasks
+- Workflows
+
+Each API entity has an entity manager in FlyteAdmin responsible for implementing business logic for the entity.
+Entity managers handle full validation for create, update, and get requests, as well as
+data persistence in the backing store (see the :ref:`divedeep-admin-repository` section).
+
+
+Additional Components
++++++++++++++++++++++
+
+The managers utilize additional components to process requests. These additional components include:
+
+- :ref:`workflow engine `: compiles workflows and launches workflow executions from launch plans.
+- :ref:`data ` (remote cloud storage): offloads data blobs to the configured cloud provider.
+- :ref:`runtime `: loads values from a config file to assign task resources, initialization values, execution queues, and more.
+- :ref:`async processes `: provides functions to schedule and execute the workflows as well as enqueue and trigger notifications.
+
+.. _divedeep-admin-repository:
+
+Repository
+----------
+Serialized entities (tasks, workflows, launch plans) and executions (workflow-, node- and task-) are stored as protos defined
+`here `__.
+We use the excellent `gorm `__ library to interface with our database, which currently supports a Postgres
+implementation. You can find the actual code for issuing queries with gorm in the
+`gormimpl `__ directory.
+
+Models
+++++++
+Database models are defined in the `models `__ directory and correspond 1:1 with the database tables [0]_.
+
+The full set of database tables includes:
+
+- executions
+- execution_events
+- launch_plans
+- node_executions
+- node_execution_events
+- tasks
+- task_executions
+- workflows
+
+These database models inherit primary keys and indexes as defined in the corresponding `models `__ file.
+
+The repositories code also includes `transformers `__.
+These convert entities from the database format to a response format for the external API.
+If you change either of these structures, you must change the corresponding transformers too.
+
+
+.. _divedeep-admin-async:
+
+Component Details
+=================
+
+This section dives into the details of each top-level directory defined in ``pkg/``.
+
+Asynchronous Components
+-----------------------
+
+Notifications and schedules are handled by async routines that are responsible for enqueuing and subsequently processing dequeued messages.
+
+FlyteAdmin uses the `gizmo toolkit `__ to abstract the queueing implementation. Gizmo's
+`pubsub `__ library offers implementations for Amazon SNS/SQS, Google Pubsub, Kafka topics, and publishing over HTTP.
+
+For sandbox development, no-op implementations of the notification and schedule handlers are used to remove external cloud dependencies.
+
+
+Common
+------
+
+As the name implies, ``common`` houses shared components used across different FlyteAdmin components in a single, top-level directory to avoid cyclic dependencies. These components include execution naming and phase utils, query filter definitions, query sorting definitions, and named constants.
+
+.. _divedeep-admin-data:
+
+Data
+-----
+
+Data interfaces are primarily handled by the `storage `__ library implemented in ``flytestdlib``. However, neither this nor the underlying `stow `__ library expose `HEAD `__ support. Hence, the data package in admin exists as the layer responsible for additional, remote data operations.
+
+Errors
+------
+
+The errors directory contains centrally defined errors that are designed for compatibility with gRPC statuses.
+
+.. _divedeep-admin-config:
+
+Runtime
+-------
+Values specific to the FlyteAdmin application, covering task and workflow registration as well as execution, are configured in the `runtime `__ directory.
These interfaces expose values configured in the ``flyteadmin`` top-level key in the application config.
+
+.. _divedeep-admin-workflowengine:
+
+Workflow engine
+----------------
+
+This directory contains the interfaces to build and execute workflows leveraging FlytePropeller compiler and client components.
+
+.. [0] Given the unique naming constraints, some models are redefined in `migration_models `__ to guarantee unique index values.
+
+.. _divedeep-admin-service:
+
+
+FlyteAdmin Service Background
+=============================
+
+Entities
+---------
+
+The :std:ref:`admin service definition ` defines REST operations for the entities that
+FlyteAdmin administers.
+
+As a refresher, the primary :ref:`entities ` across Flyte map to FlyteAdmin entities.
+
+Static entities
++++++++++++++++
+
+These include:
+
+- Workflows
+- Tasks
+- Launch Plans
+
+Permitted operations include:
+
+- Create
+- Get
+- List
+
+The above entities are designated by an :std:ref:`identifier `
+that consists of a project, domain, name, and version specification. These entities are, for the most part, immutable. To update one of these entities, the updated
+version must be re-registered with a unique and new version identifier attribute.
+
+One caveat is that the launch plan can toggle between :std:ref:`ACTIVE and INACTIVE ` states.
+At a given point in time, only one launch plan version across a shared {Project, Domain, Name} specification can be active. The state affects scheduled launch plans only.
+An inactive launch plan can be used to launch individual executions. However, only an active launch plan runs on a schedule (given it has a schedule defined).
+
+
+Static entities metadata (Named Entities)
++++++++++++++++++++++++++++++++++++++++++
+
+A :std:ref:`named entity ` includes metadata for one of the above entities
+(workflow, task or launch plan) across versions. It also includes a resource type (workflow, task or launch plan) and an
+:std:ref:`id ` which is composed of project, domain and name.
+The named entity also includes metadata, which are mutable attributes about the referenced entity.
+
+This metadata includes:
+
+- Description: a human-readable description for the Named Entity collection.
+- State (workflows only): this determines whether the workflow is shown on the overview list of workflows scoped by project and domain.
+
+Permitted operations include:
+
+- Create
+- Update
+- Get
+- List
+
+
+Execution entities
+++++++++++++++++++
+
+These include:
+
+- (Workflow) executions
+- Node executions
+- Task executions
+
+Permitted operations include:
+
+- Create
+- Get
+- List
+
+After an execution begins, FlytePropeller monitors the execution and sends events, which the admin uses to update the above executions.
+
+These :std:ref:`events ` include
+
+- WorkflowExecutionEvent
+- NodeExecutionEvent
+- TaskExecutionEvent
+
+and contain information about respective phase transitions, phase transition time and optional output data if the event concerns a terminal phase change.
+
+These events provide the **only** way to update an execution. No raw update endpoint exists.
+
+To track the lifecycle of an execution, the admin stores attributes such as the ``duration`` and the ``timestamp`` at which an execution transitioned to running, as well as its end time.
+
+For debugging purposes, admin also stores Workflow and Node execution events in its database, but does not currently expose them through an API. Because array tasks can yield many executions, admin does **not** store TaskExecutionEvents.
+
+
+Platform entities
++++++++++++++++++
+Projects: Like named entities, projects have mutable metadata such as human-readable names and descriptions, in addition to their unique string ids.
+
+Permitted project operations include:
+
+- Register
+- List
+
+.. _divedeep-admin-matchable-resources:
+
+Matchable resources
++++++++++++++++++++
+
+A thorough background on :ref:`matchable resources ` explains
+their purpose and application logic. As a summary, these are used to override system level defaults for Kubernetes cluster
+resource management, default execution values, and more across different levels of specificity.
+
+These entities consist of:
+
+- ProjectDomainAttributes
+- WorkflowAttributes
+
+``ProjectDomainAttributes`` configure customizable overrides at the project and domain level, and ``WorkflowAttributes`` configure customizable overrides at the project, domain and workflow level.
+
+Permitted attribute operations include:
+
+- Update (implicitly creates if there is no existing override)
+- Get
+- Delete
+
+
+Defaults
+--------
+
+Task resource defaults
+++++++++++++++++++++++
+
+User-facing documentation on configuring task resource requests and limits can be found in :std:ref:`cookbook:customizing task resources`.
+
+As a system administrator you may want to define default task resource requests and limits across your Flyte deployment.
+This can be done through the flyteadmin config.
+
+**Default** values get injected as the task requests and limits when a task definition omits a specific resource.
+**Limit** values are only used for validation. Neither a task request nor a limit can exceed the configured limit for a resource type.
+
+
+Using the Admin Service
+-----------------------
+
+Adding request filters
+++++++++++++++++++++++
+
+We use `gRPC Gateway `_ to reverse proxy HTTP requests into gRPC.
+While this allows for a single implementation for both HTTP and gRPC, an important limitation is that fields mapped to the path pattern cannot be
+repeated and must have a primitive (non-message) type. Unfortunately, this means that repeated string filters cannot use a proper protobuf message. Instead, they use
+the internal syntax shown below::
+
+    func(field,value) or func(field, value)
+
+For example, multiple filters would be appended to an HTTP request like::
+
+    ?filters=ne(version, TheWorst)+eq(workflow.name, workflow)
+
+Timestamp fields use the ``RFC3339Nano`` spec (for example: "2006-01-02T15:04:05.999999999Z07:00")
+
+The fully supported set of filter functions is:
+
+- contains
+- gt (greater than)
+- gte (greater than or equal to)
+- lt (less than)
+- lte (less than or equal to)
+- eq (equal)
+- ne (not equal)
+- value_in (for repeated sets of values)
+
+"value_in" is a special case where multiple values are passed to the filter expression. For example::
+
+    value_in(phase, RUNNING;SUCCEEDED;FAILED)
+
+.. note::
+    If you're issuing your requests over http(s), be sure to URL encode the ";" semicolon using ``%3B`` like so: ``value_in(phase, RUNNING%3BSUCCEEDED%3BFAILED)``
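+
+As a concrete sketch, here is how such a filter could be passed to the list-executions endpoint of a local sandbox deployment (the project and domain names are illustrative)::
+
+    # List up to 10 executions in the given phases; note the URL-encoded semicolons
+    curl -s "http://localhost:30080/api/v1/executions/flytesnacks/development?limit=10&filters=value_in(phase,RUNNING%3BSUCCEEDED%3BFAILED)"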
+
+Filterable fields vary based on entity types:
+
+- Task
+
+  - project
+  - domain
+  - name
+  - version
+  - created_at
+
+- Workflow
+
+  - project
+  - domain
+  - name
+  - version
+  - created_at
+
+- Launch plans
+
+  - project
+  - domain
+  - name
+  - version
+  - created_at
+  - updated_at
+  - workflows.{any workflow field above} (for example: workflow.domain)
+  - state (you must use the integer enum, for example: 1)
+  - States are defined in :std:ref:`launchplanstate `.
+
+- Named Entity Metadata
+
+  - state (you must use the integer enum, for example: 1)
+  - States are defined in :std:ref:`namedentitystate `.
+
+- Executions (Workflow executions)
+
+  - project
+  - domain
+  - name
+  - workflow.{any workflow field above} (for example: workflow.domain)
+  - launch_plan.{any launch plan field above} (for example: launch_plan.name)
+  - phase (you must use the upper-cased string name, for example: ``RUNNING``)
+  - Phases are defined in :std:ref:`workflowexecution.phase `.
+  - execution_created_at
+  - execution_updated_at
+  - duration (in seconds)
+  - mode (you must use the integer enum, for example: 1)
+  - Modes are defined in :std:ref:`executionmode `.
+  - user (authenticated user or role from flytekit config)
+
+- Node Executions
+
+  - node_id
+  - execution.{any execution field above} (for example: execution.domain)
+  - phase (you must use the upper-cased string name, for example: ``QUEUED``)
+  - Phases are defined in :std:ref:`nodeexecution.phase `.
+  - started_at
+  - node_execution_created_at
+  - node_execution_updated_at
+  - duration (in seconds)
+
+- Task Executions
+
+  - retry_attempt
+  - task.{any task field above} (for example: task.version)
+  - execution.{any execution field above} (for example: execution.domain)
+  - node_execution.{any node execution field above} (for example: node_execution.phase)
+  - phase (you must use the upper-cased string name, for example: ``SUCCEEDED``)
+  - Phases are defined in :std:ref:`taskexecution.phase `.
+  - started_at
+  - task_execution_created_at
+  - task_execution_updated_at
+  - duration (in seconds)
+
+Putting It All Together
+-----------------------
+
+If you wish to query specific executions that were launched using a specific launch plan for a workflow with specific attributes, use:
+
+::
+
+    gte(duration, 100)+value_in(phase,RUNNING;SUCCEEDED;FAILED)+eq(launch_plan.project, foo)
+    +eq(launch_plan.domain, bar)+eq(launch_plan.name, baz)
+    +eq(launch_plan.version, 1234)
+    +lte(workflow.created_at,2018-11-29T17:34:05.000000000Z07:00)
+
+
+
+Adding sorting to requests
+++++++++++++++++++++++++++
+
+Only a subset of fields are supported for sorting list queries. The explicit list is shown below:
+
+- ListTasks
+
+  - project
+  - domain
+  - name
+  - version
+  - created_at
+
+- ListTaskIds
+
+  - project
+  - domain
+
+- ListWorkflows
+
+  - project
+  - domain
+  - name
+  - version
+  - created_at
+
+- ListWorkflowIds
+
+  - project
+  - domain
+
+- ListLaunchPlans
+
+  - project
+  - domain
+  - name
+  - version
+  - created_at
+  - updated_at
+  - state (you must use the integer enum, for example: 1)
+  - States are defined in :std:ref:`launchplanstate `.
+
+- ListExecutions
+
+  - project
+  - domain
+  - name
+  - phase (you must use the upper-cased string name, for example: ``RUNNING``)
+  - Phases are defined in :std:ref:`workflowexecution.phase `.
+  - execution_created_at
+  - execution_updated_at
+  - duration (in seconds)
+  - mode (you must use the integer enum, for example: 1)
+  - Modes are defined in :std:ref:`execution.proto `.
+
+- ListNodeExecutions
+
+  - node_id
+  - retry_attempt
+  - phase (you must use the upper-cased string name, for example: ``QUEUED``)
+  - Phases are defined in :std:ref:`nodeexecution.phase `.
+  - started_at
+  - node_execution_created_at
+  - node_execution_updated_at
+  - duration (in seconds)
+
+- ListTaskExecutions
+
+  - retry_attempt
+  - phase (you must use the upper-cased string name, for example: ``SUCCEEDED``)
+  - Phases are defined in :std:ref:`taskexecution.phase `.
+  - started_at
+  - task_execution_created_at
+  - task_execution_updated_at
+  - duration (in seconds)
+
+Sorting syntax
+--------------
+
+Adding sorting to a request requires specifying the ``key``, i.e., the attribute you wish to sort on. Sorting can also optionally specify the direction (one of ``ASCENDING`` or ``DESCENDING``), where ``DESCENDING`` is the default.
+
+Example sorting HTTP parameter:
+
+::
+
+    sort_by.key=created_at&sort_by.direction=DESCENDING
+
+Alternatively, since ``DESCENDING`` is the default sorting direction, the above could be written as
+
+::
+
+    sort_by.key=created_at
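+
+Putting filters and sorting together, a request to a local sandbox deployment might look like the following (project and domain are illustrative)::
+
+    # The five most recently created executions, newest first
+    curl -s "http://localhost:30080/api/v1/executions/flytesnacks/development?limit=5&sort_by.key=created_at&sort_by.direction=DESCENDING"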
diff --git a/docs/concepts/architecture.rst b/docs/concepts/architecture.rst
new file mode 100644
index 0000000000..6727bd61db
--- /dev/null
+++ b/docs/concepts/architecture.rst
@@ -0,0 +1,121 @@
+.. _divedeep-architecture-overview:
+
+######################
+Component Architecture
+######################
+
+.. tags:: Advanced, Glossary, Design
+
+This document aims to demystify how Flyte's major components ``Flyteidl``, ``Flytekit``, ``Flytectl``, ``FlyteConsole``, ``FlyteAdmin``, ``FlytePropeller``, and ``FlytePlugins`` fit together at a high level.
+
+FlyteIDL
+========
+
+In Flyte, entities like "Workflows", "Tasks", "Launch Plans", and "Schedules" are recognized by multiple system components. For components to communicate effectively, they need a shared understanding of the structure of these entities.
+
+Flyteidl (Interface Definition Language) is where shared Flyte entities are defined. It also defines the RPC service definition for the :std:ref:`core Flyte API `.
+
+Flyteidl uses the `protobuf `_ schema to describe entities. Clients are generated for Python, Golang, and JavaScript and imported by Flyte components.
+
+
+Planes
+======
+
+Flyte components are separated into 3 logical planes. The planes are summarized and explained in detail below. The goal is that these planes can be replaced by alternate implementations.
+
++-------------------+---------------------------------------------------------------------------------------------------------------+
+| **User Plane**    | The User Plane consists of all user tools that assist in interacting with the core Flyte API.                |
+|                   | These tools include the FlyteConsole, Flytekit, and Flytectl.                                                 |
++-------------------+---------------------------------------------------------------------------------------------------------------+
+| **Control Plane** | The Control Plane implements the core Flyte API.                                                              |
+|                   | It serves all client requests coming from the User Plane.                                                     |
+|                   | It stores information such as current and past running workflows, and provides that information upon request. |
+|                   | It also accepts requests to execute workflows, but offloads the work to the Data Plane.                       |
++-------------------+---------------------------------------------------------------------------------------------------------------+
+| **Data Plane**    | The sole responsibility of the Data Plane is to fulfill workflows.                                            |
+|                   | It accepts workflow requests from the Control Plane and guides the workflow to completion,                    |
+|                   | launching tasks on a cluster of machines as necessary based on the workflow graph.
| +| | It sends status events back to the control plane so the information can be stored and surfaced to end-users. | ++-------------------+---------------------------------------------------------------------------------------------------------------+ + +.. image:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/concepts/architecture/flyte-logical-architecture.png + +User Plane +---------- + +In Flyte, workflows are represented as a Directed Acyclic Graph (DAG) of tasks. While this representation is logical for services, managing workflow DAGs in this format is a tedious exercise for humans. The Flyte User Plane provides tools to create, manage, and visualize workflows in a format that is easily digestible to the users. + +These tools include: + +Flytekit + Flytekit is an SDK that helps users design new workflows using the Python programming language. It can parse the Python code, compile it into a valid Workflow DAG, and submit it to Flyte for execution. + +FlyteConsole + FlyteConsole provides the Web interface for Flyte. Users and administrators can use the console to view workflows, launch plans, schedules, tasks, and individual task executions. The console provides tools to visualize workflows, and surfaces relevant logs for debugging failed tasks. + +Flytectl + Flytectl provides interactive access to Flyte to launch and access workflows via terminal. + + +Control Plane +------------- + +The Control Plane supports the core REST/gRPC API defined in Flyteidl. User Plane tools like FlyteConsole and Flytekit contact the control plane on behalf of users to store and retrieve information. + +Currently, the entire control plane is handled by a single service called **FlyteAdmin**. + +FlyteAdmin is stateless. It processes requests to create entities like tasks, workflows, and schedules by persisting data in a relational database. + +While FlyteAdmin serves the Workflow Execution API, it does not itself execute workflows. To launch workflow executions, FlyteAdmin sends the workflow DAG to the DataPlane. For added scalability and fault-tolerance, FlyteAdmin can be configured to load-balance workflows across multiple isolated data-plane clusters. + + +Data Plane +---------- + +The Data Plane is the engine that accepts DAGs, and fulfills workflow executions by launching tasks in the order defined by the graph. Requests to the Data Plane generally come via the control plane, and not from end-users. + +In order to support compute-intensive workflows at massive scale, the Data Plane needs to launch containers on a cluster of machines. The current implementation leverages `Kubernetes `_ for cluster management. + +Unlike the user-facing Control Plane, the Data Plane does not expose a traditional REST/gRPC API. To launch an execution in the Data Plane, you create a โ€œflyteworkflowโ€ resource in Kubernetes. +A โ€œflyteworkflowโ€ is a Kubernetes `Custom Resource `_ (CRD) created by our team. This custom resource represents the Flyte workflow DAG. + +The core state machine that processes flyteworkflows is the worker known as **FlytePropeller**. + +FlytePropeller leverages the Kubernetes `operator pattern `_. It polls the Kubernetes API, looking for newly created flyteworkflow resources. FlytePropeller understands the workflow DAG, and launches the appropriate Kubernetes pods as needed to complete tasks. It periodically checks for completed tasks, launching downstream tasks until the workflow is complete. + +**Plugins** + +Each task in a flyteworkflow DAG has a specified **type**. 
The logic for fulfilling a task is determined by its task type.
+In the basic case, FlytePropeller launches a single Kubernetes pod to fulfill a task.
+Complex task types may require workloads to be distributed across hundreds of pods.
+
+The type-specific task logic is separated into isolated code modules known as **plugins**.
+Each task type has an associated plugin that is responsible for handling tasks of its type.
+For each task in a workflow, FlytePropeller activates the appropriate plugin based on the task type in order to fulfill the task.
+
+The Flyte team has pre-built plugins for Hive, Spark, AWS Batch, and :ref:`more `.
+To support new use-cases, developers can create their own plugins and bundle them in their FlytePropeller deployment.
+
+Component Code Architecture
+===========================
+
+- :ref:`FlytePropeller `
+- :ref:`Flyte Native Scheduler `
+
+Component Code References
+=========================
+
+- `FlyteAdmin `__
+- `FlytePropeller `__
+- `DataCatalog `__
+- `FlytePlugins `__
+- `Flyte Native Scheduler `__
+
+
+.. toctree::
+   :maxdepth: 1
+   :name: component code architecture
+   :hidden:
+
+   component_architecture/flytepropeller_architecture
+   component_architecture/native_scheduler_architecture
diff --git a/docs/concepts/basics.rst b/docs/concepts/basics.rst
new file mode 100644
index 0000000000..e36f0e55cc
--- /dev/null
+++ b/docs/concepts/basics.rst
@@ -0,0 +1,24 @@
+.. _divedeep:
+
+#############
+Concepts
+#############
+
+.. toctree::
+   :maxdepth: 1
+   :name: Core Concepts
+
+   tasks
+   workflows
+   nodes
+   launchplans
+   schedules
+   registration
+   executions
+   state_machine
+   execution_timeline
+   data_management
+   flyte_console
+   catalog
+   versioning
+   workflow_lifecycle
diff --git a/docs/concepts/catalog.rst b/docs/concepts/catalog.rst
new file mode 100644
index 0000000000..8b092e73c0
--- /dev/null
+++ b/docs/concepts/catalog.rst
@@ -0,0 +1,63 @@
+.. _divedeep-catalog:
+
+What is Data Catalog?
+=====================
+
+.. tags:: Advanced, Design
+
+`DataCatalog `__ is a service to index parameterized, strongly-typed data artifacts across revisions. It allows clients to query artifacts based on meta information and tags.
+
+
+How Flyte Memoizes Task Executions on Data Catalog
+--------------------------------------------------
+
+Flyte `memoizes task executions` by creating artifacts in DataCatalog and associating meta information regarding the execution with the artifact. Let's walk through what happens when a task execution is cached on DataCatalog.
+
+Every task instance is represented as a DataSet:
+
+.. code-block:: javascript
+
+    Dataset {
+       project: Flyte project the task was registered in
+       domain: Flyte domain for the task execution
+       name: flyte_task-<task-name>
+       version: <task-version>-<task-signature-hash>-<cache-version>
+    }
+
+Every task execution is represented as an Artifact in the Dataset above:
+
+.. code-block:: javascript
+
+    Artifact {
+       id: uuid
+       Metadata: [executionName, executionVersion]
+       ArtifactData: [List of ArtifactData]
+    }
+
+
+    ArtifactData {
+       Name: <output-name>
+       value: <output-value>
+    }
+
+To retrieve the Artifact, tag the Artifact with a hash of the input values for the memoized task execution:
+
+.. code-block:: javascript
+
+    ArtifactTag {
+       Name: flyte_cached-<hash-of-input-values>
+    }
+
+When caching an execution, FlytePropeller will:
+
+1. Create a dataset for the task.
+2. Create an artifact that represents the execution, along with the artifact data that represents the execution output.
+3. Tag the artifact with a unique hash of the input values.
+
+To ensure that the task execution is memoized, FlytePropeller will:
+
+1. Compute the tag by hashing the input values.
+2. Check if a tagged artifact exists with that hash.
+
+   - If it exists, we have a cache hit and Propeller can skip the task execution.
+   - If an artifact is not associated with the tag, Propeller needs to run the task.
diff --git a/docs/concepts/component_architecture/flytepropeller_architecture.rst b/docs/concepts/component_architecture/flytepropeller_architecture.rst
new file mode 100644
index 0000000000..a04f6dbe4d
--- /dev/null
+++ b/docs/concepts/component_architecture/flytepropeller_architecture.rst
@@ -0,0 +1,81 @@
+.. _flytepropeller-architecture:
+
+###########################
+FlytePropeller Architecture
+###########################
+
+.. tags:: Advanced, Design
+
+.. note::
+   In this document, we use the term โ€œworkflowโ€ to describe a single execution of a workflow definition.
+
+Introduction
+============
+
+A Flyte :ref:`workflow ` is represented as a Directed Acyclic Graph (DAG) of interconnected Nodes. Flyte supports a robust collection of Node types to ensure diverse functionality.
+
+- ``TaskNodes`` support a plugin system to externally add system integrations.
+- Control flow can be altered during runtime using ``BranchNodes``, which prune downstream evaluation paths based on input.
+- ``DynamicNodes`` add nodes to the DAG at runtime.
+- ``WorkflowNodes`` allow embedding workflows within each other.
+
+FlytePropeller is responsible for scheduling and tracking execution of Flyte workflows. It is implemented using a K8s controller and adheres to the established K8s design principles. In this scheme, resources are periodically evaluated and the goal is to transition from the observed state to a requested state.
+
+In our case, workflows are the resources and they are iteratively evaluated to transition from the current state to success. During each loop, the current workflow state is established as the phase of workflow nodes and subsequent tasks, and FlytePropeller performs operations to transition this state to success. The operations may include scheduling (or rescheduling) node executions, evaluating dynamic or branch nodes, etc. These design decisions ensure that FlytePropeller can scale to manage a large number of concurrent workflows without performance degradation.
+
+This document attempts to break down the FlytePropeller architecture by tracking the workflow life cycle through each internal component. Below is a high-level illustration of the FlytePropeller architecture and a flow chart of each component's responsibilities during FlyteWorkflow execution.
+
+.. image:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/concepts/architecture/flytepropeller_architecture.png
+
+Components
+==========
+
+FlyteWorkflow CRD / K8s Integration
+-----------------------------------
+
+Workflows in Flyte are maintained as Custom Resource Definitions (CRDs) in Kubernetes, which are stored in the backing etcd cluster. Each execution of a workflow definition results in the creation of a new FlyteWorkflow CR (Custom Resource) which maintains its state for the entirety of processing. CRDs provide variable definitions to describe both resource specifications (spec) and statuses (status). The FlyteWorkflow CRD uses the spec subsection to detail the workflow DAG, embodying node dependencies, etc. The status subsection tracks workflow metadata including overall workflow status, node/task phases, status/phase transition timestamps, etc.
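+
+Because execution state lives in these CRs, you can inspect a running workflow directly with ``kubectl``. For example, on a sandbox cluster (the namespace follows the default ``<project>-<domain>`` convention):
+
+.. code:: shell
+
+   # List the FlyteWorkflow CRs backing executions in flytesnacks/development
+   kubectl get flyteworkflows -n flytesnacks-development
+
+   # Dump one CR to see its spec (the compiled DAG) and status (node phases, timestamps)
+   kubectl describe flyteworkflow <execution-name> -n flytesnacks-development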
+
+K8s exposes a powerful controller/operator API that enables entities to track creation/updates over a specific resource type. FlytePropeller uses this API to track FlyteWorkflows, meaning every time an instance of the FlyteWorkflow CR is created/updated, the FlytePropeller instance is notified. FlyteAdmin is the common entry point, where initialization of FlyteWorkflow CRs may be triggered by user workflow definition executions, automatic relaunches, or periodically scheduled workflow definition executions. It is also possible to create FlyteWorkflow CRs manually, but such workflows will have limited visibility and usability.
+
+WorkQueue/WorkerPool
+----------------------
+
+FlytePropeller supports concurrent execution of multiple, unique workflows using a WorkQueue and WorkerPool.
+
+The WorkQueue is a FIFO queue storing workflow ID strings that require a lookup to retrieve the FlyteWorkflow CR to ensure up-to-date status. A workflow may be added to the queue in a variety of circumstances:
+
+#. A new FlyteWorkflow CR is created or an existing instance is updated
+#. The K8s Informer resyncs the FlyteWorkflow periodically (necessary to detect workflow timeouts and ensure liveness)
+#. A FlytePropeller worker experiences an error during a processing loop
+#. The WorkflowExecutor observes a completed downstream node
+#. A NodeHandler observes a state change and explicitly enqueues its owner (for example, the K8s pod informer observes completion of a task)
+
+The WorkerPool is implemented as a collection of goroutines, one for each worker. Using this lightweight construct, FlytePropeller can scale to 1000s of workers on a single CPU. Workers continually poll the WorkQueue for workflows; each successfully retrieved workflow is passed to the WorkflowExecutor for processing.
+
+WorkflowExecutor
+----------------
+
+The WorkflowExecutor is responsible for handling high-level workflow operations. This includes maintaining the workflow phase (for example: running, failing, succeeded, etc.) according to the underlying node phases and administering pending cleanup operations, such as aborting existing node evaluations during workflow failures or removing FlyteWorkflow CRD finalizers on completion to ensure the CR is deleted. Additionally, at the conclusion of each evaluation round, the WorkflowExecutor updates the FlyteWorkflow CR with updated metadata fields to track the status between evaluation iterations.
+
+NodeExecutor
+------------
+
+The NodeExecutor is executed on a single node, beginning with the workflow's start node. It traverses the workflow using a visitor pattern with a modified depth-first search (DFS), evaluating each node along the path. A few examples of node evaluation based on phase: successful nodes are skipped, unevaluated nodes are queued for processing, and failed nodes may be reattempted up to a configurable threshold. There are many configurable parameters to tune the evaluation criteria, including max parallelism, which restricts the number of nodes that may be scheduled concurrently. Additionally, nodes may be retried to ensure recoverability on failure.
+
+The NodeExecutor is also responsible for linking data readers/writers to facilitate data transfer between node executions. The data transfer process occurs automatically within Flyte, using efficient K8s events rather than a polling listener pattern, which would incur more overhead. Relatively small amounts of data may be passed between nodes inline, but it is more common to pass data URLs to backing storage.
A component of this is writing to and checking the data cache, which facilitates the reuse of previously completed evaluations.
+
+NodeHandlers
+------------
+
+FlytePropeller includes a robust collection of NodeHandlers to support diverse evaluation of the workflow DAG:
+
+* **TaskHandler (Plugins)**: These are responsible for executing plugin-specific tasks. This may include contacting FlyteAdmin to schedule a K8s pod to perform work, calling a web API to begin/track evaluation, and much more. The plugin paradigm exposes an extensible interface for adding functionality to Flyte workflows.
+* **DynamicHandler**: Flyte workflow CRs are initialized using a DAG compiled during the registration process. The numerous benefits of this approach are beyond the scope of this document. However, there are situations where the complete DAG is unknown at compile time, for example, when executing a task on each value of an input list. Using Dynamic nodes, a new DAG subgraph may be dynamically compiled during runtime and linked to the existing FlyteWorkflow CR.
+* **WorkflowHandler**: This handler allows embedding workflows within another workflow definition. The API exposes this functionality using either (1) an inline execution, where the workflow function is invoked directly, resulting in a single FlyteWorkflow CR with an appended sub-workflow, or (2) a launch plan, which creates a separate sub-FlyteWorkflow CR whose execution state is linked to the parent FlyteWorkflow CR.
+* **BranchHandler**: The branch handler allows the DAG to follow a specific control path based on input (or computed) values.
+* **Start / End Handlers**: These are dummy handlers which process input and output data and in turn transition start and end nodes to success.
+
+FlyteAdmin Events
+-----------------
+
+It should be noted that the WorkflowExecutor, NodeExecutor, and TaskHandlers send events to FlyteAdmin, enabling it to track workflows in near real-time.
diff --git a/docs/concepts/component_architecture/native_scheduler_architecture.rst b/docs/concepts/component_architecture/native_scheduler_architecture.rst
new file mode 100644
index 0000000000..19f13ef6c7
--- /dev/null
+++ b/docs/concepts/component_architecture/native_scheduler_architecture.rst
@@ -0,0 +1,77 @@
+.. _native-scheduler-architecture:
+
+###################################
+Flyte Native Scheduler Architecture
+###################################
+
+.. tags:: Advanced, Design
+
+Introduction
+============
+Any workflow engine needs functionality to support scheduled executions. Flyte
+fulfills this using an in-built native scheduler, which schedules fixed rate and
+cron-based schedules. The workflow author specifies the schedule during the
+:ref:`launchplan creation `
+and :ref:`activates or deactivates `
+the schedule using the
+:ref:`admin APIs `
+exposed for the launch plan.
+
+Characteristics
+===============
+
+#. Cloud provider independent
+#. Standard `cron `__ support
+#. Independently scalable
+#. Small memory footprint
+#. Schedules run as lightweight goroutines
+#. Fault tolerant and available
+#. Supported in the sandbox environment
+
+
+Components
+==========
+
+Schedule Management
+-------------------
+
+This component supports the creation/activation and deactivation of schedules. Each schedule is tied to a launch plan and is versioned in a similar manner. The schedule is created or its state is changed to activated/deactivated whenever the `admin API `__ is invoked for it with `ACTIVE/INACTIVE state `__.
This is done either through `flytectl `__ or through any other client that calls the GRPC API.
+The API is similar to that of a launch plan, ensuring that only one schedule is active for a given launch plan.
+
+
+Scheduler
+---------
+
+This component is a singleton and is responsible for reading the schedules from the DB and running them at the cadence defined by the schedule. The lowest granularity supported is `minutes` for scheduling through both the cron and fixed rate schedulers. The scheduler runs as a single replica (at most two during redeployment). Multiple replicas would only duplicate the work, since each execution for a scheduleTime has a unique identifier derived from the schedule name and the time of the schedule. The idempotency aspect of the admin for the same identifier prevents duplication on the admin side. The scheduler runs continuously in a loop, reading the updated schedule entries in the data store and adding or removing schedules. Removing a schedule will not alter the in-flight goroutines launched by the scheduler; thus, the behavior of these executions is undefined.
+
+
+Snapshoter
+**********
+
+This component is responsible for writing the snapshot state of all schedules at a regular cadence to a persistent store. It uses a DB to store the GOB format of the snapshot, which is versioned. The snapshot is a ``map[string]time.Time`` that maps schedule names to their last execution times. During bootup, the snapshot is bootstrapped from the data store and loaded into memory. The Scheduler uses this snapshot to schedule any missed schedules.
+
+CatchupAll-System
+*****************
+This component runs at bootup and catches up all the schedules to the current time, i.e., time.Now(). New runs for the schedules are sent to the admin in parallel.
+Any failure in catching up is considered a hard failure and stops the scheduler. The rerun tries to catch up from the last snapshot of data.
+
+GOCronWrapper
+*************
+
+This component is responsible for locking in the time for the scheduled job to be invoked and adding those entries to the cron scheduler. It is a wrapper around `this framework `__ for fixed rate and cron schedules that creates an in-memory representation of the scheduled job functions. The scheduler schedules a function with scheduleTime parameters. When this scheduled function is invoked, the scheduleTime parameters provide the current schedule time used by the scheduler. This scheduler supports standard cron scheduling, which has 5 `fields `__. It requires 5 entries representing ``minute``, ``hour``, ``day of month``, ``month`` and ``day of week``, in that order.
+
+Job Executor
+************
+
+The job executor component is responsible for sending the scheduled executions to FlyteAdmin. The job function accepts ``scheduleTime`` and the schedule, which are used to create an execution request to the admin. Each job function is tied to its schedule and is executed in a separate goroutine in accordance with the schedule cadence.
+
+Monitoring
+----------
+
+To monitor the system health, the following metrics are published by the native scheduler:
+
+#. JobFuncPanicCounter : count of crashes of the job functions executed by the scheduler.
+#. JobScheduledFailedCounter : count of scheduling failures by the scheduler.
+#. CatchupErrCounter : count of unsuccessful attempts to catch up on the schedules.
+#. FailedExecutionCounter : count of unsuccessful attempts to fire executions of a schedule.
+#. 
SuccessfulExecutionCounter : count of successful attempts to fire executions of a schedule. diff --git a/docs/concepts/console.rst b/docs/concepts/console.rst new file mode 100644 index 0000000000..d872f8990c --- /dev/null +++ b/docs/concepts/console.rst @@ -0,0 +1,128 @@ +.. _divedeep-console: + +############ +FlyteConsole +############ + +.. tags:: Intermediate, Contribute + +FlyteConsole is the web UI for the Flyte platform. Here's a video that dives into the graph UX: + +.. youtube:: 7YSc-QHk_Ec + +********************* +Running FlyteConsole +********************* + +===================== +Install Dependencies +===================== +Running FlyteConsole locally requires `NodeJS `_ and +`yarn `_. Once these are installed, all of the dependencies +can be installed by running ``yarn`` in the project directory. + +====================== +Environment Variables +====================== +Before we can run the server, we need to set up an environment variable or two. + +``ADMIN_API_URL`` (default: `window.location.origin `_) + +FlyteConsole displays information fetched from the FlyteAdmin API. This +environment variable specifies the host prefix used in constructing API requests. + +.. NOTE:: + This is only the host portion of the API endpoint, consisting of the + protocol, domain, and port (if not using the standard 80/443). + +This value will be combined with a suffix (such as ``/api/v1``) to construct the +final URL used in an API request. + +**Default Behavior** + +In most cases, ``FlyteConsole`` is hosted in the same cluster as the Admin +API, meaning that the domain used to access the console is the same as that used to +access the API. For this reason, if no value is set for ``ADMIN_API_URL``, the +default behavior is to use the value of `window.location.origin`. + + +**``BASE_URL`` (default: ``undefined``)** + +This allows running the console at a prefix on the target host. This is +necessary when hosting the API and console on the same domain (with prefixes of +``/api/v1`` and ``/console`` for example). For local development, this is +usually not needed, so the default behavior is to run without a prefix. + + +**``CORS_PROXY_PREFIX`` (default: ``/cors_proxy``)** + +Sets the local endpoint for `CORS request proxying `_. + +=============== +Run the Server +=============== + +To start the local development server, run ``yarn start``. This will spin up a +Webpack development server, compile all of the code into bundles, and start the +NodeJS server on the default port (3000). All requests to the NodeJS server will +be stalled until the bundles have finished. The application will be accessible +at http://localhost:3000 (if using the default port). + +************ +Development +************ + +========== +Storybook +========== + +FlyteConsole uses `Storybook `__. +Component stories live next to the components they test in the ``__stories__`` +directory with the filename pattern ``{Component}.stories.tsx``. + +You can run storybook with ``npm run storybook``, and view the stories at http://localhost:9001. + +============================= +Protobuf and the Network tab +============================= + +Communication with the FlyteAdmin API is done using Protobuf as the +request/response format. Protobuf is a binary format, which means looking at +responses in the Network tab won't be helpful. To make debugging easier, +each network request is logged to the console with its URL, followed by the +decoded Protobuf payload. 
You must have debug output enabled (on by default in +development) to see these messages. + +============ +Debug Output +============ + +This application makes use of the `debug `_ +library to provide namespaced debug output in the browser console. In +development, all debug output is enabled. For other environments, the debug +output must be enabled manually. You can do this by setting a flag in +localStorage using the console: ``localStorage.debug = 'flyte:*'``. Each module in +the application sets its own namespace. So if you'd like to only view output for +a single module, you can specify that one specifically +(ex. ``localStorage.debug = 'flyte:adminEntity'`` to only see decoded Flyte +Admin API requests). + +.. _cors-proxy: + +============== +CORS Proxying +============== + +In the common hosting arrangement, all API requests are made to the same origin +serving the client application, making CORS unnecessary. For any requests which +do not share the same ``origin`` value, the client application will route +requests through a special endpoint on the NodeJS server. One example would be +hosting the Admin API on a different domain than the console. Another example is fetching execution data from external storage such as S3. This is done to +minimize the extra configuration required for ingress to the Admin API +and data storage, as well as to simplify local development of the console without +the need to grant CORS access to ``localhost``. + +The requests and responses are piped through the NodeJS server with minimal +overhead. However, it is still recommended to host the Admin API and console on +the same domain to prevent unnecessary load on the NodeJS server and extra +latency on API requests due to the additional hop. diff --git a/docs/concepts/control_plane.rst b/docs/concepts/control_plane.rst new file mode 100644 index 0000000000..16656877c0 --- /dev/null +++ b/docs/concepts/control_plane.rst @@ -0,0 +1,14 @@ +.. _control-plane: + +################ +Control Plane +################ + +.. toctree:: + :maxdepth: 1 + + projects + domains + admin + console + dynamic_spec diff --git a/docs/concepts/data_management.rst b/docs/concepts/data_management.rst new file mode 100644 index 0000000000..33a7f499a1 --- /dev/null +++ b/docs/concepts/data_management.rst @@ -0,0 +1,176 @@ +.. _divedeep-data-management: + +################################# +Understand How Flyte Handles Data +################################# + +.. tags:: Basic, Glossary, Design + +Types of Data +============= + +There are two parts to the data in Flyte: + +1. Metadata + +* It consists of data about inputs to a task, and other artifacts. +* It is configured globally for FlytePropeller, FlyteAdmin etc., and the running pods/jobs need access to this bucket to get the data. + +2. Raw data + +* It is the actual data (such as the Pandas DataFrame, Spark DataFrame, etc.). +* Raw data paths are unique for every execution, and the prefixes can be modified per execution. +* None of the Flyte control plane components would access the raw data. This provides great separation of data between the control plane and the data plane. + +.. note: + Metadata and raw data can be present in entirely separate buckets. + + +Let us consider a simple Python task: + +.. code-block:: python + + @task + def my_task(m: int, n: str, o: FlyteFile) -> pd.DataFrame: + ... + +In the above code sample, ``m``, ``n``, ``o`` are inputs to the task. 
+``m`` of type ``int`` and ``n`` of type ``str`` are simple primitive types, while ``o`` is an arbitrarily sized file.
+All of them, from Flyte's point of view, are ``data``.
+The difference lies in how Flyte stores and passes each of these data items.
+
+For every task that receives input, Flyte sends an **Inputs Metadata** object, which contains all the primitive or simple scalar values inlined, but in the case of
+complex, large objects, they are offloaded and the `Metadata` simply stores a reference to the object. In our example, ``m`` and ``n`` are inlined while
+``o`` and the output ``pd.DataFrame`` are offloaded to an object store, and their reference is captured in the metadata.
+
+`Flytekit TypeTransformers` make it possible to use complex objects as if they are available locally - just like persistent filehandles. But the Flyte backend only deals with
+the references.
+
+Thus, primitive data types and references to large objects fall under Metadata - `Meta input` or `Meta output`, and the actual large object is known as **Raw data**.
+A unique property of this separation is that all `meta values` are read by the FlytePropeller engine and available on the FlyteConsole or CLI from the control plane.
+`Raw` data is not read by any of the Flyte components and hence it is possible to store it in a completely separate blob storage or alternate stores, which can't be accessed by Flyte control plane components
+but can be accessed by the user's containers/tasks.
+
+Raw Data Prefix
+~~~~~~~~~~~~~~~
+
+Every task can read/write its own data files. If ``FlyteFile`` or any natively supported type like ``pandas.DataFrame`` is used, Flyte will automatically offload and download
+data from the configured object-store paths. These paths are completely customizable per `LaunchPlan` or `Execution`.
+
+- The default raw output path (prefix in an object store like S3/GCS) can be configured during registration as shown in :std:ref:`flytectl_register_files`.
+  The argument ``--outputLocationPrefix`` allows us to set the destination directory for all the raw data produced. Flyte will create randomized folders in this path to store the data.
+- To override the ``RawOutput`` path (prefix in an object store like S3/GCS), you can specify an alternate location when invoking a Flyte execution, as shown in the following screenshot of the LaunchForm in FlyteConsole:
+
+  .. image:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/concepts/data_movement/launch_raw_output.png
+
+- In the sandbox, the default raw output prefix is configured to be the root of the local bucket. Hence Flyte will write all the raw data (reference types like blob, file, df/schema/parquet, etc.) under a path defined by the execution.
+
+
+Metadata
+~~~~~~~~
+
+Metadata in Flyte is critical to enable the passing of data between tasks. It allows Flyte to perform in-memory computations for branches, send partial outputs from one task to another, or compose outputs from multiple tasks into one input to be sent to a task.
+
+Thus, metadata is restricted due to its omnipresence. Each `meta output`/`input` cannot be larger than 1MB. If you have a `List[int]`, it cannot be larger than 1MB, together with all the other input entities. In scenarios where large lists or strings need to be sent between tasks, file abstraction is preferred.
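+
+To make this guidance concrete, here is a minimal, hypothetical flytekit sketch (names are illustrative): when an output could exceed the metadata budget, return a file reference so that only a pointer travels through the metadata plane while the bytes live in raw-data storage:
+
+.. code-block:: python
+
+    from flytekit import task
+    from flytekit.types.file import FlyteFile
+
+    @task
+    def make_big_output(n: int) -> FlyteFile:
+        path = "/tmp/numbers.txt"
+        with open(path, "w") as f:
+            for i in range(n):
+                f.write(f"{i}\n")
+        # Returning a FlyteFile offloads the file to the configured raw-data
+        # prefix; the task's metadata only carries a reference to it.
+        return FlyteFile(path)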
+ +``LiteralType`` & Literals +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +SERIALIZATION TIME +^^^^^^^^^^^^^^^^^^ + +When a task is declared with inputs and outputs, Flyte extracts the interface of the task and converts it to an internal representation called a :std:ref:`ref_flyteidl.core.typedinterface`. +For each variable, a corresponding :std:ref:`ref_flyteidl.core.literaltype` is created. + +For example, the following Python function's interface is transformed as follows: + +.. code-block:: python + + @task + def my_task(a: int, b: str) -> FlyteFile: + """ + Description of my function + + :param a: My input integer + :param b: My input string + :return: My output file + """ + ... + +.. code-block:: + + interface { + inputs { + variables { + key: "a" + value { + type { + simple: INTEGER + } + description: "My input Integer" + } + } + variables { + key: "b" + value { + type { + simple: STRING + } + description: "My input string" + } + } + } + outputs { + variables { + key: "o0" + value { + type { + blob { + } + } + description: "My output File" + } + } + } + } + + +RUNTIME +^^^^^^^ + +At runtime, data passes through Flyte using :std:ref:`ref_flyteidl.core.literal` where the values are set. +For files, the corresponding ``Literal`` is called ``LiteralBlob`` (:std:ref:`ref_flyteidl.core.blob`) which is a binary large object. +Many different objects can be mapped to the underlying `Blob` or `Struct` types. For example, an image is a Blob, a ``pandas.DataFrame`` is a Blob of type parquet, etc. + +Data Movement +============= + +Flyte is primarily a **DataFlow Engine**. It enables movement of data and provides an abstraction to enable movement of data between different languages. + +One implementation of Flyte is the current workflow engine. + +The workflow engine is responsible for moving data from a previous task to the next task. As explained previously, Flyte only deals with Metadata and not the actual Raw data. +The illustration below explains how data flows from engine to the task and how that is transferred between tasks. The medium to transfer the data can change, and will change in the future. +We could use fast metadata stores to speed up data movement or exploit locality. + +Between Flytepropeller and Tasks +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. image:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/concepts/data_movement/flyte_data_movement.png + + +Between Tasks +~~~~~~~~~~~~~~ + +.. image:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/concepts/data_movement/flyte_data_transfer.png + + +Bringing in Your Own Datastores for Raw Data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Flytekit has a pluggable `data persistence layer `__. +This is driven by PROTOCOL. +For example, it is theoretically possible to use S3 ``s3://`` for metadata and GCS ``gcs://`` for raw data. It is also possible to create your own protocol ``my_fs://``, to change how data is stored and accessed. +But for Metadata, the data should be accessible to Flyte control plane. + +Data persistence is also pluggable. By default, it supports all major blob stores and uses an interface defined in Flytestdlib. diff --git a/docs/concepts/domains.rst b/docs/concepts/domains.rst new file mode 100644 index 0000000000..bb306924dd --- /dev/null +++ b/docs/concepts/domains.rst @@ -0,0 +1,13 @@ +.. _divedeep-domains: + +Domains +======= + +.. tags:: Basic, Glossary + +Domains provide an abstraction to isolate resources and feature configuration for different +deployment environments. 
+
+For example: We develop and deploy Flyte workflows in development, staging, and production. We configure Flyte domains with those names, and specify lower resource limits on the development and staging domains than on the production domain.
+
+We also use domains to disable launch plans and schedules in the development and staging domains, since those features are typically meant for production deployments.
\ No newline at end of file
diff --git a/docs/concepts/dynamic_spec.rst b/docs/concepts/dynamic_spec.rst
new file mode 100644
index 0000000000..4e9e11ad3c
--- /dev/null
+++ b/docs/concepts/dynamic_spec.rst
@@ -0,0 +1,50 @@
+.. _divedeep-dynamic-spec:
+
+Dynamic Job Spec
+================
+
+.. tags:: Basic, Design
+
+A dynamic job spec is a subset of the entire workflow spec that defines a set of tasks, workflows, nodes, and output bindings that control how the job should assemble its outputs.
+
+This spec is currently only supported as an intermediate step in running Dynamic Tasks.
+
+.. code-block:: protobuf
+   :caption: Dynamic job spec in Protobuf
+
+    message DynamicJobSpec {
+        repeated Node nodes = 1;
+        int64 min_successes = 2;
+        repeated Binding outputs = 3;
+
+        repeated TaskTemplate tasks = 4;
+        repeated WorkflowTemplate subworkflows = 5;
+    }
+
+.. _divedeep-dynamic-tasks:
+
+Tasks
+-----
+
+Defines one or more :ref:`Tasks ` that can then be referenced in the spec.
+
+.. _divedeep-dynamic-subworkflows:
+
+Subworkflows
+------------
+
+Defines zero or more :ref:`Workflows ` that can then be referenced in the spec.
+
+.. _divedeep-dynamic-nodes:
+
+Nodes
+-----
+
+Defines one or more :ref:`Nodes ` that can run in parallel to produce the final outputs of the spec.
+
+.. _divedeep-dynamic-outputs:
+
+Outputs
+-------
+
+Defines one or more bindings that instruct the engine on how to assemble the final outputs.
\ No newline at end of file
diff --git a/docs/concepts/execution_timeline.rst b/docs/concepts/execution_timeline.rst
new file mode 100644
index 0000000000..276930c94e
--- /dev/null
+++ b/docs/concepts/execution_timeline.rst
@@ -0,0 +1,72 @@
+.. _divedeep-execution-timeline:
+
+########################################
+Timeline of a workflow execution
+########################################
+
+.. tags:: Intermediate, Glossary
+
+The illustration below shows the timeline view of a workflow execution.
+
+.. image:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/deployment/monitoring/flyte_wf_timeline.svg?sanitize=true
+
+
+The illustration above refers to a simple workflow with 2 nodes, N1 & N2. This can be represented as follows:
+
+.. mermaid::
+
+   graph LR;
+       Start --> N1;
+       N1 --> N2;
+       N2 --> End;
+
+
+Acceptance Latency
+====================
+Every workflow starts in the ``Acceptance`` phase. Acceptance refers to the time between FlyteAdmin receiving an execution request and FlytePropeller evaluating the first round of the workflow.
+Usually, within this phase, the K8s queuing latency is the largest contributor; an overall acceptance latency of under 5 seconds is desirable.
+
+Transition Latency
+===================
+Transition latency refers to the time between successive node executions, that is, between ``N1`` and ``N2``. For the first node ``N1``, this latency also encapsulates executing the start node.
+
+Similarly, the last node also encapsulates executing the end node. ``Start Node`` and ``End Node`` are capstones inserted to mark the beginning and end of the DAG.
+
+The latency involves the time consumed to:
+
+#. 
Gather outputs for a node after the node completes execution.
+#. Send an observation event to FlyteAdmin. Failing to do so will be regarded as an error, and the event will be retried until it succeeds or the system max retries are exhausted (the number of max system retries is configured to be 30 by default and can be altered per deployment).
+#. Persist data to Kubernetes.
+#. Receive the persisted object back from Kubernetes (as this process is eventually consistent using informer caches).
+#. Gather inputs for a node before the node starts.
+#. Send a queued event for the next node to FlyteAdmin (this is what is persisted and drives the UI/CLI and historical information).
+
+Queuing Latency
+================
+Queuing latency is the time taken for Kubernetes to start the pod, for other services to start the job, for HTTP throttling to clear, or for any other rate limiting to be overcome. This
+is usually tied to the available resources and quota, and is outside of Flyte's control.
+
+Completion Latency
+===================
+Completion latency is the time taken to mark the workflow as complete and accumulate the outputs of a workflow after the last node completes its execution.
+
+
+Overview of Various Latencies in FlytePropeller
+=================================================
+
+=================================== ==================================================================================================================================
+                                    Description of main events for workflow execution
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ Events                             Description
+=================================== ==================================================================================================================================
+Acceptance                          Measures the time between when we receive a service call to create an Execution (Unknown) and when it has moved to Queued.
+Transition Latency                  Measures the latency between two consecutive node executions, i.e., the time spent in the Flyte engine.
+Queuing Latency                     Measures the latency between the time a node has been queued and the time the handler reported that the executable moved to the running state.
+Task Execution                      Actual time spent executing the user code.
+Repeat steps 2-4 for every task
+Transition Latency                  See #2.
+Completion Latency                  Measures the time between when the WF moved to succeeding/failing state and when it finally moved to a terminal state.
+=================================== ==================================================================================================================================
+
+.. note::
+   **The core team is working on optimizing Completion Latency, Transition Latency, and Acceptance Latency.**
\ No newline at end of file
diff --git a/docs/concepts/executions.rst b/docs/concepts/executions.rst
new file mode 100644
index 0000000000..b6ee602520
--- /dev/null
+++ b/docs/concepts/executions.rst
@@ -0,0 +1,20 @@
+.. _divedeep-executions:
+
+##########
+Executions
+##########
+
+.. tags:: Basic, Glossary
+
+**Executions** are instances of workflows, nodes or tasks created in the system as a result of a user-requested execution or a scheduled execution.
+
+Typical Flow Using Flytectl
+---------------------------
+
+* When an execution of a workflow is triggered using the UI/flytectl/other stateless systems, the system first calls the ``getLaunchPlan`` endpoint and retrieves a launch plan matching the given version.
The launch plan definition includes the definitions of all input variables declared for the workflow.
+* The user-side component then ensures that all the required inputs are supplied and requests an execution from the FlyteAdmin service.
+* The FlyteAdmin service validates the inputs, ensuring that they are all specified and, if required, within the declared bounds.
+* FlyteAdmin then fetches the previously validated and compiled workflow closure and translates it to an executable format with all the inputs.
+* This executable workflow is launched on Kubernetes with an execution record in the database.
+
+.. image:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/concepts/executions/flyte_wf_execution_overview.svg?sanitize=true
\ No newline at end of file
diff --git a/docs/concepts/flyte_console.rst b/docs/concepts/flyte_console.rst
new file mode 100644
index 0000000000..8e9484789b
--- /dev/null
+++ b/docs/concepts/flyte_console.rst
@@ -0,0 +1,232 @@
+.. _ui:
+
+How to Use Flyte UI
+===================
+
+.. tags:: Basic, UI
+
+Flyte UI is a web-based user interface for Flyte. It helps you interact with Flyte objects and builds DAGs out of your workflows.
+
+With Flyte UI, you can:
+
+* Launch tasks
+* Launch workflows
+* View Versioned Tasks and Workflows
+* Trigger Versioned Tasks and Workflows
+* Inspect Executions through Inputs, Outputs, Logs, and Graphs
+* Clone Executions
+* Relaunch Executions
+* Recover Executions
+
+.. note::
+   `FlyteConsole `__ hosts the Flyte user interface code.
+
+Launching Workflows
+-------------------
+
+You can launch a workflow by clicking on the **Launch Workflow** button. Workflows are viewable after they are registered.
+The UI should be accessible at http://localhost:30081/console.
+
+|
+
+.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/launch_execution_001.png
+   :alt: "Launch Workflow" button
+
+   Launch a workflow using the "Launch Workflow" button.
+
+|
+
+The end-to-end process from writing code to registering workflows is described in :std:ref:`getting-started`.
+
+Upon clicking the **Launch Workflow** button, a pop-up window appears with the input fields that the execution requires.
+If default inputs are given, they will be auto-populated.
+
+|
+
+.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/launch_execution_002.png
+   :alt: Launch form
+
+   A pop-up window appears after clicking the "Launch Workflow" button.
+
+|
+
+An execution can be terminated/aborted by clicking on the **Terminate** button.
+
+|
+
+.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/launch_execution_003.png
+   :alt: "Terminate" button
+
+   Terminate an execution by clicking the "Terminate" button.
+
+|
+
+Launching Tasks
+---------------
+
+You can launch a task by clicking on the **Launch Task** button. Tasks are viewable after they are registered.
+The UI should be accessible at http://localhost:30081/console.
+
+|
+
+.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/launch_task_001.png
+   :alt: "Launch Task" button
+
+   Launch a task by clicking the "Launch Task" button.
+
+|
+
+On clicking the **Launch Task** button, a pop-up window appears with the input fields that the task requires, along with the role the task has to run with.
+
+|
+
+.. 
figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/launch_task_002.png + :alt: Launch form + + A pop-up window appears on clicking the "Launch Task" button. + +| + +Viewing Versioned Tasks and Workflows +------------------------------------- + +Every registered Flyte entity is tagged with a version. All the registered versions of workflows and tasks are viewable in the UI. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/versioned_executions.png + :alt: Versioned workflows + + View versioned workflows. + +| + +Triggering Versioned Tasks and Workflows +---------------------------------------- + +Every registered Flyte entity is versioned and can be triggered anytime. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/trigger_versioned_executions.png + :alt: Trigger versioned workflows + + Trigger versioned workflows. + +| + +Inspecting Executions +--------------------- + +Executions can be inspected through the UI. Inputs and Outputs for every node and execution can be viewed. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/inspect_execution_001.png + :alt: Node's inputs and outputs + + View every execution node's inputs and outputs. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/inspect_execution_002.png + :alt: Execution's inputs and outputs + + View every execution's inputs and outputs. + +| + +Logs are accessible as well. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/inspect_execution_003.png + :alt: Logs + + View Kubernetes logs. + +| + +Every execution has two views: Nodes and Graph. + +A node in the nodes view encapsulates an instance of a task, but it can also contain an entire subworkflow or trigger an external workflow. +More about nodes can be found in :std:ref:`divedeep-nodes`. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/inspect_execution_004.png + :alt: Nodes + + Inspect execution's nodes in the UI. + +| + +Graph view showcases a static DAG. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/inspect_execution_005.png + :alt: DAG + + Inspect execution's DAG in the UI. + +| + +Cloning Executions +------------------ + +An execution in the ``RUNNING`` state can be cloned. + +Click on the ellipsis on the top right corner of the UI. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/clone_execution_001.png + :alt: Clone execution + + Step 1: Click on the ellipsis. + +| + +Click on the **Clone Execution** button. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/clone_execution_002.png + :alt: Clone execution + + Step 2: "Clone execution" button. + +| + +Relaunching Executions +---------------------- + +The **Relaunch** button allows you to relaunch a terminated execution with pre-populated inputs. +This option can be helpful to try out a new version of a Flyte entity. + +| + +.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/relaunch_execution.png + :alt: Relaunch an execution + + Relaunch an execution. 
+
+|
+
+On clicking the **Relaunch** button, a pop-up window appears, allowing you to modify the version and inputs.
+
+Recovering Executions
+---------------------
+
+Recovery mode allows you to recover an individual execution by copying all successful node executions and running from the failed nodes.
+The **Recover** button helps recover a failed execution.
+
+|
+
+.. figure:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/flyteconsole/recover_execution.png
+   :alt: Recover an execution
+
+   Recover an execution.
+
+|
diff --git a/docs/concepts/flyte_wf_tasks_high_level.png b/docs/concepts/flyte_wf_tasks_high_level.png
new file mode 100644
index 0000000000..83e987ee18
Binary files /dev/null and b/docs/concepts/flyte_wf_tasks_high_level.png differ
diff --git a/docs/concepts/launchplans.rst b/docs/concepts/launchplans.rst
new file mode 100644
index 0000000000..2efb6af998
--- /dev/null
+++ b/docs/concepts/launchplans.rst
@@ -0,0 +1,57 @@
+.. _divedeep-launchplans:
+
+Launch plans
+============
+
+.. tags:: Basic, Glossary, Design
+
+Launch plans help execute workflows. A workflow can be associated with multiple launch plans and launch plan versions, but an individual launch plan is always associated with a single, specific workflow. Once created, launch plans are easy to share and execute.
+
+Launch plans provide a way to templatize Flyte workflow invocations. Launch plans contain a set of bound workflow inputs that are passed as arguments to create an execution. Launch plans do not necessarily contain the entire set of required workflow inputs, but a launch plan is always necessary to trigger an execution. Additional input arguments can be provided at execution time to supplement the launch plan's static input values.
+
+In addition to templatized inputs, launch plans allow you to run your workflow on one or multiple schedules. Each launch
+plan can optionally define a single schedule (which can be easily disabled by disabling the launch plan) as well as
+optional notifications. Refer to :ref:`deployment-configuration-notifications` for a deep dive into the available notifications.
+
+The Association between Workflows and LaunchPlans
+-------------------------------------------------
+
+Every workflow comes with a `default` launch plan that has the same name as the workflow. The default launch plan is authored (in code) as part of creating a new workflow.
+A launch plan version can only ever be mapped to one workflow version, meaning a launch plan version cannot be used twice. This is because part of what makes a new launch plan version is the mapping to the specific workflow version.
+
+.. note::
+   Users rarely interact with the default launch plan.
+
+Suppose we have ``Workflow A`` in ``version 1``, ``LaunchPlans`` ``A`` and ``B`` in ``version 1``, and ``LaunchPlan`` ``B`` in ``version 2``. Then:
+
+1. ``Workflow A`` can be associated with ``LaunchPlan A`` (version 1);
+2. ``Workflow A`` can be associated with ``LaunchPlan B`` (different launch plan name; version 1);
+3. ``Workflow A`` can be associated with ``LaunchPlan B`` (version 2).
+
+
+What do Launch Plans Provide?
+------------------------------
+
+- One-click invocation of workflows with predefined inputs and friendly launch plan names.
+- Multiple schedules with different default values for inputs per workflow.
+- Ability to easily enable and disable schedules.
+- Can be created dynamically with flyteclient or statically using the Flyte SDK.
+- Associate different notifications with your workflows.
+
+- Restrict the inputs to be passed to the workflows at launch time using the :ref:`fixed_inputs ` parameter.
+- Multiple versions of the launch plan (with the same name), with only one active version. The schedule applies only to the active launch plan version.
+
+.. _concepts-launchplans-inputs:
+
+Launch plan inputs
+------------------
+Generally, launch plan inputs correspond to the inputs of their related workflow definition, in that the variable types and names are expected to match. Launch plans cannot introduce any inputs not defined in the core workflow definition. However, launch plan inputs differ slightly from workflow inputs in that the former are categorized into **default inputs** and **fixed inputs**.
+
+Default Inputs
+^^^^^^^^^^^^^^
+Default inputs behave much like default workflow inputs. As their name implies, default inputs provide default workflow input values at execution time in the absence of any dynamically provided values.
+
+.. _fixed_inputs:
+
+Fixed Inputs
+^^^^^^^^^^^^
+Fixed inputs cannot be overridden. If a workflow is executed with a launch plan and dynamic inputs that attempt to redefine the launch plan's fixed inputs, the execution creation request *will fail*.
diff --git a/docs/concepts/nodes.rst b/docs/concepts/nodes.rst
new file mode 100644
index 0000000000..d67c15457c
--- /dev/null
+++ b/docs/concepts/nodes.rst
@@ -0,0 +1,34 @@
+.. _divedeep-nodes:
+
+Nodes
+=====
+
+.. tags:: Basic, Glossary
+
+A node represents a unit of execution or work within a workflow. Ordinarily, a node encapsulates an instance of
+a :ref:`task `, but it can also contain an entire subworkflow or trigger an external workflow.
+Nodes can have inputs and outputs, which are used to coordinate task inputs and outputs.
+Moreover, node outputs can be used as inputs to other nodes within a workflow.
+
+Tasks are always encapsulated within a node. Like tasks, nodes can come in a variety of flavors determined by their *target*.
+These targets include :ref:`task nodes `, :ref:`workflow nodes `, and :ref:`branch nodes `.
+
+.. _divedeep-task-nodes:
+
+Task Nodes
+----------
+
+Tasks referenced in a workflow are always enclosed in nodes. This extends to all task types.
+For example, an array task will be enclosed by a single node.
+
+.. _divedeep-workflow-nodes:
+
+Workflow Nodes
+--------------
+A node can contain an entire sub-workflow. Since workflow executions always require a launch plan, workflow nodes have a reference to a launch plan to trigger their enclosed workflows.
+
+.. _divedeep-branch-nodes:
+
+Branch Nodes
+------------
+Branch nodes alter the flow of the workflow graph. Conditions at runtime are evaluated to determine the control flow.
diff --git a/docs/concepts/projects.rst b/docs/concepts/projects.rst
new file mode 100644
index 0000000000..99ed0daf3f
--- /dev/null
+++ b/docs/concepts/projects.rst
@@ -0,0 +1,14 @@
+.. _divedeep-projects:
+
+Projects
+========
+
+.. tags:: Basic, Glossary
+
+A project in Flyte is a group of :ref:`workflows ` and :ref:`tasks ` tied together to achieve a goal.
+
+A Flyte project can map to an engineering project or to everything that's owned by a team or an individual. There cannot be multiple projects with the same name in Flyte.
+
+Since the fully-qualified name for tasks and workflows includes the project and domain name, task/workflow names are only required to be unique within a project. The workflows in a project ``A`` can refer to tasks and workflows in other projects using the fully-qualified name.
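+
+As an illustrative (hypothetical) flytekit sketch of such a cross-project reference, a task registered under another project can be bound by its fully-qualified coordinates using ``reference_task``; all names and versions below are made up:
+
+.. code-block:: python
+
+    from flytekit import reference_task
+
+    @reference_task(
+        project="project-a",             # hypothetical project
+        domain="development",
+        name="app.tasks.compute_stats",  # fully-qualified task name
+        version="v1",
+    )
+    def compute_stats(data_path: str) -> float:
+        # The body is never executed; only the signature matters.
+        ...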
+
+Flyte allows users to set resource limits and automatically provides basic reports and dashboards for each project. The information captured in these reports includes workflow/task-level insights, resource usage, and billing information.
\ No newline at end of file
diff --git a/docs/concepts/registration.rst b/docs/concepts/registration.rst
new file mode 100644
index 0000000000..bc745f7a0f
--- /dev/null
+++ b/docs/concepts/registration.rst
@@ -0,0 +1,33 @@
+.. _divedeep-registration:
+
+############
+Registration
+############
+
+.. tags:: Basic, Glossary, Design
+
+During registration, Flyte validates the workflow structure and saves the workflow. The registration process also updates the workflow graph.
+
+.. image:: https://raw.githubusercontent.com/flyteorg/static-resources/main/flyte/concepts/executions/flyte_wf_registration_overview.svg?sanitize=true
+
+Typical Flow
+-------------
+The following steps elaborate on the specifics of the registration process:
+
+* Define the tasks using the :py:mod:`Flytekit ` Task Definition language.
+* Define a workflow using the :py:mod:`Flytekit ` Workflow definition language.
+* Use the `flytectl register CLI `__ to compile the tasks into their serialized representation as described in the :std:ref:`Flyte Specification language `. During this, the task representation is bound to a container that constitutes the code for the task. This associated entity is registered with FlyteAdmin using the registerTask API.
+* Use the flytectl register CLI to compile the workflow into its serialized representation as described in the :std:ref:`Flyte Specification language `. The referenced tasks are replaced by their FlyteAdmin-registered Identifiers, obtained in the previous step. The associated entity is registered with FlyteAdmin using the registerWorkflow API.
+* Launch an execution using the FlyteAdmin launch execution API, which requires the necessary inputs to be provided. This is done automatically if the user uses flytectl to launch the execution.
+* Use the FlyteAdmin read APIs to get details of the execution, monitor it to completion, or retrieve a historical execution.
+* **OR** use the FlyteConsole to visualize the execution in real time as it progresses, or visualize any historical execution. The console makes it easy to view debugging information for the execution.
+* Set specific rules such as *notification* on **failure** or **success**, or publish all events in the execution to a pub-sub system.
+* Query the datastore to get a summary of all the executions and the compute resources consumed.
+
+.. note::
+   Workflows and tasks are purely specifications and can be provided using ``YAML``, ``JSON``, ``protobuf binary``, or any programming language; hence, registration is possible using other tools as well. Contributions welcome!
+
+Registration in the Backend
+---------------------------
+
+When FlyteAdmin receives a workflow registration request, it uses the workflow compiler to compile and validate the workflow. It also fetches all the referenced tasks and creates a complete workflow closure, which is stored in the metastore. If the workflow compilation fails, the compiler returns an error to the client.
diff --git a/docs/concepts/schedules.rst b/docs/concepts/schedules.rst
new file mode 100644
index 0000000000..34644b217b
--- /dev/null
+++ b/docs/concepts/schedules.rst
@@ -0,0 +1,102 @@
+.. _concepts-schedules:
+
+Schedules
+=========
+
+.. 
tags:: Basic, Glossary
+
+Workflows can be run automatically using :ref:`schedules ` associated with launch plans.
+
+Only one launch plan version for a given {Project, Domain, Name} combination can be active, which means only one schedule can be active for a launch plan. This is because a single active schedule can exist across all versions of the launch plan.
+
+A :ref:`workflow ` version can have multiple schedules associated with it, given that these schedules exist as versions of different launch plans.
+
+Creating a new schedule creates a new version of the launch plan.
+If you wish to change a schedule, you will have to create a new version of that launch plan, since a **schedule cannot be edited**.
+
+FlyteAdmin keeps track of newly added schedules and searches through all versions of the launch plan to set the older schedules to 'deactivated'.
+
+Launch plan versions whose schedules were deactivated can still be launched manually, by clicking on the launch button and selecting the specific launch plan version.
+
+Let's now look at how schedules can be defined through cron_expression_ or rate_unit_.
+
+.. _cron_expression:
+
+Cron Expression
+---------------
+Cron expression strings use :ref:`this ` syntax. They are validated at launch plan registration time.
+
+.. _rate_unit:
+
+Format
+------
+
+A cron expression represents a set of times, with the help of 5 space-separated fields.
+
+.. _cron_expression_table:
+
++--------------+------------+-----------------+----------------------------+
+| Field name   | Mandatory? | Allowed values  | Allowed special characters |
++==============+============+=================+============================+
+| Minutes      | Yes        | 0-59            | * / , -                    |
++--------------+------------+-----------------+----------------------------+
+| Hours        | Yes        | 0-23            | * / , -                    |
++--------------+------------+-----------------+----------------------------+
+| Day of month | Yes        | 1-31            | * / , - ?                  |
++--------------+------------+-----------------+----------------------------+
+| Month        | Yes        | 1-12 or JAN-DEC | * / , -                    |
++--------------+------------+-----------------+----------------------------+
+| Day of week  | Yes        | 0-6 or SUN-SAT  | * / , - ?                  |
++--------------+------------+-----------------+----------------------------+
+
+**Note**: The 'Month' and 'Day of week' fields are case insensitive.
+
+
+Cron schedules
+--------------
+An incorrect cron schedule expression leads to a failure in triggering the schedule. :ref:`Here ` is a table that shows the format of a cron expression.
+
+Below is another example:
+
+.. code-block:: default
+
+    cron_lp_hourly = LaunchPlan.get_or_create(
+        name="my_cron_scheduled_lp",
+        workflow=date_formatter_wf,
+        schedule=CronSchedule(
+            # Note that kickoff_time_input_arg matches the workflow input we defined above: kickoff_time
+            # If you are using the AWS scheme of schedules instead of the native scheduler, switch the schedule parameter to cron_expression
+            schedule="@hourly",  # this schedule runs every hour, at the beginning of the hour
+            kickoff_time_input_arg="kickoff_time",
+        ),
+    )
+
+
+Fixed rate schedules
+----------------------
+Instead of cron schedules, fixed rate schedules can be used.
+
+You can specify the duration in the schedule using `timedelta`, which supports `minutes`, `hours`, `days`, and `weeks`.
+
+:ref:`Here ` is an example with the duration in `minutes`.
+
+Below is an example with the duration in `days`.
+
+.. 
code-block:: default
+
+    fixed_rate_lp_days = LaunchPlan.get_or_create(
+        name="my_fixed_rate_lp_days",
+        workflow=positive_wf,
+        # Note that the above workflow doesn't accept any kickoff time arguments,
+        # so we omit ``kickoff_time_input_arg`` from the FixedRate schedule invocation.
+        schedule=FixedRate(duration=timedelta(days=1)),
+        fixed_inputs={"name": "you"},
+    )
+
+
+Rate Unit
+---------
+
+Schedules can also be defined using fixed rates in units of **days**, **hours**, and **minutes**.
diff --git a/docs/concepts/state_machine.rst b/docs/concepts/state_machine.rst
new file mode 100644
index 0000000000..8507a59d9c
--- /dev/null
+++ b/docs/concepts/state_machine.rst
@@ -0,0 +1,154 @@
+.. _divedeep-state-machine:
+
+################################################
+Understanding the State Transition in a Workflow
+################################################
+
+.. tags:: Basic, Design
+
+High Level Overview of How a Workflow Progresses to Success
+===========================================================
+
+.. mermaid::
+
+    flowchart TD
+    id1(( ))
+    id1 --> Ready
+    Ready --> Running
+    subgraph Running
+    id2(( ))
+    id2 --> NodeQueued
+    NodeQueued --> NodeRunning
+    subgraph NodeRunning
+    id3(( ))
+    id3 --> TaskQueued
+    TaskQueued --> TaskRunning
+    TaskRunning --> TaskSuccess
+    end
+    TaskSuccess --> NodeSuccess
+    end
+    NodeSuccess --> Success
+
+
+This state diagram illustrates a high-level, simplified view of the state transitions that a workflow with a single task and node goes through as the user observes success.
+
+The following sections explain the various observable (and some hidden) states for workflow, node, and task state transitions.
+
+Workflow States
+===============
+
+.. mermaid::
+
+    flowchart TD
+    Queued -->|On system errors more than threshold| Aborted
+    Queued --> Ready
+    Ready--> |Write inputs to workflow| Running
+    Running--> |On system error| Running
+    Running--> |On all Nodes Success| Succeeding
+    Succeeding--> |On successful event send to Admin| Succeeded
+    Succeeding--> |On system error| Succeeding
+    Ready--> |On precondition failure| Failing
+    Running--> |On any Node Failure| Failing
+    Ready--> |On user initiated abort| Aborting
+    Running--> |On user initiated abort| Aborting
+    Succeeding--> |On user initiated abort| Aborting
+    Failing--> |If Failure node exists| HandleFailureNode
+    Failing--> |On user initiated abort| Aborting
+    HandleFailureNode--> |On completing failure node| Failed
+    HandleFailureNode--> |On user initiated abort| Aborting
+    Failing--> |On successful send of Failure node| Failed
+    Aborting--> |On successful event send to Admin| Aborted
+
+A workflow always starts in the ``Ready`` state and ends in either the ``Failed``, ``Succeeded``, or ``Aborted`` state.
+Any system error within a state causes a retry on that state. These retries are capped by :ref:`system retries `, which eventually lead to an ``Aborted`` state if the failure persists.
+
+Every transition between states is recorded in FlyteAdmin using :std:ref:`workflowexecutionevent `.
+
+The phases in the above state diagram are captured in the admin database as specified in :std:ref:`workflowexecution.phase ` and are sent as a part of the Execution event.
+
+The state machine specification for the illustration can be found `here `__.
+
+
+Node States
+===========
+
+.. 
mermaid:: + + flowchart TD + id1(( )) + id1-->NotYetStarted + id1-->|Will stop the node execution |Aborted + NotYetStarted-->|If all upstream nodes are ready, i.e, inputs are ready | Queued + NotYetStarted--> |If the branch was not taken |Skipped + Queued-->|Start task execution- attempt 0 | Running + Running-->|If task timeout has elapsed and retry_attempts >= max_retries|TimingOut + Running-->|Internal state|Succeeding + Running-->|For dynamic nodes generating workflows| DynamicRunning + DynamicRunning-->TimingOut + DynamicRunning-->RetryableFailure + TimingOut-->|If total node timeout has elapsed|TimedOut + DynamicRunning-->Succeeding + Succeeding-->|User observes the task as succeeded| Succeeded + Running-->|on retryable failure| RetryableFailure + RetryableFailure-->|if retry_attempts < max_retries|Running + RetryableFailure-->|retry_attempts >= max_retries|Failing + Failing-->Failed + Succeeded-->id2(( )) + Failed-->id2(( )) + + +This state diagram illustrates the node transition through various states. This is the core finite state machine for a node. +From the user's perspective, a workflow simply consists of a sequence of tasks. But to Flyte, a workflow internally creates a meta entity known as **node**. + +Once a Workflow enters the ``Running`` state, it triggers the phantom ``start node`` of the workflow. The ``start node`` is considered to be the entry node of any workflow. +The ``start node`` begins by executing all its child-nodes using a modified Depth First Search algorithm recursively. + +Nodes can be of different types as listed below, but all the nodes traverse through the same transitions: + +#. Start Node - Only exists during the execution and is not modeled in the core spec. +#. :std:ref:`Task Node ` +#. :std:ref:`Branch Node ` +#. :std:ref:`Workflow Node ` +#. Dynamic Node - Just a task node that does not return output but constitutes a dynamic workflow. + When the task runs, it remains in the ``RUNNING`` state. Once the task completes and Flyte starts executing the dynamic workflow, + the overarching node that contains both the original task and the dynamic workflow enters `DYNAMIC_RUNNING` state. +#. End Node - Only exists during the execution and is not modeled in the core spec + +Every transition between states is recorded in FlyteAdmin using :std:ref:`nodeexecutionevent `. + +Every ``NodeExecutionEvent`` can have any :std:ref:`nodeexecution.phase `. + +.. note:: TODO: Add explanation for each phase. + +The state machine specification for the illustration can be found `here `__. + +Task States +=========== + +.. mermaid:: + + flowchart TD + id1(( )) + id1-->|Aborted by NodeHandler- timeouts, external abort, etc,.| NotReady + id1-->Aborted + NotReady-->|Optional-Blocked on resource quota or resource pool | WaitingForResources + WaitingForResources--> |Optional- Has been submitted, but hasn't started |Queued + Queued-->|Optional- Prestart initialization | Initializing + Initializing-->|Actual execution of user code has started|Running + Running-->|Successful execution|Success + Running-->|Failed with a retryable error|RetryableFailure + Running-->|Unrecoverable failure, will stop all execution|PermanentFailure + Success-->id2(( )) + RetryableFailure-->id2(( )) + PermanentFailure-->id2(( )) + + +The state diagram above illustrates the various states through which a task transitions. This is the core finite state machine for a task. + +Every transition between states is recorded in FlyteAdmin using :std:ref:`taskexecutionevent `. 
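+
+Read as a finite state machine, the diagram above can be summarized with a small transition table. The following Python sketch is purely illustrative (it mirrors the documentation, not FlytePropeller's implementation):
+
+.. code-block:: python
+
+    # Allowed task-phase transitions from the diagram above (illustrative only).
+    # Optional phases (WaitingForResources, Queued, Initializing) may be skipped.
+    ALLOWED_TRANSITIONS = {
+        "NotReady": {"WaitingForResources", "Queued", "Initializing", "Running"},
+        "WaitingForResources": {"Queued", "Initializing", "Running"},
+        "Queued": {"Initializing", "Running"},
+        "Initializing": {"Running"},
+        "Running": {"Success", "RetryableFailure", "PermanentFailure"},
+    }
+
+    def can_transition(src: str, dst: str) -> bool:
+        # Terminal phases (Success, RetryableFailure, PermanentFailure) have no successors.
+        return dst in ALLOWED_TRANSITIONS.get(src, set())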
+
+Every ``TaskExecutionEvent`` can have any :std:ref:`taskexecution.phase `.
+
+.. note:: TODO: Add explanation for each phase.
+
+The state machine specification for the illustration can be found `here `__.
diff --git a/docs/concepts/tasks.rst b/docs/concepts/tasks.rst
new file mode 100644
index 0000000000..301287fc7b
--- /dev/null
+++ b/docs/concepts/tasks.rst
@@ -0,0 +1,123 @@
+.. _divedeep-tasks:
+
+Tasks
+=====
+
+.. tags:: Basic, Glossary
+
+Tasks are fully independent units of execution and first-class entities of Flyte.
+They are the fundamental building blocks and extension points that encapsulate the users' code.
+
+Characteristics
+---------------
+
+A Flyte task is characterized by:
+
+1. A combination of :ref:`projects ` and :ref:`domains `,
+2. A unique unicode name (we recommend that it not exceed 32 characters),
+3. A version string, and/or
+4. An *optional* task interface definition.
+
+   For tasks to exchange data with each other, a task can define a signature (much like a function/method
+   signature in programming languages). A task interface defines the input and output variables
+   (:std:ref:`variablesentry `) and their types (:std:ref:`literaltype `).
+
+Can "X" Be a Flyte Task?
+-------------------------
+
+When deciding if a unit of execution constitutes a Flyte task, consider these questions:
+
+- Is there a well-defined graceful/successful exit criterion for the task? A task is expected to exit after completing input processing.
+- Is it repeatable? Under certain circumstances, a task might be retried or rerun with the same inputs. It is expected
+  to produce the same output every single time. For example, avoid seeding random number generators with the current time; use a system-provided seed instead.
+- Is it a pure function, i.e., free of side effects that are unknown to the system (e.g., calls to an external web service)? It is recommended to avoid side effects in tasks. When side effects are unavoidable, ensure that the operations are idempotent.
+
+Dynamic Tasks
+--------------
+
+"Dynamic tasks" is a misnomer.
+Flyte is a one-of-a-kind workflow engine that ships with the concept of truly `Dynamic Workflows `__!
+Users can generate workflows in reaction to user inputs or computed values at runtime.
+These executions are evaluated to generate a static graph before execution.
+
+Extending Tasks
+---------------
+
+Plugins
+^^^^^^^
+
+Flyte exposes an extensible model to express tasks in an execution-independent language.
+It contains first-class task plugins (for example: `Papermill `__,
+`Great Expectations `__, and :ref:`more `)
+that execute Flyte tasks.
+Almost any action can be implemented and introduced into Flyte as a "plugin", including:
+
+- Tasks that run queries on distributed data warehouses like Redshift, Hive, Snowflake, etc.
+- Tasks that run executions on compute engines like Spark, Flink, AWS Sagemaker, AWS Batch, Kubernetes pods, jobs, etc.
+- Tasks that call web services.
+
+Flyte ships with certain defaults; for example, running a simple Python function does not need any hosted service. Flyte knows how to
+execute these kinds of tasks on Kubernetes. These make up the vast majority of tasks in machine learning, and Flyte is adept at
+handling enormous scale on Kubernetes, which it achieves by implementing a unique scheduler on Kubernetes.
+
+Types
+^^^^^
+
+It is impossible to define the unit of execution of a task in the same way for all tasks. Hence, Flyte allows for different task
+types in the system; the sketch below shows two of them side by side.
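+
+The following is a minimal, illustrative sketch of two task types: the default Python
+function task and a raw-container task that runs an arbitrary image with no Flytekit
+inside it. The image name and shell script are hypothetical placeholders.
+
+.. code:: python
+
+   from flytekit import ContainerTask, kwtypes, task
+
+   # Default task type: a plain Python function task.
+   @task
+   def normalize(x: float, mean: float, std: float) -> float:
+       return (x - mean) / std
+
+   # Raw-container task type: runs any image; inputs and outputs are
+   # exchanged through files in the declared data directories.
+   ellipse_area = ContainerTask(
+       name="ellipse-area",
+       image="ghcr.io/example/rawcontainer-shell:v1",  # hypothetical image
+       input_data_dir="/var/inputs",
+       output_data_dir="/var/outputs",
+       inputs=kwtypes(a=float, b=float),
+       outputs=kwtypes(area=float),
+       command=["./calculate-ellipse-area.sh", "/var/inputs", "/var/outputs"],
+   )
+
+Both are tasks to Flyte; only their task type (and hence the plugin that executes them) differs.
+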
+Flyte has a set of defined, battle-tested task types. It also allows for a flexible model to
+:std:ref:`define new types `.
+
+Inherent Features
+-----------------
+
+Fault Tolerance
+^^^^^^^^^^^^^^^
+
+In any distributed system, failure is inevitable. Allowing users to design a fault-tolerant system (e.g., a workflow) is an inherent goal of Flyte.
+At a high level, tasks offer two parameters to achieve fault tolerance:
+
+**Retries**
+
+Tasks can define a retry strategy to let the system know how to handle failures (for example: retry 3 times on any kind of error).
+
+There are two kinds of retries:
+
+1. System retry: a system-defined, recoverable failure mode that is used when system failures occur. The number of retries is validated against the configured number of system retries.
+
+.. _system-retry:
+
+System retry can be of two types:
+
+- **Downstream System Retry**: When a downstream system (or service) fails, or a remote service is not contactable, the failure is retried against the number of retries set `here `__. This performs an end-to-end system retry against the node whenever the task fails with a system error. This is useful when a downstream service throws a 500 error, the network fails abruptly, etc.
+
+- **Transient Failure Retry**: This retry mechanism offers resiliency against transient failures, which are opaque to the user. It is tracked across the entire duration of execution. It helps Flyte entities and the additional services connected to Flyte, like S3, continue operating despite a system failure. Indeed, all transient failures are handled gracefully by Flyte! Moreover, in case of a transient failure retry, Flyte does not necessarily retry the entire task. "Retrying an entire task" means that the entire pod associated with the Flyte task would be rerun with a clean slate; instead, it just retries the atomic operation. For example, Flyte tries to persist the state until it can, exhausts the max retries, and backs off.
+
+  To set a transient failure retry:
+
+  - Update `MaxWorkflowRetries `__ in the propeller configuration.
+
+  - Or update `max-workflow-retries `__ in Helm.
+
+2. User retry: If a task fails to execute, it is retried a specific number of times, and this number is set by the user in `TaskMetadata `__. The number of retries must be less than or equal to 10.
+
+.. note::
+
+   Recoverable vs. non-recoverable failures: recoverable failures will be retried and counted against the task's retry count. Non-recoverable failures will just fail, i.e., the task isn't retried regardless of user/system retry configurations. All user exceptions are considered non-recoverable unless the exception is a subclass of ``FlyteRecoverableException``.
+
+
+.. note::
+
+   `RFC 3902 `_ implements an alternative, simplified retry behaviour with which both system and user retries are counted towards a single retry budget defined in the task decorator (thus, without a second retry budget defined in the platform configuration). The last retries are always performed on non-spot instances to guarantee completion. To activate this behaviour, set ``configmap.core.propeller.node-config.ignore-retry-cause`` to ``true`` in the Helm values.
+
+
+**Timeouts**
+
+To ensure that the system is always making progress, tasks must be guaranteed to end gracefully/successfully. The system defines a default timeout period for tasks, and task authors can define their own timeout period, after which the task is marked as ``failure``. The sketch below shows how both knobs are declared.
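+
+As a minimal sketch (the task name and argument values are illustrative), flytekit exposes both fault-tolerance knobs directly on the task decorator:
+
+.. code:: python
+
+   from datetime import timedelta
+
+   from flytekit import task
+
+   # A user retry budget of 3 attempts and a 10-minute timeout per attempt;
+   # a timed-out attempt counts as a failure and consumes one retry.
+   @task(retries=3, timeout=timedelta(minutes=10))
+   def fetch_features(batch_id: int) -> dict:
+       ...
+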
+Note that a timed-out task will be retried if it has a retry strategy defined. The timeout can also be set in `TaskMetadata `__.
+
+
+Caching/Memoization
+^^^^^^^^^^^^^^^^^^^
+
+Flyte supports memoization of task outputs to ensure that identical invocations of a task are not executed repeatedly, thereby saving compute resources and execution time. For example, if you wish to run the same piece of code multiple times, you can reuse the output instead of re-computing it.
+For more information on memoization, refer to the :std:doc:`Caching Example `.
diff --git a/docs/concepts/versioning.rst b/docs/concepts/versioning.rst
new file mode 100644
index 0000000000..42df830e6c
--- /dev/null
+++ b/docs/concepts/versioning.rst
@@ -0,0 +1,104 @@
+.. _divedeep-versioning:
+
+Versions
+========
+
+.. tags:: Basic, Glossary
+
+One of the most important features of Flyte, and a reason for certain design decisions, is the need for machine learning and data practitioners to experiment.
+When users experiment, they do so in isolation and try multiple iterations.
+Unlike traditional software, users must conduct multiple experiments concurrently with different environments, algorithms, etc.
+This may happen when multiple data scientists simultaneously iterate on the same workflow/pipeline.
+
+The cost of creating an independent infrastructure for each version is enormous and undesirable.
+It is beneficial to share the same centralized infrastructure, where the burden of maintaining the infrastructure is with a central infrastructure team,
+while the users can use it independently. This reduces the cost of operation, since the same infrastructure can be reused by multiple teams.
+
+Versioned workflows help users quickly reproduce prior results or identify the source of previous successful experiments.
+
+Why Do You Need Versioning?
+---------------------------
+
+Versioning is required to:
+
+- Work on the same project concurrently and identify the version/experiment that was successful.
+- Capture the environment for a version and independently launch it.
+- Visualize prior runs and tie them to experiment results.
+- Roll back production deployments easily in case of failures.
+- Execute multiple experiments in production, which may use different training or data processing algorithms.
+- Understand how a specific system evolved and answer questions related to the effectiveness of a specific strategy.
+
+Operational Benefits of Completely Versioned Workflows/Pipelines
+-------------------------------------------------------------------
+
+The entire workflow in Flyte is versioned and all tasks and entities are immutable, which makes it possible to completely change the structure of a workflow between versions, without worrying about the consequences for the pipelines in production.
+This hermetic property makes it effortless to manage and deploy new workflow versions and is important for long-running workflows.
+If a workflow execution is in progress and a new workflow version has been activated, Flyte guarantees that the execution of the old version continues unhindered.
+
+Consider a scenario where a bug needs to be fixed and previous executions need to be rerun.
+Simply fixing the bug in the task may not solve the problem.
+Moreover, fixing bugs involves code changes, which may affect the workflow structure.
+Flyte addresses this using two properties:
+
+1. Since the entire workflow is versioned, changing the structure has no impact on the existing execution, and the workflow state won't be corrupted.
+2. Flyte provides caching/memoization of outputs. As long as the tasks and their behavior have not changed, it is possible to move them around and still recover their previous outputs, without having to rerun the tasks. This strategy works even if the workflow change is within a task.
+
+Let us take a sample workflow:
+
+.. mermaid::
+
+   graph TD;
+      A-->B;
+      B-->C;
+      C-->D;
+
+In the above graph, let us assume that task ``C`` fails. It is then possible to simply fix ``C`` and ``relaunch`` the previous execution (maintaining the same inputs, etc.). This will not re-run tasks ``A`` and ``B``, as long as they are marked with ``cache=True``.
+
+Now, let us consider that the only solution to fix the bug is to change the graph structure and introduce a new step ``B1`` that short-circuits the execution to ``D``:
+
+.. mermaid::
+
+   graph TD;
+      A-->B;
+      B-->B1;
+      B1-->D;
+      B1-->C;
+      C-->D;
+
+The same ``cache=True`` will handle this complicated situation as well.
+
+Why Is Versioning Hard?
+-----------------------
+
+Git has become the de facto standard for version control of code, making it easy to work on branches, merge them, and revert unwanted changes.
+But achieving this for a live (running) algorithm usually requires the entire infrastructure to be associated with, and potentially re-created for, every execution.
+
+How Is Versioning Tied to Reproducibility?
+------------------------------------------
+
+Workflows can be reproduced without explicit versioning within the system.
+To reproduce a past experiment, users need to identify the source code and resurrect any dependencies that the code may have used (for example, TensorFlow 1.x instead of TensorFlow 2.x, or specific Python libraries).
+They also need to instantiate the infrastructure that the previous version used. If these details are not recorded, you'll have to ensure that, say, the previously used dataset can be reconstructed.
+
+This is exactly how Flyte was conceived!
+
+In Flyte, every task is versioned, and it precisely captures the dependency set. For external tasks, memoization is recommended so that the constructed dataset can be cached on the Flyte side. This way, one can guarantee reproducible behavior from the external systems.
+
+Moreover, every piece of code is registered with the version of the code that was used to create the instance.
+Therefore, users can easily construct the data lineage for all the parts of the workflow.
+
+What Is the Cost of Versioning & Reproducibility?
+-------------------------------------------------
+
+One of the costs of versioning and allowing on-demand reproducibility is the need to re-instantiate the infrastructure from scratch.
+This may sometimes result in additional overhead. However, the advent of Docker containers and Kubernetes has made it possible to build a platform to achieve these goals.
+
+.. admonition:: Coming soon!
+
+   We are working on reducing the penalty of on-demand infrastructure creation while still maintaining the guarantees. Stay tuned!
+
+What Is the Best Way to Version Your Tasks and Workflows?
+---------------------------------------------------------
+
+The best way to version tasks and workflows is to independently version every task with the Git SHA or a hash of the entire code artifact.
+The workflows are also versioned using the Git SHA of the containing repository.
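+
+As a concrete illustration of the ``cache=True`` behavior discussed above, the following is a minimal sketch (the task body is hypothetical). Flytekit keys cached outputs on the task's inputs and on an explicit ``cache_version`` string, so bumping the version invalidates previously memoized results:
+
+.. code:: python
+
+   from flytekit import task
+
+   # Outputs are memoized per input combination and per cache_version;
+   # relaunching an execution reuses them as long as neither changes.
+   @task(cache=True, cache_version="1.0")
+   def preprocess(dataset_uri: str) -> str:
+       # ... expensive, deterministic work ...
+       return dataset_uri + ".parquet"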
diff --git a/docs/concepts/workflow_lifecycle.rst b/docs/concepts/workflow_lifecycle.rst
new file mode 100644
index 0000000000..efb11e52d8
--- /dev/null
+++ b/docs/concepts/workflow_lifecycle.rst
@@ -0,0 +1,246 @@
+.. _workflow-lifecycle:
+
+#################################################################
+Understand the Lifecycle of a Flyte Workflow
+#################################################################
+
+.. tags:: Basic, Design
+
+Let's understand how Flyte's plugin machinery works and how information flows from one component to another in Flyte.
+
+Under the hood, Flyte relies on a primitive called "plugins". Every task that you run on Flyte is powered by a plugin. Some of these plugins are native and guaranteed by the Flyte system. These native plugins, for example, run your Flyte tasks inside a k8s pod. There are three native plugins, namely, ``Container``, ``K8sPod``, and ``Sql``.
+
+Moreover, there are plugins that are actual extensions; they create additional infrastructure and communicate with SaaS services on your behalf. Examples include :ref:`Spark `, :ref:`AWS Athena `, etc.
+
+A plugin requires code that lives in multiple locations.
+
+1. Some of the plugin logic resides in Flytekit's SDK; this lets users define tasks. You can find this logic in Flytekit's Python plugins directory (https://github.com/flyteorg/flytekit/tree/master/plugins). Think of this part as a client for an RPC service or a web service.
+
+2. Another big chunk of plugin logic lives in
+   `Flyteplugins `__. This is a library that gets loaded into `FlytePropeller `__.
+   FlytePropeller (a Kubernetes operator) loads Flyteplugins upon starting.
+   FlytePropeller is aware of the plugins and of how task execution depends on them.
+   However, FlytePropeller is unaware of how these plugins are executed internally.
+
+------------
+
+To better illustrate how things work, let's take the "Spark"
+plugin as an example and walk through the sequence of steps that take place for
+it to work.
+
+The Spark plugin lets a user define a task that has access to a Spark session.
+In the background, Flyte provides all the needed infrastructure, such that by the time the declared task needs to run, the required Spark infrastructure is ready and running.
+
+1. A user writes a Python task that uses Spark (see the code below).
+
+.. code:: python
+
+   @task(
+       task_config=Spark(
+           spark_conf={
+               "spark.driver.memory": "1000M",
+               "spark.executor.instances": "2",
+               "spark.driver.cores": "1",
+           }
+       )
+   )
+   def hello_spark(i: int) -> float:
+       ...
+       ...
+
+As mentioned earlier, some of the plugin logic lives in the SDK. In this
+case, think of the ``Spark`` data class as a placeholder for all the
+Spark settings that our plugin needs to know. We need to pass this
+data across multiple places. This is the config that the Flyte operator (FlytePropeller)
+will need in order to build the needed Spark cluster. The ``Spark`` class also tells
+Flytekit's SDK that this task will run as a ``PysparkFunctionTask``,
+because ``task_config`` points to a ``Spark`` object instance; this is
+clearly illustrated `in the Spark plugin registration step run in the
+background `__.
+
+2. Once the user has finished writing the needed workflows, a packaging step
+   is required before the workflows can be run. This packaging step
+   transforms the workflows and tasks we described in Python into a protobuf
+   representation. This protobuf representation is used by Flyte across its multiple codebases. For
+   further details on the protobuf representation, check the `FlyteIdl
+   repository `__.
+The packaging step is carried out by the SDK tooling you are using.
+
+This serialization step transforms our ``hello_spark`` task into a
+protobuf representation. It also transforms other tasks, workflows,
+and launch plans into protobuf representations.
+
+Our ``hello_spark`` protobuf representation will look like the one below. A task
+is serialized as a
+`TaskTemplate `__
+as defined in ``FlyteIDL``.
+
+::
+
+   Id: Task, "example.example.hello_spark"
+   Type: "Spark"
+   Metadata:
+      runtime:
+         type: FLYTE_SDK
+         version: 1.0.3
+         flavor: python
+
+   interface:
+      inputs:
+         i :
+            type : simple:Integer
+            description: "i"
+      outputs:
+         o0:
+            type: FLOAT
+            description: o0
+   custom:
+      executorpath: "/opt/venv/bin/python3"
+      mainApplicationFile: /opt/venv/bin/entrypoint.py
+      sparkConf:
+         spark.driver.cores: 1
+         spark.executor.instances: 2
+         spark.driver.memory: 1000M
+
+
+   Container:
+      image: "hello_world:1"
+      args:
+         [
+            "pyflyte-execute"
+            "--inputs"
+            "{{.input}}"
+            "--output-prefix"
+            "{{.outputPrefix}}"
+            "--raw-output-data-prefix"
+            "{{.rawOutputDataPrefix}}"
+            "--checkpoint-path"
+            "{{.checkpointOutputPrefix}}"
+            "--prev-checkpoint"
+            "{{.prevCheckpointPrefix}}"
+            "--resolver"
+            "flytekit.core.python_auto_container.default_task_resolver"
+            "--"
+            "task-module"
+            "example.example"
+            "task-name"
+            "hello_spark"
+         ]
+
+This representation is generated within Flytekit. Essentially, the SDK is
+generating the instructions that Flyte's Kubernetes operator needs
+in order to run this task at a later stage.
+
+The ``Type`` field is really important: as we will see later, it is
+used by FlytePropeller (the Kubernetes operator) to know "how" to execute
+this task.
+
+``Interface`` contains information about the inputs and outputs
+of our task. Flyte uses this interface to check whether tasks are composable.
+
+``Custom`` is a collection of arbitrary key/values; think of it as a
+JSON dict that any plugin can define as it wishes. In this case, the
+Spark plugin expects all its particular settings in this field, e.g.,
+Spark workers, driver memory, etc.
+
+`Container `__
+is part of Flyte's IDL primitives. Essentially, any Flyte task runs as
+one of three primitives: a ``Container``, a ``K8sPod``, or ``Sql``. Every
+task contains a ``Target``, which has to be one of these. In this
+particular case, our Spark cluster is a ``Container`` target. A
+``Container`` specifies all the parameters you would in a K8s
+ContainerSpec, i.e., which Docker image to run, the command that
+will be run, the args, etc.
+
+It is important to note that Flyte expects to run in a
+container that has an entrypoint called ``pyflyte-execute``. This
+entrypoint is provided when you ``pip install flytekit``. This
+entrypoint, together with flytekit, provides a lot of the plumbing logic
+inside Flyte. For example, it is this entrypoint that automagically
+deserializes Parquet dataframes and injects them into our task's functions
+if need be.
+
+Note that a lot of parameters are surrounded
+by ``{}``; these are template variables that are rendered at
+execution time.
+
+What is important about this representation is that it contains all the
+information that Flyte's operator needs to execute this task: it
+is a ``"Spark"`` task, it has a function signature (inputs and outputs),
+it tells which Docker image to run, and finally, it tells which Spark
+settings are needed for the cluster.
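+
+The dispatch that the ``Type`` field drives can be modeled with a short, purely illustrative Python sketch. The real registry and handlers live in Go, inside Flyteplugins/FlytePropeller, so every name below is hypothetical:
+
+.. code:: python
+
+   from typing import Callable, Dict
+
+   # Hypothetical model of FlytePropeller's plugin registry:
+   # a mapping from task type to a resource-building handler.
+   PLUGIN_REGISTRY: Dict[str, Callable[[dict], str]] = {}
+
+   def register(task_type: str, build_resource: Callable[[dict], str]) -> None:
+       PLUGIN_REGISTRY[task_type] = build_resource
+
+   def build_spark_resource(custom: dict) -> str:
+       # A Spark handler would turn the ``custom`` settings into a
+       # SparkApplication resource for the Spark K8s operator.
+       return "SparkApplication(instances={})".format(custom["spark.executor.instances"])
+
+   register("Spark", build_spark_resource)
+
+   # Dispatch: pick the handler based on the TaskTemplate's Type field.
+   task_template = {"type": "Spark", "custom": {"spark.executor.instances": "2"}}
+   resource = PLUGIN_REGISTRY[task_template["type"]](task_template["custom"])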
+
+For more information on why this task contains these fields, check
+``TaskTemplate`` in the `FlyteIDL
+repository `__.
+I strongly advise you to take a look at the data structures in this file,
+as they provide good insight into the interfaces used all across Flyte's
+codebases.
+
+3. Once the user has packaged workflows and tasks, a registration step
+   is needed. During registration, Flyte adds these protocol buffer files to its
+   database, essentially making these tasks and workflows runnable for
+   the user. Registration is done via `Flytectl `__.
+
+4. At some point, a Flyte user will trigger a workflow run. The workflow
+   run will start running the defined DAG. Eventually, our Spark task
+   will need to run. This is where the second part of the plugin kicks
+   in: FlytePropeller (the Kubernetes operator) will realize that this is a
+   task of type ``Spark`` and will handle it differently.
+
+   - FlytePropeller knows a task is of type Spark because our ``TaskTemplate`` declared it so: ``Type: Spark``.
+
+   - Flyte has a ``PluginRegistry``, which holds a dictionary from ``Task Type`` to ``Plugin Handler``.
+
+   - At run time, FlytePropeller will figure out that ours is a Spark task and call the method ``BuildResource`` in Spark's plugin implementation. ``BuildResource`` is a method that each plugin has to implement.
+
+   - `Plugin `__ is a Golang interface providing an important method, ``BuildResource``.
+
+   - Spark has its own plugin, defined `here in the Flyteplugins repo `__.
+
+Spark's
+`BuildResource `__
+method is where the magic happens. At task runtime:
+
+   - FlytePropeller calls the ``BuildResource`` method. This method asks for the ``Custom`` field; tasks flagged as ``type=Spark`` will have a dictionary containing all sorts of Spark settings.
+
+   - Using these settings, FlytePropeller uses Spark's K8s operator to spawn a Spark cluster on the fly and run a Spark app (our Python task).
+
+   - The Spark app runs a pod with ``pyflyte-execute`` as its entrypoint, with all the inputs and outputs rendered to what they need to be, i.e., paths to the actual data inputs instead of ``{{.input}}``.
+
+   - For more information on Spark's K8s operator, see `SparkApplicationSpec `__.
+
+5. A pod with ``pyflyte-execute`` as its entrypoint starts running (the Spark app).
+
+   - ``pyflyte-execute`` provides all the plumbing magic that is needed. In this particular case, it creates a SparkSession and injects it so that it is ready by the time the user-defined Python code starts running. Be aware that this is part of the SDK code (Flytekit).
+
+   - ``pyflyte-execute`` points to `execute_task_cmd `__.
+
+     This entrypoint does a lot of things:
+
+     - Resolves the function that the user wants to run, i.e., locates the package where this function lives; this is what ``"flytekit.core.python_auto_container.default_task_resolver"`` does.
+
+     - Downloads the needed inputs and applies a transformation if need be, e.g., if an input is a dataframe, it is transformed from Parquet into a Pandas DataFrame.
+
+     - Calls `dispatch_execute `__, which triggers the execution of our Spark task.
+
+     - `PysparkFunctionTask `__ defines what gets run just before the user's task code is executed. It essentially creates a Spark session and then runs the user function (the actual code we want to run!).
+
+------------
+
+Recap
+-----
+
+- Flyte requires coordination between multiple pieces of code,
+  in this case the SDK and FlytePropeller (the K8s operator).
+- `Flyte IDL (Interface Definition Language) `__ provides some primitives
+  for services to talk with each other. Flyte uses Protobuf
+  representations of these primitives.
+- Three important primitives are ``Container``, ``K8sPod``, and ``Sql``.
+  At the end of the day, all tasks boil down to one of those three.
+- The github.com/flyteorg/flyteplugins repository contains all the code for plugins:
+  Spark, AWS Athena, BigQuery, and more.
+- Flyte entrypoints are the ones carrying out the heavy lifting: making
+  sure that inputs are downloaded and/or transformed as needed.
+- When running workflows on Flyte, if we want to use Flyte's underlying plumbing,
+  we should include the Flyte entrypoints: either jflyte or Flytekit.
diff --git a/docs/concepts/workflows.rst b/docs/concepts/workflows.rst
new file mode 100644
index 0000000000..78a3dce3bd
--- /dev/null
+++ b/docs/concepts/workflows.rst
@@ -0,0 +1,51 @@
+.. _divedeep-workflows:
+
+Workflows
+=========
+
+.. tags:: Basic, Glossary
+
+A workflow is a directed acyclic graph (DAG) of units of work encapsulated by :ref:`nodes `.
+Specific instantiations of a workflow (commonly bound with input arguments) are referred to as **workflow executions**,
+or just executions. In other words, a workflow is a template for an ordered task execution.
+
+Flyte workflows are defined in ``protobuf``, and the flytekit SDK facilitates writing workflows. Users can define workflows as a collection of nodes.
+Nodes within a workflow can produce outputs that subsequent nodes can consume as inputs. These dependencies dictate the structure of the workflow.
+
+Workflows written using the SDK don't need to explicitly define nodes to enclose execution units (tasks, sub-workflows, launch plans);
+they will be injected by the SDK and captured at registration time.
+
+Structure
+---------
+
+Workflows accept inputs, produce outputs, and reuse task definitions across :ref:`projects ` and :ref:`domains `. Every workflow has a default :ref:`launchplan ` with the same name as that of the workflow.
+
+Workflow structure is flexible because:
+
+- Nodes can be executed in parallel.
+- The same task definition can be re-used within a different workflow.
+- A single workflow can contain any combination of task types.
+- A workflow can contain a single functional node.
+- A workflow can contain multiple nodes in all sorts of arrangements.
+- A workflow can launch other workflows.
+
+At execution time, node executions are triggered as soon as their inputs are available.
+
+**Workflow nodes naturally run in parallel when possible**.
+For example, when a workflow has five independent nodes, i.e., when these five nodes don't consume outputs produced by other nodes,
+Flyte runs these nodes in parallel in accordance with the data and resource constraints. A sketch of such a parallel arrangement appears at the end of this page.
+
+Flyte-Specific Structure
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+During :ref:`registration `, Flyte validates the workflow structure and saves the workflow.
+The registration process updates the workflow graph.
+A compiled workflow will always have a start and end node injected into the workflow graph.
+In addition, a failure handler will catch and process execution failures.
+
+Versioning
+----------
+
+Like :ref:`tasks `, workflows are versioned too. Registered workflows are immutable, i.e., an instance of a
+workflow defined by a specific {Project, Domain, Name, Version} combination can't be updated.
+Tasks referenced in a workflow version are immutable and are tied to specific task versions.
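+
+To tie the structure concepts above together, here is a minimal, illustrative flytekit sketch (task and workflow names are hypothetical): two independent tasks whose nodes can run in parallel, followed by a task that consumes both of their outputs.
+
+.. code:: python
+
+   from flytekit import task, workflow
+
+   @task
+   def double(x: int) -> int:
+       return 2 * x
+
+   @task
+   def square(x: int) -> int:
+       return x * x
+
+   @task
+   def add(a: int, b: int) -> int:
+       return a + b
+
+   @workflow
+   def wf(x: int = 3) -> int:
+       # ``double`` and ``square`` don't depend on each other, so their
+       # nodes can run in parallel; ``add`` waits for both outputs.
+       d = double(x=x)
+       s = square(x=x)
+       return add(a=d, b=s)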
\ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000000..b8581c49bf --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,458 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/stable/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import os +import logging +import sys + +import sphinx.application +import sphinx.errors +from sphinx.util import logging as sphinx_logging + + +sys.path.insert(0, os.path.abspath("../")) +sys.path.append(os.path.abspath("./_ext")) + +sphinx.application.ExtensionError = sphinx.errors.ExtensionError + +# -- Project information ----------------------------------------------------- + +project = "Flyte" +copyright = "2022, Flyte Authors" +author = "Flyte" + +# The short X.Y version +version = "" +# The full version, including alpha/beta/rc tags +release = "1.8.0" + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.napoleon", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.autosectionlabel", + "sphinx.ext.doctest", + "sphinx.ext.inheritance_diagram", + "sphinx.ext.intersphinx", + "sphinx.ext.graphviz", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.extlinks", + "sphinx-prompt", + "sphinx_copybutton", + "sphinxext.remoteliteralinclude", + "sphinx_issues", + "sphinx_click", + "sphinx_panels", + "sphinxcontrib.mermaid", + "sphinxcontrib.video", + "sphinxcontrib.youtube", + "sphinx_tabs.tabs", + "sphinx_tags", + "myst_nb", + # custom extensions + "auto_examples", + "import_projects", +] + +source_suffix = { + ".rst": "restructuredtext", + ".md": "myst-nb", +} + +extlinks = { + "propeller": ("https://github.com/flyteorg/flytepropeller/tree/master/%s", ""), + "stdlib": ("https://github.com/flyteorg/flytestdlib/tree/master/%s", ""), + "kit": ("https://github.com/flyteorg/flytekit/tree/master/%s", ""), + "plugins": ("https://github.com/flyteorg/flyteplugins/tree/v0.1.4/%s", ""), + "idl": ("https://github.com/flyteorg/flyteidl/tree/v0.14.1/%s", ""), + "admin": ("https://github.com/flyteorg/flyteadmin/tree/master/%s", ""), + "cookbook": ("https://flytecookbook.readthedocs.io/en/latest/", None), +} + + +autosummary_generate = True +suppress_warnings = ["autosectionlabel.*"] +autodoc_typehints = "description" + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. 
+# This pattern also affects html_static_path and html_extra_path . +exclude_patterns = [ + "_build", + "Thumbs.db", + ".DS_Store", + "auto/**/*.ipynb", + "auto/**/*.py", + "auto/**/*.md", + "examples/**/*.ipynb", + "examples/**/*.py", + "jupyter_execute/**", + "README.md", + "_projects/**", + "_src/**", + "examples/**", + "flytesnacks/index.md", + "flytesnacks/bioinformatics_examples.md", + "flytesnacks/feature_engineering.md", + "flytesnacks/flyte_lab.md", + "flytesnacks/integrations.md", + "flytesnacks/ml_training.md", + "flytesnacks/tutorials.md", + "flytesnacks/userguide.md", + "flytesnacks/README.md", + "flytekit/**/README.md", + "flytekit/_templates/**", + "flytectl/index.rst", + "protos/boilerplate/**", + "protos/tmp/**", + "protos/gen/**", + "protos/docs/**/index.rst", + "protos/index.rst", + "api/flytekit/_templates/**", + "api/flytekit/index.rst", + "reference/index.rst", +] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_favicon = "images/favicon-flyte-docs.png" +html_logo = "images/favicon-flyte-docs.png" +html_theme = "furo" +html_title = "Flyte" + +templates_path = ["_templates"] + +pygments_style = "tango" +pygments_dark_style = "native" + +html_theme_options = { + "light_css_variables": { + "color-brand-primary": "#4300c9", + "color-brand-content": "#4300c9", + }, + "dark_css_variables": { + "color-brand-primary": "#9D68E4", + "color-brand-content": "#9D68E4", + }, + # custom flyteorg furo theme options + # "github_repo": "flyte", + # "github_username": "flyteorg", + # "github_commit": "master", + # "docs_path": "rsts", # path to documentation source + # "sphinx_gallery_src_dir": "cookbook", # path to directory of sphinx gallery source files relative to repo root + # "sphinx_gallery_dest_dir": "auto", # path to root directory containing auto-generated sphinx gallery examples +} + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] +html_css_files = ["custom.css", "flyte.css"] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {"**": ["logo-text.html", "globaltoc.html", "localtoc.html", "searchbox.html"]} + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = "Flytedoc" + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. 
List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, "Flyte.tex", "Flyte Documentation", "Flyte Authors", "manual"), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, "flyte", "Flyte Documentation", [author], 1)] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "Flyte", + "Flyte Documentation", + author, + "Flyte", + "Accelerate your ML and data workflows to production.", + "Miscellaneous", + ), +] + +# -- Extension configuration ------------------------------------------------- +# autosectionlabel_prefix_document = True +autosectionlabel_maxdepth = 2 + +# Tags config +tags_create_tags = True +tags_extension = ["md", "rst"] +tags_page_title = "Tag" +tags_overview_title = "Pages by Tags" + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +# intersphinx configuration. Uncomment the repeats with the local paths and update your username +# to help with local development. +intersphinx_mapping = { + "python": ("https://docs.python.org/3", None), + "numpy": ("https://numpy.org/doc/stable", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "torch": ("https://pytorch.org/docs/master/", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "matplotlib": ("https://matplotlib.org", None), + "pandera": ("https://pandera.readthedocs.io/en/stable/", None), +} + +myst_enable_extensions = ["colon_fence"] + +# Sphinx-mermaid config +mermaid_output_format = "raw" +mermaid_version = "latest" +mermaid_init_js = "mermaid.initialize({startOnLoad:false});" + +# Makes it so that only the command is copied, not the output +copybutton_prompt_text = "$ " + +# prevent css style tags from being copied by the copy button +copybutton_exclude = 'style[type="text/css"]' + +nb_execution_mode = "off" +nb_execution_excludepatterns = [ + "flytekit/**/*", + "flytesnacks/**/*", + "examples/**/*", +] + +# Pattern for removing intersphinx references from source files. +# This should handle cases like: +# +# - :ref:`cookbook:label` -> :ref:`label` +# - :ref:`Text ` -> :ref:`Text