diff --git a/cookbook/integrations/flytekit_plugins/dolt/README.rst b/cookbook/integrations/flytekit_plugins/dolt/README.rst index d166733ac..f4c0703fe 100644 --- a/cookbook/integrations/flytekit_plugins/dolt/README.rst +++ b/cookbook/integrations/flytekit_plugins/dolt/README.rst @@ -1,12 +1,11 @@ Dolt -=============================================== +==== -The `DoltTable` plugin is a wrapper that uses `Dolt <https://github.com/dolthub/dolt>`__ to move data between -`pandas.DataFrame`'s at execution time and database tables at rest. +The ``DoltTable`` plugin is a wrapper that uses `Dolt <https://github.com/dolthub/dolt>`__ to move data between +``pandas.DataFrame`` objects at execution time and database tables at rest. Installation ------------ - The dolt plugin and dolt command line tool are required to run these examples: .. code:: bash @@ -14,14 +13,14 @@ The dolt plugin and dolt command line tool are required to run these examples: pip install flytekitplugins.dolt sudo bash -c 'curl -L https://github.com/dolthub/dolt/releases/latest/download/install.sh | sudo bash' -Dolt requires a user configuration to run `init`: +Dolt requires a user configuration to run ``init``: .. code:: bash dolt config --global --add user.email <email> dolt config --global --add user.name <name> -These demos assume a `foo` database has been created locally: +These demos assume a ``foo`` database has been created locally: .. code:: bash diff --git a/cookbook/integrations/flytekit_plugins/dolt/dolt_branch_example.py b/cookbook/integrations/flytekit_plugins/dolt/dolt_branch_example.py index 2c097b40c..be785c77d 100644 --- a/cookbook/integrations/flytekit_plugins/dolt/dolt_branch_example.py +++ b/cookbook/integrations/flytekit_plugins/dolt/dolt_branch_example.py @@ -2,8 +2,7 @@ Dolt Branches ------------- -In this example we'll show you how to use DoltTable -along with Dolt's branch feature. +In this example, we'll show how to use DoltTable along with Dolt's ``Branch`` feature. 
""" import os @@ -18,18 +17,19 @@ # %% # A Simple Workflow -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ^^^^^^^^^^^^^^^^^ # We will run a simple data workflow: -# 1. Create a `users` table with `name` and `count` columns. -# 2. Filter the `users` table for users with `count > 5`. -# 3. Record the filtered user's names in a `big_users` table. +# +# 1. Create a ``users`` table with ``name`` and ``count`` columns. +# 2. Filter the ``users`` table for users with ``count > 5``. +# 3. Record the filtered users' names in a ``big_users`` table. # %% # Database Configuration -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ====================== # Let's define our database configuration. -# Our `DoltConfig`s reference a `foo` folder containing -# our database. Either a `tablename` or a `sql` select +# Our ``DoltConfig`` references a ``foo`` folder containing +# our database. Use either a ``tablename`` or a ``sql`` select # statement to fetch data. doltdb_path = os.path.join(os.path.dirname(__file__), "foo") @@ -58,21 +58,20 @@ def generate_confs(a: int) -> typing.Tuple[DoltConfig, DoltConfig, DoltConfig]: return users_conf, query_users, big_users_conf # %% -# A `DoltTable` is an extension of `DoltConfig` that wraps -# a `pandas.DataFrame` -- accessible via the `DoltTable.data` -# attribute at execution time. +# .. tip :: +# A ``DoltTable`` is an extension of ``DoltConfig`` that wraps a ``pandas.DataFrame`` -- accessible via the ``DoltTable.data`` +# attribute at execution time. # %% # Type Annotating Tasks and Workflows -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - +# =================================== # We can turn our data processing pipeline into a Flyte workflow # by decorating functions with the :py:func:`~flytekit.task` and :py:func:`~flytekit.workflow` decorators. -# Annotating the inputs and outputs of those functions with dolt schemas +# Annotating the inputs and outputs of those functions with Dolt schemas # indicates how to save and load data between tasks. 
- -# The `DoltTable.data` attribute loads dataframes for input arguments. -# Return types of `DoltTable` save the `data` to the +# +# The ``DoltTable.data`` attribute loads dataframes for input arguments. +# Return types of ``DoltTable`` save the ``data`` to the # Dolt database given a connection configuration. @task @@ -110,7 +109,20 @@ def wf(a: int) -> int: result = wf(a=a) print(f"Running wf(), returns int\n{result}\n{type(result)}") -# %% Results -# ^^^^^^^^^^^^ +# %% +# We will run this workflow twice: +# +# .. prompt:: $ +# +# python dolt_branch_example.py 2 +# +# .. prompt:: $ +# +# python dolt_branch_example.py 3 +# +# This creates distinct branches for our two ``a`` values: +# +# .. prompt:: $ # -# Output results are split between branches: +# cd foo +# dolt branch \ No newline at end of file diff --git a/cookbook/integrations/flytekit_plugins/dolt/dolt_quickstart_example.py b/cookbook/integrations/flytekit_plugins/dolt/dolt_quickstart_example.py index fbde56961..0652fa295 100644 --- a/cookbook/integrations/flytekit_plugins/dolt/dolt_quickstart_example.py +++ b/cookbook/integrations/flytekit_plugins/dolt/dolt_quickstart_example.py @@ -2,18 +2,19 @@ Quickstart ---------- -In this example we'll show you how to use DoltTable -to annotate dataframe inputs and outputs in your flyte tasks. - +In this example, we'll learn how to use ``DoltTable`` to annotate DataFrame inputs and outputs in Flyte tasks. """ +# %% +# First, let's import the libraries. import os import sys -import typing from flytekitplugins.dolt.schema import DoltConfig, DoltTable from flytekit import task, workflow import pandas as pd +# %% +# Next, we initialize Dolt's config. doltdb_path = os.path.join(os.path.dirname(__file__), "foo") rabbits_conf = DoltConfig( @@ -21,16 +22,22 @@ tablename="rabbits", ) +# %% +# We define a task to create a DataFrame and store the table in Dolt. 
@task def populate_rabbits(a: int) -> DoltTable: rabbits = [("George", a), ("Alice", a * 2), ("Sugar Maple", a * 3)] df = pd.DataFrame(rabbits, columns=["name", "count"]) return DoltTable(data=df, config=rabbits_conf) +# %% +# The ``unwrap_rabbits`` task does the exact opposite -- reading the table from Dolt and returning a DataFrame. @task def unwrap_rabbits(table: DoltTable) -> pd.DataFrame: return table.data +# %% +# Our workflow combines the above two tasks: @workflow def wf(a: int) -> pd.DataFrame: rabbits = populate_rabbits(a=a) @@ -44,3 +51,10 @@ def wf(a: int) -> pd.DataFrame: a = int(sys.argv[1]) result = wf(a=a) print(f"Running wf(), returns dataframe\n{result}\n{result.dtypes}") + +# %% +# Run this workflow by issuing the following command: +# +# .. prompt:: $ +# +# python dolt_quickstart_example.py 1