From a2363f72bc94ab2f453830fcc5a0ca1cf7ee464e Mon Sep 17 00:00:00 2001 From: SmritiSatyanV <94349093+SmritiSatyanV@users.noreply.github.com> Date: Wed, 4 May 2022 23:23:31 +0530 Subject: [PATCH] Resolving pre-commit-hook changes (#741) * Resolving pre-commit-hook changes * Resolving pre-commit-hook changes * Resolving pre-commit-hook changes * updated makefile Signed-off-by: SmritiSatyanV * Updated makefile Added spellcheck Signed-off-by: SmritiSatyanV * spellcheck Signed-off-by: SmritiSatyanV --- .pre-commit-config.yaml | 24 + Makefile | 14 +- .../bioinformatics/blast/blastx_example.py | 2 - .../feature_engineering/eda/notebook.py | 8 +- .../eda/notebook_and_task.py | 10 +- .../eda/notebooks_as_tasks.py | 14 +- .../feast_integration/Feast_Flyte_Demo.ipynb | 2 +- .../feast_integration/feast_workflow.py | 6 +- .../feast_integration/feature_eng_tasks.py | 4 +- .../house_price_predictor.py | 39 +- .../multiregion_house_price_predictor.py | 25 +- .../pytorch_single_node_and_gpu.py | 28 +- .../pytorch_single_node_multi_gpu.py | 94 ++- .../ml_training/pima_diabetes/diabetes.py | 11 +- .../keras_spark_rossmann_estimator.py | 47 +- cookbook/common/leaf.mk | 2 +- .../python/calculate-ellipse-area.py | 17 +- .../core/containerization/raw_container.py | 5 +- .../core/containerization/spot_instances.py | 2 +- cookbook/core/containerization/use_secrets.py | 69 +- cookbook/core/control_flow/chain_tasks.py | 7 +- cookbook/core/control_flow/checkpoint.py | 19 +- cookbook/core/control_flow/conditions.py | 74 +- cookbook/core/control_flow/dynamics.py | 11 +- cookbook/core/control_flow/merge_sort.py | 10 +- cookbook/core/control_flow/subworkflows.py | 51 +- cookbook/core/extend_flyte/backend_plugins.py | 4 +- .../core/extend_flyte/custom_task_plugin.py | 7 +- cookbook/core/extend_flyte/custom_types.py | 13 +- cookbook/core/flyte_basics/basic_workflow.py | 4 +- .../core/flyte_basics/decorating_tasks.py | 11 +- .../core/flyte_basics/decorating_workflows.py | 11 +- .../core/flyte_basics/documented_workflow.py | 3 +- cookbook/core/flyte_basics/files.py | 9 +- cookbook/core/flyte_basics/folders.py | 15 +- cookbook/core/flyte_basics/lp.py | 4 +- cookbook/core/flyte_basics/named_outputs.py | 2 +- cookbook/core/flyte_basics/shell_task.py | 5 +- cookbook/core/flyte_basics/task_cache.py | 30 +- .../core/flyte_basics/task_cache_serialize.py | 9 +- cookbook/core/type_system/enums.py | 5 +- cookbook/core/type_system/flyte_pickle.py | 2 +- .../core/type_system/structured_dataset.py | 17 +- cookbook/core/type_system/typed_schema.py | 4 +- cookbook/deployment/configure_use_gpus.py | 2 +- cookbook/deployment/deploying_workflows.py | 4 +- cookbook/deployment/lp_notifications.py | 15 +- cookbook/dev-requirements.in | 7 +- cookbook/dev-requirements.txt | 794 +++--------------- cookbook/docs/README.md | 2 +- cookbook/docs/conf.py | 9 +- cookbook/integrations/aws/batch/batch.py | 3 +- .../sagemaker_pytorch_distributed_training.py | 4 +- .../sagemaker_builtin_algo_training.py | 8 +- .../sagemaker_custom_training.py | 17 +- .../external_services/snowflake/snowflake.py | 21 +- .../dolt/dolt_branch_example.py | 33 +- .../dolt/dolt_quickstart_example.py | 11 +- .../greatexpectations/type_example.py | 39 +- .../modin_examples/knn_classifier.py | 4 +- .../pandera_examples/basic_schema_example.py | 40 +- .../validating_and_testing_ml_pipelines.py | 15 +- .../flytekit_plugins/sql/sql_alchemy.py | 10 +- .../kubernetes/k8s_spark/README.rst | 2 +- .../kubernetes/k8s_spark/pyspark_pi.py | 2 +- .../kubernetes/kfmpi/mpi_mnist.py | 71 +- 
.../kubernetes/kfpytorch/pytorch_mnist.py | 4 +- .../kubernetes/kftensorflow/tf_mnist.py | 15 +- cookbook/integrations/kubernetes/pod/pod.py | 10 +- cookbook/larger_apps/larger_apps_iterate.py | 2 +- 70 files changed, 790 insertions(+), 1104 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..316f1b0ee2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,24 @@ +repos: +- repo: https://github.com/PyCQA/flake8 + rev: 3.9.2 + hooks: + - id: flake8 +- repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black +- repo: https://github.com/PyCQA/isort + rev: 5.9.3 + hooks: + - id: isort + args: ["--profile", "black"] +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace +- repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.7.2.1 + hooks: + - id: shellcheck diff --git a/Makefile b/Makefile index 90aaab115f..523e3fe22f 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,19 @@ update_boilerplate: @curl https://raw.githubusercontent.com/flyteorg/boilerplate/master/boilerplate/update.sh -o boilerplate/update.sh @boilerplate/update.sh - +.PHONY: fmt +fmt: ## Format code with black and isort + pre-commit run black --all-files || true + pre-commit run isort --all-files || true + +.PHONY: lint +lint: ## Run linters + pre-commit run --all-files + +.PHONY: spellcheck +spellcheck: ## Runs a spellchecker over all code and documentation + codespell -L "te,raison,fo" --skip="./docs/build,./.git" + .PHONY: help help: ## Show help message @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[$$()% a-zA-Z_-]+:.*?##/ { printf " \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) diff --git a/cookbook/case_studies/bioinformatics/blast/blastx_example.py b/cookbook/case_studies/bioinformatics/blast/blastx_example.py index 325cf99eb8..a4b1cda5d2 100644 --- a/cookbook/case_studies/bioinformatics/blast/blastx_example.py +++ b/cookbook/case_studies/bioinformatics/blast/blastx_example.py @@ -11,12 +11,10 @@ import matplotlib.pyplot as plt import pandas as pd - from flytekit import conditional, kwtypes, task, workflow from flytekit.extras.tasks.shell import OutputLocation, ShellTask from flytekit.types.file import FlyteFile, PNGImageFile - # %% # A ``ShellTask`` is useful to run commands on the shell. # In this example, we use ``ShellTask`` to generate and run the BLASTX command. diff --git a/cookbook/case_studies/feature_engineering/eda/notebook.py b/cookbook/case_studies/feature_engineering/eda/notebook.py index 7891587347..b5b5e8ce03 100644 --- a/cookbook/case_studies/feature_engineering/eda/notebook.py +++ b/cookbook/case_studies/feature_engineering/eda/notebook.py @@ -2,8 +2,8 @@ Flyte Pipeline in One Jupyter Notebook ======================================= -In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering, and measures the Gradient -Boosting model's performace using mean absolute error (MAE), all in one notebook. +In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering, and measures the Gradient +Boosting model's performance using mean absolute error (MAE), all in one notebook. 
""" # %% @@ -15,7 +15,7 @@ from flytekitplugins.papermill import NotebookTask # %% -# We define a ``NotebookTask`` to run the `Jupyter notebook +# We define a ``NotebookTask`` to run the `Jupyter notebook # `__. # # .. list-table:: ``NotebookTask`` Parameters @@ -49,6 +49,8 @@ # %% # Since a task need not be defined, we create a ``workflow`` and return the MAE score. + + @workflow def notebook_wf( n_estimators: int = 150, diff --git a/cookbook/case_studies/feature_engineering/eda/notebook_and_task.py b/cookbook/case_studies/feature_engineering/eda/notebook_and_task.py index bed8ff8a59..163f948533 100644 --- a/cookbook/case_studies/feature_engineering/eda/notebook_and_task.py +++ b/cookbook/case_studies/feature_engineering/eda/notebook_and_task.py @@ -2,9 +2,9 @@ EDA and Feature Engineering in Jupyter Notebook and Modeling in a Flyte Task ============================================================================ -In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering -(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performace using mean absolute error (MAE) -(step 2: Modeling in a Flyte Task). +In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering +(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performance using mean absolute error (MAE) +(step 2: Modeling in a Flyte Task). """ # %% @@ -37,7 +37,7 @@ class Hyperparameters(object): # %% -# We define a ``NotebookTask`` to run the `Jupyter notebook +# We define a ``NotebookTask`` to run the `Jupyter notebook # `__. # This notebook returns ``dummified_data`` and ``dataset`` as the outputs. # @@ -55,6 +55,8 @@ class Hyperparameters(object): # %% # Next, we define a ``cross_validate`` function and a ``modeling`` task to compute the MAE score of the data against # the Gradient Boosting Regressor. + + def cross_validate(model, nfolds, feats, targets): score = -1 * ( cross_val_score( diff --git a/cookbook/case_studies/feature_engineering/eda/notebooks_as_tasks.py b/cookbook/case_studies/feature_engineering/eda/notebooks_as_tasks.py index d09b975133..7d07308269 100644 --- a/cookbook/case_studies/feature_engineering/eda/notebooks_as_tasks.py +++ b/cookbook/case_studies/feature_engineering/eda/notebooks_as_tasks.py @@ -2,9 +2,9 @@ EDA and Feature Engineering in One Jupyter Notebook and Modeling in the Other ============================================================================= -In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering -(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performace using mean absolute error -(MAE) (step 2: Modeling in notebook). +In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering +(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performance using mean absolute error +(MAE) (step 2: Modeling in notebook). """ # %% @@ -17,7 +17,7 @@ from flytekitplugins.papermill import NotebookTask # %% -# We define a ``NotebookTask`` to run the `Jupyter notebook +# We define a ``NotebookTask`` to run the `Jupyter notebook # `__ (EDA). # This notebook returns ``dummified_data`` and ``dataset`` as the outputs. 
# @@ -35,8 +35,8 @@ ) # %% -# We define a ``NotebookTask`` to run the `Jupyter notebook -# `__ +# We define a ``NotebookTask`` to run the `Jupyter notebook +# `__ # (Modeling). # # This notebook returns ``mae_score`` as the output. @@ -60,6 +60,8 @@ # %% # We define a ``Workflow`` to run the notebook tasks. + + @workflow def notebook_wf( n_estimators: int = 150, diff --git a/cookbook/case_studies/feature_engineering/feast_integration/Feast_Flyte_Demo.ipynb b/cookbook/case_studies/feature_engineering/feast_integration/Feast_Flyte_Demo.ipynb index f4724a9fc3..263e50e8bc 100644 --- a/cookbook/case_studies/feature_engineering/feast_integration/Feast_Flyte_Demo.ipynb +++ b/cookbook/case_studies/feature_engineering/feast_integration/Feast_Flyte_Demo.ipynb @@ -71,7 +71,7 @@ "source": [ "#### Retrieve the latest registered version of the pipeline\n", "\n", - "FlyteRemote provides convienient methods to retrieve version of the pipeline from the remote server.\n", + "FlyteRemote provides convenient methods to retrieve version of the pipeline from the remote server.\n", "\n", "**NOTE** It is possible to get a specific version of the workflow and trigger a launch for that, but let's just get the latest." ] diff --git a/cookbook/case_studies/feature_engineering/feast_integration/feast_workflow.py b/cookbook/case_studies/feature_engineering/feast_integration/feast_workflow.py index dc36661ab8..2b21161b0d 100644 --- a/cookbook/case_studies/feature_engineering/feast_integration/feast_workflow.py +++ b/cookbook/case_studies/feature_engineering/feast_integration/feast_workflow.py @@ -18,18 +18,19 @@ """ import logging +import random import typing # %% # Let's import the libraries. from datetime import datetime, timedelta -import random import boto3 import joblib import pandas as pd from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType -from flytekit import task, workflow, TaskMetadata, Resources +from feast_dataobjects import FeatureStore, FeatureStoreConfig # noqa : F811 +from flytekit import Resources, TaskMetadata, task, workflow from flytekit.configuration.internal import AWS from flytekit.extras.sqlite3.task import SQLite3Config, SQLite3Task from flytekit.types.file import JoblibSerializedFile @@ -37,7 +38,6 @@ from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB -from feast_dataobjects import FeatureStore, FeatureStoreConfig from .feature_eng_tasks import mean_median_imputer, univariate_selection logger = logging.getLogger(__file__) diff --git a/cookbook/case_studies/feature_engineering/feast_integration/feature_eng_tasks.py b/cookbook/case_studies/feature_engineering/feast_integration/feature_eng_tasks.py index eb4ef7cd32..faf1a66726 100644 --- a/cookbook/case_studies/feature_engineering/feast_integration/feature_eng_tasks.py +++ b/cookbook/case_studies/feature_engineering/feast_integration/feature_eng_tasks.py @@ -27,7 +27,7 @@ # %% -# We define a ``mean_median_imputer`` task to fill in the missing values of the dataset, for which we use the +# We define a ``mean_median_imputer`` task to fill in the missing values of the dataset, for which we use the # `SimpleImputer `__ class from the ``scikit-learn`` library. @task(cache=True, cache_version="1.0") def mean_median_imputer( @@ -53,7 +53,7 @@ def mean_median_imputer( # %% # Let's define the other task called ``univariate_selection`` that does feature selection. 
-# The `SelectKBest `__ method removes all +# The `SelectKBest `__ method removes all # but the highest scoring features (DataFrame columns). @task(cache=True, cache_version="1.0") def univariate_selection( diff --git a/cookbook/case_studies/ml_training/house_price_prediction/house_price_predictor.py b/cookbook/case_studies/ml_training/house_price_prediction/house_price_predictor.py index a8cc38b43a..8b5874dea7 100644 --- a/cookbook/case_studies/ml_training/house_price_prediction/house_price_predictor.py +++ b/cookbook/case_studies/ml_training/house_price_prediction/house_price_predictor.py @@ -4,12 +4,12 @@ Predicting House Price in a Region Using XGBoost ------------------------------------------------ -`XGBoost `__ is an optimized distributed gradient boosting library designed to be efficient, flexible, and portable. +`XGBoost `__ is an optimized distributed gradient boosting library designed to be efficient, flexible, and portable. It uses `gradient boosting `__ technique to implement Machine Learning algorithms. In this tutorial, we will understand how to predict house prices using XGBoost, and Flyte. -We will split the generated dataset into train, test and validation set. +We will split the generated dataset into train, test and validation set. Next, we will create three Flyte tasks, that will: @@ -17,6 +17,7 @@ 2. Train the model using XGBoost. 3. Generate predictions. + Let's get started with the example! """ @@ -30,20 +31,21 @@ # pip install joblib # pip install xgboost +import os + # %% # First, let's import the required packages into the environment. import typing +from typing import Tuple -import os import flytekit import joblib import numpy as np import pandas as pd -from sklearn.model_selection import train_test_split -from xgboost import XGBRegressor from flytekit import Resources, task, workflow from flytekit.types.file import JoblibSerializedFile -from typing import Tuple +from sklearn.model_selection import train_test_split +from xgboost import XGBRegressor # %% # We initialize a variable to represent columns in the dataset. The other variables help generate the dataset. @@ -66,6 +68,8 @@ # ===================== # # We define a function to compute the price of a house based on multiple factors (``number of bedrooms``, ``number of bathrooms``, ``area``, ``garage space``, and ``year built``). 
+ + def gen_price(house) -> int: _base_price = int(house["SQUARE_FEET"] * 150) _price = int( @@ -93,7 +97,7 @@ def gen_houses(num_houses) -> pd.DataFrame: "YEAR_BUILT": min(MAX_YEAR, int(np.random.normal(1995, 10))), } _price = gen_price(_house) - # column names/features + # column names/features _house_list.append( [ _price, @@ -105,13 +109,14 @@ def gen_houses(num_houses) -> pd.DataFrame: _house["GARAGE_SPACES"], ] ) - # convert the list to a DataFrame + # convert the list to a DataFrame _df = pd.DataFrame( _house_list, columns=COLUMNS, ) return _df + # %% # Data Preprocessing and Splitting # =================================== @@ -122,16 +127,16 @@ def split_data( ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: seed = seed - val_size = split[1] # 0.3 - test_size = split[2] # 0.1 + val_size = split[1] # 0.3 + test_size = split[2] # 0.1 num_samples = df.shape[0] # retain the features, skip the target column - x1 = df.values[:num_samples, 1:] + x1 = df.values[:num_samples, 1:] # retain the target column - y1 = df.values[:num_samples, :1] + y1 = df.values[:num_samples, :1] - # divide the features and target column into random train and test subsets, based on `test_size` + # divide the features and target column into random train and test subsets, based on `test_size` x_train, x_test, y_train, y_test = train_test_split( x1, y1, test_size=test_size, random_state=seed ) @@ -139,7 +144,7 @@ def split_data( x_train, x_val, y_train, y_val = train_test_split( x_train, y_train, - test_size=(val_size / (1 - test_size)), # here, `test_size` computes to 0.3 + test_size=(val_size / (1 - test_size)), # here, `test_size` computes to 0.3 random_state=seed, ) @@ -164,6 +169,7 @@ def split_data( ), ) + # %% # Next, we create a ``NamedTuple`` to map a variable name to its respective data type. dataset = typing.NamedTuple( @@ -175,11 +181,14 @@ def split_data( # %% # We define a task to call the aforementioned functions. + + @task(cache=True, cache_version="0.1", limits=Resources(mem="600Mi")) def generate_and_split_data(number_of_houses: int, seed: int) -> dataset: _houses = gen_houses(number_of_houses) return split_data(_houses, seed, split=SPLIT_RATIOS) + # %% # Training # ========== @@ -261,4 +270,4 @@ def house_price_predictor_trainer( # # We can run the workflow locally provided the required libraries are installed. The output would be a list of house prices, generated using the XGBoost model. if __name__ == "__main__": - print(house_price_predictor_trainer()) \ No newline at end of file + print(house_price_predictor_trainer()) diff --git a/cookbook/case_studies/ml_training/house_price_prediction/multiregion_house_price_predictor.py b/cookbook/case_studies/ml_training/house_price_prediction/multiregion_house_price_predictor.py index e1fe25097f..1c2eb9e71a 100644 --- a/cookbook/case_studies/ml_training/house_price_prediction/multiregion_house_price_predictor.py +++ b/cookbook/case_studies/ml_training/house_price_prediction/multiregion_house_price_predictor.py @@ -5,7 +5,7 @@ In this tutorial, we will understand how to predict house prices in multiple regions using XGBoost, and :ref:`dynamic workflows ` in Flyte. -We will split the generated dataset into train, test and validation set. +We will split the generated dataset into train, test and validation set. Next, we will create two dynamic workflows in Flyte, that will: @@ -25,17 +25,9 @@ # %% # We define a ``try-catch`` block to import data preprocessing functions from :ref:`here `. 
try: - from .house_price_predictor import ( - generate_and_split_data, - fit, - predict, - ) + from .house_price_predictor import fit, generate_and_split_data, predict except ImportError: - from house_price_predictor import ( - generate_and_split_data, - fit, - predict, - ) + from house_price_predictor import fit, generate_and_split_data, predict # %% # We initialize a variable to represent columns in the dataset. The other variables help generate the dataset. @@ -64,7 +56,7 @@ ] # %% -# Data Generation and Preprocessing +# Data Generation and Preprocessing # ==================================== # We call the :ref:`data generation ` and :ref:`data preprocessing ` functions to generate train, test, and validation data. # First, let's create a ``NamedTuple`` that maps variable names to their respective data types. @@ -77,13 +69,15 @@ # %% # Next, we create a :py:func:`~flytekit:flytekit.dynamic` workflow to generate and split the data for multiple regions. + + @dynamic(cache=True, cache_version="0.1", limits=Resources(mem="600Mi")) def generate_and_split_data_multiloc( locations: typing.List[str], number_of_houses_per_location: int, seed: int, ) -> dataset: - train_sets = [] # create empty lists for train, validation, and test subsets + train_sets = [] # create empty lists for train, validation, and test subsets val_sets = [] test_sets = [] for _ in locations: @@ -99,7 +93,7 @@ def generate_and_split_data_multiloc( test_sets.append( _test, ) - # split the dataset into train, validation, and test subsets + # split the dataset into train, validation, and test subsets return train_sets, val_sets, test_sets @@ -116,7 +110,7 @@ def parallel_fit_predict( multi_test: typing.List[pd.DataFrame], ) -> typing.List[typing.List[float]]: preds = [] - + # generate predictions for multiple regions for loc, train, val, test in zip(LOCATIONS, multi_train, multi_val, multi_test): model = fit(loc=loc, train=train, val=val) @@ -149,6 +143,7 @@ def multi_region_house_price_prediction_model_trainer( return predictions + # %% # Running the Model Locally # ========================== diff --git a/cookbook/case_studies/ml_training/mnist_classifier/pytorch_single_node_and_gpu.py b/cookbook/case_studies/ml_training/mnist_classifier/pytorch_single_node_and_gpu.py index a9ae93cf9b..3ebcc659bc 100644 --- a/cookbook/case_studies/ml_training/mnist_classifier/pytorch_single_node_and_gpu.py +++ b/cookbook/case_studies/ml_training/mnist_classifier/pytorch_single_node_and_gpu.py @@ -10,7 +10,7 @@ provisioned to have GPU machines, Flyte will execute the task on a node that has GPU(s). Currently, Flyte does not provide any specific task type for PyTorch (though it is entirely possible to provide a task-type -that supports *PyTorch-Ignite* or *PyTorch Lightening* support, but this is not critical). One can request for a GPU, simply +that supports *PyTorch-Ignite* or *PyTorch Lightning* support, but this is not critical). One can request for a GPU, simply by setting GPU="1" resource request and then at runtime, the GPU will be provisioned. In this example, we'll see how we can create any PyTorch model, train it using Flyte and a specialized container. The following video will outline the basics of this process. @@ -52,7 +52,10 @@ # We'll call this function in the ``pytorch_mnist_task`` defined below. 
def wandb_setup(): wandb.login() - wandb.init(project="mnist-single-node-single-gpu", entity=os.environ.get("WANDB_USERNAME", "my-user-name")) + wandb.init( + project="mnist-single-node-single-gpu", + entity=os.environ.get("WANDB_USERNAME", "my-user-name"), + ) # %% @@ -85,6 +88,7 @@ def forward(self, x): # The Data Loader # =============== + def mnist_dataloader(batch_size, train=True, **kwargs): return torch.utils.data.DataLoader( datasets.MNIST( @@ -188,9 +192,15 @@ def test(model, device, test_loader): for images, targets in test_loader: images, targets = images.to(device), targets.to(device) # device conversion outputs = model(images) # forward pass -- generate predictions - test_loss += F.nll_loss(outputs, targets, reduction="sum").item() # sum up batch loss - _, predicted = torch.max(outputs.data, 1) # get the index of the max log-probability - correct += (predicted == targets).sum().item() # compare predictions to true label + test_loss += F.nll_loss( + outputs, targets, reduction="sum" + ).item() # sum up batch loss + _, predicted = torch.max( + outputs.data, 1 + ) # get the index of the max log-probability + correct += ( + (predicted == targets).sum().item() + ) # compare predictions to true label # log predictions to the ``wandb`` table if log_counter < NUM_BATCHES_TO_LOG: @@ -206,7 +216,9 @@ def test(model, device, test_loader): accuracy = float(correct) / len(test_loader.dataset) # log the average loss, accuracy, and table - wandb.log({"test_loss": test_loss, "accuracy": accuracy, "mnist_predictions": my_table}) + wandb.log( + {"test_loss": test_loss, "accuracy": accuracy, "mnist_predictions": my_table} + ) return accuracy @@ -343,7 +355,9 @@ def pytorch_training_wf( # It is possible to run the model locally with almost no modifications (as long as the code takes care of resolving # if the code is distributed or not). This is how we can do it: if __name__ == "__main__": - model, accuracies = pytorch_training_wf(hp=Hyperparameters(epochs=10, batch_size=128)) + model, accuracies = pytorch_training_wf( + hp=Hyperparameters(epochs=10, batch_size=128) + ) print(f"Model: {model}, Accuracies: {accuracies}") # %% diff --git a/cookbook/case_studies/ml_training/mnist_classifier/pytorch_single_node_multi_gpu.py b/cookbook/case_studies/ml_training/mnist_classifier/pytorch_single_node_multi_gpu.py index dba6d33bd7..f05d92ad22 100644 --- a/cookbook/case_studies/ml_training/mnist_classifier/pytorch_single_node_multi_gpu.py +++ b/cookbook/case_studies/ml_training/mnist_classifier/pytorch_single_node_multi_gpu.py @@ -33,13 +33,15 @@ import wandb from flytekit import Resources, task, workflow from flytekit.types.file import PythonPickledFile + # %% # We'll re-use certain classes and functions from the # :ref:`single node and gpu tutorial ` # such as the ``Net`` model architecture, ``Hyperparameters``, and ``log_test_predictions``. -from mnist_classifier.pytorch_single_node_and_gpu import Net, Hyperparameters, log_test_predictions +from mnist_classifier.pytorch_single_node_and_gpu import Hyperparameters, Net, log_test_predictions from torch import distributed as dist -from torch import nn, multiprocessing as mp, optim +from torch import multiprocessing as mp +from torch import nn, optim from torchvision import datasets, transforms # %% @@ -67,7 +69,10 @@ # We'll call this function in the ``pytorch_mnist_task`` defined below. 
def wandb_setup(): wandb.login() - wandb.init(project="mnist-single-node-multi-gpu", entity=os.environ.get("WANDB_USERNAME", "my-user-name")) + wandb.init( + project="mnist-single-node-multi-gpu", + entity=os.environ.get("WANDB_USERNAME", "my-user-name"), + ) # %% @@ -95,17 +100,32 @@ def download_mnist(data_dir): # # This function will be called in the training function to be distributed across all available GPUs. Note that # we set ``download=False`` here to avoid race conditions as mentioned above. -def mnist_dataloader(data_dir, batch_size, train=True, distributed=False, rank=None, world_size=None, **kwargs): +def mnist_dataloader( + data_dir, + batch_size, + train=True, + distributed=False, + rank=None, + world_size=None, + **kwargs, +): dataset = datasets.MNIST( data_dir, train=train, download=False, - transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307), (0.3081))]), + transform=transforms.Compose( + [transforms.ToTensor(), transforms.Normalize((0.1307), (0.3081))] + ), ) if distributed: - assert rank is not None, "rank needs to be specified when doing distributed training." + assert ( + rank is not None + ), "rank needs to be specified when doing distributed training." sampler = torch.utils.data.distributed.DistributedSampler( - dataset, rank=rank, num_replicas=1 if world_size is None else world_size, shuffle=True + dataset, + rank=rank, + num_replicas=1 if world_size is None else world_size, + shuffle=True, ) else: sampler = None @@ -178,19 +198,27 @@ def test(model, rank, test_loader): with torch.no_grad(): # loop through the test data loader - total = 0. + total = 0.0 for images, targets in test_loader: total += len(targets) images, targets = images.to(rank), targets.to(rank) # device conversion outputs = model(images) # forward pass -- generate predictions - test_loss += F.nll_loss(outputs, targets, reduction="sum").item() # sum up batch loss - _, predicted = torch.max(outputs.data, 1) # get the index of the max log-probability - correct += (predicted == targets).sum().item() # compare predictions to true label + test_loss += F.nll_loss( + outputs, targets, reduction="sum" + ).item() # sum up batch loss + _, predicted = torch.max( + outputs.data, 1 + ) # get the index of the max log-probability + correct += ( + (predicted == targets).sum().item() + ) # compare predictions to true label # log predictions to the ``wandb`` table if log_counter < NUM_BATCHES_TO_LOG: if rank == 0: - log_test_predictions(images, targets, outputs, predicted, my_table, log_counter) + log_test_predictions( + images, targets, outputs, predicted, my_table, log_counter + ) log_counter += 1 # compute the average loss @@ -200,7 +228,13 @@ def test(model, rank, test_loader): if rank == 0: print("\ntest_loss={:.4f}\naccuracy={:.4f}\n".format(test_loss, accuracy)) # log the average loss, accuracy, and table - wandb.log({"test_loss": test_loss, "accuracy": accuracy, "mnist_predictions": my_table}) + wandb.log( + { + "test_loss": test_loss, + "accuracy": accuracy, + "mnist_predictions": my_table, + } + ) return accuracy @@ -223,6 +257,7 @@ def test(model, rank, test_loader): # ``dist_setup`` is a helper function that instantiates a distributed environment. We're pointing all of the # processes across all available GPUs to the address of the main process. 
+ def dist_setup(rank, world_size, backend): os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "8888" @@ -244,6 +279,7 @@ def dist_setup(rank, world_size, backend): # - save the trained model to disk # - keep track of validation metrics + def train_mnist(rank: int, world_size: int, hp: Hyperparameters): # store the hyperparameters' config in ``wandb`` if rank == 0: @@ -264,16 +300,26 @@ def train_mnist(rank: int, world_size: int, hp: Hyperparameters): kwargs = {"num_workers": 0, "pin_memory": True} if use_cuda else {} print("Getting data loaders") training_data_loader = mnist_dataloader( - DATA_DIR, hp.batch_size, train=True, distributed=use_cuda, rank=rank, world_size=world_size, **kwargs + DATA_DIR, + hp.batch_size, + train=True, + distributed=use_cuda, + rank=rank, + world_size=world_size, + **kwargs, + ) + test_data_loader = mnist_dataloader( + DATA_DIR, hp.test_batch_size, train=False, **kwargs ) - test_data_loader = mnist_dataloader(DATA_DIR, hp.test_batch_size, train=False, **kwargs) # define the distributed model and optimizer print("Defining model") model = Net().cuda(rank) model = nn.parallel.DistributedDataParallel(model, device_ids=[rank]) - optimizer = optim.SGD(model.parameters(), lr=hp.learning_rate, momentum=hp.sgd_momentum) + optimizer = optim.SGD( + model.parameters(), lr=hp.learning_rate, momentum=hp.sgd_momentum + ) # train the model: run multiple epochs and capture the accuracies for each epoch print(f"Training for {hp.epochs} epochs") @@ -349,8 +395,12 @@ def train_mnist(rank: int, world_size: int, hp: Hyperparameters): retries=2, cache=True, cache_version="1.2", - requests=Resources(gpu=gpu, mem=mem, storage=storage, ephemeral_storage=ephemeral_storage), - limits=Resources(gpu=gpu, mem=mem, storage=storage, ephemeral_storage=ephemeral_storage), + requests=Resources( + gpu=gpu, mem=mem, storage=storage, ephemeral_storage=ephemeral_storage + ), + limits=Resources( + gpu=gpu, mem=mem, storage=storage, ephemeral_storage=ephemeral_storage + ), ) def pytorch_mnist_task(hp: Hyperparameters) -> TrainingOutputs: print("Start MNIST training:") @@ -375,7 +425,9 @@ def pytorch_mnist_task(hp: Hyperparameters) -> TrainingOutputs: # %% # Finally, we define a workflow to run the training algorithm. We return the model and accuracies. @workflow -def pytorch_training_wf(hp: Hyperparameters = Hyperparameters(epochs=10, batch_size=128)) -> TrainingOutputs: +def pytorch_training_wf( + hp: Hyperparameters = Hyperparameters(epochs=10, batch_size=128) +) -> TrainingOutputs: return pytorch_mnist_task(hp=hp) @@ -386,7 +438,9 @@ def pytorch_training_wf(hp: Hyperparameters = Hyperparameters(epochs=10, batch_s # It is possible to run the model locally with almost no modifications (as long as the code takes care of resolving # if the code is distributed or not). 
This is how to do it: if __name__ == "__main__": - model, accuracies = pytorch_training_wf(hp=Hyperparameters(epochs=10, batch_size=128)) + model, accuracies = pytorch_training_wf( + hp=Hyperparameters(epochs=10, batch_size=128) + ) print(f"Model: {model}, Accuracies: {accuracies}") # %% diff --git a/cookbook/case_studies/ml_training/pima_diabetes/diabetes.py b/cookbook/case_studies/ml_training/pima_diabetes/diabetes.py index 714669a1ac..405775136c 100644 --- a/cookbook/case_studies/ml_training/pima_diabetes/diabetes.py +++ b/cookbook/case_studies/ml_training/pima_diabetes/diabetes.py @@ -10,6 +10,7 @@ import typing from collections import OrderedDict from dataclasses import dataclass +from typing import Tuple import joblib import pandas as pd @@ -20,7 +21,6 @@ from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split from xgboost import XGBClassifier -from typing import Tuple # %% # Since we are working with a specific dataset, we will create a strictly typed schema for the dataset. @@ -158,7 +158,8 @@ def fit( @task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi")) def predict( - x: FlyteSchema[FEATURE_COLUMNS], model_ser: FlyteFile[MODELSER_JOBLIB], + x: FlyteSchema[FEATURE_COLUMNS], + model_ser: FlyteFile[MODELSER_JOBLIB], ) -> FlyteSchema[CLASSES_COLUMNS]: """ Given a any trained model, serialized using joblib (this method can be shared!) and features, this method returns @@ -207,7 +208,11 @@ def diabetes_xgboost_model( x_train, x_test, y_train, y_test = split_traintest_dataset( dataset=dataset, seed=seed, test_split_ratio=test_split_ratio ) - model = fit(x=x_train, y=y_train, hyperparams=XGBoostModelHyperparams(max_depth=4),) + model = fit( + x=x_train, + y=y_train, + hyperparams=XGBoostModelHyperparams(max_depth=4), + ) predictions = predict(x=x_test, model_ser=model.model) return model.model, score(predictions=predictions, y=y_test) diff --git a/cookbook/case_studies/ml_training/spark_horovod/keras_spark_rossmann_estimator.py b/cookbook/case_studies/ml_training/spark_horovod/keras_spark_rossmann_estimator.py index 82223db53f..1c0422c05c 100644 --- a/cookbook/case_studies/ml_training/spark_horovod/keras_spark_rossmann_estimator.py +++ b/cookbook/case_studies/ml_training/spark_horovod/keras_spark_rossmann_estimator.py @@ -4,8 +4,8 @@ Data-Parallel Distributed Training Using Horovod on Spark --------------------------------------------------------- -When time- and compute-intensive deep learning workloads need to be trained efficiently, data-parallel distributed training comes to the rescue. -This technique parallelizes the data and requires sharing of weights between different worker nodes involved in the distributed training after every epoch, which ensures that all worker nodes train a consistent model. +When time- and compute-intensive deep learning workloads need to be trained efficiently, data-parallel distributed training comes to the rescue. +This technique parallelizes the data and requires sharing of weights between different worker nodes involved in the distributed training after every epoch, which ensures that all worker nodes train a consistent model. Overall, data-parallel distributed training can help speed up the execution time. In this tutorial, we will understand how data-parallel distributed training works with Flyte, Horovod, and Spark. @@ -14,7 +14,7 @@ Lastly, we will build a Keras model and perform distributed training using Horovod's `KerasEstimator API `__. 
Before executing the code, create `work_dir`, an s3 bucket. - + Let's get started with the example! """ @@ -44,16 +44,7 @@ from horovod.spark.common.store import Store from horovod.tensorflow.keras.callbacks import BestModelCheckpoint from pyspark import Row -from tensorflow.keras.layers import ( - BatchNormalization, - Concatenate, - Dense, - Dropout, - Embedding, - Flatten, - Input, - Reshape, -) +from tensorflow.keras.layers import BatchNormalization, Concatenate, Dense, Dropout, Embedding, Flatten, Input, Reshape # %% # We define two variables to represent categorical and continuous columns in the dataset. @@ -101,6 +92,8 @@ # %% # Next, let's initialize a data class to store the hyperparameters that will be used with the model (``epochs``, ``learning_rate``, ``batch_size``, etc.). + + @dataclass_json @dataclass class Hyperparameters: @@ -112,6 +105,7 @@ class Hyperparameters: local_checkpoint_file: str = "checkpoint.h5" local_submission_csv: str = "submission.csv" + # %% # Downloading the Data # ==================== @@ -155,11 +149,12 @@ def download_data(dataset: str) -> FlyteDirectory: # return the directory populated with Rossmann data files return FlyteDirectory(path=str(data_dir)) + # %% # Data Preprocessing # ===================== # -# 1. Let's start with cleaning and preparing the Google trend data. We create new 'Date' and 'State' columns using PySpark's ``withColumn``. These columns, in addition to other features, will contribute to the prediction of sales. +# 1. Let's start with cleaning and preparing the Google trend data. We create new 'Date' and 'State' columns using PySpark's ``withColumn``. These columns, in addition to other features, will contribute to the prediction of sales. def prepare_google_trend( google_trend_csv: pyspark.sql.DataFrame, ) -> pyspark.sql.DataFrame: @@ -180,6 +175,7 @@ def prepare_google_trend( # expand dates return expand_date(google_trend_all) + # %% # 2. Next, we set a few date-specific values in the DataFrame to analyze the seasonal effects on sales. def expand_date(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame: @@ -191,6 +187,7 @@ def expand_date(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame: .withColumn("Day", F.dayofmonth(df.Date)) ) + # %% # 3. We retrieve the number of days before/after a special event (such as a promo or holiday). This data helps analyze how the sales may vary before/after a special event. def add_elapsed(df: pyspark.sql.DataFrame, cols: List[str]) -> pyspark.sql.DataFrame: @@ -221,6 +218,7 @@ def fn(rows): df = rdd.toDF() return df + # %% # 4. We define a function to merge several Spark DataFrames into a single DataFrame to create training and test data. def prepare_df( @@ -233,7 +231,7 @@ def prepare_df( ) -> pyspark.sql.DataFrame: num_rows = df.count() - # expand dates + # expand dates df = expand_date(df) # create new columns in the DataFrame by filtering out special events(promo/holiday where sales was zero or store was closed). @@ -327,6 +325,7 @@ def prepare_df( assert num_rows == df.count(), "lost rows in joins" return df + # %% # 5. We build a dictionary of sorted, distinct categorical variables to create an embedding layer in our Keras model. def build_vocabulary(df: pyspark.sql.DataFrame) -> Dict[str, List[Any]]: @@ -338,6 +337,7 @@ def build_vocabulary(df: pyspark.sql.DataFrame) -> Dict[str, List[Any]]: vocab[col] = sorted(values, key=lambda x: x or default_value) return vocab + # %% # 6. Next, we cast continuous columns to float as part of data preprocessing. 
def cast_columns(df: pyspark.sql.DataFrame, cols: List[str]) -> pyspark.sql.DataFrame: @@ -345,6 +345,7 @@ def cast_columns(df: pyspark.sql.DataFrame, cols: List[str]) -> pyspark.sql.Data df = df.withColumn(col, F.coalesce(df[col].cast(T.FloatType()), F.lit(0.0))) return df + # %% # 7. Lastly, define a function that returns a list of values based on a key. def lookup_columns( @@ -360,6 +361,7 @@ def fn(v): df = df.withColumn(col, lookup(mapping)(df[col])) return df + # %% # The ``data_preparation`` function consolidates all the aforementioned data processing functions. def data_preparation( @@ -447,7 +449,7 @@ def data_preparation( .cache(), ) - # cast continuous columns to float + # cast continuous columns to float train_df = cast_columns(train_df, CONTINUOUS_COLS + ["Sales"]) # search for a key and return a list of values based on a key train_df = lookup_columns(train_df, vocab) @@ -492,6 +494,7 @@ def data_preparation( return max_sales, vocab, train_df, test_df + # %% # Training # =========== @@ -581,7 +584,7 @@ def act_sigmoid_scaled(x): # create an object of Store class store = Store.create(work_dir.remote_source) # 'SparkBackend' uses `horovod.spark.run` to execute the distributed training function, and - # returns a list of results by running 'train' on every worker in the cluster + # returns a list of results by running 'train' on every worker in the cluster backend = SparkBackend( num_proc=hp.num_proc, stdout=sys.stdout, @@ -613,7 +616,7 @@ def act_sigmoid_scaled(x): # the user would provide a Keras model to the `KerasEstimator`` # this `KerasEstimator`` will fit the data and store it in a Spark DataFrame keras_model = keras_estimator.fit(train_df).setOutputCols(["Sales_output"]) - # retrieve the model training history + # retrieve the model training history history = keras_model.getHistory() best_val_rmspe = min(history["val_exp_rmspe"]) print("Best RMSPE: %f" % best_val_rmspe) @@ -624,7 +627,8 @@ def act_sigmoid_scaled(x): "Written checkpoint to %s" % os.path.join(working_dir, hp.local_checkpoint_file) ) # the Estimator returns a Transformer representation of the trained model once training is complete - return keras_model + return keras_model + # %% # Evaluation @@ -659,6 +663,7 @@ def test( return working_dir + # %% # Defining the Spark Task # ======================== @@ -668,7 +673,7 @@ def test( # Within the task, let's call the data pre-processing, training, and evaluation functions. # # .. note:: -# +# # To set up Spark, refer to :ref:`flyte-and-spark`. # @task( @@ -710,6 +715,7 @@ def horovod_spark_task( # generate predictions return test(keras_model, working_dir, test_df, hp) + # %% # Lastly, we define a workflow to run the pipeline. @workflow @@ -723,6 +729,7 @@ def horovod_spark_wf( # work_dir corresponds to the Horovod-Spark store return horovod_spark_task(data_dir=data_dir, hp=hp, work_dir=work_dir) + # %% # Running the Model Locally # ========================== diff --git a/cookbook/common/leaf.mk b/cookbook/common/leaf.mk index da6cea8ea7..63a0be557f 100644 --- a/cookbook/common/leaf.mk +++ b/cookbook/common/leaf.mk @@ -15,7 +15,7 @@ ifdef SANDBOX # The url for Flyte Control plane export FLYTE_HOST ?= localhost:30081 -# Overrides s3 url. This is solely needed for SANDBOX deployments. Shouldn't be overriden in production AWS S3. +# Overrides s3 url. This is solely needed for SANDBOX deployments. Shouldn't be overridden in production AWS S3. export FLYTE_AWS_ENDPOINT ?= http://localhost:30084/ # Used to authenticate to s3. 
For a production AWS S3, it's discouraged to use keys and key ids. diff --git a/cookbook/core/containerization/raw-containers-supporting-files/per-language/python/calculate-ellipse-area.py b/cookbook/core/containerization/raw-containers-supporting-files/per-language/python/calculate-ellipse-area.py index a523ca1037..a932f7df60 100644 --- a/cookbook/core/containerization/raw-containers-supporting-files/per-language/python/calculate-ellipse-area.py +++ b/cookbook/core/containerization/raw-containers-supporting-files/per-language/python/calculate-ellipse-area.py @@ -1,29 +1,32 @@ import math import sys + def read_input(input_dir, v): - with open(f'{input_dir}/{v}', 'r') as f: + with open(f"{input_dir}/{v}", "r") as f: return float(f.read()) + def write_output(output_dir, output_file, v): - with open(f'{output_dir}/{output_file}', 'w') as f: + with open(f"{output_dir}/{output_file}", "w") as f: f.write(str(v)) + def calculate_area(a, b): return math.pi * a * b def main(input_dir, output_dir): - a = read_input(input_dir, 'a') - b = read_input(input_dir, 'b') + a = read_input(input_dir, "a") + b = read_input(input_dir, "b") area = calculate_area(a, b) - write_output(output_dir, 'area', area) - write_output(output_dir, 'metadata', '[from python rawcontainer]') + write_output(output_dir, "area", area) + write_output(output_dir, "metadata", "[from python rawcontainer]") -if __name__ == '__main__': +if __name__ == "__main__": input_dir = sys.argv[1] output_dir = sys.argv[2] diff --git a/cookbook/core/containerization/raw_container.py b/cookbook/core/containerization/raw_container.py index ac8fdffa35..28be59d3c4 100644 --- a/cookbook/core/containerization/raw_container.py +++ b/cookbook/core/containerization/raw_container.py @@ -12,9 +12,8 @@ Refer to the raw protocol to understand how to leverage this. """ import logging -from typing import Tuple, Any, Mapping, List, Set -from flytekit import task, workflow -from flytekit import ContainerTask, kwtypes, workflow + +from flytekit import ContainerTask, kwtypes, task, workflow logger = logging.getLogger(__file__) diff --git a/cookbook/core/containerization/spot_instances.py b/cookbook/core/containerization/spot_instances.py index 2f8639b8b3..00575922ae 100644 --- a/cookbook/core/containerization/spot_instances.py +++ b/cookbook/core/containerization/spot_instances.py @@ -34,7 +34,7 @@ # What Are Interruptible Tasks? # ============================= # -# If specified, the ``interruptible flag`` is added to the task definition and signals to the Flyte engine that it may be scheduled on machines that may be preempted, such as AWS spot instances. This is low-hanging fruit for any cost-savings initiative. +# If specified, the ``interruptible flag`` is added to the task definition and signals to the Flyte engine that it may be scheduled on machines that may be preempted, such as AWS spot instances. This is low-hanging fruit for any cost-savings initiative. # # Setting Interruptible # ^^^^^^^^^^^^^^^^^^^^^ diff --git a/cookbook/core/containerization/use_secrets.py b/cookbook/core/containerization/use_secrets.py index 94abb295a6..77e98599c5 100644 --- a/cookbook/core/containerization/use_secrets.py +++ b/cookbook/core/containerization/use_secrets.py @@ -22,10 +22,10 @@ 2. Flyte will apply labels and annotations that are referenced to all secrets the task is requesting access to. 3. Flyte will send a POST request to ApiServer to create the object. 4. Before persisting the Pod, ApiServer will invoke all registered Pod Webhooks. Flyte's Pod Webhook will be called. 
-5. Flyte Pod Webhook will then, using the labels and annotiations attached in step 2, lookup globally mounted secrets for each of the requested secrets. +5. Flyte Pod Webhook will then, using the labels and annotiations attached in step 2, lookup globally mounted secrets for each of the requested secrets. 6. If found, Pod Webhook will mount them directly in the Pod. If not found, it will inject the appropriate annotations to load the secrets for K8s (or Vault or Confidant or any other secret management system plugin configured) into the task pod. -Once the secret is injected into the task pod, Flytekit can read it using the secret manager (see examples below). +Once the secret is injected into the task pod, Flytekit can read it using the secret manager (see examples below). The webhook is included in all overlays in the Flytekit repo. The deployment file creates (mainly) two things; a Job and a Deployment. @@ -35,41 +35,41 @@ Secret Discovery ---------------- -Flyte identifies secrets using a secret group and a secret key. +Flyte identifies secrets using a secret group and a secret key. In a task decorator you request a secret like this: ``@task(secret_requests=[Secret(group=SECRET_GROUP, key=SECRET_NAME)])`` Flytekit provides a shorthand for loading the requested secret inside a task: ``secret = flytekit.current_context().secrets.get(SECRET_GROUP, SECRET_NAME)`` -See the python examples further down for more details on how to request and use secrets in a task. +See the python examples further down for more details on how to request and use secrets in a task. Flytekit relies on the following environment variables to load secrets (defined `here `_). When running tasks and workflows locally you should make sure to store your secrets accordingly or to modify these: - FLYTE_SECRETS_DEFAULT_DIR - The directory Flytekit searches for secret files, default: "/etc/secrets" - FLYTE_SECRETS_FILE_PREFIX - a common file prefix for Flyte secrets, default: "" - FLYTE_SECRETS_ENV_PREFIX - a common env var prefix for Flyte secrets, default: "_FSEC_" -When running a workflow on a Flyte cluster, the configured secret manager will use the secret Group and Key to try and retrieve a secret. +When running a workflow on a Flyte cluster, the configured secret manager will use the secret Group and Key to try and retrieve a secret. If successful, it will make the secret available as either file or environment variable and will if necessary modify the above variables automatically so that the task can load and use the secrets. Configuring a secret management system plugin into use ------------------------------------------------------ -When a task requests a secret Flytepropeller will try to retrieve secrets in the following order: 1.) checking for global secrets (secrets mounted as files or environment variables on the flyte-pod-webhook pod) and 2.) checking with an additional configurable secret manager. -Note that the global secrets take precedence over any secret discoverable by the secret manager plugins. +When a task requests a secret Flytepropeller will try to retrieve secrets in the following order: 1.) checking for global secrets (secrets mounted as files or environment variables on the flyte-pod-webhook pod) and 2.) checking with an additional configurable secret manager. +Note that the global secrets take precedence over any secret discoverable by the secret manager plugins. 
-The following additional secret managers are available at the time of writing: +The following additional secret managers are available at the time of writing: - `K8s secrets `_ (default) - flyte-pod-webhook will try to look for a K8s secret named after the secret Group and retrieve the value for the secret Key. - AWS Secret Manager - flyte-pod-webhook will add the AWS Secret Manager sidecar container to a task Pod which will mount the secret. - `Vault Agent Injector `_ - flyte-pod-webhook will annotate the task Pod with the respective Vault annotations that trigger an existing Vault Agent Injector to retrieve the specified secret Key from a vault path defined as secret Group. -You can configure the additional secret manager by defining `secretManagerType` to be either 'K8s', 'AWS' or 'Vault' in +You can configure the additional secret manager by defining `secretManagerType` to be either 'K8s', 'AWS' or 'Vault' in the `core config ` of the Flytepropeller. When using the K8s secret manager plugin (enabled by default), the secrets need to be available in the same namespace as the task execution -(for example `flytesnacks-development`). K8s secrets can be mounted as either files or injected as environment variables into the task pod, -so if you need to make larger files available to the task, then this might be the better option. -Furthermore, this method also allows you to have separate credentials for different domains but still using the same name for the secret. -The `group` of the secret request corresponds to the K8s secret name, while the `name` of the request corresponds to the key of the specific entry in the secret. +(for example `flytesnacks-development`). K8s secrets can be mounted as either files or injected as environment variables into the task pod, +so if you need to make larger files available to the task, then this might be the better option. +Furthermore, this method also allows you to have separate credentials for different domains but still using the same name for the secret. +The `group` of the secret request corresponds to the K8s secret name, while the `name` of the request corresponds to the key of the specific entry in the secret. When using the Vault secret manager, make sure you have Vault Agent deployed on your cluster (`step-by-step tutorial `_). -Vault secrets can only be mounted as files and will become available under "/etc/flyte/secrets/SECRET_GROUP/SECRET_NAME". Vault comes with `two versions `_ of the key-value secret store. +Vault secrets can only be mounted as files and will become available under "/etc/flyte/secrets/SECRET_GROUP/SECRET_NAME". Vault comes with `two versions `_ of the key-value secret store. By default the Vault secret manager will try to retrieve Version 2 secrets. You can specify the KV version by setting webhook.vaultSecretManager.kvVersion in the configmap. Note that the version number needs to be an explicit string (e.g. "1"). You can also configure the Vault role under which Flyte will try to read the secret by setting webhook.vaultSecretManager.role (default: "flyte"). @@ -84,12 +84,14 @@ # %% import os -import flytekit from typing import Tuple +import flytekit + # %% # Flytekit exposes a type/class called Secrets. It can be imported as follows. from flytekit import Secret, task, workflow +from flytekit.testing import SecretsManager # %% # Secrets consists of a name and an enum that indicates how the secrets will be accessed. If the mounting_requirement is @@ -106,10 +108,10 @@ # %% -# Now declare the secret in the requests. 
The request tells Flyte to make the secret available to the task. The secret can -# then be accessed inside the task using the :py:class:`flytekit.ExecutionParameters`, through the global flytekit -# context as shown below. At runtime, flytekit looks inside the task pod for an environment variable or a mounted file with -# a predefined name/path and loads the value. +# Now declare the secret in the requests. The request tells Flyte to make the secret available to the task. The secret can +# then be accessed inside the task using the :py:class:`flytekit.ExecutionParameters`, through the global flytekit +# context as shown below. At runtime, flytekit looks inside the task pod for an environment variable or a mounted file with +# a predefined name/path and loads the value. @task(secret_requests=[Secret(group=SECRET_GROUP, key=SECRET_NAME)]) def secret_task() -> str: secret_val = flytekit.current_context().secrets.get(SECRET_GROUP, SECRET_NAME) @@ -138,9 +140,15 @@ def secret_task() -> str: # %% # The Secret structure allows passing two fields, matching the key and the group, as previously described: @task( - secret_requests=[Secret(key=USERNAME_SECRET, group=SECRET_GROUP), Secret(key=PASSWORD_SECRET, group=SECRET_GROUP)]) + secret_requests=[ + Secret(key=USERNAME_SECRET, group=SECRET_GROUP), + Secret(key=PASSWORD_SECRET, group=SECRET_GROUP), + ] +) def user_info_task() -> Tuple[str, str]: - secret_username = flytekit.current_context().secrets.get(SECRET_GROUP, USERNAME_SECRET) + secret_username = flytekit.current_context().secrets.get( + SECRET_GROUP, USERNAME_SECRET + ) secret_pwd = flytekit.current_context().secrets.get(SECRET_GROUP, PASSWORD_SECRET) # Please do not print the secret value, this is just a demonstration. print(f"{secret_username}={secret_pwd}") @@ -153,7 +161,15 @@ def user_info_task() -> Tuple[str, str]: # keys (certs etc). Another reason may be that a dependent library necessitates that the secret be available as a file. # In these scenarios you can specify the mount_requirement. In the following example we force the mounting to be # an Env variable -@task(secret_requests=[Secret(group=SECRET_GROUP, key=SECRET_NAME, mount_requirement=Secret.MountType.ENV_VAR)]) +@task( + secret_requests=[ + Secret( + group=SECRET_GROUP, + key=SECRET_NAME, + mount_requirement=Secret.MountType.ENV_VAR, + ) + ] +) def secret_file_task() -> Tuple[str, str]: # SM here is a handle to the secrets manager sm = flytekit.current_context().secrets @@ -176,13 +192,16 @@ def my_secret_workflow() -> Tuple[str, str, str, str, str]: # %% # The simplest way to test Secret accessibility is to export the secret as an environment variable. 
There are some # helper methods available to do so -from flytekit.testing import SecretsManager if __name__ == "__main__": sec = SecretsManager() os.environ[sec.get_secrets_env_var(SECRET_GROUP, SECRET_NAME)] = "value" - os.environ[sec.get_secrets_env_var(SECRET_GROUP, USERNAME_SECRET)] = "username_value" - os.environ[sec.get_secrets_env_var(SECRET_GROUP, PASSWORD_SECRET)] = "password_value" + os.environ[ + sec.get_secrets_env_var(SECRET_GROUP, USERNAME_SECRET) + ] = "username_value" + os.environ[ + sec.get_secrets_env_var(SECRET_GROUP, PASSWORD_SECRET) + ] = "password_value" x, y, z, f, s = my_secret_workflow() assert x == "value" assert y == "username_value" diff --git a/cookbook/core/control_flow/chain_tasks.py b/cookbook/core/control_flow/chain_tasks.py index 4e5adcfe8d..1f35973815 100644 --- a/cookbook/core/control_flow/chain_tasks.py +++ b/cookbook/core/control_flow/chain_tasks.py @@ -9,15 +9,18 @@ In this example, let's enforce an order for ``read()`` to happen after ``write()``. """ +import pandas as pd + # %% # First, we import the necessary dependencies. from flytekit import task, workflow from flytekit.core.node_creation import create_node -import pandas as pd DATABASE = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv" # %% # We define a ``read()`` task to read from the file. + + @task def read() -> pd.DataFrame: data = pd.read_csv(DATABASE) @@ -29,7 +32,7 @@ def read() -> pd.DataFrame: @task def write(): # dummy code - df = pd.DataFrame( + df = pd.DataFrame( # noqa : F841 data={ "sepal_length": [5.3], "sepal_width": [3.8], diff --git a/cookbook/core/control_flow/checkpoint.py b/cookbook/core/control_flow/checkpoint.py index dda62e06a0..7709ea9982 100644 --- a/cookbook/core/control_flow/checkpoint.py +++ b/cookbook/core/control_flow/checkpoint.py @@ -4,7 +4,7 @@ .. note:: - This feature is available from Flytekit version 0.30.0b6+ and needs a Flyte backend version of atleast 0.19.0+. + This feature is available from Flytekit version 0.30.0b6+ and needs a Flyte backend version of at least 0.19.0+. A checkpoint recovers a task from a previous failure by recording the state of a task before the failure and resuming from the latest recorded state. @@ -39,18 +39,17 @@ significant fault-tolerance to ensure successful completion. But as the time for a task increases, the cost of re-running it increases, and reduces the chances of successful -completion. This is where Flyte's intra-task checkpointing truly shines. +completion. This is where Flyte's intra-task checkpointing truly shines. Let's look at an example of how to develop tasks which utilize intra-task checkpointing. It only provides the low-level API, though. We intend to integrate higher-level checkpointing APIs available in popular training frameworks like Keras, Pytorch, Scikit-learn, and big-data frameworks like Spark and Flink to supercharge their fault-tolerance. """ -from flytekit import task, workflow, current_context +from flytekit import current_context, task, workflow from flytekit.exceptions.user import FlyteRecoverableException - -RETRIES=3 +RETRIES = 3 # %% @@ -75,7 +74,9 @@ def use_checkpoint(n_iterations: int) -> int: # simulate a deterministic failure, for demonstration. 
We want to show how it eventually completes within # the given retries if i > start and i % failure_interval == 0: - raise FlyteRecoverableException(f"Failed at iteration {start}, failure_interval {failure_interval}") + raise FlyteRecoverableException( + f"Failed at iteration {start}, failure_interval {failure_interval}" + ) # save progress state. It is also entirely possible save state every few intervals. cp.write(f"{i + 1}".encode()) @@ -83,7 +84,7 @@ def use_checkpoint(n_iterations: int) -> int: # %% -# The workflow here simply calls the task. The task itself +# The workflow here simply calls the task. The task itself # will be retried for the :ref:`FlyteRecoverableException `. # @workflow @@ -91,11 +92,11 @@ def example(n_iterations: int) -> int: return use_checkpoint(n_iterations=n_iterations) -#%% +# %% # The checkpoint is stored locally, but it is not used since retries are not supported. if __name__ == "__main__": try: example(n_iterations=10) - except RuntimeError as e: + except RuntimeError as e: # noqa : F841 # no retries are performed, so an exception is expected when run locally. pass diff --git a/cookbook/core/control_flow/conditions.py b/cookbook/core/control_flow/conditions.py index 944ffaee10..05d0507136 100644 --- a/cookbook/core/control_flow/conditions.py +++ b/cookbook/core/control_flow/conditions.py @@ -47,10 +47,10 @@ def double(n: float) -> float: def multiplier(my_input: float) -> float: return ( conditional("fractions") - .if_((my_input >= 0.1) & (my_input <= 1.0)) - .then(double(n=my_input)) - .else_() - .then(square(n=my_input)) + .if_((my_input >= 0.1) & (my_input <= 1.0)) + .then(double(n=my_input)) + .else_() + .then(square(n=my_input)) ) @@ -73,12 +73,12 @@ def multiplier(my_input: float) -> float: def multiplier_2(my_input: float) -> float: return ( conditional("fractions") - .if_((my_input > 0.1) & (my_input < 1.0)) - .then(double(n=my_input)) - .elif_((my_input > 1.0) & (my_input <= 10.0)) - .then(square(n=my_input)) - .else_() - .fail("The input must be between 0 and 10") + .if_((my_input > 0.1) & (my_input < 1.0)) + .then(double(n=my_input)) + .elif_((my_input > 1.0) & (my_input <= 10.0)) + .then(square(n=my_input)) + .else_() + .fail("The input must be between 0 and 10") ) @@ -94,12 +94,12 @@ def multiplier_2(my_input: float) -> float: def multiplier_3(my_input: float) -> float: result = ( conditional("fractions") - .if_((my_input > 0.1) & (my_input < 1.0)) - .then(double(n=my_input)) - .elif_((my_input > 1.0) & (my_input < 10.0)) - .then(square(n=my_input)) - .else_() - .fail("The input must be between 0 and 10") + .if_((my_input > 0.1) & (my_input < 1.0)) + .then(double(n=my_input)) + .elif_((my_input > 1.0) & (my_input < 10.0)) + .then(square(n=my_input)) + .else_() + .fail("The input must be between 0 and 10") ) # the 'result' will either be the output of `double` or `square`. If none of the conditions is true, @@ -119,7 +119,7 @@ def multiplier_3(my_input: float) -> float: # # .. note:: # -# How do output values get these methods? +# How do output values get these methods? # In a workflow, no output can be accessed directly. The inputs and outputs are auto-wrapped in a special object called :py:class:`flytekit.extend.Promise`. # # In this example, we create a biased coin whose seed can be controlled. 
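A rough sketch of that pattern (the task and workflow names ``coin_toss``, ``heads_branch``, ``tails_branch``, and ``branch_on_bool`` are illustrative, not part of the patched file) shows how a conditional can branch on a task's boolean output via ``.is_true()``::

    import random

    from flytekit import conditional, task, workflow


    @task
    def coin_toss(seed: int) -> bool:
        # a seeded, biased coin: heads roughly 70% of the time
        random.seed(seed)
        return random.random() < 0.7


    @task
    def heads_branch() -> int:
        return 1


    @task
    def tails_branch() -> int:
        return 0


    @workflow
    def branch_on_bool(seed: int = 5) -> int:
        outcome = coin_toss(seed=seed)
        return (
            conditional("coin_branch")
            .if_(outcome.is_true())
            .then(heads_branch())
            .else_()
            .then(tails_branch())
        )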
@@ -179,7 +179,9 @@ def bool_input_wf(b: bool) -> int: print("Running basic_boolean_wf a few times") for i in range(0, 5): print(f"Basic boolean wf output {basic_boolean_wf()}") - print(f"Boolean input {True if i < 2 else False}, workflow output {bool_input_wf(b=True if i < 2 else False)}") + print( + f"Boolean input {True if i < 2 else False}, workflow output {bool_input_wf(b=True if i < 2 else False)}" + ) # %% @@ -192,20 +194,20 @@ def bool_input_wf(b: bool) -> int: def nested_conditions(my_input: float) -> float: return ( conditional("fractions") - .if_((my_input > 0.1) & (my_input < 1.0)) - .then( + .if_((my_input > 0.1) & (my_input < 1.0)) + .then( conditional("inner_fractions") - .if_(my_input < 0.5) - .then(double(n=my_input)) - .elif_((my_input > 0.5) & (my_input < 0.7)) - .then(square(n=my_input)) - .else_() - .fail("Only <0.7 allowed") - ) - .elif_((my_input > 1.0) & (my_input < 10.0)) + .if_(my_input < 0.5) + .then(double(n=my_input)) + .elif_((my_input > 0.5) & (my_input < 0.7)) .then(square(n=my_input)) .else_() - .then(double(n=my_input)) + .fail("Only <0.7 allowed") + ) + .elif_((my_input > 1.0) & (my_input < 10.0)) + .then(square(n=my_input)) + .else_() + .then(double(n=my_input)) ) @@ -246,10 +248,10 @@ def consume_outputs(my_input: float, seed: int = 5) -> float: is_heads = coin_toss(seed=seed) res = ( conditional("double_or_square") - .if_(is_heads.is_true()) - .then(square(n=my_input)) - .else_() - .then(calc_sum(a=my_input, b=my_input)) + .if_(is_heads.is_true()) + .then(square(n=my_input)) + .else_() + .then(calc_sum(a=my_input, b=my_input)) ) # Regardless of the result, call ``double`` before @@ -261,6 +263,8 @@ def consume_outputs(my_input: float, seed: int = 5) -> float: # The workflow can be executed locally. if __name__ == "__main__": print( - f"consume_outputs(0.4) with default seed=5. This should return output of calc_sum => {consume_outputs(my_input=0.4)}") + f"consume_outputs(0.4) with default seed=5. This should return output of calc_sum => {consume_outputs(my_input=0.4)}" + ) print( - f"consume_outputs(0.4, seed=7), this should return output of square => {consume_outputs(my_input=0.4, seed=7)}") + f"consume_outputs(0.4, seed=7), this should return output of square => {consume_outputs(my_input=0.4, seed=7)}" + ) diff --git a/cookbook/core/control_flow/dynamics.py b/cookbook/core/control_flow/dynamics.py index 1b16c89199..6e3fae6e88 100644 --- a/cookbook/core/control_flow/dynamics.py +++ b/cookbook/core/control_flow/dynamics.py @@ -28,6 +28,7 @@ from flytekit import dynamic, task, workflow + # %% # Next, we write a task that returns the index of a character (A-Z/a-z is equivalent to 0 to 25). @task @@ -79,8 +80,8 @@ def derive_count(freq1: typing.List[int], freq2: typing.List[int]) -> int: # At execution (run) time, Flytekit runs the compilation step, and produces # a ``WorkflowTemplate`` (from the dynamic workflow), which Flytekit then passes back to Flyte Propeller for further running, exactly how sub-workflows are handled. # -# .. note:: -# The dynamic pattern isn't the most efficient method to iterate over a list. `Map tasks `_ +# .. note:: +# The dynamic pattern isn't the most efficient method to iterate over a list. `Map tasks `_ # might be more efficient in certain cases. But they only work for Python tasks (tasks decorated with the @task decorator) not SQL/Spark/etc,. # # We now define a dynamic workflow that encapsulates the above mentioned points. 
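For comparison with the dynamic version that follows, the map-task alternative mentioned in the note above can look roughly like this sketch (``double`` and ``map_wf`` are illustrative names, assuming flytekit's ``map_task`` helper)::

    from typing import List

    from flytekit import map_task, task, workflow


    @task
    def double(n: int) -> int:
        return n * 2


    @workflow
    def map_wf(numbers: List[int]) -> List[int]:
        # fans `double` out over every element; the backend can run the copies in parallel
        return map_task(double)(n=numbers)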
@@ -91,21 +92,21 @@ def count_characters(s1: str, s2: str) -> int: # s1 and s2 are accessible - # initiliaze an empty list consisting of 26 empty slots corresponding to every alphabet (lower and upper case) + # initialize an empty list consisting of 26 empty slots corresponding to every alphabet (lower and upper case) freq1 = [0] * 26 freq2 = [0] * 26 # looping through the string s1 for i in range(len(s1)): - # index and freq1 are not accesible as they are promises + # index and freq1 are not accessible as they are promises index = return_index(character=s1[i]) freq1 = update_list(freq_list=freq1, list_index=index) # looping through the string s2 for i in range(len(s2)): - # index and freq2 are not accesible as they are promises + # index and freq2 are not accessible as they are promises index = return_index(character=s2[i]) freq2 = update_list(freq_list=freq2, list_index=index) diff --git a/cookbook/core/control_flow/merge_sort.py b/cookbook/core/control_flow/merge_sort.py index 59a603607f..ebda6a9ffd 100644 --- a/cookbook/core/control_flow/merge_sort.py +++ b/cookbook/core/control_flow/merge_sort.py @@ -15,9 +15,9 @@ import typing from datetime import datetime from random import random, seed +from typing import Tuple from flytekit import conditional, dynamic, task, workflow -from typing import Tuple # seed random number generator seed(datetime.now().microsecond) @@ -26,11 +26,12 @@ # %% # A simple split function that divides a list into two halves. + @task def split(numbers: typing.List[int]) -> Tuple[typing.List[int], typing.List[int], int]: return ( - numbers[0:int(len(numbers) / 2)], - numbers[int(len(numbers) / 2):], + numbers[0 : int(len(numbers) / 2)], + numbers[int(len(numbers) / 2) :], int(len(numbers) / 2), ) @@ -124,8 +125,7 @@ def generate_inputs(numbers_count: int) -> typing.List[int]: # %% # The entire workflow can be executed locally as follows... if __name__ == "__main__": - print(f"Running Merge Sort Locally...") count = 20 x = generate_inputs(count) print(x) - print(merge_sort(numbers=x, numbers_count=count)) + print(f"Running Merge Sort Locally...{merge_sort(numbers=x, numbers_count=count)}") diff --git a/cookbook/core/control_flow/subworkflows.py b/cookbook/core/control_flow/subworkflows.py index 30aad652fa..d4fbf93ca1 100644 --- a/cookbook/core/control_flow/subworkflows.py +++ b/cookbook/core/control_flow/subworkflows.py @@ -2,21 +2,21 @@ Subworkflows ------------ -Subworkflows are similar to :ref:`launch plans `, since they allow users to kick off one workflow from inside another. +Subworkflows are similar to :ref:`launch plans `, since they allow users to kick off one workflow from inside another. -What's the difference? +What's the difference? Think of launch plans as pass by pointer and subworkflows as pass by value. .. note:: - The reason why subworkflows exist is that this is how Flyte handles dynamic workflows. + The reason why subworkflows exist is that this is how Flyte handles dynamic workflows. Instead of hiding this functionality, we expose it at the user level. There are pros and cons of using subworkflows as described below. When Should I Use SubWorkflows? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If you want to limit parallelism within a workflow and its launched sub-flows, subworkflows provide a clean way -to achieve that because they execute within the same context of the parent workflow. +to achieve that because they execute within the same context of the parent workflow. 
Thus, all nodes of a subworkflow are constrained to the total constraint on the parent workflow. Consider this: When you include Workflow A as a subworkflow of Workflow B, and when Workflow B is run, the entire graph of workflow A is @@ -31,6 +31,7 @@ # We import the required dependencies into the environment. import typing from typing import Tuple + from flytekit import task, workflow # %% @@ -38,10 +39,12 @@ # We usually try and define ``NamedTuple`` as a distinct type as a best practice (although it can be defined inline). op = typing.NamedTuple("OutputsBC", t1_int_output=int, c=str) + @task def t1(a: int) -> op: return op(a + 2, "world") + # %% # Then we define a subworkflow like a typical workflow that can run like any other workflow. @workflow @@ -50,19 +53,21 @@ def my_subwf(a: int = 42) -> Tuple[str, str]: u, v = t1(a=x) return y, v + # %% # We call the workflow declared above in a `parent` workflow below -# which showcases how to override the node name of a task (or subworkflow in this case). -# +# which showcases how to override the node name of a task (or subworkflow in this case). +# # Typically, nodes are just named sequentially: ``n0``, ``n1``, and so on. Since the inner ``my_subwf`` also has a ``n0``, you may # wish to change the name of the first one. Not changing the name is fine because Flyte automatically prepends an attribute -# to the inner ``n0`` since node IDs must be distinct within a workflow graph. +# to the inner ``n0`` since node IDs must be distinct within a workflow graph. @workflow def parent_wf(a: int) -> Tuple[int, str, str]: x, y = t1(a=a).with_overrides(node_name="node-t1-parent") u, v = my_subwf(a=x) return x, u, v + # %% # .. note:: # The with_overrides method provides a new name to the graph-node for better rendering or readability. @@ -95,28 +100,29 @@ def nested_parent_wf(a: int) -> Tuple[int, str, str, str]: # # When launch plans are used within a workflow to launch the execution of a previously defined workflow, a new # external execution is launched, with a separate execution ID and can be observed as a distinct entity in -# FlyteConsole/Flytectl. +# FlyteConsole/Flytectl. # # They may have separate parallelism constraints since the context is not shared. # We refer to such external invocations of a workflow using launch plans from a parent workflow as ``External Workflows``. # # .. tip:: -# +# # If your deployment uses :ref:`multicluster-setup `, then external workflows may allow you to distribute the workload of a workflow to multiple clusters. # # Here is an example demonstrating external workflows: # %% # We import the required dependencies into the environment. -import typing -from typing import Tuple, Dict -from flytekit import dynamic,task, workflow -from flytekit import conditional, task, workflow, LaunchPlan +import typing # noqa: E402 +from collections import Counter # noqa: E402 +from typing import Dict, Tuple # noqa: E402 -from collections import Counter +from flytekit import LaunchPlan, task, workflow # noqa: E402 # %% # We define a task that computes the frequency of every word in a string, and returns a dictionary mapping every word to its count. + + @task def count_freq_words(input_string1: str) -> Dict: # input_string = "The cat sat on the mat" @@ -124,6 +130,7 @@ def count_freq_words(input_string1: str) -> Dict: wordCount = dict(Counter(words)) return wordCount + # %% # We define a workflow that executes the previously defined task. 
@workflow @@ -131,27 +138,35 @@ def ext_workflow(my_input: str) -> Dict: result = count_freq_words(input_string1=my_input) return result + # %% # Next, we create a launch plan. -external_lp = LaunchPlan.get_or_create(ext_workflow, "parent_workflow_execution",) +external_lp = LaunchPlan.get_or_create( + ext_workflow, + "parent_workflow_execution", +) # %% # We define another task that returns the repeated keys (in our case, words) from a dictionary. + + @task def count_repetitive_words(word_counter: Dict) -> typing.List[str]: repeated_words = [key for key, value in word_counter.items() if value > 1] return repeated_words + # %% # We define a workflow that triggers the launch plan of the previously-defined workflow. @workflow def parent_workflow(my_input1: str) -> typing.List[str]: - my_op1 = external_lp(my_input=my_input1) - my_op2 = count_repetitive_words(word_counter = my_op1) + my_op1 = external_lp(my_input=my_input1) + my_op2 = count_repetitive_words(word_counter=my_op1) return my_op2 + # %% # Here, ``parent_workflow`` is an external workflow. This can be run locally too. if __name__ == "__main__": print("Running parent workflow...") - print(parent_workflow(my_input1= "the cat took the apple and ate the apple")) \ No newline at end of file + print(parent_workflow(my_input1="the cat took the apple and ate the apple")) diff --git a/cookbook/core/extend_flyte/backend_plugins.py b/cookbook/core/extend_flyte/backend_plugins.py index 9ca2917dfa..8633471f4c 100644 --- a/cookbook/core/extend_flyte/backend_plugins.py +++ b/cookbook/core/extend_flyte/backend_plugins.py @@ -29,7 +29,7 @@ This makes it possible to extend a task-template beyond the default supported targets -- :std:ref:`container ` (WIP, sql etc). The motivation of the Custom field, is to marshal a JSON structure that specifies information beyond what a regular TaskTemplate can capture. The actual structure of the JSON is known only to the implemented backend-plugin and the SDK components. The core Flyte platform, does not understand of look into the specifics of this structure. -It is highly recommended to use an interface definition lanugage like Protobuf, OpenAPISpec etc to declare specify the structure of the JSON. From here, on we refer to this as the ``Plugin Specification``. +It is highly recommended to use an interface definition language like Protobuf, OpenAPISpec etc to declare specify the structure of the JSON. From here, on we refer to this as the ``Plugin Specification``. For Spark we decided to use Protobuf to specify the plugin as can be seen `here `__. Note it is not necessary to have the Plugin structure specified in FlyteIDL, we do it for simplicity, ease of maintenance alongwith the core platform and because of existing tooling to generate code for protobuf. @@ -45,7 +45,7 @@ FlytePropeller backend Plugin ------------------------------ -The backend plugin is where the actual logic of the execution is implemented. The backend plugin uses Flyte - PluginMachinery inteface to implement a plugin which can be one of the following supported types +The backend plugin is where the actual logic of the execution is implemented. The backend plugin uses Flyte - PluginMachinery interface to implement a plugin which can be one of the following supported types #. A `Kubernetes operator Plugin `_. 
The following demo shows 2 examples of K8s backend plugins: Flytekit Athena & Spark, and Flyte K8s Pod & Spark: diff --git a/cookbook/core/extend_flyte/custom_task_plugin.py b/cookbook/core/extend_flyte/custom_task_plugin.py index d740c58b61..9f5f95e73b 100644 --- a/cookbook/core/extend_flyte/custom_task_plugin.py +++ b/cookbook/core/extend_flyte/custom_task_plugin.py @@ -6,7 +6,7 @@ Flytekit is designed to be extremely extensible. You can add new task-types that are useful only for your use-cases. Flyte does come with the capability of extending the backend, but that is only required if you want the capability to be -extended to all users of Flyte, or there is a cost/visibility benefit of doing so. +extended to all users of Flyte, or there is a cost/visibility benefit of doing so. The following demo shows how to build Flyte container task extensions, with an SQLAlchemy extension as an example: @@ -59,7 +59,10 @@ class WaitForObjectStoreFile(PythonTask): _VAR_NAME: str = "path" def __init__( - self, name: str, poll_interval: timedelta = timedelta(seconds=10), **kwargs, + self, + name: str, + poll_interval: timedelta = timedelta(seconds=10), + **kwargs, ): super(WaitForObjectStoreFile, self).__init__( task_type="object-store-sensor", diff --git a/cookbook/core/extend_flyte/custom_types.py b/cookbook/core/extend_flyte/custom_types.py index fa0843fc34..fe0883fdc4 100644 --- a/cookbook/core/extend_flyte/custom_types.py +++ b/cookbook/core/extend_flyte/custom_types.py @@ -27,19 +27,10 @@ import typing from typing import Type -from flytekit import ( - Blob, - BlobMetadata, - BlobType, - FlyteContext, - Literal, - LiteralType, - Scalar, - task, - workflow, -) +from flytekit import Blob, BlobMetadata, BlobType, FlyteContext, Literal, LiteralType, Scalar, task, workflow from flytekit.extend import TypeEngine, TypeTransformer + # %% # .. note:: # ``FlyteContext`` is used to access a random local directory. diff --git a/cookbook/core/flyte_basics/basic_workflow.py b/cookbook/core/flyte_basics/basic_workflow.py index d83dc28233..587a4df827 100644 --- a/cookbook/core/flyte_basics/basic_workflow.py +++ b/cookbook/core/flyte_basics/basic_workflow.py @@ -17,9 +17,9 @@ Now, let's get started with a simple workflow. """ import typing +from typing import Tuple from flytekit import task, workflow -from typing import Tuple @task @@ -52,7 +52,7 @@ def my_wf(a: int, b: str) -> Tuple[int, str]: # workflow is executed. # # A workflow can be executed locally where the evaluation will happen immediately, or using the CLI, UI, etc., which will trigger an evaluation. -# Although Flyte workflows decorated with ``@workflow`` look like Python functions, they are actually python-esque, Domain Specific Language (DSL) entities +# Although Flyte workflows decorated with ``@workflow`` look like Python functions, they are actually python-esque, Domain Specific Language (DSL) entities # that recognize the ``@task`` decorators. When a workflow encounters a ``@task``-decorated Python function, it creates a # :py:class:`flytekit.core.promise.Promise` object. This promise doesn't contain the actual output of the task, and is only fulfilled at execution time. 
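To make the promise behaviour concrete, here is a minimal sketch (``add_one`` and ``stringify`` are illustrative names, not part of the patched file) showing that a task output inside a workflow is a promise that can only be passed on to other tasks, not inspected with ordinary Python::

    from flytekit import task, workflow


    @task
    def add_one(a: int) -> int:
        return a + 1


    @task
    def stringify(a: int) -> str:
        return f"value: {a}"


    @workflow
    def promise_wf(a: int) -> str:
        x = add_one(a=a)  # x is a Promise here, not an int
        # `if x > 3:` would not work here; use flytekit's `conditional` for branching
        return stringify(a=x)  # promises flow between tasks as inputs and outputs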
# diff --git a/cookbook/core/flyte_basics/decorating_tasks.py b/cookbook/core/flyte_basics/decorating_tasks.py index fd33a61776..79f919fc09 100644 --- a/cookbook/core/flyte_basics/decorating_tasks.py +++ b/cookbook/core/flyte_basics/decorating_tasks.py @@ -14,7 +14,6 @@ from flytekit import task, workflow - logger = logging.getLogger(__file__) @@ -25,6 +24,7 @@ # # Here we define decorator that logs the input and output information of a decorated task. + def log_io(fn): @wraps(fn) def wrapper(*args, **kwargs): @@ -42,6 +42,7 @@ def wrapper(*args, **kwargs): # .. note:: # The order of invoking the decorators is important. ``@task`` should always be the outer-most decorator. + @task @log_io def t1(x: int) -> int: @@ -61,12 +62,15 @@ def t1(x: int) -> int: # .. note:: # The ``validate_output`` output uses :py:func:`~functools.partial` to implement parameterized decorators. + def validate_output(fn=None, *, floor=0): @wraps(fn) def wrapper(*args, **kwargs): out = fn(*args, **kwargs) if out <= floor: - raise ValueError(f"output of task {fn.__name__} must be a positive number, found {out}") + raise ValueError( + f"output of task {fn.__name__} must be a positive number, found {out}" + ) return out if fn is None: @@ -74,9 +78,11 @@ def wrapper(*args, **kwargs): return wrapper + # %% # Now let's define a function that uses both the logging and validator decorators: + @task @log_io @validate_output(floor=10) @@ -87,6 +93,7 @@ def t2(x: int) -> int: # %% # Finally, we compose a workflow that calls ``t1`` and ``t2``. + @workflow def wf(x: int) -> int: return t2(x=t1(x=x)) diff --git a/cookbook/core/flyte_basics/decorating_workflows.py b/cookbook/core/flyte_basics/decorating_workflows.py index 47b891b83d..88031a8714 100644 --- a/cookbook/core/flyte_basics/decorating_workflows.py +++ b/cookbook/core/flyte_basics/decorating_workflows.py @@ -20,9 +20,10 @@ """ from functools import partial, wraps +from unittest.mock import MagicMock import flytekit -from flytekit import task, workflow, FlyteContextManager +from flytekit import FlyteContextManager, task, workflow from flytekit.core.node_creation import create_node # %% @@ -30,7 +31,6 @@ # :py:class:`unittest.mock.MagicMock` class to create a fake external service that we want to initialize at the # beginning of our workflow and finish at the end. -from unittest.mock import MagicMock external_service = MagicMock() @@ -57,6 +57,7 @@ def teardown(): # # Next we create the decorator that we'll use to wrap our workflow function. + def setup_teardown(fn=None, *, before, after): @wraps(fn) def wrapper(*args, **kwargs): @@ -93,6 +94,7 @@ def wrapper(*args, **kwargs): return wrapper + # %% # There are a few key pieces to note in the ``setup_teardown`` decorator above: # @@ -112,6 +114,7 @@ def wrapper(*args, **kwargs): # # Now let's define two tasks that will constitute the workflow + @task def t1(x: float) -> float: return x - 1 @@ -119,11 +122,13 @@ def t1(x: float) -> float: @task def t2(x: float) -> float: - return x ** 2 + return x**2 + # %% # And then create our decorated workflow: + @workflow @setup_teardown(before=setup, after=teardown) def wf(x: float) -> float: diff --git a/cookbook/core/flyte_basics/documented_workflow.py b/cookbook/core/flyte_basics/documented_workflow.py index 10db71b512..e31d8fb125 100644 --- a/cookbook/core/flyte_basics/documented_workflow.py +++ b/cookbook/core/flyte_basics/documented_workflow.py @@ -17,6 +17,7 @@ from dataclasses_json import dataclass_json from flytekit import task, workflow + # %% # We define a dataclass. 
@dataclass_json @@ -67,7 +68,7 @@ def sphinx_docstring(df: pd.DataFrame, data: PandasData = PandasData()) -> pd.Da # The first block of the docstring is a one-liner about the workflow. # The second block of the docstring consists of a detailed description. # The third block of the docstring describes all the parameters along with their data types. -# The fourth block of the docstring descibes the return type along with its data type. +# The fourth block of the docstring describes the return type along with its data type. @workflow def numpy_docstring(df: pd.DataFrame, data: PandasData = PandasData()) -> pd.DataFrame: """ diff --git a/cookbook/core/flyte_basics/files.py b/cookbook/core/flyte_basics/files.py index 0da1942c97..3079452ffd 100644 --- a/cookbook/core/flyte_basics/files.py +++ b/cookbook/core/flyte_basics/files.py @@ -26,7 +26,6 @@ from flytekit import task, workflow from flytekit.types.file import FlyteFile - # %% # Next, we write a task that accepts a ``FlyteFile``, a list of column names, # and a list of column names to normalize, then outputs a csv file of only @@ -39,6 +38,7 @@ # ``FlyteFile[typing.TypeVar("jpeg")]``). The format is entirely optional, # and if not specified, defaults to ``""``. + @task def normalize_columns( csv_url: FlyteFile, @@ -48,7 +48,7 @@ def normalize_columns( ) -> FlyteFile: # read the data from the raw csv file parsed_data = defaultdict(list) - with open(csv_url, newline='\n') as input_file: + with open(csv_url, newline="\n") as input_file: reader = csv.DictReader(input_file, fieldnames=column_names) for row in (x for i, x in enumerate(reader) if i > 0): for column in columns_to_normalize: @@ -94,6 +94,7 @@ def normalize_columns( # the workflow. This is passed to the ``location`` input of the task. If it's not an empty string, the task attempts to # upload its file to that location. + @workflow def normalize_csv_file( csv_url: FlyteFile, @@ -125,7 +126,9 @@ def normalize_csv_file( ), ] print(f"Running {__file__} main...") - for index, (csv_url, column_names, columns_to_normalize) in enumerate(default_files): + for index, (csv_url, column_names, columns_to_normalize) in enumerate( + default_files + ): normalized_columns = normalize_csv_file( csv_url=csv_url, column_names=column_names, diff --git a/cookbook/core/flyte_basics/folders.py b/cookbook/core/flyte_basics/folders.py index 081c872c13..d4f565f2e4 100644 --- a/cookbook/core/flyte_basics/folders.py +++ b/cookbook/core/flyte_basics/folders.py @@ -30,13 +30,14 @@ def download_files(csv_urls: List[str]) -> FlyteDirectory: working_dir = flytekit.current_context().working_directory local_dir = Path(os.path.join(working_dir, "csv_files")) local_dir.mkdir(exist_ok=True) - + # get the number of digits needed to preserve the order of files in the local directory zfill_len = len(str(len(csv_urls))) for idx, remote_location in enumerate(csv_urls): local_image = os.path.join( # prefix the file name with the index location of the file in the original csv_urls list - local_dir, f"{str(idx).zfill(zfill_len)}_{os.path.basename(remote_location)}" + local_dir, + f"{str(idx).zfill(zfill_len)}_{os.path.basename(remote_location)}", ) urllib.request.urlretrieve(remote_location, local_image) return FlyteDirectory(path=str(local_dir)) @@ -50,6 +51,7 @@ def download_files(csv_urls: List[str]) -> FlyteDirectory: # demonstrates how Flyte tasks are simply entrypoints of execution, which can themselves call # other functions and routines that are written in pure python. 
+ def normalize_columns( local_csv_file: str, column_names: List[str], @@ -57,7 +59,7 @@ def normalize_columns( ): # read the data from the raw csv file parsed_data = defaultdict(list) - with open(local_csv_file, newline='\n') as input_file: + with open(local_csv_file, newline="\n") as input_file: reader = csv.DictReader(input_file, fieldnames=column_names) for row in (x for i, x in enumerate(reader) if i > 0): for column in columns_to_normalize: @@ -82,6 +84,7 @@ def normalize_columns( # Now we define a task that accepts the previously downloaded folder, along with some metadata about the # column names of each file in the directory and the column names that we want to normalize. + @task def normalize_all_files( csv_files_dir: FlyteDirectory, @@ -103,6 +106,7 @@ def normalize_all_files( # of url strings pointing to a remote location containing a csv file, a list of column names # associated with each csv file, and a list of columns that we want to normalize. + @workflow def download_and_normalize_csv_files( csv_urls: List[str], @@ -139,7 +143,4 @@ def download_and_normalize_csv_files( columns_metadata=columns_metadata, columns_to_normalize_metadata=columns_to_normalize_metadata, ) - print( - f"Running download_and_normalize_csv_files on {csv_urls}: " - f"{directory}" - ) + print(f"Running download_and_normalize_csv_files on {csv_urls}: " f"{directory}") diff --git a/cookbook/core/flyte_basics/lp.py b/cookbook/core/flyte_basics/lp.py index f442e1ce29..c21c58379e 100644 --- a/cookbook/core/flyte_basics/lp.py +++ b/cookbook/core/flyte_basics/lp.py @@ -52,7 +52,9 @@ def my_wf(val: int) -> int: # %% # It is possible to **fix** launch plan inputs, so that they can't be overridden at execution call time. -my_fixed_lp = LaunchPlan.get_or_create(name="always_2_lp", workflow=my_wf, fixed_inputs={"val": 4}) +my_fixed_lp = LaunchPlan.get_or_create( + name="always_2_lp", workflow=my_wf, fixed_inputs={"val": 4} +) square_2 = my_fixed_lp() # error: # square_1 = my_fixed_lp(val=1) diff --git a/cookbook/core/flyte_basics/named_outputs.py b/cookbook/core/flyte_basics/named_outputs.py index bdaa2f0205..aa4975a212 100644 --- a/cookbook/core/flyte_basics/named_outputs.py +++ b/cookbook/core/flyte_basics/named_outputs.py @@ -15,7 +15,6 @@ from flytekit import task, workflow - # %% # Named outputs can be declared inline as in the following task signature. # @@ -53,6 +52,7 @@ def say_hello() -> hello_output: # Note that we are de-referencing the individual task execution outputs because named-outputs use NamedTuple # which are tuples that need to be de-referenced. 
+ @workflow def my_wf() -> wf_outputs: return wf_outputs(say_hello().greet, say_hello().greet) diff --git a/cookbook/core/flyte_basics/shell_task.py b/cookbook/core/flyte_basics/shell_task.py index f2027b6918..01e6494c3b 100644 --- a/cookbook/core/flyte_basics/shell_task.py +++ b/cookbook/core/flyte_basics/shell_task.py @@ -16,7 +16,6 @@ from flytekit.types.directory import FlyteDirectory from flytekit.types.file import FlyteFile - t1 = ShellTask( name="task_1", debug=True, @@ -32,9 +31,7 @@ fi """, inputs=kwtypes(x=FlyteFile), - output_locs=[ - OutputLocation(var="i", var_type=FlyteFile, location="{inputs.x}") - ], + output_locs=[OutputLocation(var="i", var_type=FlyteFile, location="{inputs.x}")], ) diff --git a/cookbook/core/flyte_basics/task_cache.py b/cookbook/core/flyte_basics/task_cache.py index cfe66ccf87..6ab4317d86 100644 --- a/cookbook/core/flyte_basics/task_cache.py +++ b/cookbook/core/flyte_basics/task_cache.py @@ -16,11 +16,16 @@ """ +import time + +import pandas + # %% -# +# # For any :py:func:`flytekit.task` in Flyte, there is always one required import, which is: -from flytekit import task - +from flytekit import HashMethod, task, workflow +from flytekit.core.node_creation import create_node +from typing_extensions import Annotated # %% # Task caching is disabled by default to avoid unintended consequences of caching tasks with side effects. To enable caching and control its behavior, use the ``cache`` and ``cache_version`` parameters when constructing a task. @@ -28,6 +33,8 @@ # ``cache_version`` field indicates that the task functionality has changed. # Bumping the ``cache_version`` is akin to invalidating the cache. # Flyte users can manually update this version and Flyte will cache the next execution instead of relying on the old cache. 
+ + @task(cache=True, cache_version="1.0") def square(n: int) -> int: """ @@ -152,30 +159,29 @@ def square(n: int) -> int: # %% # Here's a complete example of the feature: -import pandas -import time -from typing_extensions import Annotated - -from flytekit import HashMethod, workflow -from flytekit.core.node_creation import create_node - def hash_pandas_dataframe(df: pandas.DataFrame) -> str: return str(pandas.util.hash_pandas_object(df)) + @task -def uncached_data_reading_task() -> Annotated[pandas.DataFrame, HashMethod(hash_pandas_dataframe)]: +def uncached_data_reading_task() -> Annotated[ + pandas.DataFrame, HashMethod(hash_pandas_dataframe) +]: return pandas.DataFrame({"column_1": [1, 2, 3]}) + @task(cache=True, cache_version="1.0") def cached_data_processing_task(df: pandas.DataFrame) -> pandas.DataFrame: time.sleep(1) return df * 2 + @task def compare_dataframes(df1: pandas.DataFrame, df2: pandas.DataFrame): assert df1.equals(df2) + @workflow def cached_dataframe_wf(): raw_data = uncached_data_reading_task() @@ -192,5 +198,5 @@ def cached_dataframe_wf(): if __name__ == "__main__": - print(f"Running cached_dataframe_wf once") df1 = cached_dataframe_wf() + print(f"Running cached_dataframe_wf once : {df1}") diff --git a/cookbook/core/flyte_basics/task_cache_serialize.py b/cookbook/core/flyte_basics/task_cache_serialize.py index 09ba37b5f7..8235e0b9f8 100644 --- a/cookbook/core/flyte_basics/task_cache_serialize.py +++ b/cookbook/core/flyte_basics/task_cache_serialize.py @@ -12,7 +12,7 @@ """ # %% -# +# # For any :py:func:`flytekit.task` in Flyte, there is always one required import, which is: from flytekit import task @@ -20,7 +20,7 @@ # %% # Task cache serializing is disabled by default to avoid unexpected behavior for task executions. To enable use the ``cache_serialize`` parameter. # ``cache_serialize`` is a switch to enable or disable serialization of the task -# This operation is only useful for cachable tasks, where one may reuse output from a previous execution. Flyte requires implicitly enabling the ``cache`` parameter on all cache serializable tasks. +# This operation is only useful for cacheable tasks, where one may reuse output from a previous execution. Flyte requires implicitly enabling the ``cache`` parameter on all cache serializable tasks. # Cache key definitions follow the same rules as non-serialized cache tasks. It is important to understand the implications of the task signature and ``cache_version`` parameter in defining cached results. @task(cache=True, cache_serialize=True, cache_version="1.0") def square(n: int) -> int: @@ -35,6 +35,7 @@ def square(n: int) -> int: """ return n * n + # %% # In the above example calling `square(n=2)` multiple times concurrently (even in different executions or workflows) will only execute the multiplication operation once. # Concurrently evaluated tasks will wait for completion of the first instance before reusing the cached results and subsequent evaluations will instantly reuse existing cache results. @@ -45,7 +46,7 @@ def square(n: int) -> int: # # The cache serialize paradigm introduces a new artifact reservation system. Tasks may use this reservation system to acquire an artifact reservation, indicating that they are actively evaluating the task, and release the reservation, once the execution is completed. Flyte uses a clock-skew algorithm to define reservation timeouts. Therefore, tasks are required to periodically extend the reservation during execution. 
# -# The first execution of a serializable cached task will successfully acquire the artifact reservation. Execution will be performed as usual and upon completion, the results are written to the cache and reservation is released. Concurrently executed task instances (i.e. in parallel with the initial execution) will observe an active reservation, in which case the execution will wait until the next reevaluation and perform another check. Once the initial execution completes it will reuse the cached results. Subsequently executed task instances (i.e. after an execution has already completed successfully) will immediately reuse the existing cached results. +# The first execution of a serializable cached task will successfully acquire the artifact reservation. Execution will be performed as usual and upon completion, the results are written to the cache and reservation is released. Concurrently executed task instances (i.e. in parallel with the initial execution) will observe an active reservation, in which case the execution will wait until the next reevaluation and perform another check. Once the initial execution completes it will reuse the cached results. Subsequently executed task instances (i.e. after an execution has already completed successfully) will immediately reuse the existing cached results. # # Flyte handles task execution failures using a timeout on the reservation. If the task currently holding the reservation fails to extend it before it times out, another task may acquire the reservation and begin executing the task. -# +# diff --git a/cookbook/core/type_system/enums.py b/cookbook/core/type_system/enums.py index 4df6daf66b..eb72c524e7 100644 --- a/cookbook/core/type_system/enums.py +++ b/cookbook/core/type_system/enums.py @@ -16,9 +16,10 @@ and the Enum types are not optional. So when defining enums, design them well to always make the first value as a valid default. """ -from flytekit import task, workflow -from typing import Tuple from enum import Enum +from typing import Tuple + +from flytekit import task, workflow # %% diff --git a/cookbook/core/type_system/flyte_pickle.py b/cookbook/core/type_system/flyte_pickle.py index 61bc9c3fad..00a7033706 100644 --- a/cookbook/core/type_system/flyte_pickle.py +++ b/cookbook/core/type_system/flyte_pickle.py @@ -54,4 +54,4 @@ def welcome(name: str) -> People: This workflow can be run locally. During local execution also, the custom object (People) will be marshalled to and from python pickle. """ - welcome(name='Foo') + welcome(name="Foo") diff --git a/cookbook/core/type_system/structured_dataset.py b/cookbook/core/type_system/structured_dataset.py index 2761f56d8f..65db0efa16 100644 --- a/cookbook/core/type_system/structured_dataset.py +++ b/cookbook/core/type_system/structured_dataset.py @@ -23,19 +23,18 @@ import pandas as pd import pyarrow as pa import pyarrow.parquet as pq - -from flytekit import task, workflow, kwtypes, FlyteContext, StructuredDatasetType +from flytekit import FlyteContext, StructuredDatasetType, kwtypes, task, workflow from flytekit.models import literals from flytekit.models.literals import StructuredDatasetMetadata from flytekit.types.schema import FlyteSchema from flytekit.types.structured.structured_dataset import ( + LOCAL, + PARQUET, + S3, StructuredDataset, - StructuredDatasetEncoder, StructuredDatasetDecoder, + StructuredDatasetEncoder, StructuredDatasetTransformerEngine, - PARQUET, - S3, - LOCAL, ) try: @@ -161,6 +160,8 @@ def decode( # %% # Let's define a task to test the above functionality. 
# We open a structured dataset of type ``numpy.ndarray`` and serialize it again. + + @task def to_numpy( ds: Annotated[StructuredDataset, subset_cols] @@ -193,6 +194,6 @@ def schema_compatibility_wf(a: int) -> Annotated[StructuredDataset, subset_cols] # You can run the code locally as follows: if __name__ == "__main__": numpy_array_one = pandas_compatibility_wf(a=42).open(np.ndarray).all() - print(f"pandas DataFrame compatibility check output: ", numpy_array_one) + print(f"pandas DataFrame compatibility check output: {numpy_array_one}") numpy_array_two = schema_compatibility_wf(a=42).open(np.ndarray).all() - print(f"Schema compatibility check output: ", numpy_array_two) + print(f"Schema compatibility check output: {numpy_array_two}") diff --git a/cookbook/core/type_system/typed_schema.py b/cookbook/core/type_system/typed_schema.py index b9653e241f..caed2b8c61 100644 --- a/cookbook/core/type_system/typed_schema.py +++ b/cookbook/core/type_system/typed_schema.py @@ -11,7 +11,7 @@ from flytekit import kwtypes, task, workflow # %% -# Flytekit consists of some pre-built type extenstions, one of them is the FlyteSchema type +# Flytekit consists of some pre-built type extensions, one of them is the FlyteSchema type from flytekit.types.schema import FlyteSchema # %% @@ -52,7 +52,7 @@ def wf() -> FlyteSchema[kwtypes(x=int)]: # %% -# Local execution will convert the data to and from the serialized representation thus, mimicing a complete distributed +# Local execution will convert the data to and from the serialized representation thus, mimicking a complete distributed # execution. if __name__ == "__main__": print(f"Running {__file__} main...") diff --git a/cookbook/deployment/configure_use_gpus.py b/cookbook/deployment/configure_use_gpus.py index a690898065..7a7eb03369 100644 --- a/cookbook/deployment/configure_use_gpus.py +++ b/cookbook/deployment/configure_use_gpus.py @@ -29,4 +29,4 @@ operator: "Equal" value: "value1" effect: "NoSchedule" -""" \ No newline at end of file +""" diff --git a/cookbook/deployment/deploying_workflows.py b/cookbook/deployment/deploying_workflows.py index 6abd569928..bba3d9f96c 100644 --- a/cookbook/deployment/deploying_workflows.py +++ b/cookbook/deployment/deploying_workflows.py @@ -19,7 +19,7 @@ - Use caching to avoid calling the same task with the same inputs (for the same version) - Portability: You can reference pre-registered entities under any domain or project within your workflow code -- Sharable executions: you can easily share links to your executions with your teammates +- Shareable executions: you can easily share links to your executions with your teammates Please refer to the :doc:`Getting Started ` for details on getting started with the Flyte installation. @@ -99,7 +99,7 @@ In-container serialization ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Notice that the commands above are run locally, _not_ inside the container. Strictly speaking, to be rigourous, serialization should be done within the container for the following reasons. +Notice that the commands above are run locally, _not_ inside the container. Strictly speaking, to be rigorous, serialization should be done within the container for the following reasons. 1. It ensures that the versions of all libraries used at execution time on the Flyte platform, are the same that are used during serialization. 2. Since serialization runs part of flytekit, it helps ensure that your container is set up correctly. 
diff --git a/cookbook/deployment/lp_notifications.py b/cookbook/deployment/lp_notifications.py index c486a834fa..d993c91371 100644 --- a/cookbook/deployment/lp_notifications.py +++ b/cookbook/deployment/lp_notifications.py @@ -22,9 +22,11 @@ # the :py:class:`flytekit:flytekit.Email`, :py:class:`flytekit:flytekit.PagerDuty`, or :py:class:`flytekit:flytekit.Slack` # objects can be used in the construction of a :py:class:`flytekit:flytekit.LaunchPlan`. +from datetime import timedelta + # %% # Consider the following example workflow: -from flytekit import Email, LaunchPlan, task, workflow, WorkflowExecutionPhase +from flytekit import Email, FixedRate, LaunchPlan, PagerDuty, Slack, WorkflowExecutionPhase, task, workflow @task @@ -37,9 +39,10 @@ def int_doubler_wf(a: int) -> str: doubled = double_int_and_print(a=a) return doubled + # %% # Here are three scenarios that can help deepen your understanding of how notifications work: -# +# # 1. Launch Plan triggers email notifications when the workflow execution reaches the ``SUCCEEDED`` phase. int_doubler_wf_lp = LaunchPlan.get_or_create( name="int_doubler_wf", @@ -55,9 +58,7 @@ def int_doubler_wf(a: int) -> str: # %% # 2. Notifications shine when used for scheduled workflows to alert for failures. -from datetime import timedelta -from flytekit import FixedRate, PagerDuty int_doubler_wf_scheduled_lp = LaunchPlan.get_or_create( name="int_doubler_wf_scheduled", @@ -75,7 +76,6 @@ def int_doubler_wf(a: int) -> str: # %% # 3. Notifications can be combined with different permutations of terminal phases and recipient targets. -from flytekit import Slack wacky_int_doubler_lp = LaunchPlan.get_or_create( name="wacky_int_doubler", @@ -122,7 +122,7 @@ def int_doubler_wf(a: int) -> str: # ====== # # To publish notifications, you'll need to set up an `SNS topic `_. -# +# # To process notifications, you'll need to set up an `AWS SQS `_ queue to consume notification events. This queue must be configured as a subscription to your SNS topic you created above. # # To publish notifications, you'll need a `verified SES email address `_ which will be used to send notification emails and alerts using email APIs. @@ -134,7 +134,7 @@ def int_doubler_wf(a: int) -> str: # .. code-block:: bash # # notifications: -# type: "aws" +# type: "aws" # noqa: F821 # region: "us-east-1" # publisher: # topicName: "arn:aws:sns:us-east-1:{{ YOUR ACCOUNT ID }}:{{ YOUR TOPIC }}" @@ -162,4 +162,3 @@ def int_doubler_wf(a: int) -> str: # * **body**: Configurable email body used in notifications. # # The complete set of parameters that can be used for email templating are checked in `here `_. 
- diff --git a/cookbook/dev-requirements.in b/cookbook/dev-requirements.in index 21792ba3a7..f6a42a5e3c 100644 --- a/cookbook/dev-requirements.in +++ b/cookbook/dev-requirements.in @@ -11,11 +11,16 @@ -r ./case_studies/ml_training/mnist_classifier/requirements.in -r ./case_studies/ml_training/pima_diabetes/requirements.in -black==19.10b0 +black==22.3.0 coverage flake8 +pre-commit flake8-black flake8-isort +codespell isort mock pytest +mypy +google-cloud-bigquery +google-cloud-bigquery-storage \ No newline at end of file diff --git a/cookbook/dev-requirements.txt b/cookbook/dev-requirements.txt index b600cb7d7f..7fdd1266a5 100644 --- a/cookbook/dev-requirements.txt +++ b/cookbook/dev-requirements.txt @@ -1,746 +1,172 @@ # -# This file is autogenerated by pip-compile with python 3.8 +# This file is autogenerated by pip-compile with python 3.9 # To update, run: # -# /Library/Developer/CommandLineTools/usr/bin/make dev-requirements.txt +# pip-compile dev-requirements.in # -absl-py==0.12.0 - # via - # tensorboard - # tensorflow - # tensorflow-datasets - # tensorflow-metadata -altair==4.1.0 - # via great-expectations -appdirs==1.4.4 - # via black -appnope==0.1.2 - # via - # ipykernel - # ipython -argon2-cffi==21.1.0 - # via notebook -astunparse==1.6.3 - # via tensorflow -attrs==21.2.0 - # via - # black - # hypothesis - # jsonschema - # pytest - # scantree - # tensorflow-datasets -backcall==0.2.0 - # via ipython -backports.zoneinfo==0.2.1 - # via tzlocal -bcrypt==3.2.0 - # via paramiko -black==19.10b0 +attrs==21.4.0 + # via pytest +black==22.3.0 # via # -r dev-requirements.in # flake8-black -bleach==4.1.0 - # via nbconvert -boto3==1.18.56 - # via sagemaker-training -botocore==1.21.56 - # via - # boto3 - # s3transfer -cachetools==4.2.4 +cachetools==5.0.0 # via google-auth -certifi==2021.5.30 - # via - # kubernetes - # requests - # sentry-sdk -cffi==1.14.6 - # via - # argon2-cffi - # bcrypt - # cryptography - # pynacl -charset-normalizer==2.0.6 +certifi==2021.10.8 # via requests -clang==5.0 - # via tensorflow -click==7.1.2 - # via - # black - # flytekit - # great-expectations - # wandb -configparser==5.0.2 - # via wandb -coverage==6.0 +cfgv==3.3.1 + # via pre-commit +charset-normalizer==2.0.12 + # via requests +click==8.1.2 + # via black +codespell==2.1.0 # via -r dev-requirements.in -croniter==1.0.15 - # via flytekit -cryptography==35.0.0 - # via paramiko -cycler==0.10.0 - # via matplotlib -dataclasses-json==0.5.6 - # via - # dolt-integrations - # flytekit -debugpy==1.5.0 - # via ipykernel -decorator==5.1.0 - # via - # ipython - # retry -defusedxml==0.7.1 - # via nbconvert -deprecated==1.2.13 - # via flytekit -dill==0.3.4 - # via tensorflow-datasets -dirhash==0.2.1 - # via flytekit -diskcache==5.2.1 - # via flytekit -docker-image-py==0.1.12 - # via flytekit -docker-pycreds==0.4.0 - # via wandb -docstring-parser==0.11 - # via flytekit -dolt-integrations==0.1.5 - # via flytekitplugins.dolt -doltcli==0.1.15 - # via dolt-integrations -entrypoints==0.3 - # via - # altair - # jupyter-client - # nbconvert -flake8==3.9.2 +coverage==6.3.2 + # via -r dev-requirements.in +distlib==0.3.4 + # via virtualenv +filelock==3.6.0 + # via virtualenv +flake8==4.0.1 # via # -r dev-requirements.in # flake8-black # flake8-isort -flake8-black==0.2.3 +flake8-black==0.3.2 # via -r dev-requirements.in -flake8-isort==4.0.0 +flake8-isort==4.1.1 # via -r dev-requirements.in -flatbuffers==1.12 - # via tensorflow -flyteidl==0.20.2 - # via flytekit -flytekit==0.22.2 - # via - # -r 
./case_studies/ml_training/house_price_prediction/../../../common/requirements-common.in - # -r ./case_studies/ml_training/mnist_classifier/../../../common/requirements-common.in - # -r ./case_studies/ml_training/pima_diabetes/../../../common/requirements-common.in - # -r ./integrations/aws/sagemaker_pytorch/../../../common/requirements-common.in - # -r ./integrations/aws/sagemaker_training/../../../common/requirements-common.in - # -r ./integrations/external_services/hive/../../../common/requirements-common.in - # -r ./integrations/flytekit_plugins/dolt/../../../common/requirements-common.in - # -r ./integrations/flytekit_plugins/pandera_examples/../../../common/requirements-common.in - # -r ./integrations/kubernetes/k8s_spark/../../../common/requirements-common.in - # -r ./integrations/kubernetes/kfpytorch/../../../common/requirements-common.in - # -r ./integrations/kubernetes/pod/../../../common/requirements-common.in - # flytekitplugins-awssagemaker - # flytekitplugins-hive - # flytekitplugins-kfpytorch - # flytekitplugins-pandera - # flytekitplugins-pod - # flytekitplugins-spark - # flytekitplugins.dolt -flytekitplugins-awssagemaker==0.22.3 +google-api-core[grpc]==2.7.3 # via - # -r ./integrations/aws/sagemaker_pytorch/requirements.in - # -r ./integrations/aws/sagemaker_training/requirements.in -flytekitplugins-hive==0.22.3 - # via -r ./integrations/external_services/hive/requirements.in -flytekitplugins-kfpytorch==0.22.3 - # via -r ./integrations/kubernetes/kfpytorch/requirements.in -flytekitplugins-pandera==0.22.3 - # via -r ./integrations/flytekit_plugins/pandera_examples/requirements.in -flytekitplugins-pod==0.22.3 - # via -r ./integrations/kubernetes/pod/requirements.in -flytekitplugins-spark==0.22.3 - # via -r ./integrations/kubernetes/k8s_spark/requirements.in -flytekitplugins.dolt==0.22.3 - # via -r ./integrations/flytekit_plugins/dolt/requirements.in -future==0.18.2 - # via tensorflow-datasets -gast==0.4.0 - # via tensorflow -gevent==21.8.0 - # via sagemaker-training -gitdb==4.0.7 - # via gitpython -gitpython==3.1.24 - # via wandb -google-auth==1.35.0 + # google-cloud-bigquery + # google-cloud-bigquery-storage + # google-cloud-core +google-auth==2.6.6 # via - # google-auth-oauthlib - # kubernetes - # tensorboard -google-auth-oauthlib==0.4.6 - # via tensorboard -google-pasta==0.2.0 - # via tensorflow -googleapis-common-protos==1.53.0 - # via tensorflow-metadata -great-expectations==0.13.36 - # via -r ./integrations/flytekit_plugins/dolt/requirements.in -greenlet==1.1.2 - # via gevent -grpcio==1.41.0 + # google-api-core + # google-cloud-core +google-cloud-bigquery==3.0.1 + # via -r dev-requirements.in +google-cloud-bigquery-storage==2.13.1 # via - # flytekit - # tensorboard - # tensorflow -h5py==3.1.0 - # via tensorflow -hypothesis==6.23.1 - # via -r ./integrations/flytekit_plugins/pandera_examples/requirements.in -idna==3.2 + # -r dev-requirements.in + # google-cloud-bigquery +google-cloud-core==2.3.0 + # via google-cloud-bigquery +google-crc32c==1.3.0 + # via google-resumable-media +google-resumable-media==2.3.2 + # via google-cloud-bigquery +googleapis-common-protos==1.56.0 + # via + # google-api-core + # grpcio-status +grpcio==1.44.0 + # via + # google-api-core + # google-cloud-bigquery + # grpcio-status +grpcio-status==1.44.0 + # via google-api-core +identify==2.4.12 + # via pre-commit +idna==3.3 # via requests -importlib-metadata==4.8.1 - # via - # great-expectations - # keyring -importlib-resources==5.2.2 - # via tensorflow-datasets iniconfig==1.1.1 # via pytest 
-inotify_simple==1.2.1 - # via sagemaker-training -ipykernel==6.4.1 - # via - # ipywidgets - # notebook -ipython==7.28.0 - # via - # ipykernel - # ipywidgets -ipython-genutils==0.2.0 - # via - # ipykernel - # ipywidgets - # nbformat - # notebook -ipywidgets==7.6.5 - # via great-expectations -isort==5.9.3 +isort==5.10.1 # via # -r dev-requirements.in # flake8-isort -jedi==0.18.0 - # via ipython -jinja2==3.0.2 - # via - # altair - # great-expectations - # nbconvert - # notebook -jmespath==0.10.0 - # via - # boto3 - # botocore -joblib==1.0.1 - # via - # -r ./case_studies/ml_training/house_price_prediction/requirements.in - # -r ./case_studies/ml_training/pima_diabetes/requirements.in - # scikit-learn -jsonpatch==1.32 - # via great-expectations -jsonpointer==2.1 - # via jsonpatch -jsonschema==4.0.1 - # via - # altair - # great-expectations - # nbformat -jupyter-client==7.0.6 - # via - # ipykernel - # nbclient - # notebook -jupyter-core==4.8.1 - # via - # jupyter-client - # nbconvert - # nbformat - # notebook -jupyterlab-pygments==0.1.2 - # via nbconvert -jupyterlab-widgets==1.0.2 - # via ipywidgets -keras==2.6.0 - # via tensorflow -keras-preprocessing==1.1.2 - # via tensorflow -keyring==23.2.1 - # via flytekit -kiwisolver==1.3.2 - # via matplotlib -kubernetes==18.20.0 - # via flytekitplugins-pod -markdown==3.3.4 - # via tensorboard -markupsafe==2.0.1 - # via jinja2 -marshmallow==3.13.0 - # via - # dataclasses-json - # marshmallow-enum - # marshmallow-jsonschema -marshmallow-enum==1.5.1 - # via dataclasses-json -marshmallow-jsonschema==0.12.0 - # via flytekit -matplotlib==3.4.3 - # via - # -r ./case_studies/ml_training/house_price_prediction/../../../common/requirements-common.in - # -r ./case_studies/ml_training/house_price_prediction/requirements.in - # -r ./case_studies/ml_training/mnist_classifier/../../../common/requirements-common.in - # -r ./case_studies/ml_training/pima_diabetes/../../../common/requirements-common.in - # -r ./case_studies/ml_training/pima_diabetes/requirements.in - # -r ./integrations/aws/sagemaker_pytorch/../../../common/requirements-common.in - # -r ./integrations/aws/sagemaker_training/../../../common/requirements-common.in - # -r ./integrations/external_services/hive/../../../common/requirements-common.in - # -r ./integrations/flytekit_plugins/dolt/../../../common/requirements-common.in - # -r ./integrations/flytekit_plugins/pandera_examples/../../../common/requirements-common.in - # -r ./integrations/kubernetes/k8s_spark/../../../common/requirements-common.in - # -r ./integrations/kubernetes/kfpytorch/../../../common/requirements-common.in - # -r ./integrations/kubernetes/pod/../../../common/requirements-common.in -matplotlib-inline==0.1.3 - # via - # ipykernel - # ipython mccabe==0.6.1 # via flake8 -mistune==0.8.4 - # via - # great-expectations - # nbconvert mock==4.0.3 # via -r dev-requirements.in +mypy==0.950 + # via -r dev-requirements.in mypy-extensions==0.4.3 - # via typing-inspect -natsort==7.1.1 - # via flytekit -nbclient==0.5.4 - # via nbconvert -nbconvert==6.2.0 - # via notebook -nbformat==5.1.3 - # via - # ipywidgets - # nbclient - # nbconvert - # notebook -nest-asyncio==1.5.1 # via - # jupyter-client - # nbclient -notebook==6.4.4 - # via widgetsnbextension -numpy==1.19.5 - # via - # altair - # great-expectations - # h5py - # keras-preprocessing - # matplotlib - # opt-einsum - # pandas - # pandera - # pyarrow - # sagemaker-training - # scikit-learn - # scipy - # tensorboard - # tensorboardx - # tensorflow - # tensorflow-datasets - # torch - # torchvision 
- # xgboost -oauthlib==3.1.1 - # via requests-oauthlib -opt-einsum==3.3.0 - # via tensorflow -packaging==21.0 - # via - # bleach - # pandera + # black + # mypy +nodeenv==1.6.0 + # via pre-commit +numpy==1.21.6 + # via pyarrow +packaging==21.3 + # via + # google-cloud-bigquery # pytest -pandas==1.3.3 - # via - # altair - # dolt-integrations - # flytekit - # great-expectations - # pandera -pandera==0.7.2 - # via - # -r ./integrations/flytekit_plugins/pandera_examples/requirements.in - # flytekitplugins-pandera -pandocfilters==1.5.0 - # via nbconvert -paramiko==2.7.2 - # via sagemaker-training -parso==0.8.2 - # via jedi pathspec==0.9.0 + # via black +platformdirs==2.5.2 # via # black - # scantree -pathtools==0.1.2 - # via wandb -pexpect==4.8.0 - # via ipython -pickleshare==0.7.5 - # via ipython -pillow==8.3.2 - # via - # matplotlib - # torchvision + # virtualenv pluggy==1.0.0 # via pytest -prometheus-client==0.11.0 - # via notebook -promise==2.3 +pre-commit==2.18.1 + # via -r dev-requirements.in +proto-plus==1.20.3 # via - # tensorflow-datasets - # wandb -prompt-toolkit==3.0.20 - # via ipython -protobuf==3.18.1 + # google-cloud-bigquery + # google-cloud-bigquery-storage +protobuf==3.20.1 # via - # flyteidl - # flytekit + # google-api-core + # google-cloud-bigquery # googleapis-common-protos - # sagemaker-training - # tensorboard - # tensorboardx - # tensorflow - # tensorflow-datasets - # tensorflow-metadata - # wandb -psutil==5.8.0 - # via - # sagemaker-training - # wandb -ptyprocess==0.7.0 - # via - # pexpect - # terminado -py==1.10.0 - # via - # pytest - # retry -py4j==0.10.9 - # via pyspark -pyarrow==3.0.0 - # via - # flytekit - # pandera + # grpcio-status + # proto-plus +py==1.11.0 + # via pytest +pyarrow==6.0.1 + # via google-cloud-bigquery pyasn1==0.4.8 # via # pyasn1-modules # rsa pyasn1-modules==0.2.8 # via google-auth -pycodestyle==2.7.0 +pycodestyle==2.8.0 # via flake8 -pycparser==2.20 - # via cffi -pyflakes==2.3.1 +pyflakes==2.4.0 # via flake8 -pygments==2.10.0 - # via - # ipython - # jupyterlab-pygments - # nbconvert -pynacl==1.4.0 - # via paramiko pyparsing==2.4.7 - # via - # great-expectations - # matplotlib - # packaging -pyrsistent==0.18.0 - # via jsonschema -pyspark==3.1.2 - # via - # -r ./integrations/kubernetes/k8s_spark/requirements.in - # flytekitplugins-spark -pytest==6.2.5 + # via packaging +pytest==7.1.2 # via -r dev-requirements.in -python-dateutil==2.8.1 - # via - # botocore - # croniter - # flytekit - # great-expectations - # jupyter-client - # kubernetes - # matplotlib - # pandas - # wandb -python-json-logger==2.0.2 - # via flytekit -pytimeparse==1.1.8 - # via flytekit -pytz==2018.4 - # via - # flytekit - # great-expectations - # pandas -pyyaml==5.4.1 - # via - # kubernetes - # wandb -pyzmq==22.3.0 - # via - # jupyter-client - # notebook -regex==2021.9.30 - # via - # black - # docker-image-py -requests==2.26.0 - # via - # flytekit - # great-expectations - # kubernetes - # requests-oauthlib - # responses - # tensorboard - # tensorflow-datasets - # wandb -requests-oauthlib==1.3.0 - # via - # google-auth-oauthlib - # kubernetes -responses==0.14.0 - # via flytekit -retry==0.9.2 - # via flytekit -retrying==1.3.3 - # via sagemaker-training -rsa==4.7.2 +python-dateutil==2.8.2 + # via google-cloud-bigquery +pyyaml==6.0 + # via pre-commit +requests==2.27.1 + # via + # google-api-core + # google-cloud-bigquery +rsa==4.8 # via google-auth -ruamel.yaml==0.17.16 - # via great-expectations -ruamel.yaml.clib==0.2.6 - # via ruamel.yaml -s3transfer==0.5.0 - # via boto3 
-sagemaker-training==3.9.2 - # via flytekitplugins-awssagemaker -scantree==0.0.1 - # via dirhash -scikit-learn==1.0 - # via sklearn -scipy==1.7.1 - # via - # great-expectations - # sagemaker-training - # scikit-learn - # xgboost -send2trash==1.8.0 - # via notebook -sentry-sdk==1.4.3 - # via wandb -shortuuid==1.0.1 - # via wandb -six==1.15.0 +six==1.16.0 # via - # absl-py - # astunparse - # bcrypt - # bleach - # cycler - # docker-pycreds - # flytekit # google-auth - # google-pasta # grpcio - # keras-preprocessing - # kubernetes - # promise - # pynacl # python-dateutil - # responses - # retrying - # sagemaker-training - # scantree - # tensorflow - # tensorflow-datasets - # wandb -sklearn==0.0 - # via - # -r ./case_studies/ml_training/house_price_prediction/requirements.in - # -r ./case_studies/ml_training/pima_diabetes/requirements.in -smmap==4.0.0 - # via gitdb -sortedcontainers==2.4.0 - # via - # flytekit - # hypothesis -statsd==3.3.0 - # via flytekit -subprocess32==3.5.4 - # via wandb -tabulate==0.8.9 - # via - # -r ./case_studies/ml_training/house_price_prediction/requirements.in - # -r ./case_studies/ml_training/pima_diabetes/requirements.in -tensorboard==2.6.0 - # via tensorflow -tensorboard-data-server==0.6.1 - # via tensorboard -tensorboard-plugin-wit==1.8.0 - # via tensorboard -tensorboardx==2.4 - # via - # -r ./integrations/aws/sagemaker_pytorch/requirements.in - # -r ./integrations/kubernetes/kfpytorch/requirements.in -tensorflow==2.6.0 - # via -r ./integrations/aws/sagemaker_training/requirements.in -tensorflow-datasets==4.4.0 - # via -r ./integrations/aws/sagemaker_training/requirements.in -tensorflow-estimator==2.6.0 - # via tensorflow -tensorflow-metadata==1.2.0 - # via tensorflow-datasets -termcolor==1.1.0 - # via - # great-expectations - # tensorflow - # tensorflow-datasets - # yaspin -terminado==0.12.1 - # via notebook -testfixtures==6.18.3 + # virtualenv +testfixtures==6.18.5 # via flake8-isort -testpath==0.5.0 - # via nbconvert -threadpoolctl==3.0.0 - # via scikit-learn toml==0.10.2 + # via pre-commit +tomli==2.0.1 # via # black # flake8-black + # mypy # pytest -toolz==0.11.1 - # via altair -torch==1.8.1 - # via - # -r ./case_studies/ml_training/mnist_classifier/requirements.in - # -r ./integrations/aws/sagemaker_pytorch/requirements.in - # -r ./integrations/kubernetes/kfpytorch/requirements.in - # torchvision -torchvision==0.9.1 - # via - # -r ./case_studies/ml_training/mnist_classifier/requirements.in - # -r ./integrations/aws/sagemaker_pytorch/requirements.in - # -r ./integrations/kubernetes/kfpytorch/requirements.in -tornado==6.1 - # via - # ipykernel - # jupyter-client - # notebook - # terminado -tqdm==4.62.3 - # via - # great-expectations - # tensorflow-datasets -traitlets==5.1.0 - # via - # ipykernel - # ipython - # ipywidgets - # jupyter-client - # jupyter-core - # matplotlib-inline - # nbclient - # nbconvert - # nbformat - # notebook -typed-ast==1.4.3 - # via black -typing-extensions==3.7.4.3 - # via - # gitpython - # tensorflow - # torch - # typing-inspect -typing-inspect==0.7.1 - # via - # dataclasses-json - # pandera -tzlocal==3.0 - # via great-expectations -urllib3==1.26.7 - # via - # botocore - # flytekit - # kubernetes - # requests - # responses - # sentry-sdk -wandb==0.12.4 - # via -r ./case_studies/ml_training/mnist_classifier/requirements.in -wcwidth==0.2.5 - # via prompt-toolkit -webencodings==0.5.1 - # via bleach -websocket-client==1.2.1 - # via kubernetes -werkzeug==2.0.2 - # via - # sagemaker-training - # tensorboard -wheel==0.37.0 
+typing-extensions==4.2.0 # via - # -r ./case_studies/ml_training/house_price_prediction/../../../common/requirements-common.in - # -r ./case_studies/ml_training/mnist_classifier/../../../common/requirements-common.in - # -r ./case_studies/ml_training/pima_diabetes/../../../common/requirements-common.in - # -r ./integrations/aws/sagemaker_pytorch/../../../common/requirements-common.in - # -r ./integrations/aws/sagemaker_training/../../../common/requirements-common.in - # -r ./integrations/external_services/hive/../../../common/requirements-common.in - # -r ./integrations/flytekit_plugins/dolt/../../../common/requirements-common.in - # -r ./integrations/flytekit_plugins/pandera_examples/../../../common/requirements-common.in - # -r ./integrations/kubernetes/k8s_spark/../../../common/requirements-common.in - # -r ./integrations/kubernetes/kfpytorch/../../../common/requirements-common.in - # -r ./integrations/kubernetes/pod/../../../common/requirements-common.in - # astunparse - # flytekit - # tensorboard - # tensorflow -widgetsnbextension==3.5.1 - # via ipywidgets -wrapt==1.12.1 - # via - # deprecated - # flytekit - # pandera - # tensorflow -xgboost==1.4.2 - # via - # -r ./case_studies/ml_training/house_price_prediction/requirements.in - # -r ./case_studies/ml_training/pima_diabetes/requirements.in -yaspin==2.1.0 - # via wandb -zipp==3.6.0 - # via - # importlib-metadata - # importlib-resources -zope.event==4.5.0 - # via gevent -zope.interface==5.4.0 - # via gevent - -# The following packages are considered to be unsafe in a requirements file: -# pip -# setuptools + # black + # mypy +urllib3==1.26.9 + # via requests +virtualenv==20.14.1 + # via pre-commit diff --git a/cookbook/docs/README.md b/cookbook/docs/README.md index 17383f4efa..9726c65b83 100644 --- a/cookbook/docs/README.md +++ b/cookbook/docs/README.md @@ -12,7 +12,7 @@ To make this work, it is essential that the examples are written with comments f - The example directory should have a README.rst. - The example itself should have a header comment, which should have a heading as well. - - Docs interspersed in the example should preceed with `# %%` comment and then + - Docs interspersed in the example should proceed with `# %%` comment and then multiline comments should not have blank spaces between them. 
```rst # %% diff --git a/cookbook/docs/conf.py b/cookbook/docs/conf.py index 94fd73401f..75a3866283 100644 --- a/cookbook/docs/conf.py +++ b/cookbook/docs/conf.py @@ -60,8 +60,7 @@ class CustomSorter(FileNameSortKey): # Type System "flyte_python_types.py", "schema.py", - "structured_dataset.py" - "typed_schema.py", + "structured_dataset.py" "typed_schema.py", "custom_objects.py", "enums.py", "lp_schedules.py", @@ -127,9 +126,7 @@ class CustomSorter(FileNameSortKey): ## GCP # TODO ## External Services - "hive.py" - "snowflake.py" - "bigquery.py" + "hive.py" "snowflake.py" "bigquery.py" # Extending Flyte "backend_plugins.py", # NOTE: for some reason this needs to be listed first here to show up last on the TOC "custom_types.py", @@ -347,7 +344,7 @@ def __call__(self, filename): sphinx_gallery_conf = { "examples_dirs": examples_dirs, "gallery_dirs": gallery_dirs, - "ignore_pattern": f"({'|'.join(ignore_py_files)})\.py", + "ignore_pattern": f"({'|'.join(ignore_py_files)})\.py", # noqa: W605 # "subsection_order": ExplicitOrder( # [ # "../core/basic", diff --git a/cookbook/integrations/aws/batch/batch.py b/cookbook/integrations/aws/batch/batch.py index 8d99d6bbc1..f65406b9ed 100644 --- a/cookbook/integrations/aws/batch/batch.py +++ b/cookbook/integrations/aws/batch/batch.py @@ -6,9 +6,8 @@ that you use to run your jobs, allowing you to focus on analyzing results and solving problems. """ -from flytekitplugins.awsbatch import AWSBatchConfig from flytekit import task, workflow - +from flytekitplugins.awsbatch import AWSBatchConfig # %% # Use this to configure SubmitJobInput for a AWS batch job. Task's marked with this will automatically execute diff --git a/cookbook/integrations/aws/sagemaker_pytorch/sagemaker_pytorch_distributed_training.py b/cookbook/integrations/aws/sagemaker_pytorch/sagemaker_pytorch_distributed_training.py index b2fb481eca..5f2c8af960 100644 --- a/cookbook/integrations/aws/sagemaker_pytorch/sagemaker_pytorch_distributed_training.py +++ b/cookbook/integrations/aws/sagemaker_pytorch/sagemaker_pytorch_distributed_training.py @@ -353,7 +353,9 @@ def download_test_data(training_dir): input_content_type=InputContentType.TEXT_CSV, ), training_job_resource_config=TrainingJobResourceConfig( - instance_type="ml.p3.8xlarge", instance_count=2, volume_size_in_gb=25, + instance_type="ml.p3.8xlarge", + instance_count=2, + volume_size_in_gb=25, ), ), cache_version="1.0", diff --git a/cookbook/integrations/aws/sagemaker_training/sagemaker_builtin_algo_training.py b/cookbook/integrations/aws/sagemaker_training/sagemaker_builtin_algo_training.py index 6e218321ea..bd672ee62a 100644 --- a/cookbook/integrations/aws/sagemaker_training/sagemaker_builtin_algo_training.py +++ b/cookbook/integrations/aws/sagemaker_training/sagemaker_builtin_algo_training.py @@ -79,7 +79,9 @@ task_config=SagemakerTrainingJobConfig( algorithm_specification=alg_spec, training_job_resource_config=TrainingJobResourceConfig( - instance_type="ml.m4.xlarge", instance_count=1, volume_size_in_gb=25, + instance_type="ml.m4.xlarge", + instance_count=1, + volume_size_in_gb=25, ), ), metadata=TaskMetadata(cache_version="1.0", cache=True), @@ -106,7 +108,9 @@ # and split it and uploaded to an s3 bucket: def execute_training(): xgboost_train_task( - static_hyperparameters=xgboost_hyperparameters, train="", validation="", + static_hyperparameters=xgboost_hyperparameters, + train="", + validation="", ) diff --git a/cookbook/integrations/aws/sagemaker_training/sagemaker_custom_training.py 
b/cookbook/integrations/aws/sagemaker_training/sagemaker_custom_training.py index 7edf57b706..74114a825f 100644 --- a/cookbook/integrations/aws/sagemaker_training/sagemaker_custom_training.py +++ b/cookbook/integrations/aws/sagemaker_training/sagemaker_custom_training.py @@ -5,13 +5,13 @@ with very few modifications. """ import typing +from typing import Tuple import matplotlib.pyplot as plt import tensorflow as tf import tensorflow_datasets as tfds from flytekit import task, workflow from flytekit.types.directory import TensorboardLogs -from typing import Tuple # %% # Training Algorithm @@ -85,7 +85,9 @@ def normalize_img(image, label): input_content_type=InputContentType.TEXT_CSV, ), training_job_resource_config=TrainingJobResourceConfig( - instance_type="ml.m4.xlarge", instance_count=1, volume_size_in_gb=25, + instance_type="ml.m4.xlarge", + instance_count=1, + volume_size_in_gb=25, ), ), cache_version="1.0", @@ -132,7 +134,10 @@ def custom_training_task(epochs: int, batch_size: int) -> TrainingOutputs: tb_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir) history = model.fit( - ds_train, epochs=epochs, validation_data=ds_test, callbacks=[tb_callback], + ds_train, + epochs=epochs, + validation_data=ds_test, + callbacks=[tb_callback], ) serialized_model = "my_model.h5" @@ -194,9 +199,9 @@ def mnist_trainer( # %% # As long as you have tensorflow setup locally, it will run like a regular python script. if __name__ == "__main__": - model, accurracy, loss, logs = mnist_trainer() + model, accuracy, loss, logs = mnist_trainer() print( - f"Model: {model}, Accuracy PNG: {accurracy}, loss PNG: {loss}, Tensorboard Log Dir: {logs}" + f"Model: {model}, Accuracy PNG: {accuracy}, loss PNG: {loss}, Tensorboard Log Dir: {logs}" ) # %% @@ -220,6 +225,6 @@ def mnist_trainer( # # # If running remotely (executing on Flyte hosted environment), the workflow execution outputs can be retrieved. -# +# # You can retrieve the outputs - which will be a path to a blob store like S3, GCS, minio, etc. Tensorboad can be # pointed to on your local laptop to visualize the results. diff --git a/cookbook/integrations/external_services/snowflake/snowflake.py b/cookbook/integrations/external_services/snowflake/snowflake.py index 23c788032c..8bb0fde84e 100644 --- a/cookbook/integrations/external_services/snowflake/snowflake.py +++ b/cookbook/integrations/external_services/snowflake/snowflake.py @@ -5,9 +5,9 @@ This example shows how to use a Flyte SnowflakeTask to execute a query. """ -from flytekit import kwtypes, task, workflow -from flytekit.types.schema import FlyteSchema +from flytekit import kwtypes, workflow from flytekitplugins.snowflake import SnowflakeConfig, SnowflakeTask + # %% # This is the world's simplest query. Note that in order for registration to work properly, you'll need to give your # Snowflake task a name that's unique across your project/domain for your Flyte installation. @@ -16,8 +16,12 @@ inputs={}, query_template="SELECT 1", output_schema_type=None, - task_config=SnowflakeConfig(account="ha63105.us-central1.gcp", database="SNOWFLAKE_SAMPLE_DATA", - schema="TPCH_SF1000", warehouse="COMPUTE_WH"), + task_config=SnowflakeConfig( + account="ha63105.us-central1.gcp", + database="SNOWFLAKE_SAMPLE_DATA", + schema="TPCH_SF1000", + warehouse="COMPUTE_WH", + ), ) @@ -57,8 +61,12 @@ def no_io_wf(): name="sql.snowflake.w_io", # Define inputs as well as their types that can be used to customize the query. 
inputs=kwtypes(nation_key=int), - task_config=SnowflakeConfig(account="ha63105.us-central1.gcp", database="SNOWFLAKE_SAMPLE_DATA", - schema="TPCH_SF1000", warehouse="COMPUTE_WH"), + task_config=SnowflakeConfig( + account="ha63105.us-central1.gcp", + database="SNOWFLAKE_SAMPLE_DATA", + schema="TPCH_SF1000", + warehouse="COMPUTE_WH", + ), query_template="SELECT * from CUSTOMER where C_NATIONKEY = {{ .inputs.nation_key }} limit 100", ) @@ -67,6 +75,7 @@ def no_io_wf(): def full_snowflake_wf(nation_key: int): return snowflake_task_templatized_query(nation_key=nation_key) + # %% # Check query result on snowflake console: ``https://.snowflakecomputing.com/console#/monitoring/queries/detail`` # diff --git a/cookbook/integrations/flytekit_plugins/dolt/dolt_branch_example.py b/cookbook/integrations/flytekit_plugins/dolt/dolt_branch_example.py index be785c77dd..c75db662fc 100644 --- a/cookbook/integrations/flytekit_plugins/dolt/dolt_branch_example.py +++ b/cookbook/integrations/flytekit_plugins/dolt/dolt_branch_example.py @@ -9,11 +9,10 @@ import sys import typing +import pandas as pd from dolt_integrations.core import NewBranch -from flytekitplugins.dolt.schema import DoltConfig, DoltTable from flytekit import task, workflow -import pandas as pd - +from flytekitplugins.dolt.schema import DoltConfig, DoltTable # %% # A Simple Workflow @@ -34,11 +33,10 @@ doltdb_path = os.path.join(os.path.dirname(__file__), "foo") + def generate_confs(a: int) -> typing.Tuple[DoltConfig, DoltConfig, DoltConfig]: users_conf = DoltConfig( - db_path=doltdb_path, - tablename="users", - branch_conf=NewBranch(f"run/a_is_{a}") + db_path=doltdb_path, tablename="users", branch_conf=NewBranch(f"run/a_is_{a}") ) query_users = DoltTable( @@ -57,6 +55,7 @@ def generate_confs(a: int) -> typing.Tuple[DoltConfig, DoltConfig, DoltConfig]: return users_conf, query_users, big_users_conf + # %% # .. tip :: # A ``DoltTable`` is an extension of ``DoltConfig`` that wraps a ``pandas.DataFrame`` -- accessible via the ``DoltTable.data`` @@ -74,33 +73,43 @@ def generate_confs(a: int) -> typing.Tuple[DoltConfig, DoltConfig, DoltConfig]: # Return types of ``DoltTable`` save the ``data`` to the # Dolt database given a connection configuration. 
+ @task def get_confs(a: int) -> typing.Tuple[DoltConfig, DoltTable, DoltConfig]: return generate_confs(a) + @task def populate_users(a: int, conf: DoltConfig) -> DoltTable: - users = [("George", a), ("Alice", a*2), ("Stephanie", a*3)] + users = [("George", a), ("Alice", a * 2), ("Stephanie", a * 3)] df = pd.DataFrame(users, columns=["name", "count"]) return DoltTable(data=df, config=conf) + @task -def filter_users(a: int, all_users: DoltTable, filtered_users: DoltTable, conf: DoltConfig) -> DoltTable: +def filter_users( + a: int, all_users: DoltTable, filtered_users: DoltTable, conf: DoltConfig +) -> DoltTable: usernames = filtered_users.data[["name"]] return DoltTable(data=usernames, config=conf) + @task def count_users(users: DoltTable) -> int: return users.data.shape[0] + @workflow def wf(a: int) -> int: user_conf, query_conf, big_user_conf = get_confs(a=a) users = populate_users(a=a, conf=user_conf) - big_users = filter_users(a=a, all_users=users, filtered_users=query_conf, conf=big_user_conf) + big_users = filter_users( + a=a, all_users=users, filtered_users=query_conf, conf=big_user_conf + ) big_user_cnt = count_users(users=big_users) return big_user_cnt + if __name__ == "__main__": print(f"Running {__file__} main...") if len(sys.argv) != 2: @@ -109,9 +118,9 @@ def wf(a: int) -> int: result = wf(a=a) print(f"Running wf(), returns int\n{result}\n{type(result)}") -# %% +# %% # We will run this workflow twice: -# +# # .. prompt:: $ # # python branch_example.py 2 @@ -125,4 +134,4 @@ def wf(a: int) -> int: # .. prompt:: $ # # cd foo -# dolt branch \ No newline at end of file +# dolt branch diff --git a/cookbook/integrations/flytekit_plugins/dolt/dolt_quickstart_example.py b/cookbook/integrations/flytekit_plugins/dolt/dolt_quickstart_example.py index 88acc51393..2f0e2ee465 100644 --- a/cookbook/integrations/flytekit_plugins/dolt/dolt_quickstart_example.py +++ b/cookbook/integrations/flytekit_plugins/dolt/dolt_quickstart_example.py @@ -12,9 +12,9 @@ import os import sys -from flytekitplugins.dolt.schema import DoltConfig, DoltTable -from flytekit import task, workflow import pandas as pd +from flytekit import task, workflow +from flytekitplugins.dolt.schema import DoltConfig, DoltTable # %% # Next, we initialize Dolt's config. @@ -26,19 +26,23 @@ ) # %% -# We define a task to create a DataFrame and store the table in Dolt. +# We define a task to create a DataFrame and store the table in Dolt. + + @task def populate_rabbits(a: int) -> DoltTable: rabbits = [("George", a), ("Alice", a * 2), ("Sugar Maple", a * 3)] df = pd.DataFrame(rabbits, columns=["name", "count"]) return DoltTable(data=df, config=rabbits_conf) + # %% # ``unwrap_rabbits`` task does the exact opposite -- reading the table from Dolt and returning a DataFrame. @task def unwrap_rabbits(table: DoltTable) -> pd.DataFrame: return table.data + # %% # Our workflow combines the above two tasks: @workflow @@ -47,6 +51,7 @@ def wf(a: int) -> pd.DataFrame: df = unwrap_rabbits(table=rabbits) return df + if __name__ == "__main__": print(f"Running {__file__} main...") if len(sys.argv) != 2: diff --git a/cookbook/integrations/flytekit_plugins/greatexpectations/type_example.py b/cookbook/integrations/flytekit_plugins/greatexpectations/type_example.py index a13dea0e08..5f2f2331a3 100644 --- a/cookbook/integrations/flytekit_plugins/greatexpectations/type_example.py +++ b/cookbook/integrations/flytekit_plugins/greatexpectations/type_example.py @@ -14,17 +14,12 @@ # %% # First, let's import the required libraries. 
import os -import typing import pandas as pd from flytekit import Resources, task, workflow from flytekit.types.file import CSVFile from flytekit.types.schema import FlyteSchema -from flytekitplugins.great_expectations import ( - BatchRequestConfig, - GreatExpectationsFlyteConfig, - GreatExpectationsType, -) +from flytekitplugins.great_expectations import BatchRequestConfig, GreatExpectationsFlyteConfig, GreatExpectationsType # %% # .. note:: @@ -46,21 +41,23 @@ # The directory that's being used is defined in ``my_assets``. You can find ``my_assets`` in the Great Expectations config file. # # The parameters within the ``data_connector_query`` convey that we're fetching all those files that have "2019" and "01" in the file names. + + @task(limits=Resources(mem="500Mi")) def simple_task( directory: GreatExpectationsType[ str, GreatExpectationsFlyteConfig( - datasource_name="data", - expectation_suite_name="test.demo", - data_connector_name="my_data_connector", + datasource_name="data", # noqa: F821 + expectation_suite_name="test.demo", # noqa: F821 + data_connector_name="my_data_connector", # noqa: F821 batch_request_config=BatchRequestConfig( data_connector_query={ - "batch_filter_parameters": { - "year": "2019", - "month": "01", # noqa: F722 + "batch_filter_parameters": { # noqa: F821 + "year": "2019", # noqa: F821 + "month": "01", # noqa: F821, F722 }, - "limit": 10, + "limit": 10, # noqa: F821 }, ), context_root_dir=CONTEXT_ROOT_DIR, @@ -97,6 +94,8 @@ def simple_wf(directory: str = "my_assets") -> str: # # The first value that's being sent within ``GreatExpectationsType`` is ``CSVFile`` (this is a pre-formatted FlyteFile type). # This means that we want to validate the ``FlyteFile`` data. + + @task(limits=Resources(mem="500Mi")) def file_task( dataset: GreatExpectationsType[CSVFile, great_expectations_config] @@ -122,12 +121,14 @@ def file_wf() -> pd.DataFrame: @task(limits=Resources(mem="500Mi")) def schema_task( dataframe: GreatExpectationsType[ - FlyteSchema, - GreatExpectationsFlyteConfig( - datasource_name="data", - expectation_suite_name="test.demo", - data_connector_name="data_flytetype_data_connector", - batch_request_config=BatchRequestConfig(data_connector_query={"limit": 10}), + FlyteSchema, # noqa: F821 + GreatExpectationsFlyteConfig( # noqa: F821 + datasource_name="data", # noqa: F821 + expectation_suite_name="test.demo", # noqa: F821 + data_connector_name="data_flytetype_data_connector", # noqa: F821 + batch_request_config=BatchRequestConfig( + data_connector_query={"limit": 10} # noqa : F841 + ), # noqa: F821 local_file_path="/tmp/test.parquet", # noqa: F722 context_root_dir=CONTEXT_ROOT_DIR, ), diff --git a/cookbook/integrations/flytekit_plugins/modin_examples/knn_classifier.py b/cookbook/integrations/flytekit_plugins/modin_examples/knn_classifier.py index e4875fe2ae..3aba9de846 100644 --- a/cookbook/integrations/flytekit_plugins/modin_examples/knn_classifier.py +++ b/cookbook/integrations/flytekit_plugins/modin_examples/knn_classifier.py @@ -25,14 +25,14 @@ # Let's import the necessary dependencies. 
from typing import List, NamedTuple -import flytekitplugins.modin +import flytekitplugins.modin # noqa: F401 import modin.pandas import ray from flytekit import task, workflow from sklearn.datasets import load_wine -from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split +from sklearn.neighbors import KNeighborsClassifier ray.shutdown() # close previous instance of ray (if any) ray.init(num_cpus=2) # open a new instance of ray diff --git a/cookbook/integrations/flytekit_plugins/pandera_examples/basic_schema_example.py b/cookbook/integrations/flytekit_plugins/pandera_examples/basic_schema_example.py index 16d5c090ad..29eeb24e35 100644 --- a/cookbook/integrations/flytekit_plugins/pandera_examples/basic_schema_example.py +++ b/cookbook/integrations/flytekit_plugins/pandera_examples/basic_schema_example.py @@ -9,28 +9,31 @@ import typing -import flytekitplugins.pandera +import flytekitplugins.pandera # noqa : F401 import pandas as pd import pandera as pa from flytekit import task, workflow from pandera.typing import DataFrame, Series - # %% # A Simple Data Processing Pipeline # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # Let's first define a simple data processing pipeline in pure python. + def total_pay(df): return df.assign(total_pay=df.hourly_pay * df.hours_worked) + def add_id(df, worker_id): return df.assign(worker_id=worker_id) + def process_data(df, worker_id): return add_id(df=total_pay(df=df), worker_id=worker_id) + # %% # As you can see, the ``process_data`` function is composed of two simpler functions: # One that computes ``total_pay`` and another that simply adds an ``id`` column to @@ -43,6 +46,7 @@ def process_data(df, worker_id): # Next we define the schemas that provide type and statistical annotations # for the raw, intermediate, and final outputs of our pipeline. + class InSchema(pa.SchemaModel): hourly_pay: Series[float] = pa.Field(ge=7) hours_worked: Series[float] = pa.Field(ge=10) @@ -55,6 +59,7 @@ def check_numbers_are_positive(cls, series: Series) -> Series[bool]: class Config: coerce = True + class IntermediateSchema(InSchema): total_pay: Series[float] @@ -63,9 +68,11 @@ def check_total_pay(cls, df: DataFrame) -> Series[bool]: """Defines a dataframe-level custom check.""" return df["total_pay"] == df["hourly_pay"] * df["hours_worked"] + class OutSchema(IntermediateSchema): worker_id: Series[str] = pa.Field() + # %% # Columns are specified as class attributes with a specified data type using the # type-hinting syntax, and you can place additional statistical constraints on the @@ -74,7 +81,7 @@ class OutSchema(IntermediateSchema): # :py:func:`~pandera.model_components.dataframe_check` (dataframe-level checks), which automatically make them # class methods. # -# Pandera uses inheritence to make sure that :py:class:`~pandera.model.SchemaModel` subclasses contain +# Pandera uses inheritance to make sure that :py:class:`~pandera.model.SchemaModel` subclasses contain # all of the same columns and custom check methods as their base class. Inheritance semantics # apply to schema models so you can override column attributes or check methods in subclasses. 
This has # the nice effect of providing an explicit graph of type dependencies as data @@ -89,32 +96,41 @@ class OutSchema(IntermediateSchema): # by decorating our functions with the :py:func:`~flytekit.task` and :py:func:`~flytekit.workflow` decorators and # annotating the inputs and outputs of those functions with the pandera schemas: + @task def dict_to_dataframe(data: dict) -> DataFrame[InSchema]: """Helper task to convert a dictionary input to a dataframe.""" return pd.DataFrame(data) + @task -def total_pay(df: DataFrame[InSchema]) -> DataFrame[IntermediateSchema]: +def total_pay(df: DataFrame[InSchema]) -> DataFrame[IntermediateSchema]: # noqa : F811 return df.assign(total_pay=df.hourly_pay * df.hours_worked) + @task -def add_ids(df: DataFrame[IntermediateSchema], worker_ids: typing.List[str]) -> DataFrame[OutSchema]: +def add_ids( + df: DataFrame[IntermediateSchema], worker_ids: typing.List[str] +) -> DataFrame[OutSchema]: return df.assign(worker_id=worker_ids) + @workflow -def process_data( - data: dict = {"hourly_pay": [12.0, 13.5, 10.1], "hours_worked": [30.5, 40.0, 41.75]}, - worker_ids: typing.List[str] = ["a", "b", "c"] +def process_data( # noqa : F811 + data: dict = { + "hourly_pay": [12.0, 13.5, 10.1], + "hours_worked": [30.5, 40.0, 41.75], + }, + worker_ids: typing.List[str] = ["a", "b", "c"], ) -> DataFrame[OutSchema]: - return add_ids( - df=total_pay(df=dict_to_dataframe(data=data)), worker_ids=worker_ids - ) + return add_ids(df=total_pay(df=dict_to_dataframe(data=data)), worker_ids=worker_ids) + if __name__ == "__main__": print(f"Running {__file__} main...") result = process_data( - data={"hourly_pay": [12.0, 13.5, 10.1], "hours_worked": [30.5, 40.0, 41.75]}, worker_ids=["a", "b", "c"] + data={"hourly_pay": [12.0, 13.5, 10.1], "hours_worked": [30.5, 40.0, 41.75]}, + worker_ids=["a", "b", "c"], ) print(f"Running wf(), returns dataframe\n{result}\n{result.dtypes}") diff --git a/cookbook/integrations/flytekit_plugins/pandera_examples/validating_and_testing_ml_pipelines.py b/cookbook/integrations/flytekit_plugins/pandera_examples/validating_and_testing_ml_pipelines.py index f9b8467f16..00fc81c52e 100644 --- a/cookbook/integrations/flytekit_plugins/pandera_examples/validating_and_testing_ml_pipelines.py +++ b/cookbook/integrations/flytekit_plugins/pandera_examples/validating_and_testing_ml_pipelines.py @@ -38,21 +38,19 @@ import typing +import flytekitplugins.pandera # noqa: F401 import joblib import pandas as pd import pandera as pa from flytekit import task, workflow from flytekit.types.file import JoblibSerializedFile -from pandera.typing import DataFrame, Series, Index +from pandera.typing import DataFrame, Index, Series # noqa: F401 from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score - # %% # We also need to import the ``pandera`` flytekit plugin to enable dataframe runtime type-checking: -import flytekitplugins.pandera - # %% # The Dataset: UCI Heart Disease @@ -96,7 +94,7 @@ # * - ``ca`` # - number of major vessels (0-3) colored by flourosopy # * - ``thal`` -# - 3 = normal; 6 = fixed defect; 7 = reversable defect +# - 3 = normal; 6 = fixed defect; 7 = reversible defect # * - ``target`` # - the predicted attribute # @@ -148,7 +146,7 @@ class RawData(pa.SchemaModel): isin=[ 3, # normal 6, # fixed defect - 7, # reversable defect + 7, # reversible defect ] ) target: Series[int] = pa.Field(ge=0, le=4) @@ -195,7 +193,7 @@ def fetch_raw_data() -> DataFrame[RawData]: # having values ranging from ``0 - 4``` to a binary representation 
where ``0`` represents absence of heart disease and # ``1`` represents presence of heart disease. # -# Here we can use inheritence to define a ``ParsedData`` schema by overriding just the ``target`` attribute: +# Here we can use inheritance to define a ``ParsedData`` schema by overriding just the ``target`` attribute: class ParsedData(RawData): @@ -220,7 +218,6 @@ def parse_raw_data(raw_data: DataFrame[RawData]) -> DataFrame[ParsedData]: # Now it's time to split the data into a training and test set. Here we'll showcase the utility of # :ref:`named outputs ` combined with pandera schemas. -import typing DataSplits = typing.NamedTuple( "DataSplits", training_set=DataFrame[ParsedData], test_set=DataFrame[ParsedData] @@ -330,7 +327,7 @@ class NegativeExamples(ParsedData): @given( PositiveExamples.strategy(size=5), NegativeExamples.strategy(size=5), - st.integers(min_value=0, max_value=2 ** 32), + st.integers(min_value=0, max_value=2**32), ) @hypothesis.settings( deadline=1000, diff --git a/cookbook/integrations/flytekit_plugins/sql/sql_alchemy.py b/cookbook/integrations/flytekit_plugins/sql/sql_alchemy.py index 5c0ba4f9c6..c176bac7ef 100644 --- a/cookbook/integrations/flytekit_plugins/sql/sql_alchemy.py +++ b/cookbook/integrations/flytekit_plugins/sql/sql_alchemy.py @@ -19,12 +19,10 @@ # %% # Let's first import the libraries. -import pandas from flytekit import kwtypes, task, workflow from flytekit.types.schema import FlyteSchema from flytekitplugins.sqlalchemy import SQLAlchemyConfig, SQLAlchemyTask - # %% # First we define a ``SQLALchemyTask``, which returns the first ``n`` records from the ``rna`` table of the # `RNA central database `__ . Since this database is public, we can @@ -39,7 +37,9 @@ # **Never** store passwords for proprietary or sensitive databases! If you need to store and access secrets in a task, # Flyte provides a convenient API. See :ref:`sphx_glr_auto_core_containerization_use_secrets.py` for more details. -DATABASE_URI = "postgresql://reader:NWDMCE5xdipIjRrp@hh-pgsql-public.ebi.ac.uk:5432/pfmegrnargs" +DATABASE_URI = ( + "postgresql://reader:NWDMCE5xdipIjRrp@hh-pgsql-public.ebi.ac.uk:5432/pfmegrnargs" +) # Here we define the schema of the expected output of the query, which we then re-use in the `get_mean_length` task. DataSchema = FlyteSchema[kwtypes(sequence_length=int)] @@ -71,7 +71,9 @@ def get_mean_length(data: DataSchema) -> float: # Finally, we put everything together into a workflow: @workflow def my_wf(min_length: int, max_length: int, limit: int) -> float: - return get_mean_length(data=sql_task(min_length=min_length, max_length=max_length, limit=limit)) + return get_mean_length( + data=sql_task(min_length=min_length, max_length=max_length, limit=limit) + ) if __name__ == "__main__": diff --git a/cookbook/integrations/kubernetes/k8s_spark/README.rst b/cookbook/integrations/kubernetes/k8s_spark/README.rst index 0989f18a02..c183241616 100644 --- a/cookbook/integrations/kubernetes/k8s_spark/README.rst +++ b/cookbook/integrations/kubernetes/k8s_spark/README.rst @@ -74,7 +74,7 @@ Step 2: Environment Setup #. Build Spark image correctly as explained in :ref:`spark-docker-image`. -#. Enable Spark plugin for Flyte refering to the :ref:`spark-examples` section. Additionally, Flyte uses the SparkOperator to run Spark Jobs and separate K8s Service Account/Role per namespace, which are created as part of the standard Flyte deployment. +#. Enable Spark plugin for Flyte referring to the :ref:`spark-examples` section. 
Additionally, Flyte uses the SparkOperator to run Spark Jobs and separate K8s Service Account/Role per namespace, which are created as part of the standard Flyte deployment. #. Ensure you have enough resources on your K8s cluster. Based on the resources required for your Spark job (across drivers/executors), you may have to tweak resource quotas for the namespace. diff --git a/cookbook/integrations/kubernetes/k8s_spark/pyspark_pi.py b/cookbook/integrations/kubernetes/k8s_spark/pyspark_pi.py index ab58fe1c7c..e51a6343bd 100644 --- a/cookbook/integrations/kubernetes/k8s_spark/pyspark_pi.py +++ b/cookbook/integrations/kubernetes/k8s_spark/pyspark_pi.py @@ -113,7 +113,7 @@ def hello_spark(partitions: int) -> float: def f(_): x = random.random() * 2 - 1 y = random.random() * 2 - 1 - return 1 if x ** 2 + y ** 2 <= 1 else 0 + return 1 if x**2 + y**2 <= 1 else 0 # %% diff --git a/cookbook/integrations/kubernetes/kfmpi/mpi_mnist.py b/cookbook/integrations/kubernetes/kfmpi/mpi_mnist.py index 4e1caa1df7..6812022849 100644 --- a/cookbook/integrations/kubernetes/kfmpi/mpi_mnist.py +++ b/cookbook/integrations/kubernetes/kfmpi/mpi_mnist.py @@ -11,13 +11,14 @@ import pathlib import flytekit -import tensorflow as tf import horovod.tensorflow as hvd -from flytekit import task, workflow, Resources -from flytekit.types.directory import FlyteDirectory +import tensorflow as tf +from flytekit import Resources, task, workflow from flytekit.core.base_task import IgnoreOutputs +from flytekit.types.directory import FlyteDirectory from flytekitplugins.kfmpi import MPIJob + # %% # We define a training step that will be called from the training loop. # This step captures the training loss and updates the model weights through gradients. @@ -47,6 +48,7 @@ def training_step(images, labels, first_batch, mnist_model, loss, opt): return loss_value + # %% # We define an MPIJob-enabled task. The configuration given in the MPIJob constructor will be used to set up the distributed training environment. # @@ -66,10 +68,12 @@ def training_step(images, labels, first_batch, mnist_model, loss, opt): retries=3, cache=True, cache_version="0.1", - requests=Resources(cpu='1', mem="600Mi"), - limits=Resources(cpu='2'), + requests=Resources(cpu="1", mem="600Mi"), + limits=Resources(cpu="2"), ) -def horovod_train_task(batch_size: int, buffer_size: int, dataset_size: int) -> FlyteDirectory: +def horovod_train_task( + batch_size: int, buffer_size: int, dataset_size: int +) -> FlyteDirectory: """ :param batch_size: Represents the number of consecutive elements of this dataset to combine in a single batch. :param buffer_size: Defines the size of the buffer used to hold elements of the dataset used for training. 
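(Reader's note on the MPI hunks above: the `MPIJob(...)` constructor arguments themselves sit in unchanged context lines and are not visible in this patch. The sketch below shows what such a configuration typically looks like; the worker/launcher/slot values and the stub task body are illustrative assumptions, not values taken from `mpi_mnist.py`.)

```python
from flytekit import Resources, task
from flytekitplugins.kfmpi import MPIJob


# A minimal sketch of an MPIJob-enabled task. The num_workers /
# num_launcher_replicas / slots values are assumed for illustration only.
@task(
    task_config=MPIJob(
        num_workers=2,            # assumed worker replica count
        num_launcher_replicas=1,  # assumed launcher replica count
        slots=1,                  # assumed slots per worker
    ),
    retries=3,
    requests=Resources(cpu="1", mem="600Mi"),
    limits=Resources(cpu="2"),
)
def sketch_horovod_task(batch_size: int) -> int:
    # The real task (horovod_train_task above) initializes Horovod, trains the
    # Keras model, and returns a FlyteDirectory of checkpoints; this stub only
    # shows how an MPIJob config attaches to a task.
    return batch_size
```

On a cluster with the MPI operator installed, Flyte should materialize the launcher and worker replicas from this config; run locally, the task body executes as a plain Python function.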
@@ -78,25 +82,30 @@ def horovod_train_task(batch_size: int, buffer_size: int, dataset_size: int) -> """ hvd.init() - (mnist_images, mnist_labels), _ = \ - tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank()) + (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( + path="mnist-%d.npz" % hvd.rank() + ) dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), - tf.cast(mnist_labels, tf.int64)) + ( + tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), + tf.cast(mnist_labels, tf.int64), + ) ) dataset = dataset.repeat().shuffle(buffer_size).batch(batch_size) - mnist_model = tf.keras.Sequential([ - tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), - tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation='softmax') - ]) + mnist_model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), + tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) loss = tf.losses.SparseCategoricalCrossentropy() # Horovod: adjust learning rate based on number of GPUs. @@ -112,7 +121,7 @@ def horovod_train_task(batch_size: int, buffer_size: int, dataset_size: int) -> loss_value = training_step(images, labels, batch == 0, mnist_model, loss, opt) if batch % 10 == 0 and hvd.local_rank() == 0: - print('Step #%d\tLoss: %.6f' % (batch, loss_value)) + print("Step #%d\tLoss: %.6f" % (batch, loss_value)) if hvd.rank() != 0: raise IgnoreOutputs("I am not rank 0") @@ -122,22 +131,34 @@ def horovod_train_task(batch_size: int, buffer_size: int, dataset_size: int) -> checkpoint.save(checkpoint_prefix) tf.keras.models.save_model( - mnist_model, str(working_dir), overwrite=True, include_optimizer=True, save_format=None, - signatures=None, options=None, save_traces=True + mnist_model, + str(working_dir), + overwrite=True, + include_optimizer=True, + save_format=None, + signatures=None, + options=None, + save_traces=True, ) return FlyteDirectory(path=str(working_dir)) + # %% # Lastly, we can call the workflow and run the example. @workflow -def horovod_training_wf(batch_size: int = 128, buffer_size: int = 10000, dataset_size: int = 10000) -> FlyteDirectory: +def horovod_training_wf( + batch_size: int = 128, buffer_size: int = 10000, dataset_size: int = 10000 +) -> FlyteDirectory: """ :param batch_size: Represents the number of consecutive elements of this dataset to combine in a single batch. :param buffer_size: Defines the size of the buffer used to hold elements of the dataset used for training. :param dataset_size: The number of elements of this dataset that should be taken to form the new dataset when running batched training. 
""" - return horovod_train_task(batch_size=batch_size, buffer_size=buffer_size, dataset_size=dataset_size) + return horovod_train_task( + batch_size=batch_size, buffer_size=buffer_size, dataset_size=dataset_size + ) + if __name__ == "__main__": model, plot, logs = horovod_training_wf() diff --git a/cookbook/integrations/kubernetes/kfpytorch/pytorch_mnist.py b/cookbook/integrations/kubernetes/kfpytorch/pytorch_mnist.py index 445c37ba81..403ca25c80 100644 --- a/cookbook/integrations/kubernetes/kfpytorch/pytorch_mnist.py +++ b/cookbook/integrations/kubernetes/kfpytorch/pytorch_mnist.py @@ -9,6 +9,7 @@ import os import typing from dataclasses import dataclass +from typing import Tuple import matplotlib.pyplot as plt import torch @@ -22,7 +23,6 @@ from torch import distributed as dist from torch import nn, optim from torchvision import datasets, transforms -from typing import Tuple WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1)) @@ -186,7 +186,7 @@ class Hyperparameters(object): cache=True, cache_version="1.0", requests=Resources(cpu=cpu_request, mem=mem_request, gpu=gpu_request), - limits=Resources(mem=mem_limit, gpu=gpu_limit) + limits=Resources(mem=mem_limit, gpu=gpu_limit), ) def mnist_pytorch_job(hp: Hyperparameters) -> TrainingOutputs: log_dir = "logs" diff --git a/cookbook/integrations/kubernetes/kftensorflow/tf_mnist.py b/cookbook/integrations/kubernetes/kftensorflow/tf_mnist.py index a5a9b8c613..a9b15429ce 100644 --- a/cookbook/integrations/kubernetes/kftensorflow/tf_mnist.py +++ b/cookbook/integrations/kubernetes/kftensorflow/tf_mnist.py @@ -14,16 +14,15 @@ # %% # First, we load the libraries. import os +from dataclasses import dataclass +from typing import NamedTuple, Tuple -from flytekitplugins.kftensorflow import TfJob -from flytekit import task, workflow, Resources - -import tensorflow_datasets as tfds import tensorflow as tf -from typing import NamedTuple, Tuple -from flytekit.types.directory import FlyteDirectory -from dataclasses import dataclass +import tensorflow_datasets as tfds from dataclasses_json import dataclass_json +from flytekit import Resources, task, workflow +from flytekit.types.directory import FlyteDirectory +from flytekitplugins.kftensorflow import TfJob # %% # We define ``MODEL_FILE_PATH`` indicating where to store the model file. @@ -31,6 +30,8 @@ # %% # We initialize a data class to store the hyperparameters. + + @dataclass_json @dataclass class Hyperparameters(object): diff --git a/cookbook/integrations/kubernetes/pod/pod.py b/cookbook/integrations/kubernetes/pod/pod.py index fef6786abc..e7b1a5c289 100644 --- a/cookbook/integrations/kubernetes/pod/pod.py +++ b/cookbook/integrations/kubernetes/pod/pod.py @@ -38,7 +38,7 @@ import time from typing import List -from flytekit import task, workflow, Resources +from flytekit import Resources, TaskMetadata, dynamic, map_task, task, workflow from flytekitplugins.pod import Pod from kubernetes.client.models import ( V1Container, @@ -53,6 +53,8 @@ # %% # We define a simple pod spec with two containers. + + def generate_pod_spec_for_task(): # Primary containers do not require us to specify an image, the default image built for Flyte tasks will get used. @@ -126,7 +128,6 @@ def pod_workflow() -> str: # # To use pod task as part of map task, we send pod task definition to :py:func:`~flytekit:flytekit.map_task`. # This will run pod task across a collection of inputs. 
-from flytekit import map_task, TaskMetadata @task( @@ -151,7 +152,7 @@ def pod_workflow() -> str: limits={"cpu": ".5", "memory": "500Mi"}, ), ) - ] + ], ), primary_container_name="primary", ) @@ -180,7 +181,6 @@ def my_map_workflow(a: List[int]) -> str: # ==================== # # To use pod task a dynamic task, simply pass the pod task config to an annotated dynamic task. -from flytekit import dynamic @task @@ -209,7 +209,7 @@ def my_dynamic_pod_task(val: int) -> str: @workflow -def my_dynamic_pod_task_workflow(val: int=6) -> str: +def my_dynamic_pod_task_workflow(val: int = 6) -> str: s = my_dynamic_pod_task(val=val) return s diff --git a/cookbook/larger_apps/larger_apps_iterate.py b/cookbook/larger_apps/larger_apps_iterate.py index f55d30fa52..a300040a9e 100644 --- a/cookbook/larger_apps/larger_apps_iterate.py +++ b/cookbook/larger_apps/larger_apps_iterate.py @@ -163,7 +163,7 @@ def my_wf(message: str) -> str: Finally, you can execute the updated workflow programmatically with ``flytectl``. To pass arguments to the workflow, update the execution spec file that we previously generated in the -:ref:`Deploying to the Coud ` step. +:ref:`Deploying to the Cloud ` step. Generate an execution spec file. This will prompt you to overwrite and answer 'y' on it.
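To round off the pod-plugin changes above with a usage view: the pattern `pod.py` wires a `Pod` spec into is flytekit's `map_task`, which fans a single task out over a list of inputs. A minimal, self-contained sketch follows; the task body and default inputs are illustrative stand-ins, not taken from this patch.

```python
from typing import List

from flytekit import TaskMetadata, map_task, task, workflow


@task
def double(x: int) -> int:
    # Stand-in for the pod task defined in pod.py; any single-input task can be mapped.
    return x * 2


@workflow
def fan_out(xs: List[int] = [1, 2, 3]) -> List[int]:
    # map_task runs `double` once per element of `xs`; per-instance retries are
    # attached via TaskMetadata.
    return map_task(double, metadata=TaskMetadata(retries=1))(x=xs)
```

Calling `fan_out()` locally iterates in-process; on a Flyte cluster each mapped instance runs in its own pod, which is the behavior the map-task section of `pod.py` relies on when it supplies a `Pod` task config.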