Commit

Resolving pre-commit-hook changes (flyteorg#741)
* Resolving pre-commit-hook changes

* Resolving pre-commit-hook changes

* Resolving pre-commit-hook changes

* updated makefile

Signed-off-by: SmritiSatyanV <[email protected]>

* Updated makefile

Added spellcheck
Signed-off-by: SmritiSatyanV <[email protected]>

* spellcheck

Signed-off-by: SmritiSatyanV <[email protected]>
SmritiSatyanV authored May 4, 2022
1 parent 78c89c3 commit a2363f7
Showing 70 changed files with 790 additions and 1,104 deletions.
24 changes: 24 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,24 @@
repos:
  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
  - repo: https://github.com/PyCQA/isort
    rev: 5.9.3
    hooks:
      - id: isort
        args: ["--profile", "black"]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/shellcheck-py/shellcheck-py
    rev: v0.7.2.1
    hooks:
      - id: shellcheck
14 changes: 13 additions & 1 deletion Makefile
@@ -34,7 +34,19 @@ update_boilerplate:
	@curl https://raw.githubusercontent.com/flyteorg/boilerplate/master/boilerplate/update.sh -o boilerplate/update.sh
	@boilerplate/update.sh


.PHONY: fmt
fmt: ## Format code with black and isort
	pre-commit run black --all-files || true
	pre-commit run isort --all-files || true

.PHONY: lint
lint: ## Run linters
	pre-commit run --all-files

.PHONY: spellcheck
spellcheck: ## Runs a spellchecker over all code and documentation
	codespell -L "te,raison,fo" --skip="./docs/build,./.git"

.PHONY: help
help: ## Show help message
	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[$$()% a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
2 changes: 0 additions & 2 deletions cookbook/case_studies/bioinformatics/blast/blastx_example.py
@@ -11,12 +11,10 @@

import matplotlib.pyplot as plt
import pandas as pd

from flytekit import conditional, kwtypes, task, workflow
from flytekit.extras.tasks.shell import OutputLocation, ShellTask
from flytekit.types.file import FlyteFile, PNGImageFile


# %%
# A ``ShellTask`` is useful to run commands on the shell.
# In this example, we use ``ShellTask`` to generate and run the BLASTX command.
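For readers unfamiliar with the class, here is a minimal, self-contained sketch of how a ``ShellTask`` is typically wired up (the task name, script, and output location are illustrative, not the BLASTX command from this example):

from flytekit import kwtypes
from flytekit.extras.tasks.shell import OutputLocation, ShellTask
from flytekit.types.file import FlyteFile

# The script template is rendered with the declared inputs, and the file
# written to the declared output location becomes the task's output.
greet = ShellTask(
    name="greet",
    script="""echo "hello {inputs.name}" > {outputs.out}""",
    inputs=kwtypes(name=str),
    output_locs=[
        OutputLocation(var="out", var_type=FlyteFile, location="greeting.txt")
    ],
)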
8 changes: 5 additions & 3 deletions cookbook/case_studies/feature_engineering/eda/notebook.py
@@ -2,8 +2,8 @@
Flyte Pipeline in One Jupyter Notebook
=======================================
-In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering, and measures the Gradient
-Boosting model's performace using mean absolute error (MAE), all in one notebook.
+In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering, and measures the Gradient
+Boosting model's performance using mean absolute error (MAE), all in one notebook.
"""

# %%
@@ -15,7 +15,7 @@
from flytekitplugins.papermill import NotebookTask

# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression.ipynb>`__.
#
# .. list-table:: ``NotebookTask`` Parameters
@@ -49,6 +49,8 @@

# %%
# Since a task need not be defined, we create a ``workflow`` and return the MAE score.


@workflow
def notebook_wf(
    n_estimators: int = 150,
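The ``NotebookTask`` pattern these EDA examples build on looks roughly like the sketch below; the notebook path and parameter names are illustrative, not the exact ones from this file:

from flytekit import kwtypes, workflow
from flytekitplugins.papermill import NotebookTask

# NotebookTask executes the notebook with papermill, injecting `inputs` as
# parameters and reading back the cells tagged as outputs.
nb_task = NotebookTask(
    name="supermarket_eda",
    notebook_path="./supermarket_regression.ipynb",
    inputs=kwtypes(n_estimators=int, max_depth=int),
    outputs=kwtypes(mae_score=float),
)


@workflow
def wf(n_estimators: int = 150, max_depth: int = 3) -> float:
    return nb_task(n_estimators=n_estimators, max_depth=max_depth).mae_score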
@@ -2,9 +2,9 @@
EDA and Feature Engineering in Jupyter Notebook and Modeling in a Flyte Task
============================================================================
-In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering
-(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performace using mean absolute error (MAE)
-(step 2: Modeling in a Flyte Task).
+In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering
+(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performance using mean absolute error (MAE)
+(step 2: Modeling in a Flyte Task).
"""

# %%
@@ -37,7 +37,7 @@ class Hyperparameters(object):


# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__.
# This notebook returns ``dummified_data`` and ``dataset`` as the outputs.
#
@@ -55,6 +55,8 @@ class Hyperparameters(object):
# %%
# Next, we define a ``cross_validate`` function and a ``modeling`` task to compute the MAE score of the data against
# the Gradient Boosting Regressor.


def cross_validate(model, nfolds, feats, targets):
    score = -1 * (
        cross_val_score(
@@ -2,9 +2,9 @@
EDA and Feature Engineering in One Jupyter Notebook and Modeling in the Other
=============================================================================
-In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering
-(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performace using mean absolute error
-(MAE) (step 2: Modeling in notebook).
+In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering
+(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performance using mean absolute error
+(MAE) (step 2: Modeling in notebook).
"""

# %%
@@ -17,7 +17,7 @@
from flytekitplugins.papermill import NotebookTask

# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__ (EDA).
# This notebook returns ``dummified_data`` and ``dataset`` as the outputs.
#
@@ -35,8 +35,8 @@
)

# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_2.ipynb>`__
# (Modeling).
#
# This notebook returns ``mae_score`` as the output.
@@ -60,6 +60,8 @@

# %%
# We define a ``Workflow`` to run the notebook tasks.


@workflow
def notebook_wf(
    n_estimators: int = 150,
@@ -71,7 +71,7 @@
"source": [
"#### Retrieve the latest registered version of the pipeline\n",
"\n",
"FlyteRemote provides convienient methods to retrieve version of the pipeline from the remote server.\n",
"FlyteRemote provides convenient methods to retrieve version of the pipeline from the remote server.\n",
"\n",
"**NOTE** It is possible to get a specific version of the workflow and trigger a launch for that, but let's just get the latest."
]
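As a rough sketch of what this notebook cell does (the project, domain, and workflow name below are placeholders, and the exact FlyteRemote API varies across flytekit versions):

from flytekit.configuration import Config
from flytekit.remote import FlyteRemote

# Connect to the Flyte backend; Config.auto() picks up the local config.
remote = FlyteRemote(
    config=Config.auto(),
    default_project="flytesnacks",
    default_domain="development",
)
# Omitting `version` fetches the latest registered version of the workflow.
wf = remote.fetch_workflow(name="my_workflows.example_wf")
execution = remote.execute(wf, inputs={})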
@@ -18,26 +18,26 @@
"""

import logging
+import random
import typing

# %%
# Let's import the libraries.
from datetime import datetime, timedelta
-import random

import boto3
import joblib
import pandas as pd
from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
-from flytekit import task, workflow, TaskMetadata, Resources
+from feast_dataobjects import FeatureStore, FeatureStoreConfig  # noqa : F811
+from flytekit import Resources, TaskMetadata, task, workflow
from flytekit.configuration.internal import AWS
from flytekit.extras.sqlite3.task import SQLite3Config, SQLite3Task
from flytekit.types.file import JoblibSerializedFile
from flytekit.types.schema import FlyteSchema
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

-from feast_dataobjects import FeatureStore, FeatureStoreConfig
from .feature_eng_tasks import mean_median_imputer, univariate_selection

logger = logging.getLogger(__file__)
@@ -27,7 +27,7 @@


# %%
# We define a ``mean_median_imputer`` task to fill in the missing values of the dataset, for which we use the
# `SimpleImputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`__ class from the ``scikit-learn`` library.
@task(cache=True, cache_version="1.0")
def mean_median_imputer(
@@ -53,7 +53,7 @@ def mean_median_imputer(

# %%
# Let's define the other task called ``univariate_selection`` that does feature selection.
# The `SelectKBest <https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest>`__ method removes all
# but the highest scoring features (DataFrame columns).
@task(cache=True, cache_version="1.0")
def univariate_selection(
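Both tasks wrap standard scikit-learn building blocks; here is a self-contained sketch of the two calls on toy data (not the dataset used in this example):

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer

# SimpleImputer fills missing cells with the column mean (or median);
# SelectKBest then keeps only the k highest-scoring feature columns.
df = pd.DataFrame({"a": [1.0, None, 3.0, 4.0], "b": [4.0, 5.0, 6.0, 7.0]})
imputed = pd.DataFrame(
    SimpleImputer(strategy="mean").fit_transform(df), columns=df.columns
)
target = [1.2, 2.3, 2.9, 4.1]
selected = SelectKBest(score_func=f_regression, k=1).fit_transform(imputed, target)
print(selected.shape)  # (4, 1)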
@@ -4,19 +4,20 @@
Predicting House Price in a Region Using XGBoost
------------------------------------------------
`XGBoost <https://xgboost.readthedocs.io/en/latest/>`__ is an optimized distributed gradient boosting library designed to be efficient, flexible, and portable.
It uses `gradient boosting <https://en.wikipedia.org/wiki/Gradient_boosting>`__ technique to implement Machine Learning algorithms.
In this tutorial, we will understand how to predict house prices using XGBoost, and Flyte.
We will split the generated dataset into train, test and validation set.
Next, we will create three Flyte tasks, that will:
1. Generate house details, and split the dataset.
2. Train the model using XGBoost.
3. Generate predictions.
Let's get started with the example!
"""
@@ -30,20 +31,21 @@
# pip install joblib
# pip install xgboost

+import os

# %%
# First, let's import the required packages into the environment.
import typing
+from typing import Tuple

-import os
import flytekit
import joblib
import numpy as np
import pandas as pd
-from sklearn.model_selection import train_test_split
-from xgboost import XGBRegressor
from flytekit import Resources, task, workflow
from flytekit.types.file import JoblibSerializedFile
-from typing import Tuple
+from sklearn.model_selection import train_test_split
+from xgboost import XGBRegressor

# %%
# We initialize a variable to represent columns in the dataset. The other variables help generate the dataset.
@@ -66,6 +68,8 @@
# =====================
#
# We define a function to compute the price of a house based on multiple factors (``number of bedrooms``, ``number of bathrooms``, ``area``, ``garage space``, and ``year built``).


def gen_price(house) -> int:
    _base_price = int(house["SQUARE_FEET"] * 150)
    _price = int(
@@ -93,7 +97,7 @@ def gen_houses(num_houses) -> pd.DataFrame:
"YEAR_BUILT": min(MAX_YEAR, int(np.random.normal(1995, 10))),
}
_price = gen_price(_house)
# column names/features
# column names/features
_house_list.append(
[
_price,
@@ -105,13 +109,14 @@ def gen_houses(num_houses) -> pd.DataFrame:
_house["GARAGE_SPACES"],
]
)
# convert the list to a DataFrame
# convert the list to a DataFrame
_df = pd.DataFrame(
_house_list,
columns=COLUMNS,
)
return _df


# %%
# Data Preprocessing and Splitting
# ===================================
@@ -122,24 +127,24 @@ def split_data(
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:

    seed = seed
    val_size = split[1]  # 0.3
    test_size = split[2]  # 0.1

    num_samples = df.shape[0]
    # retain the features, skip the target column
    x1 = df.values[:num_samples, 1:]
    # retain the target column
    y1 = df.values[:num_samples, :1]

    # divide the features and target column into random train and test subsets, based on `test_size`
    x_train, x_test, y_train, y_test = train_test_split(
        x1, y1, test_size=test_size, random_state=seed
    )
    # divide the train data into train and validation subsets, based on `test_size`
    x_train, x_val, y_train, y_val = train_test_split(
        x_train,
        y_train,
        test_size=(val_size / (1 - test_size)),  # here, `test_size` computes to 0.3
        random_state=seed,
    )

@@ -164,6 +169,7 @@ def split_data(
        ),
    )


# %%
# Next, we create a ``NamedTuple`` to map a variable name to its respective data type.
dataset = typing.NamedTuple(
@@ -175,11 +181,14 @@ def split_data(

# %%
# We define a task to call the aforementioned functions.


@task(cache=True, cache_version="0.1", limits=Resources(mem="600Mi"))
def generate_and_split_data(number_of_houses: int, seed: int) -> dataset:
_houses = gen_houses(number_of_houses)
return split_data(_houses, seed, split=SPLIT_RATIOS)


# %%
# Training
# ==========
Expand Down Expand Up @@ -261,4 +270,4 @@ def house_price_predictor_trainer(
#
# We can run the workflow locally provided the required libraries are installed. The output would be a list of house prices, generated using the XGBoost model.
if __name__ == "__main__":
print(house_price_predictor_trainer())
    print(house_price_predictor_trainer())
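The training and prediction tasks are collapsed in this diff view; in rough outline (the hyperparameters and file name below are illustrative, not the values used in the example), the training step amounts to:

import joblib
import numpy as np
from xgboost import XGBRegressor

# Fit a regressor on the train split and persist it with joblib so a
# downstream prediction task can load it back.
x_train = np.random.rand(100, 6)
y_train = np.random.rand(100) * 500_000
model = XGBRegressor(n_estimators=100, max_depth=5)
model.fit(x_train, y_train)
joblib.dump(model, "house_price_model.joblib")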