Commit

Resolving pre-commit-hook changes (flyteorg#741)
* Resolving pre-commit-hook changes

* Resolving pre-commit-hook changes

* Resolving pre-commit-hook changes

* updated makefile

Signed-off-by: SmritiSatyanV <[email protected]>

* Updated makefile

Added spellcheck
Signed-off-by: SmritiSatyanV <[email protected]>

* spellcheck

Signed-off-by: SmritiSatyanV <[email protected]>
SmritiSatyanV authored May 4, 2022
1 parent 78c89c3 commit a2363f7
Showing 70 changed files with 790 additions and 1,104 deletions.
24 changes: 24 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,24 @@
repos:
  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
  - repo: https://github.com/PyCQA/isort
    rev: 5.9.3
    hooks:
      - id: isort
        args: ["--profile", "black"]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/shellcheck-py/shellcheck-py
    rev: v0.7.2.1
    hooks:
      - id: shellcheck
14 changes: 13 additions & 1 deletion Makefile
@@ -34,7 +34,19 @@ update_boilerplate:
	@curl https://raw.githubusercontent.com/flyteorg/boilerplate/master/boilerplate/update.sh -o boilerplate/update.sh
	@boilerplate/update.sh


.PHONY: fmt
fmt: ## Format code with black and isort
	pre-commit run black --all-files || true
	pre-commit run isort --all-files || true

.PHONY: lint
lint: ## Run linters
	pre-commit run --all-files

.PHONY: spellcheck
spellcheck: ## Runs a spellchecker over all code and documentation
	codespell -L "te,raison,fo" --skip="./docs/build,./.git"

.PHONY: help
help: ## Show help message
	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[$$()% a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-15s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
2 changes: 0 additions & 2 deletions cookbook/case_studies/bioinformatics/blast/blastx_example.py
@@ -11,12 +11,10 @@

import matplotlib.pyplot as plt
import pandas as pd

from flytekit import conditional, kwtypes, task, workflow
from flytekit.extras.tasks.shell import OutputLocation, ShellTask
from flytekit.types.file import FlyteFile, PNGImageFile


# %%
# A ``ShellTask`` is useful to run commands on the shell.
# In this example, we use ``ShellTask`` to generate and run the BLASTX command.
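For readers unfamiliar with the class, here is a minimal, self-contained sketch of how a ``ShellTask`` is typically wired up (the task name, script, and output location are illustrative, not the BLASTX command from this example):

from flytekit import kwtypes
from flytekit.extras.tasks.shell import OutputLocation, ShellTask
from flytekit.types.file import FlyteFile

# The script template is rendered with the declared inputs, and the file
# written to the declared output location becomes the task's output.
greet = ShellTask(
    name="greet",
    script="""echo "hello {inputs.name}" > {outputs.out}""",
    inputs=kwtypes(name=str),
    output_locs=[
        OutputLocation(var="out", var_type=FlyteFile, location="greeting.txt")
    ],
)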
8 changes: 5 additions & 3 deletions cookbook/case_studies/feature_engineering/eda/notebook.py
@@ -2,8 +2,8 @@
Flyte Pipeline in One Jupyter Notebook
=======================================
-In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering, and measures the Gradient
-Boosting model's performace using mean absolute error (MAE), all in one notebook.
+In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering, and measures the Gradient
+Boosting model's performance using mean absolute error (MAE), all in one notebook.
"""

# %%
@@ -15,7 +15,7 @@
from flytekitplugins.papermill import NotebookTask

# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression.ipynb>`__.
#
# .. list-table:: ``NotebookTask`` Parameters
@@ -49,6 +49,8 @@

# %%
# Since a task need not be defined, we create a ``workflow`` and return the MAE score.


@workflow
def notebook_wf(
    n_estimators: int = 150,
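The ``NotebookTask`` pattern these EDA examples build on looks roughly like the sketch below; the notebook path and parameter names are illustrative, not the exact ones from this file:

from flytekit import kwtypes, workflow
from flytekitplugins.papermill import NotebookTask

# NotebookTask executes the notebook with papermill, injecting `inputs` as
# parameters and reading back the cells tagged as outputs.
nb_task = NotebookTask(
    name="supermarket_eda",
    notebook_path="./supermarket_regression.ipynb",
    inputs=kwtypes(n_estimators=int, max_depth=int),
    outputs=kwtypes(mae_score=float),
)


@workflow
def wf(n_estimators: int = 150, max_depth: int = 3) -> float:
    return nb_task(n_estimators=n_estimators, max_depth=max_depth).mae_score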
@@ -2,9 +2,9 @@
EDA and Feature Engineering in Jupyter Notebook and Modeling in a Flyte Task
============================================================================
-In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering
-(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performace using mean absolute error (MAE)
-(step 2: Modeling in a Flyte Task).
+In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering
+(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performance using mean absolute error (MAE)
+(step 2: Modeling in a Flyte Task).
"""

# %%
@@ -37,7 +37,7 @@ class Hyperparameters(object):


# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__.
# This notebook returns ``dummified_data`` and ``dataset`` as the outputs.
#
@@ -55,6 +55,8 @@ class Hyperparameters(object):
# %%
# Next, we define a ``cross_validate`` function and a ``modeling`` task to compute the MAE score of the data against
# the Gradient Boosting Regressor.


def cross_validate(model, nfolds, feats, targets):
    score = -1 * (
        cross_val_score(
@@ -2,9 +2,9 @@
EDA and Feature Engineering in One Jupyter Notebook and Modeling in the Other
=============================================================================
-In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering
-(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performace using mean absolute error
-(MAE) (step 2: Modeling in notebook).
+In this example, we will implement a simple pipeline that takes hyperparameters, does EDA, feature engineering
+(step 1: EDA and feature engineering in notebook), and measures the Gradient Boosting model's performance using mean absolute error
+(MAE) (step 2: Modeling in notebook).
"""

# %%
@@ -17,7 +17,7 @@
from flytekitplugins.papermill import NotebookTask

# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_1.ipynb>`__ (EDA).
# This notebook returns ``dummified_data`` and ``dataset`` as the outputs.
#
@@ -35,8 +35,8 @@
)

# %%
# We define a ``NotebookTask`` to run the `Jupyter notebook
# <https://github.com/flyteorg/flytesnacks/blob/master/cookbook/case_studies/feature_engineering/eda/supermarket_regression_2.ipynb>`__
# (Modeling).
#
# This notebook returns ``mae_score`` as the output.
@@ -60,6 +60,8 @@

# %%
# We define a ``Workflow`` to run the notebook tasks.


@workflow
def notebook_wf(
    n_estimators: int = 150,
@@ -71,7 +71,7 @@
"source": [
"#### Retrieve the latest registered version of the pipeline\n",
"\n",
"FlyteRemote provides convienient methods to retrieve version of the pipeline from the remote server.\n",
"FlyteRemote provides convenient methods to retrieve version of the pipeline from the remote server.\n",
"\n",
"**NOTE** It is possible to get a specific version of the workflow and trigger a launch for that, but let's just get the latest."
]
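As a rough sketch of what this notebook cell does (the project, domain, and workflow name below are placeholders, and the exact FlyteRemote API varies across flytekit versions):

from flytekit.configuration import Config
from flytekit.remote import FlyteRemote

# Connect to the Flyte backend; Config.auto() picks up the local config.
remote = FlyteRemote(
    config=Config.auto(),
    default_project="flytesnacks",
    default_domain="development",
)
# Omitting `version` fetches the latest registered version of the workflow.
wf = remote.fetch_workflow(name="my_workflows.example_wf")
execution = remote.execute(wf, inputs={})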
@@ -18,26 +18,26 @@
"""

import logging
+import random
import typing

# %%
# Let's import the libraries.
from datetime import datetime, timedelta
-import random

import boto3
import joblib
import pandas as pd
from feast import Entity, Feature, FeatureStore, FeatureView, FileSource, ValueType
-from flytekit import task, workflow, TaskMetadata, Resources
+from feast_dataobjects import FeatureStore, FeatureStoreConfig  # noqa : F811
+from flytekit import Resources, TaskMetadata, task, workflow
from flytekit.configuration.internal import AWS
from flytekit.extras.sqlite3.task import SQLite3Config, SQLite3Task
from flytekit.types.file import JoblibSerializedFile
from flytekit.types.schema import FlyteSchema
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

-from feast_dataobjects import FeatureStore, FeatureStoreConfig
from .feature_eng_tasks import mean_median_imputer, univariate_selection

logger = logging.getLogger(__file__)
@@ -27,7 +27,7 @@


# %%
# We define a ``mean_median_imputer`` task to fill in the missing values of the dataset, for which we use the
# `SimpleImputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`__ class from the ``scikit-learn`` library.
@task(cache=True, cache_version="1.0")
def mean_median_imputer(
@@ -53,7 +53,7 @@ def mean_median_imputer(

# %%
# Let's define the other task called ``univariate_selection`` that does feature selection.
# The `SelectKBest <https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html#sklearn.feature_selection.SelectKBest>`__ method removes all
# but the highest scoring features (DataFrame columns).
@task(cache=True, cache_version="1.0")
def univariate_selection(
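Both tasks wrap standard scikit-learn building blocks; here is a self-contained sketch of the two calls on toy data (not the dataset used in this example):

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer

# SimpleImputer fills missing cells with the column mean (or median);
# SelectKBest then keeps only the k highest-scoring feature columns.
df = pd.DataFrame({"a": [1.0, None, 3.0, 4.0], "b": [4.0, 5.0, 6.0, 7.0]})
imputed = pd.DataFrame(
    SimpleImputer(strategy="mean").fit_transform(df), columns=df.columns
)
target = [1.2, 2.3, 2.9, 4.1]
selected = SelectKBest(score_func=f_regression, k=1).fit_transform(imputed, target)
print(selected.shape)  # (4, 1)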
@@ -4,19 +4,20 @@
Predicting House Price in a Region Using XGBoost
------------------------------------------------
`XGBoost <https://xgboost.readthedocs.io/en/latest/>`__ is an optimized distributed gradient boosting library designed to be efficient, flexible, and portable.
It uses `gradient boosting <https://en.wikipedia.org/wiki/Gradient_boosting>`__ technique to implement Machine Learning algorithms.
In this tutorial, we will understand how to predict house prices using XGBoost, and Flyte.
We will split the generated dataset into train, test and validation set.
Next, we will create three Flyte tasks, that will:
1. Generate house details, and split the dataset.
2. Train the model using XGBoost.
3. Generate predictions.
Let's get started with the example!
"""
@@ -30,20 +31,21 @@
# pip install joblib
# pip install xgboost

+import os

# %%
# First, let's import the required packages into the environment.
import typing
+from typing import Tuple

-import os
import flytekit
import joblib
import numpy as np
import pandas as pd
-from sklearn.model_selection import train_test_split
-from xgboost import XGBRegressor
from flytekit import Resources, task, workflow
from flytekit.types.file import JoblibSerializedFile
-from typing import Tuple
+from sklearn.model_selection import train_test_split
+from xgboost import XGBRegressor

# %%
# We initialize a variable to represent columns in the dataset. The other variables help generate the dataset.
@@ -66,6 +68,8 @@
# =====================
#
# We define a function to compute the price of a house based on multiple factors (``number of bedrooms``, ``number of bathrooms``, ``area``, ``garage space``, and ``year built``).


def gen_price(house) -> int:
    _base_price = int(house["SQUARE_FEET"] * 150)
    _price = int(
@@ -93,7 +97,7 @@ def gen_houses(num_houses) -> pd.DataFrame:
"YEAR_BUILT": min(MAX_YEAR, int(np.random.normal(1995, 10))),
}
_price = gen_price(_house)
# column names/features
# column names/features
_house_list.append(
[
_price,
@@ -105,13 +109,14 @@ def gen_houses(num_houses) -> pd.DataFrame:
_house["GARAGE_SPACES"],
]
)
# convert the list to a DataFrame
# convert the list to a DataFrame
_df = pd.DataFrame(
_house_list,
columns=COLUMNS,
)
return _df


# %%
# Data Preprocessing and Splitting
# ===================================
@@ -122,24 +127,24 @@ def split_data(
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:

    seed = seed
    val_size = split[1]  # 0.3
    test_size = split[2]  # 0.1

    num_samples = df.shape[0]
    # retain the features, skip the target column
    x1 = df.values[:num_samples, 1:]
    # retain the target column
    y1 = df.values[:num_samples, :1]

    # divide the features and target column into random train and test subsets, based on `test_size`
    x_train, x_test, y_train, y_test = train_test_split(
        x1, y1, test_size=test_size, random_state=seed
    )
    # divide the train data into train and validation subsets, based on `test_size`
    x_train, x_val, y_train, y_val = train_test_split(
        x_train,
        y_train,
        test_size=(val_size / (1 - test_size)),  # here, `test_size` computes to 0.3
        random_state=seed,
    )

@@ -164,6 +169,7 @@ def split_data(
        ),
    )


# %%
# Next, we create a ``NamedTuple`` to map a variable name to its respective data type.
dataset = typing.NamedTuple(
@@ -175,11 +181,14 @@ def split_data(

# %%
# We define a task to call the aforementioned functions.


@task(cache=True, cache_version="0.1", limits=Resources(mem="600Mi"))
def generate_and_split_data(number_of_houses: int, seed: int) -> dataset:
_houses = gen_houses(number_of_houses)
return split_data(_houses, seed, split=SPLIT_RATIOS)


# %%
# Training
# ==========
Expand Down Expand Up @@ -261,4 +270,4 @@ def house_price_predictor_trainer(
#
# We can run the workflow locally provided the required libraries are installed. The output would be a list of house prices, generated using the XGBoost model.
if __name__ == "__main__":
print(house_price_predictor_trainer())
    print(house_price_predictor_trainer())
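The training and prediction tasks are collapsed in this diff view; in rough outline (the hyperparameters and file name below are illustrative, not the values used in the example), the training step amounts to:

import joblib
import numpy as np
from xgboost import XGBRegressor

# Fit a regressor on the train split and persist it with joblib so a
# downstream prediction task can load it back.
x_train = np.random.rand(100, 6)
y_train = np.random.rand(100) * 500_000
model = XGBRegressor(n_estimators=100, max_depth=5)
model.fit(x_train, y_train)
joblib.dump(model, "house_price_model.joblib")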