From 660cc0260f9fe11641650460c657fe40a29f7442 Mon Sep 17 00:00:00 2001 From: Beverly Klemme <35578090+bjklemme-intel@users.noreply.github.com> Date: Tue, 23 May 2023 17:27:19 -0700 Subject: [PATCH] Add new tutorial example to OpenFL interactive API (#812) * Add new tutorial example to OpenFL interactive API This adds a new tutorial example on distributing a linear regression task over OpenFL cluster The model is defined by scikit-learn which is able to run over both cpu (by default) and gpu. The dataset is 1-dimensional noisy data of sinusoid with pre-defined parameters. Fixes #798 Co-authored-by: Beverly Klemme Co-authored-by: Grant Baker Signed-off-by: Yi CAO * reduced requirements.txt in workspace Signed-off-by: Beverly Klemme --------- Signed-off-by: Yi CAO Signed-off-by: Beverly Klemme Co-authored-by: Yi CAO --- .../scikit_learn_linear_regression/README.md | 55 +++ .../director/director_config.yaml | 6 + .../director/start_director.sh | 4 + .../envoy/envoy_config.yaml | 12 + .../envoy/linreg_shard_descriptor.py | 60 +++ .../envoy/requirements.txt | 7 + .../envoy/start_envoy.sh | 6 + .../workspace/custom_adapter.py | 21 + .../workspace/requirements.txt | 7 + .../scikit_learn_linear_regression.ipynb | 420 ++++++++++++++++++ 10 files changed, 598 insertions(+) create mode 100644 openfl-tutorials/interactive_api/scikit_learn_linear_regression/README.md create mode 100644 openfl-tutorials/interactive_api/scikit_learn_linear_regression/director/director_config.yaml create mode 100755 openfl-tutorials/interactive_api/scikit_learn_linear_regression/director/start_director.sh create mode 100644 openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/envoy_config.yaml create mode 100644 openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/linreg_shard_descriptor.py create mode 100644 openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/requirements.txt create mode 100755 openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/start_envoy.sh create mode 100644 openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/custom_adapter.py create mode 100644 openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/requirements.txt create mode 100644 openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/scikit_learn_linear_regression.ipynb diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/README.md b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/README.md new file mode 100644 index 0000000000..2485306208 --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/README.md @@ -0,0 +1,55 @@ +# Scikit-learn based Linear Regression Tutorial + +### 1. About dataset + +Generate 1-dimensional noisy data for linear regression of sinusoid. + +Define the below pamameter in shard_config in the envoy_config.yaml file as the random seed for the dataset generation for a specific Envoy +- rank + +### 2. About model + +Linear Regression Lasso Model based on Scikit-learn. + + +### 3. How to run this tutorial (without TLC and locally as a simulation): + +1. Run director: + +```sh +cd director folder +./start_director.sh +``` + +2. Run envoy: + +Step 1: Activate virtual environment and install packages +``` +cd envoy folder +pip install -r requirements.txt +``` +Step 2: start the envoy +```sh +./start_envoy.sh env_instance_1 envoy_config.yaml +``` + +Optional: start second envoy: + +- Copy `envoy_folder` to another place and follow the same process as above: + +```sh +./start_envoy.sh env_instance_2 envoy_config_2.yaml +``` + +3. Run `scikit_learn_linear_regression.ipynb` jupyter notebook: + +```sh +cd workspace +jupyter lab scikit_learn_linear_regression.ipynb +``` + +4. Visualization + +``` +tensorboard --logdir logs/ +``` diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/director/director_config.yaml b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/director/director_config.yaml new file mode 100644 index 0000000000..d22b4b7766 --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/director/director_config.yaml @@ -0,0 +1,6 @@ +settings: + listen_host: localhost + listen_port: 50050 + sample_shape: ['1'] # Modify this param if experimenting with `n_features` of shard_descriptor. + target_shape: ['1'] + envoy_health_check_period: 5 # in seconds diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/director/start_director.sh b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/director/start_director.sh new file mode 100755 index 0000000000..5806a6cc0a --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/director/start_director.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +fx director start --disable-tls -c director_config.yaml \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/envoy_config.yaml b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/envoy_config.yaml new file mode 100644 index 0000000000..107c566945 --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/envoy_config.yaml @@ -0,0 +1,12 @@ +params: + cuda_devices: [] + +optional_plugin_components: {} + +shard_descriptor: + template: linreg_shard_descriptor.LinRegSD + params: + rank: 1 + n_samples: 80 + noise: 0.15 + \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/linreg_shard_descriptor.py b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/linreg_shard_descriptor.py new file mode 100644 index 0000000000..12bfed9df4 --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/linreg_shard_descriptor.py @@ -0,0 +1,60 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Noisy-Sin Shard Descriptor.""" + +from typing import List + +import numpy as np + +from openfl.interface.interactive_api.shard_descriptor import ShardDescriptor + + +class LinRegSD(ShardDescriptor): + """Shard descriptor class.""" + + def __init__(self, rank: int, n_samples: int = 10, noise: float = 0.15) -> None: + """ + Initialize LinReg Shard Descriptor. + + This Shard Descriptor generate random data. Sample features are + floats between pi/3 and 5*pi/3, and targets are calculated + calculated as sin(feature) + normal_noise. + """ + np.random.seed(rank) # Setting seed for reproducibility + self.n_samples = max(n_samples, 5) + self.interval = 240 + self.x_start = 60 + x = np.random.rand(n_samples, 1) * self.interval + self.x_start + x *= np.pi / 180 + y = np.sin(x) + np.random.normal(0, noise, size=(n_samples, 1)) + self.data = np.concatenate((x, y), axis=1) + + def get_dataset(self, dataset_type: str) -> np.ndarray: + """ + Return a shard dataset by type. + + A simple list with elements (x, y) implemets the Shard Dataset interface. + """ + if dataset_type == 'train': + return self.data[:self.n_samples // 2] + elif dataset_type == 'val': + return self.data[self.n_samples // 2:] + else: + pass + + @property + def sample_shape(self) -> List[str]: + """Return the sample shape info.""" + (*x, _) = self.data[0] + return [str(i) for i in np.array(x, ndmin=1).shape] + + @property + def target_shape(self) -> List[str]: + """Return the target shape info.""" + (*_, y) = self.data[0] + return [str(i) for i in np.array(y, ndmin=1).shape] + + @property + def dataset_description(self) -> str: + """Return the dataset description.""" + return 'Allowed dataset types are `train` and `val`' diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/requirements.txt b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/requirements.txt new file mode 100644 index 0000000000..f1ad86dd90 --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/requirements.txt @@ -0,0 +1,7 @@ +openfl>=1.2.1 +numpy>=1.13.3 +scikit-learn>=0.24.1 +matplotlib>=2.0.0 +mistune>=2.0.3 # not directly required, pinned by Snyk to avoid a vulnerability +setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability +wheel>=0.38.0 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/start_envoy.sh new file mode 100755 index 0000000000..4da07821af --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/envoy/start_envoy.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e +ENVOY_NAME=$1 +ENVOY_CONF=$2 + +fx envoy start -n "$ENVOY_NAME" --disable-tls --envoy-config-path "$ENVOY_CONF" -dh localhost -dp 50050 diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/custom_adapter.py b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/custom_adapter.py new file mode 100644 index 0000000000..6991d2b0ff --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/custom_adapter.py @@ -0,0 +1,21 @@ +# Copyright (C) 2020-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Custom model numpy adapter.""" + +from openfl.plugins.frameworks_adapters.framework_adapter_interface import ( + FrameworkAdapterPluginInterface, +) + + +class CustomFrameworkAdapter(FrameworkAdapterPluginInterface): + """Framework adapter plugin class.""" + + @staticmethod + def get_tensor_dict(model, optimizer=None): + """Extract tensors from a model.""" + return {'w': model.weights} + + @staticmethod + def set_tensor_dict(model, tensor_dict, optimizer=None, device='cpu'): + """Load tensors to a model.""" + model.weights = tensor_dict['w'] diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/requirements.txt b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/requirements.txt new file mode 100644 index 0000000000..f1ad86dd90 --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/requirements.txt @@ -0,0 +1,7 @@ +openfl>=1.2.1 +numpy>=1.13.3 +scikit-learn>=0.24.1 +matplotlib>=2.0.0 +mistune>=2.0.3 # not directly required, pinned by Snyk to avoid a vulnerability +setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability +wheel>=0.38.0 # not directly required, pinned by Snyk to avoid a vulnerability diff --git a/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/scikit_learn_linear_regression.ipynb b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/scikit_learn_linear_regression.ipynb new file mode 100644 index 0000000000..4ef7023cc9 --- /dev/null +++ b/openfl-tutorials/interactive_api/scikit_learn_linear_regression/workspace/scikit_learn_linear_regression.ipynb @@ -0,0 +1,420 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "689ee822", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "d63e64c6-9955-4afc-8d04-d8c85bb28edc", + "metadata": {}, + "source": [ + "# Scikit-learn Linear Regression Example - Interactive API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c9eee14-22a1-4d48-a7da-e68d01037cd4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from typing import List, Union\n", + "\n", + "from sklearn.linear_model import Lasso\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.metrics import mean_squared_error\n", + "import numpy as np\n", + "import random\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "from matplotlib.pylab import rcParams\n", + "rcParams['figure.figsize'] = 7, 5" + ] + }, + { + "cell_type": "markdown", + "id": "c4b334ef-6a72-4b82-b978-1401973d0512", + "metadata": { + "tags": [] + }, + "source": [ + "# We will use MSE as loss function and Ridge weights regularization\n", + "![image.png](https://www.analyticsvidhya.com/wp-content/uploads/2016/01/eq5-1.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4cc8ec2-b818-4db8-8700-39c1a12917df", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "class SklearnLinearRegressionLasso:\n", + " def __init__(self, n_feat: int, alpha: float = 1.0) -> None:\n", + " self.model = Lasso(alpha=alpha)\n", + " self.scaler = StandardScaler()\n", + " self.weights = np.ones((n_feat + 1)) \n", + " \n", + " def predict(self, feature_vector: Union[np.ndarray, List[int]]) -> float:\n", + " '''\n", + " feature_vector may be a list or have shape (n_feat,)\n", + " or it may be a bunch of vectors (n_vec, nfeat)\n", + " '''\n", + " feature_vector = np.array(feature_vector)\n", + " if len(feature_vector.shape) == 1:\n", + " feature_vector = feature_vector[:,np.newaxis]\n", + " \n", + " feature_vector = self.scaler.transform(feature_vector)\n", + " return self.model.predict(feature_vector)\n", + " \n", + " def mse(self, X: np.ndarray, Y: np.ndarray) -> float:\n", + " Y_predict = self.predict(X)\n", + " return mean_squared_error(Y, Y_predict)\n", + " \n", + " def fit(self, X: np.ndarray, Y: np.ndarray, silent: bool=False) -> None:\n", + " \n", + " X = self.scaler.fit_transform(X)\n", + " self.model.fit(X, Y)\n", + " mse = self.mse(X, Y)\n", + " #self.weights[:-1] = self.model.coef_\n", + " #self.weights[-1] = self.model.intercept_\n", + " if not silent:\n", + " print(f'MSE: {mse}')\n", + " \n", + " def print_parameters(self) -> None:\n", + " print('Final parameters: ')\n", + " print(f'Weights: {self.model.coef_}')\n", + " print(f'Bias: {self.model.intercept_}')\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af89e7e5-6cfc-46bc-acd2-7d5bfb373091", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Define input array with angles from 60deg to 300deg converted to radians\n", + "x = np.array([i*np.pi/180 for i in range(60,300,4)])\n", + "np.random.seed(10) # Setting seed for reproducibility\n", + "y = np.sin(x) + np.random.normal(0,0.15,len(x))\n", + "# plt.plot(x,y,'.')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffefca2b-d7f6-4111-8872-c017c182a2de", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "lr_model = SklearnLinearRegressionLasso(n_feat=1, alpha=0.1)\n", + "\n", + "lr_model.fit(x[:,np.newaxis], y)\n", + "\n", + "#print the final parameters\n", + "lr_model.print_parameters()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "410f2d80-989a-43ab-958f-7b68fd8f2e90", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# We can also solve this 1D problem using Numpy\n", + "numpy_solution = np.polyfit(x,y,1)\n", + "predictor_np = np.poly1d(numpy_solution)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cb323db-9f3a-42af-94da-4b170adef867", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "y_hat = lr_model.predict(x)\n", + "y_np = predictor_np(x)\n", + "plt.plot(x,y,'.')\n", + "plt.plot(x,y_hat,'.')\n", + "plt.plot(x,y_np,'--')" + ] + }, + { + "cell_type": "markdown", + "id": "ffd4d2d7-5537-496a-88c1-301da87d979c", + "metadata": {}, + "source": [ + "# Now we run the same training on federated data" + ] + }, + { + "cell_type": "markdown", + "id": "09cf7090-da51-4f4e-9d28-2a5c6e3bca02", + "metadata": {}, + "source": [ + "## Connect to a Federation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b3c0039-e1f7-4047-b98b-a2d4bd42f015", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Create a federation\n", + "from openfl.interface.interactive_api.federation import Federation\n", + "\n", + "# please use the same identificator that was used in signed certificate\n", + "client_id = 'frontend'\n", + "director_node_fqdn = 'localhost'\n", + "director_port = 50050\n", + "\n", + "federation = Federation(\n", + " client_id=client_id,\n", + " director_node_fqdn=director_node_fqdn,\n", + " director_port=director_port,\n", + " tls=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7815120e-b704-4a7d-a65a-3c7542023ead", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "shard_registry = federation.get_shard_registry()\n", + "shard_registry" + ] + }, + { + "cell_type": "markdown", + "id": "b011dd95-64a7-4a8b-91ec-e61cdf885bbb", + "metadata": {}, + "source": [ + "### Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1985ac9-a2b1-4561-a962-6adfe35c3b97", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from openfl.interface.interactive_api.experiment import TaskInterface, DataInterface, ModelInterface, FLExperiment\n", + "\n", + "class LinRegDataSet(DataInterface):\n", + " def __init__(self, **kwargs):\n", + " \"\"\"Initialize DataLoader.\"\"\"\n", + " self.kwargs = kwargs\n", + " pass\n", + "\n", + " @property\n", + " def shard_descriptor(self):\n", + " \"\"\"Return shard descriptor.\"\"\"\n", + " return self._shard_descriptor\n", + " \n", + " @shard_descriptor.setter\n", + " def shard_descriptor(self, shard_descriptor):\n", + " \"\"\"\n", + " Describe per-collaborator procedures or sharding.\n", + "\n", + " This method will be called during a collaborator initialization.\n", + " Local shard_descriptor will be set by Envoy.\n", + " \"\"\"\n", + " self._shard_descriptor = shard_descriptor\n", + " self.train_set = shard_descriptor.get_dataset(\"train\")\n", + " self.val_set = shard_descriptor.get_dataset(\"val\")\n", + "\n", + " def get_train_loader(self, **kwargs):\n", + " \"\"\"Output of this method will be provided to tasks with optimizer in contract.\"\"\"\n", + " return self.train_set\n", + "\n", + " def get_valid_loader(self, **kwargs):\n", + " \"\"\"Output of this method will be provided to tasks without optimizer in contract.\"\"\"\n", + " return self.val_set\n", + "\n", + " def get_train_data_size(self):\n", + " \"\"\"Information for aggregation.\"\"\"\n", + " return len(self.train_set)\n", + "\n", + " def get_valid_data_size(self):\n", + " \"\"\"Information for aggregation.\"\"\"\n", + " return len(self.val_set)\n", + " \n", + "lin_reg_dataset = LinRegDataSet()" + ] + }, + { + "cell_type": "markdown", + "id": "b8909127-99d1-4dba-86fe-01a1b86585e7", + "metadata": {}, + "source": [ + "### Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9523c9a2-a259-461f-937f-1fb054bd2886", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "framework_adapter = 'custom_adapter.CustomFrameworkAdapter'\n", + "fed_model = SklearnLinearRegressionLasso(n_feat=1, alpha=0.1)\n", + "MI = ModelInterface(model=fed_model, optimizer=None, framework_plugin=framework_adapter)\n", + "\n", + "# Save the initial model state\n", + "initial_model = SklearnLinearRegressionLasso(n_feat=1, alpha=0.1)" + ] + }, + { + "cell_type": "markdown", + "id": "2e3558bb-b21b-48ac-b07e-43cf75e6907b", + "metadata": {}, + "source": [ + "### Tasks\n", + "We need to employ a trick reporting metrics. OpenFL decides which model is the best based on an *increasing* metric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f73e1ff9-d54a-49b5-9ce8-8bc72c6a2c6f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TI = TaskInterface()\n", + "\n", + "@TI.register_fl_task(model='my_model', data_loader='train_data', \\\n", + " device='device', optimizer='optimizer') \n", + "def train(my_model, train_data, optimizer, device):\n", + " X, Y = train_data[:,:-1], train_data[:,-1]\n", + " my_model.fit(X, Y, silent=True)\n", + " return {'train_MSE': my_model.mse(X, Y),}\n", + "\n", + "@TI.register_fl_task(model='my_model', data_loader='val_data', device='device') \n", + "def validate(my_model, val_data, device):\n", + " X, Y = val_data[:,:-1], val_data[:,-1] \n", + " return {'validation_MSE': my_model.mse(X, Y),}" + ] + }, + { + "cell_type": "markdown", + "id": "ee7659cc-6e03-43f5-9078-95707fa0e4d5", + "metadata": { + "tags": [] + }, + "source": [ + "### Run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "749100e8-05ce-418c-a980-545e3beb900b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "experiment_name = 'linear_regression_experiment'\n", + "fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16bf1df7-8ca8-4a5e-a833-47c265c11e05", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fl_experiment.start(model_provider=MI, \n", + " task_keeper=TI,\n", + " data_loader=lin_reg_dataset,\n", + " rounds_to_train=10,)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1178d1ea-05e6-46be-ac07-21620bd6ec76", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fl_experiment.stream_metrics()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c04b4ab2-1d40-44c7-907b-a6a7d176c959", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}