From 0f429c800dbd38b35290ef572e5968b888cbf170 Mon Sep 17 00:00:00 2001 From: jeff-shepherd <39775772+jeff-shepherd@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:44:37 -0700 Subject: [PATCH] Removed horovod samples (#3434) These samples were not compatible with tensorflow 2.16, which is required to avoid security vulnerabilities --- ...nsorflow-mnist-distributed-horovod-job.yml | 68 ------- ...d-tensorflow-mnist-distributed-horovod.yml | 94 --------- cli/README.md | 1 - .../mnist-distributed-horovod/job.yml | 17 -- .../mnist-distributed-horovod/src/train.py | 120 ----------- sdk/python/README.md | 1 - .../mnist-distributed-horovod/src/train.py | 120 ----------- ...tensorflow-mnist-distributed-horovod.ipynb | 191 ------------------ 8 files changed, 612 deletions(-) delete mode 100644 .github/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-horovod-job.yml delete mode 100644 .github/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-horovod-tensorflow-mnist-distributed-horovod.yml delete mode 100644 cli/jobs/single-step/tensorflow/mnist-distributed-horovod/job.yml delete mode 100644 cli/jobs/single-step/tensorflow/mnist-distributed-horovod/src/train.py delete mode 100644 sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/src/train.py delete mode 100644 sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/tensorflow-mnist-distributed-horovod.ipynb diff --git a/.github/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-horovod-job.yml b/.github/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-horovod-job.yml deleted file mode 100644 index c882ef830a1..00000000000 --- a/.github/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-horovod-job.yml +++ /dev/null @@ -1,68 +0,0 @@ -# This code is autogenerated. -# Code is generated by running custom script: python3 readme.py -# Any manual changes to this file may cause incorrect behavior. -# Any manual changes will be overwritten if the code is regenerated. - -name: cli-jobs-single-step-tensorflow-mnist-distributed-horovod-job -on: - workflow_dispatch: - schedule: - - cron: "54 3/12 * * *" - pull_request: - branches: - - main - paths: - - cli/jobs/single-step/tensorflow/mnist-distributed-horovod/** - - infra/bootstrapping/** - - .github/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-horovod-job.yml - - cli/setup.sh -permissions: - id-token: write -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: check out repo - uses: actions/checkout@v2 - - name: azure login - uses: azure/login@v1 - with: - client-id: ${{ secrets.OIDC_AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.OIDC_AZURE_TENANT_ID }} - subscription-id: ${{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }} - - name: bootstrap resources - run: | - echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; - bash bootstrap.sh - working-directory: infra/bootstrapping - continue-on-error: false - - name: setup-cli - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: cli - continue-on-error: true - - name: Eagerly cache access tokens for required scopes - run: | - # Workaround for azure-cli's lack of support for ID token refresh - # Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617 - - # Management - az account get-access-token --scope https://management.azure.com/.default --output none - # ML - az account get-access-token --scope https://ml.azure.com/.default --output none - - name: run job - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash -x ../../../../run-job.sh job.yml - working-directory: cli/jobs/single-step/tensorflow/mnist-distributed-horovod - - name: validate readme - run: | - python check-readme.py "${{ github.workspace }}/cli/jobs/single-step/tensorflow/mnist-distributed-horovod" - working-directory: infra/bootstrapping - continue-on-error: false diff --git a/.github/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-horovod-tensorflow-mnist-distributed-horovod.yml b/.github/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-horovod-tensorflow-mnist-distributed-horovod.yml deleted file mode 100644 index b612841ebd3..00000000000 --- a/.github/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-horovod-tensorflow-mnist-distributed-horovod.yml +++ /dev/null @@ -1,94 +0,0 @@ -# This code is autogenerated. -# Code is generated by running custom script: python3 readme.py -# Any manual changes to this file may cause incorrect behavior. -# Any manual changes will be overwritten if the code is regenerated. - -name: sdk-jobs-single-step-tensorflow-mnist-distributed-horovod-tensorflow-mnist-distributed-horovod -# This file is created by sdk/python/readme.py. -# Please do not edit directly. -on: - workflow_dispatch: - schedule: - - cron: "9 5/12 * * *" - pull_request: - branches: - - main - paths: - - sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/** - - .github/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-horovod-tensorflow-mnist-distributed-horovod.yml - - sdk/python/dev-requirements.txt - - infra/bootstrapping/** - - sdk/python/setup.sh - -permissions: - id-token: write -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: check out repo - uses: actions/checkout@v2 - - name: setup python - uses: actions/setup-python@v2 - with: - python-version: "3.10" - - name: pip install notebook reqs - run: pip install -r sdk/python/dev-requirements.txt - - name: azure login - uses: azure/login@v1 - with: - client-id: ${{ secrets.OIDC_AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.OIDC_AZURE_TENANT_ID }} - subscription-id: ${{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }} - - name: bootstrap resources - run: | - echo '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'; - bash bootstrap.sh - working-directory: infra/bootstrapping - continue-on-error: false - - name: setup SDK - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: sdk/python - continue-on-error: true - - name: validate readme - run: | - python check-readme.py "${{ github.workspace }}/sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod" - working-directory: infra/bootstrapping - continue-on-error: false - - name: setup-cli - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: cli - continue-on-error: true - - name: Eagerly cache access tokens for required scopes - run: | - # Workaround for azure-cli's lack of support for ID token refresh - # Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617 - - # Management - az account get-access-token --scope https://management.azure.com/.default --output none - # ML - az account get-access-token --scope https://ml.azure.com/.default --output none - - name: run jobs/single-step/tensorflow/mnist-distributed-horovod/tensorflow-mnist-distributed-horovod.ipynb - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json"; - bash "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "tensorflow-mnist-distributed-horovod.ipynb"; - [ -f "../../.azureml/config" ] && cat "../../.azureml/config"; - papermill -k python tensorflow-mnist-distributed-horovod.ipynb tensorflow-mnist-distributed-horovod.output.ipynb - working-directory: sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod - - name: upload notebook's working folder as an artifact - if: ${{ always() }} - uses: ./.github/actions/upload-artifact - with: - name: tensorflow-mnist-distributed-horovod - path: sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod diff --git a/cli/README.md b/cli/README.md index 71e82420480..09c73dc6de1 100644 --- a/cli/README.md +++ b/cli/README.md @@ -97,7 +97,6 @@ path|status|description [jobs/single-step/scikit-learn/iris/job-docker-context.yml](jobs/single-step/scikit-learn/iris/job-docker-context.yml)|[![jobs/single-step/scikit-learn/iris/job-docker-context](https://github.com/Azure/azureml-examples/workflows/cli-jobs-single-step-scikit-learn-iris-job-docker-context/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-single-step-scikit-learn-iris-job-docker-context.yml)|Train a scikit-learn SVM on the Iris dataset using a custom Docker container build. [jobs/single-step/scikit-learn/iris/job-sweep.yml](jobs/single-step/scikit-learn/iris/job-sweep.yml)|[![jobs/single-step/scikit-learn/iris/job-sweep](https://github.com/Azure/azureml-examples/workflows/cli-jobs-single-step-scikit-learn-iris-job-sweep/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-single-step-scikit-learn-iris-job-sweep.yml)|Sweep hyperparemeters for training a scikit-learn SVM on the Iris dataset. [jobs/single-step/scikit-learn/iris/job.yml](jobs/single-step/scikit-learn/iris/job.yml)|[![jobs/single-step/scikit-learn/iris/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-single-step-scikit-learn-iris-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-single-step-scikit-learn-iris-job.yml)|Train a scikit-learn SVM on the Iris dataset. -[jobs/single-step/tensorflow/mnist-distributed-horovod/job.yml](jobs/single-step/tensorflow/mnist-distributed-horovod/job.yml)|[![jobs/single-step/tensorflow/mnist-distributed-horovod/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-horovod-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-horovod-job.yml)|Train a basic neural network with TensorFlow on the MNIST dataset, distributed via Horovod. [jobs/single-step/tensorflow/mnist-distributed/job.yml](jobs/single-step/tensorflow/mnist-distributed/job.yml)|[![jobs/single-step/tensorflow/mnist-distributed/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-single-step-tensorflow-mnist-distributed-job.yml)|Train a basic neural network with TensorFlow on the MNIST dataset, distributed via TensorFlow. [jobs/single-step/tensorflow/mnist/job.yml](jobs/single-step/tensorflow/mnist/job.yml)|[![jobs/single-step/tensorflow/mnist/job](https://github.com/Azure/azureml-examples/workflows/cli-jobs-single-step-tensorflow-mnist-job/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-single-step-tensorflow-mnist-job.yml)|Train a basic neural network with TensorFlow on the MNIST dataset. [jobs/basics/hello-code.yml](jobs/basics/hello-code.yml)|[![jobs/basics/hello-code](https://github.com/Azure/azureml-examples/workflows/cli-jobs-basics-hello-code/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-jobs-basics-hello-code.yml)|*no description* diff --git a/cli/jobs/single-step/tensorflow/mnist-distributed-horovod/job.yml b/cli/jobs/single-step/tensorflow/mnist-distributed-horovod/job.yml deleted file mode 100644 index 856c9cfe160..00000000000 --- a/cli/jobs/single-step/tensorflow/mnist-distributed-horovod/job.yml +++ /dev/null @@ -1,17 +0,0 @@ -$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json -code: src -command: >- - python train.py - --epochs ${{inputs.epochs}} -inputs: - epochs: 1 -environment: azureml:AzureML-tensorflow-2.12-cuda11@latest -compute: azureml:gpu-cluster -resources: - instance_count: 2 -distribution: - type: mpi - process_count_per_instance: 1 -display_name: tensorflow-mnist-distributed-horovod-example -experiment_name: tensorflow-mnist-distributed-horovod-example -description: Train a basic neural network with TensorFlow on the MNIST dataset, distributed via Horovod. diff --git a/cli/jobs/single-step/tensorflow/mnist-distributed-horovod/src/train.py b/cli/jobs/single-step/tensorflow/mnist-distributed-horovod/src/train.py deleted file mode 100644 index ddc4b4f3f91..00000000000 --- a/cli/jobs/single-step/tensorflow/mnist-distributed-horovod/src/train.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Script adapted from: https://github.com/horovod/horovod/blob/master/examples/tensorflow2_keras_mnist.py -# ============================================================================== - -import tensorflow as tf -import horovod.tensorflow.keras as hvd - -import os -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument("--learning-rate", "-lr", type=float, default=0.001) -parser.add_argument("--epochs", type=int, default=24) -parser.add_argument("--steps-per-epoch", type=int, default=500) -args = parser.parse_args() - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -gpus = tf.config.experimental.list_physical_devices("GPU") -for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) -if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") - -(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( - path="mnist-%d.npz" % hvd.rank() -) - -dataset = tf.data.Dataset.from_tensor_slices( - ( - tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), - tf.cast(mnist_labels, tf.int64), - ) -) -dataset = dataset.repeat().shuffle(10000).batch(128) - -mnist_model = tf.keras.Sequential( - [ - tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), - tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation="softmax"), - ] -) - -# Horovod: adjust learning rate based on number of GPUs. -scaled_lr = args.learning_rate * hvd.size() -opt = tf.optimizers.Adam(scaled_lr) - -# Horovod: add Horovod DistributedOptimizer. -opt = hvd.DistributedOptimizer(opt) - -# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow -# uses hvd.DistributedOptimizer() to compute gradients. -mnist_model.compile( - loss=tf.losses.SparseCategoricalCrossentropy(), - optimizer=opt, - metrics=["accuracy"], - experimental_run_tf_function=False, -) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), - # Horovod: average metrics among workers at the end of every epoch. - # - # Note: This callback must be in the list before the ReduceLROnPlateau, - # TensorBoard or other metrics-based callbacks. - hvd.callbacks.MetricAverageCallback(), - # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final - # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during - # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. - hvd.callbacks.LearningRateWarmupCallback( - warmup_epochs=3, initial_lr=scaled_lr, verbose=1 - ), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - output_dir = "./outputs" - os.makedirs(output_dir, exist_ok=True) - callbacks.append( - tf.keras.callbacks.ModelCheckpoint( - os.path.join(output_dir, "checkpoint-{epoch}.h5") - ) - ) - -# Horovod: write logs on worker 0. -verbose = 1 if hvd.rank() == 0 else 0 - -# Train the model. -# Horovod: adjust number of steps based on number of GPUs. -mnist_model.fit( - dataset, - steps_per_epoch=args.steps_per_epoch // hvd.size(), - callbacks=callbacks, - epochs=args.epochs, - verbose=verbose, -) diff --git a/sdk/python/README.md b/sdk/python/README.md index 3a635e0b760..cb853bf96ac 100644 --- a/sdk/python/README.md +++ b/sdk/python/README.md @@ -278,7 +278,6 @@ Test Status is for branch - **_main_** |jobs|single-step|[iris-scikit-learn](jobs/single-step/scikit-learn/iris/iris-scikit-learn.ipynb)|Run Command to train a scikit-learn SVM on the Iris dataset|[![iris-scikit-learn](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-scikit-learn-iris-iris-scikit-learn.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-scikit-learn-iris-iris-scikit-learn.yml)| |jobs|single-step|[sklearn-mnist](jobs/single-step/scikit-learn/mnist/sklearn-mnist.ipynb)|Run a Command to train a scikit-learn SVM on the mnist dataset.|[![sklearn-mnist](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-scikit-learn-mnist-sklearn-mnist.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-scikit-learn-mnist-sklearn-mnist.yml)| |jobs|single-step|[train-hyperparameter-tune-with-sklearn](jobs/single-step/scikit-learn/train-hyperparameter-tune-deploy-with-sklearn/train-hyperparameter-tune-with-sklearn.ipynb)|Train and tune a machine learning model using scikit-learn training scripts to build a to classify iris flower images. - _This sample is excluded from automated tests_|[![train-hyperparameter-tune-with-sklearn](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-scikit-learn-train-hyperparameter-tune-deploy-with-sklearn-train-hyperparameter-tune-with-sklearn.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-scikit-learn-train-hyperparameter-tune-deploy-with-sklearn-train-hyperparameter-tune-with-sklearn.yml)| -|jobs|single-step|[tensorflow-mnist-distributed-horovod](jobs/single-step/tensorflow/mnist-distributed-horovod/tensorflow-mnist-distributed-horovod.ipynb)|Run a **Distributed Command** to train a basic neural network with distributed MPI on the MNIST dataset using Horovod|[![tensorflow-mnist-distributed-horovod](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-horovod-tensorflow-mnist-distributed-horovod.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-horovod-tensorflow-mnist-distributed-horovod.yml)| |jobs|single-step|[tensorflow-mnist-distributed](jobs/single-step/tensorflow/mnist-distributed/tensorflow-mnist-distributed.ipynb)|Run a **Distributed Command** to train a basic neural network with TensorFlow on the MNIST dataset|[![tensorflow-mnist-distributed](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-tensorflow-mnist-distributed.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-tensorflow-mnist-distributed-tensorflow-mnist-distributed.yml)| |jobs|single-step|[tensorflow-mnist](jobs/single-step/tensorflow/mnist/tensorflow-mnist.ipynb)|Run a Command to train a basic neural network with TensorFlow on the MNIST dataset|[![tensorflow-mnist](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-tensorflow-mnist-tensorflow-mnist.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-tensorflow-mnist-tensorflow-mnist.yml)| |jobs|single-step|[train-hyperparameter-tune-deploy-with-keras](jobs/single-step/tensorflow/train-hyperparameter-tune-deploy-with-keras/train-hyperparameter-tune-deploy-with-keras.ipynb)|Train, hyperparameter tune, and deploy a Keras model to classify handwritten digits using a deep neural network (DNN). - _This sample is excluded from automated tests_|[![train-hyperparameter-tune-deploy-with-keras](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-tensorflow-train-hyperparameter-tune-deploy-with-keras-train-hyperparameter-tune-deploy-with-keras.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-single-step-tensorflow-train-hyperparameter-tune-deploy-with-keras-train-hyperparameter-tune-deploy-with-keras.yml)| diff --git a/sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/src/train.py b/sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/src/train.py deleted file mode 100644 index ddc4b4f3f91..00000000000 --- a/sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/src/train.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2019 Uber Technologies, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Script adapted from: https://github.com/horovod/horovod/blob/master/examples/tensorflow2_keras_mnist.py -# ============================================================================== - -import tensorflow as tf -import horovod.tensorflow.keras as hvd - -import os -import argparse - -parser = argparse.ArgumentParser() -parser.add_argument("--learning-rate", "-lr", type=float, default=0.001) -parser.add_argument("--epochs", type=int, default=24) -parser.add_argument("--steps-per-epoch", type=int, default=500) -args = parser.parse_args() - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -gpus = tf.config.experimental.list_physical_devices("GPU") -for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) -if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU") - -(mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data( - path="mnist-%d.npz" % hvd.rank() -) - -dataset = tf.data.Dataset.from_tensor_slices( - ( - tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), - tf.cast(mnist_labels, tf.int64), - ) -) -dataset = dataset.repeat().shuffle(10000).batch(128) - -mnist_model = tf.keras.Sequential( - [ - tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), - tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation="softmax"), - ] -) - -# Horovod: adjust learning rate based on number of GPUs. -scaled_lr = args.learning_rate * hvd.size() -opt = tf.optimizers.Adam(scaled_lr) - -# Horovod: add Horovod DistributedOptimizer. -opt = hvd.DistributedOptimizer(opt) - -# Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow -# uses hvd.DistributedOptimizer() to compute gradients. -mnist_model.compile( - loss=tf.losses.SparseCategoricalCrossentropy(), - optimizer=opt, - metrics=["accuracy"], - experimental_run_tf_function=False, -) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), - # Horovod: average metrics among workers at the end of every epoch. - # - # Note: This callback must be in the list before the ReduceLROnPlateau, - # TensorBoard or other metrics-based callbacks. - hvd.callbacks.MetricAverageCallback(), - # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final - # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during - # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. - hvd.callbacks.LearningRateWarmupCallback( - warmup_epochs=3, initial_lr=scaled_lr, verbose=1 - ), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - output_dir = "./outputs" - os.makedirs(output_dir, exist_ok=True) - callbacks.append( - tf.keras.callbacks.ModelCheckpoint( - os.path.join(output_dir, "checkpoint-{epoch}.h5") - ) - ) - -# Horovod: write logs on worker 0. -verbose = 1 if hvd.rank() == 0 else 0 - -# Train the model. -# Horovod: adjust number of steps based on number of GPUs. -mnist_model.fit( - dataset, - steps_per_epoch=args.steps_per_epoch // hvd.size(), - callbacks=callbacks, - epochs=args.epochs, - verbose=verbose, -) diff --git a/sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/tensorflow-mnist-distributed-horovod.ipynb b/sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/tensorflow-mnist-distributed-horovod.ipynb deleted file mode 100644 index 8b2d74e689f..00000000000 --- a/sdk/python/jobs/single-step/tensorflow/mnist-distributed-horovod/tensorflow-mnist-distributed-horovod.ipynb +++ /dev/null @@ -1,191 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Train a basic neural network with distributed MPI on the MNIST dataset using Horovod\n", - "\n", - "**Requirements** - In order to benefit from this tutorial, you will need:\n", - "- A basic understanding of Machine Learning\n", - "- An Azure account with an active subscription. [Create an account for free](https://azure.microsoft.com/free/?WT.mc_id=A261C142F)\n", - "- An Azure ML workspace with computer cluster - [Configure workspace](../../../configuration.ipynb) \n", - "\n", - "- A python environment\n", - "- Installed Azure Machine Learning Python SDK v2 - [install instructions](../../../../README.md) - check the getting started section\n", - "\n", - "**Learning Objectives** - By the end of this tutorial, you should be able to:\n", - "- Connect to your AML workspace from the Python SDK\n", - "- Create and run a **distributed** `Command` which executes a Python command\n", - "- Use a local file as an `input` to the Command\n", - "\n", - "**Motivations** - This notebook explains how to setup and run a Command. The Command is a fundamental construct of Azure Machine Learning. It can be used to run a task on a specified compute (either local or on the cloud). The Command accepts `environment` and `compute` to setup required infrastructure. You can define a `command` to run on this infrastructure with `inputs`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Connect to Azure Machine Learning Workspace\n", - "\n", - "The [workspace](https://docs.microsoft.com/en-us/azure/machine-learning/concept-workspace) is the top-level resource for Azure Machine Learning, providing a centralized place to work with all the artifacts you create when you use Azure Machine Learning. In this section we will connect to the workspace in which the job will be run.\n", - "\n", - "## 1.1. Import the required libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import required libraries\n", - "from azure.ai.ml import MLClient\n", - "from azure.ai.ml import command, MpiDistribution\n", - "from azure.identity import DefaultAzureCredential" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1.2. Configure workspace details and get a handle to the workspace\n", - "\n", - "To connect to a workspace, we need identifier parameters - a subscription, resource group and workspace name. We will use these details in the `MLClient` from `azure.ai.ml` to get a handle to the required Azure Machine Learning workspace. We use the default [default azure authentication](https://docs.microsoft.com/en-us/python/api/azure-identity/azure.identity.defaultazurecredential?view=azure-python) for this tutorial. Check the [configuration notebook](../../../configuration.ipynb) for more details on how to configure credentials and connect to a workspace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Enter details of your AML workspace\n", - "subscription_id = \"\"\n", - "resource_group = \"\"\n", - "workspace = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get a handle to the workspace\n", - "ml_client = MLClient(\n", - " DefaultAzureCredential(), subscription_id, resource_group, workspace\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Configure and run the Command\n", - "In this section we will configure and run a standalone job using the `command` class. The `command` class can be used to run standalone jobs and can also be used as a function inside pipelines.\n", - "\n", - "## 2.1 Configure the Command\n", - "The `command` allows user to configure the following key aspects.\n", - "- `code` - This is the path where the code to run the command is located\n", - "- `command` - This is the command that needs to be run\n", - "- `inputs` - This is the dictionary of inputs using name value pairs to the command. The key is a name for the input within the context of the job and the value is the input value. Inputs can be referenced in the `command` using the `${{inputs.}}` expression. To use files or folders as inputs, we can use the `Input` class. The `Input` class supports three parameters:\n", - " - `type` - The type of input. This can be a `uri_file` or `uri_folder`. The default is `uri_folder`. \n", - " - `path` - The path to the file or folder. These can be local or remote files or folders. For remote files - http/https, wasb are supported. \n", - " - Azure ML `data`/`dataset` or `datastore` are of type `uri_folder`. To use `data`/`dataset` as input, you can use registered dataset in the workspace using the format ':'. For e.g Input(type='uri_folder', path='my_dataset:1')\n", - " - `mode` - \tMode of how the data should be delivered to the compute target. Allowed values are `ro_mount`, `rw_mount` and `download`. Default is `ro_mount`\n", - "- `environment` - This is the environment needed for the command to run. Curated or custom environments from the workspace can be used. Or a custom environment can be created and used as well. Check out the [environment](../../../../assets/environment/environment.ipynb) notebook for more examples.\n", - "- `compute` - The compute on which the command will run. In this example we are using a compute called `cpu-cluster` present in the workspace. You can replace it any other compute in the workspace. You can run it on the local machine by using `local` for the compute. This will run the command on the local machine and all the run details and output of the job will be uploaded to the Azure ML workspace.\n", - "- `distribution` - Distribution configuration for distributed training scenarios. Azure Machine Learning supports PyTorch, TensorFlow, and MPI-based distributed training. The allowed values are `PyTorch`, `TensorFlow` or `Mpi`.\n", - "- `display_name` - The display name of the Job\n", - "- `description` - The description of the experiment\n", - "\n", - "In this example we will use `MPI` for distribution." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "name": "job" - }, - "outputs": [], - "source": [ - "from azure.ai.ml import command, MpiDistribution\n", - "\n", - "job = command(\n", - " code=\"./src\", # local path where the code is stored\n", - " command=\"python train.py --epochs ${{inputs.epochs}}\",\n", - " inputs={\"epochs\": 1},\n", - " environment=\"AzureML-tensorflow-2.12-cuda11@latest\",\n", - " compute=\"gpu-cluster\",\n", - " instance_count=2,\n", - " distribution=MpiDistribution(process_count_per_instance=2),\n", - " display_name=\"tensorflow-mnist-distributed-horovod-example\"\n", - " # experiment_name: tensorflow-mnist-distributed-horovod-example\n", - " # description: Train a basic neural network with TensorFlow on the MNIST dataset, distributed via Horovod.\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2.2 Run the Command\n", - "Using the `MLClient` created earlier, we will now run this Command as a job in the workspace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "name": "returned_job" - }, - "outputs": [], - "source": [ - "# submit the command\n", - "returned_job = ml_client.create_or_update(job)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Next Steps\n", - "You can see further examples of running a job [here](../../../single-step/)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "description": { - "description": "Run a **Distributed Command** to train a basic neural network with distributed MPI on the MNIST dataset using Horovod" - }, - "interpreter": { - "hash": "45ee23ad53d8447c1a4a7f9f605254595f8ee53c2e1723e7948bbd485e96ca91" - }, - "kernelspec": { - "display_name": "Python 3.10 - SDK V2", - "language": "python", - "name": "python310-sdkv2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}