From e3c1c44308a9981a8e1ca7e939d8d3774b6bf861 Mon Sep 17 00:00:00 2001 From: Sirui Tao <1318342565@qq.com> Date: Tue, 10 Jan 2023 13:14:47 +0800 Subject: [PATCH] [Nano] How-To Guides: Accelerate PyTorch Training with IPEX & Multi-instance & BF16 & Channels last (#7035) * add pytorch training ipex guide * add pytorch training multi-instance guide * add bf16 guide * small changes of presentation * add channels last guide * remove validation loader * hide code block * update based on comments * add guide for reference * update guides w.r.t. comments --- docs/readthedocs/source/_toc.yml | 4 + .../accelerate_pytorch_training_bf16.nblink | 3 + .../accelerate_pytorch_training_ipex.nblink | 3 + ...ate_pytorch_training_multi_instance.nblink | 3 + .../pytorch_training_channels_last.nblink | 3 + .../source/doc/Nano/Howto/index.rst | 4 + ...ch_lightning_training_multi_instance.ipynb | 4 +- .../accelerate_pytorch_training_bf16.ipynb | 382 ++++++++++++++++++ .../accelerate_pytorch_training_ipex.ipynb | 342 ++++++++++++++++ ...rate_pytorch_training_multi_instance.ipynb | 371 +++++++++++++++++ .../convert_pytorch_training_torchnano.ipynb | 16 +- .../pytorch_training_channels_last.ipynb | 366 +++++++++++++++++ .../use_nano_decorator_pytorch_training.ipynb | 16 +- 13 files changed, 1505 insertions(+), 12 deletions(-) create mode 100644 docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_bf16.nblink create mode 100644 docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_ipex.nblink create mode 100644 docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_multi_instance.nblink create mode 100644 docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/pytorch_training_channels_last.nblink create mode 100644 python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_bf16.ipynb create mode 100644 python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_ipex.ipynb create mode 100644 python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_multi_instance.ipynb create mode 100644 python/nano/tutorial/notebook/training/pytorch/pytorch_training_channels_last.ipynb diff --git a/docs/readthedocs/source/_toc.yml b/docs/readthedocs/source/_toc.yml index d4b319e557d..c03a4c78b13 100644 --- a/docs/readthedocs/source/_toc.yml +++ b/docs/readthedocs/source/_toc.yml @@ -108,6 +108,10 @@ subtrees: - file: doc/Nano/Howto/Training/PyTorchLightning/pytorch_lightning_training_bf16 - file: doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano - file: doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training + - file: doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_ipex + - file: doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_multi_instance + - file: doc/Nano/Howto/Training/PyTorch/pytorch_training_channels_last + - file: doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_bf16 - file: doc/Nano/Howto/Training/TensorFlow/accelerate_tensorflow_training_multi_instance - file: doc/Nano/Howto/Training/TensorFlow/tensorflow_training_embedding_sparseadam - file: doc/Nano/Howto/Training/TensorFlow/tensorflow_training_bf16 diff --git a/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_bf16.nblink b/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_bf16.nblink new file mode 100644 index 00000000000..38883226477 --- /dev/null +++ 
b/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_bf16.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../../../../../../python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_bf16.ipynb" +} \ No newline at end of file diff --git a/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_ipex.nblink b/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_ipex.nblink new file mode 100644 index 00000000000..c27ec97293e --- /dev/null +++ b/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_ipex.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../../../../../../python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_ipex.ipynb" +} \ No newline at end of file diff --git a/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_multi_instance.nblink b/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_multi_instance.nblink new file mode 100644 index 00000000000..f5ad8c48759 --- /dev/null +++ b/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_multi_instance.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../../../../../../python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_multi_instance.ipynb" +} \ No newline at end of file diff --git a/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/pytorch_training_channels_last.nblink b/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/pytorch_training_channels_last.nblink new file mode 100644 index 00000000000..271e0fbfcf9 --- /dev/null +++ b/docs/readthedocs/source/doc/Nano/Howto/Training/PyTorch/pytorch_training_channels_last.nblink @@ -0,0 +1,3 @@ +{ + "path": "../../../../../../../../python/nano/tutorial/notebook/training/pytorch/pytorch_training_channels_last.ipynb" +} \ No newline at end of file diff --git a/docs/readthedocs/source/doc/Nano/Howto/index.rst b/docs/readthedocs/source/doc/Nano/Howto/index.rst index 0177a95da5b..880745fd1e4 100644 --- a/docs/readthedocs/source/doc/Nano/Howto/index.rst +++ b/docs/readthedocs/source/doc/Nano/Howto/index.rst @@ -27,6 +27,10 @@ PyTorch ~~~~~~~~~~~~~~~~~~~~~~~~~ * |convert_pytorch_training_torchnano|_ * |use_nano_decorator_pytorch_training|_ +* `How to accelerate a PyTorch application on training workloads through Intel® Extension for PyTorch* `_ +* `How to accelerate a PyTorch application on training workloads through multiple instances `_ +* `How to use the channels last memory format in your PyTorch application for training `_ +* `How to conduct BFloat16 Mixed Precision training in your PyTorch application `_ .. |use_nano_decorator_pytorch_training| replace:: How to accelerate your PyTorch training loop with ``@nano`` decorator .. 
_use_nano_decorator_pytorch_training: Training/PyTorch/use_nano_decorator_pytorch_training.html diff --git a/python/nano/tutorial/notebook/training/pytorch-lightning/accelerate_pytorch_lightning_training_multi_instance.ipynb b/python/nano/tutorial/notebook/training/pytorch-lightning/accelerate_pytorch_lightning_training_multi_instance.ipynb index cb1c9ea6c10..f8a9d2f53ac 100644 --- a/python/nano/tutorial/notebook/training/pytorch-lightning/accelerate_pytorch_lightning_training_multi_instance.ipynb +++ b/python/nano/tutorial/notebook/training/pytorch-lightning/accelerate_pytorch_lightning_training_multi_instance.ipynb @@ -192,9 +192,9 @@ "source": [ "> 📝 **Note**\n", ">\n", - "> By setting `num_processes`, Nano will launch the specific number of processes to perform data-parallel training. By default, CPU cores will be automatically and evenly distributed among processes to avoid conflicts and maximize training throughput. If you would like to specifiy the CPU cores used by each process, You could set `cpu_for_each_process` to a list of length `num_processes`, in which each item is a list of CPU indices.\n", + "> By setting `num_processes`, Nano will launch the specific number of processes to perform data-parallel training. By default, CPU cores will be automatically and evenly distributed among processes to avoid conflicts and maximize training throughput. If you would like to specify the CPU cores used by each process, You could set `cpu_for_each_process` to a list of length `num_processes`, in which each item is a list of CPU indices.\n", "> \n", - "> During multi-instance training, the effective batch size is the `batch_size` (in dataloader) $\\times$ `num_processes`, which will cause the number of iterations in each epoch to reduce by a factor of `num_processes`. To achieve the same effect as single instance training, a common practice to compensate is to gradually increase the learning rate to `num_processes` times. BigDL-Nano Trainer enable this pratice by default through `auto_lr=True`\n", + "> During multi-instance training, the effective batch size is the `batch_size` (in dataloader) $\\times$ `num_processes`, which will cause the number of iterations in each epoch to reduce by a factor of `num_processes`. To achieve the same effect as single instance training, a common practice to compensate is to gradually increase the learning rate to `num_processes` times. BigDL-Nano Trainer enable this practice by default through `auto_lr=True`\n", ">\n", "> Please refer to the [API doc](https://bigdl.readthedocs.io/en/latest/doc/PythonAPI/Nano/pytorch.html#bigdl.nano.pytorch.Trainer) for more detailed information regarding multi-instance related parameters in `bigdl.nano.pytorch.Trainer`." 
] diff --git a/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_bf16.ipynb b/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_bf16.ipynb new file mode 100644 index 00000000000..2aaf2208cd7 --- /dev/null +++ b/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_bf16.ipynb @@ -0,0 +1,382 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[View the runnable example on GitHub](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_bf16.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use BFloat16 Mixed Precision for PyTorch Training" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Brain Floating Point Format (BFloat16) is a custom 16-bit floating point format designed for machine learning. BFloat16 is comprised of 1 sign bit, 8 exponent bits, and 7 mantissa bits. With the same number of exponent bits, BFloat16 has the same dynamic range as FP32, but requires only half the memory usage.\n", + "\n", + "BFloat16 Mixed Precision combines BFloat16 and FP32 during training, which could lead to increased performance and reduced memory usage. Compared to FP16 mixed precision, BFloat16 mixed precision has better numerical stability.\n", + "\n", + "By using `TorchNano` (`bigdl.nano.pytorch.TorchNano`), you can make very few code changes to use BFloat16 mixed precision for training. Here we provide __2__ ways to achieve this: A) subclass `TorchNano` or B) use `@nano` decorator. You can choose the appropriate one depending on your (preferred) code structure." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "## Prepare Environment for BigDL-Nano" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "At first, you need to install BigDL-Nano for PyTorch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "!pip install --pre --upgrade bigdl-nano[pytorch] # install the nightly-built version\n", + "!source bigdl-nano-init # set environment variables" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📝 **Note**\n", + ">\n", + "> Before starting your PyTorch application, it is highly recommended to run `source bigdl-nano-init` to set several environment variables based on your current hardware. Empirically, these variables will greatly improve performance for most PyTorch applications on training workloads." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "> ⚠️ **Warning**\n", + "> \n", + "> For Jupyter Notebook users, we recommend to run the commands above, especially `source bigdl-nano-init` before jupyter kernel is started, or some of the optimizations may not take effect." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> ⚠️ **Warning**\n", + "> \n", + "> Using BFloat16 precision with `torch < 1.12` may result in extremely slow training." 
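+ ">\n",
+ "> If you are not sure which version is installed, a quick check (a plain PyTorch snippet, nothing BigDL-specific) is:\n",
+ ">\n",
+ "> ```python\n",
+ "> import torch\n",
+ "> print(torch.__version__)  # versions earlier than 1.12 may make BFloat16 training extremely slow\n",
+ "> ```"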
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "## Pre-define Model and Dataloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "In this guide, we take the fine-tuning of a [ResNet-18 model](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet18.html) on [OxfordIIITPet dataset](https://pytorch.org/vision/main/generated/torchvision.datasets.OxfordIIITPet.html) as an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Define model and dataloader\n", + "\n", + "from torch import nn\n", + "from torchvision.models import resnet18\n", + "\n", + "class MyPytorchModule(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.model = resnet18(pretrained=True)\n", + " num_ftrs = self.model.fc.in_features\n", + " # Here the size of each output sample is set to 37.\n", + " self.model.fc = nn.Linear(num_ftrs, 37)\n", + "\n", + " def forward(self, x):\n", + " return self.model(x)\n", + "\n", + "\n", + "import torch\n", + "from torchvision import transforms\n", + "from torchvision.datasets import OxfordIIITPet\n", + "from torch.utils.data.dataloader import DataLoader\n", + "\n", + "def create_train_dataloader():\n", + " train_transform = transforms.Compose([transforms.Resize(256),\n", + " transforms.RandomCrop(224),\n", + " transforms.RandomHorizontalFlip(),\n", + " transforms.ColorJitter(brightness=.5, hue=.3),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406],\n", + " [0.229, 0.224, 0.225])])\n", + "\n", + " # apply data augmentation to the train_dataset\n", + " train_dataset = OxfordIIITPet(root=\"/tmp/data\", transform=train_transform, download=True)\n", + "\n", + " # prepare data loader\n", + " train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)\n", + "\n", + " return train_dataloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A) Subclass `TorchNano`" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In general, two steps are required if you choose to subclass `TorchNano`:\n", + "\n", + "1) import and subclass `TorchNano`, and override its `train()` method\n", + "2) instantiate it with setting `precision='bf16'`, then call the `train()` method\n", + "\n", + "For step 1, you can refer to [this page](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano.html) to achieve it (for consistency, we use the same model and dataset as an example). Supposing that you've already got a well-defined subclass `MyNano`, below line will instantiate it with enabling BFloat16 mixed precision and train your model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from bigdl.nano.pytorch import TorchNano # import TorchNano\n", + "\n", + "# subclass TorchNano and override its train method\n", + "class MyNano(TorchNano):\n", + " def train(self):\n", + " # Move the code for your custom training loops inside the train method\n", + " model = MyPytorchModule()\n", + " optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)\n", + " loss_fuc = torch.nn.CrossEntropyLoss()\n", + " train_loader = create_train_dataloader()\n", + "\n", + " # call setup method to set up model, optimizer(s),\n", + " # and dataloader(s) for accelerated training\n", + " model, optimizer, train_loader = self.setup(model, optimizer, train_loader)\n", + " num_epochs = 5\n", + "\n", + " for epoch in range(num_epochs):\n", + "\n", + " model.train()\n", + " train_loss, num = 0, 0\n", + " with tqdm(train_loader, unit=\"batch\") as tepoch:\n", + " for data, target in tepoch:\n", + " tepoch.set_description(f\"Epoch {epoch}\")\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = loss_fuc(output, target)\n", + " # Replace loss.backward() with self.backward(loss)\n", + " self.backward(loss)\n", + " optimizer.step()\n", + " loss_value = loss.sum()\n", + " train_loss += loss_value\n", + " num += 1\n", + " tepoch.set_postfix(loss=loss_value)\n", + " print(f'Train Epoch: {epoch}, avg_loss: {train_loss / num}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MyNano(precision='bf16').train()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "      _The detailed definition of_ `MyNano` _can be found in the_ [runnable example](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_bf16.ipynb)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, using BF16 precision on CPU without BF16 instruction support may affect training efficiency. You can set `use_ipex=True` and `precision='bf16'` simultaneously to enable IPEX ([Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch)), which adopts AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and other optimizations for BFloat16 mixed precision training to gain more acceleration:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MyNano(use_ipex=True, precision='bf16').train()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## B) Use `@nano` decorator" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`@nano` decorator is very friendly since you can only add 2 new lines (import it and wrap the training function) and enjoy the features brought by BigDL-Nano if you have already defined a PyTorch training function with a model, optimizers, and dataloaders as parameters. You can learn the usage and notes of it from [here](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training.html). The only difference when using BFloat16 mixed precision for training is that you should specify the decorator as `@nano(precision='bf16')`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from bigdl.nano.pytorch import nano # import nano decorator\n", + "\n", + "@nano(precision='bf16') # apply the decorator to the training loop\n", + "def training_loop(model, optimizer, train_loader, num_epochs, loss_func):\n", + "\n", + " for epoch in range(num_epochs):\n", + "\n", + " model.train()\n", + " train_loss, num = 0, 0\n", + " with tqdm(train_loader, unit=\"batch\") as tepoch:\n", + " for data, target in tepoch:\n", + " tepoch.set_description(f\"Epoch {epoch}\")\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = loss_func(output, target)\n", + " loss.backward()\n", + " optimizer.step()\n", + " loss_value = loss.sum()\n", + " train_loss += loss_value\n", + " num += 1\n", + " tepoch.set_postfix(loss=loss_value)\n", + " print(f'Train Epoch: {epoch}, avg_loss: {train_loss / num}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "model = MyPytorchModule()\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)\n", + "loss_func = torch.nn.CrossEntropyLoss()\n", + "train_loader = create_train_dataloader()\n", + "\n", + "training_loop(model, optimizer, train_loader, num_epochs=5, loss_func=loss_func)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "      _A runnable example including this_ `training_loop` _can be seen from_ [here](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_bf16.ipynb)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, using BF16 precision on CPU without BF16 instruction support may affect training efficiency. You can set `use_ipex=True` and `precision='bf16'` simultaneously to enable IPEX ([Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch)), which adopts AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and other optimizations for BFloat16 mixed precision training to gain more acceleration." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📚 **Related Readings**\n", + "> \n", + "> - [How to install BigDL-Nano](https://bigdl.readthedocs.io/en/latest/doc/Nano/Overview/nano.html#install)\n", + "> - [How to convert your PyTorch training loop to use TorchNano for acceleration](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano.html)\n", + "> - [How to accelerate your PyTorch training loop with \\@nano decorator](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training.html)\n", + "> - [How to accelerate a PyTorch application on training workloads through Intel® Extension for PyTorch*](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_ipex.html)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "8772eaeb16382a2d9dbb95ffcb3882976733f8dc8a0780f3e0ca9a3a7dc812c0" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_ipex.ipynb b/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_ipex.ipynb new file mode 100644 index 00000000000..03df5069e02 --- /dev/null +++ b/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_ipex.ipynb @@ -0,0 +1,342 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[View the runnable example on GitHub](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_ipex.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accelerate PyTorch Training using Intel® Extension for PyTorch*" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Intel® Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch) (also known as IPEX) can boost performance on Intel hardware with AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intel® Advanced Matrix Extensions (Intel® AMX) on Intel CPUs. By using `TorchNano` (`bigdl.nano.pytorch.TorchNano`), you can make very few code changes to accelerate training loops via IPEX. Here we provide __2__ ways to achieve this: A) subclass `TorchNano` or B) use `@nano` decorator. You can choose the appropriate one depending on your (preferred) code structure." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "## Prepare Environment for BigDL-Nano" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "At first, you need to install BigDL-Nano for PyTorch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "!pip install --pre --upgrade bigdl-nano[pytorch] # install the nightly-built version\n", + "!source bigdl-nano-init # set environment variables" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📝 **Note**\n", + ">\n", + "> Before starting your PyTorch application, it is highly recommended to run `source bigdl-nano-init` to set several environment variables based on your current hardware. Empirically, these variables will greatly improve performance for most PyTorch applications on training workloads." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "> ⚠️ **Warning**\n", + "> \n", + "> For Jupyter Notebook users, we recommend to run the commands above, especially `source bigdl-nano-init` before jupyter kernel is started, or some of the optimizations may not take effect." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "## Pre-define Model and Dataloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "In this guide, we take the fine-tuning of a [ResNet-18 model](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet18.html) on [OxfordIIITPet dataset](https://pytorch.org/vision/main/generated/torchvision.datasets.OxfordIIITPet.html) as an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Define model and dataloader\n", + "\n", + "from torch import nn\n", + "from torchvision.models import resnet18\n", + "\n", + "class MyPytorchModule(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.model = resnet18(pretrained=True)\n", + " num_ftrs = self.model.fc.in_features\n", + " # Here the size of each output sample is set to 37.\n", + " self.model.fc = nn.Linear(num_ftrs, 37)\n", + "\n", + " def forward(self, x):\n", + " return self.model(x)\n", + "\n", + "\n", + "import torch\n", + "from torchvision import transforms\n", + "from torchvision.datasets import OxfordIIITPet\n", + "from torch.utils.data.dataloader import DataLoader\n", + "\n", + "def create_train_dataloader():\n", + " train_transform = transforms.Compose([transforms.Resize(256),\n", + " transforms.RandomCrop(224),\n", + " transforms.RandomHorizontalFlip(),\n", + " transforms.ColorJitter(brightness=.5, hue=.3),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406],\n", + " [0.229, 0.224, 0.225])])\n", + "\n", + " # apply data augmentation to the train_dataset\n", + " train_dataset = OxfordIIITPet(root=\"/tmp/data\", transform=train_transform, download=True)\n", + "\n", + " # prepare data loader\n", + " train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)\n", + "\n", + " return train_dataloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A) Subclass `TorchNano`" + ] + }, + { 
+ "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In general, two steps are required if you choose to subclass `TorchNano`:\n", + "\n", + "1) import and subclass `TorchNano`, and override its `train()` method\n", + "2) instantiate it with setting `use_ipex=True` , then call the `train()` method\n", + "\n", + "For step 1, you can refer to [this page](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano.html) to achieve it (for consistency, we use the same model and dataset as an example). Supposing that you've already got a well-defined subclass `MyNano`, below line will instantiate it with enabling IPEX, and call its `train()` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from bigdl.nano.pytorch import TorchNano # import TorchNano\n", + "\n", + "# subclass TorchNano and override its train method\n", + "class MyNano(TorchNano):\n", + " def train(self):\n", + " # Move the code for your custom training loops inside the train method\n", + " model = MyPytorchModule()\n", + " optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)\n", + " loss_fuc = torch.nn.CrossEntropyLoss()\n", + " train_loader = create_train_dataloader()\n", + "\n", + " # call setup method to set up model, optimizer(s),\n", + " # and dataloader(s) for accelerated training\n", + " model, optimizer, train_loader = self.setup(model, optimizer, train_loader)\n", + " num_epochs = 5\n", + "\n", + " for epoch in range(num_epochs):\n", + "\n", + " model.train()\n", + " train_loss, num = 0, 0\n", + " with tqdm(train_loader, unit=\"batch\") as tepoch:\n", + " for data, target in tepoch:\n", + " tepoch.set_description(f\"Epoch {epoch}\")\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = loss_fuc(output, target)\n", + " # Replace loss.backward() with self.backward(loss)\n", + " self.backward(loss)\n", + " optimizer.step()\n", + " loss_value = loss.sum()\n", + " train_loss += loss_value\n", + " num += 1\n", + " tepoch.set_postfix(loss=loss_value)\n", + " print(f'Train Epoch: {epoch}, avg_loss: {train_loss / num}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MyNano(use_ipex=True).train()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "      _The detailed definition of_ `MyNano` _can be found in the_ [runnable example](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_ipex.ipynb)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## B) Use `@nano` decorator" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`@nano` decorator is very friendly since you can only add 2 new lines (import it and wrap the training function) and enjoy the features brought by BigDL-Nano if you have already defined a PyTorch training function with a model, optimizers, and dataloaders as parameters. You can learn the usage and notes of it from [here](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training.html). The only difference when using IPEX is that you should specify the decorator as `@nano(use_ipex=True)`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from bigdl.nano.pytorch import nano # import nano decorator\n", + "\n", + "@nano(use_ipex=True) # apply the decorator to the training loop\n", + "def training_loop(model, optimizer, train_loader, num_epochs, loss_func):\n", + "\n", + " for epoch in range(num_epochs):\n", + "\n", + " model.train()\n", + " train_loss, num = 0, 0\n", + " with tqdm(train_loader, unit=\"batch\") as tepoch:\n", + " for data, target in tepoch:\n", + " tepoch.set_description(f\"Epoch {epoch}\")\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = loss_func(output, target)\n", + " loss.backward()\n", + " optimizer.step()\n", + " loss_value = loss.sum()\n", + " train_loss += loss_value\n", + " num += 1\n", + " tepoch.set_postfix(loss=loss_value)\n", + " print(f'Train Epoch: {epoch}, avg_loss: {train_loss / num}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "model = MyPytorchModule()\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)\n", + "loss_func = torch.nn.CrossEntropyLoss()\n", + "train_loader = create_train_dataloader()\n", + "\n", + "training_loop(model, optimizer, train_loader, num_epochs=5, loss_func=loss_func)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "      _A runnable example including this_ `training_loop` _can be seen from_ [here](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_ipex.ipynb)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📚 **Related Readings**\n", + "> \n", + "> - [How to install BigDL-Nano](https://bigdl.readthedocs.io/en/latest/doc/Nano/Overview/nano.html#install)\n", + "> - [How to convert your PyTorch training loop to use TorchNano for acceleration](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano.html)\n", + "> - [How to accelerate your PyTorch training loop with \\@nano decorator](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training.html)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "8772eaeb16382a2d9dbb95ffcb3882976733f8dc8a0780f3e0ca9a3a7dc812c0" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_multi_instance.ipynb b/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_multi_instance.ipynb new file mode 100644 index 00000000000..40ede890c36 --- /dev/null +++ b/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_multi_instance.ipynb @@ -0,0 +1,371 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[View the runnable example on 
GitHub](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_multi_instance.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accelerate PyTorch Training using Multiple Instances" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`TorchNano` (`bigdl.nano.pytorch.TorchNano`) supports multi-instance training that can make full usage of hardwares with multiple CPU cores or sockets (especially when the number of cores is large). Here we provide __2__ ways to achieve this: A) subclass `TorchNano` or B) use `@nano` decorator. You can choose the appropriate one depending on your (preferred) code structure." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "## Prepare Environment for BigDL-Nano" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "At first, you need to install BigDL-Nano for PyTorch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "!pip install --pre --upgrade bigdl-nano[pytorch] # install the nightly-built version\n", + "!source bigdl-nano-init # set environment variables" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📝 **Note**\n", + ">\n", + "> Before starting your PyTorch application, it is highly recommended to run `source bigdl-nano-init` to set several environment variables based on your current hardware. Empirically, these variables will greatly improve performance for most PyTorch applications on training workloads." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "> ⚠️ **Warning**\n", + "> \n", + "> For Jupyter Notebook users, we recommend to run the commands above, especially `source bigdl-nano-init` before jupyter kernel is started, or some of the optimizations may not take effect." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "## Pre-define Model and Dataloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "In this guide, we take the fine-tuning of a [ResNet-18 model](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet18.html) on [OxfordIIITPet dataset](https://pytorch.org/vision/main/generated/torchvision.datasets.OxfordIIITPet.html) as an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Define model and dataloader\n", + "\n", + "from torch import nn\n", + "from torchvision.models import resnet18\n", + "\n", + "class MyPytorchModule(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.model = resnet18(pretrained=True)\n", + " num_ftrs = self.model.fc.in_features\n", + " # Here the size of each output sample is set to 37.\n", + " self.model.fc = nn.Linear(num_ftrs, 37)\n", + "\n", + " def forward(self, x):\n", + " return self.model(x)\n", + "\n", + "\n", + "import torch\n", + "from torchvision import transforms\n", + "from torchvision.datasets import OxfordIIITPet\n", + "from torch.utils.data.dataloader import DataLoader\n", + "\n", + "def create_train_dataloader():\n", + " train_transform = transforms.Compose([transforms.Resize(256),\n", + " transforms.RandomCrop(224),\n", + " transforms.RandomHorizontalFlip(),\n", + " transforms.ColorJitter(brightness=.5, hue=.3),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406],\n", + " [0.229, 0.224, 0.225])])\n", + "\n", + " # apply data augmentation to the train_dataset\n", + " train_dataset = OxfordIIITPet(root=\"/tmp/data\", transform=train_transform, download=True)\n", + "\n", + " # prepare data loader\n", + " train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)\n", + "\n", + " return train_dataloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A) Subclass `TorchNano`" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In general, two steps are required if you choose to subclass `TorchNano`:\n", + "\n", + "1) import and subclass `TorchNano`, and override its `train()` method\n", + "2) instantiate it with setting `num_processes` , then call the `train()` method\n", + "\n", + "For step 1, you can refer to [this page](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano.html) to achieve it (for consistency, we use the same model and dataset as an example). Supposing that you've already got a well-defined subclass `MyNano`, below line will instantiate it and train your model with 2 processes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from bigdl.nano.pytorch import TorchNano # import TorchNano\n", + "\n", + "# subclass TorchNano and override its train method\n", + "class MyNano(TorchNano):\n", + " def train(self):\n", + " # Move the code for your custom training loops inside the train method\n", + " model = MyPytorchModule()\n", + " optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)\n", + " loss_fuc = torch.nn.CrossEntropyLoss()\n", + " train_loader = create_train_dataloader()\n", + "\n", + " # call setup method to set up model, optimizer(s),\n", + " # and dataloader(s) for accelerated training\n", + " model, optimizer, train_loader = self.setup(model, optimizer, train_loader)\n", + " num_epochs = 5\n", + "\n", + " for epoch in range(num_epochs):\n", + "\n", + " model.train()\n", + " train_loss, num = 0, 0\n", + " with tqdm(train_loader, unit=\"batch\") as tepoch:\n", + " for data, target in tepoch:\n", + " tepoch.set_description(f\"Epoch {epoch}\")\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = loss_fuc(output, target)\n", + " # Replace loss.backward() with self.backward(loss)\n", + " self.backward(loss)\n", + " optimizer.step()\n", + " loss_value = loss.sum()\n", + " train_loss += loss_value\n", + " num += 1\n", + " tepoch.set_postfix(loss=loss_value)\n", + " print(f'Train Epoch: {epoch}, avg_loss: {train_loss / num}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MyNano(num_processes=2).train()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "      _The detailed definition of_ `MyNano` _can be found in the_ [runnable example](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_multi_instance.ipynb)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📝 **Note**\n", + ">\n", + "> By setting `num_processes`, CPU cores will be automatically and evenly distributed among specific number of processes, to avoid conflicts and maximize training throughput. If you would like to specify the CPU cores used by each process, You could set `cpu_for_each_process` to a list of length `num_processes`, in which each item is a list of CPU indices.\n", + "> \n", + "> Currently, `‘subprocess’` (default), `‘spawn’` and `‘ray’` are supported as `distributed_backend` for `TorchNano`.\n", + "> \n", + "> Also note that, when using data-parallel training, the batch size is equivalent to becoming `num_processes` times larger. The learning rate warm-up strategy that gradually increases the learning rate to `num_processes` times is a compensate to achieve the same effect as single instance training. Nano enables this strategy by default through `auto_lr=True`." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## B) Use `@nano` decorator" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`@nano` decorator is very friendly since you can only add 2 new lines (import it and wrap the training function) and enjoy the features brought by BigDL-Nano if you have already defined a PyTorch training function with a model, optimizers, and dataloaders as parameters. 
You can learn the usage and notes of it from [here](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training.html). The only difference when using multi-instance training is that you should specify the decorator as `@nano(num_processes=n)` with _n_ being the expected number of processes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from bigdl.nano.pytorch import nano # import nano decorator\n", + "\n", + "@nano(num_processes=2) # apply the decorator to the training loop\n", + "def training_loop(model, optimizer, train_loader, num_epochs, loss_func):\n", + "\n", + " for epoch in range(num_epochs):\n", + "\n", + " model.train()\n", + " train_loss, num = 0, 0\n", + " with tqdm(train_loader, unit=\"batch\") as tepoch:\n", + " for data, target in tepoch:\n", + " tepoch.set_description(f\"Epoch {epoch}\")\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = loss_func(output, target)\n", + " loss.backward()\n", + " optimizer.step()\n", + " loss_value = loss.sum()\n", + " train_loss += loss_value\n", + " num += 1\n", + " tepoch.set_postfix(loss=loss_value)\n", + " print(f'Train Epoch: {epoch}, avg_loss: {train_loss / num}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "model = MyPytorchModule()\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)\n", + "loss_func = torch.nn.CrossEntropyLoss()\n", + "train_loader = create_train_dataloader()\n", + "\n", + "training_loop(model, optimizer, train_loader, num_epochs=5, loss_func=loss_func)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "      _A runnable example including this_ `training_loop` _can be seen from_ [here](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/accelerate_pytorch_training_multi_instance.ipynb)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📝 **Note**\n", + "> \n", + "> By setting `num_processes`, CPU cores will be automatically and evenly distributed among specific number of processes, to avoid conflicts and maximize training throughput. If you would like to specify the CPU cores used by each process, You could set `cpu_for_each_process` to a list of length `num_processes`, in which each item is a list of CPU indices.\n", + "> \n", + "> Currently, `‘subprocess’` (default), and `‘ray’` are supported as `distributed_backend` for `@nano` decorator (`'spawn'` is not supported by `@nano`).\n", + "> \n", + "> Also note that, when using data-parallel training, the batch size is equivalent to becoming `num_processes` times larger. The learning rate warm-up strategy that gradually increases the learning rate to `num_processes` times is a compensate to achieve the same effect as single instance training. Nano enables this strategy by default through `auto_lr=True`." 
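+ "\n",
+ "As an illustration of the note above, pinning cores manually could look like the following sketch (the core indices here are hypothetical and assume a machine with at least 4 CPU cores):\n",
+ "\n",
+ "```python\n",
+ "# launch 2 processes and assign each of them 2 dedicated CPU cores\n",
+ "@nano(num_processes=2, cpu_for_each_process=[[0, 1], [2, 3]])\n",
+ "def training_loop(model, optimizer, train_loader, num_epochs, loss_func):\n",
+ "    ...  # same training loop body as shown above\n",
+ "```"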
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📚 **Related Readings**\n", + "> \n", + "> - [How to install BigDL-Nano](https://bigdl.readthedocs.io/en/latest/doc/Nano/Overview/nano.html#install)\n", + "> - [How to convert your PyTorch training loop to use TorchNano for acceleration](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano.html)\n", + "> - [How to accelerate your PyTorch training loop with \\@nano decorator](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training.html)\n", + "> - [How to choose the number of processes for multi-instance training](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/General/choose_num_processes_training.html)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "8772eaeb16382a2d9dbb95ffcb3882976733f8dc8a0780f3e0ca9a3a7dc812c0" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/nano/tutorial/notebook/training/pytorch/convert_pytorch_training_torchnano.ipynb b/python/nano/tutorial/notebook/training/pytorch/convert_pytorch_training_torchnano.ipynb index 6080f0b8b70..fc7d826e48c 100644 --- a/python/nano/tutorial/notebook/training/pytorch/convert_pytorch_training_torchnano.ipynb +++ b/python/nano/tutorial/notebook/training/pytorch/convert_pytorch_training_torchnano.ipynb @@ -268,7 +268,9 @@ "source": [ "> 📝 **Note**\n", ">\n", - "> Due to the optimized environment variables set by `source bigdl-nano-init`, you could already experience some training acceleration after converting your PyTorch code to use `TorchNano`." + "> Due to the optimized environment variables set by `source bigdl-nano-init`, you could already experience some training acceleration after converting your PyTorch code to use `TorchNano`.\n", + "> \n", + "> For more optimizations provided by `TorchNano`, you can refer to the Related Readings." 
] }, { @@ -278,24 +280,28 @@ "source": [ "> 📚 **Related Readings**\n", "> \n", - "> - [How to install BigDL-Nano](https://bigdl.readthedocs.io/en/latest/doc/Nano/Overview/nano.html#install)" + "> - [How to install BigDL-Nano](https://bigdl.readthedocs.io/en/latest/doc/Nano/Overview/nano.html#install)\n", + "> - [How to accelerate a PyTorch application on training workloads through Intel® Extension for PyTorch*](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_ipex.html)\n", + "> - [How to accelerate a PyTorch application on training workloads through multiple instances](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_multi_instance.html)\n", + "> - [How to use the channels last memory format in your PyTorch application for training](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/pytorch_training_channels_last.html)\n", + "> - [How to conduct BFloat16 Mixed Precision training in your PyTorch application](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_bf16.html)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.7.13 ('nano-pytorch': conda)", + "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "name": "python", - "version": "3.7.15 (default, Nov 24 2022, 21:12:53) \n[GCC 11.2.0]" + "version": "3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "09344c7f3239fd422839751f876786d6b1a624c40f19af1b43cb2737f421c2b2" + "hash": "8772eaeb16382a2d9dbb95ffcb3882976733f8dc8a0780f3e0ca9a3a7dc812c0" } } }, diff --git a/python/nano/tutorial/notebook/training/pytorch/pytorch_training_channels_last.ipynb b/python/nano/tutorial/notebook/training/pytorch/pytorch_training_channels_last.ipynb new file mode 100644 index 00000000000..f5dfe3d649b --- /dev/null +++ b/python/nano/tutorial/notebook/training/pytorch/pytorch_training_channels_last.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[View the runnable example on GitHub](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/pytorch_training_channels_last.ipynb)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Use Channels Last Memory Format in PyTorch Training" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`TorchNano` (`bigdl.nano.pytorch.TorchNano`) supports the channels last memory format to store models and tensors, i.e. NHW**C** (batch size, height, width, **channels**), in substitution for classic/contiguous N**C**HW order. Here we provide __2__ ways to achieve this: A) subclass `TorchNano` or B) use `@nano` decorator. You can choose the appropriate one depending on your (preferred) code structure." 
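+ "\n",
+ "For reference, this is the same memory format that plain PyTorch exposes through `torch.channels_last`; the toy snippet below (independent of BigDL-Nano) illustrates what the layout change means for a 4-dimensional tensor:\n",
+ "\n",
+ "```python\n",
+ "import torch\n",
+ "\n",
+ "x = torch.randn(32, 3, 224, 224)             # logical shape stays NCHW\n",
+ "x = x.to(memory_format=torch.channels_last)  # data is now stored in NHWC order\n",
+ "print(x.shape)                                # torch.Size([32, 3, 224, 224])\n",
+ "print(x.is_contiguous(memory_format=torch.channels_last))  # True\n",
+ "```"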
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "## Prepare Environment for BigDL-Nano" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "At first, you need to install BigDL-Nano for PyTorch:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "!pip install --pre --upgrade bigdl-nano[pytorch] # install the nightly-built version\n", + "!source bigdl-nano-init # set environment variables" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📝 **Note**\n", + ">\n", + "> Before starting your PyTorch application, it is highly recommended to run `source bigdl-nano-init` to set several environment variables based on your current hardware. Empirically, these variables will greatly improve performance for most PyTorch applications on training workloads." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "> ⚠️ **Warning**\n", + "> \n", + "> For Jupyter Notebook users, we recommend to run the commands above, especially `source bigdl-nano-init` before jupyter kernel is started, or some of the optimizations may not take effect." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "## Pre-define Model and Dataloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "nbsphinx": "hidden" + }, + "source": [ + "In this guide, we take the fine-tuning of a [ResNet-18 model](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet18.html) on [OxfordIIITPet dataset](https://pytorch.org/vision/main/generated/torchvision.datasets.OxfordIIITPet.html) as an example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# Define model and dataloader\n", + "\n", + "from torch import nn\n", + "from torchvision.models import resnet18\n", + "\n", + "class MyPytorchModule(nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.model = resnet18(pretrained=True)\n", + " num_ftrs = self.model.fc.in_features\n", + " # Here the size of each output sample is set to 37.\n", + " self.model.fc = nn.Linear(num_ftrs, 37)\n", + "\n", + " def forward(self, x):\n", + " return self.model(x)\n", + "\n", + "\n", + "import torch\n", + "from torchvision import transforms\n", + "from torchvision.datasets import OxfordIIITPet\n", + "from torch.utils.data.dataloader import DataLoader\n", + "\n", + "def create_train_dataloader():\n", + " train_transform = transforms.Compose([transforms.Resize(256),\n", + " transforms.RandomCrop(224),\n", + " transforms.RandomHorizontalFlip(),\n", + " transforms.ColorJitter(brightness=.5, hue=.3),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize([0.485, 0.456, 0.406],\n", + " [0.229, 0.224, 0.225])])\n", + "\n", + " # apply data augmentation to the train_dataset\n", + " train_dataset = OxfordIIITPet(root=\"/tmp/data\", transform=train_transform, download=True)\n", + "\n", + " # prepare data loader\n", + " train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)\n", + "\n", + " return train_dataloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A) Subclass `TorchNano`" + ] + }, + { 
+ "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In general, two steps are required if you choose to subclass `TorchNano`:\n", + "\n", + "1) import and subclass `TorchNano`, and override its `train()` method\n", + "2) instantiate it with setting `channels_last=True`, then call the `train()` method\n", + "\n", + "For step 1, you can refer to [this page](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano.html) to achieve it (for consistency, we use the same model and dataset as an example). Supposing that you've already got a well-defined subclass `MyNano`, below line will instantiate it with enabling channels last memory format, and call its `train()` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "from bigdl.nano.pytorch import TorchNano # import TorchNano\n", + "\n", + "# subclass TorchNano and override its train method\n", + "class MyNano(TorchNano):\n", + " def train(self):\n", + " # Move the code for your custom training loops inside the train method\n", + " model = MyPytorchModule()\n", + " optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)\n", + " loss_fuc = torch.nn.CrossEntropyLoss()\n", + " train_loader = create_train_dataloader()\n", + "\n", + " # call setup method to set up model, optimizer(s),\n", + " # and dataloader(s) for accelerated training\n", + " model, optimizer, train_loader = self.setup(model, optimizer, train_loader)\n", + " num_epochs = 5\n", + "\n", + " for epoch in range(num_epochs):\n", + "\n", + " model.train()\n", + " train_loss, num = 0, 0\n", + " with tqdm(train_loader, unit=\"batch\") as tepoch:\n", + " for data, target in tepoch:\n", + " tepoch.set_description(f\"Epoch {epoch}\")\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = loss_fuc(output, target)\n", + " # Replace loss.backward() with self.backward(loss)\n", + " self.backward(loss)\n", + " optimizer.step()\n", + " loss_value = loss.sum()\n", + " train_loss += loss_value\n", + " num += 1\n", + " tepoch.set_postfix(loss=loss_value)\n", + " print(f'Train Epoch: {epoch}, avg_loss: {train_loss / num}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MyNano(channels_last=True).train()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "      _The detailed definition of_ `MyNano` _can be found in the_ [runnable example](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/pytorch_training_channels_last.ipynb)." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> 📝 **Note**\n", + ">\n", + "> Channels last memory format (NHWC) is currently only implemented as an alternative to 4-dimensional NCHW tensors.\n", + "> \n", + "> To use the channels last memory format, there is no need to modify your `torch.nn.Module` and dataloaders, the only change to make is setting `channels_last=True`." 
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## B) Use `@nano` decorator"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `@nano` decorator is very convenient: if you have already defined a PyTorch training function with a model, optimizers, and dataloaders as parameters, you only need to add 2 new lines (import the decorator and wrap the training function) to enjoy the features brought by BigDL-Nano. You can learn its usage and notes from [here](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training.html). The only difference when using the channels last memory format is that you should specify the decorator as `@nano(channels_last=True)`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from tqdm import tqdm\n",
+ "from bigdl.nano.pytorch import nano # import nano decorator\n",
+ "\n",
+ "@nano(channels_last=True) # apply the decorator to the training loop\n",
+ "def training_loop(model, optimizer, train_loader, num_epochs, loss_func):\n",
+ "\n",
+ "    for epoch in range(num_epochs):\n",
+ "\n",
+ "        model.train()\n",
+ "        train_loss, num = 0, 0\n",
+ "        with tqdm(train_loader, unit=\"batch\") as tepoch:\n",
+ "            for data, target in tepoch:\n",
+ "                tepoch.set_description(f\"Epoch {epoch}\")\n",
+ "                optimizer.zero_grad()\n",
+ "                output = model(data)\n",
+ "                loss = loss_func(output, target)\n",
+ "                loss.backward()\n",
+ "                optimizer.step()\n",
+ "                loss_value = loss.sum()\n",
+ "                train_loss += loss_value\n",
+ "                num += 1\n",
+ "                tepoch.set_postfix(loss=loss_value)\n",
+ "        print(f'Train Epoch: {epoch}, avg_loss: {train_loss / num}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "nbsphinx": "hidden"
+ },
+ "outputs": [],
+ "source": [
+ "model = MyPytorchModule()\n",
+ "optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)\n",
+ "loss_func = torch.nn.CrossEntropyLoss()\n",
+ "train_loader = create_train_dataloader()\n",
+ "\n",
+ "training_loop(model, optimizer, train_loader, num_epochs=5, loss_func=loss_func)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "      _A runnable example including this_ `training_loop` _can be found_ [here](https://github.com/intel-analytics/BigDL/tree/main/python/nano/tutorial/notebook/training/pytorch/pytorch_training_channels_last.ipynb)."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> 📝 **Note**\n",
+ ">\n",
+ "> Channels last memory format (NHWC) is currently only implemented as an alternative to 4-dimensional NCHW tensors.\n",
+ "> \n",
+ "> To use the channels last memory format, there is no need to modify your `torch.nn.Module` or dataloaders; the only change needed is setting `channels_last=True`."
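+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you would like to check whether a 4-D tensor (e.g. an image batch) is laid out in the channels last memory format, you could use PyTorch's memory-format query; the snippet below is only an optional, illustrative check:\n",
+ "\n",
+ "```python\n",
+ "# returns True if the tensor is contiguous in NHWC (channels last) order\n",
+ "data.is_contiguous(memory_format=torch.channels_last)\n",
+ "```"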
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> 📚 **Related Readings**\n",
+ "> \n",
+ "> - [How to install BigDL-Nano](https://bigdl.readthedocs.io/en/latest/doc/Nano/Overview/nano.html#install)\n",
+ "> - [How to convert your PyTorch training loop to use TorchNano for acceleration](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/convert_pytorch_training_torchnano.html)\n",
+ "> - [How to accelerate your PyTorch training loop with \\@nano decorator](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/use_nano_decorator_pytorch_training.html)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "8772eaeb16382a2d9dbb95ffcb3882976733f8dc8a0780f3e0ca9a3a7dc812c0"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/nano/tutorial/notebook/training/pytorch/use_nano_decorator_pytorch_training.ipynb b/python/nano/tutorial/notebook/training/pytorch/use_nano_decorator_pytorch_training.ipynb
index a71d5285a47..736f1383ffc 100644
--- a/python/nano/tutorial/notebook/training/pytorch/use_nano_decorator_pytorch_training.ipynb
+++ b/python/nano/tutorial/notebook/training/pytorch/use_nano_decorator_pytorch_training.ipynb
@@ -194,7 +194,9 @@
 "source": [
 "> 📝 **Note**\n",
 ">\n",
- "> Due to the optimized environment variables set by `source bigdl-nano-init`, you could already experience some training acceleration after wrapping your custom training loop with `@nano` decorator."
+ "> Due to the optimized environment variables set by `source bigdl-nano-init`, you could already experience some training acceleration after wrapping your custom training loop with `@nano` decorator.\n",
+ "> \n",
+ "> For more optimizations provided by `@nano` decorator, you can refer to the Related Readings."
 ]
 },
 {
@@ -204,24 +206,28 @@
 "source": [
 "> 📚 **Related Readings**\n",
 "> \n",
- "> - [How to install BigDL-Nano](https://bigdl.readthedocs.io/en/latest/doc/Nano/Overview/nano.html#install)"
+ "> - [How to install BigDL-Nano](https://bigdl.readthedocs.io/en/latest/doc/Nano/Overview/nano.html#install)\n",
+ "> - [How to accelerate a PyTorch application on training workloads through Intel® Extension for PyTorch*](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_ipex.html)\n",
+ "> - [How to accelerate a PyTorch application on training workloads through multiple instances](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_multi_instance.html)\n",
+ "> - [How to use the channels last memory format in your PyTorch application for training](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/pytorch_training_channels_last.html)\n",
+ "> - [How to conduct BFloat16 Mixed Precision training in your PyTorch application](https://bigdl.readthedocs.io/en/latest/doc/Nano/Howto/Training/PyTorch/accelerate_pytorch_training_bf16.html)"
 ]
 }
 ],
 "metadata": {
 "kernelspec": {
- "display_name": "nano-pytorch",
+ "display_name": "base",
 "language": "python",
 "name": "python3"
 },
 "language_info": {
 "name": "python",
- "version": "3.7.15 (default, Nov 24 2022, 21:12:53) \n[GCC 11.2.0]"
+ "version": "3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]"
 },
 "orig_nbformat": 4,
 "vscode": {
 "interpreter": {
- "hash": "09344c7f3239fd422839751f876786d6b1a624c40f19af1b43cb2737f421c2b2"
+ "hash": "8772eaeb16382a2d9dbb95ffcb3882976733f8dc8a0780f3e0ca9a3a7dc812c0"
 }
 }
 },