Re-enable GPT-J unit tests and refactor inference tests #3618

Merged 36 commits into master from mrwyattii/fix-broken-gptj-tests on Jun 28, 2023

Commits (36; the diff below shows changes from 27 of them)
148af32
gpt-j model was renamed on HF
mrwyattii May 26, 2023
51fa3e0
Merge branch 'master' into mrwyattii/fix-broken-gptj-tests
loadams Jun 7, 2023
7d3bcbf
Merge branch 'master' into mrwyattii/fix-broken-gptj-tests
loadams Jun 12, 2023
e5a755c
Bad merge in GitHub GUI
loadams Jun 12, 2023
df1859d
zero++ tutorial PR (#3783)
HeyangQin Jun 21, 2023
d81a6ad
[Fix] _conv_flops_compute when padding is a str and stride=1 (#3169)
zhiruiluo Jun 21, 2023
a8c182a
fix interpolate flops compute (#3782)
cli99 Jun 22, 2023
c4c442f
use `Flops Profiler` to test `model.generate()` (#2515)
CaffreyR Jun 22, 2023
7c6e3ab
Merge branch 'master' into mrwyattii/fix-broken-gptj-tests
mrwyattii Jun 22, 2023
fc9e1ee
revert PR #3611 (#3786)
jeffra Jun 22, 2023
40045dc
bump to 0.9.6
jeffra Jun 22, 2023
49a0a1b
ZeRO++ chinese blog (#3793)
HeyangQin Jun 23, 2023
2c62cb4
remove staging trigger (#3792)
jeffra Jun 23, 2023
4dc65f7
DeepSpeed-Triton for Inference (#3748)
stephen-youn Jun 23, 2023
e1119d8
ZeRO++ (#3784)
HeyangQin Jun 23, 2023
01b843a
adding zero++ to navigation panel of deepspeed.ai (#3796)
HeyangQin Jun 23, 2023
319b64e
Add ZeRO++ Japanese blog (#3797)
tohtana Jun 23, 2023
b4a2c0a
Bug Fixes for autotuner and flops profiler (#1880)
cli99 Jun 23, 2023
b7e1010
Missing strided copy for gated MLP (#3788)
cmikeh2 Jun 23, 2023
e5b1ead
Requires grad checking. (#3789)
jomayeri Jun 23, 2023
9c756cf
bump to 0.10.0
jeffra Jun 23, 2023
babd883
update how we generate inference model/task combinations to reduce nu…
mrwyattii Jun 23, 2023
8163d8c
consolidate fixtures so they can be reused
mrwyattii Jun 23, 2023
8011778
Merge branch 'master' into mrwyattii/fix-broken-gptj-tests
mrwyattii Jun 23, 2023
ba44a08
resolve changes from master merge
mrwyattii Jun 23, 2023
a204edc
Fix Bug in transform.cu (#3534)
rraminen Jun 23, 2023
bba09ac
Merge branch 'master' into mrwyattii/fix-broken-gptj-tests
loadams Jun 23, 2023
281e150
Merge branch 'master' into mrwyattii/fix-broken-gptj-tests
mrwyattii Jun 26, 2023
81af369
Update test_inference.py
mrwyattii Jun 26, 2023
246d41c
formatting
mrwyattii Jun 26, 2023
939cd97
Merge branch 'master' into mrwyattii/fix-broken-gptj-tests
mrwyattii Jun 27, 2023
acde8ca
fix injection policy test
mrwyattii Jun 27, 2023
c7a4cc8
revert moving fixtures to separate file
mrwyattii Jun 27, 2023
a8db16b
remove init
mrwyattii Jun 27, 2023
bf2e650
change profiling model to one that supports CUDA Graph
mrwyattii Jun 28, 2023
56d970b
Merge branch 'master' into mrwyattii/fix-broken-gptj-tests
mrwyattii Jun 28, 2023
7 changes: 7 additions & 0 deletions .flake8
@@ -0,0 +1,7 @@
[flake8]
ignore = E,F403,F405,F541,F841,W
select = E9,F,W6
per-file-ignores =
    __init__.py:F401
    tests/unit/inference/test_inference.py:F811
    tests/unit/inference/test_model_profiling.py:F811
6 changes: 0 additions & 6 deletions .github/workflows/amd-mi100.yml
@@ -1,12 +1,6 @@
 name: amd-mi100

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/amd-mi200.yml
@@ -1,12 +1,6 @@
 name: amd-mi200

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/cpu-inference.yml
@@ -1,12 +1,6 @@
 name: cpu-inference

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
5 changes: 1 addition & 4 deletions .github/workflows/formatting.yml
@@ -1,9 +1,6 @@
 name: Formatting

 on:
-  push:
-    branches:
-      - 'staging**'
   pull_request:
     branches:
       '**'
@@ -30,7 +27,7 @@ jobs:

       - name: Install deepspeed
         run: |
-          pip install .[dev,autotuning]
+          pip install .[dev,autotuning,triton]
           ds_report

       - name: Formatting checks
6 changes: 0 additions & 6 deletions .github/workflows/nv-accelerate-v100.yml
@@ -1,12 +1,6 @@
 name: nv-accelerate-v100

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-inference.yml
@@ -1,12 +1,6 @@
 name: nv-inference

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-lightning-v100.yml
@@ -1,12 +1,6 @@
 name: nv-lightning-v100

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-megatron.yml
@@ -1,12 +1,6 @@
 name: nv-megatron

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-mii.yml
@@ -1,12 +1,6 @@
 name: nv-mii

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-pre-compile-ops.yml
@@ -1,12 +1,6 @@
 name: nv-pre-compile-ops

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     branches:
       '**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-torch-latest-cpu.yml
@@ -1,12 +1,6 @@
 name: nv-torch-latest-cpu

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-torch-latest-v100.yml
@@ -1,12 +1,6 @@
 name: nv-torch-latest-v100

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-torch19-p40.yml
@@ -1,12 +1,6 @@
 name: nv-torch19-p40

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-torch19-v100.yml
@@ -1,12 +1,6 @@
 name: nv-torch19-v100

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/nv-transformers-v100.yml
@@ -1,12 +1,6 @@
 name: nv-transformers-v100

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
6 changes: 0 additions & 6 deletions .github/workflows/python.yml
@@ -1,12 +1,6 @@
 name: python

 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     branches:
       '**'
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -67,7 +67,7 @@ repos:
   rev: 4.0.1
   hooks:
   - id: flake8
-    args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401']
+    args: ['--config=.flake8']

 - repo: local
   hooks:
7 changes: 2 additions & 5 deletions README.md
@@ -15,14 +15,11 @@
 ## Latest News
 <b> <span style="color:orange" > DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat)</span>.</b>

-* ***[2023/04] 🚀 [DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat)*** [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/japanese/README.md)]🚀
+* [2023/06] [ZeRO++: A leap in speed for LLM and chat model training with 4X less communication](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) [[English](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md)]
+* [2023/04] 🚀 [DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/japanese/README.md)]🚀
 * [2023/03] [Scaling Large-Scale Generative Mixture-of-Expert Multimodal Model With VL-MoE](https://www.deepspeed.ai/2023/03/30/multi-modal.html)
 * [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/)
 * [2022/12] [DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality](https://www.deepspeed.ai/2022/12/11/data-efficiency.html)
-* [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/benchmark/txt2img)
-* [2022/10] [DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference](https://www.deepspeed.ai/2022/10/10/mii.html)
-* [2022/09] [ZeRO-Inference: Democratizing massive model inference](https://www.deepspeed.ai/2022/09/09/zero-inference.html)
-* [2022/07] [Azure and DeepSpeed empower easy-to-use and high-performance model training](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/)

 ---
Binary file added blogs/assets/images/triton-bert-base-latency.png
Binary file added blogs/assets/images/triton-bert-large-latency.png
95 changes: 95 additions & 0 deletions blogs/deepspeed-triton/README.md
@@ -0,0 +1,95 @@
# DeepSpeed with Triton compiler

# 1. Overview

We have integrated [Triton](https://github.com/openai/triton), an open source compiler for GPU programming, into DeepSpeed, which further boosts the inference speed of BERT-like models in float16 precision.
By replacing some CUDA kernels or torch operators with Triton kernels, we achieved 1.14\~1.68x speedup (or 12\~41% latency reduction) for different models and GPUs, as shown in Table 1.

<div align="center">

| Hardware | Bert-base | Bert-large | Roberta-base | Roberta-large |
|----------|:------:|:------:|:------:|:------:|
| A100 |1.65x | 1.68x | 1.53x | 1.61x |
| V100 | 1.29x | 1.14x | 1.23x | 1.21x |

Table 1. The average speedup (see NOTE below for more detail)


</div>

For transformer operators in float16, we have implemented kernels written in the Triton language that replace ordinary CUDA kernels or torch operators.
The Triton kernels we implemented include softmax, layer-normalization, residual-addition and all the matrix multiplications except MLP layers (see NOTE below for details).
In our experiments, Triton kernels help reduce the average latency (over different sequence lengths) by 6\~24% (depending on model and hardware) when compared to the latency with CUDA-only kernels.
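
To make the kernel-replacement idea more concrete, below is a minimal sketch of a row-wise softmax kernel written in the Triton language, along the lines of the standard Triton tutorials. It is illustrative only and is not DeepSpeed's actual kernel; the wrapper name `triton_softmax` and the single-block-per-row layout are assumptions.

```python
# Minimal, illustrative row-wise softmax in Triton (not the DeepSpeed kernel).
import torch
import triton
import triton.language as tl

@triton.jit
def softmax_kernel(out_ptr, in_ptr, n_cols, BLOCK_SIZE: tl.constexpr):
    row = tl.program_id(0)                       # one program instance handles one row
    cols = tl.arange(0, BLOCK_SIZE)
    mask = cols < n_cols
    x = tl.load(in_ptr + row * n_cols + cols, mask=mask, other=-float('inf'))
    x = x - tl.max(x, axis=0)                    # subtract the row max for numerical stability
    num = tl.exp(x)
    denom = tl.sum(num, axis=0)
    tl.store(out_ptr + row * n_cols + cols, num / denom, mask=mask)

def triton_softmax(x: torch.Tensor) -> torch.Tensor:
    x = x.contiguous()                           # kernel assumes a dense row-major layout
    n_rows, n_cols = x.shape
    out = torch.empty_like(x)
    BLOCK_SIZE = triton.next_power_of_2(n_cols)  # the whole row fits in one block
    softmax_kernel[(n_rows,)](out, x, n_cols, BLOCK_SIZE=BLOCK_SIZE)
    return out
```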


The figures below show the latency reduction in more detail.
Figure 1 visualizes the latency reduction across different sequence lengths on an A100 GPU for the Bert-base model.
The baseline (blue) is from Huggingface transformers without any kernel injection, the orange is from DeepSpeed with CUDA-only kernels, and the gray is from DeepSpeed with Triton kernels.
Figure 2 shows the same plot for the Bert-large model on an A100 GPU.

<div align="center">

<img src="../assets/images/triton-bert-base-latency.png" width="500px" alt="triton-bert-base-latency"/>

*Figure 1: Normalized P90 latency for Bert-base model in A100 GPU across different sequence lengths*

<img src="../assets/images/triton-bert-large-latency.png" width="500px" alt="triton-bert-large-latency"/>

*Figure 2: Normalized P90 latency for Bert-large model in A100 GPU across different sequence lengths*

</div>


Next, we dive deeper into this new feature in DeepSpeed.

# 2. How to use Triton in DeepSpeed

You can enable the Triton compiler to optimize these kernels by setting a flag in the DeepSpeed inference config, as in the example below.

```python
import torch
import deepspeed
from transformers import pipeline

pipe = pipeline('fill-mask', model='bert-base-cased', framework='pt', device=0)
# Inject DeepSpeed inference kernels, with Triton and CUDA graph enabled.
pipe.model = deepspeed.init_inference(pipe.model,
                                      dtype=torch.float16,
                                      replace_with_kernel_inject=True,
                                      enable_cuda_graph=True,
                                      use_triton=True,
                                      triton_autotune=True,
                                      max_out_tokens=pipe.tokenizer.model_max_length)
```
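
Once the model has been wrapped by `deepspeed.init_inference`, the pipeline is used exactly as before; for example (an illustrative query, not from the original post):

```python
# The Triton-enabled kernels are used transparently by the injected model.
print(pipe("The Eiffel Tower is located in [MASK]."))
```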


## Running BERT inference with Triton kernels

We use Bert-base as an example here.

```bash
pip install deepspeed[triton]

git clone https://github.com/microsoft/DeepSpeedExamples.git
cd DeepSpeedExamples/inference/huggingface/fill-mask

deepspeed --num_gpus 1 test-bert.py --triton
```
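
For reference, a script driven this way typically looks like the simplified sketch below; it is a hypothetical stand-in and may differ from the actual `test-bert.py` in DeepSpeedExamples (the prompt and argument handling are assumptions).

```python
# Hypothetical, simplified fill-mask script; not the actual DeepSpeedExamples test-bert.py.
import argparse
import torch
import deepspeed
from transformers import pipeline

parser = argparse.ArgumentParser()
parser.add_argument("--triton", action="store_true", help="use Triton kernels")
parser.add_argument("--local_rank", type=int, default=0)  # set by the deepspeed launcher
args = parser.parse_args()

pipe = pipeline("fill-mask", model="bert-base-cased", framework="pt", device=args.local_rank)
pipe.model = deepspeed.init_inference(pipe.model,
                                      dtype=torch.float16,
                                      replace_with_kernel_inject=True,
                                      use_triton=args.triton)
print(pipe("Paris is the [MASK] of France."))
```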

To run a performance benchmark, you can use the following command:

```bash
pip install deepspeed[triton]

git clone https://github.com/microsoft/DeepSpeedExamples.git
cd DeepSpeedExamples/benchmarks/inference

deepspeed --num_gpus 1 triton-bert-benchmark.py --model bert-base-cased --dtype fp16 --kernel-inject --deepspeed --graphs --triton
```

# NOTE
<!-- **_NOTE:_** -->
* For more information on how to use DeepSpeed, please visit our [GitHub Page](https://github.com/microsoft/DeepSpeedExamples) and our [website](https://www.deepspeed.ai/), where you can find blog posts, tutorials, and documentation.

* This feature is currently supported only for BERT, RoBERTa, and other BERT-like models; text-generation models are not yet supported.

* To achieve the best performance with the Triton optimization, you need to enable CUDA graph and `triton_autotune` in the DeepSpeed config. CUDA graph avoids the overhead of Triton's JIT compilation and deep call stack, while `triton_autotune` runs an initial step to find the most suitable parameters for the Triton kernels, which may take some time.

* We used [Triton 2.0.0.post1 release](https://pypi.org/project/triton/2.0.0.post1/) in our experiments.

* In our experiments, we used a batch size of 1, a sequence length range of 8 to 512, and a ‘fill-mask’ task. Table 1 shows the average P90 latency over the entire sequence length range, while Figures 1 and 2 show the P90 latency for specific sub-ranges. The baseline is the Huggingface transformers without any optimization. The speedup is calculated as (baseline P90 latency)/(DeepSpeed-Triton P90 Latency). We found that the CUDA kernel in MLP performed better than the Triton kernel in our experiments, so we used a hybrid approach that combines both kernels when Triton is enabled in the DeepSpeed config.
Binary file added blogs/zeropp/assets/images/eval1.png
Binary file added blogs/zeropp/assets/images/eval2.png
Binary file added blogs/zeropp/assets/images/eval3.png
Binary file added blogs/zeropp/assets/images/hpz.png
Binary file added blogs/zeropp/assets/images/overview.png
Binary file added blogs/zeropp/assets/images/qgz.gif
Binary file added blogs/zeropp/assets/images/qwz.png
Binary file added blogs/zeropp/assets/images/rlhf-eval.png
Binary file added blogs/zeropp/assets/images/zero-overview.gif