
Commit

Improve pre-commit hooks (microsoft#1602)
Co-authored-by: Jeff Rasley <[email protected]>
aphedges and jeffra authored Dec 1, 2021
1 parent 8159c1b commit fc2f378
Showing 37 changed files with 102 additions and 103 deletions.
29 changes: 18 additions & 11 deletions .pre-commit-config.yaml
@@ -1,27 +1,34 @@

repos:
- repo: meta
hooks:
- id: check-hooks-apply
- id: check-useless-excludes

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v1.2.3
rev: v4.0.1
hooks:
- id: trailing-whitespace
exclude: "DeepSpeedExamples/"
- id: check-case-conflict
- id: check-json
- id: check-symlinks
- id: check-yaml
exclude: "DeepSpeedExamples/"
- id: destroyed-symlinks
- id: end-of-file-fixer
exclude: "DeepSpeedExamples/"
exclude: "docs/CNAME"
exclude: docs/CNAME
- id: fix-byte-order-marker
- id: fix-encoding-pragma
args: [--remove]
- id: mixed-line-ending
exclude: "DeepSpeedExamples/"
args: [--fix=lf]
- id: requirements-txt-fixer
- id: trailing-whitespace

- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.29.0
rev: v0.31.0
hooks:
- id: yapf
exclude: "examples/"

- repo: https://gitlab.com/daverona/pre-commit-cpp
rev: 0.6.0
rev: 0.8.0
hooks:
- id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
args: []
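
A quick way to sanity-check the updated hook list is to parse the config and print what each repo provides — a minimal sketch, assuming PyYAML is installed and the script runs from the repository root:

```python
# Sketch: list the repos and hook ids declared in the updated
# .pre-commit-config.yaml. Assumes PyYAML is available (pip install pyyaml)
# and the working directory is the repository root.
import yaml

with open(".pre-commit-config.yaml") as f:
    config = yaml.safe_load(f)

for repo in config["repos"]:
    hook_ids = ", ".join(hook["id"] for hook in repo["hooks"])
    print(f"{repo['repo']}: {hook_ids}")
```

After editing the config, `pre-commit run --all-files` applies every hook to the whole tree, which is presumably how the whitespace and yapf fixes in the remaining files of this commit were generated.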
2 changes: 1 addition & 1 deletion deepspeed/autotuning/config_templates/template_zero3.json
@@ -12,6 +12,6 @@
"stage3_prefetch_bucket_size": 5e8,
"stage3_param_persistence_threshold": 1e6,
"stage3_gather_fp16_weights_on_model_save": false,
"sub_group_size": 1e12,
"sub_group_size": 1e12
}
}
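
The only change here removes a trailing comma after the last key, which the newly added `check-json` hook would flag: strict JSON forbids trailing commas. A minimal illustration of that check:

```python
# Sketch: strict JSON parsing rejects the trailing comma that the old template
# contained, while the fixed line parses cleanly.
import json

json.loads('{"sub_group_size": 1e12}')        # new form: parses fine
try:
    json.loads('{"sub_group_size": 1e12,}')   # old form: trailing comma
except json.JSONDecodeError as err:
    print(f"invalid JSON: {err}")
```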
12 changes: 6 additions & 6 deletions deepspeed/module_inject/replace_module.py
@@ -450,12 +450,12 @@ def replace_fn(child, _policy, layer_id=0):
else:
# copy relevant state from child -> new module
if replace_with_kernel_inject:
new_module = replace_with_policy(
child,
_policy,
inference=True,
preln=(_policy is not HFBertLayerPolicy),
layer_id=layer_id)
new_module = replace_with_policy(child,
_policy,
inference=True,
preln=(_policy
is not HFBertLayerPolicy),
layer_id=layer_id)
else:
new_module = replace_wo_policy(child, _policy)

10 changes: 4 additions & 6 deletions deepspeed/moe/utils.py
@@ -9,9 +9,8 @@ def is_moe_param(param: torch.Tensor) -> bool:


def split_params_into_shared_and_expert_params(
params: List[torch.nn.Parameter]
) -> Tuple[torch.nn.Parameter,
torch.nn.Parameter]:
params: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter,
torch.nn.Parameter]:
shared_params, expert_params = [], []
for p in params:
if is_moe_param(p):
@@ -22,9 +21,8 @@ def split_params_into_shared_and_expert_params(


def split_params_grads_into_shared_and_expert_params(
group: List[torch.nn.Parameter]
) -> Tuple[torch.nn.Parameter,
torch.nn.Parameter]:
group: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter,
torch.nn.Parameter]:
"""Split grad of parameters into grads of non-expert params
and grads of expert params. This is useful while computing
grad-norms for clipping and overflow detection
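
The two helpers above changed only in formatting, but what they do — partition a parameter list into shared and expert parameters so their gradients can be reduced and norm-checked separately — is easy to illustrate. The sketch below uses its own `expert` attribute purely as a stand-in marker; it is not DeepSpeed's `is_moe_param` logic:

```python
# Illustration of the shared-vs-expert split performed by
# split_params_into_shared_and_expert_params. The `expert` attribute is a
# hypothetical marker for this sketch, not DeepSpeed's MoE tagging scheme.
from typing import List, Tuple
import torch

def split_shared_and_expert(
    params: List[torch.nn.Parameter]
) -> Tuple[List[torch.nn.Parameter], List[torch.nn.Parameter]]:
    shared, expert = [], []
    for p in params:
        (expert if getattr(p, "expert", False) else shared).append(p)
    return shared, expert

w_shared = torch.nn.Parameter(torch.randn(4, 4))
w_expert = torch.nn.Parameter(torch.randn(4, 4))
w_expert.expert = True  # pretend this weight belongs to an expert layer
shared, expert = split_shared_and_expert([w_shared, w_expert])
print(len(shared), len(expert))  # 1 1
```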
1 change: 1 addition & 0 deletions deepspeed/ops/adam/fused_adam.py
@@ -8,6 +8,7 @@
import torch
import importlib
from .multi_tensor_apply import MultiTensorApply

multi_tensor_applier = MultiTensorApply(2048 * 32)
from ..op_builder import FusedAdamBuilder

12 changes: 6 additions & 6 deletions deepspeed/ops/sparse_attention/sparse_self_attention.py
@@ -18,12 +18,12 @@ class SparseSelfAttention(nn.Module):
For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial.
"""
def __init__(
self,
# SparsityConfig parameters needs to be set accordingly
sparsity_config=SparsityConfig(num_heads=4),
key_padding_mask_mode='add',
attn_mask_mode='mul',
max_seq_length=2048):
self,
# SparsityConfig parameters needs to be set accordingly
sparsity_config=SparsityConfig(num_heads=4),
key_padding_mask_mode='add',
attn_mask_mode='mul',
max_seq_length=2048):
"""Initialize the sparse self attention layer.
Arguments:
sparsity_config: optional: this parameter determines sparsity pattern configuration; it is based on SparsityConfig class.
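
The reflowed constructor shows the layer's main knobs. A construction-only sketch, assuming a DeepSpeed build with sparse-attention support (Triton and a CUDA toolchain); treat the import path as something to verify against your installed version:

```python
# Sketch: constructing SparseSelfAttention with the defaults shown in the diff.
# The import path and runtime requirements (Triton/CUDA) are assumptions to
# check against the installed DeepSpeed version.
from deepspeed.ops.sparse_attention import SparseSelfAttention, SparsityConfig

attn = SparseSelfAttention(
    sparsity_config=SparsityConfig(num_heads=4),  # sparsity pattern configuration
    key_padding_mask_mode='add',                  # how the key padding mask is applied
    attn_mask_mode='mul',                         # how the attention mask is applied
    max_seq_length=2048)                          # upper bound on supported sequence length
```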
37 changes: 18 additions & 19 deletions deepspeed/runtime/config.py
@@ -284,32 +284,31 @@ def get_quantize_training(param_dict):
if QUANTIZE_SCHEDULE in param_dict[QUANTIZE_TRAINING].keys() and
SCHEDULE_OFFSET in param_dict[QUANTIZE_TRAINING][QUANTIZE_SCHEDULE].keys()
else QUANTIZE_OFFSET_DEFAULT),
(param_dict[QUANTIZE_TRAINING][QUANTIZE_GROUPS]
if QUANTIZE_GROUPS in param_dict[QUANTIZE_TRAINING].keys() else
QUANTIZE_GROUPS_DEFAULT),
(param_dict[QUANTIZE_TRAINING][QUANTIZE_GROUPS] if QUANTIZE_GROUPS
in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZE_GROUPS_DEFAULT),
(param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE]
[FP16_MIXED_QUANTIZE_ENABLED]
if FP16_MIXED_QUANTIZE in param_dict[QUANTIZE_TRAINING].keys()
and FP16_MIXED_QUANTIZE_ENABLED in param_dict[QUANTIZE_TRAINING]
[FP16_MIXED_QUANTIZE].keys() else FP16_MIXED_QUANTIZE_ENABLED_DEFAULT),
and FP16_MIXED_QUANTIZE_ENABLED
in param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE].keys() else
FP16_MIXED_QUANTIZE_ENABLED_DEFAULT),
(param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE][QUANTIZE_CHANGE_RATIO]
if FP16_MIXED_QUANTIZE in param_dict[QUANTIZE_TRAINING].keys()
and QUANTIZE_CHANGE_RATIO in param_dict[QUANTIZE_TRAINING]
[FP16_MIXED_QUANTIZE].keys() else QUANTIZE_CHANGE_RATIO_DEFAULT),
and QUANTIZE_CHANGE_RATIO
in param_dict[QUANTIZE_TRAINING][FP16_MIXED_QUANTIZE].keys() else
QUANTIZE_CHANGE_RATIO_DEFAULT),
(1 if QUANTIZE_ALGO in param_dict[QUANTIZE_TRAINING]
and QUANTIZE_TYPE in param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO].keys()
and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_TYPE] ==
QUANTIZE_ASYMMETRIC else QUANTIZE_TYPE_DEFAULT),
(1 if QUANTIZE_ALGO in param_dict[QUANTIZE_TRAINING] and
QUANTIZE_ROUNDING in param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO].keys()
and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_ROUNDING] ==
STOCHASTIC_ROUNDING else QUANTIZE_ROUNDING_DEFAULT),
(param_dict[QUANTIZE_TRAINING][QUANTIZE_VERBOSE]
if QUANTIZE_VERBOSE in param_dict[QUANTIZE_TRAINING].keys() else
QUANTIZE_VERBOSE_DEFAULT),
(param_dict[QUANTIZE_TRAINING][QUANTIZER_KERNEL]
if QUANTIZER_KERNEL in param_dict[QUANTIZE_TRAINING].keys() else
QUANTIZER_KERNEL_DEFAULT),
and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_TYPE]
== QUANTIZE_ASYMMETRIC else QUANTIZE_TYPE_DEFAULT),
(1 if QUANTIZE_ALGO in param_dict[QUANTIZE_TRAINING] and QUANTIZE_ROUNDING
in param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO].keys()
and param_dict[QUANTIZE_TRAINING][QUANTIZE_ALGO][QUANTIZE_ROUNDING]
== STOCHASTIC_ROUNDING else QUANTIZE_ROUNDING_DEFAULT),
(param_dict[QUANTIZE_TRAINING][QUANTIZE_VERBOSE] if QUANTIZE_VERBOSE
in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZE_VERBOSE_DEFAULT),
(param_dict[QUANTIZE_TRAINING][QUANTIZER_KERNEL] if QUANTIZER_KERNEL
in param_dict[QUANTIZE_TRAINING].keys() else QUANTIZER_KERNEL_DEFAULT),
)
else:
return (
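
Every conditional reflowed above follows the same pattern: look up a (possibly nested) key in `param_dict` and fall back to a default when any level is missing. A small helper makes the pattern easier to read — this is a sketch of the idea, not DeepSpeed code, and the string keys in the toy dict are only guesses at the constants' values:

```python
# Sketch: the nested lookup-with-default pattern used throughout
# get_quantize_training. Key names in the example dict are illustrative
# guesses, not the authoritative DeepSpeed config schema.
def nested_get(d, keys, default):
    """Walk `keys` into nested dict `d`, returning `default` on any miss."""
    for key in keys:
        if not isinstance(d, dict) or key not in d:
            return default
        d = d[key]
    return d

cfg = {"quantize_training": {"fp16_mixed_quantize": {"enabled": True}}}
print(nested_get(cfg, ["quantize_training", "fp16_mixed_quantize", "enabled"], False))  # True
print(nested_get(cfg, ["quantize_training", "quantize_groups"], 1))                     # 1
```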
12 changes: 6 additions & 6 deletions deepspeed/runtime/engine.py
@@ -511,10 +511,10 @@ def tensorboard_job_name(self):
return self._config.tensorboard_job_name

def get_summary_writer(
self,
name="DeepSpeedJobName",
base=os.path.join(os.path.expanduser("~"),
"tensorboard"),
self,
name="DeepSpeedJobName",
base=os.path.join(os.path.expanduser("~"),
"tensorboard"),
):
if self.tensorboard_output_path():
base_dir = self.tensorboard_output_path()
@@ -1570,8 +1570,8 @@ def forward(self, *inputs, **kwargs):
else:
see_memory_usage("Engine before forward", force=self.memory_breakdown())

flops_profiler_active = (self.flops_profiler_enabled() and
self.global_steps == self.flops_profiler_profile_step()
flops_profiler_active = (self.flops_profiler_enabled() and self.global_steps
== self.flops_profiler_profile_step()
and self.global_rank == 0)

if flops_profiler_active:
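
The reflowed condition gates the flops profiler on three things: the profiler is enabled, the current step is the configured profile step, and this is global rank 0. As a standalone predicate (illustrative only; the engine reads these values from its config and runtime state):

```python
# Sketch: the gating logic behind flops_profiler_active, written as a pure
# function. Illustrative; not the engine's API.
def flops_profiler_active(enabled: bool, global_step: int,
                          profile_step: int, global_rank: int) -> bool:
    return enabled and global_step == profile_step and global_rank == 0

print(flops_profiler_active(True, 10, 10, 0))  # True
print(flops_profiler_active(True, 10, 10, 1))  # False: only rank 0 profiles
```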
4 changes: 2 additions & 2 deletions deepspeed/runtime/utils.py
@@ -354,8 +354,8 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None):
total_norm = 0
for p in parameters:
if mpu is not None:
if (mpu.get_model_parallel_rank() == 0
) or is_model_parallel_parameter(p):
if (mpu.get_model_parallel_rank()
== 0) or is_model_parallel_parameter(p):
param_norm = p.grad.data.norm(norm_type)
total_norm += param_norm.item()**norm_type
else:
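
The hunk only re-wraps the rank check, but the surrounding norm accumulation is worth spelling out: each parameter contributes `||grad||^p`, the contributions are summed, and the total is raised to `1/p` before the clip coefficient is applied. A single-process sketch without the model-parallel (`mpu`) filtering:

```python
# Sketch: gradient-norm computation and clipping over a parameter list,
# mirroring the accumulation in clip_grad_norm_ but without model-parallel
# rank filtering. Illustrative only.
import torch

def clip_grad_norm(parameters, max_norm, norm_type=2.0):
    total_norm = 0.0
    for p in parameters:
        if p.grad is not None:
            total_norm += p.grad.data.norm(norm_type).item() ** norm_type
    total_norm = total_norm ** (1.0 / norm_type)
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for p in parameters:
            if p.grad is not None:
                p.grad.data.mul_(clip_coef)
    return total_norm

w = torch.nn.Parameter(torch.ones(4))
w.grad = torch.full((4,), 10.0)
print(clip_grad_norm([w], max_norm=1.0))  # 20.0 (norm before clipping)
```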
5 changes: 2 additions & 3 deletions deepspeed/runtime/zero/config.py
@@ -104,9 +104,8 @@ def _initialize(self, zero_config_dict):
self.overlap_comm = get_scalar_param(
zero_config_dict,
ZERO_OPTIMIZATION_OVERLAP_COMM,
ZERO3_OPTIMIZATION_OVERLAP_COMM_DEFAULT
if self.stage == ZERO_OPTIMIZATION_WEIGHTS else
ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT)
ZERO3_OPTIMIZATION_OVERLAP_COMM_DEFAULT if self.stage
== ZERO_OPTIMIZATION_WEIGHTS else ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT)

self.allgather_partitions = get_scalar_param(
zero_config_dict,
9 changes: 4 additions & 5 deletions deepspeed/runtime/zero/partition_parameters.py
@@ -488,8 +488,8 @@ def get_model():
# Remote device is the device where parameter partiitons are stored
# It can be same as local_device or it could be CPU or NVMe.
self.remote_device = self.local_device if remote_device is None else remote_device
self.pin_memory = pin_memory if (
self.remote_device == OFFLOAD_CPU_DEVICE) else False
self.pin_memory = pin_memory if (self.remote_device
== OFFLOAD_CPU_DEVICE) else False

# Enable fp16 param swapping to NVMe
if self.remote_device == OFFLOAD_NVME_DEVICE:
@@ -783,9 +783,8 @@ def _partition_param(self, param, buffer=None, has_been_updated=False):
partitioned_tensor = torch.empty(
partition_size,
dtype=param.dtype,
device=OFFLOAD_CPU_DEVICE
if self.remote_device == OFFLOAD_NVME_DEVICE else
self.remote_device)
device=OFFLOAD_CPU_DEVICE if self.remote_device
== OFFLOAD_NVME_DEVICE else self.remote_device)
if self.pin_memory:
partitioned_tensor = partitioned_tensor.pin_memory()

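
The rewrapped lines decide where a parameter partition lives: NVMe partitions are staged through a CPU buffer, and host memory is pinned only for CPU offload so copies back to the GPU can be asynchronous. A plain-PyTorch sketch of the allocation pattern (device names here are ordinary strings, not DeepSpeed's OFFLOAD_* constants):

```python
# Sketch: allocate a partition buffer on the offload target, pinning host
# memory only for CPU offload. Illustrative; device strings stand in for
# DeepSpeed's OFFLOAD_* constants.
import torch

def allocate_partition(numel, dtype, remote_device, pin_memory=True):
    # NVMe partitions are staged through a CPU buffer before being swapped out.
    device = "cpu" if remote_device in ("cpu", "nvme") else remote_device
    buf = torch.empty(numel, dtype=dtype, device=device)
    # Pinning enables async host-to-device copies; it needs a CUDA-enabled build.
    if pin_memory and remote_device == "cpu" and torch.cuda.is_available():
        buf = buf.pin_memory()
    return buf

part = allocate_partition(1024, torch.float16, remote_device="cpu")
print(part.numel(), part.device)  # 1024 cpu
```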
4 changes: 2 additions & 2 deletions deepspeed/runtime/zero/stage3.py
@@ -2733,8 +2733,8 @@ def _optimizer_states_and_gradient_swap_out(self, sub_group_id, timer_names=set(

self.optimizer_swapper.swap_out_optimizer_state(
parameter=self.fp32_partitioned_groups_flat[sub_group_id],
async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is
not None)
async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id]
is not None)

self.stop_timers([OPTIMIZER_SWAP_OUT_STATE])
see_memory_usage(
10 changes: 5 additions & 5 deletions docs/_posts/2020-05-28-fastest-bert-training.md
@@ -16,11 +16,11 @@ example, DeepSpeed can attain a staggering 64 teraflops of single GPU
performance on a NVIDIA V100 GPU which is over 50% of the hardware peak.

In this blog post, we will discuss four technological improvements that enable
DeepSpeed to achieve this record-breaking BERT training time.
DeepSpeed to achieve this record-breaking BERT training time.

1. Highly optimized transformer kernels to improve compute efficiency
2. Overlapping I/O with computation through asynchronous prefetching queue
3. Sparse output processing to eliminate wasteful computation
3. Sparse output processing to eliminate wasteful computation
4. Layer-norm reordering for training stability and faster convergence

These optimizations not only benefit BERT; they are also applicable to many
@@ -143,7 +143,7 @@ transferring data to and from global memory and overhead from kernel launching.
Existing compiler-based approaches perform fine-grained fusion (e.g., fusion of
element-wise operations), leading to missed fusion opportunities. In contrast,
we fully exploit both fine-grain and coarse-grained fusion, tailored for
Transformer blocks.
Transformer blocks.

**QKV and various fusions.** We merge the three Query (Q), Key (K), and Value (V)
weight matrices to dispatch a larger QKV GEMM to expose more parallelism and
@@ -160,7 +160,7 @@ order to rearrange the data in a way that we can put the data in consecutive
parts of memory. Even though we produce an uncoalesced pattern when accessing
shared memory, we reduce the cost of uncoalesced access to main memory to
better exploit memory bandwidth, resulting in 3% to 5% performance improvement
in the end-to-end training.
in the end-to-end training.

![QKV-Fusion](/assets/images/qkv_fusion.png){: .align-center}
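
The QKV fusion described above amounts to concatenating the three projection matrices so a single, larger GEMM produces queries, keys, and values at once. A minimal PyTorch sketch of the idea (the DeepSpeed transformer kernel performs this fusion, plus the layout work described above, inside CUDA):

```python
# Sketch: fused QKV projection -- one large GEMM followed by a split, instead
# of three separate projections. Dimensions are illustrative.
import torch
import torch.nn as nn

batch, seq, hidden = 8, 128, 1024
x = torch.randn(batch, seq, hidden)

qkv_proj = nn.Linear(hidden, 3 * hidden)   # W_q, W_k, W_v stored as one matrix
q, k, v = qkv_proj(x).chunk(3, dim=-1)     # single GEMM, then split
print(q.shape, k.shape, v.shape)           # each: (8, 128, 1024)
```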

@@ -280,7 +280,7 @@ a modification described by several recent
translation. The Pre-LayerNorm results in several useful characteristics such
as avoiding vanishing gradient, stable optimization, and performance gain. It
allows us to train at aggregated batch size of 64K with increased learning rate
and faster convergence.
and faster convergence.
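
The Pre-LayerNorm reordering places layer normalization inside each residual branch, i.e. `x + sublayer(LN(x))` rather than `LN(x + sublayer(x))`. A compact sketch of one pre-LN transformer block (dimensions and the simple MLP are illustrative, not the DeepSpeed kernel):

```python
# Sketch: a Pre-LayerNorm transformer block -- LayerNorm runs before each
# sublayer and the residual adds the un-normalized input. Illustrative only.
import torch
import torch.nn as nn

class PreLNBlock(nn.Module):
    def __init__(self, hidden=256, heads=4):
        super().__init__()
        self.ln1 = nn.LayerNorm(hidden)
        self.attn = nn.MultiheadAttention(hidden, heads, batch_first=True)
        self.ln2 = nn.LayerNorm(hidden)
        self.mlp = nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(),
                                 nn.Linear(4 * hidden, hidden))

    def forward(self, x):
        h = self.ln1(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]  # pre-LN residual
        x = x + self.mlp(self.ln2(x))
        return x

x = torch.randn(2, 16, 256)
print(PreLNBlock()(x).shape)  # torch.Size([2, 16, 256])
```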


To try out these optimizations and training recipe, please check out our [BERT
6 changes: 3 additions & 3 deletions docs/_posts/2021-03-08-zero3-offload.md
@@ -12,7 +12,7 @@ Today we are announcing the release of ZeRO-3 Offload, a highly efficient and ea
* Extremely Easy to use:
* Scale to over a trillion parameters without the need to combine multiple parallelism techniques in complicated ways.
* For existing DeepSpeed users, turn on ZeRO-3 Offload with just a few flags in DeepSpeed Config file.
* High-performance per-GPU throughput and super-linear scalability across GPUs for distributed training.
* High-performance per-GPU throughput and super-linear scalability across GPUs for distributed training.
* With 1 Trillion parameters, ZeRO-3 Offload sustains 25 PetaFlops in compute performance on 512 NVIDIA V100 GPUs, achieving 49 TFlops/GPU.
* Up to 2x improvement in throughput compared to ZeRO- 2 Offload on single GPU
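
For the "few flags" mentioned in the list above, enabling ZeRO stage 3 with CPU offload comes down to a small addition to the DeepSpeed config. The dict below is an illustrative sketch; the key names are assumptions based on the ZeRO-3 Offload documentation of the time, so verify them against the current config reference:

```python
# Sketch: a DeepSpeed config dict enabling ZeRO stage 3 with parameter and
# optimizer offload to CPU. Key names are assumptions to verify against the
# DeepSpeed config reference.
ds_config = {
    "train_batch_size": 32,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "offload_param": {"device": "cpu", "pin_memory": True},
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
    },
}
# model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
#                                                      config=ds_config)
```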

@@ -64,7 +64,7 @@ i) With ground-breaking memory efficiency, ZeRO-3 and ZeRO-3 Offload are the onl
ii) ZeRO-3 Offload requires virtually no model refactoring from model scientists, liberating data scientists to scale up complex models to hundreds of billions to trillions of parameters.

<h2>Excellent training efficiency</h2>
<i>High-performance per-GPU throughput on multiple nodes</i>: ZeRO-3 Offload offers excellent training efficiency for multi-billion and trillion parameter models on multiple nodes. It achieves a sustained throughput of up to 50 Tflops per GPU running on 32 DGX2 nodes comprising 512 NVIDIA V100 GPUs (see Figure 2). In comparison, the standard data parallel training with PyTorch can only achieve 30 TFlops per GPU for a 1.2B parameter model, the largest model that can be trained using data parallelism alone.
<i>High-performance per-GPU throughput on multiple nodes</i>: ZeRO-3 Offload offers excellent training efficiency for multi-billion and trillion parameter models on multiple nodes. It achieves a sustained throughput of up to 50 Tflops per GPU running on 32 DGX2 nodes comprising 512 NVIDIA V100 GPUs (see Figure 2). In comparison, the standard data parallel training with PyTorch can only achieve 30 TFlops per GPU for a 1.2B parameter model, the largest model that can be trained using data parallelism alone.

<a href="/assets/images/zero3-offload-512-v100.png">
<img src="/assets/images/zero3-offload-512-v100.png">
@@ -74,7 +74,7 @@ Figure 2. ZeRO-3 Offload: Multi-billion and trillion parameter model throughput
ZeRO-3 Offload obtains high efficiency despite the 50% communication overhead of ZeRO Stage 3 compared to standard data parallel training for a fixed batch size. This is made possible through a communication overlap centric design and implementation, which allows ZeRO-3 Offload to hide nearly all of the communication volume with computation, while taking advantage of a larger batch size for improved efficiency resulting from better GPU memory efficiency.


<i>Efficient multi-billion parameter model training on a single GPU</i>: ZeRO-3 Offload further democratizes AI by enabling efficient training of multi-billion parameter models on a single GPU. For single GPU training, ZeRO-3 Offload provides benefits over ZeRO-2 Offload along two dimensions. First, ZeRO-3 Offload increases the size of models trainable on a single V100 from 13B to 40B. Second, for ZeRO-3 Offload provides speedups (e.g., 2.3X for 13B) compared to ZeRO-2 Offload for model sizes trainable by both solutions. These results are summarized in Figure 3.
<i>Efficient multi-billion parameter model training on a single GPU</i>: ZeRO-3 Offload further democratizes AI by enabling efficient training of multi-billion parameter models on a single GPU. For single GPU training, ZeRO-3 Offload provides benefits over ZeRO-2 Offload along two dimensions. First, ZeRO-3 Offload increases the size of models trainable on a single V100 from 13B to 40B. Second, for ZeRO-3 Offload provides speedups (e.g., 2.3X for 13B) compared to ZeRO-2 Offload for model sizes trainable by both solutions. These results are summarized in Figure 3.

<a href="/assets/images/zero3-offload-1-v100.png">
<img src="/assets/images/zero3-offload-1-v100.png">
2 changes: 1 addition & 1 deletion docs/_posts/2021-05-05-MoQ.md
@@ -53,7 +53,7 @@ For enabling quantization through Deepspeed, we only need to pass the scheduling

## Improving quantization accuracy.

To show how our quantization scheme preserves accuracy, we have experimented MoQ on several tasks and networks: GLUE tasks on Bert-Base and SQuAD on Bert-Large. Table 1 shows the accuracy results for the baseline without quantization (w/o Quant), basic quantization without using any scheduling during training (Basic Quant), and our MoQ scheme. Without using any scheduling, the accuracy for 8-bit quantization is often inferior to the baseline, and in this workload, it suffers from a drop of 1.02 point in accuracy (ACC). In contrast, MoQ powers 8-bit quantization to obtain comparable accuracy as the FP16 baseline, even with a slightly higher ACC, demonstrating the effectiveness of our quantization approach.
To show how our quantization scheme preserves accuracy, we have experimented MoQ on several tasks and networks: GLUE tasks on Bert-Base and SQuAD on Bert-Large. Table 1 shows the accuracy results for the baseline without quantization (w/o Quant), basic quantization without using any scheduling during training (Basic Quant), and our MoQ scheme. Without using any scheduling, the accuracy for 8-bit quantization is often inferior to the baseline, and in this workload, it suffers from a drop of 1.02 point in accuracy (ACC). In contrast, MoQ powers 8-bit quantization to obtain comparable accuracy as the FP16 baseline, even with a slightly higher ACC, demonstrating the effectiveness of our quantization approach.

|Task |STSB |MRPC |COLA |WNLI |SST2 |RTE |QNLI |QQP |MNLI |SQuAD|ACC+ |
|-------------|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|-----|