From ad8417c5e6224f345829b7b0c076d98fd1cfab7b Mon Sep 17 00:00:00 2001
From: Quentin Anthony
Date: Fri, 10 May 2024 11:58:20 -0400
Subject: [PATCH 1/2] Add TE skeleton

---
 megatron/model/transformer_engine.py | 101 +++++++++++++++++++++++++++
 tests/README.md                      |   1 +
 2 files changed, 102 insertions(+)
 create mode 100644 megatron/model/transformer_engine.py

diff --git a/megatron/model/transformer_engine.py b/megatron/model/transformer_engine.py
new file mode 100644
index 000000000..8e3d0d527
--- /dev/null
+++ b/megatron/model/transformer_engine.py
@@ -0,0 +1,101 @@
+import transformer_engine as te
+import torch
+from importlib.metadata import version
+from pkg_resources import packaging
+
+_te_version = packaging.version.Version(version("transformer-engine"))
+
+
+class TENorm:
+    """
+    A conditional wrapper to initialize an instance of Transformer Engine's
+    `LayerNorm` or `RMSNorm` based on input.
+    """
+
+    def __new__(cls):
+        # TODO ???
+        return
+
+
+class TELinear(te.pytorch.Linear):
+    """
+    Wrapper for Transformer Engine's `Linear` layer.
+    """
+
+    def __init__(self):
+        # TODO: Nick
+        return
+
+    def forward(self, x):
+        # TODO: Nick
+        return
+
+
+class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear):
+    """
+    Wrapper for Transformer Engine's `LayerNormLinear` layer that combines
+    the layernorm and linear layers.
+    """
+
+    def __init__(self):
+        # TODO: Nick
+        return
+
+    def forward(self, x):
+        # TODO: Nick
+        return
+
+
+class TEColumnParallelLinear(TELinear):
+    """
+    Wrapper for Transformer Engine's `Linear` layer, specialized to mirror
+    Megatron's `ColumnParallelLinear` layer.
+    """
+
+    def __init__(self):
+        # TODO: Nick
+        return
+
+    def forward(self, x):
+        # TODO: Nick
+        return
+
+
+class TERowParallelLinear(TELinear):
+    """
+    Wrapper for Transformer Engine's `Linear` layer, specialized to mirror
+    Megatron's `RowParallelLinear` layer.
+    """
+
+    def __init__(self):
+        # TODO: Nick
+        return
+
+    def forward(self, x):
+        # TODO: Nick
+        return
+
+
+class TEDotProductAttention(te.pytorch.DotProductAttention):
+    """
+    Wrapper for Transformer Engine's `DotProductAttention` layer, with flash
+    attention enabled.
+    """
+
+    def __init__(self):
+        # TODO: tfidia
+        return
+
+    def forward(self, x):
+        # TODO: tfidia
+        return
+
+
+class TEDelayedScaling(te.common.recipe.DelayedScaling):
+    """
+    Wrapper for Transformer Engine's `DelayedScaling` FP8 recipe.
+    """
+
+    def __init__(self):
+        # TODO: ???
+        return
diff --git a/tests/README.md b/tests/README.md
index 316096cc5..c1fac0f81 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -3,6 +3,7 @@ Tests use pytest with the coverage and forked plugins.
 Install with:
 ```bash
+pip install -r requirements/requirements.txt
 pip install -r requirements/requirements-dev.txt
 ```

From 3dccb2f03a569bd40b84c9d4d24ad44d0971c24f Mon Sep 17 00:00:00 2001
From: github-actions
Date: Fri, 10 May 2024 16:00:43 +0000
Subject: [PATCH 2/2] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 47 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index c8e1492ae..24313b68d 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-  Default = 6fb840e
+  Default = ad8417c
 
   current git hash of repository
 
@@ -1201,7 +1201,7 @@ Text Generation arguments
 
 
 
-- **num_experts**: int
+- **moe_num_experts**: int
 
   Default = 1
 
@@ -1243,7 +1243,7 @@ Text Generation arguments
 
 - **moe_token_dropping**: bool
 
-  Default = True
+  Default = False
 
   Whether to drop tokens when exceeding capacity
 
@@ -1273,6 +1273,47 @@ Text Generation arguments
 
 
 
+- **moe_type**: str
+
+  Default = megablocks
+
+  Either `deepspeed` or `megablocks`
+
+
+
+- **moe_glu**: bool
+
+  Default = False
+
+  Use gated linear units in MoE
+
+
+
+- **moe_lbl_in_fp32**: bool
+
+  Default = False
+
+  Whether to compute the load balancing loss in fp32.
+
+
+
+- **moe_jitter_eps**: float
+
+  Default = None
+
+  Coefficient for MoE routing jitter. Jitter is
+  not used if set to None
+
+
+
+- **enable_expert_tensor_parallelism**: bool
+
+  Default = False
+
+  Enable expert tensor parallelism
+
+
+
 ## NeoXArgsTokenizer
 
 Tokenizer Arguments
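
For context on the `TENorm` placeholder in the first patch: below is a minimal, hypothetical sketch of how its `__new__` might dispatch between Transformer Engine's `LayerNorm` and `RMSNorm`. The `norm_type` and `eps` arguments are illustrative assumptions rather than existing GPT-NeoX arguments, and this code is not part of the patch itself.

```python
# Minimal sketch only -- not part of the patch above. Assumes Transformer Engine
# is installed; `norm_type` is a hypothetical selector string.
import transformer_engine as te


class TENorm:
    """
    Conditional wrapper that returns a Transformer Engine `LayerNorm` or
    `RMSNorm` instance depending on the requested norm type.
    """

    def __new__(cls, hidden_size, eps=1.0e-5, norm_type="layernorm"):
        if norm_type == "layernorm":
            return te.pytorch.LayerNorm(hidden_size, eps=eps)
        elif norm_type == "rmsnorm":
            return te.pytorch.RMSNorm(hidden_size, eps=eps)
        raise ValueError(f"Unsupported norm type: {norm_type}")


# Example usage: norm = TENorm(4096, norm_type="rmsnorm")
```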
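
Similarly, for the MoE arguments documented in the second patch, here is a hedged illustration of how they might be combined, shown as a plain Python dict rather than a NeoX YAML config; the expert count is an arbitrary example, and the remaining values mirror the documented defaults.

```python
# Illustrative combination of the MoE arguments documented above. The expert
# count (8) is an arbitrary example; the other values are the stated defaults.
moe_settings = {
    "moe_num_experts": 8,                       # renamed from `num_experts`
    "moe_type": "megablocks",                   # either "deepspeed" or "megablocks"
    "moe_token_dropping": False,                # default changed from True to False
    "moe_glu": False,                           # gated linear units in MoE
    "moe_lbl_in_fp32": False,                   # compute load-balancing loss in fp32
    "moe_jitter_eps": None,                     # routing jitter disabled when None
    "enable_expert_tensor_parallelism": False,  # expert tensor parallelism
}
```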