From ad8417c5e6224f345829b7b0c076d98fd1cfab7b Mon Sep 17 00:00:00 2001
From: Quentin Anthony
Date: Fri, 10 May 2024 11:58:20 -0400
Subject: [PATCH 1/2] Add TE skeleton

---
 megatron/model/transformer_engine.py | 101 +++++++++++++++++++++++++++
 tests/README.md                      |   1 +
 2 files changed, 102 insertions(+)
 create mode 100644 megatron/model/transformer_engine.py

diff --git a/megatron/model/transformer_engine.py b/megatron/model/transformer_engine.py
new file mode 100644
index 000000000..8e3d0d527
--- /dev/null
+++ b/megatron/model/transformer_engine.py
@@ -0,0 +1,101 @@
+import transformer_engine as te
+import torch
+from importlib.metadata import version
+from pkg_resources import packaging
+
+_te_version = packaging.version.Version(version("transformer-engine"))
+
+
+class TENorm:
+    """
+    A conditional wrapper to initialize an instance of Transformer Engine's
+    `LayerNorm` or `RMSNorm` based on input.
+    """
+
+    def __new__(cls):
+        # TODO ???
+        return
+
+
+class TELinear(te.pytorch.Linear):
+    """
+    Wrapper for Transformer Engine's `Linear` layer.
+    """
+
+    def __init__(self):
+        # TODO: Nick
+        return
+
+    def forward(self, x):
+        # TODO: Nick
+        return
+
+
+class TELayerNormColumnParallelLinear(te.pytorch.LayerNormLinear):
+    """
+    Wrapper for Transformer Engine's `LayerNormLinear` layer that combines
+    the layernorm and linear layers.
+    """
+
+    def __init__(self):
+        # TODO: Nick
+        return
+
+    def forward(self, x):
+        # TODO: Nick
+        return
+
+
+class TEColumnParallelLinear(TELinear):
+    """
+    Wrapper for Transformer Engine's `Linear` layer, specialized to mirror
+    Megatron's `ColumnParallelLinear` layer.
+    """
+
+    def __init__(self):
+        # TODO: Nick
+        return
+
+    def forward(self, x):
+        # TODO: Nick
+        return
+
+
+class TERowParallelLinear(TELinear):
+    """
+    Wrapper for Transformer Engine's `Linear` layer, specialized to mirror
+    Megatron's `RowParallelLinear` layer.
+    """
+
+    def __init__(self):
+        # TODO: Nick
+        return
+
+    def forward(self, x):
+        # TODO: Nick
+        return
+
+
+class TEDotProductAttention(te.pytorch.DotProductAttention):
+    """
+    Wrapper for Transformer Engine's `DotProductAttention` layer, with flash
+    attention enabled.
+    """
+
+    def __init__(self):
+        # TODO: tfidia
+        return
+
+    def forward(self, x):
+        # TODO: tfidia
+        return
+
+
+class TEDelayedScaling(te.common.recipe.DelayedScaling):
+    """
+    Wrapper for Transformer Engine's `DelayedScaling` FP8 recipe.
+    """
+
+    def __init__(self):
+        # TODO: ???
+        return
diff --git a/tests/README.md b/tests/README.md
index 316096cc5..c1fac0f81 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -3,6 +3,7 @@ Tests use pytest with the coverage and forked plugins.
 Install with:
 ```bash
+pip install -r requirements/requirements.txt
 pip install -r requirements/requirements-dev.txt
 ```

From 3dccb2f03a569bd40b84c9d4d24ad44d0971c24f Mon Sep 17 00:00:00 2001
From: github-actions
Date: Fri, 10 May 2024 16:00:43 +0000
Subject: [PATCH 2/2] Update NeoXArgs docs automatically

---
 configs/neox_arguments.md | 47 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index c8e1492ae..24313b68d 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-  Default = 6fb840e
+  Default = ad8417c
 
   current git hash of repository
 
@@ -1201,7 +1201,7 @@ Text Generation arguments
 
 
 
-- **num_experts**: int
+- **moe_num_experts**: int
 
   Default = 1
 
@@ -1243,7 +1243,7 @@ Text Generation arguments
 
 - **moe_token_dropping**: bool
 
-  Default = True
+  Default = False
 
   Whether to drop tokens when exceeding capacity
 
@@ -1273,6 +1273,47 @@ Text Generation arguments
 
 
 
+- **moe_type**: str
+
+  Default = megablocks
+
+  Either `deepspeed` or `megablocks`
+
+
+
+- **moe_glu**: bool
+
+  Default = False
+
+  Use gated linear units in MoE
+
+
+
+- **moe_lbl_in_fp32**: bool
+
+  Default = False
+
+  Whether to compute the load balancing loss in fp32.
+
+
+
+- **moe_jitter_eps**: float
+
+  Default = None
+
+  Coefficient for MoE routing jitter. Jitter is
+  not used if set to None
+
+
+
+- **enable_expert_tensor_parallelism**: bool
+
+  Default = False
+
+  Enable expert tensor parallelism
+
+
+
 ## NeoXArgsTokenizer
 
 Tokenizer Arguments
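
For context on the `TENorm` placeholder in the first patch: below is a minimal, hypothetical sketch of how its `__new__` might dispatch between Transformer Engine's `LayerNorm` and `RMSNorm`. The `norm_type` and `eps` arguments are illustrative assumptions rather than existing GPT-NeoX arguments, and this code is not part of the patch itself.

```python
# Minimal sketch only -- not part of the patch above. Assumes Transformer Engine
# is installed; `norm_type` is a hypothetical selector string.
import transformer_engine as te


class TENorm:
    """
    Conditional wrapper that returns a Transformer Engine `LayerNorm` or
    `RMSNorm` instance depending on the requested norm type.
    """

    def __new__(cls, hidden_size, eps=1.0e-5, norm_type="layernorm"):
        if norm_type == "layernorm":
            return te.pytorch.LayerNorm(hidden_size, eps=eps)
        elif norm_type == "rmsnorm":
            return te.pytorch.RMSNorm(hidden_size, eps=eps)
        raise ValueError(f"Unsupported norm type: {norm_type}")


# Example usage: norm = TENorm(4096, norm_type="rmsnorm")
```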
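
Similarly, for the MoE arguments documented in the second patch, here is a hedged illustration of how they might be combined, shown as a plain Python dict rather than a NeoX YAML config; the expert count is an arbitrary example, and the remaining values mirror the documented defaults.

```python
# Illustrative combination of the MoE arguments documented above. The expert
# count (8) is an arbitrary example; the other values are the stated defaults.
moe_settings = {
    "moe_num_experts": 8,                       # renamed from `num_experts`
    "moe_type": "megablocks",                   # either "deepspeed" or "megablocks"
    "moe_token_dropping": False,                # default changed from True to False
    "moe_glu": False,                           # gated linear units in MoE
    "moe_lbl_in_fp32": False,                   # compute load-balancing loss in fp32
    "moe_jitter_eps": None,                     # routing jitter disabled when None
    "enable_expert_tensor_parallelism": False,  # expert tensor parallelism
}
```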