From effb2fb83d8200cbf965e9053a5e07c3fe79c894 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Sun, 8 Aug 2021 21:50:34 +0900 Subject: [PATCH] Add GLU variants (#47) * feat: add glu variant activations * fix: rm extraneous parentheses * feat: rm bias to support jit * fix: replace negative dim with explicit dim * fix: use `x.ndim` for generic dim handling * docs: add note on version for posterity Co-authored-by: Stas Bekman * docs: specify jit in `x.ndim` comment Co-authored-by: Stas Bekman * test: add simple tests to check activations * fix: use `torch.testing` for tensor checks * test: use seed-controlled random batch inputs Co-authored-by: Stas Bekman --- megatron/model/activations.py | 40 ++++++++++++++++++++++++++++++++ tests/test_activations.py | 43 +++++++++++++++++++++++++++++++++++ tests/utils.py | 10 ++++++++ 3 files changed, 93 insertions(+) create mode 100644 megatron/model/activations.py create mode 100644 tests/test_activations.py create mode 100644 tests/utils.py diff --git a/megatron/model/activations.py b/megatron/model/activations.py new file mode 100644 index 000000000..82ccdf098 --- /dev/null +++ b/megatron/model/activations.py @@ -0,0 +1,40 @@ +import torch +from torch import nn +from torch.nn import functional as F + + +class _GLUBaseModule(nn.Module): + def __init__(self, activation_fn): + super().__init__() + self.activation_fn = activation_fn + + def forward(self, x): + # dim=-1 breaks in jit for pt<1.10 + x1, x2 = x.chunk(2, dim=(x.ndim-1)) + return x1 * self.activation_fn(x2) + + +class LiGLU(_GLUBaseModule): + def __init__(self): + super().__init__(nn.Identity()) + + +class GEGLU(_GLUBaseModule): + def __init__(self): + super().__init__(F.gelu) + + +class ReGLU(_GLUBaseModule): + def __init__(self): + super().__init__(F.relu) + + +class SwiGLU(_GLUBaseModule): + def __init__(self): + super().__init__(F.silu) + + +liglu = torch.jit.script(LiGLU()) +geglu = torch.jit.script(GEGLU()) +reglu = torch.jit.script(ReGLU()) +swiglu = torch.jit.script(SwiGLU()) diff --git a/tests/test_activations.py b/tests/test_activations.py new file mode 100644 index 000000000..6d1f8ab1d --- /dev/null +++ b/tests/test_activations.py @@ -0,0 +1,43 @@ +import random +import unittest + +import torch +from torch.nn import functional as F + +from megatron.model.activations import liglu, geglu, reglu, swiglu + +from .utils import set_seed + + +class TestActivations(unittest.TestCase): + def setUp(self): + """setup an input of reasonable size""" + set_seed() + self.batch_size = random.randint(2, 64) + self.seq_len = random.randint(256, 1025) + self.num_channels = random.randint(1, 384) * 2 + self.x = torch.randn(self.batch_size, self.seq_len, self.num_channels) + self.x1, self.x2 = self.x.chunk(2, dim=-1) + + def test_shapes(self): + # glu should halve the last dimension + output_shape = [self.batch_size, self.seq_len, self.num_channels // 2] + for activation_fn in [liglu, geglu, reglu, swiglu]: + output = activation_fn(self.x) + self.assertEqual(list(output.shape), output_shape) + + def test_liglu(self): + expected = self.x1 * self.x2 + torch.testing.assert_equal(liglu(self.x), expected) + + def test_geglu(self): + expected = self.x1 * F.gelu(self.x2) + torch.testing.assert_equal(geglu(self.x), expected) + + def test_reglu(self): + expected = self.x1 * F.relu(self.x2) + torch.testing.assert_equal(reglu(self.x), expected) + + def test_swiglu(self): + expected = self.x1 * F.silu(self.x2) + torch.testing.assert_equal(swiglu(self.x), expected) \ No newline at end of file diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 000000000..623897aa8 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,10 @@ +import random + +import numpy as np +import torch + + +def set_seed(seed=42): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) \ No newline at end of file