Commit 640168f

cleanup activation
Fix #62.
Fix #63.
albertz committed Nov 5, 2021
1 parent 162208e commit 640168f
Showing 3 changed files with 22 additions and 100 deletions.
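In short: the generated `activation` wrapper and the `activation=` option of the generated layer wrappers (e.g. Linear, Conv, _Combine, eval) are removed; activations are now applied explicitly via the functions in nn/math_.py (#63, #62). A rough before/after sketch of user code, under the assumption that the package is imported as `nn` and that a generated `linear` wrapper with an `n_out` argument exists:

# Hypothetical sketch, not code from this commit; assumes `linear` is one of
# the generated layer wrappers and takes an `n_out` argument.
from returnn_common import nn

def block(x):
  # Before this commit, the activation could be passed as a layer option:
  #   return nn.linear(x, n_out=512, activation="relu")
  # After this commit, the layer and the activation are composed explicitly:
  return nn.relu(nn.linear(x, n_out=512))
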
2 changes: 2 additions & 0 deletions nn/_generate_layers.py
@@ -44,6 +44,7 @@
"cond", "masked_computation", "subnetwork",

"source", # we have get_extern_data instead
"activation", # will be explicit. https://github.com/rwth-i6/returnn_common/issues/63
"swap_axes",
"gather_nd", # -> gather
"softmax", # misleading (because not just activation), also we will have a separate softmax activation
@@ -507,6 +508,7 @@ def has_recurrent_state(self) -> bool:
_IgnoreParamNames = {
"self", "name", "network", "output",
"n_out", "out_type", "sources", "target", "loss", "loss_", "size_target",
"activation", # more explicitly decoupled. https://github.com/rwth-i6/returnn_common/issues/62
"name_scope", "reuse_params",
"rec_previous_layer", "control_dependencies_on_output",
"state", "initial_state", "initial_output",
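For context, the two name sets touched above play different roles in the generator: the first blacklist skips generating a wrapper for that RETURNN layer class altogether (so there is no generated `activation` wrapper any more, #63), while _IgnoreParamNames drops the named argument from every generated wrapper's constructor (so Linear, Conv, etc. lose their `activation=` option, #62). A minimal illustration of how such sets are typically consulted; a sketch only, not the actual _generate_layers.py logic, with made-up helper names:

# Illustrative sketch; not the real generator code.
layer_blacklist = {"source", "activation", "swap_axes", "gather_nd", "softmax"}
ignore_param_names = {"self", "name", "network", "output", "activation"}

def wrapper_args(layer_name, init_arg_names):
  """Which constructor args end up in the generated wrapper for this layer."""
  if layer_name in layer_blacklist:
    return []  # no wrapper is generated for this layer at all
  return [a for a in init_arg_names if a not in ignore_param_names]
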
90 changes: 1 addition & 89 deletions nn/_generated_layers.py
@@ -1,6 +1,6 @@
"""
This file is auto-generated by _generate_layers.py.
RETURNN: 1.20211027.132338+git.a773137
RETURNN: 1.20211104.153922+git.bc19cba
These are the RETURNN layers directly wrapped.
Note that we intentionally exclude some layers or options for more consistency.
@@ -222,72 +222,6 @@ def scaled_gradient(
return mod(source, name=name)


class _Activation(_Base):
"""
This layer just applies an activation function.
See :func:`returnn.tf.util.basic.get_activation_function` about supported functions.
Also see :class:`EvalLayer` and :class:`CombineLayer` for similar layers.
"""
returnn_layer_class = 'activation'
has_recurrent_state = False
has_variables = False

# noinspection PyShadowingBuiltins,PyShadowingNames
def __init__(self,
*,
activation: str,
**kwargs):
"""
:param str activation: e.g. "relu", "tanh", etc
"""
super().__init__(**kwargs)
self.activation = activation

def get_opts(self):
"""
Return all options
"""
opts = {
'activation': self.activation,
}
opts = {key: value for (key, value) in opts.items() if value is not NotSpecified}
return {**opts, **super().get_opts()}

# noinspection PyShadowingBuiltins,PyShadowingNames
def make_layer_dict(self,
source: LayerRef,
) -> LayerDictRaw:
"""
Make layer dict
"""
assert isinstance(source, LayerRef)
return {
'class': 'activation',
'from': source,
**self.get_opts()}


# noinspection PyShadowingBuiltins,PyShadowingNames
def activation(
source: LayerRef,
*,
activation: str,
name: Optional[Union[str, NameCtx]] = None) -> Layer:
"""
This layer just applies an activation function.
See :func:`returnn.tf.util.basic.get_activation_function` about supported functions.
Also see :class:`EvalLayer` and :class:`CombineLayer` for similar layers.
:param LayerRef source:
:param str activation: e.g. "relu", "tanh", etc
:param str|None name:
"""
mod = _Activation(
activation=activation,
)
return mod(source, name=name)


class BatchNorm(_Copy):
"""
Implements batch-normalization (https://arxiv.org/abs/1502.03167) as a separate layer.
@@ -1143,7 +1077,6 @@ class Linear(_Base):
def __init__(self,
n_out: int,
*,
activation: Optional[str] = NotSpecified,
with_bias: bool = NotSpecified,
grad_filter: Optional[float] = NotSpecified,
forward_weights_init: str = NotSpecified,
@@ -1152,7 +1085,6 @@ def __init__(self,
**kwargs):
"""
:param int n_out: output dimension
:param str|None activation: e.g. "relu", or None
:param bool with_bias:
:param float|None grad_filter: if grad norm is higher than this threshold (before activation), the grad is removed
:param str forward_weights_init: see :func:`returnn.tf.util.basic.get_initializer`
@@ -1161,7 +1093,6 @@
"""
super().__init__(**kwargs)
self.n_out = n_out
self.activation = activation
self.with_bias = with_bias
self.grad_filter = grad_filter
self.forward_weights_init = forward_weights_init
@@ -1174,7 +1105,6 @@ def get_opts(self):
"""
opts = {
'n_out': self.n_out,
'activation': self.activation,
'with_bias': self.with_bias,
'grad_filter': self.grad_filter,
'forward_weights_init': self.forward_weights_init,
@@ -3170,7 +3100,6 @@ def __init__(self,
input_split_feature_dim: Optional[int] = NotSpecified,
auto_use_channel_first: bool = NotSpecified,
with_bias: Union[bool, NotSpecified] = NotSpecified,
activation: Optional[str] = NotSpecified,
forward_weights_init: Any = NotSpecified,
bias_init: Any = NotSpecified,
filter_perm: Optional[Dict[str, str]] = NotSpecified,
@@ -3193,7 +3122,6 @@ def __init__(self,
will be divided by input_split_feature_dim, thus it must be a multiple of that value.
:param bool auto_use_channel_first: convert the input to NCHW or not
:param bool|NotSpecified with_bias: if True, will add a bias to the output features. False by default
:param None|str activation: if set, will apply this function at the end
:param forward_weights_init:
:param bias_init:
:param dict[str,str]|None filter_perm: transposes the filter (input filter as layer)
@@ -3210,7 +3138,6 @@ def __init__(self,
self.input_split_feature_dim = input_split_feature_dim
self.auto_use_channel_first = auto_use_channel_first
self.with_bias = with_bias
self.activation = activation
self.forward_weights_init = forward_weights_init
self.bias_init = bias_init
self.filter_perm = filter_perm
@@ -3231,7 +3158,6 @@ def get_opts(self):
'input_split_feature_dim': self.input_split_feature_dim,
'auto_use_channel_first': self.auto_use_channel_first,
'with_bias': self.with_bias,
'activation': self.activation,
'forward_weights_init': self.forward_weights_init,
'bias_init': self.bias_init,
'filter_perm': self.filter_perm,
@@ -3455,7 +3381,6 @@ def __init__(self,
n_out: int,
*,
filter_size: List[int],
activation: Optional[str],
strides: Optional[List[int]] = NotSpecified,
padding: str = NotSpecified,
remove_padding: Any = NotSpecified,
@@ -3468,7 +3393,6 @@
"""
:param int n_out: output dimension
:param list[int] filter_size:
:param str|None activation:
:param list[int]|None strides: specifies the upscaling. by default, same as filter_size
:param str padding: "same" or "valid"
:param list[int]|int remove_padding:
@@ -3482,7 +3406,6 @@
super().__init__(**kwargs)
self.n_out = n_out
self.filter_size = filter_size
self.activation = activation
self.strides = strides
self.padding = padding
self.remove_padding = remove_padding
@@ -3499,7 +3422,6 @@ def get_opts(self):
opts = {
'n_out': self.n_out,
'filter_size': self.filter_size,
'activation': self.activation,
'strides': self.strides,
'padding': self.padding,
'remove_padding': self.remove_padding,
@@ -4523,7 +4445,6 @@ class _Combine(_Base):
def __init__(self,
*,
kind: str,
activation: Optional[str] = NotSpecified,
with_bias: bool = NotSpecified,
eval: Union[str, callable] = NotSpecified,
eval_locals: Optional[Dict[str]] = NotSpecified,
@@ -4532,15 +4453,13 @@ def __init__(self,
"""
:param str kind:
currently accepted values are `average`, `add`, `sub`, `mul`, `truediv`, `logical_and`, `logical_or`, or `eval`
:param str|None activation: if provided, activation function to apply, e.g. "tanh" or "relu"
:param bool with_bias: if given, will add a trainable bias tensor
:param str|callable eval: for kind="eval", will eval this string. or function. see :func:`_op_kind_eval`
:param dict[str]|None eval_locals: locals for eval
:param bool eval_for_output_loss: will do the same eval on layer.output_loss
"""
super().__init__(**kwargs)
self.kind = kind
self.activation = activation
self.with_bias = with_bias
self.eval = eval
self.eval_locals = eval_locals
@@ -4552,7 +4471,6 @@ def get_opts(self):
"""
opts = {
'kind': self.kind,
'activation': self.activation,
'with_bias': self.with_bias,
'eval': self.eval,
'eval_locals': self.eval_locals,
@@ -4580,7 +4498,6 @@ def _combine(
source: Union[List[LayerRef], Tuple[LayerRef]],
*,
kind: str,
activation: Optional[str] = NotSpecified,
with_bias: bool = NotSpecified,
eval: Union[str, callable] = NotSpecified,
eval_locals: Optional[Dict[str]] = NotSpecified,
@@ -4597,7 +4514,6 @@ def _combine(
:param list[LayerRef]|tuple[LayerRef] source:
:param str kind:
currently accepted values are `average`, `add`, `sub`, `mul`, `truediv`, `logical_and`, `logical_or`, or `eval`
:param str|None activation: if provided, activation function to apply, e.g. "tanh" or "relu"
:param bool with_bias: if given, will add a trainable bias tensor
:param str|callable eval: for kind="eval", will eval this string. or function. see :func:`_op_kind_eval`
:param dict[str]|None eval_locals: locals for eval
@@ -4606,7 +4522,6 @@
"""
mod = _Combine(
kind=kind,
activation=activation,
with_bias=with_bias,
eval=eval,
eval_locals=eval_locals,
@@ -4673,7 +4588,6 @@ def eval(
source: Union[LayerRef, List[LayerRef], Tuple[LayerRef]],
*,
eval: str,
activation: Optional[str] = NotSpecified,
with_bias: bool = NotSpecified,
eval_locals: Optional[Dict[str]] = NotSpecified,
eval_for_output_loss: bool = NotSpecified,
@@ -4689,15 +4603,13 @@
:param LayerRef|list[LayerRef]|tuple[LayerRef] source:
:param str eval: will eval this string. see :func:`_op_kind_eval`
:param str|None activation: if provided, activation function to apply, e.g. "tanh" or "relu"
:param bool with_bias: if given, will add a trainable bias tensor
:param dict[str]|None eval_locals: locals for eval
:param bool eval_for_output_loss: will do the same eval on layer.output_loss
:param str|None name:
"""
mod = _Eval(
eval=eval,
activation=activation,
with_bias=with_bias,
eval_locals=eval_locals,
eval_for_output_loss=eval_for_output_loss,
30 changes: 19 additions & 11 deletions nn/math_.py
@@ -3,50 +3,58 @@
(potential activation functions).
"""

from .base import LayerRef, Layer
from ._generated_layers import activation
from .base import LayerRef, Layer, make_layer


def relu(x: LayerRef) -> Layer:
"""ReLU"""
return activation(x, activation="relu")
return _activation(x, activation="relu")


def elu(x: LayerRef) -> Layer:
"""ELU https://arxiv.org/abs/1511.07289"""
return activation(x, activation="elu")
return _activation(x, activation="elu")


def selu(x: LayerRef) -> Layer:
"""SELU https://arxiv.org/abs/1706.02515"""
return activation(x, activation="selu")
return _activation(x, activation="selu")


def gelu(x: LayerRef) -> Layer:
"""GELU https://arxiv.org/abs/1606.08415"""
return activation(x, activation="gelu")
return _activation(x, activation="gelu")


def exp(x: LayerRef) -> Layer:
"""exp"""
return activation(x, activation="exp")
return _activation(x, activation="exp")


def log(x: LayerRef) -> Layer:
"""log"""
return activation(x, activation="log")
return _activation(x, activation="log")


def tanh(x: LayerRef) -> Layer:
"""tanh"""
return activation(x, activation="tanh")
return _activation(x, activation="tanh")


def sigmoid(x: LayerRef) -> Layer:
"""sigmoid"""
return activation(x, activation="sigmoid")
return _activation(x, activation="sigmoid")


def swish(x: LayerRef) -> Layer:
"""swish"""
return activation(x, activation="swish")
return _activation(x, activation="swish")


def _activation(x: LayerRef, activation: str) -> Layer:
"""
RETURNN ActivationLayer.
Only for internal use.
If anything is missing here in this module, please just add it.
"""
return make_layer({"class": "activation", "from": x, "activation": activation}, name=activation)
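As a usage note: each helper above now builds the RETURNN ActivationLayer dict directly via `make_layer`, so e.g. `relu(x)` is roughly equivalent to `make_layer({"class": "activation", "from": x, "activation": "relu"}, name="relu")`. Adding a missing activation follows the same one-line pattern; a sketch, assuming "softplus" is among the names supported by :func:`returnn.tf.util.basic.get_activation_function`:

# Sketch following the pattern of nn/math_.py above; "softplus" is assumed to
# be a valid RETURNN activation name.
def softplus(x: LayerRef) -> Layer:
  """softplus"""
  return _activation(x, activation="softplus")
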
