Commit

Force DenseWithSparseWeights to produce dense output and use all inputs (#8011)

* Force dense output for DenseLayerWithSparseWeights

* Fix type error

* Draft `LocallyConnectedDense`

* Develop `LocallyConnectedDense`

* Trying things...

* Use `density` instead of `sparsity`

* Ensure all inputs are connected

* Rename connection_density

* Fix doc strings

* Add changelog

* Fix change log

* Fix missing rename

* Fix doc string formatting

* Add more doc strings

* Fix more doc strings

* Deprecate WEIGHT_SPARSITY

* Simplify

* Simplify

* Update doc strings

* Add info about components

* Adjust num_extra_connections

* Add test

* Fix strings

* Change doc strings

* Use cast-uniform mask generation

* Remove obsolete code

* Simplify

* Add migration guide

* Lint

* Increase epochs in test config

We need more epochs when we don't have much data. Otherwise, tests fail depending on the seed.

* Fix heading depth

* Use increased learning rate instead of epochs

* Fix merge

Johannes E. M. Mosig authored Apr 29, 2021
1 parent 5cb7fe0 commit 6a73b2e
Showing 15 changed files with 268 additions and 84 deletions.
5 changes: 5 additions & 0 deletions changelog/7999.improvement.md
@@ -0,0 +1,5 @@
Replace `weight_sparsity` with `connection_density` in all transformer-based models and add guarantees about internal layers.

We rename `DenseWithSparseWeights` to `RandomlyConnectedDense` and guarantee that, even at density zero, the output is dense and every input is connected to at least one output. The former `weight_sparsity` parameter of DIET, TED, and the ResponseSelector is now roughly equivalent to `1 - connection_density`, except at very low densities (high sparsities).

All layers and components that used to have a `sparsity` argument (`Ffnn`, `TransformerRasaModel`, `MultiHeadAttention`, `TransformerEncoderLayer`, `TransformerEncoder`) now have a `density` argument instead.
4 changes: 3 additions & 1 deletion data/test_response_selector_bot/config.yml
@@ -6,9 +6,11 @@ pipeline:
- name: "DIETClassifier"
entity_recognition: False
epochs: 8
random_seed: 42
learning_rate: 0.01
random_seed: 2021
- name: ResponseSelector
epochs: 5
learning_rate: 0.01
random_seed: 42

policies:
11 changes: 11 additions & 0 deletions docs/docs/migration-guide.mdx
@@ -69,6 +69,17 @@ forms:

### Machine Learning Components

#### `DIET`, `TED`, and `ResponseSelector`

The former `weight_sparsity` parameter of the `DIETClassifier`, `TEDPolicy`, and the `ResponseSelector`
is now deprecated and superseded by the new `connection_density` parameter.
The old `weight_sparsity` is roughly equivalent to `1 - connection_density`, except at very low densities
(high sparsities).

To avoid deprecation issues, you should set `connection_density` to
`1 - <your former weight_sparsity setting>` throughout the config file. (If you left
`weight_sparsity` at its default setting, you don't need to do anything.)
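For illustration only (this snippet is not part of the commit, and the component values are made up), the conversion amounts to:

```python
# Hypothetical illustration of the migration: translate an old-style
# `weight_sparsity` setting into the new `connection_density` parameter.
old_component = {"name": "DIETClassifier", "epochs": 100, "weight_sparsity": 0.8}

new_component = dict(old_component)
new_component["connection_density"] = 1.0 - new_component.pop("weight_sparsity")

print(new_component)
# {'name': 'DIETClassifier', 'epochs': 100, 'connection_density': 0.2}
```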

#### SpaCy 3.0

Rasa now supports spaCy 3.0. This means that we can support more features for more
9 changes: 6 additions & 3 deletions rasa/core/policies/ted_policy.py
@@ -79,7 +79,7 @@
DROP_RATE_LABEL,
DROP_RATE,
DROP_RATE_ATTENTION,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
KEY_RELATIVE_ATTENTION,
VALUE_RELATIVE_ATTENTION,
MAX_RELATIVE_POSITION,
@@ -252,8 +252,8 @@ class TEDPolicy(Policy):
DROP_RATE_LABEL: 0.0,
# Dropout rate for attention.
DROP_RATE_ATTENTION: 0.0,
# Sparsity of the weights in dense layers
WEIGHT_SPARSITY: 0.8,
# Fraction of trainable weights in internal layers.
CONNECTION_DENSITY: 0.2,
# If 'True' apply dropout to sparse input tensors
SPARSE_INPUT_DROPOUT: True,
# If 'True' apply dropout to dense input tensors
@@ -366,6 +366,9 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
self.config = rasa.utils.train_utils.update_deprecated_loss_type(self.config)
self.config = rasa.utils.train_utils.update_similarity_type(self.config)
self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config)
self.config = rasa.utils.train_utils.update_deprecated_sparsity_to_density(
self.config
)

def _create_label_data(
self, domain: Domain, interpreter: NaturalLanguageInterpreter
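The body of `rasa.utils.train_utils.update_deprecated_sparsity_to_density`, called above, is not included in this diff. A minimal sketch of the behaviour it presumably implements (translate a legacy `weight_sparsity` value and warn about the deprecation) could look like the following; the use of `warnings.warn` and the exact message are assumptions:

```python
import warnings
from typing import Any, Dict, Text

from rasa.utils.tensorflow.constants import CONNECTION_DENSITY, WEIGHT_SPARSITY


def update_deprecated_sparsity_to_density(config: Dict[Text, Any]) -> Dict[Text, Any]:
    """Translates a deprecated `weight_sparsity` setting into `connection_density`."""
    if config.get(WEIGHT_SPARSITY) is not None:
        # Assumed warning mechanism; the real helper may report this differently.
        warnings.warn(
            f"`{WEIGHT_SPARSITY}` is deprecated, please use `{CONNECTION_DENSITY}` instead.",
            FutureWarning,
        )
        config[CONNECTION_DENSITY] = 1.0 - config.pop(WEIGHT_SPARSITY)
    return config
```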
10 changes: 7 additions & 3 deletions rasa/nlu/classifiers/diet_classifier.py
@@ -72,7 +72,7 @@
UNIDIRECTIONAL_ENCODER,
DROP_RATE,
DROP_RATE_ATTENTION,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
NEGATIVE_MARGIN_SCALE,
REGULARIZATION_CONSTANT,
SCALE_LOSS,
@@ -207,8 +207,8 @@ def required_components(cls) -> List[Type[Component]]:
DROP_RATE: 0.2,
# Dropout rate for attention
DROP_RATE_ATTENTION: 0,
# Sparsity of the weights in dense layers
WEIGHT_SPARSITY: 0.8,
# Fraction of trainable weights in internal layers.
CONNECTION_DENSITY: 0.2,
# If 'True' apply dropout to sparse input tensors
SPARSE_INPUT_DROPOUT: True,
# If 'True' apply dropout to dense input tensors
@@ -304,6 +304,10 @@ def _check_config_parameters(self) -> None:
self.component_config
)

self.component_config = train_utils.update_deprecated_sparsity_to_density(
self.component_config
)

self.component_config = train_utils.update_similarity_type(
self.component_config
)
6 changes: 3 additions & 3 deletions rasa/nlu/selectors/response_selector.py
@@ -51,7 +51,7 @@
UNIDIRECTIONAL_ENCODER,
DROP_RATE,
DROP_RATE_ATTENTION,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
NEGATIVE_MARGIN_SCALE,
REGULARIZATION_CONSTANT,
SCALE_LOSS,
@@ -196,8 +196,8 @@ def required_components(cls) -> List[Type[Component]]:
# ## Regularization parameters
# The scale of regularization
REGULARIZATION_CONSTANT: 0.002,
# Sparsity of the weights in dense layers
WEIGHT_SPARSITY: 0.0,
# Fraction of trainable weights in internal layers.
CONNECTION_DENSITY: 1.0,
# The scale of how important is to minimize the maximum similarity
# between embeddings of different labels.
NEGATIVE_MARGIN_SCALE: 0.8,
3 changes: 3 additions & 0 deletions rasa/shared/constants.py
@@ -25,6 +25,9 @@
DOCS_URL_MIGRATION_GUIDE_MD_DEPRECATION = (
f"{DOCS_URL_MIGRATION_GUIDE}#rasa-21-to-rasa-22"
)
DOCS_URL_MIGRATION_GUIDE_WEIGHT_SPARSITY = (
f"{DOCS_URL_MIGRATION_GUIDE}#rasa-24-to-rasa-25"
)
DOCS_URL_TELEMETRY = DOCS_BASE_URL + "/telemetry/telemetry"
DOCS_BASE_URL_RASA_X = "https://rasa.com/docs/rasa-x"
DOCS_BASE_URL_ACTION_SERVER = "https://rasa.com/docs/action-server"
3 changes: 2 additions & 1 deletion rasa/utils/tensorflow/constants.py
@@ -40,7 +40,8 @@
DROP_RATE_LABEL = "drop_rate_label"
CONSTRAIN_SIMILARITIES = "constrain_similarities"

WEIGHT_SPARSITY = "weight_sparsity"
WEIGHT_SPARSITY = "weight_sparsity" # Deprecated and superseeded by CONNECTION_DENSITY
CONNECTION_DENSITY = "connection_density"

EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
153 changes: 117 additions & 36 deletions rasa/utils/tensorflow/layers.py
@@ -154,38 +154,26 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
return outputs


class DenseWithSparseWeights(tf.keras.layers.Dense):
"""Just your regular densely-connected NN layer but with sparse weights.
class RandomlyConnectedDense(tf.keras.layers.Dense):
"""Layer with dense ouputs that are connected to a random subset of inputs.
`Dense` implements the operation:
`RandomlyConnectedDense` implements the operation:
`output = activation(dot(input, kernel) + bias)`
where `activation` is the element-wise activation function
passed as the `activation` argument, `kernel` is a weights matrix
created by the layer, and `bias` is a bias vector created by the layer
(only applicable if `use_bias` is `True`).
It creates `kernel_mask` to set fraction of the `kernel` weights to zero.
It creates `kernel_mask` to set a fraction of the `kernel` weights to zero.
Note: If the input to the layer has a rank greater than 2, then
it is flattened prior to the initial dot product with `kernel`.
Arguments:
sparsity: Float between 0 and 1. Fraction of the `kernel`
weights to set to zero.
units: Positive integer, dimensionality of the output space.
activation: Activation function to use.
If you don't specify anything, no activation is applied
(ie. "linear" activation: `a(x) = x`).
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix.
bias_initializer: Initializer for the bias vector.
kernel_regularizer: Regularizer function applied to
the `kernel` weights matrix.
bias_regularizer: Regularizer function applied to the bias vector.
activity_regularizer: Regularizer function applied to
the output of the layer (its "activation")..
kernel_constraint: Constraint function applied to
the `kernel` weights matrix.
bias_constraint: Constraint function applied to the bias vector.
The output is guaranteed to be dense (each output is connected to at least one
input), and no input is disconnected (each input is connected to at least one
output).
At `density = 0.0` the number of trainable weights is `max(input_size, units)`. At
`density = 1.0` this layer is equivalent to `tf.keras.layers.Dense`.
Input shape:
N-D tensor with shape: `(batch_size, ..., input_dim)`.
@@ -198,24 +186,118 @@ class DenseWithSparseWeights(tf.keras.layers.Dense):
the output would have shape `(batch_size, units)`.
"""

def __init__(self, sparsity: float = 0.8, **kwargs: Any) -> None:
def __init__(self, density: float = 0.2, **kwargs: Any) -> None:
"""Declares instance variables with default values.
Args:
density: Float between 0 and 1. Approximate fraction of trainable weights.
units: Positive integer, dimensionality of the output space.
activation: Activation function to use.
If you don't specify anything, no activation is applied
(ie. "linear" activation: `a(x) = x`).
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix.
bias_initializer: Initializer for the bias vector.
kernel_regularizer: Regularizer function applied to
the `kernel` weights matrix.
bias_regularizer: Regularizer function applied to the bias vector.
activity_regularizer: Regularizer function applied to
the output of the layer (its "activation")..
kernel_constraint: Constraint function applied to
the `kernel` weights matrix.
bias_constraint: Constraint function applied to the bias vector.
"""
super().__init__(**kwargs)
self.sparsity = sparsity

if density < 0.0 or density > 1.0:
raise TFLayerConfigException("Layer density must be in [0, 1].")

self.density = density

def build(self, input_shape: tf.TensorShape) -> None:
"""Prepares the kernel mask.
Args:
input_shape: Shape of the inputs to this layer
"""
super().build(input_shape)
# create random mask to set fraction of the `kernel` weights to zero
kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1)
kernel_mask = tf.cast(
tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype
)

if self.density == 1.0:
self.kernel_mask = None
return

# Construct mask with given density and guarantee that every output is
# connected to at least one input
kernel_mask = self._minimal_mask() + self._random_mask()

# We might accidentally have added a random connection on top of
# a fixed connection
kernel_mask = tf.clip_by_value(kernel_mask, 0, 1)

self.kernel_mask = tf.Variable(
initial_value=kernel_mask, trainable=False, name="kernel_mask"
)

def _random_mask(self) -> tf.Tensor:
"""Creates a random matrix with `num_ones` 1s and 0s otherwise.
Returns:
A random mask matrix
"""
mask = tf.random.uniform(tf.shape(self.kernel), 0, 1)
mask = tf.cast(tf.math.less(mask, self.density), self.kernel.dtype)
return mask

def _minimal_mask(self) -> tf.Tensor:
"""Creates a matrix with a minimal number of 1s to connect everythinig.
If num_rows == num_cols, this creates the identity matrix.
If num_rows > num_cols, this creates
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
1 0 0 0
0 1 0 0
0 0 1 0
. . . .
. . . .
. . . .
If num_rows < num_cols, this creates
1 0 0 1 0 0 1 ...
0 1 0 0 1 0 0 ...
0 0 1 0 0 1 0 ...
Returns:
A tiled and cropped identity matrix.
"""
kernel_shape = tf.shape(self.kernel)
num_rows = kernel_shape[0]
num_cols = kernel_shape[1]
short_dimension = tf.minimum(num_rows, num_cols)

mask = tf.tile(
tf.eye(short_dimension, dtype=self.kernel.dtype),
[
tf.math.ceil(num_rows / short_dimension),
tf.math.ceil(num_cols / short_dimension),
],
)[:num_rows, :num_cols]

return mask

def call(self, inputs: tf.Tensor) -> tf.Tensor:
# set fraction of the `kernel` weights to zero according to precomputed mask
self.kernel.assign(self.kernel * self.kernel_mask)
"""Processes the given inputs.
Args:
inputs: What goes into this layer
Returns:
The processed inputs.
"""
if self.density < 1.0:
# Set fraction of the `kernel` weights to zero according to precomputed mask
self.kernel.assign(self.kernel * self.kernel_mask)
return super().call(inputs)


@@ -226,8 +308,7 @@ class Ffnn(tf.keras.layers.Layer):
layer_sizes: List of integers with dimensionality of the layers.
dropout_rate: Float between 0 and 1; fraction of the input units to drop.
reg_lambda: Float, regularization factor.
sparsity: Float between 0 and 1. Fraction of the `kernel`
weights to set to zero.
density: Float between 0 and 1. Approximate fraction of trainable weights.
layer_name_suffix: Text added to the name of the layers.
Input shape:
@@ -246,7 +327,7 @@ def __init__(
layer_sizes: List[int],
dropout_rate: float,
reg_lambda: float,
sparsity: float,
density: float,
layer_name_suffix: Text,
) -> None:
super().__init__(name=f"ffnn_{layer_name_suffix}")
@@ -255,9 +336,9 @@ def __init__(
self._ffn_layers = []
for i, layer_size in enumerate(layer_sizes):
self._ffn_layers.append(
DenseWithSparseWeights(
RandomlyConnectedDense(
units=layer_size,
sparsity=sparsity,
density=density,
activation=tfa.activations.gelu,
kernel_regularizer=l2_regularizer,
name=f"hidden_layer_{layer_name_suffix}_{i}",
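As a side note (not part of the commit), the mask construction in `RandomlyConnectedDense.build()` can be reproduced with a small standalone sketch; `build_kernel_mask` is a hypothetical helper name and the shapes below are made up:

```python
# Standalone NumPy sketch of the kernel-mask construction: a tiled identity
# guarantees that every input and every output has at least one connection,
# a random mask adds roughly a `density` fraction of extra connections, and
# clipping removes connections that were counted twice.
import numpy as np


def build_kernel_mask(input_size: int, units: int, density: float, seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(seed)
    short = min(input_size, units)

    # Minimal mask: tiled-and-cropped identity matrix.
    reps = (int(np.ceil(input_size / short)), int(np.ceil(units / short)))
    minimal = np.tile(np.eye(short), reps)[:input_size, :units]

    # Random mask: each weight is kept with probability `density`.
    random = (rng.uniform(size=(input_size, units)) < density).astype(float)

    # A random connection may coincide with a minimal one, so clip to {0, 1}.
    return np.clip(minimal + random, 0.0, 1.0)


mask = build_kernel_mask(input_size=6, units=4, density=0.2)
print(mask.sum(axis=1).min(), mask.sum(axis=0).min())  # both >= 1
```

At the layer level, usage presumably mirrors `tf.keras.layers.Dense`, e.g. `RandomlyConnectedDense(units=256, density=0.2)` in place of the former `DenseWithSparseWeights(units=256, sparsity=0.8)`.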
4 changes: 2 additions & 2 deletions rasa/utils/tensorflow/models.py
@@ -23,7 +23,7 @@
EMBEDDING_DIMENSION,
REGULARIZATION_CONSTANT,
SIMILARITY_TYPE,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
NUM_NEG,
LOSS_TYPE,
MAX_POS_SIM,
@@ -570,7 +570,7 @@ def _prepare_ffnn_layer(
layer_sizes,
drop_rate,
self.config[REGULARIZATION_CONSTANT],
self.config[WEIGHT_SPARSITY],
self.config[CONNECTION_DENSITY],
layer_name_suffix=name,
)

8 changes: 4 additions & 4 deletions rasa/utils/tensorflow/rasa_layers.py
@@ -6,7 +6,7 @@
from rasa.utils.tensorflow.model_data import FeatureSignature
from rasa.utils.tensorflow.constants import (
REGULARIZATION_CONSTANT,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
NUM_TRANSFORMER_LAYERS,
TRANSFORMER_SIZE,
NUM_HEADS,
@@ -347,7 +347,7 @@ def _prepare_sequence_sentence_concat(
layer_sizes=[config[CONCAT_DIMENSION][attribute]],
dropout_rate=config[DROP_RATE],
reg_lambda=config[REGULARIZATION_CONSTANT],
sparsity=config[WEIGHT_SPARSITY],
density=config[CONNECTION_DENSITY],
)

def _calculate_output_units(self, attribute: Text, config: Dict[Text, Any]) -> int:
@@ -635,7 +635,7 @@ def __init__(
config[HIDDEN_LAYERS_SIZES][attribute],
config[DROP_RATE],
config[REGULARIZATION_CONSTANT],
config[WEIGHT_SPARSITY],
config[CONNECTION_DENSITY],
layer_name_suffix=attribute,
),
}
@@ -940,7 +940,7 @@ def prepare_transformer_layer(
config[REGULARIZATION_CONSTANT],
dropout_rate=drop_rate,
attention_dropout_rate=config[DROP_RATE_ATTENTION],
sparsity=config[WEIGHT_SPARSITY],
density=config[CONNECTION_DENSITY],
unidirectional=unidirectional,
use_key_relative_position=config[KEY_RELATIVE_ATTENTION],
use_value_relative_position=config[VALUE_RELATIVE_ATTENTION],