Commit

Force DenseWithSparseWeights to produce dense output and use all inputs (#8011)

* Force dense output for DenseLayerWithSparseWeights

* Fix type error

* Draft `LocallyConnectedDense`

* Develop `LocallyConnectedDense`

* Trying things...

* Use `density` instead of `sparsity`

* Ensure all inputs are connected

* Rename connection_density

* Fix doc strings

* Add changelog

* Fix change log

* Fix missing rename

* Fix doc string formatting

* Add more doc strings

* Fix more doc strings

* Deprecate WEIGHT_SPARSITY

* Simplify

* Simplify

* Update doc strings

* Add info about components

* Adjust num_extra_connections

* Add test

* Fix strings

* Change doc strings

* Use cast-uniform mask generation

* Remove obsolete code

* Simplify

* Add migration guide

* Lint

* Increase epochs in test config

We need more epochs when we don't have much data. Otherwise, tests fail depending on the seed.

* Fix heading depth

* Use increased learning rate instead of epochs

* Fix merge

Johannes E. M. Mosig authored Apr 29, 2021
1 parent 5cb7fe0 commit 6a73b2e
Showing 15 changed files with 268 additions and 84 deletions.
5 changes: 5 additions & 0 deletions changelog/7999.improvement.md
@@ -0,0 +1,5 @@
Replace `weight_sparsity` with `connection_density` in all transformer-based models and add guarantees about internal layers.

We rename `DenseWithSparseWeights` to `RandomlyConnectedDense` and guarantee that, even at density zero, the output is dense and every input is connected to at least one output. The former `weight_sparsity` parameter of DIET, TED, and the ResponseSelector is now roughly equivalent to `1 - connection_density`, except at very low densities (high sparsities).

All layers and components that used to have a `sparsity` argument (`Ffnn`, `TransformerRasaModel`, `MultiHeadAttention`, `TransformerEncoderLayer`, `TransformerEncoder`) now have a `density` argument instead.
4 changes: 3 additions & 1 deletion data/test_response_selector_bot/config.yml
@@ -6,9 +6,11 @@ pipeline:
- name: "DIETClassifier"
entity_recognition: False
epochs: 8
random_seed: 42
learning_rate: 0.01
random_seed: 2021
- name: ResponseSelector
epochs: 5
learning_rate: 0.01
random_seed: 42

policies:
11 changes: 11 additions & 0 deletions docs/docs/migration-guide.mdx
@@ -69,6 +69,17 @@ forms:

### Machine Learning Components

#### `DIET`, `TED`, and `ResponseSelector`

The former `weight_sparsity` parameter of the `DIETClassifier`, `TEDPolicy`, and the `ResponseSelector`
is now deprecated and superseded by the new `connection_density` parameter.
The old `weight_sparsity` is roughly equivalent to `1 - connection_density`, except at very low densities
(high sparsities).

To avoid deprecation issues, you should set `connection_density` to
`1 - <your former weight_sparsity setting>` throughout the config file. (If you left
`weight_sparsity` at its default setting, you don't need to do anything.)
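For illustration only (this snippet is not part of the commit, and the component values are made up), the conversion amounts to:

```python
# Hypothetical illustration of the migration: translate an old-style
# `weight_sparsity` setting into the new `connection_density` parameter.
old_component = {"name": "DIETClassifier", "epochs": 100, "weight_sparsity": 0.8}

new_component = dict(old_component)
new_component["connection_density"] = 1.0 - new_component.pop("weight_sparsity")

print(new_component)
# {'name': 'DIETClassifier', 'epochs': 100, 'connection_density': 0.2}
```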

#### SpaCy 3.0

Rasa now supports spaCy 3.0. This means that we can support more features for more
9 changes: 6 additions & 3 deletions rasa/core/policies/ted_policy.py
@@ -79,7 +79,7 @@
DROP_RATE_LABEL,
DROP_RATE,
DROP_RATE_ATTENTION,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
KEY_RELATIVE_ATTENTION,
VALUE_RELATIVE_ATTENTION,
MAX_RELATIVE_POSITION,
@@ -252,8 +252,8 @@ class TEDPolicy(Policy):
DROP_RATE_LABEL: 0.0,
# Dropout rate for attention.
DROP_RATE_ATTENTION: 0.0,
# Sparsity of the weights in dense layers
WEIGHT_SPARSITY: 0.8,
# Fraction of trainable weights in internal layers.
CONNECTION_DENSITY: 0.2,
# If 'True' apply dropout to sparse input tensors
SPARSE_INPUT_DROPOUT: True,
# If 'True' apply dropout to dense input tensors
@@ -366,6 +366,9 @@ def _load_params(self, **kwargs: Dict[Text, Any]) -> None:
self.config = rasa.utils.train_utils.update_deprecated_loss_type(self.config)
self.config = rasa.utils.train_utils.update_similarity_type(self.config)
self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config)
self.config = rasa.utils.train_utils.update_deprecated_sparsity_to_density(
self.config
)

def _create_label_data(
self, domain: Domain, interpreter: NaturalLanguageInterpreter
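The body of `rasa.utils.train_utils.update_deprecated_sparsity_to_density`, called above, is not included in this diff. A minimal sketch of the behaviour it presumably implements (translate a legacy `weight_sparsity` value and warn about the deprecation) could look like the following; the use of `warnings.warn` and the exact message are assumptions:

```python
import warnings
from typing import Any, Dict, Text

from rasa.utils.tensorflow.constants import CONNECTION_DENSITY, WEIGHT_SPARSITY


def update_deprecated_sparsity_to_density(config: Dict[Text, Any]) -> Dict[Text, Any]:
    """Translates a deprecated `weight_sparsity` setting into `connection_density`."""
    if config.get(WEIGHT_SPARSITY) is not None:
        # Assumed warning mechanism; the real helper may report this differently.
        warnings.warn(
            f"`{WEIGHT_SPARSITY}` is deprecated, please use `{CONNECTION_DENSITY}` instead.",
            FutureWarning,
        )
        config[CONNECTION_DENSITY] = 1.0 - config.pop(WEIGHT_SPARSITY)
    return config
```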
10 changes: 7 additions & 3 deletions rasa/nlu/classifiers/diet_classifier.py
@@ -72,7 +72,7 @@
UNIDIRECTIONAL_ENCODER,
DROP_RATE,
DROP_RATE_ATTENTION,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
NEGATIVE_MARGIN_SCALE,
REGULARIZATION_CONSTANT,
SCALE_LOSS,
@@ -207,8 +207,8 @@ def required_components(cls) -> List[Type[Component]]:
DROP_RATE: 0.2,
# Dropout rate for attention
DROP_RATE_ATTENTION: 0,
# Sparsity of the weights in dense layers
WEIGHT_SPARSITY: 0.8,
# Fraction of trainable weights in internal layers.
CONNECTION_DENSITY: 0.2,
# If 'True' apply dropout to sparse input tensors
SPARSE_INPUT_DROPOUT: True,
# If 'True' apply dropout to dense input tensors
@@ -304,6 +304,10 @@ def _check_config_parameters(self) -> None:
self.component_config
)

self.component_config = train_utils.update_deprecated_sparsity_to_density(
self.component_config
)

self.component_config = train_utils.update_similarity_type(
self.component_config
)
6 changes: 3 additions & 3 deletions rasa/nlu/selectors/response_selector.py
@@ -51,7 +51,7 @@
UNIDIRECTIONAL_ENCODER,
DROP_RATE,
DROP_RATE_ATTENTION,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
NEGATIVE_MARGIN_SCALE,
REGULARIZATION_CONSTANT,
SCALE_LOSS,
@@ -196,8 +196,8 @@ def required_components(cls) -> List[Type[Component]]:
# ## Regularization parameters
# The scale of regularization
REGULARIZATION_CONSTANT: 0.002,
# Sparsity of the weights in dense layers
WEIGHT_SPARSITY: 0.0,
# Fraction of trainable weights in internal layers.
CONNECTION_DENSITY: 1.0,
# The scale of how important is to minimize the maximum similarity
# between embeddings of different labels.
NEGATIVE_MARGIN_SCALE: 0.8,
3 changes: 3 additions & 0 deletions rasa/shared/constants.py
@@ -25,6 +25,9 @@
DOCS_URL_MIGRATION_GUIDE_MD_DEPRECATION = (
f"{DOCS_URL_MIGRATION_GUIDE}#rasa-21-to-rasa-22"
)
DOCS_URL_MIGRATION_GUIDE_WEIGHT_SPARSITY = (
f"{DOCS_URL_MIGRATION_GUIDE}#rasa-24-to-rasa-25"
)
DOCS_URL_TELEMETRY = DOCS_BASE_URL + "/telemetry/telemetry"
DOCS_BASE_URL_RASA_X = "https://rasa.com/docs/rasa-x"
DOCS_BASE_URL_ACTION_SERVER = "https://rasa.com/docs/action-server"
3 changes: 2 additions & 1 deletion rasa/utils/tensorflow/constants.py
@@ -40,7 +40,8 @@
DROP_RATE_LABEL = "drop_rate_label"
CONSTRAIN_SIMILARITIES = "constrain_similarities"

WEIGHT_SPARSITY = "weight_sparsity"
WEIGHT_SPARSITY = "weight_sparsity" # Deprecated and superseeded by CONNECTION_DENSITY
CONNECTION_DENSITY = "connection_density"

EVAL_NUM_EPOCHS = "evaluate_every_number_of_epochs"
EVAL_NUM_EXAMPLES = "evaluate_on_number_of_examples"
153 changes: 117 additions & 36 deletions rasa/utils/tensorflow/layers.py
@@ -154,38 +154,26 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor:
return outputs


class DenseWithSparseWeights(tf.keras.layers.Dense):
"""Just your regular densely-connected NN layer but with sparse weights.
class RandomlyConnectedDense(tf.keras.layers.Dense):
"""Layer with dense ouputs that are connected to a random subset of inputs.
`Dense` implements the operation:
`RandomlyConnectedDense` implements the operation:
`output = activation(dot(input, kernel) + bias)`
where `activation` is the element-wise activation function
passed as the `activation` argument, `kernel` is a weights matrix
created by the layer, and `bias` is a bias vector created by the layer
(only applicable if `use_bias` is `True`).
It creates `kernel_mask` to set fraction of the `kernel` weights to zero.
It creates `kernel_mask` to set a fraction of the `kernel` weights to zero.
Note: If the input to the layer has a rank greater than 2, then
it is flattened prior to the initial dot product with `kernel`.
Arguments:
sparsity: Float between 0 and 1. Fraction of the `kernel`
weights to set to zero.
units: Positive integer, dimensionality of the output space.
activation: Activation function to use.
If you don't specify anything, no activation is applied
(ie. "linear" activation: `a(x) = x`).
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix.
bias_initializer: Initializer for the bias vector.
kernel_regularizer: Regularizer function applied to
the `kernel` weights matrix.
bias_regularizer: Regularizer function applied to the bias vector.
activity_regularizer: Regularizer function applied to
the output of the layer (its "activation")..
kernel_constraint: Constraint function applied to
the `kernel` weights matrix.
bias_constraint: Constraint function applied to the bias vector.
The output is guaranteed to be dense (each output is connected to at least one
input), and no input is disconnected (each input is connected to at least one
output).
At `density = 0.0` the number of trainable weights is `max(input_size, units)`. At
`density = 1.0` this layer is equivalent to `tf.keras.layers.Dense`.
Input shape:
N-D tensor with shape: `(batch_size, ..., input_dim)`.
@@ -198,24 +186,118 @@ class DenseWithSparseWeights(tf.keras.layers.Dense):
the output would have shape `(batch_size, units)`.
"""

def __init__(self, sparsity: float = 0.8, **kwargs: Any) -> None:
def __init__(self, density: float = 0.2, **kwargs: Any) -> None:
"""Declares instance variables with default values.
Args:
density: Float between 0 and 1. Approximate fraction of trainable weights.
units: Positive integer, dimensionality of the output space.
activation: Activation function to use.
If you don't specify anything, no activation is applied
(ie. "linear" activation: `a(x) = x`).
use_bias: Boolean, whether the layer uses a bias vector.
kernel_initializer: Initializer for the `kernel` weights matrix.
bias_initializer: Initializer for the bias vector.
kernel_regularizer: Regularizer function applied to
the `kernel` weights matrix.
bias_regularizer: Regularizer function applied to the bias vector.
activity_regularizer: Regularizer function applied to
the output of the layer (its "activation")..
kernel_constraint: Constraint function applied to
the `kernel` weights matrix.
bias_constraint: Constraint function applied to the bias vector.
"""
super().__init__(**kwargs)
self.sparsity = sparsity

if density < 0.0 or density > 1.0:
raise TFLayerConfigException("Layer density must be in [0, 1].")

self.density = density

def build(self, input_shape: tf.TensorShape) -> None:
"""Prepares the kernel mask.
Args:
input_shape: Shape of the inputs to this layer
"""
super().build(input_shape)
# create random mask to set fraction of the `kernel` weights to zero
kernel_mask = tf.random.uniform(tf.shape(self.kernel), 0, 1)
kernel_mask = tf.cast(
tf.greater_equal(kernel_mask, self.sparsity), self.kernel.dtype
)

if self.density == 1.0:
self.kernel_mask = None
return

# Construct mask with given density and guarantee that every output is
# connected to at least one input
kernel_mask = self._minimal_mask() + self._random_mask()

# We might accidentally have added a random connection on top of
# a fixed connection
kernel_mask = tf.clip_by_value(kernel_mask, 0, 1)

self.kernel_mask = tf.Variable(
initial_value=kernel_mask, trainable=False, name="kernel_mask"
)

def _random_mask(self) -> tf.Tensor:
"""Creates a random matrix with `num_ones` 1s and 0s otherwise.
Returns:
A random mask matrix
"""
mask = tf.random.uniform(tf.shape(self.kernel), 0, 1)
mask = tf.cast(tf.math.less(mask, self.density), self.kernel.dtype)
return mask

def _minimal_mask(self) -> tf.Tensor:
"""Creates a matrix with a minimal number of 1s to connect everythinig.
If num_rows == num_cols, this creates the identity matrix.
If num_rows > num_cols, this creates
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
1 0 0 0
0 1 0 0
0 0 1 0
. . . .
. . . .
. . . .
If num_rows < num_cols, this creates
1 0 0 1 0 0 1 ...
0 1 0 0 1 0 0 ...
0 0 1 0 0 1 0 ...
Returns:
A tiled and cropped identity matrix.
"""
kernel_shape = tf.shape(self.kernel)
num_rows = kernel_shape[0]
num_cols = kernel_shape[1]
short_dimension = tf.minimum(num_rows, num_cols)

mask = tf.tile(
tf.eye(short_dimension, dtype=self.kernel.dtype),
[
tf.math.ceil(num_rows / short_dimension),
tf.math.ceil(num_cols / short_dimension),
],
)[:num_rows, :num_cols]

return mask

def call(self, inputs: tf.Tensor) -> tf.Tensor:
# set fraction of the `kernel` weights to zero according to precomputed mask
self.kernel.assign(self.kernel * self.kernel_mask)
"""Processes the given inputs.
Args:
inputs: What goes into this layer
Returns:
The processed inputs.
"""
if self.density < 1.0:
# Set fraction of the `kernel` weights to zero according to precomputed mask
self.kernel.assign(self.kernel * self.kernel_mask)
return super().call(inputs)


@@ -226,8 +308,7 @@ class Ffnn(tf.keras.layers.Layer):
layer_sizes: List of integers with dimensionality of the layers.
dropout_rate: Float between 0 and 1; fraction of the input units to drop.
reg_lambda: Float, regularization factor.
sparsity: Float between 0 and 1. Fraction of the `kernel`
weights to set to zero.
density: Float between 0 and 1. Approximate fraction of trainable weights.
layer_name_suffix: Text added to the name of the layers.
Input shape:
@@ -246,7 +327,7 @@ def __init__(
layer_sizes: List[int],
dropout_rate: float,
reg_lambda: float,
sparsity: float,
density: float,
layer_name_suffix: Text,
) -> None:
super().__init__(name=f"ffnn_{layer_name_suffix}")
@@ -255,9 +336,9 @@ def __init__(
self._ffn_layers = []
for i, layer_size in enumerate(layer_sizes):
self._ffn_layers.append(
DenseWithSparseWeights(
RandomlyConnectedDense(
units=layer_size,
sparsity=sparsity,
density=density,
activation=tfa.activations.gelu,
kernel_regularizer=l2_regularizer,
name=f"hidden_layer_{layer_name_suffix}_{i}",
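As a side note (not part of the commit), the mask construction in `RandomlyConnectedDense.build()` can be reproduced with a small standalone sketch; `build_kernel_mask` is a hypothetical helper name and the shapes below are made up:

```python
# Standalone NumPy sketch of the kernel-mask construction: a tiled identity
# guarantees that every input and every output has at least one connection,
# a random mask adds roughly a `density` fraction of extra connections, and
# clipping removes connections that were counted twice.
import numpy as np


def build_kernel_mask(input_size: int, units: int, density: float, seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(seed)
    short = min(input_size, units)

    # Minimal mask: tiled-and-cropped identity matrix.
    reps = (int(np.ceil(input_size / short)), int(np.ceil(units / short)))
    minimal = np.tile(np.eye(short), reps)[:input_size, :units]

    # Random mask: each weight is kept with probability `density`.
    random = (rng.uniform(size=(input_size, units)) < density).astype(float)

    # A random connection may coincide with a minimal one, so clip to {0, 1}.
    return np.clip(minimal + random, 0.0, 1.0)


mask = build_kernel_mask(input_size=6, units=4, density=0.2)
print(mask.sum(axis=1).min(), mask.sum(axis=0).min())  # both >= 1
```

At the layer level, usage presumably mirrors `tf.keras.layers.Dense`, e.g. `RandomlyConnectedDense(units=256, density=0.2)` in place of the former `DenseWithSparseWeights(units=256, sparsity=0.8)`.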
4 changes: 2 additions & 2 deletions rasa/utils/tensorflow/models.py
@@ -23,7 +23,7 @@
EMBEDDING_DIMENSION,
REGULARIZATION_CONSTANT,
SIMILARITY_TYPE,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
NUM_NEG,
LOSS_TYPE,
MAX_POS_SIM,
@@ -570,7 +570,7 @@ def _prepare_ffnn_layer(
layer_sizes,
drop_rate,
self.config[REGULARIZATION_CONSTANT],
self.config[WEIGHT_SPARSITY],
self.config[CONNECTION_DENSITY],
layer_name_suffix=name,
)

8 changes: 4 additions & 4 deletions rasa/utils/tensorflow/rasa_layers.py
@@ -6,7 +6,7 @@
from rasa.utils.tensorflow.model_data import FeatureSignature
from rasa.utils.tensorflow.constants import (
REGULARIZATION_CONSTANT,
WEIGHT_SPARSITY,
CONNECTION_DENSITY,
NUM_TRANSFORMER_LAYERS,
TRANSFORMER_SIZE,
NUM_HEADS,
@@ -347,7 +347,7 @@ def _prepare_sequence_sentence_concat(
layer_sizes=[config[CONCAT_DIMENSION][attribute]],
dropout_rate=config[DROP_RATE],
reg_lambda=config[REGULARIZATION_CONSTANT],
sparsity=config[WEIGHT_SPARSITY],
density=config[CONNECTION_DENSITY],
)

def _calculate_output_units(self, attribute: Text, config: Dict[Text, Any]) -> int:
@@ -635,7 +635,7 @@ def __init__(
config[HIDDEN_LAYERS_SIZES][attribute],
config[DROP_RATE],
config[REGULARIZATION_CONSTANT],
config[WEIGHT_SPARSITY],
config[CONNECTION_DENSITY],
layer_name_suffix=attribute,
),
}
@@ -940,7 +940,7 @@ def prepare_transformer_layer(
config[REGULARIZATION_CONSTANT],
dropout_rate=drop_rate,
attention_dropout_rate=config[DROP_RATE_ATTENTION],
sparsity=config[WEIGHT_SPARSITY],
density=config[CONNECTION_DENSITY],
unidirectional=unidirectional,
use_key_relative_position=config[KEY_RELATIVE_ATTENTION],
use_value_relative_position=config[VALUE_RELATIVE_ATTENTION],