diff --git a/changelog/6410.removal.md b/changelog/6410.removal.md
new file mode 100644
index 000000000000..e01679e4485f
--- /dev/null
+++ b/changelog/6410.removal.md
@@ -0,0 +1,5 @@
+`Domain.random_template_for` is deprecated and will be removed in Rasa Open Source
+3.0.0. Please use the `TemplatedNaturalLanguageGenerator` instead.
+
+`Domain.action_names` is deprecated and will be removed in Rasa Open Source
+3.0.0. Please use `Domain.action_names_or_texts` instead.
diff --git a/changelog/7436.improvement.md b/changelog/7436.improvement.md
new file mode 100644
index 000000000000..349c84fea7ce
--- /dev/null
+++ b/changelog/7436.improvement.md
@@ -0,0 +1,4 @@
+Make `rasa data validate stories` work for end-to-end stories.
+
+The `rasa data validate stories` function now considers the tokenized user text instead of the plain text that is part of a state.
+This is closer to what Rasa Core actually uses to distinguish states and thus captures more story structure problems.
diff --git a/changelog/7496.feature.md b/changelog/7496.feature.md
new file mode 100644
index 000000000000..ddb20b9bbbd9
--- /dev/null
+++ b/changelog/7496.feature.md
@@ -0,0 +1,36 @@
+Make [TED Policy](./policies.mdx#ted-policy) an end-to-end policy. Namely, make it possible to train TED on stories that contain
+intents and entities or user text, and bot actions or bot text.
+If you don't have text in your stories, TED will behave the same way as before.
+Add the possibility to predict entities using TED.
+
+Here's an example of a dialogue in the Rasa story format:
+
+```rasa-yaml
+stories:
+- story: collect restaurant booking info  # name of the story - just for debugging
+  steps:
+  - intent: greet                         # user message with no entities
+  - action: utter_ask_howcanhelp          # action that the bot should execute
+  - intent: inform                        # user message with entities
+    entities:
+    - location: "rome"
+    - price: "cheap"
+  - bot: On it                            # actual text that the bot can output
+  - action: utter_ask_cuisine
+  - user: I would like [spanish](cuisine). # actual text of the user message
+  - action: utter_ask_num_people
+```
+
+Some model options for `TEDPolicy` were renamed.
+Please update your configuration files using the following mapping: + +| Old model option | New model option | +|-----------------------------|--------------------------------------------------------| +|transformer_size |dictionary “transformer_size” with keys | +| |“text”, “action_text”, “label_action_text”, “dialogue” | +|number_of_transformer_layers |dictionary “number_of_transformer_layers” with keys | +| |“text”, “action_text”, “label_action_text”, “dialogue” | +|dense_dimension |dictionary “dense_dimension” with keys | +| |“text”, “action_text”, “label_action_text”, “intent”, | +| |“action_name”, “label_action_name”, “entities”, “slots”,| +| |“active_loop” | diff --git a/data/test_dialogues/default.json b/data/test_dialogues/default.json index 7875246bd5b9..11ac23fda5d5 100644 --- a/data/test_dialogues/default.json +++ b/data/test_dialogues/default.json @@ -45,7 +45,8 @@ "text": "Hi my name is Peter" }, "text": "Hi my name is Peter", - "timestamp": 1551953035.076376 + "timestamp": 1551953035.076376, + "use_text_for_featurization": false }, { "py/object": "rasa.shared.core.events.SlotSet", diff --git a/data/test_dialogues/formbot.json b/data/test_dialogues/formbot.json index 65aff91caa79..272ec05f8010 100644 --- a/data/test_dialogues/formbot.json +++ b/data/test_dialogues/formbot.json @@ -5,6 +5,7 @@ { "py/object":"rasa.shared.core.events.ActionExecuted", "action_name":"action_listen", + "action_text": null, "confidence":null, "policy":null, "timestamp":1551884035.892855, @@ -32,11 +33,13 @@ "text":"Hi I'm desperate to talk to you" }, "text":"Hi I'm desperate to talk to you", - "timestamp":1551884050.259948 + "timestamp":1551884050.259948, + "use_text_for_featurization": false }, { "py/object":"rasa.shared.core.events.ActionExecuted", "action_name":"utter_greet", + "action_text": null, "confidence":1.0, "policy":"policy_2_MemoizationPolicy", "timestamp":1551884060.466681, @@ -55,6 +58,7 @@ { "py/object":"rasa.shared.core.events.ActionExecuted", "action_name":"action_listen", + "action_text": null, "confidence":1.0, "policy":"policy_2_MemoizationPolicy", "timestamp":1551884061.9350882, @@ -104,11 +108,13 @@ "text":"I'm looking for an indian restaurant...in Bombay" }, "text":"I'm looking for an indian restaurant...in Bombay", - "timestamp":1551884090.9653602 + "timestamp":1551884090.9653602, + "use_text_for_featurization": false }, { "py/object":"rasa.shared.core.events.ActionExecuted", "action_name":"restaurant_form", + "action_text": null, "confidence":1.0, "policy":"policy_2_MemoizationPolicy", "timestamp":1551884095.542748, @@ -117,6 +123,7 @@ { "py/object":"rasa.shared.core.events.ActionExecuted", "action_name":"utter_slots_values", + "action_text": null, "confidence":1.0, "policy":"policy_2_MemoizationPolicy", "timestamp":1551884097.570883, @@ -135,6 +142,7 @@ { "py/object":"rasa.shared.core.events.ActionExecuted", "action_name":"action_listen", + "action_text": null, "confidence":1.0, "policy":"policy_2_MemoizationPolicy", "timestamp":1551884098.8006358, @@ -162,11 +170,13 @@ "text":"Let's just pretend everything went correctly" }, "text":"Let's just pretend everything went correctly", - "timestamp":1551884208.092693 + "timestamp":1551884208.092693, + "use_text_for_featurization": false }, { "py/object":"rasa.shared.core.events.ActionExecuted", "action_name":"action_deactivate_loop", + "action_text": null, "confidence":null, "policy":null, "timestamp":1551884214.951055, @@ -186,6 +196,7 @@ { "py/object":"rasa.shared.core.events.ActionExecuted", "action_name":"action_listen", + 
"action_text": null, "confidence":0.7680902069097734, "policy":"policy_0_TEDPolicy", "timestamp":1551884216.705635, diff --git a/data/test_dialogues/moodbot.json b/data/test_dialogues/moodbot.json index e65201396bd7..f7869cb773b9 100644 --- a/data/test_dialogues/moodbot.json +++ b/data/test_dialogues/moodbot.json @@ -59,7 +59,8 @@ "text":"Hi talk to me" }, "text":"Hi talk to me", - "timestamp":1551883971.410778 + "timestamp":1551883971.410778, + "use_text_for_featurization": false }, { "py/object":"rasa.shared.core.events.ActionExecuted", @@ -146,7 +147,8 @@ "text":"Super sad" }, "text":"Super sad", - "timestamp":1551883982.540276 + "timestamp":1551883982.540276, + "use_text_for_featurization": false }, { "py/object":"rasa.shared.core.events.ActionExecuted", @@ -243,7 +245,8 @@ "text":"No" }, "text":"No", - "timestamp":1551883989.0720608 + "timestamp":1551883989.0720608, + "use_text_for_featurization": false }, { "py/object":"rasa.shared.core.events.ActionExecuted", diff --git a/data/test_trackers/tracker_moodbot.json b/data/test_trackers/tracker_moodbot.json index 115c56dcde12..2921e6a1883c 100644 --- a/data/test_trackers/tracker_moodbot.json +++ b/data/test_trackers/tracker_moodbot.json @@ -37,6 +37,7 @@ "timestamp": 1517821726.211031, "event": "action", "name": "action_listen", + "action_text": null, "policy": null, "confidence": null }, @@ -75,6 +76,7 @@ "timestamp": 1517821726.200373, "event": "action", "name": "utter_greet", + "action_text": null, "policy": null, "confidence": null }, @@ -82,6 +84,7 @@ "timestamp": 1517821726.211038, "event": "action", "name": "action_listen", + "action_text": null, "policy": null, "confidence": null }, @@ -120,6 +123,7 @@ "timestamp": 1517821726.209908, "event": "action", "name": "utter_happy", + "action_text": null, "policy": "policy_1_TEDPolicy", "confidence": 0.8 }, @@ -127,6 +131,7 @@ "timestamp": 1517821726.211042, "event": "action", "name": "action_listen", + "action_text": null, "policy": "policy_2_MemoizationPolicy", "confidence": 1.0 } diff --git a/data/test_yaml_stories/stories_e2e.yml b/data/test_yaml_stories/stories_e2e.yml new file mode 100644 index 000000000000..b20be88ed78d --- /dev/null +++ b/data/test_yaml_stories/stories_e2e.yml @@ -0,0 +1,30 @@ +version: "2.0" + +stories: +- story: happy path (intent to action) + steps: + - intent: greet + - action: utter_greet + - intent: mood_great + - action: utter_happy + +- story: sad path (text to text) + steps: + - user: "[Hello](bla)" + - bot: "Welcome to moodbot. How are you feeling today?" + - user: "Horrible" + - bot: "Oh no! Here is a kitten photo. Did it help?" + - user: "Yes" + - bot: "Perfect" + +- story: sad path 2 (mixed) + steps: + - intent: greet + - action: utter_greet + - user: "I'm not happy" + - action: utter_cheer_up + - action: utter_did_that_help + - user: "Not at all" + - bot: "Goodbye!" + + diff --git a/data/test_yaml_stories/stories_hybrid_e2e.yml b/data/test_yaml_stories/stories_hybrid_e2e.yml new file mode 100644 index 000000000000..bdedd78c808a --- /dev/null +++ b/data/test_yaml_stories/stories_hybrid_e2e.yml @@ -0,0 +1,15 @@ +stories: +- story: My hybrid End-to-End story + steps: + # Regular story with labels + - intent: simple + - action: utter_greet + # Actual messages are given instead of labels + - user: "I am looking for a [Kenyan](cuisine) restaurant" + - bot: "good for you" + # Regular labeled events + - intent: goodbye + - action: utter_goodbye + # Actual messages are given instead of labels + - user: One more thing + - bot: What? 
diff --git a/data/test_yaml_stories/stories_simple.yml b/data/test_yaml_stories/stories_simple.yml
new file mode 100644
index 000000000000..4b5fe55fd88c
--- /dev/null
+++ b/data/test_yaml_stories/stories_simple.yml
@@ -0,0 +1,8 @@
+- story: simple_story
+  steps:
+  - intent: greet
+  - action: utter_greet
+  - intent: default
+  - action: utter_default
+  - intent: goodbye
+  - action: utter_goodbye
diff --git a/docs/docs/command-line-interface.mdx b/docs/docs/command-line-interface.mdx
index 01952c3efc37..85a1dfb24e05 100644
--- a/docs/docs/command-line-interface.mdx
+++ b/docs/docs/command-line-interface.mdx
@@ -391,6 +391,10 @@ rasa data validate stories
 :::note
 Running `rasa data validate` does **not** test if your [rules](./rules.mdx) are consistent with your stories. However, during training, the `RulePolicy` checks for conflicts between rules and stories. Any such conflict will abort training.
+
+Also, if you use end-to-end stories, validation might not capture all conflicts. Specifically, if two user inputs
+result in different tokens yet exactly the same featurization, then conflicting actions after these inputs
+may exist but will not be reported by the tool.
 :::
 To interrupt validation even for minor issues such as unused intents or responses, use the `--fail-on-warnings` flag.
diff --git a/docs/docs/components.mdx b/docs/docs/components.mdx
index e6764790ba90..fa23ed2d93cd 100644
--- a/docs/docs/components.mdx
+++ b/docs/docs/components.mdx
@@ -1485,7 +1485,7 @@ However, additional parameters exist that can be adapted.
 +=================================+==================+==============================================================+
 | hidden_layers_sizes             | text: []         | Hidden layer sizes for layers before the embedding layers    |
 |                                 | label: []        | for user messages and labels. The number of hidden layers is |
-|                                 |                  | equal to the length of the corresponding.                    |
+|                                 |                  | equal to the length of the corresponding list.               |
 +---------------------------------+------------------+--------------------------------------------------------------+
 | share_hidden_layers             | False            | Whether to share the hidden layer weights between user       |
 |                                 |                  | messages and labels.                                         |
@@ -1519,8 +1519,8 @@ However, additional parameters exist that can be adapted.
 +---------------------------------+------------------+--------------------------------------------------------------+
 | embedding_dimension             | 20               | Dimension size of embedding vectors.                         |
 +---------------------------------+------------------+--------------------------------------------------------------+
-| dense_dimension                 | text: 128        | Dense dimension for sparse features to use if no dense       |
-|                                 | label: 20        | features are present.                                        |
+| dense_dimension                 | text: 128        | Dense dimension for sparse features to use.                  |
+|                                 | label: 20        |                                                              |
 +---------------------------------+------------------+--------------------------------------------------------------+
 | concat_dimension                | text: 128        | Concat dimension for sequence and sentence features.         |
 |                                 | label: 20        |                                                              |
diff --git a/docs/docs/migration-guide.mdx b/docs/docs/migration-guide.mdx
index 56eced375460..1d51261010ae 100644
--- a/docs/docs/migration-guide.mdx
+++ b/docs/docs/migration-guide.mdx
@@ -12,8 +12,54 @@ how you can migrate from one version to another.
 ## Rasa 2.1 to Rasa 2.2
+### General
+
+`TEDPolicy`'s `transformer_size`, `number_of_transformer_layers`,
+and `dense_dimension` parameters have been renamed.
+Please update your configuration files using the following mapping:
+
+| Old Model Parameter          | New Model Parameter                                     |
+|------------------------------|---------------------------------------------------------|
+|`transformer_size`            |dictionary `transformer_size` with keys                  |
+|                              |`text`, `action_text`, `label_action_text`, `dialogue`  |
+|`number_of_transformer_layers`|dictionary `number_of_transformer_layers` with keys     |
+|                              |`text`, `action_text`, `label_action_text`, `dialogue`  |
+|`dense_dimension`             |dictionary `dense_dimension` with keys                   |
+|                              |`text`, `action_text`, `label_action_text`, `intent`,   |
+|                              |`action_name`, `label_action_name`, `entities`, `slots`,|
+|                              |`active_loop`                                            |
+
+For example:
+
+```yaml-rasa title="config.yml"
+policies:
+  - name: TEDPolicy
+    transformer_size:
+      text: 128
+      action_text: 128
+      label_action_text: 128
+      dialogue: 128
+    number_of_transformer_layers:
+      text: 1
+      action_text: 1
+      label_action_text: 1
+      dialogue: 1
+    dense_dimension:
+      text: 128
+      action_text: 128
+      label_action_text: 128
+      intent: 20
+      action_name: 20
+      label_action_name: 20
+      entities: 20
+      slots: 20
+      active_loop: 20
+```
+
+
 ### Deprecations
+#### Markdown Data
 Training and test data in Markdown format is now deprecated. This includes:
 - reading and writing of story files in Markdown format
 - reading and writing of NLU data in Markdown format
@@ -24,6 +70,7 @@ Support for Markdown data will be removed entirely in Rasa Open Source 3.0.0.
 Please convert your existing Markdown data by using the commands
 described [here](./migration-guide.mdx#training-data-files).
+
 ### Policies
 [Policies](./policies.mdx) now require a `**kwargs` argument in their constructor and `load` method.
 Policies without `**kwargs` will be supported until Rasa version `3.0.0`.
 However when using [incremental training](./command-line-interface.mdx#incremental-training)
 `**kwargs` **must** be included.
+
+#### Other
+
+* `Domain.random_template_for` is deprecated and will be removed in Rasa Open Source
+  3.0.0. Please use the `TemplatedNaturalLanguageGenerator` instead.
+* `Domain.action_names` is deprecated and will be removed in Rasa Open Source
+  3.0.0. Please use `Domain.action_names_or_texts` instead.
+
+
 ## Rasa 2.0 to Rasa 2.1
 ### Deprecations
diff --git a/docs/docs/policies.mdx b/docs/docs/policies.mdx
index fdcfb4290829..6dcd139907cf 100644
--- a/docs/docs/policies.mdx
+++ b/docs/docs/policies.mdx
@@ -80,25 +80,44 @@ Doing so can lead to unexpected and undesired bot behavior.
 ### TED Policy
-The Transformer Embedding Dialogue (TED) Policy is described in
+The Transformer Embedding Dialogue (TED) Policy is
+a multi-task architecture for next action prediction and entity
+recognition. The architecture consists of several transformer encoders which are shared for both tasks.
+A sequence of entity labels is predicted through a Conditional Random Field (CRF) tagging layer on top of the
+user sequence transformer encoder output corresponding to the input sequence of tokens.
+For next action prediction, the dialogue transformer encoder output and system action labels are embedded into a
+single semantic vector space. We use the dot-product loss to maximize the similarity with the target label and
+minimize similarities with negative samples.
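+
+As a minimal illustration of this dot-product loss idea (the `softmax` variant,
+which is the default `loss_type`), consider the following toy NumPy sketch; it is
+a simplified illustration, not Rasa's actual implementation:
+
+```python
+import numpy as np
+
+def similarity_loss(dialogue_emb, target_emb, negative_embs):
+    """Cross-entropy over dot-product similarities between the embedded
+    dialogue state and the embedded system actions (1 target, N negatives)."""
+    sim_pos = dialogue_emb @ target_emb
+    sims = np.concatenate(([sim_pos], negative_embs @ dialogue_emb))
+    # Maximizing the target similarity while minimizing the negatives'
+    # similarities is equivalent to minimizing this negative log-softmax.
+    return -sim_pos + np.log(np.sum(np.exp(sims)))
+```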
+
+If you want to learn more about the model, check out [our paper](https://arxiv.org/abs/1910.00486)
+and the video on our [youtube channel](https://www.youtube.com/watch?v=j90NvurJI4I&list=PL75e0qA87dlG-za8eLI6t0_Pbxafk-cxb&index=14&ab_channel=Rasa),
+where we explain the model architecture in detail.

-This policy has a pre-defined architecture, which comprises the
-following steps:
+The TED Policy architecture comprises the following steps:

-1. Concatenate user input (user intent and entities), previous system actions, slots and active forms for each time
-   step into an input vector to pre-transformer embedding layer.
+1. Concatenate features for
+   - user input (user intent and entities) or user text processed through a user sequence transformer encoder,
+   - previous system actions or bot utterances processed through a bot sequence transformer encoder,
+   - slots and active forms
+
+   for each time step into an input vector to the embedding layer that precedes the
+   dialogue transformer.

-2. Feed the input vector into a transformer.
+2. Feed the embedding of the input vector into the dialogue transformer encoder.
+
-3. Apply a dense layer to the output of the transformer to get embeddings of a dialogue for each time step.
+3. Apply a dense layer to the output of the dialogue transformer to get embeddings of the dialogue for each time step.

 4. Apply a dense layer to create embeddings for system actions for each time step.

 5. Calculate the similarity between the dialogue embedding and embedded system actions. This step is based on the
    [StarSpace](https://arxiv.org/abs/1709.03856) idea.

+6. Concatenate the token-level output of the user sequence transformer encoder
+   with the output of the dialogue transformer encoder for each time step.
+
+7. Apply the CRF algorithm to predict contextual entities for each user text input.
+
 **Configuration:**

 You can pass configuration parameters to the `TEDPolicy` using the `config.yml` file.
@@ -135,35 +154,20 @@ If you want to fine-tune your model, start by modifying the following parameters
     max_history: 8
 ```

-* `hidden_layers_sizes`:
-  This parameter allows you to define the number of feed forward layers and their output
-  dimensions for dialogues and intents (it defaults to: `dialogue: [], label: []`).
-  Every entry in the list corresponds to a feed forward layer.
-  For example, if you use the following configuration:
-
-  ```yaml-rasa title="config.yml"
-  policies:
-  - name: TEDPolicy
-    hidden_layers_sizes:
-      dialogue: [256, 128]
-  ```
-
-  Rasa Open Source will add two feed forward layers in front of the transformer.
-  The vectors of the input tokens (coming from the dialogue) will be passed on to those
-  layers. The first layer will have an output dimension of 256 and the second layer will have an output
-  dimension of 128. If an empty list is used (default behavior), no feed forward layer will be
-  added.
-  Make sure to use only positive integer values. Usually, numbers of power of two are used.
-  Also, it is usual practice to have decreasing values in the list: next value is smaller or equal to the
-  value before.
-
 * `number_of_transformer_layers`:
-  This parameter sets the number of transformer layers to use (default: `1`).
-  The number of transformer layers corresponds to the transformer blocks to use for the model.
+  This parameter sets the number of layers in the sequence transformer encoders for
+  user text, action text, and action label text, as well as in the dialogue transformer encoder
+  (defaults: `text: 1, action_text: 1, label_action_text: 1, dialogue: 1`).
+  The number of layers corresponds to the number of transformer blocks used in the model.

 * `transformer_size`:
-  This parameter sets the number of units in the transformer (default: `128`).
-  The vectors coming out of the transformers will have the given `transformer_size`.
+  This parameter sets the number of units in the sequence transformer encoders for
+  user text, action text, and action label text, as well as in the dialogue transformer encoder
+  (defaults: `text: 128, action_text: 128, label_action_text: 128, dialogue: 128`).
+  The vectors coming out of the transformer encoders will have the given `transformer_size`.

 * `weight_sparsity`:
   This parameter defines the fraction of kernel weights that are set to 0 for all feed forward layers
@@ -178,105 +182,144 @@ However, additional parameters exist that can be adapted.
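+
+For example, `number_of_transformer_layers` and `transformer_size` can be set
+per encoder in `config.yml`; the values shown here are the defaults:
+
+```yaml-rasa title="config.yml"
+policies:
+  - name: TEDPolicy
+    number_of_transformer_layers:
+      text: 1
+      action_text: 1
+      label_action_text: 1
+      dialogue: 1
+    transformer_size:
+      text: 128
+      action_text: 128
+      label_action_text: 128
+      dialogue: 128
+```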
More configurable parameters ``` -+---------------------------------+------------------+--------------------------------------------------------------+ -| Parameter | Default Value | Description | -+=================================+==================+==============================================================+ -| hidden_layers_sizes | dialogue: [] | Hidden layer sizes for layers before the embedding layers | -| | label: [] | for dialogue and labels. The number of hidden layers is | -| | | equal to the length of the corresponding. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| transformer_size | 128 | Number of units in transformer. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| number_of_transformer_layers | 1 | Number of transformer layers. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| number_of_attention_heads | 4 | Number of attention heads in transformer. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| use_key_relative_attention | False | If 'True' use key relative embeddings in attention. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| use_value_relative_attention | False | If 'True' use value relative embeddings in attention. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| max_relative_position | None | Maximum position for relative embeddings. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| batch_size | [64, 256] | Initial and final value for batch sizes. | -| | | Batch size will be linearly increased for each epoch. | -| | | If constant `batch_size` is required, pass an int, e.g. `8`. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| batch_strategy | "balanced" | Strategy used when creating batches. | -| | | Can be either 'sequence' or 'balanced'. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| epochs | 1 | Number of epochs to train. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| random_seed | None | Set random seed to any 'int' to get reproducible results. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| embedding_dimension | 20 | Dimension size of embedding vectors. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| number_of_negative_examples | 20 | The number of incorrect labels. The algorithm will minimize | -| | | their similarity to the user input during training. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | -| | | or 'inner'. 
| -+---------------------------------+------------------+--------------------------------------------------------------+ -| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| ranking_length | 10 | Number of top actions to normalize scores for loss type | -| | | 'softmax'. Set to 0 to turn off normalization. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | -| | | embedding vectors for correct labels. | -| | | Should be 0.0 < ... < 1.0 for 'cosine' similarity type. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| maximum_negative_similarity | -0.2 | Maximum negative similarity for incorrect labels. | -| | | Should be -1.0 < ... < 1.0 for 'cosine' similarity type. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| use_maximum_negative_similarity | True | If 'True' the algorithm only minimizes maximum similarity | -| | | over incorrect intent labels, used only if 'loss_type' is | -| | | set to 'margin'. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| scale_loss | True | Scale loss inverse proportionally to confidence of correct | -| | | prediction. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| regularization_constant | 0.001 | The scale of regularization. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| negative_margin_scale | 0.8 | The scale of how important it is to minimize the maximum | -| | | similarity between embeddings of different labels. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| drop_rate_dialogue | 0.1 | Dropout rate for embedding layers of dialogue features. | -| | | Value should be between 0 and 1. | -| | | The higher the value the higher the regularization effect. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| drop_rate_label | 0.0 | Dropout rate for embedding layers of label features. | -| | | Value should be between 0 and 1. | -| | | The higher the value the higher the regularization effect. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| drop_rate_attention | 0.0 | Dropout rate for attention. Value should be between 0 and 1. | -| | | The higher the value the higher the regularization effect. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| weight_sparsity | 0.8 | Sparsity of the weights in dense layers. | -| | | Value should be between 0 and 1. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| evaluate_every_number_of_epochs | 20 | How often to calculate validation accuracy. | -| | | Set to '-1' to evaluate just once at the end of training. 
| -+---------------------------------+------------------+--------------------------------------------------------------+ -| evaluate_on_number_of_examples | 0 | How many examples to use for hold out validation set. | -| | | Large values may hurt performance, e.g. model accuracy. | -| | | Keep at 0 if your data set contains a lot of unique examples | -| | | of dialogue turns. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| tensorboard_log_directory | None | If you want to use tensorboard to visualize training | -| | | metrics, set this option to a valid output directory. You | -| | | can view the training metrics after training in tensorboard | -| | | via 'tensorboard --logdir '. | -+---------------------------------+------------------+--------------------------------------------------------------+ -| tensorboard_log_level | "epoch" | Define when training metrics for tensorboard should be | -| | | logged. Either after every epoch ('epoch') or for every | -| | | training step ('minibatch'). | -+---------------------------------+------------------+--------------------------------------------------------------+ -| checkpoint_model | False | Save the best performing model during training. Models are | -| | | stored to the location specified by `--out`. Only the one | -| | | best model will be saved. | -| | | Requires `evaluate_on_number_of_examples > 0` and | -| | | `evaluate_every_number_of_epochs > 0` | -+---------------------------------+------------------+--------------------------------------------------------------+ ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| Parameter | Default Value | Description | ++=======================================+========================+==============================================================+ +| hidden_layers_sizes | text: [] | Hidden layer sizes for layers before the embedding layers | +| | action_text: [] | for user messages and bot messages in previous actions | +| | label_action_text: [] | and labels. The number of hidden layers is | +| | | equal to the length of the corresponding list. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| dense_dimension | text: 128 | Dense dimension for sparse features to use after they are | +| | action_text: 128 | converted into dense features. | +| | label_action_text: 128 | | +| | intent: 20 | | +| | action_name: 20 | | +| | label_action_name: 20 | | +| | entities: 20 | | +| | slots: 20 | | +| | active_loop: 20 | | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| concat_dimension | text: 128 | Common dimension to which sequence and sentence features of | +| | action_text: 128 | different dimensions get converted before concatenation. | +| | label_action_text: 128 | | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| encoding_dimension | 50 | Dimension size of embedding vectors | +| | | before the dialogue transformer encoder. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| transformer_size | text: 128 | Number of units in user text sequence transformer encoder. 
| +| | action_text: 128 | Number of units in bot text sequence transformer encoder. | +| | label_action_text: 128 | Number of units in bot text sequence transformer encoder. | +| | dialogue: 128 | Number of units in dialogue transformer encoder. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| number_of_transformer_layers | text: 1 | Number of layers in user text sequence transformer encoder. | +| | action_text: 1 | Number of layers in bot text sequence transformer encoder. | +| | label_action_text: 1 | Number of layers in bot text sequence transformer encoder. | +| | dialogue: 1 | Number of layers in dialogue transformer encoder. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| number_of_attention_heads | 4 | Number of self-attention heads in transformers. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_key_relative_attention | False | If 'True' use key relative embeddings in attention. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_value_relative_attention | False | If 'True' use value relative embeddings in attention. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| max_relative_position | None | Maximum position for relative embeddings. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| batch_size | [64, 256] | Initial and final value for batch sizes. | +| | | Batch size will be linearly increased for each epoch. | +| | | If constant `batch_size` is required, pass an int, e.g. `8`. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| batch_strategy | "balanced" | Strategy used when creating batches. | +| | | Can be either 'sequence' or 'balanced'. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| epochs | 1 | Number of epochs to train. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| random_seed | None | Set random seed to any 'int' to get reproducible results. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| embedding_dimension | 20 | Dimension size of dialogue & system action embedding vectors.| ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| number_of_negative_examples | 20 | The number of incorrect labels. The algorithm will minimize | +| | | their similarity to the user input during training. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| similarity_type | "auto" | Type of similarity measure to use, either 'auto' or 'cosine' | +| | | or 'inner'. 
| ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| loss_type | "softmax" | The type of the loss function, either 'softmax' or 'margin'. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| ranking_length | 10 | Number of top actions to normalize scores for loss type | +| | | 'softmax'. Set to 0 to turn off normalization. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| maximum_positive_similarity | 0.8 | Indicates how similar the algorithm should try to make | +| | | embedding vectors for correct labels. | +| | | Should be 0.0 < ... < 1.0 for 'cosine' similarity type. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| maximum_negative_similarity | -0.2 | Maximum negative similarity for incorrect labels. | +| | | Should be -1.0 < ... < 1.0 for 'cosine' similarity type. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_maximum_negative_similarity | True | If 'True' the algorithm only minimizes maximum similarity | +| | | over incorrect intent labels, used only if 'loss_type' is | +| | | set to 'margin'. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| scale_loss | True | Scale loss inverse proportionally to confidence of correct | +| | | prediction. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| regularization_constant | 0.001 | The scale of regularization. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| negative_margin_scale | 0.8 | The scale of how important it is to minimize the maximum | +| | | similarity between embeddings of different labels. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| drop_rate_dialogue | 0.1 | Dropout rate for embedding layers of dialogue features. | +| | | Value should be between 0 and 1. | +| | | The higher the value the higher the regularization effect. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| drop_rate_label | 0.0 | Dropout rate for embedding layers of label features. | +| | | Value should be between 0 and 1. | +| | | The higher the value the higher the regularization effect. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| drop_rate_attention | 0.0 | Dropout rate for attention. Value should be between 0 and 1. | +| | | The higher the value the higher the regularization effect. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| weight_sparsity | 0.8 | Sparsity of the weights in dense layers. | +| | | Value should be between 0 and 1. 
| ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_sparse_input_dropout | True | If 'True' apply dropout to sparse input tensors. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| use_dense_input_dropout | True | If 'True' apply dropout to sparse features after they are | +| | | converted into dense features. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| evaluate_every_number_of_epochs | 20 | How often to calculate validation accuracy. | +| | | Set to '-1' to evaluate just once at the end of training. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| evaluate_on_number_of_examples | 0 | How many examples to use for hold out validation set. | +| | | Large values may hurt performance, e.g. model accuracy. | +| | | Keep at 0 if your data set contains a lot of unique examples | +| | | of dialogue turns. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| tensorboard_log_directory | None | If you want to use tensorboard to visualize training | +| | | metrics, set this option to a valid output directory. You | +| | | can view the training metrics after training in tensorboard | +| | | via 'tensorboard --logdir '. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| tensorboard_log_level | "epoch" | Define when training metrics for tensorboard should be | +| | | logged. Either after every epoch ('epoch') or for every | +| | | training step ('minibatch'). | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| checkpoint_model | False | Save the best performing model during training. Models are | +| | | stored to the location specified by `--out`. Only the one | +| | | best model will be saved. | +| | | Requires `evaluate_on_number_of_examples > 0` and | +| | | `evaluate_every_number_of_epochs > 0` | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| e2e_confidence_threshold | 0.5 | The threshold that ensures that end-to-end is picked only if | +| | | the policy is confident enough. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| featurizers | [] | List of featurizer names (alias names). Only features | +| | | coming from the listed names are used. If list is empty | +| | | all available features are used. | ++---------------------------------------+------------------------+--------------------------------------------------------------+ +| entity_recognition | True | If 'True' entity recognition is trained and entities are | +| | | extracted. 
| ++---------------------------------------+------------------------+--------------------------------------------------------------+ ``` :::note diff --git a/docs/docs/setting-up-ci-cd.mdx b/docs/docs/setting-up-ci-cd.mdx index 7772555cda61..6183eba77b1e 100644 --- a/docs/docs/setting-up-ci-cd.mdx +++ b/docs/docs/setting-up-ci-cd.mdx @@ -59,6 +59,10 @@ always good to run this check before training a model. By including the :::note Running `rasa data validate` does **not** test if your [rules](./rules.mdx) are consistent with your stories. However, during training, the `RulePolicy` checks for conflicts between rules and stories. Any such conflict will abort training. + +Also, if you use end-to-end stories, then this might not capture all conflicts. Specifically, if two user inputs +result in different tokens yet exactly the same featurization, then conflicting actions after these inputs +may exist but will not be reported by the tool. ::: To read more about the validator and all of the available options, see [the documentation for diff --git a/docs/docs/stories.mdx b/docs/docs/stories.mdx index 7cd907d47575..e50895c1e446 100644 --- a/docs/docs/stories.mdx +++ b/docs/docs/stories.mdx @@ -10,7 +10,8 @@ abstract: Stories are a type of training data used to train your assistant's dia A story is a representation of a conversation between a user and an AI assistant, converted into a specific format where user inputs are expressed as intents -(and entities when necessary), while the assistant's responses and actions are expressed as action names. +(and entities when necessary), +while the assistant's responses and actions are expressed as action names. Here's an example of a dialogue in the Rasa story format: @@ -32,6 +33,7 @@ stories: - action: utter_ask_num_people ``` + ### User Messages While writing stories, you do not have to deal with the specific contents of @@ -185,3 +187,56 @@ into a single file for evaluation. Read more about this format in [Testing Your This format is only used for testing and cannot be used for training. ::: + + +## End-to-end Training + +:::caution experimental feature +End-to-end training is an experimental feature. +We introduce experimental features to get feedback from our community, so we encourage you to try it out! +However, the functionality might be changed or removed in the future. +If you have feedback (positive or negative) please share it with us on the [Rasa Forum](https://forum.rasa.com). + +::: + +With end-to-end training, you do not have to deal with the specific +intents of the messages that are extracted by the NLU pipeline +or with separate `utter_` responses in the domain file. +Instead, you can include the text of the user messages and/or bot responses directly in your stories. +See the [training data format](./training-data-format.mdx#end-to-end-training) +for detailed description of how to write end-to-end stories. + +You can mix training data in the end-to-end format with labeled training data which has +`intent`s and `action`s specified: Stories can have some steps defined by intents/actions +and other steps defined directly by user or bot utterances. + +We call it end-to-end training because policies can consume and predict actual text. +For end-to-end user inputs, intents classified by the NLU pipeline +and extracted entities are ignored. + + +Only [Rule Policy](./policies.mdx#rule-policy) +and [TED Policy](./policies.mdx#ted-policy) allow end-to-end training. + +- `RulePolicy` uses simple string matching during prediction. 
+
+We call it end-to-end training because policies can consume and predict actual text.
+For end-to-end user inputs, intents classified by the NLU pipeline
+and extracted entities are ignored.
+
+
+Only [Rule Policy](./policies.mdx#rule-policy)
+and [TED Policy](./policies.mdx#ted-policy) allow end-to-end training.
+
+- `RulePolicy` uses simple string matching during prediction. Namely,
+  rules based on user text will only match if the user text in your rules
+  is identical to the user input during prediction.
+
+- `TEDPolicy` passes user text through an additional neural network to create
+  hidden representations of the text. In order to obtain robust performance you
+  need to provide enough training stories to capture a variety of user texts for any
+  end-to-end dialogue turn.
+
+Rasa policies are trained for next utterance selection.
+The only difference from using an `utter_` response is how `TEDPolicy` featurizes
+bot utterances.
+In case of an `utter_` action, `TEDPolicy` sees only the name of the action, while
+if you provide the actual utterance using the `bot` key,
+`TEDPolicy` featurizes it as textual input according to the NLU configuration.
+This can help when similar utterances occur in slightly different situations.
+However, it can also make things harder to learn, because similar texts for
+different utterances make it easier for `TEDPolicy` to confuse them.
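+
+To make the featurization difference concrete: in the first story below,
+`TEDPolicy` only sees the action name `utter_cheer_up`; in the second, it
+featurizes the bot message text itself (the texts are illustrative):
+
+```yaml-rasa
+stories:
+- story: bot turn as an action name
+  steps:
+  - user: "I'm not happy"
+  - action: utter_cheer_up
+
+- story: bot turn as text
+  steps:
+  - user: "I'm not happy"
+  - bot: "Oh no! Here is a kitten photo. Did it help?"
+```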
+
+End-to-end training requires significantly more parameters in `TEDPolicy`.
+Therefore, training an end-to-end model might require significant computational
+resources depending on how many end-to-end turns you have in your stories.
diff --git a/docs/docs/training-data-format.mdx b/docs/docs/training-data-format.mdx
index 8ec5d5ae71d4..800777d7a14b 100644
--- a/docs/docs/training-data-format.mdx
+++ b/docs/docs/training-data-format.mdx
@@ -381,11 +381,11 @@ stories:
 Each step can be one of the following:
   - A [user message](#user-messages), represented by **intent** and **entities**.
-  - An [or statement](#or-statement), which includes two or more user messages under it
-  - A bot [action](#actions)
-  - A [form](#forms)
-  - A [slot was set](#slots) event
-  - A [checkpoint](#checkpoints), which connects the story to another story
+  - An [or statement](#or-statement), which includes two or more user messages under it.
+  - A bot [action](#actions).
+  - A [form](#forms).
+  - A [slot was set](#slots) event.
+  - A [checkpoint](#checkpoints), which connects the story to another story.

 #### User Messages
@@ -401,13 +401,14 @@ messages
 the users can send with the same meaning.

 User messages follow the format:

-```yaml-rasa
+```yaml-rasa {4-6}
 stories:
 - story: user message structure
   steps:
   - intent: intent_name # Required
     entities: # Optional
     - entity_name: entity_value
+  - action: action_name
 ```

 For example, to represent the sentence
@@ -685,3 +686,84 @@ rasa test
 If you want to know more about testing head over to
 [Testing Your Assistant](testing-your-assistant.mdx).
+
+
+## End-to-end Training
+
+:::caution experimental feature
+End-to-end training is an experimental feature.
+We introduce experimental features to get feedback from our community, so we encourage you to try it out!
+However, the functionality might be changed or removed in the future.
+If you have feedback (positive or negative) please share it with us on the [Rasa Forum](https://forum.rasa.com).
+
+:::
+
+With [end-to-end training](stories.mdx#end-to-end-training), you do not have to deal with the specific
+intents of the messages that are extracted by the NLU pipeline.
+Instead, you can put the text of the user message directly in the stories,
+by using the `user` key.
+
+These end-to-end user messages follow the format:
+
+```yaml-rasa {4}
+stories:
+- story: user message structure
+  steps:
+  - user: the actual text of the user message
+  - action: action_name
+```
+
+In addition, you can add entity tags that can be extracted
+by the [TED Policy](./policies.mdx#ted-policy).
+The syntax for entity tags is the same as in
+[the NLU training data](./training-data-format.mdx#entities).
+For example, the following story contains the user utterance
+`I can always go for sushi`. By using the syntax from the NLU training data
+`[sushi](cuisine)`, you can mark `sushi` as an entity of type `cuisine`.
+
+```yaml-rasa {4}
+stories:
+- story: story with entities
+  steps:
+  - user: I can always go for [sushi](cuisine)
+  - action: utter_suggest_cuisine
+```
+
+Similarly, you can put bot utterances directly in the stories,
+by using the `bot` key followed by the text that you want your bot to say.
+
+A story with only a bot utterance might look like this:
+
+```yaml-rasa {7}
+stories:
+- story: story with an end-to-end response
+  steps:
+  - intent: greet
+    entities:
+    - name: Ivan
+  - bot: Hello, a person with a name!
+```
+
+You can also have a mixed end-to-end story:
+
+```yaml-rasa
+stories:
+- story: full end-to-end story
+  steps:
+  - intent: greet
+    entities:
+    - name: Ivan
+  - bot: Hello, a person with a name!
+  - intent: search_restaurant
+  - action: utter_suggest_cuisine
+  - user: I can always go for [sushi](cuisine)
+  - bot: Personally, I prefer pizza, but sure let's search sushi restaurants
+  - action: utter_suggest_cuisine
+  - user: Have a beautiful day!
+  - action: utter_goodbye
+```
+
+Rasa end-to-end training is fully integrated with the standard Rasa approach.
+This means that you can have mixed stories with some steps defined by actions or intents
+and other steps defined directly by user messages or bot responses.
diff --git a/examples/e2ebot/config.yml b/examples/e2ebot/config.yml
new file mode 100644
index 000000000000..f38558adb0ad
--- /dev/null
+++ b/examples/e2ebot/config.yml
@@ -0,0 +1,16 @@
+language: en
+pipeline:
+  - name: WhitespaceTokenizer
+    intent_tokenization_flag: True
+  - name: RegexFeaturizer
+  - name: LexicalSyntacticFeaturizer
+  - name: CountVectorsFeaturizer
+  - name: CountVectorsFeaturizer
+    analyzer: char_wb
+    min_ngram: 1
+    max_ngram: 4
+  - name: DIETClassifier
+    epochs: 200
+policies:
+- name: TEDPolicy
+  epochs: 200
diff --git a/examples/e2ebot/data/nlu.yml b/examples/e2ebot/data/nlu.yml
new file mode 100644
index 000000000000..157115096bf2
--- /dev/null
+++ b/examples/e2ebot/data/nlu.yml
@@ -0,0 +1,48 @@
+version: "2.0"
+
+nlu:
+- intent: greet
+  examples: |
+    - hey
+    - hello
+    - hi
+    - hello there
+    - good morning
+    - good evening
+    - moin
+    - hey there
+    - let's go
+    - hey dude
+    - goodmorning
+    - goodevening
+    - good afternoon
+
+
+- intent: affirm
+  examples: |
+    - yes
+    - y
+    - indeed
+    - of course
+    - that sounds good
+    - correct
+    - yeah
+
+- intent: deny
+  examples: |
+    - no
+    - n
+    - never
+    - I don't think so
+    - don't like that
+    - no way
+    - had better not
+
+- intent: search_restaurant
+  examples: |
+    - I'm looking for some food
+    - show me a place to eat
+    - where should I eat tonight?
+ - list restaurants + - food + - I'm hungry diff --git a/examples/e2ebot/data/stories.yml b/examples/e2ebot/data/stories.yml new file mode 100644 index 000000000000..5308b96ea756 --- /dev/null +++ b/examples/e2ebot/data/stories.yml @@ -0,0 +1,38 @@ +version: "2.0" + +stories: +- story: accepts suggestion + steps: + - intent: greet + - action: utter_greet + - intent: search_restaurant + - action: utter_suggest_cuisine + - intent: affirm + - action: utter_search_restaurants + +- story: accepts 3rd suggestion + steps: + - intent: search_restaurant + - action: utter_suggest_cuisine + - intent: deny + - action: utter_suggest_cuisine + - intent: deny + - action: utter_suggest_cuisine + - intent: affirm + - action: utter_search_restaurants + +- story: accepts suggestion, indirectly + steps: + - intent: greet + - action: utter_greet + - intent: search_restaurant + - action: utter_suggest_cuisine + - user: "I can always go for sushi" + - action: utter_search_restaurants + +- story: rejects suggestion, indirectly + steps: + - intent: search_restaurant + - action: utter_suggest_cuisine + - user: "I had that yesterday" + - action: utter_suggest_cuisine diff --git a/examples/e2ebot/domain.yml b/examples/e2ebot/domain.yml new file mode 100644 index 000000000000..16602d284dd9 --- /dev/null +++ b/examples/e2ebot/domain.yml @@ -0,0 +1,23 @@ +version: "2.0" + +actions: + - utter_greet + - utter_suggest_cuisine + - utter_search_restaurants + +intents: + - greet + - affirm + - deny + - search_restaurant + +responses: + utter_greet: + - text: "hi!" + utter_suggest_cuisine: + - text: "how about greek food?" + - text: "how about chinese food?" + - text: "how about italian food?" + - text: "how about sushi?" + utter_search_restaurants: + - text: "great! here's what I found ..." diff --git a/rasa/__main__.py b/rasa/__main__.py index 76f3968224e1..cd61be1a26fb 100644 --- a/rasa/__main__.py +++ b/rasa/__main__.py @@ -6,6 +6,9 @@ from rasa_sdk import __version__ as rasa_sdk_version +import rasa.telemetry +import rasa.utils.io +import rasa.utils.tensorflow.environment as tf_env from rasa import version from rasa.cli import ( data, @@ -24,10 +27,7 @@ from rasa.cli.utils import parse_last_positional_argument_as_model_path from rasa.shared.exceptions import RasaException from rasa.shared.utils.cli import print_error -import rasa.telemetry from rasa.utils.common import set_log_and_warnings_filters, set_log_level -import rasa.utils.io -import rasa.utils.tensorflow.environment as tf_env logger = logging.getLogger(__name__) diff --git a/rasa/cli/data.py b/rasa/cli/data.py index 0a104596bf9c..53b7edbdabc4 100644 --- a/rasa/cli/data.py +++ b/rasa/cli/data.py @@ -212,6 +212,7 @@ def _append_story_structure_arguments(parser: argparse.ArgumentParser) -> None: default=None, help="Number of turns taken into account for story structure validation.", ) + default_arguments.add_config_param(parser) def split_nlu_data(args: argparse.Namespace) -> None: @@ -241,8 +242,13 @@ def validate_files(args: argparse.Namespace, stories_only: bool = False) -> None args: Commandline arguments stories_only: If `True`, only the story structure is validated. 
""" + + config = rasa.cli.utils.get_validated_path( + args.config, "config", DEFAULT_CONFIG_PATH, none_is_valid=True + ) + file_importer = RasaFileImporter( - domain_path=args.domain, training_data_paths=args.data + domain_path=args.domain, training_data_paths=args.data, config_file=config, ) validator = rasa.utils.common.run_in_loop(Validator.from_importer(file_importer)) diff --git a/rasa/core/actions/action.py b/rasa/core/actions/action.py index 53793313d0f0..33d7694db9e5 100644 --- a/rasa/core/actions/action.py +++ b/rasa/core/actions/action.py @@ -6,6 +6,7 @@ import aiohttp import rasa.core +from rasa.core.policies.policy import PolicyPrediction from rasa.shared.core import events from rasa.core.constants import DEFAULT_REQUEST_TIMEOUT @@ -101,30 +102,9 @@ def action_for_index( f"Domain has {domain.num_actions} actions." ) - return action_for_name(domain.action_names[index], domain, action_endpoint) - - -def action_for_name( - action_name: Text, domain: Domain, action_endpoint: Optional[EndpointConfig] -) -> "Action": - """Create an `Action` object based on the name of the `Action`. - - Args: - action_name: The name of the `Action`. - domain: The `Domain` of the current model. The domain contains the actions - provided by the user + the default actions. - action_endpoint: Can be used to run `custom_actions` - (e.g. using the `rasa-sdk`). - - Returns: - The instantiated `Action` or `None` if no `Action` was found for the given - index. - """ - - if action_name not in domain.action_names: - domain.raise_action_not_found_exception(action_name) - - return action_from_name(action_name, domain, action_endpoint) + return action_for_name_or_text( + domain.action_names_or_texts[index], domain, action_endpoint + ) def is_retrieval_action(action_name: Text, retrieval_intents: List[Text]) -> bool: @@ -147,41 +127,53 @@ def is_retrieval_action(action_name: Text, retrieval_intents: List[Text]) -> boo ) -def action_from_name( - name: Text, domain: Domain, action_endpoint: Optional[EndpointConfig] +def action_for_name_or_text( + action_name_or_text: Text, domain: Domain, action_endpoint: Optional[EndpointConfig] ) -> "Action": - """Retrieves an action by its name. + """Retrieves an action by its name or by its text in case it's an end-to-end action. Args: - name: The name of the action. + action_name_or_text: The name of the action. domain: The current model domain. action_endpoint: The endpoint to execute custom actions. + Raises: + ActionNotFoundException: If action not in current domain. + Returns: The instantiated action. 
""" + if action_name_or_text not in domain.action_names_or_texts: + domain.raise_action_not_found_exception(action_name_or_text) + defaults = {a.name(): a for a in default_actions(action_endpoint)} - if name in defaults and name not in domain.user_actions_and_forms: - return defaults[name] + if ( + action_name_or_text in defaults + and action_name_or_text not in domain.user_actions_and_forms + ): + return defaults[action_name_or_text] - if name.startswith(UTTER_PREFIX) and is_retrieval_action( - name, domain.retrieval_intents + if action_name_or_text.startswith(UTTER_PREFIX) and is_retrieval_action( + action_name_or_text, domain.retrieval_intents ): - return ActionRetrieveResponse(name) + return ActionRetrieveResponse(action_name_or_text) + + if action_name_or_text in domain.action_texts: + return ActionEndToEndResponse(action_name_or_text) - if name.startswith(UTTER_PREFIX): - return ActionUtterTemplate(name) + if action_name_or_text.startswith(UTTER_PREFIX): + return ActionUtterTemplate(action_name_or_text) - is_form = name in domain.form_names + is_form = action_name_or_text in domain.form_names # Users can override the form by defining an action with the same name as the form - user_overrode_form_action = is_form and name in domain.user_actions + user_overrode_form_action = is_form and action_name_or_text in domain.user_actions if is_form and not user_overrode_form_action: from rasa.core.actions.forms import FormAction - return FormAction(name, action_endpoint) + return FormAction(action_name_or_text, action_endpoint) - return RemoteAction(name, action_endpoint) + return RemoteAction(action_name_or_text, action_endpoint) def create_bot_utterance(message: Dict[Text, Any]) -> BotUttered: @@ -242,16 +234,40 @@ async def run( raise NotImplementedError def __str__(self) -> Text: - return "Action('{}')".format(self.name()) + """Returns text representation of form.""" + return f"{self.__class__.__name__}('{self.name()}')" + + def event_for_successful_execution( + self, prediction: PolicyPrediction + ) -> ActionExecuted: + """Event which should be logged for the successful execution of this action. + + Args: + prediction: Prediction which led to the execution of this event. + + Returns: + Event which should be logged onto the tracker. + """ + return ActionExecuted( + self.name(), prediction.policy_name, prediction.max_confidence + ) class ActionUtterTemplate(Action): """An action which only effect is to utter a template when it is run. Both, name and utter template, need to be specified using - the `name` method.""" + the `name` method. + """ - def __init__(self, name: Text, silent_fail: Optional[bool] = False): + def __init__(self, name: Text, silent_fail: Optional[bool] = False) -> None: + """Creates action. + + Args: + name: Name of the action. + silent_fail: `True` if the action should fail silently in case no response + was defined for this action. + """ self.template_name = name self.silent_fail = silent_fail @@ -277,16 +293,61 @@ async def run( return [create_bot_utterance(message)] def name(self) -> Text: + """Returns action name.""" return self.template_name - def __str__(self) -> Text: - return "ActionUtterTemplate('{}')".format(self.name()) + +class ActionEndToEndResponse(Action): + """Action to utter end-to-end responses to the user.""" + + def __init__(self, action_text: Text) -> None: + """Creates action. + + Args: + action_text: Text of end-to-end bot response. 
+ """ + self.action_text = action_text + + def name(self) -> Text: + """Returns action name.""" + # In case of an end-to-end action there is no label (aka name) for the action. + # We fake a name by returning the text which the bot sends back to the user. + return self.action_text + + async def run( + self, + output_channel: "OutputChannel", + nlg: "NaturalLanguageGenerator", + tracker: "DialogueStateTracker", + domain: "Domain", + ) -> List[Event]: + """Runs action (see parent class for full docstring).""" + message = {"text": self.action_text} + return [create_bot_utterance(message)] + + def event_for_successful_execution( + self, prediction: PolicyPrediction + ) -> ActionExecuted: + """Event which should be logged for the successful execution of this action. + + Args: + prediction: Prediction which led to the execution of this event. + + Returns: + Event which should be logged onto the tracker. + """ + return ActionExecuted( + policy=prediction.policy_name, + confidence=prediction.max_confidence, + action_text=self.action_text, + ) class ActionRetrieveResponse(ActionUtterTemplate): """An action which queries the Response Selector for the appropriate response.""" - def __init__(self, name: Text, silent_fail: Optional[bool] = False): + def __init__(self, name: Text, silent_fail: Optional[bool] = False) -> None: + """Creates action. See docstring of parent class.""" super().__init__(name, silent_fail) self.action_name = name self.silent_fail = silent_fail @@ -342,11 +403,9 @@ async def run( return await super().run(output_channel, nlg, tracker, domain) def name(self) -> Text: + """Returns action name.""" return self.action_name - def __str__(self) -> Text: - return "ActionRetrieveResponse('{}')".format(self.name()) - class ActionBack(ActionUtterTemplate): """Revert the tracker state by two user utterances.""" diff --git a/rasa/core/actions/forms.py b/rasa/core/actions/forms.py index c805e5e8fc1b..c2254b681c6b 100644 --- a/rasa/core/actions/forms.py +++ b/rasa/core/actions/forms.py @@ -395,7 +395,7 @@ async def validate_slots( validate_name = f"validate_{self.name()}" - if validate_name not in domain.action_names: + if validate_name not in domain.action_names_or_texts: return events _tracker = self._temporary_tracker(tracker, events, domain) @@ -525,20 +525,21 @@ def _find_next_slot_to_request( None, ) - def _name_of_utterance(self, domain: Domain, slot_name: Text) -> Text: + def _name_of_utterance(self, domain: Domain, slot_name: Text) -> Optional[Text]: search_path = [ f"action_ask_{self._form_name}_{slot_name}", f"{UTTER_PREFIX}ask_{self._form_name}_{slot_name}", f"action_ask_{slot_name}", + f"{UTTER_PREFIX}ask_{slot_name}", ] found_actions = ( action_name for action_name in search_path - if action_name in domain.action_names + if action_name in domain.action_names_or_texts ) - return next(found_actions, f"{UTTER_PREFIX}ask_{slot_name}") + return next(found_actions, None) async def _ask_for_slot( self, @@ -550,13 +551,21 @@ async def _ask_for_slot( ) -> List[Event]: logger.debug(f"Request next slot '{slot_name}'") - action_to_ask_for_next_slot = action.action_from_name( - self._name_of_utterance(domain, slot_name), domain, self.action_endpoint + action_to_ask_for_next_slot = self._name_of_utterance(domain, slot_name) + if not action_to_ask_for_next_slot: + # Use a debug log as the user might have asked as part of a custom action + logger.debug( + f"There was no action found to ask for slot '{slot_name}' " + f"name to be filled." 
+ ) + return [] + + action_to_ask_for_next_slot = action.action_for_name_or_text( + action_to_ask_for_next_slot, domain, self.action_endpoint ) - events_to_ask_for_next_slot = await action_to_ask_for_next_slot.run( + return await action_to_ask_for_next_slot.run( output_channel, nlg, tracker, domain ) - return events_to_ask_for_next_slot # helpers @staticmethod diff --git a/rasa/core/actions/two_stage_fallback.py b/rasa/core/actions/two_stage_fallback.py index d94b9a3a4926..d72031fdf554 100644 --- a/rasa/core/actions/two_stage_fallback.py +++ b/rasa/core/actions/two_stage_fallback.py @@ -54,7 +54,7 @@ async def _ask_affirm( tracker: DialogueStateTracker, domain: Domain, ) -> List[Event]: - affirm_action = action.action_from_name( + affirm_action = action.action_for_name_or_text( ACTION_DEFAULT_ASK_AFFIRMATION_NAME, domain, self._action_endpoint ) @@ -67,7 +67,7 @@ async def _ask_rephrase( tracker: DialogueStateTracker, domain: Domain, ) -> List[Event]: - rephrase = action.action_from_name( + rephrase = action.action_for_name_or_text( ACTION_DEFAULT_ASK_REPHRASE_NAME, domain, self._action_endpoint ) @@ -112,7 +112,7 @@ async def _give_up( tracker: DialogueStateTracker, domain: Domain, ) -> List[Event]: - fallback = action.action_from_name( + fallback = action.action_for_name_or_text( ACTION_DEFAULT_FALLBACK_NAME, domain, self._action_endpoint ) diff --git a/rasa/core/featurizers/single_state_featurizer.py b/rasa/core/featurizers/single_state_featurizer.py index 0f96128959ce..b3c337319bc6 100644 --- a/rasa/core/featurizers/single_state_featurizer.py +++ b/rasa/core/featurizers/single_state_featurizer.py @@ -1,10 +1,11 @@ import logging import numpy as np import scipy.sparse -from typing import List, Optional, Dict, Text, Set +from typing import List, Optional, Dict, Text, Set, Any from collections import defaultdict import rasa.shared.utils.io +from rasa.nlu.constants import TOKENS_NAMES from rasa.shared.core.domain import SubState, State, Domain from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter, RegexInterpreter from rasa.shared.core.constants import PREVIOUS_ACTION, ACTIVE_LOOP, USER, SLOTS @@ -16,9 +17,15 @@ ACTION_TEXT, ACTION_NAME, INTENT, + TEXT, + NO_ENTITY_TAG, + ENTITY_ATTRIBUTE_TYPE, + ENTITY_TAGS, ) from rasa.shared.nlu.training_data.features import Features from rasa.shared.nlu.training_data.message import Message +from rasa.utils.tensorflow.model_data_utils import TAG_ID_ORIGIN +from rasa.utils.tensorflow.constants import IDS logger = logging.getLogger(__name__) @@ -42,6 +49,23 @@ def __init__(self) -> None: self._use_regex_interpreter = False self._default_feature_states = {} self.action_texts = [] + self.entity_tag_id_mapping = {} + + def get_entity_tag_ids(self) -> Dict[Text, int]: + """Returns the tag to index mapping for entities. + + Returns: + Tag to index mapping. 
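For intuition, here is a toy version of the mapping this method produces: the entity indices from `_default_feature_states[ENTITIES]` are shifted by one so that id `0` stays reserved for `NO_ENTITY_TAG` (which is `"O"` in Rasa's constants). The entity names are invented:

```python
# Toy illustration, not Rasa code.
NO_ENTITY_TAG = "O"

entity_feature_states = {"cuisine": 0, "location": 1, "price": 2}

tag_ids = {tag: idx + 1 for tag, idx in entity_feature_states.items()}
tag_ids[NO_ENTITY_TAG] = 0

print(tag_ids)  # {'cuisine': 1, 'location': 2, 'price': 3, 'O': 0}
```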
+ """ + if ENTITIES not in self._default_feature_states: + return {} + + tag_ids = { + tag: idx + 1 # +1 to keep 0 for the NO_ENTITY_TAG + for tag, idx in self._default_feature_states[ENTITIES].items() + } + tag_ids[NO_ENTITY_TAG] = 0 + return tag_ids def prepare_for_training( self, domain: Domain, interpreter: NaturalLanguageInterpreter @@ -64,11 +88,14 @@ def convert_to_dict(feature_states: List[Text]) -> Dict[Text, int]: } self._default_feature_states[INTENT] = convert_to_dict(domain.intents) - self._default_feature_states[ACTION_NAME] = convert_to_dict(domain.action_names) + self._default_feature_states[ACTION_NAME] = convert_to_dict( + domain.action_names_or_texts + ) self._default_feature_states[ENTITIES] = convert_to_dict(domain.entity_states) self._default_feature_states[SLOTS] = convert_to_dict(domain.slot_states) self._default_feature_states[ACTIVE_LOOP] = convert_to_dict(domain.form_names) self.action_texts = domain.action_texts + self.entity_tag_id_mapping = self.get_entity_tag_ids() def _state_features_for_attribute( self, sub_state: SubState, attribute: Text @@ -98,7 +125,7 @@ def _create_features( features = np.zeros(len(self._default_feature_states[attribute]), np.float32) for state_feature, value in state_features.items(): - # check that the value is in default_feature_states to be able to assigh + # check that the value is in default_feature_states to be able to assign # its value if state_feature in self._default_feature_states[attribute]: features[self._default_feature_states[attribute][state_feature]] = value @@ -238,6 +265,53 @@ def encode_state( return state_features + def encode_entities( + self, entity_data: Dict[Text, Any], interpreter: NaturalLanguageInterpreter + ) -> Dict[Text, List["Features"]]: + """Encode the given entity data with the help of the given interpreter. + + Produce numeric entity tags for tokens. + + Args: + entity_data: The dict containing the text and entity labels and locations + interpreter: The interpreter used to encode the state + + Returns: + A dictionary of entity type to list of features. + """ + from rasa.nlu.test import determine_token_labels + + # TODO + # The entity states used to create the tag-idx-mapping contains the + # entities and the concatenated entity and roles/groups. We do not + # distinguish between entities and roles/groups right now. + # TODO + # Should we support BILOU tagging? 
+ + if TEXT not in entity_data or len(self.entity_tag_id_mapping) < 2: + # we cannot build a classifier with fewer than 2 classes + return {} + + parsed_text = interpreter.featurize_message(Message({TEXT: entity_data[TEXT]})) + if not parsed_text: + return {} + entities = entity_data.get(ENTITIES, []) + + _tags = [] + for token in parsed_text.get(TOKENS_NAMES[TEXT], []): + _tag = determine_token_labels( + token, entities, attribute_key=ENTITY_ATTRIBUTE_TYPE + ) + # TODO handle if tag is not in mapping + _tags.append(self.entity_tag_id_mapping[_tag]) + + # transpose to have seq_len x 1 + return { + ENTITY_TAGS: [ + Features(np.array([_tags]).T, IDS, ENTITY_TAGS, TAG_ID_ORIGIN) + ] + } + def _encode_action( self, action: Text, interpreter: NaturalLanguageInterpreter ) -> Dict[Text, List["Features"]]: @@ -262,7 +336,8 @@ def encode_all_actions( """ return [ - self._encode_action(action, interpreter) for action in domain.action_names + self._encode_action(action, interpreter) + for action in domain.action_names_or_texts ] diff --git a/rasa/core/featurizers/tracker_featurizers.py b/rasa/core/featurizers/tracker_featurizers.py index ab0c01ced352..26560266f440 100644 --- a/rasa/core/featurizers/tracker_featurizers.py +++ b/rasa/core/featurizers/tracker_featurizers.py @@ -3,21 +3,23 @@ import jsonpickle import logging -from rasa.shared.exceptions import RasaException -from rasa.shared.nlu.constants import TEXT, INTENT from tqdm import tqdm -from typing import Tuple, List, Optional, Dict, Text, Union +from typing import Tuple, List, Optional, Dict, Text, Union, Any import numpy as np from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer from rasa.shared.core.domain import State, Domain -from rasa.shared.core.events import ActionExecuted -from rasa.shared.core.trackers import DialogueStateTracker +from rasa.shared.core.events import ActionExecuted, UserUttered +from rasa.shared.core.trackers import ( + DialogueStateTracker, + is_prev_action_listen_in_state, +) from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter from rasa.shared.core.constants import USER +from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES +from rasa.shared.exceptions import RasaException import rasa.shared.utils.io from rasa.shared.nlu.training_data.features import Features -from rasa.shared.constants import INTENT_MESSAGE_PREFIX FEATURIZER_FILE = "featurizer.json" @@ -78,7 +80,8 @@ def _featurize_states( def _convert_labels_to_ids( trackers_as_actions: List[List[Text]], domain: Domain ) -> np.ndarray: - # store labels in numpy arrays so that it corresponds to np arrays of input features + # store labels in numpy arrays so that it corresponds to np arrays of input + # features return np.array( [ np.array( @@ -88,6 +91,53 @@ def _convert_labels_to_ids( ] ) + def _create_entity_tags( + self, + trackers_as_entities: List[List[Dict[Text, Any]]], + interpreter: NaturalLanguageInterpreter, + ) -> List[List[Dict[Text, List["Features"]]]]: + return [ + [ + self.state_featurizer.encode_entities(entity_data, interpreter) + for entity_data in trackers_entities + ] + for trackers_entities in trackers_as_entities + ] + + @staticmethod + def _entity_data(event: UserUttered) -> Dict[Text, Any]: + # train stories support both text and intent, + # but if intent is present, the text is ignored + if event.text and not event.intent_name: + return {TEXT: event.text, ENTITIES: event.entities} + + # input is not textual, so add empty dict + return {} + + @staticmethod + def 
_remove_user_text_if_intent(trackers_as_states: List[List[State]]) -> None: + for states in trackers_as_states: + for state in states: + # remove text features to only use intent + if state.get(USER, {}).get(INTENT) and state.get(USER, {}).get(TEXT): + del state[USER][TEXT] + + def training_states_actions_and_entities( + self, trackers: List[DialogueStateTracker], domain: Domain + ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]: + """Transforms list of trackers to lists of states, actions and entity data. + + Args: + trackers: The trackers to transform + domain: The domain + + Returns: + A tuple of list of states, list of actions and list of entity data. + """ + raise NotImplementedError( + f"`{self.__class__.__name__}` should implement how to encode trackers as feature vectors" + ) + def training_states_and_actions( self, trackers: List[DialogueStateTracker], domain: Domain ) -> Tuple[List[List[State]], List[List[Text]]]: @@ -100,16 +150,23 @@ def training_states_and_actions( Returns: A tuple of list of states and list of actions. """ - raise NotImplementedError( - "Featurizer must have the capacity to encode trackers to feature vectors" - ) + ( + trackers_as_states, + trackers_as_actions, + _, + ) = self.training_states_actions_and_entities(trackers, domain) + return trackers_as_states, trackers_as_actions def featurize_trackers( self, trackers: List[DialogueStateTracker], domain: Domain, interpreter: NaturalLanguageInterpreter, - ) -> Tuple[List[List[Dict[Text, List["Features"]]]], np.ndarray]: + ) -> Tuple[ + List[List[Dict[Text, List["Features"]]]], + np.ndarray, + List[List[Dict[Text, List["Features"]]]], + ]: """Featurize the training trackers. Args: @@ -121,8 +178,11 @@ def featurize_trackers( - a dictionary of state types (INTENT, TEXT, ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, ACTIVE_LOOP) to a list of features for all dialogue turns in all training trackers - - the label ids (e.g. action ids) for every dialuge turn in all training + - the label ids (e.g. 
action ids) for every dialogue turn in all training trackers + - A dictionary of entity type (ENTITY_TAGS) to a list of features + containing entity tag ids for text user inputs otherwise empty dict + for all dialogue turns in all training trackers """ if self.state_featurizer is None: raise ValueError( @@ -134,23 +194,55 @@ def featurize_trackers( self.state_featurizer.prepare_for_training(domain, interpreter) - trackers_as_states, trackers_as_actions = self.training_states_and_actions( - trackers, domain - ) + ( + trackers_as_states, + trackers_as_actions, + trackers_as_entities, + ) = self.training_states_actions_and_entities(trackers, domain) tracker_state_features = self._featurize_states(trackers_as_states, interpreter) label_ids = self._convert_labels_to_ids(trackers_as_actions, domain) + entity_tags = self._create_entity_tags(trackers_as_entities, interpreter) - return tracker_state_features, label_ids + return tracker_state_features, label_ids, entity_tags + + def _choose_last_user_input( + self, trackers_as_states: List[List[State]], use_text_for_last_user_input: bool + ) -> None: + for states in trackers_as_states: + last_state = states[-1] + # only update the state of the real user utterance + if not is_prev_action_listen_in_state(last_state): + continue + + if use_text_for_last_user_input: + # remove intent features to only use text + if last_state.get(USER, {}).get(INTENT): + del last_state[USER][INTENT] + # don't add entities if text is used for featurization + if last_state.get(USER, {}).get(ENTITIES): + del last_state[USER][ENTITIES] + else: + # remove text features to only use intent + if last_state.get(USER, {}).get(TEXT): + del last_state[USER][TEXT] + + # make sure that all dialogue steps are either intent or text based + self._remove_user_text_if_intent(trackers_as_states) def prediction_states( - self, trackers: List[DialogueStateTracker], domain: Domain + self, + trackers: List[DialogueStateTracker], + domain: Domain, + use_text_for_last_user_input: bool = False, ) -> List[List[State]]: """Transforms list of trackers to lists of states for prediction. Args: trackers: The trackers to transform domain: The domain + use_text_for_last_user_input: Indicates whether to use text or intent label + for featurizing last user input. Returns: A list of states. @@ -164,6 +256,7 @@ def create_state_features( trackers: List[DialogueStateTracker], domain: Domain, interpreter: NaturalLanguageInterpreter, + use_text_for_last_user_input: bool = False, ) -> List[List[Dict[Text, List["Features"]]]]: """Create state features for prediction. @@ -171,13 +264,17 @@ def create_state_features( trackers: A list of state trackers domain: The domain interpreter: The interpreter + use_text_for_last_user_input: Indicates whether to use text or intent label + for featurizing last user input. Returns: A dictionary of state type (INTENT, TEXT, ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, ACTIVE_LOOP) to a list of features for all dialogue turns in all trackers. """ - trackers_as_states = self.prediction_states(trackers, domain) + trackers_as_states = self.prediction_states( + trackers, domain, use_text_for_last_user_input + ) return self._featurize_states(trackers_as_states, interpreter) def persist(self, path: Union[Text, Path]) -> None: @@ -222,23 +319,21 @@ class FullDialogueTrackerFeaturizer(TrackerFeaturizer): Training data is padded up to the length of the longest dialogue with -1. 
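The effect of `_choose_last_user_input` above can be sketched with plain dicts (string keys stand in for the `USER`/`TEXT`/`INTENT`/`ENTITIES` constants; values are invented):

```python
import copy

state = {
    "user": {
        "text": "book me a table",
        "intent": "request_restaurant",
        "entities": ("cuisine",),
    },
    "prev_action": {"action_name": "action_listen"},
}

e2e = copy.deepcopy(state)
del e2e["user"]["intent"]    # use_text_for_last_user_input=True:
del e2e["user"]["entities"]  # keep only the raw text

non_e2e = copy.deepcopy(state)
del non_e2e["user"]["text"]  # otherwise keep the intent (and entities)

print(e2e["user"])      # {'text': 'book me a table'}
print(non_e2e["user"])  # {'intent': 'request_restaurant', 'entities': ('cuisine',)}
```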
""" - def training_states_and_actions( + def training_states_actions_and_entities( self, trackers: List[DialogueStateTracker], domain: Domain - ) -> Tuple[List[List[State]], List[List[Text]]]: - """Transforms list of trackers to lists of states and actions. - - Training data is padded up to the length of the longest dialogue with -1. + ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]: + """Transforms list of trackers to lists of states, actions and entity data. Args: trackers: The trackers to transform domain: The domain Returns: - A tuple of list of states and list of actions. + A tuple of list of states, list of actions and list of entity data. """ - trackers_as_states = [] trackers_as_actions = [] + trackers_as_entities = [] logger.debug( "Creating states and action examples from " @@ -255,7 +350,12 @@ def training_states_and_actions( delete_first_state = False actions = [] + entities = [] + entity_data = {} for event in tracker.applied_events(): + if isinstance(event, UserUttered): + entity_data = self._entity_data(event) + if not isinstance(event, ActionExecuted): continue @@ -263,6 +363,7 @@ def training_states_and_actions( # only actions which can be # predicted at a stories start actions.append(event.action_name or event.action_text) + entities.append(entity_data) else: # unpredictable actions can be # only the first in the story @@ -273,36 +374,41 @@ def training_states_and_actions( ) delete_first_state = True + # reset entity_data for the the next turn + entity_data = {} + if delete_first_state: states = states[1:] trackers_as_states.append(states[:-1]) trackers_as_actions.append(actions) + trackers_as_entities.append(entities) - return trackers_as_states, trackers_as_actions + self._remove_user_text_if_intent(trackers_as_states) + + return trackers_as_states, trackers_as_actions, trackers_as_entities def prediction_states( - self, trackers: List[DialogueStateTracker], domain: Domain + self, + trackers: List[DialogueStateTracker], + domain: Domain, + use_text_for_last_user_input: bool = False, ) -> List[List[State]]: """Transforms list of trackers to lists of states for prediction. Args: trackers: The trackers to transform - domain: The domain + domain: The domain, + use_text_for_last_user_input: Indicates whether to use text or intent label + for featurizing last user input. Returns: A list of states. """ - trackers_as_states = [ self._create_states(tracker, domain) for tracker in trackers ] - # TODO there is no prediction support for e2e input right now, therefore - # temporary remove TEXT features from USER state during prediction - for states in trackers_as_states: - for state in states: - if state.get(USER, {}).get(TEXT): - del state[USER][TEXT] + self._choose_last_user_input(trackers_as_states, use_text_for_last_user_input) return trackers_as_states @@ -357,23 +463,21 @@ def _hash_example( frozen_actions = (action,) return hash((frozen_states, frozen_actions)) - def training_states_and_actions( + def training_states_actions_and_entities( self, trackers: List[DialogueStateTracker], domain: Domain - ) -> Tuple[List[List[State]], List[List[Text]]]: - """Transforms list of trackers to lists of states and actions. - - Training data is padded up to the length of the longest dialogue with -1. + ) -> Tuple[List[List[State]], List[List[Text]], List[List[Dict[Text, Any]]]]: + """Transforms list of trackers to lists of states, actions and entity data. 
Args: trackers: The trackers to transform domain: The domain Returns: - A tuple of list of states and list of actions. + A tuple of list of states, list of actions and list of entity data. """ - trackers_as_states = [] trackers_as_actions = [] + trackers_as_entities = [] # from multiple states that create equal featurizations # we only need to keep one. @@ -393,7 +497,11 @@ states = self._create_states(tracker, domain) states_length_for_action = 0 + entity_data = {} for event in tracker.applied_events(): + if isinstance(event, UserUttered): + entity_data = self._entity_data(event) + if not isinstance(event, ActionExecuted): continue @@ -419,29 +527,39 @@ trackers_as_actions.append( [event.action_name or event.action_text] ) + trackers_as_entities.append([entity_data]) else: trackers_as_states.append(sliced_states) trackers_as_actions.append([event.action_name or event.action_text]) + trackers_as_entities.append([entity_data]) + # reset entity_data for the next turn + entity_data = {} pbar.set_postfix({"# actions": "{:d}".format(len(trackers_as_actions))}) + self._remove_user_text_if_intent(trackers_as_states) + logger.debug("Created {} action examples.".format(len(trackers_as_actions))) - return trackers_as_states, trackers_as_actions + return trackers_as_states, trackers_as_actions, trackers_as_entities def prediction_states( - self, trackers: List[DialogueStateTracker], domain: Domain + self, + trackers: List[DialogueStateTracker], + domain: Domain, + use_text_for_last_user_input: bool = False, ) -> List[List[State]]: """Transforms list of trackers to lists of states for prediction. Args: trackers: The trackers to transform domain: The domain + use_text_for_last_user_input: Indicates whether to use text or intent label + for featurizing last user input. Returns: A list of states. """ - trackers_as_states = [ self._create_states(tracker, domain) for tracker in trackers ] @@ -449,11 +567,6 @@ def prediction_states( self.slice_state_history(states, self.max_history) for states in trackers_as_states ] - # TODO there is no prediction support for e2e input right now, therefore - # temporary remove TEXT features from USER state during prediction - for states in trackers_as_states: - for state in states: - if state.get(USER, {}).get(TEXT): - del state[USER][TEXT] + self._choose_last_user_input(trackers_as_states, use_text_for_last_user_input) return trackers_as_states diff --git a/rasa/core/policies/ensemble.py b/rasa/core/policies/ensemble.py index d12eb094cf82..8f0757a735ea 100644 --- a/rasa/core/policies/ensemble.py +++ b/rasa/core/policies/ensemble.py @@ -31,7 +31,11 @@ ACTION_BACK_NAME, ) from rasa.shared.core.domain import InvalidDomain, Domain -from rasa.shared.core.events import ActionExecutionRejected, ActionExecuted +from rasa.shared.core.events import ( + ActionExecutionRejected, + ActionExecuted, + DefinePrevUserUtteredFeaturization, +) from rasa.core.exceptions import UnsupportedDialogueModelError from rasa.core.featurizers.tracker_featurizers import MaxHistoryTrackerFeaturizer from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter, RegexInterpreter @@ -765,8 +769,7 @@ def probabilities_using_best_policy( additional features. Returns: - best_probabilities: the list of probabilities for the next actions - best_policy_name: the name of the picked policy + The best policy prediction.
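`PolicyPrediction` itself is defined outside this excerpt; the sketch below only records the attributes the surrounding code relies on (`policy_name`, `max_confidence`, `events`, `is_end_to_end_prediction`) and is not the real class:

```python
# Assumed shape, for reading the code around it; not Rasa's implementation.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class PolicyPredictionSketch:
    probabilities: List[float]
    policy_name: Optional[str]
    events: List[object] = field(default_factory=list)
    is_end_to_end_prediction: bool = False

    @property
    def max_confidence(self) -> float:
        return max(self.probabilities, default=0.0)

prediction = PolicyPredictionSketch(
    [0.0, 0.87, 0.13], "policy_1_TEDPolicy", is_end_to_end_prediction=True
)
print(prediction.max_confidence)  # 0.87
```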
""" winning_prediction = self._best_policy_prediction(tracker, domain, interpreter) @@ -781,6 +784,20 @@ def probabilities_using_best_policy( ): winning_prediction = self._fallback_after_listen(domain, winning_prediction) + if tracker.latest_action_name == ACTION_LISTEN_NAME: + if winning_prediction.is_end_to_end_prediction: + logger.debug("Made e2e prediction using user text.") + logger.debug("Added `DefinePrevUserUtteredFeaturization(True)` event.") + winning_prediction.events.append( + DefinePrevUserUtteredFeaturization(True) + ) + else: + logger.debug("Made prediction using user intent.") + logger.debug("Added `DefinePrevUserUtteredFeaturization(False)` event.") + winning_prediction.events.append( + DefinePrevUserUtteredFeaturization(False) + ) + logger.debug(f"Predicted next action using {winning_prediction.policy_name}.") return winning_prediction diff --git a/rasa/core/policies/mapping_policy.py b/rasa/core/policies/mapping_policy.py index fac4492a0df7..84ed0ff08f0d 100644 --- a/rasa/core/policies/mapping_policy.py +++ b/rasa/core/policies/mapping_policy.py @@ -6,7 +6,6 @@ import rasa.shared.utils.io from rasa.shared.constants import DOCS_URL_POLICIES, DOCS_URL_MIGRATION_GUIDE from rasa.shared.nlu.constants import INTENT_NAME_KEY -from rasa.utils import common as common_utils from rasa.shared.core.constants import ( USER_INTENT_BACK, USER_INTENT_RESTART, diff --git a/rasa/core/policies/policy.py b/rasa/core/policies/policy.py index bfe3a37af921..0294705baa33 100644 --- a/rasa/core/policies/policy.py +++ b/rasa/core/policies/policy.py @@ -136,7 +136,6 @@ def _get_valid_params(func: Callable, **kwargs: Any) -> Dict: Returns: the dictionary of parameters """ - valid_keys = rasa.shared.utils.common.arguments_of(func) params = {key: kwargs.get(key) for key in valid_keys if kwargs.get(key)} @@ -152,7 +151,11 @@ def featurize_for_training( domain: Domain, interpreter: NaturalLanguageInterpreter, **kwargs: Any, - ) -> Tuple[List[List[Dict[Text, List["Features"]]]], np.ndarray]: + ) -> Tuple[ + List[List[Dict[Text, List["Features"]]]], + np.ndarray, + List[List[Dict[Text, List["Features"]]]], + ]: """Transform training trackers into a vector representation. The trackers, consisting of multiple turns, will be transformed @@ -170,9 +173,11 @@ def featurize_for_training( all training trackers - the label ids (e.g. action ids) for every dialogue turn in all training trackers + - A dictionary of entity type (ENTITY_TAGS) to a list of features + containing entity tag ids for text user inputs otherwise empty dict + for all dialogue turns in all training trackers """ - - state_features, label_ids = self.featurizer.featurize_trackers( + state_features, label_ids, entity_tags = self.featurizer.featurize_trackers( training_trackers, domain, interpreter ) @@ -184,8 +189,9 @@ def featurize_for_training( ) state_features = state_features[:max_training_samples] label_ids = label_ids[:max_training_samples] + entity_tags = entity_tags[:max_training_samples] - return state_features, label_ids + return state_features, label_ids, entity_tags def train( self, @@ -202,7 +208,6 @@ def train( domain: the :class:`rasa.shared.core.domain.Domain` interpreter: Interpreter which can be used by the polices for featurization. 
""" - raise NotImplementedError("Policy must have the capacity to train.") def predict_action_probabilities( @@ -334,7 +339,6 @@ def _default_predictions(domain: Domain) -> List[float]: Returns: the list of the length of the number of actions """ - return [0.0] * domain.num_actions def format_tracker_states(self, states: List[Dict]) -> Text: @@ -496,7 +500,6 @@ def confidence_scores_for( Returns: the list of the length of the number of actions """ - results = [0.0] * domain.num_actions idx = domain.index_for_action(action_name) results[idx] = value diff --git a/rasa/core/policies/rule_policy.py b/rasa/core/policies/rule_policy.py index 033ace294237..4d056569a09c 100644 --- a/rasa/core/policies/rule_policy.py +++ b/rasa/core/policies/rule_policy.py @@ -163,7 +163,7 @@ def validate_against_domain( if ( domain is None - or rule_policy._fallback_action_name not in domain.action_names + or rule_policy._fallback_action_name not in domain.action_names_or_texts ): raise InvalidDomain( f"The fallback action '{rule_policy._fallback_action_name}' which was " @@ -420,7 +420,9 @@ def _predict_next_action( probabilities != self._default_predictions(domain) or tracker.is_rule_tracker ): - predicted_action_name = domain.action_names[np.argmax(probabilities)] + predicted_action_name = domain.action_names_or_texts[ + np.argmax(probabilities) + ] return predicted_action_name @@ -769,23 +771,39 @@ def _find_action_from_loop_happy_path( return ACTION_LISTEN_NAME def _find_action_from_rules( - self, tracker: DialogueStateTracker, domain: Domain + self, + tracker: DialogueStateTracker, + domain: Domain, + use_text_for_last_user_input: bool, ) -> Tuple[Optional[Text], Optional[Text], bool]: """Predicts the next action based on the memoized rules. Args: tracker: The current conversation tracker. domain: The domain of the current model. + use_text_for_last_user_input: `True` if text of last user message + should be used for the prediction. `False` if intent should be used. Returns: - A tuple of the predicted action name (or `None` if no matching rule was - found), a description of the matching rule, and `True` if a loop action + A tuple of the predicted action name or text (or `None` if no matching rule + was found), a description of the matching rule, and `True` if a loop action was predicted after the loop has been in an unhappy path before. """ - tracker_as_states = self.featurizer.prediction_states([tracker], domain) + if ( + use_text_for_last_user_input + and not tracker.latest_action_name == ACTION_LISTEN_NAME + ): + # make text prediction only directly after user utterance + # because we've otherwise already decided whether to use + # the text or the intent + return None, None, False + + tracker_as_states = self.featurizer.prediction_states( + [tracker], domain, use_text_for_last_user_input + ) states = tracker_as_states[0] - current_states = self.format_tracker_states(states) + current_states = self.format_tracker_states(states) logger.debug(f"Current tracker state:{current_states}") # Tracks if we are returning after an unhappy loop path. 
If this becomes `True` @@ -867,43 +885,85 @@ def predict_action_probabilities( **kwargs: Any, ) -> PolicyPrediction: """Predicts the next action (see parent class for more information).""" - result = self._default_predictions(domain) + ( + rules_action_name_from_text, + self._prediction_source, + returning_from_unhappy_path_from_text, + ) = self._find_action_from_rules( + tracker, domain, use_text_for_last_user_input=True + ) # Rasa Open Source default actions overrule anything. If users want to achieve # the same, they need to write a rule or make sure that their loop rejects # accordingly. default_action_name = self._find_action_from_default_actions(tracker) - if default_action_name: + + # text has priority over intents including default, + # however loop happy path has priority over rules prediction + if default_action_name and not rules_action_name_from_text: self._prediction_source = DEFAULT_RULES return self._prediction( self._prediction_result(default_action_name, tracker, domain) ) - # A loop has priority over any other rule. + # A loop has priority over any other rule except defaults. # The rules or any other prediction will be applied only if a loop was rejected. # If we are in a loop, and the loop didn't run previously or rejected, we can # simply force predict the loop. loop_happy_path_action_name = self._find_action_from_loop_happy_path(tracker) if loop_happy_path_action_name: self._prediction_source = LOOP_RULES + # this prediction doesn't use user input + # and happy user input anyhow should be ignored during featurization return self._prediction( self._prediction_result(loop_happy_path_action_name, tracker, domain) ) - ( - rules_action_name, - source, - returning_from_unhappy_path, - ) = self._find_action_from_rules(tracker, domain) - # we want to remember the source even if rules didn't predict any action - self._prediction_source = source - - policy_events = [LoopInterrupted(True)] if returning_from_unhappy_path else [] + # predict rules from text first + if rules_action_name_from_text: + return self._prediction_with_unhappy_path( + self._prediction_result(rules_action_name_from_text, tracker, domain), + returning_from_unhappy_path=returning_from_unhappy_path_from_text, + is_end_to_end_prediction=True, + ) - if rules_action_name: - result = self._prediction_result(rules_action_name, tracker, domain) + ( + rules_action_name_from_intent, + # we want to remember the source even if rules didn't predict any action + self._prediction_source, + returning_from_unhappy_path_from_intent, + ) = self._find_action_from_rules( + tracker, domain, use_text_for_last_user_input=False + ) + if rules_action_name_from_intent: + probabilities = self._prediction_result( + rules_action_name_from_intent, tracker, domain + ) + else: + probabilities = self._default_predictions(domain) + + return self._prediction_with_unhappy_path( + probabilities, + returning_from_unhappy_path=( + # returning_from_unhappy_path is a negative condition, + # so `or` should be applied + returning_from_unhappy_path_from_text + or returning_from_unhappy_path_from_intent + ), + is_end_to_end_prediction=False, + ) - return self._prediction(result, events=policy_events) + def _prediction_with_unhappy_path( + self, + probabilities: List[float], + returning_from_unhappy_path: bool, + is_end_to_end_prediction: bool, + ) -> "PolicyPrediction": + return self._prediction( + probabilities, + events=[LoopInterrupted(True)] if returning_from_unhappy_path else [], + is_end_to_end_prediction=is_end_to_end_prediction, + ) def 
_default_predictions(self, domain: Domain) -> List[float]: result = super()._default_predictions(domain) diff --git a/rasa/core/policies/sklearn_policy.py b/rasa/core/policies/sklearn_policy.py index 3c1340d3e4e2..2efcab236ce3 100644 --- a/rasa/core/policies/sklearn_policy.py +++ b/rasa/core/policies/sklearn_policy.py @@ -1,12 +1,18 @@ import json import logging import typing +import scipy.sparse +import numpy as np from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Text, Tuple, Union from collections import defaultdict, OrderedDict -import scipy.sparse -import numpy as np +from sklearn.base import clone +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.preprocessing import LabelEncoder + +import rasa.shared.utils.io import rasa.utils.io as io_utils import rasa.utils.tensorflow.model_data_utils as model_data_utils from rasa.core.constants import DEFAULT_POLICY_PRIORITY @@ -20,15 +26,10 @@ from rasa.core.policies.policy import Policy, PolicyPrediction from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.core.generator import TrackerWithCachedStates -import rasa.shared.utils.io -from sklearn.base import clone -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import GridSearchCV -from sklearn.preprocessing import LabelEncoder from rasa.shared.nlu.constants import ACTION_TEXT, TEXT from rasa.shared.nlu.training_data.features import Features -from rasa.utils.tensorflow.constants import EPOCHS, SENTENCE -from rasa.utils.tensorflow.model_data import Data +from rasa.utils.tensorflow.model_data import Data, FeatureArray +from rasa.utils.tensorflow.constants import SENTENCE # noinspection PyProtectedMember from sklearn.utils import shuffle as sklearn_shuffle @@ -161,20 +162,32 @@ def _fill_in_features_to_max_length( ] return features - def _get_features_for_attribute(self, attribute_data: Dict[Text, List[np.ndarray]]): - """ - Given a list of all features for one attribute, turn it into a numpy array; + def _get_features_for_attribute( + self, attribute_data: Dict[Text, List[FeatureArray]] + ): + """Given a list of all features for one attribute, turn it into a numpy array. 
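The dense/sparse handling in the rewritten method (continued below) boils down to stacking each dialogue's per-turn sentence features and densifying sparse ones; a self-contained sketch:

```python
import numpy as np
import scipy.sparse

def stack_sentence_features(per_dialogue_features, is_sparse):
    if is_sparse:
        # vstack collapses the per-turn list into one matrix per dialogue
        stacked = [scipy.sparse.vstack(turns) for turns in per_dialogue_features]
        return [matrix.toarray() for matrix in stacked]
    return [np.vstack(turns) for turns in per_dialogue_features]

sparse_example = [[scipy.sparse.coo_matrix(np.eye(2))]]
print(stack_sentence_features(sparse_example, is_sparse=True)[0].shape)  # (2, 2)
```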
+ shape_attribute = features[SENTENCE][0][0].shape[-1] (Shape of features of one attribute) + Args: - attribute_data: all features in the attribute stored in a np.array; - Output: + attribute_data: all features in the attribute stored in a FeatureArray + + Returns: 2D np.ndarray with features for an attribute with shape [num_dialogs x (max_history * shape_attribute)] """ sentence_features = attribute_data[SENTENCE][0] - if isinstance(sentence_features[0], scipy.sparse.coo_matrix): + + # vstack serves as removing dimension + if sentence_features.is_sparse: + sentence_features = [ + scipy.sparse.vstack(value) for value in sentence_features + ] sentence_features = [feature.toarray() for feature in sentence_features] + else: + sentence_features = [np.vstack(value) for value in sentence_features] + # MaxHistoryFeaturizer is always used with SkLearn policy; max_history = self.featurizer.max_history features = self._fill_in_features_to_max_length(sentence_features, max_history) @@ -221,7 +234,7 @@ def train( interpreter: NaturalLanguageInterpreter, **kwargs: Any, ) -> None: - tracker_state_features, label_ids = self.featurize_for_training( + tracker_state_features, label_ids, _ = self.featurize_for_training( training_trackers, domain, interpreter, **kwargs ) training_data, zero_state_features = model_data_utils.convert_to_data_format( diff --git a/rasa/core/policies/ted_policy.py b/rasa/core/policies/ted_policy.py index eaa725ef32ff..f06530b26a54 100644 --- a/rasa/core/policies/ted_policy.py +++ b/rasa/core/policies/ted_policy.py @@ -1,16 +1,19 @@ -import copy import logging from pathlib import Path from collections import defaultdict import numpy as np + import rasa.shared.utils.io import tensorflow as tf import tensorflow_addons as tfa -import typing -from typing import Any, List, Optional, Text, Dict, Tuple, Union +from typing import Any, List, Optional, Text, Dict, Tuple, Union, TYPE_CHECKING import rasa.utils.io as io_utils +import rasa.core.actions.action +from rasa.nlu.constants import TOKENS_NAMES +from rasa.nlu.extractors.extractor import EntityExtractor +from rasa.nlu.classifiers.diet_classifier import EntityTagSpec from rasa.shared.core.domain import Domain from rasa.core.featurizers.tracker_featurizers import ( TrackerFeaturizer, @@ -18,19 +21,36 @@ MaxHistoryTrackerFeaturizer, ) from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer -from rasa.shared.nlu.constants import ACTION_TEXT, ACTION_NAME, INTENT, TEXT, ENTITIES +from rasa.shared.nlu.constants import ( + ACTION_TEXT, + ACTION_NAME, + INTENT, + TEXT, + ENTITIES, + VALID_FEATURE_TYPES, + FEATURE_TYPE_SENTENCE, + ENTITY_ATTRIBUTE_TYPE, + ENTITY_TAGS, + EXTRACTOR, +) from rasa.shared.nlu.interpreter import NaturalLanguageInterpreter from rasa.core.policies.policy import Policy, PolicyPrediction from rasa.core.constants import DEFAULT_POLICY_PRIORITY, DIALOGUE -from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS +from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS, ACTION_LISTEN_NAME from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.core.generator import TrackerWithCachedStates -from rasa.utils import train_utils +import rasa.utils.train_utils from rasa.utils.tensorflow.models import RasaModel, TransformerRasaModel -from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature +from rasa.utils.tensorflow.model_data import ( + RasaModelData, + FeatureSignature, + FeatureArray, + Data, +) from rasa.utils.tensorflow.model_data_utils import 
convert_to_data_format from rasa.utils.tensorflow.constants import ( LABEL, + IDS, TRANSFORMER_SIZE, NUM_TRANSFORMER_LAYERS, NUM_HEADS, @@ -69,24 +89,36 @@ UNIDIRECTIONAL_ENCODER, SEQUENCE, SENTENCE, + SEQUENCE_LENGTH, DENSE_DIMENSION, + CONCAT_DIMENSION, + SPARSE_INPUT_DROPOUT, + DENSE_INPUT_DROPOUT, + MASKED_LM, + MASK, + HIDDEN_LAYERS_SIZES, + FEATURIZERS, + ENTITY_RECOGNITION, ) +from rasa.shared.core.events import EntitiesAdded, Event +from rasa.shared.nlu.training_data.message import Message - -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from rasa.shared.nlu.training_data.features import Features logger = logging.getLogger(__name__) -MASK = "mask" +E2E_CONFIDENCE_THRESHOLD = "e2e_confidence_threshold" LABEL_KEY = LABEL -LABEL_SUB_KEY = "ids" +LABEL_SUB_KEY = IDS LENGTH = "length" -POSSIBLE_FEATURE_TYPES = [SEQUENCE, SENTENCE] -FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT] +INDICES = "indices" +SENTENCE_FEATURES_TO_ENCODE = [INTENT, TEXT, ACTION_NAME, ACTION_TEXT] +SEQUENCE_FEATURES_TO_ENCODE = [TEXT, ACTION_TEXT, f"{LABEL}_{ACTION_TEXT}"] LABEL_FEATURES_TO_ENCODE = [f"{LABEL}_{ACTION_NAME}", f"{LABEL}_{ACTION_TEXT}"] STATE_LEVEL_FEATURES = [ENTITIES, SLOTS, ACTIVE_LOOP] +PREDICTION_FEATURES = STATE_LEVEL_FEATURES + SENTENCE_FEATURES_TO_ENCODE + [DIALOGUE] SAVE_MODEL_FILE_NAME = "ted_policy" @@ -112,16 +144,40 @@ class TEDPolicy(Policy): # please make sure to update the docs when changing a default parameter defaults = { # ## Architecture of the used neural network - # Hidden layer sizes for layers before the dialogue and label embedding layers. - # The number of hidden layers is equal to the length of the corresponding - # list. - # TODO add 2 parallel NNs: transformer for text and ffnn for names - DENSE_DIMENSION: 20, + # Hidden layer sizes for layers before the embedding layers for user message + # and labels. + # The number of hidden layers is equal to the length of the corresponding list. + HIDDEN_LAYERS_SIZES: {TEXT: [], ACTION_TEXT: [], f"{LABEL}_{ACTION_TEXT}": []}, + # Dense dimension to use for sparse features. + DENSE_DIMENSION: { + TEXT: 128, + ACTION_TEXT: 128, + f"{LABEL}_{ACTION_TEXT}": 128, + INTENT: 20, + ACTION_NAME: 20, + f"{LABEL}_{ACTION_NAME}": 20, + ENTITIES: 20, + SLOTS: 20, + ACTIVE_LOOP: 20, + }, + # Default dimension to use for concatenating sequence and sentence features. + CONCAT_DIMENSION: {TEXT: 128, ACTION_TEXT: 128, f"{LABEL}_{ACTION_TEXT}": 128}, + # Dimension size of embedding vectors before the dialogue transformer encoder. ENCODING_DIMENSION: 50, - # Number of units in transformer - TRANSFORMER_SIZE: 128, - # Number of transformer layers - NUM_TRANSFORMER_LAYERS: 1, + # Number of units in transformer encoders + TRANSFORMER_SIZE: { + TEXT: 128, + ACTION_TEXT: 128, + f"{LABEL}_{ACTION_TEXT}": 128, + DIALOGUE: 128, + }, + # Number of layers in transformer encoders + NUM_TRANSFORMER_LAYERS: { + TEXT: 1, + ACTION_TEXT: 1, + f"{LABEL}_{ACTION_TEXT}": 1, + DIALOGUE: 1, + }, # Number of attention heads in transformer NUM_HEADS: 4, # If 'True' use key relative embeddings in attention @@ -183,9 +239,16 @@ class TEDPolicy(Policy): # Dropout rate for embedding layers of label, e.g. action, features. DROP_RATE_LABEL: 0.0, # Dropout rate for attention. 
- DROP_RATE_ATTENTION: 0, + DROP_RATE_ATTENTION: 0.0, # Sparsity of the weights in dense layers WEIGHT_SPARSITY: 0.8, + # If 'True' apply dropout to sparse input tensors + SPARSE_INPUT_DROPOUT: True, + # If 'True' apply dropout to dense input tensors + DENSE_INPUT_DROPOUT: True, + # If 'True' random tokens of the input message will be masked and the model + # should predict those tokens. + MASKED_LM: False, # ## Evaluation parameters # How often calculate validation accuracy. # Small values may hurt performance, e.g. model accuracy. @@ -202,6 +265,13 @@ class TEDPolicy(Policy): TENSORBOARD_LOG_LEVEL: "epoch", # Perform model checkpointing CHECKPOINT_MODEL: False, + # Only pick e2e prediction if the policy is confident enough + E2E_CONFIDENCE_THRESHOLD: 0.5, + # Specify what features to use as sequence and sentence features. + # By default all features in the pipeline are used. + FEATURIZERS: [], + # If set to true, entities are predicted in user utterances. + ENTITY_RECOGNITION: True, } @staticmethod @@ -216,7 +286,8 @@ def __init__( priority: int = DEFAULT_POLICY_PRIORITY, max_history: Optional[int] = None, model: Optional[RasaModel] = None, - zero_state_features: Optional[Dict[Text, List["Features"]]] = None, + fake_features: Optional[Dict[Text, List["Features"]]] = None, + entity_tag_specs: Optional[List[EntityTagSpec]] = None, should_finetune: bool = False, **kwargs: Any, ) -> None: @@ -236,19 +307,43 @@ def __init__( self.model = model - self.zero_state_features = zero_state_features or defaultdict(list) + self._entity_tag_specs = entity_tag_specs + + self.fake_features = fake_features or defaultdict(list) + # TED is only e2e if only text is present in fake features, which represent + # all possible input features for current version of this trained ted + self.only_e2e = TEXT in self.fake_features and INTENT not in self.fake_features self._label_data: Optional[RasaModelData] = None self.data_example: Optional[Dict[Text, List[np.ndarray]]] = None def _load_params(self, **kwargs: Dict[Text, Any]) -> None: - self.config = copy.deepcopy(self.defaults) - self.config.update(kwargs) - - self.config = train_utils.check_deprecated_options(self.config) + new_config = rasa.utils.train_utils.check_core_deprecated_options(kwargs) + self.config = rasa.utils.train_utils.override_defaults( + self.defaults, new_config + ) + self.config = rasa.utils.train_utils.update_similarity_type(self.config) + self.config = rasa.utils.train_utils.update_evaluation_parameters(self.config) + + def _create_entity_tag_specs(self) -> List[EntityTagSpec]: + """Create entity tag specifications with their respective tag id mappings.""" + _tag_specs = [] + + tag_id_index_mapping = self.featurizer.state_featurizer.get_entity_tag_ids() + + if tag_id_index_mapping: + _tag_specs.append( + EntityTagSpec( + tag_name=ENTITY_ATTRIBUTE_TYPE, + tags_to_ids=tag_id_index_mapping, + ids_to_tags={ + value: key for key, value in tag_id_index_mapping.items() + }, + num_tags=len(tag_id_index_mapping), + ) + ) - self.config = train_utils.update_similarity_type(self.config) - self.config = train_utils.update_evaluation_parameters(self.config) + return _tag_specs def _create_label_data( self, domain: Domain, interpreter: NaturalLanguageInterpreter @@ -257,33 +352,66 @@ def _create_label_data( state_featurizer = self.featurizer.state_featurizer encoded_all_labels = state_featurizer.encode_all_actions(domain, interpreter) - attribute_data, _ = convert_to_data_format(encoded_all_labels) + attribute_data, _ = convert_to_data_format( + 
encoded_all_labels, featurizers=self.config[FEATURIZERS] + ) label_data = RasaModelData() label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_") + label_data.add_lengths( + f"{LABEL}_{ACTION_TEXT}", + SEQUENCE_LENGTH, + f"{LABEL}_{ACTION_TEXT}", + SEQUENCE, + ) label_ids = np.arange(domain.num_actions) label_data.add_features( - LABEL_KEY, LABEL_SUB_KEY, [np.expand_dims(label_ids, -1)] + LABEL_KEY, + LABEL_SUB_KEY, + [FeatureArray(np.expand_dims(label_ids, -1), number_of_dimensions=2)], ) return label_data, encoded_all_labels + def _create_data_for_entities( + self, entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] + ) -> Optional[Data]: + if not self.config[ENTITY_RECOGNITION]: + return + + # check that there are real entity tags + if entity_tags and any([any(turn_tags) for turn_tags in entity_tags]): + entity_tags_data, _ = convert_to_data_format(entity_tags) + return entity_tags_data + + # there are no "real" entity tags + logger.debug( + f"Entity recognition cannot be performed, " + f"set '{ENTITY_RECOGNITION}' config parameter to 'False'." + ) + self.config[ENTITY_RECOGNITION] = False + def _create_model_data( self, tracker_state_features: List[List[Dict[Text, List["Features"]]]], label_ids: Optional[np.ndarray] = None, + entity_tags: Optional[List[List[Dict[Text, List["Features"]]]]] = None, encoded_all_labels: Optional[List[Dict[Text, List["Features"]]]] = None, ) -> RasaModelData: """Combine all model related data into RasaModelData. Args: - tracker_state_features: a dictionary of attributes (INTENT, TEXT, ACTION_NAME, ACTION_TEXT, - ENTITIES, SLOTS, ACTIVE_LOOP) to a list of features for all dialogue - turns in all training trackers + tracker_state_features: a dictionary of attributes + (INTENT, TEXT, ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, ACTIVE_LOOP) + to a list of features for all dialogue turns in all training trackers label_ids: the label ids (e.g. 
action ids) for every dialogue turn in all training trackers - encoded_all_labels: a list of dictionaries containing attribute features for labels ids + entity_tags: a dictionary of entity type (ENTITY_TAGS) to a list of features + containing entity tag ids for text user inputs otherwise empty dict + for all dialogue turns in all training trackers + encoded_all_labels: a list of dictionaries containing attribute features + for label ids Returns: RasaModelData @@ -291,25 +419,48 @@ def _create_model_data( model_data = RasaModelData(label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY) if label_ids is not None and encoded_all_labels is not None: - label_ids = np.array( [np.expand_dims(seq_label_ids, -1) for seq_label_ids in label_ids] ) - model_data.add_features(LABEL_KEY, LABEL_SUB_KEY, [label_ids]) + model_data.add_features( + LABEL_KEY, + LABEL_SUB_KEY, + [FeatureArray(label_ids, number_of_dimensions=3)], + ) - attribute_data, self.zero_state_features = convert_to_data_format( - tracker_state_features + attribute_data, self.fake_features = convert_to_data_format( + tracker_state_features, featurizers=self.config[FEATURIZERS] ) + + entity_tags_data = self._create_data_for_entities(entity_tags) + if entity_tags_data is not None: + model_data.add_data(entity_tags_data) else: # method is called during prediction attribute_data, _ = convert_to_data_format( - tracker_state_features, self.zero_state_features + tracker_state_features, + self.fake_features, + featurizers=self.config[FEATURIZERS], ) model_data.add_data(attribute_data) - model_data.add_lengths( - DIALOGUE, LENGTH, next(iter(list(attribute_data.keys()))), MASK + model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE) + model_data.add_lengths(ACTION_TEXT, SEQUENCE_LENGTH, ACTION_TEXT, SEQUENCE) + + # add the dialogue lengths + attribute_present = next(iter(list(attribute_data.keys()))) + dialogue_lengths = np.array( + [ + np.size(np.squeeze(f, -1)) + for f in model_data.data[attribute_present][MASK][0] + ] ) + model_data.data[DIALOGUE][LENGTH] = [ + FeatureArray(dialogue_lengths, number_of_dimensions=1) + ] + + # make sure all keys are in the same order during training and prediction + model_data.sort() return model_data @@ -330,7 +481,7 @@ def train( return # dealing with training data - tracker_state_features, label_ids = self.featurize_for_training( + tracker_state_features, label_ids, entity_tags = self.featurize_for_training( training_trackers, domain, interpreter, **kwargs ) @@ -340,7 +491,7 @@ def train( # extract actual training data to feed to model model_data = self._create_model_data( - tracker_state_features, label_ids, encoded_all_labels + tracker_state_features, label_ids, entity_tags, encoded_all_labels ) if model_data.is_empty(): logger.error( @@ -349,6 +500,9 @@ def train( ) return + if self.config[ENTITY_RECOGNITION]: + self._entity_tag_specs = self._create_entity_tag_specs() + # keep one example for persisting and loading self.data_example = model_data.first_data_example() @@ -361,6 +515,7 @@ def train( self.config, isinstance(self.featurizer, MaxHistoryTrackerFeaturizer), self._label_data, + self._entity_tag_specs, ) self.model.fit( @@ -372,6 +527,59 @@ def train( batch_strategy=self.config[BATCH_STRATEGY], ) + def _featurize_tracker_for_e2e( + self, + tracker: DialogueStateTracker, + domain: Domain, + interpreter: NaturalLanguageInterpreter, + ) -> List[List[Dict[Text, List["Features"]]]]: + # construct two examples in the batch to be fed to the model - + # one by featurizing last user text + # and second - an 
optional one (see conditions below), + # the first example in the constructed batch either does not contain user input + # or uses intent or text based on whether TED is e2e only. + tracker_state_features = self.featurizer.create_state_features( + [tracker], domain, interpreter, use_text_for_last_user_input=self.only_e2e + ) + # the second - text, but only after user utterance and if not only e2e + if ( + tracker.latest_action_name == ACTION_LISTEN_NAME + and TEXT in self.fake_features + and not self.only_e2e + ): + tracker_state_features += self.featurizer.create_state_features( + [tracker], domain, interpreter, use_text_for_last_user_input=True + ) + return tracker_state_features + + def _pick_confidence( + self, confidences: np.ndarray, similarities: np.ndarray + ) -> Tuple[np.ndarray, bool]: + # the confidences and similarities have shape (batch-size x number of actions) + # batch-size can only be 1 or 2; + # in the case batch-size==2, the first example contains user intent as features, + # the second - user text as features + if confidences.shape[0] > 2: + raise ValueError( + "We cannot pick a prediction from batches of size larger than 2." + ) + # we use a heuristic to pick the correct prediction + if confidences.shape[0] == 2: + # we use similarities to pick the appropriate input, + # since it seems to be a more accurate measure, + # the policy is trained to maximize the similarity, not the confidence + if ( + np.max(confidences[1]) > self.config[E2E_CONFIDENCE_THRESHOLD] + # TODO maybe compare confidences is better + and np.max(similarities[1]) > np.max(similarities[0]) + ): + return confidences[1], True + + return confidences[0], False + + # by default the first example in a batch is the one to use for prediction + return confidences[0], self.only_e2e + def predict_action_probabilities( self, tracker: DialogueStateTracker, @@ -387,21 +595,81 @@ return self._prediction(self._default_predictions(domain)) # create model data from tracker - tracker_state_features = self.featurizer.create_state_features( - [tracker], domain, interpreter + tracker_state_features = self._featurize_tracker_for_e2e( + tracker, domain, interpreter ) model_data = self._create_model_data(tracker_state_features) output = self.model.predict(model_data) - confidence = output["action_scores"].numpy() - # remove batch dimension and take the last prediction in the sequence - confidence = confidence[0, -1, :] + # take the last prediction in the sequence + similarities = output["similarities"].numpy()[:, -1, :] + confidences = output["action_scores"].numpy()[:, -1, :] + # take the correct prediction from the batch + confidence, is_e2e_prediction = self._pick_confidence(confidences, similarities) if self.config[LOSS_TYPE] == SOFTMAX and self.config[RANKING_LENGTH] > 0: - confidence = train_utils.normalize(confidence, self.config[RANKING_LENGTH]) + confidence = rasa.utils.train_utils.normalize( + confidence, self.config[RANKING_LENGTH] + ) + + optional_events = self._create_optional_event_for_entities( + output, is_e2e_prediction, interpreter, tracker + ) + + return self._prediction( + confidence.tolist(), + is_end_to_end_prediction=is_e2e_prediction, + optional_events=optional_events, + ) + + def _create_optional_event_for_entities( + self, + prediction_output: Dict[Text, tf.Tensor], + is_e2e_prediction: bool, + interpreter: NaturalLanguageInterpreter, + tracker: DialogueStateTracker, + ) -> Optional[List[Event]]: + if tracker.latest_action_name != ACTION_LISTEN_NAME or not is_e2e_prediction: + # entities belong
only to the last user message + # and only if user text was used for prediction, + # a user message always comes after action listen + return - return self._prediction(confidence.tolist()) + if not self.config[ENTITY_RECOGNITION]: + # entity recognition is not turned on, no entities can be predicted + return + + # The batch dimension of entity prediction is not the same as batch size, + # rather it is the number of last (if max history featurizer else all) + # text inputs in the batch + # therefore, in order to pick entities from the latest user message + # we need to pick entities from the last batch dimension of entity prediction + ( + predicted_tags, + confidence_values, + ) = rasa.utils.train_utils.entity_label_to_tags( + prediction_output, self._entity_tag_specs, prediction_index=-1 + ) + + if ENTITY_ATTRIBUTE_TYPE not in predicted_tags: + # no entities detected + return + + # entities belong to the last message of the tracker + # convert the predicted tags to actual entities + text = tracker.latest_message.text + parsed_message = interpreter.featurize_message(Message(data={TEXT: text})) + tokens = parsed_message.get(TOKENS_NAMES[TEXT]) + entities = EntityExtractor.convert_predictions_into_entities( + text, tokens, predicted_tags, confidences=confidence_values + ) + + # add the extractor name + for entity in entities: + entity[EXTRACTOR] = "TEDPolicy" + + return [EntitiesAdded(entities)] def persist(self, path: Union[Text, Path]) -> None: """Persists the policy to a storage.""" @@ -434,14 +702,24 @@ def persist(self, path: Union[Text, Path]) -> None: model_path / f"{SAVE_MODEL_FILE_NAME}.data_example.pkl", self.data_example ) io_utils.pickle_dump( - model_path / f"{SAVE_MODEL_FILE_NAME}.zero_state_features.pkl", - self.zero_state_features, + model_path / f"{SAVE_MODEL_FILE_NAME}.fake_features.pkl", + self.fake_features, ) io_utils.pickle_dump( model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl", dict(self._label_data.data), ) + entity_tag_specs = ( + [tag_spec._asdict() for tag_spec in self._entity_tag_specs] + if self._entity_tag_specs + else [] + ) + rasa.shared.utils.io.dump_obj_as_json_to_file( + model_path / f"{SAVE_MODEL_FILE_NAME}.entity_tag_specs.json", + entity_tag_specs, + ) + @classmethod def load( cls, @@ -475,19 +753,35 @@ def load( label_data = io_utils.pickle_load( model_path / f"{SAVE_MODEL_FILE_NAME}.label_data.pkl" ) - zero_state_features = io_utils.pickle_load( - model_path / f"{SAVE_MODEL_FILE_NAME}.zero_state_features.pkl" + fake_features = io_utils.pickle_load( + model_path / f"{SAVE_MODEL_FILE_NAME}.fake_features.pkl" ) label_data = RasaModelData(data=label_data) meta = io_utils.pickle_load(model_path / f"{SAVE_MODEL_FILE_NAME}.meta.pkl") priority = io_utils.json_unpickle( model_path / f"{SAVE_MODEL_FILE_NAME}.priority.pkl" ) + entity_tag_specs = rasa.shared.utils.io.read_json_file( + model_path / f"{SAVE_MODEL_FILE_NAME}.entity_tag_specs.json" + ) + entity_tag_specs = [ + EntityTagSpec( + tag_name=tag_spec["tag_name"], + ids_to_tags={ + int(key): value for key, value in tag_spec["ids_to_tags"].items() + }, + tags_to_ids={ + key: int(value) for key, value in tag_spec["tags_to_ids"].items() + }, + num_tags=tag_spec["num_tags"], + ) + for tag_spec in entity_tag_specs + ] model_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data ) - meta = train_utils.update_similarity_type(meta) + meta = rasa.utils.train_utils.update_similarity_type(meta) meta[EPOCHS] = epoch_override @@ -496,24 +790,25 @@ def load( 
model_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data ) - meta = train_utils.update_similarity_type(meta) + meta = rasa.utils.train_utils.update_similarity_type(meta) meta[EPOCHS] = epoch_override @@ -496,24 +790,25 @@ model_data_example, data_signature=model_data_example.get_signature(), config=meta, - max_history_tracker_featurizer_used=isinstance( - featurizer, MaxHistoryTrackerFeaturizer - ), + # during prediction we don't care about previous dialogue turns, + # so to save computation time, use only the last one + use_only_last_dialogue_turns=True, label_data=label_data, + entity_tag_specs=entity_tag_specs, finetune_mode=should_finetune, ) if not should_finetune: # build the graph for prediction - - features_to_select = STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE] predict_data_example = RasaModelData( label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data={ feature_name: features for feature_name, features in model_data_example.items() - if feature_name in features_to_select + if feature_name + # we need to remove label features for prediction if they are present + in PREDICTION_FEATURES }, ) model.build_for_predict(predict_data_example) @@ -522,7 +817,8 @@ def load( featurizer=featurizer, priority=priority, model=model, - zero_state_features=zero_state_features, + fake_features=fake_features, + entity_tag_specs=entity_tag_specs, should_finetune=should_finetune, **meta, ) @@ -533,26 +829,42 @@ def __init__( self, data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]], config: Dict[Text, Any], - max_history_tracker_featurizer_used: bool, + use_only_last_dialogue_turns: bool, label_data: RasaModelData, + entity_tag_specs: Optional[List[EntityTagSpec]], ) -> None: + """Initializes the TED model. + + Args: + data_signature: the data signature of the input data + config: the model configuration + use_only_last_dialogue_turns: if `True`, only the last dialogue turn + will be used + label_data: the label data + entity_tag_specs: the entity tag specifications + """ super().__init__("TED", config, data_signature, label_data) - self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used + self.use_only_last_dialogue_turns = use_only_last_dialogue_turns self.predict_data_signature = { feature_name: features for feature_name, features in data_signature.items() - if feature_name in STATE_LEVEL_FEATURES + FEATURES_TO_ENCODE + [DIALOGUE] + if feature_name in PREDICTION_FEATURES } + self._entity_tag_specs = entity_tag_specs + # optimizer self.optimizer = tf.keras.optimizers.Adam() # metrics self.action_loss = tf.keras.metrics.Mean(name="loss") self.action_acc = tf.keras.metrics.Mean(name="acc") + self.entity_loss = tf.keras.metrics.Mean(name="e_loss") + self.entity_f1 = tf.keras.metrics.Mean(name="e_f1") self.metrics_to_log += ["loss", "acc"] + if self.config[ENTITY_RECOGNITION]: + self.metrics_to_log += ["e_loss", "e_f1"] # needed for efficient prediction self.all_labels_embed: Optional[tf.Tensor] = None @@ -579,17 +891,27 @@ def _check_data(self) -> None: f"Cannot train '{self.__class__.__name__}' model."
) + # ---CREATING LAYERS HELPERS--- + def _prepare_layers(self) -> None: for name in self.data_signature.keys(): self._prepare_sparse_dense_layer_for(name, self.data_signature) + if name in SEQUENCE_FEATURES_TO_ENCODE: + self._prepare_sequence_layers(name) self._prepare_encoding_layers(name) for name in self.label_signature.keys(): self._prepare_sparse_dense_layer_for(name, self.label_signature) + if name in SEQUENCE_FEATURES_TO_ENCODE: + self._prepare_sequence_layers(name) self._prepare_encoding_layers(name) self._prepare_transformer_layer( - DIALOGUE, self.config[DROP_RATE_DIALOGUE], self.config[DROP_RATE_ATTENTION] + DIALOGUE, + self.config[NUM_TRANSFORMER_LAYERS][DIALOGUE], + self.config[TRANSFORMER_SIZE][DIALOGUE], + self.config[DROP_RATE_DIALOGUE], + self.config[DROP_RATE_ATTENTION], ) self._prepare_embed_layers(DIALOGUE) @@ -597,19 +919,23 @@ def _prepare_layers(self) -> None: self._prepare_dot_product_loss(LABEL, self.config[SCALE_LOSS]) + if self.config[ENTITY_RECOGNITION]: + self._prepare_entity_recognition_layers() + def _prepare_sparse_dense_layer_for( self, name: Text, signature: Dict[Text, Dict[Text, List[FeatureSignature]]] ) -> None: - """Prepare the sparse dense layer for the given attribute name. It is used to - combine the sparse and dense features of the attribute at the beginning of - the model. + """Prepares the sparse dense layer for the given attribute name. + + It is used to combine the sparse and dense features of the attribute at the + beginning of the model. Args: name: the attribute name signature: data signature """ - for feature_type in POSSIBLE_FEATURE_TYPES: - if name not in signature or feature_type not in signature[name]: + for feature_type in VALID_FEATURE_TYPES: + if feature_type not in signature[name]: # features for feature type are not present continue @@ -621,7 +947,7 @@ def _prepare_sparse_dense_layer_for( self._prepare_sparse_dense_layers( signature[name][feature_type], f"{name}_{feature_type}", - self.config[DENSE_DIMENSION], + self.config[DENSE_DIMENSION][name], ) def _prepare_encoding_layers(self, name: Text) -> None: @@ -631,34 +957,59 @@ def _prepare_encoding_layers(self, name: Text) -> None: Args: name: attribute name """ - feature_type = SENTENCE # create encoding layers only for the features which should be encoded; - if name not in FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE: + if name not in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE: return # check that there are SENTENCE features for the attribute name in data - if name in FEATURES_TO_ENCODE and feature_type not in self.data_signature[name]: + if ( + name in SENTENCE_FEATURES_TO_ENCODE + and FEATURE_TYPE_SENTENCE not in self.data_signature[name] + ): return # same for label_data if ( name in LABEL_FEATURES_TO_ENCODE - and feature_type not in self.label_signature[name] + and FEATURE_TYPE_SENTENCE not in self.label_signature[name] ): return self._prepare_ffnn_layer( - f"{name}_{feature_type}", + f"{name}", [self.config[ENCODING_DIMENSION]], self.config[DROP_RATE_DIALOGUE], + prefix="encoding_layer", ) + # ---GRAPH BUILDING HELPERS--- + + @staticmethod + def _compute_dialogue_indices( + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]] + ) -> None: + dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32) + # wrap in a list, because that's the structure of tf_batch_data + tf_batch_data[DIALOGUE][INDICES] = [ + ( + tf.map_fn( + tf.range, + dialogue_lengths, + fn_output_signature=tf.RaggedTensorSpec( + shape=[None], dtype=tf.int32 + ), + ) + 
).values + ] + def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_KEY][LABEL_SUB_KEY][0] - - all_labels_encoded = { - key: self._encode_features_per_attribute(self.tf_label_data, key) - for key in self.tf_label_data.keys() - if key != LABEL_KEY - } + # labels cannot have all features "fake" + all_labels_encoded = {} + for key in self.tf_label_data.keys(): + if key != LABEL_KEY: + attribute_features, _, _ = self._encode_real_features_per_attribute( + self.tf_label_data, key + ) + all_labels_encoded[key] = attribute_features if ( all_labels_encoded.get(f"{LABEL_KEY}_{ACTION_TEXT}") is not None @@ -680,75 +1031,375 @@ def _create_all_labels_embed(self) -> Tuple[tf.Tensor, tf.Tensor]: return all_label_ids, all_labels_embed - def _emebed_dialogue( - self, dialogue_in: tf.Tensor, sequence_lengths: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor]: + def _embed_dialogue( + self, + dialogue_in: tf.Tensor, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: """Create dialogue level embedding and mask.""" - - mask = self._compute_mask(sequence_lengths) + dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32) + mask = self._compute_mask(dialogue_lengths) dialogue_transformed = self._tf_layers[f"transformer.{DIALOGUE}"]( dialogue_in, 1 - mask, self._training ) dialogue_transformed = tfa.activations.gelu(dialogue_transformed) - if self.max_history_tracker_featurizer_used: + if self.use_only_last_dialogue_turns: # pick last vector if max history featurizer is used dialogue_transformed = tf.expand_dims( - self._last_token(dialogue_transformed, sequence_lengths), 1 + self._last_token(dialogue_transformed, dialogue_lengths), 1 ) - mask = tf.expand_dims(self._last_token(mask, sequence_lengths), 1) + mask = tf.expand_dims(self._last_token(mask, dialogue_lengths), 1) dialogue_embed = self._tf_layers[f"embed.{DIALOGUE}"](dialogue_transformed) - return dialogue_embed, mask + return dialogue_embed, mask, dialogue_transformed def _encode_features_per_attribute( self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text - ) -> Optional[tf.Tensor]: - """ - Encodes features for a given attribute + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + # The input is a 4D tensor of shape + # (batch-size x dialogue-len x sequence-len x units), represented in 3D as + # (sum of dialogue history lengths for all tensors in the batch x + # max sequence length x number of features). + + # However, some dialogue turns contain non-existent state features, + # e.g. `intent` and `text` features are mutually exclusive, + # just as `action_name` and `action_text` are mutually exclusive, + # and some dialogue turns don't contain any `slots`. + # In order to create full 4D tensors, we created "fake" zero features for + # these non-existent state features and filtered them out during batch + # generation. Therefore the first dimensions differ between attributes. + # It can also happen that some batches don't contain "real" features at all, + # e.g. a large number of stories don't contain any `slots`. + # In that case the actual input tensors will be empty. + # Since we need actual numbers to create dialogue turn features, we create + # zero tensors in `_encode_fake_features_per_attribute` for these attributes.
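+ # (A plain Python `if` cannot be used for this check: inside a traced + # TensorFlow graph the batch dimension is dynamic and unknown at trace time, + # so the emptiness check has to happen in-graph via `tf.cond` on `tf.shape(...)`.)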
+ return tf.cond( + tf.shape(tf_batch_data[attribute][SENTENCE][0])[0] > 0, + lambda: self._encode_real_features_per_attribute(tf_batch_data, attribute), + lambda: self._encode_fake_features_per_attribute(tf_batch_data, attribute), + ) + + def _get_dense_units( + self, attribute_features_list: List[tf.Tensor], attribute: Text + ) -> int: + # TODO this should be done in corresponding layers once in init + units = 0 + for f in attribute_features_list: + if isinstance(f, tf.SparseTensor): + units += self.config[DENSE_DIMENSION][attribute] + else: + units += f.shape[-1] + return units + + def _get_concat_units( + self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text + ) -> int: + # TODO this should be done in corresponding layers once in init + # calculate the dimension of the concatenated sequence and sentence features + sentence_units = self._get_dense_units( + tf_batch_data[attribute][SENTENCE], attribute + ) + sequence_units = self._get_dense_units( + tf_batch_data[attribute][SEQUENCE], attribute + ) + + if sequence_units and not sentence_units: + return sequence_units + + if sentence_units and not sequence_units: + return sentence_units + + if sentence_units != sequence_units: + return self.config[CONCAT_DIMENSION][TEXT] + + return sentence_units + + def _encode_fake_features_per_attribute( + self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + # we need to create real zero tensors with the appropriate batch and dialogue + # dimensions, because they are passed to the dialogue transformer + attribute_mask = tf_batch_data[attribute][MASK][0] + + batch_dim = tf.shape(attribute_mask)[0] + dialogue_dim = tf.shape(attribute_mask)[1] + if attribute in set(SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE): + units = self.config[ENCODING_DIMENSION] + else: + units = self._get_dense_units(tf_batch_data[attribute][SENTENCE], attribute) + + attribute_features = tf.zeros( + (batch_dim, dialogue_dim, units), dtype=tf.float32 + ) + if attribute == TEXT: + # if the input features are fake, we don't process them further, + # but we need to calculate the correct last dimension (units) so that + # TF can infer the last shape of the tensors + if self.config[NUM_TRANSFORMER_LAYERS][TEXT] > 0: + text_transformer_units = self.config[TRANSFORMER_SIZE][TEXT] + elif self.config[HIDDEN_LAYERS_SIZES][TEXT]: + text_transformer_units = self.config[HIDDEN_LAYERS_SIZES][TEXT][-1] + else: + text_transformer_units = self._get_concat_units( + tf_batch_data, attribute + ) + + text_transformer_output = tf.zeros( + (0, 0, text_transformer_units), dtype=tf.float32 + ) + text_sequence_lengths = tf.zeros((0, 1), dtype=tf.int32) + else: + # simulate None with an empty tensor of zeros + text_transformer_output = tf.zeros((0,)) + text_sequence_lengths = tf.zeros((0,)) + + return attribute_features, text_transformer_output, text_sequence_lengths + + @staticmethod + def _create_last_dialogue_turns_mask( + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text + ) -> tf.Tensor: + # Since `use_only_last_dialogue_turns` is True, + # we need to find the locations of the last dialogue turns in the + # (combined batch dimension and dialogue length,) dimension, + # so that we can use `_sequence_lengths` as a boolean mask to pick + # which ones are "real" textual input in these last dialogue turns. + + # In order to do that we can use the given `dialogue_lengths`.
+ # For example: + # If we have `dialogue_lengths = [2, 1, 3]`, then + # `dialogue_indices = [0, 1, 0, 0, 1, 2]`. Here we can see that `0` + # always indicates the first dialogue turn, + # which means that the previous dialogue turn is the last turn of its + # dialogue. + # Combining this with the fact that the last element in + # `dialogue_indices` is always the last dialogue turn, we can add + # a `0` to the end, getting + # `_dialogue_indices = [0, 1, 0, 0, 1, 2, 0]`. + # Then removing the first element, + # `_last_dialogue_turn_inverse_indicator = [1, 0, 0, 1, 2, 0]`, + # we see that `0` points to the last dialogue turn. + # We convert all positive numbers to `True` and take + # the inverse mask to get + # `last_dialogue_mask = [0, 1, 1, 0, 0, 1]`, + # which precisely corresponds to the fact that the first dialogue has + # length 2, the second length 1 and the third length 3. + last_dialogue_turn_mask = tf.math.logical_not( + tf.cast( + tf.concat( + [ + tf_batch_data[DIALOGUE][INDICES][0], + tf.zeros((1,), dtype=tf.int32), + ], + axis=0, + )[1:], + dtype=tf.bool, + ) + ) + # get only the indices of real inputs + return tf.boolean_mask( + last_dialogue_turn_mask, + tf.reshape(tf_batch_data[attribute][SEQUENCE_LENGTH][0], (-1,)), + ) + + def _encode_real_features_per_attribute( + self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], attribute: Text + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + """Encodes features for a given attribute. + Args: + tf_batch_data: dictionary mapping every attribute to its features and masks + attribute: the attribute we will encode features for + (e.g., ACTION_NAME, INTENT) + Returns: + A tensor combining all features for `attribute`, the output of the + text sequence transformer and the text sequence lengths + """ + # simulate None with an empty tensor of zeros + text_transformer_output = tf.zeros((0,)) + text_sequence_lengths = tf.zeros((0,)) + + if attribute in SEQUENCE_FEATURES_TO_ENCODE: + # sequence_lengths contains `0` for "fake" features, while + # tf_batch_data[attribute] contains only "real" features + sequence_lengths = tf_batch_data[attribute][SEQUENCE_LENGTH][0] + # extract only nonzero lengths and cast to int + sequence_lengths = tf.cast( + tf.boolean_mask(sequence_lengths, sequence_lengths), dtype=tf.int32 + ) + # boolean mask returns a flat tensor + sequence_lengths = tf.expand_dims(sequence_lengths, axis=-1) + + mask_sequence_text = tf.squeeze( + self._compute_mask(sequence_lengths), axis=1 + ) + # add 1 to sequence lengths to account for sentence features + sequence_lengths += 1 + mask_text = tf.squeeze(self._compute_mask(sequence_lengths), axis=1) + + attribute_features, _, _, _ = self._create_sequence( + tf_batch_data[attribute][SEQUENCE], + tf_batch_data[attribute][SENTENCE], + mask_sequence_text, + mask_text, + attribute, + sparse_dropout=self.config[SPARSE_INPUT_DROPOUT], + dense_dropout=self.config[DENSE_INPUT_DROPOUT], + masked_lm_loss=self.config[MASKED_LM], + sequence_ids=False, + ) - if not tf_batch_data[attribute]: - return None + if attribute == TEXT: + text_transformer_output = attribute_features + text_sequence_lengths = sequence_lengths + + if self.use_only_last_dialogue_turns: + # get the location of all last dialogue inputs + last_dialogue_turns_mask = self._create_last_dialogue_turns_mask( + tf_batch_data, attribute + ) + # pick outputs that correspond to the last dialogue turns + text_transformer_output = tf.boolean_mask( + text_transformer_output, last_dialogue_turns_mask + ) + text_sequence_lengths = tf.boolean_mask( + text_sequence_lengths,
last_dialogue_turns_mask + ) + + # resulting attribute features will have shape + # (combined batch dimension and dialogue length x 1 x units) + attribute_features = tf.expand_dims( + self._last_token( + attribute_features, tf.squeeze(sequence_lengths, axis=-1) + ), + axis=1, + ) - attribute_mask = tf_batch_data[attribute][MASK][0] - # TODO transformer has to be used to process sequence features - attribute_features = self._combine_sparse_dense_features( - tf_batch_data[attribute][SENTENCE], - f"{attribute}_{SENTENCE}", - mask=attribute_mask, - ) + else: + # resulting attribute features will have shape + # (combined batch dimension and dialogue length x 1 x units) + attribute_features = self._combine_sparse_dense_features( + tf_batch_data[attribute][SENTENCE], f"{attribute}_{SENTENCE}" + ) - if attribute in FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE: - attribute_features = self._tf_layers[f"ffnn.{attribute}_{SENTENCE}"]( + if attribute in SENTENCE_FEATURES_TO_ENCODE + LABEL_FEATURES_TO_ENCODE: + attribute_features = self._tf_layers[f"encoding_layer.{attribute}"]( attribute_features ) - return attribute_features * attribute_mask + # attribute features have shape + # (combined batch dimension and dialogue length x 1 x units); + # convert them back to their original shape of + # (batch size x dialogue length x units) + attribute_features = self._convert_to_original_shape( + attribute_features, tf_batch_data, attribute + ) + + return attribute_features, text_transformer_output, text_sequence_lengths + + @staticmethod + def _convert_to_original_shape( + attribute_features: tf.Tensor, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + attribute: Text, + ) -> tf.Tensor: + """Transforms attribute features back to their original shape. + + Given shape: (combined batch and dialogue dimension x 1 x units) + Original shape: (batch x dialogue length x units) + + Args: + attribute_features: the "real" features to convert + tf_batch_data: dictionary mapping every attribute to its features and + masks; it provides the attribute mask (the positions of "real" + features in the dialogue, shape (batch-size x dialogue_len x 1)) + and the actual dialogue lengths (shape (batch-size,)) + attribute: the attribute whose features are converted + + Returns: + The converted attribute features + """ + # in order to convert the attribute features with shape + # (combined batch-size and dialogue length x 1 x units) + # to a shape of (batch-size x dialogue length x units) + # we use tf.scatter_nd. Therefore, we need the target shape and the indices + # mapping the values of attribute features to their positions in the + # resulting tensor.
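+ # For illustration only (hypothetical values): with batch_dim = 2, + # dialogue_dim = 3, dialogue_lengths = [3, 2] (so + # dialogue_indices = [0, 1, 2, 0, 1]) and + # attribute_mask = [[1, 0, 1], [1, 1, 0]], we get + # non_fake_dialogue_lengths = [2, 2], batch_indices = [0, 0, 1, 1] and + # masked dialogue_indices = [0, 2, 0, 1], hence + # indices = [[0, 0], [0, 2], [1, 0], [1, 1]]: each of the four "real" feature + # rows is scattered back to its (batch, dialogue) position, while "fake" + # positions remain zero.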
+ + # attribute_mask has shape batch x dialogue_len x 1 + attribute_mask = tf_batch_data[attribute][MASK][0] + + if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES: + dialogue_lengths = tf.cast( + tf_batch_data[DIALOGUE][LENGTH][0], dtype=tf.int32 + ) + dialogue_indices = tf_batch_data[DIALOGUE][INDICES][0] + else: + # for labels, dialogue length is a fake dim and equal to 1 + dialogue_lengths = tf.ones((tf.shape(attribute_mask)[0],), dtype=tf.int32) + dialogue_indices = tf.zeros((tf.shape(attribute_mask)[0],), dtype=tf.int32) + + batch_dim = tf.shape(attribute_mask)[0] + dialogue_dim = tf.shape(attribute_mask)[1] + units = attribute_features.shape[-1] + + # attribute_mask has shape (batch x dialogue_len x 1), remove last dimension + attribute_mask = tf.cast(tf.squeeze(attribute_mask, axis=-1), dtype=tf.int32) + # sum of attribute mask contains number of dialogue turns with "real" features + non_fake_dialogue_lengths = tf.reduce_sum(attribute_mask, axis=-1) + # create the batch indices + batch_indices = tf.repeat(tf.range(batch_dim), non_fake_dialogue_lengths) + + # attribute_mask has shape (batch x dialogue_len x 1), while + # dialogue_indices has shape (combined_dialogue_len,) + # in order to find positions of real input we need to flatten + # attribute mask to (combined_dialogue_len,) + dialogue_indices_mask = tf.boolean_mask( + attribute_mask, tf.sequence_mask(dialogue_lengths, dtype=tf.int32) + ) + # pick only those indices that contain "real" input + dialogue_indices = tf.boolean_mask(dialogue_indices, dialogue_indices_mask) + + indices = tf.stack([batch_indices, dialogue_indices], axis=1) + + shape = tf.convert_to_tensor([batch_dim, dialogue_dim, units]) + attribute_features = tf.squeeze(attribute_features, axis=1) + + return tf.scatter_nd(indices, attribute_features, shape) def _process_batch_data( self, tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]] - ) -> tf.Tensor: - """Encodes batch data; combines intent and text and action name and action text if both are present + ) -> Tuple[tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: + """Encodes batch data. + + Combines intent and text and action name and action text if both are present. 
+ Args: + tf_batch_data: dictionary mapping every attribute to its features and masks + Returns: + The encoding of all features in the batch combined, the output of the + text sequence transformer and the text sequence lengths. + """ + # encode each attribute present in tf_batch_data + text_transformer_output = None + text_sequence_lengths = None + batch_encoded = {} + for attribute in tf_batch_data.keys(): + if attribute in SENTENCE_FEATURES_TO_ENCODE + STATE_LEVEL_FEATURES: + ( + attribute_features, + _text_transformer_output, + _text_sequence_lengths, + ) = self._encode_features_per_attribute(tf_batch_data, attribute) + + batch_encoded[attribute] = attribute_features + if attribute == TEXT: + text_transformer_output = _text_transformer_output + text_sequence_lengths = _text_sequence_lengths + + # if both action text and action name are present, combine them; otherwise, + # return the one which is present if ( batch_encoded.get(ACTION_TEXT) is not None @@ -780,7 +1431,134 @@ def _process_batch_data( batch_features = tf.concat(batch_features, axis=-1) - return batch_features + return batch_features, text_transformer_output, text_sequence_lengths + + def _reshape_for_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + # The first dim of the output of the text sequence transformer is the same + # as the number of "real" features for `text` at the last dialogue turns + # (let's call it `N`), + # which corresponds to the first dim of the tag ids tensor. + # To calculate the loss for entities we need the output of the text + # sequence transformer (shape: N x sequence length x units), + # the output of the dialogue transformer + # (shape: batch size x dialogue length x units) and the tag ids for the + # entities (shape: N x (sequence length - 1) x units). + # In order to process the tensors, they need to have the same shape. + # Convert the output of the dialogue transformer to shape + # (N x 1 x units). + + # Note: The CRF layer cannot handle 4D tensors. E.g.
we cannot use the shape + # (batch size x dialogue length x sequence length x units). + + # convert the output of the dialogue transformer + # to shape (real entity dim x 1 x units) + attribute_mask = tf_batch_data[TEXT][MASK][0] + dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32) + + if self.use_only_last_dialogue_turns: + # pick outputs that correspond to the last dialogue turns + attribute_mask = tf.expand_dims( + self._last_token(attribute_mask, dialogue_lengths), axis=1 + ) + dialogue_transformer_output = tf.boolean_mask( + dialogue_transformer_output, tf.squeeze(attribute_mask, axis=-1) + ) + + # boolean mask removed axis=1, add it back + dialogue_transformer_output = tf.expand_dims( + dialogue_transformer_output, axis=1 + ) + + # broadcast the dialogue transformer output sequence-length-times to get the + # same shape as the text sequence transformer output + dialogue_transformer_output = tf.tile( + dialogue_transformer_output, (1, tf.shape(text_transformer_output)[1], 1) + ) + + # concat the output of the dialogue transformer to the output of the text + # sequence transformer (adding context), + # resulting shape (N x sequence length x 2 * units), + # where N = number of "real" features for `text` at the last dialogue turns + text_transformed = tf.concat( + [text_transformer_output, dialogue_transformer_output], axis=-1 + ) + + text_mask = tf.squeeze(self._compute_mask(text_sequence_lengths), axis=1) + # add zeros to match the shape of text_transformed, because + # max sequence length might differ, since it is calculated dynamically + # based on a subset of sequence lengths + sequence_diff = tf.shape(text_transformed)[1] - tf.shape(text_mask)[1] + text_mask = tf.pad(text_mask, [[0, 0], [0, sequence_diff], [0, 0]]) + + # remove additional dims and sentence features + text_sequence_lengths = tf.reshape(text_sequence_lengths, (-1,)) - 1 + + return text_transformed, text_mask, text_sequence_lengths + + # ---TRAINING--- + + def _batch_loss_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> tf.Tensor: + # It could happen that some batches don't contain "real" features for `text`, + # e.g. a large number of stories are intent-only. + # In that case the actual `text_transformer_output` will be empty. + # We cannot create a loss with empty tensors. + # Since we need actual numbers to create a full loss, we output + # zero in this case.
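+ # (`tf.cond` requires both branches to return matching structures and dtypes, + # which is why the fallback branch below returns a scalar `tf.constant(0.0)` + # of the same dtype as the real entity loss.)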
+ return tf.cond( + tf.shape(text_transformer_output)[0] > 0, + lambda: self._real_batch_loss_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ), + lambda: tf.constant(0.0), + ) + + def _real_batch_loss_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> tf.Tensor: + + text_transformed, text_mask, text_sequence_lengths = self._reshape_for_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ) + + tag_ids = tf_batch_data[ENTITY_TAGS][IDS][0] + # add a zero (no entity) for the sentence features to match the shape of inputs + sequence_diff = tf.shape(text_transformed)[1] - tf.shape(tag_ids)[1] + tag_ids = tf.pad(tag_ids, [[0, 0], [0, sequence_diff], [0, 0]]) + + loss, f1, _ = self._calculate_entity_loss( + text_transformed, + tag_ids, + text_mask, + text_sequence_lengths, + ENTITY_ATTRIBUTE_TYPE, + ) + + self.entity_loss.update_state(loss) + self.entity_f1.update_state(f1) + + return loss @staticmethod def _get_labels_embed( @@ -797,21 +1575,36 @@ def _get_labels_embed( def batch_loss( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> tf.Tensor: - tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) + """Calculates the loss for the given batch. - dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32) + Args: + batch_in: The batch. + + Returns: + The loss of the given batch. + """ + tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) + self._compute_dialogue_indices(tf_batch_data) all_label_ids, all_labels_embed = self._create_all_labels_embed() label_ids = tf_batch_data[LABEL_KEY][LABEL_SUB_KEY][0] labels_embed = self._get_labels_embed(label_ids, all_labels_embed) - dialogue_in = self._process_batch_data(tf_batch_data) - dialogue_embed, dialogue_mask = self._emebed_dialogue( - dialogue_in, dialogue_lengths - ) + ( + dialogue_in, + text_transformer_output, + text_sequence_lengths, + ) = self._process_batch_data(tf_batch_data) + ( + dialogue_embed, + dialogue_mask, + dialogue_transformer_output, + ) = self._embed_dialogue(dialogue_in, tf_batch_data) dialogue_mask = tf.squeeze(dialogue_mask, axis=-1) + losses = [] + loss, acc = self._tf_layers[f"loss.{LABEL}"]( dialogue_embed, labels_embed, @@ -820,28 +1613,65 @@ def batch_loss( all_label_ids, dialogue_mask, ) + losses.append(loss) + + if ( + self.config[ENTITY_RECOGNITION] + and text_transformer_output is not None + and text_sequence_lengths is not None + ): + losses.append( + self._batch_loss_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ) + ) self.action_loss.update_state(loss) self.action_acc.update_state(acc) - return loss + return tf.math.add_n(losses) + + # ---PREDICTION--- + + def prepare_for_predict(self) -> None: + """Prepares the model for prediction.""" + _, self.all_labels_embed = self._create_all_labels_embed() def batch_predict( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> Dict[Text, tf.Tensor]: - tf_batch_data = self.batch_to_model_data_format( - batch_in, self.predict_data_signature - ) + """Predicts the output of the given batch. - dialogue_lengths = tf.cast(tf_batch_data[DIALOGUE][LENGTH][0], tf.int32) + Args: + batch_in: The batch. + Returns: + The output to predict. 
+ """ if self.all_labels_embed is None: - _, self.all_labels_embed = self._create_all_labels_embed() + raise ValueError( + "The model was not prepared for prediction. " + "Call `prepare_for_predict` first." + ) - dialogue_in = self._process_batch_data(tf_batch_data) - dialogue_embed, dialogue_mask = self._emebed_dialogue( - dialogue_in, dialogue_lengths + tf_batch_data = self.batch_to_model_data_format( + batch_in, self.predict_data_signature ) + self._compute_dialogue_indices(tf_batch_data) + + ( + dialogue_in, + text_transformer_output, + text_sequence_lengths, + ) = self._process_batch_data(tf_batch_data) + ( + dialogue_embed, + dialogue_mask, + dialogue_transformer_output, + ) = self._embed_dialogue(dialogue_in, tf_batch_data) dialogue_mask = tf.squeeze(dialogue_mask, axis=-1) sim_all = self._tf_layers[f"loss.{LABEL}"].sim( @@ -853,5 +1683,73 @@ def batch_predict( scores = self._tf_layers[f"loss.{LABEL}"].confidence_from_sim( sim_all, self.config[SIMILARITY_TYPE] ) + predictions = {"action_scores": scores, "similarities": sim_all} + + if ( + self.config[ENTITY_RECOGNITION] + and text_transformer_output is not None + and text_sequence_lengths is not None + ): + pred_ids, confidences = self._batch_predict_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ) + name = ENTITY_ATTRIBUTE_TYPE + predictions[f"e_{name}_ids"] = pred_ids + predictions[f"e_{name}_scores"] = confidences + + return predictions + + def _batch_predict_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor]: + # It could happen that current prediction turn don't contain + # "real" features for `text`, + # Therefore actual `text_transformer_output` will be empty. + # We cannot predict entities with empty tensors. + # Since we need to output some tensors of the same shape, we output + # zero tensors. 
+ return tf.cond( + tf.shape(text_transformer_output)[0] > 0, + lambda: self._real_batch_predict_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ), + lambda: ( + # the output is of shape (batch_size, max_seq_len) + tf.zeros(tf.shape(text_transformer_output)[:2], dtype=tf.int32), + tf.zeros(tf.shape(text_transformer_output)[:2], dtype=tf.float32), + ), + ) + + def _real_batch_predict_entities( + self, + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + dialogue_transformer_output: tf.Tensor, + text_transformer_output: tf.Tensor, + text_sequence_lengths: tf.Tensor, + ) -> Tuple[tf.Tensor, tf.Tensor]: + + text_transformed, _, text_sequence_lengths = self._reshape_for_entities( + tf_batch_data, + dialogue_transformer_output, + text_transformer_output, + text_sequence_lengths, + ) + + name = ENTITY_ATTRIBUTE_TYPE + + _logits = self._tf_layers[f"embed.{name}.logits"](text_transformed) + + return self._tf_layers[f"crf.{name}"](_logits, text_sequence_lengths) + - return {"action_scores": scores} +# pytype: enable=key-error diff --git a/rasa/core/processor.py b/rasa/core/processor.py index 2e2b9a32ba8f..2cdadd8ed8b4 100644 --- a/rasa/core/processor.py +++ b/rasa/core/processor.py @@ -7,6 +7,7 @@ import rasa.shared.utils.io import rasa.core.actions.action from rasa.core import jobs +from rasa.core.actions.action import Action from rasa.core.channels.channel import ( CollectingOutputChannel, OutputChannel, @@ -151,7 +152,7 @@ def predict_next_with_tracker( scores = [ {"action": a, "score": p} - for a, p in zip(self.domain.action_names, prediction.probabilities) + for a, p in zip(self.domain.action_names_or_texts, prediction.probabilities) ] return { "scores": scores, @@ -510,7 +511,7 @@ def _check_for_unseen_features(self, parse_data: Dict[Text, Any]) -> None: ) def _get_action(self, action_name) -> Optional[rasa.core.actions.action.Action]: - return rasa.core.actions.action.action_for_name( + return rasa.core.actions.action.action_for_name_or_text( action_name, self.domain, self.action_endpoint ) @@ -766,7 +767,7 @@ async def _run_action( ) events = [] - self._log_action_on_tracker(tracker, action.name(), events, prediction) + self._log_action_on_tracker(tracker, action, events, prediction) if action.name() != ACTION_LISTEN_NAME and not action.name().startswith( UTTER_PREFIX ): @@ -809,7 +810,7 @@ def _warn_about_new_slots(self, tracker, action_name, events) -> None: def _log_action_on_tracker( self, tracker: DialogueStateTracker, - action_name: Text, + action: Action, events: Optional[List[Event]], prediction: PolicyPrediction, ) -> None: @@ -819,23 +820,19 @@ def _log_action_on_tracker( if events is None: events = [] - self._warn_about_new_slots(tracker, action_name, events) + self._warn_about_new_slots(tracker, action.name(), events) action_was_rejected_manually = any( isinstance(event, ActionExecutionRejected) for event in events ) - if action_name is not None and not action_was_rejected_manually: + if not action_was_rejected_manually: logger.debug(f"Policy prediction ended with events '{prediction.events}'.") tracker.update_with_events(prediction.events, self.domain) # log the action and its produced events - tracker.update( - ActionExecuted( - action_name, prediction.policy_name, prediction.max_confidence - ) - ) + tracker.update(action.event_for_successful_execution(prediction)) - logger.debug(f"Action '{action_name}' ended with events '{events}'.") + logger.debug(f"Action '{action.name()}' ended with events '{events}'.") 
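+ # (Using `action.event_for_successful_execution(...)` instead of constructing + # `ActionExecuted(action_name, ...)` by hand presumably allows end-to-end + # actions to record their `action_text` on the tracker as well.)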
tracker.update_with_events(events, self.domain) def _has_session_expired(self, tracker: DialogueStateTracker) -> bool: @@ -883,7 +880,7 @@ def _get_next_action_probabilities( followup_action = tracker.followup_action if followup_action: tracker.clear_followup_action() - if followup_action in self.domain.action_names: + if followup_action in self.domain.action_names_or_texts: return PolicyPrediction.for_action_name( self.domain, followup_action, FOLLOWUP_ACTION ) diff --git a/rasa/core/test.py b/rasa/core/test.py index 667861ac9e3d..deeaa092abc7 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -240,14 +240,26 @@ class WronglyPredictedAction(ActionExecuted): def __init__( self, action_name_target: Text, + action_text_target: Text, action_name_prediction: Text, policy: Optional[Text] = None, confidence: Optional[float] = None, timestamp: Optional[float] = None, metadata: Optional[Dict] = None, ) -> None: + """Creates an event for a wrongly predicted action. + + See the docstring of the parent class `ActionExecuted` for more information. + """ self.action_name_prediction = action_name_prediction - super().__init__(action_name_target, policy, confidence, timestamp, metadata) + super().__init__( + action_name_target, + policy, + confidence, + timestamp, + metadata, + action_text=action_text_target, + ) def inline_comment(self) -> Text: """A comment attached to this event. Used during dumping.""" @@ -296,17 +308,18 @@ def __init__(self, event: UserUttered, eval_store: EvaluationStore) -> None: def inline_comment(self) -> Text: """A comment attached to this event. Used during dumping.""" - from rasa.shared.core.events import md_format_message + from rasa.shared.core.events import format_message - predicted_message = md_format_message( + predicted_message = format_message( self.text, self.predicted_intent, self.predicted_entities ) return f"predicted: {self.predicted_intent}: {predicted_message}" def as_story_string(self, e2e: bool = True) -> Text: - from rasa.shared.core.events import md_format_message + """Returns text representation of event.""" + from rasa.shared.core.events import format_message - correct_message = md_format_message( + correct_message = format_message( self.text, self.intent.get("name"), self.entities ) return ( @@ -480,7 +493,9 @@ def _collect_action_executed_predictions( action_executed_eval_store = EvaluationStore() - gold = event.action_name or event.action_text + gold_action_name = event.action_name + gold_action_text = event.action_text + gold = gold_action_name or gold_action_text if circuit_breaker_tripped: prediction = PolicyPrediction([], policy_name=None) @@ -514,7 +529,8 @@ def _collect_action_executed_predictions( if action_executed_eval_store.has_prediction_target_mismatch(): partial_tracker.update( WronglyPredictedAction( - gold, + gold_action_name, + gold_action_text, predicted, prediction.policy_name, prediction.max_confidence, diff --git a/rasa/core/training/interactive.py b/rasa/core/training/interactive.py index a025062cdd4e..7b3677895389 100644 --- a/rasa/core/training/interactive.py +++ b/rasa/core/training/interactive.py @@ -534,7 +534,7 @@ def add_user_cell(data, cell): for idx, event in enumerate(applied_events): if isinstance(event, ActionExecuted): - bot_column.append(colored(event.action_name, "autocyan")) + bot_column.append(colored(str(event), "autocyan")) if event.confidence is not None: bot_column[-1] += colored(f" {event.confidence:03.2f}", "autowhite") diff --git a/rasa/core/training/story_conflict.py 
b/rasa/core/training/story_conflict.py index b1e76bcd8e3f..0f5080e4309d 100644 --- a/rasa/core/training/story_conflict.py +++ b/rasa/core/training/story_conflict.py @@ -1,13 +1,21 @@ from collections import defaultdict import logging -from typing import Dict, Generator, List, NamedTuple, Optional, Text, Tuple +import json +from typing import Dict, Generator, List, NamedTuple, Optional, Text, Tuple, Any from rasa.core.featurizers.tracker_featurizers import MaxHistoryTrackerFeaturizer from rasa.shared.core.constants import ACTION_LISTEN_NAME, PREVIOUS_ACTION, USER from rasa.shared.core.domain import Domain, PREV_PREFIX, State, SubState from rasa.shared.core.events import ActionExecuted, Event from rasa.shared.core.generator import TrackerWithCachedStates -from rasa.shared.nlu.constants import INTENT + +from rasa.nlu.model import Trainer +from rasa.nlu.components import Component +from rasa.nlu.tokenizers.tokenizer import Tokenizer +from rasa.nlu.config import RasaNLUModelConfig +from rasa.shared.nlu.constants import TEXT +from rasa.shared.nlu.training_data.message import Message +import rasa.shared.utils.io logger = logging.getLogger(__name__) @@ -124,7 +132,8 @@ class TrackerEventStateTuple(NamedTuple): @property def sliced_states_hash(self) -> int: - return hash(str(list(self.sliced_states))) + """Returns the hash of the sliced states.""" + return hash(json.dumps(self.sliced_states, sort_keys=True)) def _get_length_of_longest_story( @@ -146,6 +155,7 @@ def find_story_conflicts( trackers: List[TrackerWithCachedStates], domain: Domain, max_history: Optional[int] = None, + nlu_config: Optional[RasaNLUModelConfig] = None, ) -> List[StoryConflict]: """Generates `StoryConflict` objects, describing conflicts in the given trackers. @@ -153,34 +163,70 @@ def find_story_conflicts( trackers: Trackers in which to search for conflicts. domain: The domain. max_history: The maximum history length to be taken into account. + nlu_config: NLU config. Returns: StoryConflict objects. """ - if not max_history: - max_history = _get_length_of_longest_story(trackers, domain) + if max_history: + logger.info( + f"Considering the preceding {max_history} turns for conflict analysis." + ) + else: + logger.info("Considering all preceding turns for conflict analysis.") - logger.info(f"Considering the preceding {max_history} turns for conflict analysis.") + tokenizer = _get_tokenizer_from_nlu_config(nlu_config) # We do this in two steps, to reduce memory consumption: # Create a 'state -> list of actions' dict, where the state is # represented by its hash conflicting_state_action_mapping = _find_conflicting_states( - trackers, domain, max_history + trackers, domain, max_history, tokenizer ) # Iterate once more over all states and note the (unhashed) state, # for which a conflict occurs conflicts = _build_conflicts_from_states( - trackers, domain, max_history, conflicting_state_action_mapping + trackers, domain, max_history, conflicting_state_action_mapping, tokenizer, ) return conflicts +def _get_tokenizer_from_nlu_config( + nlu_config: Optional[RasaNLUModelConfig] = None, +) -> Optional[Tokenizer]: + """Extracts the first Tokenizer in the NLU pipeline. + + Args: + nlu_config: NLU Config. + + Returns: + The first Tokenizer in the NLU pipeline, if any. 
+ """ + if not nlu_config: + return None + + pipeline: List[Component] = Trainer(nlu_config, skip_validation=True).pipeline + tokenizer: Optional[Tokenizer] = None + for component in pipeline: + if isinstance(component, Tokenizer): + if tokenizer: + rasa.shared.utils.io.raise_warning( + "The pipeline contains more than one tokenizer. " + "Only the first tokenizer will be used for story validation.", + ) + tokenizer = component + + return tokenizer + + def _find_conflicting_states( - trackers: List[TrackerWithCachedStates], domain: Domain, max_history: int + trackers: List[TrackerWithCachedStates], + domain: Domain, + max_history: Optional[int], + tokenizer: Optional[Tokenizer], ) -> Dict[int, Optional[List[Text]]]: """Identifies all states from which different actions follow. @@ -188,6 +234,7 @@ def _find_conflicting_states( trackers: Trackers that contain the states. domain: The domain object. max_history: Number of turns to take into account for the state descriptions. + tokenizer: A tokenizer to tokenize the user messages. Returns: A dictionary mapping state-hashes to a list of actions that follow from each state. @@ -195,10 +242,11 @@ def _find_conflicting_states( # Create a 'state -> list of actions' dict, where the state is # represented by its hash state_action_mapping = defaultdict(list) - for element in _sliced_states_iterator(trackers, domain, max_history): + for element in _sliced_states_iterator(trackers, domain, max_history, tokenizer): hashed_state = element.sliced_states_hash - if element.event.as_story_string() not in state_action_mapping[hashed_state]: - state_action_mapping[hashed_state] += [element.event.as_story_string()] + current_hash = hash(element.event) + if current_hash not in state_action_mapping[hashed_state]: + state_action_mapping[hashed_state] += [current_hash] # Keep only conflicting `state_action_mapping`s return { @@ -211,8 +259,9 @@ def _find_conflicting_states( def _build_conflicts_from_states( trackers: List[TrackerWithCachedStates], domain: Domain, - max_history: int, + max_history: Optional[int], conflicting_state_action_mapping: Dict[int, Optional[List[Text]]], + tokenizer: Optional[Tokenizer], ) -> List["StoryConflict"]: """Builds a list of `StoryConflict` objects for each given conflict. @@ -222,6 +271,7 @@ def _build_conflicts_from_states( max_history: Number of turns to take into account for the state descriptions. conflicting_state_action_mapping: A dictionary mapping state-hashes to a list of actions that follow from each state. + tokenizer: A tokenizer to tokenize the user messages. 
Returns: A list of `StoryConflict` objects that describe inconsistencies in the story @@ -230,7 +280,7 @@ def _build_conflicts_from_states( # Iterate once more over all states and note the (unhashed) state, # for which a conflict occurs conflicts = {} - for element in _sliced_states_iterator(trackers, domain, max_history): + for element in _sliced_states_iterator(trackers, domain, max_history, tokenizer): hashed_state = element.sliced_states_hash if hashed_state in conflicting_state_action_mapping: @@ -238,8 +288,7 @@ def _build_conflicts_from_states( conflicts[hashed_state] = StoryConflict(element.sliced_states) conflicts[hashed_state].add_conflicting_action( - action=element.event.as_story_string(), - story_name=element.tracker.sender_id, + action=str(element.event), story_name=element.tracker.sender_id, ) # Return list of conflicts that arise from unpredictable actions @@ -252,7 +301,10 @@ def _build_conflicts_from_states( def _sliced_states_iterator( - trackers: List[TrackerWithCachedStates], domain: Domain, max_history: int + trackers: List[TrackerWithCachedStates], + domain: Domain, + max_history: Optional[int], + tokenizer: Optional[Tokenizer], ) -> Generator[TrackerEventStateTuple, None, None]: """Creates an iterator over sliced states. @@ -263,6 +315,7 @@ def _sliced_states_iterator( trackers: List of trackers. domain: Domain (used for tracker.past_states). max_history: Assumed `max_history` value for slicing. + tokenizer: A tokenizer to tokenize the user messages. Yields: A (tracker, event, sliced_states) triplet. @@ -276,10 +329,30 @@ def _sliced_states_iterator( sliced_states = MaxHistoryTrackerFeaturizer.slice_state_history( states[: idx + 1], max_history ) + if tokenizer: + _apply_tokenizer_to_states(tokenizer, sliced_states) + # ToDo: deal with oov (different tokens can lead to identical features if some of those tokens are out of vocabulary for all featurizers) yield TrackerEventStateTuple(tracker, event, sliced_states) idx += 1 +def _apply_tokenizer_to_states(tokenizer: Tokenizer, states: List[State]) -> None: + """Split each user text into tokens and concatenate them again. + + Args: + tokenizer: A tokenizer to tokenize the user messages. + states: The states to be tokenized. + """ + for state in states: + if USER in state: + state[USER][TEXT] = " ".join( + token.text + for token in tokenizer.tokenize( + Message({TEXT: state[USER][TEXT]}), TEXT + ) + ) + + def _get_previous_event( state: Optional[State], ) -> Tuple[Optional[Text], Optional[Text]]: diff --git a/rasa/core/training/training.py b/rasa/core/training/training.py index 2deb89163418..dde083940857 100644 --- a/rasa/core/training/training.py +++ b/rasa/core/training/training.py @@ -14,13 +14,13 @@ def _find_events_after_actions( trackers: List["DialogueStateTracker"], ) -> Dict[Text, Set["Event"]]: - """Creates a dictionary of action names and events that follow these actions. + """Creates a mapping of action names / texts and events that follow these actions. Args: trackers: the list of trackers Returns: - a dictionary of action names and events that follow these actions + A mapping of action names / texts and events that follow these actions. 
""" events_after_actions = defaultdict(set) diff --git a/rasa/exceptions.py b/rasa/exceptions.py index 6c07487d32a7..93794bbc8284 100644 --- a/rasa/exceptions.py +++ b/rasa/exceptions.py @@ -35,4 +35,5 @@ def __init__(self, timestamp: float) -> None: super(PublishingError, self).__init__() def __str__(self) -> Text: + """Returns string representation of exception.""" return str(self.timestamp) diff --git a/rasa/model.py b/rasa/model.py index fd224eb2c2af..ff057cf08596 100644 --- a/rasa/model.py +++ b/rasa/model.py @@ -444,7 +444,8 @@ def move_model(source: Text, target: Text) -> bool: def should_retrain( new_fingerprint: Fingerprint, old_model: Text, - train_path: Union[Text, Path], + train_path: Text, + has_e2e_examples: bool = False, force_training: bool = False, ) -> FingerprintComparisonResult: """Check which components of a model should be retrained. @@ -453,6 +454,7 @@ def should_retrain( new_fingerprint: The fingerprint of the new model to be trained. old_model: Path to the old zipped model file. train_path: Path to the directory in which the new model will be trained. + has_e2e_examples: Whether the new training data contains e2e examples. force_training: Indicates if the model needs to be retrained even if the data has not changed. Returns: @@ -482,6 +484,10 @@ def should_retrain( force_training=force_training, ) + # We should retrain core if nlu data changes and there are e2e stories. + if has_e2e_examples and fingerprint_comparison.should_retrain_nlu(): + fingerprint_comparison.core = True + core_merge_failed = False if not fingerprint_comparison.should_retrain_core(): target_path = os.path.join(train_path, DEFAULT_CORE_SUBDIRECTORY_NAME) diff --git a/rasa/nlu/classifiers/diet_classifier.py b/rasa/nlu/classifiers/diet_classifier.py index 9ffa2b477d0d..d1f26fec25fd 100644 --- a/rasa/nlu/classifiers/diet_classifier.py +++ b/rasa/nlu/classifiers/diet_classifier.py @@ -7,7 +7,6 @@ import os import scipy.sparse import tensorflow as tf -import tensorflow_addons as tfa from typing import Any, Dict, List, Optional, Text, Tuple, Union, Type, NamedTuple @@ -18,12 +17,15 @@ from rasa.nlu.components import Component from rasa.nlu.classifiers.classifier import IntentClassifier from rasa.nlu.extractors.extractor import EntityExtractor -from rasa.nlu.test import determine_token_labels from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.utils import train_utils from rasa.utils.tensorflow import layers from rasa.utils.tensorflow.models import RasaModel, TransformerRasaModel -from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature +from rasa.utils.tensorflow.model_data import ( + RasaModelData, + FeatureSignature, + FeatureArray, +) from rasa.nlu.constants import TOKENS_NAMES from rasa.shared.nlu.constants import ( TEXT, @@ -43,6 +45,7 @@ from rasa.nlu.model import Metadata from rasa.utils.tensorflow.constants import ( LABEL, + IDS, HIDDEN_LAYERS_SIZES, SHARE_HIDDEN_LAYERS, TRANSFORMER_SIZE, @@ -89,19 +92,18 @@ CHECKPOINT_MODEL, SEQUENCE, SENTENCE, + SEQUENCE_LENGTH, DENSE_DIMENSION, + MASK, ) - logger = logging.getLogger(__name__) SPARSE = "sparse" DENSE = "dense" -SEQUENCE_LENGTH = f"{SEQUENCE}_lengths" LABEL_KEY = LABEL -LABEL_SUB_KEY = "ids" -TAG_IDS = "tag_ids" +LABEL_SUB_KEY = IDS POSSIBLE_TAGS = [ENTITY_ATTRIBUTE_TYPE, ENTITY_ATTRIBUTE_ROLE, ENTITY_ATTRIBUTE_GROUP] @@ -137,8 +139,7 @@ def required_components(cls) -> List[Type[Component]]: # ## Architecture of the used neural network # Hidden layer sizes for layers before the embedding layers for user 
message # and labels. - # The number of hidden layers is equal to the length of the corresponding - # list. + # The number of hidden layers is equal to the length of the corresponding list. HIDDEN_LAYERS_SIZES: {TEXT: [], LABEL: []}, # Whether to share the hidden layer weights between user message and labels. SHARE_HIDDEN_LAYERS: False, @@ -172,7 +173,7 @@ def required_components(cls) -> List[Type[Component]]: # ## Parameters for embeddings # Dimension size of embedding vectors EMBEDDING_DIMENSION: 20, - # Default dense dimension to use if no dense features are present. + # Dense dimension to use for sparse features. DENSE_DIMENSION: {TEXT: 128, LABEL: 20}, # Default dimension to use for concatenating sequence and sentence features. CONCAT_DIMENSION: {TEXT: 128, LABEL: 20}, @@ -329,7 +330,7 @@ def __init__( self.model = model self._label_data: Optional[RasaModelData] = None - self._data_example: Optional[Dict[Text, List[np.ndarray]]] = None + self._data_example: Optional[Dict[Text, List[FeatureArray]]] = None self.split_entities_config = self.init_split_entities() @@ -509,12 +510,11 @@ def _extract_features( def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: """Checks if features have same dimensionality if hidden layers are shared.""" - if self.component_config.get(SHARE_HIDDEN_LAYERS): - num_text_sentence_features = model_data.feature_dimension(TEXT, SENTENCE) - num_label_sentence_features = model_data.feature_dimension(LABEL, SENTENCE) - num_text_sequence_features = model_data.feature_dimension(TEXT, SEQUENCE) - num_label_sequence_features = model_data.feature_dimension(LABEL, SEQUENCE) + num_text_sentence_features = model_data.number_of_units(TEXT, SENTENCE) + num_label_sentence_features = model_data.number_of_units(LABEL, SENTENCE) + num_text_sequence_features = model_data.number_of_units(TEXT, SEQUENCE) + num_label_sequence_features = model_data.number_of_units(LABEL, SEQUENCE) if (0 < num_text_sentence_features != num_label_sentence_features > 0) or ( 0 < num_text_sequence_features != num_label_sequence_features > 0 @@ -526,9 +526,8 @@ def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None: def _extract_labels_precomputed_features( self, label_examples: List[Message], attribute: Text = INTENT - ) -> Tuple[List[np.ndarray], List[np.ndarray]]: + ) -> Tuple[List[FeatureArray], List[FeatureArray]]: """Collects precomputed encodings.""" - features = defaultdict(list) for e in label_examples: @@ -540,23 +539,31 @@ def _extract_labels_precomputed_features( sentence_features = [] for feature_name, feature_value in features.items(): if SEQUENCE in feature_name: - sequence_features.append(np.array(features[feature_name])) + sequence_features.append( + FeatureArray(np.array(feature_value), number_of_dimensions=3) + ) else: - sentence_features.append(np.array(features[feature_name])) + sentence_features.append( + FeatureArray(np.array(feature_value), number_of_dimensions=3) + ) - return (sequence_features, sentence_features) + return sequence_features, sentence_features @staticmethod def _compute_default_label_features( labels_example: List[Message], - ) -> List[np.ndarray]: + ) -> List[FeatureArray]: """Computes one-hot representation for the labels.""" - logger.debug("No label features found. 
Computing default label features.") eye_matrix = np.eye(len(labels_example), dtype=np.float32) # add sequence dimension to one-hot labels - return [np.array([np.expand_dims(a, 0) for a in eye_matrix])] + return [ + FeatureArray( + np.array([np.expand_dims(a, 0) for a in eye_matrix]), + number_of_dimensions=3, + ) + ] def _create_label_data( self, @@ -571,7 +578,6 @@ def _create_label_data( If the features are already computed, fetch them from the message object else compute a one hot encoding for the label as the feature vector. """ - # Collect one example for each label labels_idx_examples = [] for label_name, idx in label_id_dict.items(): @@ -609,16 +615,23 @@ def _create_label_data( # explicitly add last dimension to label_ids # to track correctly dynamic sequences label_data.add_features( - LABEL_KEY, LABEL_SUB_KEY, [np.expand_dims(label_ids, -1)] + LABEL_KEY, + LABEL_SUB_KEY, + [FeatureArray(np.expand_dims(label_ids, -1), number_of_dimensions=2)], ) label_data.add_lengths(LABEL, SEQUENCE_LENGTH, LABEL, SEQUENCE) return label_data - def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: + def _use_default_label_features(self, label_ids: np.ndarray) -> List[FeatureArray]: all_label_features = self._label_data.get(LABEL, SENTENCE)[0] - return [np.array([all_label_features[label_id] for label_id in label_ids])] + return [ + FeatureArray( + np.array([all_label_features[label_id] for label_id in label_ids]), + number_of_dimensions=all_label_features.number_of_dimensions, + ) + ] def _create_model_data( self, @@ -627,90 +640,105 @@ def _create_model_data( label_attribute: Optional[Text] = None, training: bool = True, ) -> RasaModelData: - """Prepare data for training and create a RasaModelData object""" + """Prepare data for training and create a RasaModelData object.""" + from rasa.utils.tensorflow import model_data_utils + + attributes_to_consider = [TEXT] + if training and self.component_config[INTENT_CLASSIFICATION]: + # we don't have any intent labels during prediction, just add them during + # training + attributes_to_consider.append(label_attribute) + if training and self.component_config[ENTITY_RECOGNITION]: + # we don't have any entity tags during prediction, just add them during + # training + attributes_to_consider.append(ENTITIES) + + if training and label_attribute is not None: + # only use those training examples that have the label_attribute set + # during training + training_data = [ + example for example in training_data if label_attribute in example.data + ] - # TODO: simplify model data creation - # convert training data into a list of attribute to features and reuse some - # of the methods of TED (they most likely need to change a bit) + if not training_data: + # no training data are present to train + return RasaModelData() - features = defaultdict(lambda: defaultdict(list)) - label_ids = [] + features_for_examples = model_data_utils.featurize_training_examples( + training_data, + attributes_to_consider, + entity_tag_specs=self._entity_tag_specs, + featurizers=self.component_config[FEATURIZERS], + bilou_tagging=self.component_config[BILOU_FLAG], + ) + attribute_data, _ = model_data_utils.convert_to_data_format( + features_for_examples, consider_dialogue_dimension=False + ) - for example in training_data: - if label_attribute is None or example.get(label_attribute): - text_features = self._extract_features(example, TEXT) - for feature_key, feature_value in text_features.items(): - features[TEXT][feature_key].append(feature_value) + model_data = 
RasaModelData( + label_key=self.label_key, label_sub_key=self.label_sub_key + ) + model_data.add_data(attribute_data) + model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE) - # only add features for intent labels during training - if training and example.get(label_attribute): - label_features = self._extract_features(example, label_attribute) - for feature_key, feature_value in label_features.items(): - features[LABEL][feature_key].append(feature_value) + self._add_label_features( + model_data, training_data, label_attribute, label_id_dict, training + ) - if label_id_dict: - label_ids.append(label_id_dict[example.get(label_attribute)]) + # make sure all keys are in the same order during training and prediction + # as we rely on the order of key and sub-key when constructing the actual + # tensors from the model data + model_data.sort() - # only add tag_ids during training - if training and self.component_config.get(ENTITY_RECOGNITION): - for tag_spec in self._entity_tag_specs: - features[ENTITIES][tag_spec.tag_name].append( - self._tag_ids_for_crf(example, tag_spec) - ) + return model_data - model_data = RasaModelData( - label_key=self.label_key, label_sub_key=self.label_sub_key - ) - for key, attribute_features in features.items(): - for sub_key, _features in attribute_features.items(): - sub_key = sub_key.replace(f"{SPARSE}_", "").replace(f"{DENSE}_", "") - model_data.add_features(key, sub_key, [np.array(_features)]) + def _add_label_features( + self, + model_data: RasaModelData, + training_data: List[Message], + label_attribute: Text, + label_id_dict: Dict[Text, int], + training: bool = True, + ): + label_ids = [] + if training and self.component_config[INTENT_CLASSIFICATION]: + for example in training_data: + if example.get(label_attribute): + label_ids.append(label_id_dict[example.get(label_attribute)]) + + # explicitly add last dimension to label_ids + # to track correctly dynamic sequences + model_data.add_features( + LABEL_KEY, + LABEL_SUB_KEY, + [FeatureArray(np.expand_dims(label_ids, -1), number_of_dimensions=2)], + ) if ( label_attribute - and model_data.does_feature_not_exist(LABEL, SENTENCE) - and model_data.does_feature_not_exist(LABEL, SEQUENCE) + and model_data.does_feature_not_exist(label_attribute, SENTENCE) + and model_data.does_feature_not_exist(label_attribute, SEQUENCE) ): # no label features are present, get default features from _label_data model_data.add_features( LABEL, SENTENCE, self._use_default_label_features(np.array(label_ids)) ) - # explicitly add last dimension to label_ids - # to track correctly dynamic sequences - model_data.add_features( - LABEL_KEY, LABEL_SUB_KEY, [np.expand_dims(label_ids, -1)] - ) + # as label_attribute can have different values, e.g. 
INTENT or RESPONSE, + # copy over the features to the LABEL key to make + # it easier to access the label features inside the model itself + model_data.update_key(label_attribute, SENTENCE, LABEL, SENTENCE) + model_data.update_key(label_attribute, SEQUENCE, LABEL, SEQUENCE) + model_data.update_key(label_attribute, MASK, LABEL, MASK) - model_data.add_lengths(TEXT, SEQUENCE_LENGTH, TEXT, SEQUENCE) model_data.add_lengths(LABEL, SEQUENCE_LENGTH, LABEL, SEQUENCE) - return model_data - - def _tag_ids_for_crf(self, example: Message, tag_spec: EntityTagSpec) -> np.ndarray: - """Create a np.array containing the tag ids of the given message.""" - if self.component_config[BILOU_FLAG]: - _tags = bilou_utils.bilou_tags_to_ids( - example, tag_spec.tags_to_ids, tag_spec.tag_name - ) - else: - _tags = [] - for token in example.get(TOKENS_NAMES[TEXT]): - _tag = determine_token_labels( - token, example.get(ENTITIES), attribute_key=tag_spec.tag_name - ) - _tags.append(tag_spec.tags_to_ids[_tag]) - - # transpose to have seq_len x 1 - return np.array([_tags]).T - # train helpers def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData: """Prepares data for training. Performs sanity checks on training data, extracts encodings for labels. """ - if self.component_config[BILOU_FLAG]: bilou_utils.apply_bilou_schema(training_data) @@ -868,7 +896,9 @@ def _predict_entities( if predict_out is None: return [] - predicted_tags, confidence_values = self._entity_label_to_tags(predict_out) + predicted_tags, confidence_values = train_utils.entity_label_to_tags( + predict_out, self._entity_tag_specs, self.component_config[BILOU_FLAG] + ) entities = self.convert_predictions_into_entities( message.get(TEXT), @@ -883,31 +913,8 @@ def _predict_entities( return entities - def _entity_label_to_tags( - self, predict_out: Dict[Text, Any] - ) -> Tuple[Dict[Text, List[Text]], Dict[Text, List[float]]]: - predicted_tags = {} - confidence_values = {} - - for tag_spec in self._entity_tag_specs: - predictions = predict_out[f"e_{tag_spec.tag_name}_ids"].numpy() - confidences = predict_out[f"e_{tag_spec.tag_name}_scores"].numpy() - confidences = [float(c) for c in confidences[0]] - tags = [tag_spec.ids_to_tags[p] for p in predictions[0]] - - if self.component_config[BILOU_FLAG]: - tags, confidences = bilou_utils.ensure_consistent_bilou_tagging( - tags, confidences - ) - - predicted_tags[tag_spec.tag_name] = tags - confidence_values[tag_spec.tag_name] = confidences - - return predicted_tags, confidence_values - def process(self, message: Message, **kwargs: Any) -> None: """Return the most likely label and its similarity to the input.""" - out = self._predict(message) if self.component_config[INTENT_CLASSIFICATION]: @@ -1056,7 +1063,7 @@ def _load_model( entity_tag_specs: List[EntityTagSpec], label_data: RasaModelData, meta: Dict[Text, Any], - data_example: Dict[Text, Dict[Text, List[np.ndarray]]], + data_example: Dict[Text, Dict[Text, List[FeatureArray]]], model_dir: Text, finetune_mode: bool = False, ) -> "RasaModel": @@ -1291,39 +1298,6 @@ def _prepare_layers(self) -> None: if self.config[ENTITY_RECOGNITION]: self._prepare_entity_recognition_layers() - def _prepare_input_layers(self, name: Text) -> None: - self._prepare_ffnn_layer( - name, self.config[HIDDEN_LAYERS_SIZES][name], self.config[DROP_RATE] - ) - - for feature_type in [SENTENCE, SEQUENCE]: - if ( - name not in self.data_signature - or feature_type not in self.data_signature[name] - ): - continue - - self._prepare_sparse_dense_dropout_layers( - 
f"{name}_{feature_type}", self.config[DROP_RATE] - ) - self._prepare_sparse_dense_layers( - self.data_signature[name][feature_type], - f"{name}_{feature_type}", - self.config[DENSE_DIMENSION][name], - ) - self._prepare_ffnn_layer( - f"{name}_{feature_type}", - [self.config[CONCAT_DIMENSION][name]], - self.config[DROP_RATE], - prefix="concat_layer", - ) - - def _prepare_sequence_layers(self, name: Text) -> None: - self._prepare_input_layers(name) - self._prepare_transformer_layer( - name, self.config[DROP_RATE], self.config[DROP_RATE_ATTENTION] - ) - def _prepare_mask_lm_layers(self, name: Text) -> None: self._tf_layers[f"{name}_input_mask"] = layers.InputMask() @@ -1340,116 +1314,6 @@ def _prepare_label_classification_layers(self) -> None: self._prepare_dot_product_loss(LABEL, self.config[SCALE_LOSS]) - def _prepare_entity_recognition_layers(self) -> None: - for tag_spec in self._entity_tag_specs: - name = tag_spec.tag_name - num_tags = tag_spec.num_tags - self._tf_layers[f"embed.{name}.logits"] = layers.Embed( - num_tags, self.config[REGULARIZATION_CONSTANT], f"logits.{name}" - ) - self._tf_layers[f"crf.{name}"] = layers.CRF( - num_tags, self.config[REGULARIZATION_CONSTANT], self.config[SCALE_LOSS] - ) - self._tf_layers[f"embed.{name}.tags"] = layers.Embed( - self.config[EMBEDDING_DIMENSION], - self.config[REGULARIZATION_CONSTANT], - f"tags.{name}", - ) - - def _features_as_seq_ids( - self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text - ) -> Optional[tf.Tensor]: - """Creates dense labels for negative sampling.""" - - # if there are dense features - we can use them - for f in features: - if not isinstance(f, tf.SparseTensor): - seq_ids = tf.stop_gradient(f) - # add a zero to the seq dimension for the sentence features - seq_ids = tf.pad(seq_ids, [[0, 0], [0, 1], [0, 0]]) - return seq_ids - - # use additional sparse to dense layer - for f in features: - if isinstance(f, tf.SparseTensor): - seq_ids = tf.stop_gradient( - self._tf_layers[f"sparse_to_dense_ids.{name}"](f) - ) - # add a zero to the seq dimension for the sentence features - seq_ids = tf.pad(seq_ids, [[0, 0], [0, 1], [0, 0]]) - return seq_ids - - return None - - def _combine_sequence_sentence_features( - self, - sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], - sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], - mask_sequence: tf.Tensor, - mask_text: tf.Tensor, - name: Text, - sparse_dropout: bool = False, - dense_dropout: bool = False, - ) -> tf.Tensor: - sequence_x = self._combine_sparse_dense_features( - sequence_features, - f"{name}_{SEQUENCE}", - mask_sequence, - sparse_dropout, - dense_dropout, - ) - sentence_x = self._combine_sparse_dense_features( - sentence_features, f"{name}_{SENTENCE}", None, sparse_dropout, dense_dropout - ) - - if sequence_x is not None and sentence_x is None: - return sequence_x - - if sequence_x is None and sentence_x is not None: - return sentence_x - - if sequence_x is not None and sentence_x is not None: - return self._concat_sequence_sentence_features( - sequence_x, sentence_x, name, mask_text - ) - - raise ValueError( - "No features are present. Please check your configuration file." 
- ) - - def _concat_sequence_sentence_features( - self, - sequence_x: tf.Tensor, - sentence_x: tf.Tensor, - name: Text, - mask_text: tf.Tensor, - ): - if sequence_x.shape[-1] != sentence_x.shape[-1]: - sequence_x = self._tf_layers[f"concat_layer.{name}_{SEQUENCE}"]( - sequence_x, self._training - ) - sentence_x = self._tf_layers[f"concat_layer.{name}_{SENTENCE}"]( - sentence_x, self._training - ) - - # we need to concatenate the sequence features with the sentence features - # we cannot use tf.concat as the sequence features are padded - - # (1) get position of sentence features in mask - last = mask_text * tf.math.cumprod( - 1 - mask_text, axis=1, exclusive=True, reverse=True - ) - # (2) multiply by sentence features so that we get a matrix of - # batch-dim x seq-dim x feature-dim with zeros everywhere except for - # for the sentence features - sentence_x = last * sentence_x - - # (3) add a zero to the end of sequence matrix to match the final shape - sequence_x = tf.pad(sequence_x, [[0, 0], [0, 1], [0, 0]]) - - # (4) sum up sequence features and sentence features - return sequence_x + sentence_x - def _create_bow( self, sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], @@ -1473,52 +1337,6 @@ def _create_bow( x = tf.reduce_sum(x, axis=1) # convert to bag-of-words return self._tf_layers[f"ffnn.{name}"](x, self._training) - def _create_sequence( - self, - sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], - sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], - mask_sequence: tf.Tensor, - mask: tf.Tensor, - name: Text, - sparse_dropout: bool = False, - dense_dropout: bool = False, - masked_lm_loss: bool = False, - sequence_ids: bool = False, - ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: - if sequence_ids: - seq_ids = self._features_as_seq_ids(sequence_features, f"{name}_{SEQUENCE}") - else: - seq_ids = None - - inputs = self._combine_sequence_sentence_features( - sequence_features, - sentence_features, - mask_sequence, - mask, - name, - sparse_dropout, - dense_dropout, - ) - inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training) - - if masked_lm_loss: - transformer_inputs, lm_mask_bool = self._tf_layers[f"{name}_input_mask"]( - inputs, mask, self._training - ) - else: - transformer_inputs = inputs - lm_mask_bool = None - - outputs = self._tf_layers[f"transformer.{name}"]( - transformer_inputs, 1 - mask, self._training - ) - - if self.config[NUM_TRANSFORMER_LAYERS] > 0: - # apply activation - outputs = tfa.activations.gelu(outputs) - - return outputs, inputs, seq_ids, lm_mask_bool - def _create_all_labels(self) -> Tuple[tf.Tensor, tf.Tensor]: all_label_ids = self.tf_label_data[LABEL_KEY][LABEL_SUB_KEY][0] @@ -1577,62 +1395,20 @@ def _calculate_label_loss( text_embed, label_embed, label_ids, all_labels_embed, all_label_ids ) - def _calculate_entity_loss( - self, - inputs: tf.Tensor, - tag_ids: tf.Tensor, - mask: tf.Tensor, - sequence_lengths: tf.Tensor, - tag_name: Text, - entity_tags: Optional[tf.Tensor] = None, - ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: - - tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32) - - if entity_tags is not None: - _tags = self._tf_layers[f"embed.{tag_name}.tags"](entity_tags) - inputs = tf.concat([inputs, _tags], axis=-1) - - logits = self._tf_layers[f"embed.{tag_name}.logits"](inputs) - - # should call first to build weights - pred_ids, _ = self._tf_layers[f"crf.{tag_name}"](logits, sequence_lengths) - loss = self._tf_layers[f"crf.{tag_name}"].loss( - logits, tag_ids, sequence_lengths - ) - f1 = 
self._tf_layers[f"crf.{tag_name}"].f1_score(tag_ids, pred_ids, mask) - - return loss, f1, logits - - @staticmethod - def _get_sequence_lengths( - tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], - key: Text, - sub_key: Text, - batch_dim: int = 1, - ) -> tf.Tensor: - # sentence features have a sequence lengths of 1 - # if sequence features are present we add the sequence lengths of those - - sequence_lengths = tf.ones([batch_dim], dtype=tf.int32) - if key in tf_batch_data and sub_key in tf_batch_data[key]: - sequence_lengths += tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32) - - return sequence_lengths - - @staticmethod - def _get_batch_dim(tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]]) -> int: - if TEXT in tf_batch_data and SEQUENCE in tf_batch_data[TEXT]: - return tf.shape(tf_batch_data[TEXT][SEQUENCE][0])[0] - - return tf.shape(tf_batch_data[TEXT][SENTENCE][0])[0] - def batch_loss( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> tf.Tensor: + """Calculates the loss for the given batch. + + Args: + batch_in: The batch. + + Returns: + The loss of the given batch. + """ tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) - batch_dim = self._get_batch_dim(tf_batch_data) + batch_dim = self._get_batch_dim(tf_batch_data[TEXT]) mask_sequence_text = self._get_mask_for(tf_batch_data, TEXT, SEQUENCE_LENGTH) sequence_lengths = self._get_sequence_lengths( tf_batch_data, TEXT, SEQUENCE_LENGTH, batch_dim @@ -1766,9 +1542,22 @@ def _update_entity_metrics(self, loss: tf.Tensor, f1: tf.Tensor, tag_name: Text) self.entity_role_loss.update_state(loss) self.entity_role_f1.update_state(f1) + def prepare_for_predict(self) -> None: + """Prepares the model for prediction.""" + if self.config[INTENT_CLASSIFICATION]: + _, self.all_labels_embed = self._create_all_labels() + def batch_predict( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> Dict[Text, tf.Tensor]: + """Predicts the output of the given batch. + + Args: + batch_in: The batch. + + Returns: + The output to predict. + """ tf_batch_data = self.batch_to_model_data_format( batch_in, self.predict_data_signature ) @@ -1843,7 +1632,10 @@ def _batch_predict_intents( ) -> Dict[Text, tf.Tensor]: if self.all_labels_embed is None: - _, self.all_labels_embed = self._create_all_labels() + raise ValueError( + "The model was not prepared for prediction. " + "Call `prepare_for_predict` first." + ) # get sentence feature vector for intent classification sentence_vector = self._last_token(text_transformed, sequence_lengths) diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py index 3bea874e0914..73a31751ea14 100644 --- a/rasa/nlu/components.py +++ b/rasa/nlu/components.py @@ -4,6 +4,7 @@ import typing from typing import Any, Dict, Hashable, List, Optional, Set, Text, Tuple, Type, Iterable +import rasa.utils.train_utils from rasa.exceptions import MissingDependencyException from rasa.shared.exceptions import RasaException from rasa.shared.nlu.constants import TRAINABLE_EXTRACTORS @@ -442,7 +443,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: # this is important for e.g. 
persistence component_config["name"] = self.name - self.component_config = rasa.nlu.config.override_defaults( + self.component_config = rasa.utils.train_utils.override_defaults( self.defaults, component_config ) diff --git a/rasa/nlu/config.py b/rasa/nlu/config.py index 00678ced09e7..afdb39af1e17 100644 --- a/rasa/nlu/config.py +++ b/rasa/nlu/config.py @@ -1,18 +1,16 @@ -import copy import logging import os -import ruamel.yaml as yaml from typing import Any, Dict, List, Optional, Text, Union -from rasa.shared.exceptions import InvalidConfigException, RasaException +from rasa.shared.exceptions import InvalidConfigException import rasa.shared.utils.io import rasa.utils.io from rasa.shared.constants import ( DOCS_URL_PIPELINE, - DOCS_URL_MIGRATION_GUIDE, DEFAULT_CONFIG_PATH, ) from rasa.shared.utils.io import json_to_string +import rasa.utils.train_utils logger = logging.getLogger(__name__) @@ -53,32 +51,24 @@ def _load_from_dict(config: Dict, **kwargs: Any) -> "RasaNLUModelConfig": return RasaNLUModelConfig(config) -def override_defaults( - defaults: Optional[Dict[Text, Any]], custom: Optional[Dict[Text, Any]] -) -> Dict[Text, Any]: - if defaults: - cfg = copy.deepcopy(defaults) - else: - cfg = {} - - if custom: - for key in custom.keys(): - if isinstance(cfg.get(key), dict): - cfg[key].update(custom[key]) - else: - cfg[key] = custom[key] - - return cfg - - def component_config_from_pipeline( index: int, pipeline: List[Dict[Text, Any]], defaults: Optional[Dict[Text, Any]] = None, ) -> Dict[Text, Any]: + """Get config of the component with the given index in the pipeline. + + Args: + index: index of the component in the pipeline + pipeline: a list of component configs in the NLU pipeline + defaults: default config of the component + + Returns: + config of the component + """ try: c = pipeline[index] - return override_defaults(defaults, c) + return rasa.utils.train_utils.override_defaults(defaults, c) except IndexError: rasa.shared.utils.io.raise_warning( f"Tried to get configuration value for component " @@ -86,13 +76,17 @@ def component_config_from_pipeline( f"Returning `defaults`.", docs=DOCS_URL_PIPELINE, ) - return override_defaults(defaults, {}) + return rasa.utils.train_utils.override_defaults(defaults, {}) class RasaNLUModelConfig: + """A class that stores NLU model configuration parameters.""" + def __init__(self, configuration_values: Optional[Dict[Text, Any]] = None) -> None: - """Create a model configuration, optionally overriding - defaults with a dictionary ``configuration_values``. + """Create a model configuration. + + Args: + configuration_values: optional dictionary to override defaults.
""" if not configuration_values: configuration_values = {} diff --git a/rasa/nlu/extractors/extractor.py b/rasa/nlu/extractors/extractor.py index 18ad420bb350..4d79b89ab622 100644 --- a/rasa/nlu/extractors/extractor.py +++ b/rasa/nlu/extractors/extractor.py @@ -128,8 +128,8 @@ def filter_trainable_entities( return filtered + @staticmethod def convert_predictions_into_entities( - self, text: Text, tokens: List[Token], tags: Dict[Text, List[Text]], @@ -158,16 +158,22 @@ def convert_predictions_into_entities( last_token_end = -1 for idx, token in enumerate(tokens): - current_entity_tag = self.get_tag_for(tags, ENTITY_ATTRIBUTE_TYPE, idx) + current_entity_tag = EntityExtractor.get_tag_for( + tags, ENTITY_ATTRIBUTE_TYPE, idx + ) if current_entity_tag == NO_ENTITY_TAG: last_entity_tag = NO_ENTITY_TAG last_token_end = token.end continue - current_group_tag = self.get_tag_for(tags, ENTITY_ATTRIBUTE_GROUP, idx) + current_group_tag = EntityExtractor.get_tag_for( + tags, ENTITY_ATTRIBUTE_GROUP, idx + ) current_group_tag = bilou_utils.tag_without_prefix(current_group_tag) - current_role_tag = self.get_tag_for(tags, ENTITY_ATTRIBUTE_ROLE, idx) + current_role_tag = EntityExtractor.get_tag_for( + tags, ENTITY_ATTRIBUTE_ROLE, idx + ) current_role_tag = bilou_utils.tag_without_prefix(current_role_tag) group_or_role_changed = ( @@ -207,7 +213,7 @@ def convert_predictions_into_entities( if new_tag_found: # new entity found - entity = self._create_new_entity( + entity = EntityExtractor._create_new_entity( list(tags.keys()), current_entity_tag, current_group_tag, @@ -217,7 +223,7 @@ def convert_predictions_into_entities( confidences, ) entities.append(entity) - elif self._check_is_single_entity( + elif EntityExtractor._check_is_single_entity( text, token, last_token_end, split_entities_config, current_entity_tag ): # current token has the same entity tag as the token before and @@ -226,14 +232,16 @@ def convert_predictions_into_entities( # and a whitespace. entities[-1][ENTITY_ATTRIBUTE_END] = token.end if confidences is not None: - self._update_confidence_values(entities, confidences, idx) + EntityExtractor._update_confidence_values( + entities, confidences, idx + ) else: # the token has the same entity tag as the token before but the two # tokens are separated by at least 2 symbols (e.g. multiple spaces, # a comma and a space, etc.) 
and also shouldn't be represented as a # single entity - entity = self._create_new_entity( + entity = EntityExtractor._create_new_entity( list(tags.keys()), current_entity_tag, current_group_tag, diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index b12173ec550e..c06c6449d327 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -262,7 +262,9 @@ def _features_for_patterns( sentence_features = np.zeros([1, max_number_patterns]) for pattern_index, pattern in enumerate(self.known_patterns): - matches = re.finditer(pattern["pattern"], message.get(TEXT), flags=flags) + matches = re.finditer( + pattern["pattern"], message.get(attribute), flags=flags + ) matches = list(matches) for token_index, t in enumerate(tokens): @@ -273,7 +275,7 @@ def _features_for_patterns( if t.start < match.end() and t.end > match.start(): patterns[pattern["name"]] = True sequence_features[token_index][pattern_index] = 1.0 - if attribute in [RESPONSE, TEXT]: + if attribute in [RESPONSE, TEXT, ACTION_TEXT]: # sentence vector should contain all patterns sentence_features[0][pattern_index] = 1.0 diff --git a/rasa/nlu/selectors/response_selector.py b/rasa/nlu/selectors/response_selector.py index f468449b658b..ac78b6d3964a 100644 --- a/rasa/nlu/selectors/response_selector.py +++ b/rasa/nlu/selectors/response_selector.py @@ -641,7 +641,7 @@ def batch_loss( ) -> tf.Tensor: tf_batch_data = self.batch_to_model_data_format(batch_in, self.data_signature) - batch_dim = self._get_batch_dim(tf_batch_data) + batch_dim = self._get_batch_dim(tf_batch_data[TEXT]) sequence_mask_text = super()._get_mask_for(tf_batch_data, TEXT, SEQUENCE_LENGTH) sequence_lengths_text = self._get_sequence_lengths( tf_batch_data, TEXT, SEQUENCE_LENGTH, batch_dim diff --git a/rasa/nlu/utils/bilou_utils.py b/rasa/nlu/utils/bilou_utils.py index de29ae67cfb6..8950c862775b 100644 --- a/rasa/nlu/utils/bilou_utils.py +++ b/rasa/nlu/utils/bilou_utils.py @@ -1,10 +1,7 @@ import logging from collections import defaultdict, Counter -from typing import List, Tuple, Text, Optional, Dict, Any +from typing import List, Tuple, Text, Optional, Dict, Any, TYPE_CHECKING -from rasa.nlu.tokenizers.tokenizer import Token -from rasa.shared.nlu.training_data.training_data import TrainingData -from rasa.shared.nlu.training_data.message import Message from rasa.nlu.constants import ( TOKENS_NAMES, BILOU_ENTITIES, @@ -22,6 +19,11 @@ NO_ENTITY_TAG, ) +if TYPE_CHECKING: + from rasa.nlu.tokenizers.tokenizer import Token + from rasa.shared.nlu.training_data.training_data import TrainingData + from rasa.shared.nlu.training_data.message import Message + logger = logging.getLogger(__name__) BEGINNING = "B-" @@ -58,7 +60,7 @@ def tag_without_prefix(tag: Text) -> Text: def bilou_tags_to_ids( - message: Message, + message: "Message", tag_id_dict: Dict[Text, int], tag_name: Text = ENTITY_ATTRIBUTE_TYPE, ) -> List[int]: @@ -115,7 +117,7 @@ def remove_bilou_prefixes(tags: List[Text]) -> List[Text]: def build_tag_id_dict( - training_data: TrainingData, tag_name: Text = ENTITY_ATTRIBUTE_TYPE + training_data: "TrainingData", tag_name: Text = ENTITY_ATTRIBUTE_TYPE ) -> Optional[Dict[Text, int]]: """Create a mapping of unique tags to ids. 
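The `bilou_utils` hunks above move imports that are only needed for type annotations behind `typing.TYPE_CHECKING` and quote the annotations, which avoids circular imports at runtime while keeping the hints checkable. A minimal sketch of that pattern, where the module path and the `Token.text` attribute are illustrative assumptions rather than part of the diff:

```python
from typing import TYPE_CHECKING, List

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime,
    # so this import cannot participate in an import cycle.
    from mypackage.tokenizer import Token  # hypothetical module


def token_texts(tokens: List["Token"]) -> List[str]:
    # The quoted annotation is resolved lazily, so `Token` does not need
    # to be importable when this module is loaded.
    return [token.text for token in tokens]
```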
@@ -151,7 +153,7 @@ def build_tag_id_dict( return tag_id_dict -def apply_bilou_schema(training_data: TrainingData) -> None: +def apply_bilou_schema(training_data: "TrainingData") -> None: """Get a list of BILOU entity tags and set them on the given messages. Args: @@ -176,7 +178,7 @@ def apply_bilou_schema(training_data: TrainingData) -> None: def map_message_entities( - message: Message, attribute_key: Text = ENTITY_ATTRIBUTE_TYPE + message: "Message", attribute_key: Text = ENTITY_ATTRIBUTE_TYPE ) -> List[Tuple[int, int, Text]]: """Maps the entities of the given message to their start, end, and tag values. @@ -203,7 +205,7 @@ def convert_entity(entity: Dict[Text, Any]) -> Tuple[int, int, Text]: def bilou_tags_from_offsets( - tokens: List[Token], entities: List[Tuple[int, int, Text]] + tokens: List["Token"], entities: List[Tuple[int, int, Text]] ) -> List[Text]: """Creates BILOU tags for the given tokens and entities. diff --git a/rasa/nlu/utils/mitie_utils.py b/rasa/nlu/utils/mitie_utils.py index 91d37cc392d7..4631e30ee641 100644 --- a/rasa/nlu/utils/mitie_utils.py +++ b/rasa/nlu/utils/mitie_utils.py @@ -3,7 +3,8 @@ from typing import Any, Dict, List, Optional, Text from rasa.nlu.components import Component -from rasa.nlu.config import RasaNLUModelConfig, override_defaults +from rasa.nlu.config import RasaNLUModelConfig +import rasa.utils.train_utils from rasa.nlu.model import Metadata if typing.TYPE_CHECKING: @@ -37,7 +38,9 @@ def create( ) -> "MitieNLP": import mitie - component_config = override_defaults(cls.defaults, component_config) + component_config = rasa.utils.train_utils.override_defaults( + cls.defaults, component_config + ) model_file = component_config.get("model") if not model_file: diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py index 3186da23a243..4392b4a96921 100644 --- a/rasa/nlu/utils/spacy_utils.py +++ b/rasa/nlu/utils/spacy_utils.py @@ -3,7 +3,8 @@ from typing import Any, Dict, List, Optional, Text, Tuple from rasa.nlu.components import Component -from rasa.nlu.config import RasaNLUModelConfig, override_defaults +from rasa.nlu.config import RasaNLUModelConfig +import rasa.utils.train_utils from rasa.shared.nlu.training_data.training_data import TrainingData from rasa.shared.nlu.training_data.message import Message from rasa.nlu.model import InvalidModelError @@ -64,7 +65,9 @@ def create( cls, component_config: Dict[Text, Any], config: RasaNLUModelConfig ) -> "SpacyNLP": - component_config = override_defaults(cls.defaults, component_config) + component_config = rasa.utils.train_utils.override_defaults( + cls.defaults, component_config + ) spacy_model_name = component_config.get("model") diff --git a/rasa/shared/core/constants.py b/rasa/shared/core/constants.py index 224bcbf78241..35dd052770ff 100644 --- a/rasa/shared/core/constants.py +++ b/rasa/shared/core/constants.py @@ -77,4 +77,5 @@ USER = "user" SLOTS = "slots" +USE_TEXT_FOR_FEATURIZATION = "use_text_for_featurization" ENTITY_LABEL_SEPARATOR = "#" diff --git a/rasa/shared/core/domain.py b/rasa/shared/core/domain.py index d651e55ca2d4..afffdf79fed1 100644 --- a/rasa/shared/core/domain.py +++ b/rasa/shared/core/domain.py @@ -505,10 +505,25 @@ def __init__( store_entities_as_slots: bool = True, session_config: SessionConfig = SessionConfig.default(), ) -> None: + """Creates a `Domain`. + + Args: + intents: Intent labels. + entities: The names of entities which might be present in user messages. + slots: Slots to store information during the conversation. + templates: Bot responses. 
If an action with the same name is executed, it + will send the matching response to the user. + action_names: Names of custom actions. + forms: Form names and their slot mappings. + action_texts: End-to-End bot utterances from end-to-end stories. + store_entities_as_slots: If `True` Rasa will automatically create `SlotSet` + events for entities if there are slots with the same name as the entity. + session_config: Configuration for conversation sessions. Conversations are + restarted at the end of a session. + """ self.entities, self.roles, self.groups = self.collect_entity_properties( entities ) - self.intent_properties = self.collect_intent_properties( intents, self.entities, self.roles, self.groups ) @@ -530,8 +545,9 @@ def __init__( # only includes custom actions and utterance actions self.user_actions = self._combine_with_templates(action_names, templates) - # includes all actions (custom, utterance, default actions and forms) - self.action_names = ( + # includes all action names (custom, utterance, default actions and forms) + # and action texts from end-to-end bot utterances + self.action_names_or_texts = ( self._combine_user_with_default_actions(self.user_actions) + [ form_name @@ -645,7 +661,7 @@ def fingerprint(self) -> Text: ] = rasa.shared.utils.common.sort_list_of_dicts_by_first_key( self_as_dict[KEY_INTENTS] ) - self_as_dict[KEY_ACTIONS] = self.action_names + self_as_dict[KEY_ACTIONS] = self.action_names_or_texts return rasa.shared.utils.io.get_dictionary_fingerprint(self_as_dict) @rasa.shared.utils.common.lazy_property @@ -655,11 +671,25 @@ def user_actions_and_forms(self): return self.user_actions + self.form_names @rasa.shared.utils.common.lazy_property - def num_actions(self): - """Returns the number of available actions.""" + def action_names(self) -> List[Text]: + """Returns action names or texts.""" + # Raise `DeprecationWarning` instead of `FutureWarning` as we only want to + # notify developers about the deprecation (e.g. developers who are using the + # Python API or writing custom policies). End users can't change anything + # about this warning except making their developers change any custom code + # which calls this. 
+ rasa.shared.utils.io.raise_warning( + f"{Domain.__name__}.{Domain.action_names.__name__} " + f"is deprecated and will be removed in version 3.0.0.", + category=DeprecationWarning, + ) + return self.action_names_or_texts + @rasa.shared.utils.common.lazy_property + def num_actions(self) -> int: + """Returns the number of available actions.""" # noinspection PyTypeChecker - return len(self.action_names) + return len(self.action_names_or_texts) @rasa.shared.utils.common.lazy_property def num_states(self): @@ -749,7 +779,7 @@ def _add_knowledge_base_slots(self) -> None: """ if ( rasa.shared.core.constants.DEFAULT_KNOWLEDGE_BASE_ACTION - in self.action_names + in self.action_names_or_texts ): logger.warning( "You are using an experimental feature: Action '{}'!".format( @@ -779,22 +809,50 @@ def add_knowledge_base_slots(self) -> None: def index_for_action(self, action_name: Text) -> Optional[int]: """Looks up which action index corresponds to this action name.""" try: - return self.action_names.index(action_name) + return self.action_names_or_texts.index(action_name) except ValueError: self.raise_action_not_found_exception(action_name) - def raise_action_not_found_exception(self, action_name) -> NoReturn: - action_names = "\n".join([f"\t - {a}" for a in self.action_names]) + def raise_action_not_found_exception(self, action_name_or_text: Text) -> NoReturn: + """Raises exception if action name or text is not part of the domain or stories. + + Args: + action_name_or_text: Name of an action or its text in case it's an + end-to-end bot utterance. + + Raises: + ActionNotFoundException: If `action_name_or_text` is not part of this + domain. + """ + action_names = "\n".join([f"\t - {a}" for a in self.action_names_or_texts]) raise ActionNotFoundException( - f"Cannot access action '{action_name}', " + f"Cannot access action '{action_name_or_text}', " f"as that name is not a registered " f"action for this domain. " f"Available actions are: \n{action_names}" ) def random_template_for(self, utter_action: Text) -> Optional[Dict[Text, Any]]: + """Returns a random response for an action name. + + Args: + utter_action: The name of the utter action. + + Returns: + A response for an utter action. + """ import numpy as np + # Raise `DeprecationWarning` instead of `FutureWarning` as we only want to + # notify developers about the deprecation (e.g. developers who are using the + # Python API or writing custom policies). End users can't change anything + # about this warning except making their developers change any custom code + # which calls this.
+ rasa.shared.utils.io.raise_warning( + f"'{Domain.__name__}.{Domain.random_template_for.__name__}' " + f"is deprecated and will be removed in version 3.0.0.", + category=DeprecationWarning, + ) if utter_action in self.templates: return np.random.choice(self.templates[utter_action]) else: @@ -861,12 +919,11 @@ def input_state_map(self) -> Dict[Text, int]: @rasa.shared.utils.common.lazy_property def input_states(self) -> List[Text]: """Returns all available states.""" - return ( self.intents + self.entity_states + self.slot_states - + self.action_names + + self.action_names_or_texts + self.form_names ) @@ -923,10 +980,13 @@ def _get_user_sub_state( # filter entities based on intent config # sub_state will be transformed to frozenset therefore we need to - # convert the list to the tuple + # convert the set to a tuple # sub_state is transformed to frozenset because we will later hash it # for deduplication - entities = tuple(self._get_featurized_entities(latest_message)) + entities = tuple( + self._get_featurized_entities(latest_message) + & set(sub_state.get(rasa.shared.nlu.constants.ENTITIES, ())) + ) if entities: sub_state[rasa.shared.nlu.constants.ENTITIES] = entities else: @@ -1077,7 +1137,7 @@ def _slot_definitions(self) -> Dict[Any, Dict[str, Any]]: return {slot.name: slot.persistence_info() for slot in self._user_slots} def as_dict(self) -> Dict[Text, Any]: - """Returns serialized domain.""" + """Return serialized `Domain`.""" return { "config": {"store_entities_as_slots": self.store_entities_as_slots}, SESSION_CONFIG_KEY: { @@ -1088,7 +1148,7 @@ def as_dict(self) -> Dict[Text, Any]: KEY_ENTITIES: self._transform_entities_for_file(), KEY_SLOTS: self._slot_definitions(), KEY_RESPONSES: self.templates, - KEY_ACTIONS: self._custom_actions, # class names of the actions + KEY_ACTIONS: self._custom_actions, KEY_FORMS: self.forms, KEY_E2E_ACTIONS: self.action_texts, } @@ -1399,13 +1459,12 @@ def get_duplicates(my_items): def check_mappings( intent_properties: Dict[Text, Dict[Text, Union[bool, List]]] ) -> List[Tuple[Text, Text]]: - """Check whether intent-action mappings use proper action names.""" - + """Checks whether intent-action mappings use valid action names or texts.""" incorrect = [] for intent, properties in intent_properties.items(): if "triggers" in properties: triggered_action = properties.get("triggers") - if triggered_action not in self.action_names: + if triggered_action not in self.action_names_or_texts: incorrect.append((intent, str(triggered_action))) return incorrect @@ -1454,7 +1513,7 @@ def get_duplicate_exception_message( ) return message - duplicate_actions = get_duplicates(self.action_names) + duplicate_actions = get_duplicates(self.action_names_or_texts) duplicate_slots = get_duplicates([s.name for s in self.slots]) duplicate_entities = get_duplicates(self.entities) incorrect_mappings = check_mappings(self.intent_properties) @@ -1481,7 +1540,7 @@ def check_missing_templates(self) -> None: utterances = [ a - for a in self.action_names + for a in self.action_names_or_texts if a.startswith(rasa.shared.constants.UTTER_PREFIX) ] diff --git a/rasa/shared/core/events.py b/rasa/shared/core/events.py index 02ac0902efb9..be081e2038ad 100644 --- a/rasa/shared/core/events.py +++ b/rasa/shared/core/events.py @@ -1,6 +1,8 @@ +import abc import json import logging import re +from abc import ABC import jsonpickle import time @@ -12,15 +14,19 @@ import rasa.shared.utils.common from typing import Union +from rasa.shared.constants import DOCS_URL_TRAINING_DATA from
rasa.shared.core.constants import ( LOOP_NAME, EXTERNAL_MESSAGE_PREFIX, ACTION_NAME_SENDER_ID_CONNECTOR_STR, IS_EXTERNAL, + USE_TEXT_FOR_FEATURIZATION, LOOP_INTERRUPTED, ENTITY_LABEL_SEPARATOR, ACTION_SESSION_START_NAME, + ACTION_LISTEN_NAME, ) +from rasa.shared.exceptions import UnsupportedFeatureException from rasa.shared.nlu.constants import ( ENTITY_ATTRIBUTE_TYPE, INTENT, @@ -70,7 +76,7 @@ def deserialise_entities(entities: Union[Text, List[Any]]) -> List[Dict[Text, An return [e for e in entities if isinstance(e, dict)] -def md_format_message( +def format_message( text: Text, intent: Optional[Text], entities: Union[Text, List[Any]] ) -> Text: """Uses NLU parser information to generate a message with inline entity annotations. @@ -188,11 +194,13 @@ def do_events_begin_with_session_start(events: List["Event"]) -> bool: ] -# noinspection PyProtectedMember -class Event: - """Events describe everything that occurs in - a conversation and tell the :class:`rasa.shared.core.trackers.DialogueStateTracker` - how to update its state.""" +class Event(ABC): + """Describes events in a conversation and how they affect the conversation state. + + Immutable representation of everything which happened during a conversation of the + user with the assistant. Tells the `rasa.shared.core.trackers.DialogueStateTracker` + how to update its state as the events occur. + """ type_name = "event" @@ -221,7 +229,14 @@ def __ne__(self, other: Any) -> bool: # True at the same time return not (self == other) + @abc.abstractmethod def as_story_string(self) -> Optional[Text]: + """Returns the event as story string. + + Returns: + textual representation of the event or None. + """ + # Every class should implement this raise NotImplementedError @staticmethod @@ -301,14 +316,52 @@ def resolve_by_type( raise ValueError(f"Unknown event name '{type_name}'.") def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state. + + Args: + tracker: The current conversation state. + """ pass + @abc.abstractmethod + def __eq__(self, other: Any) -> bool: + """Compares object with other object.""" + # Every class should implement this + raise NotImplementedError() + + def __str__(self) -> Text: + """Returns text representation of event.""" + return f"{self.__class__.__name__}()" + + +class AlwaysEqualEventMixin(Event, ABC): + """Class to deduplicate common behavior for events without additional attributes.""" + + def __eq__(self, other: Any) -> bool: + """Compares object with other object.""" + if not isinstance(other, self.__class__): + return NotImplemented + + return True + + +class SkipEventInMDStoryMixin(Event, ABC): + """Skips the visualization of an event in Markdown stories.""" + + def as_story_string(self) -> None: + """Returns the event as story string. + + Returns: + None, as this event should not appear inside the story. + """ + return + -# noinspection PyProtectedMember class UserUttered(Event): """The user has said something to the bot. - As a side effect a new ``Turn`` will be created in the ``Tracker``.""" + As a side effect a new `Turn` will be created in the `Tracker`. + """ type_name = "user" @@ -322,7 +375,23 @@ def __init__( input_channel: Optional[Text] = None, message_id: Optional[Text] = None, metadata: Optional[Dict] = None, + use_text_for_featurization: Optional[bool] = None, ) -> None: + """Creates event for incoming user message. + + Args: + text: Text of user message. + intent: Intent prediction of user message. + entities: Extracted entities.
+ parse_data: Detailed NLU parsing result for message. + timestamp: When the event was created. + metadata: Additional event metadata. + input_channel: Which channel the user used to send message. + message_id: Unique ID for message. + use_text_for_featurization: `True` if the message's text was used to predict + next action. `False` if the message's intent was used. + + """ self.text = text self.intent = intent if intent else {} self.entities = entities if entities else [] @@ -331,10 +400,23 @@ def __init__( super().__init__(timestamp, metadata) + # The featurization is set by the policies during prediction time using a + # `DefinePrevUserUtteredFeaturization` event. + self.use_text_for_featurization = use_text_for_featurization + # define how this user utterance should be featurized + if self.text and not self.intent_name: + # happens during training + self.use_text_for_featurization = True + elif self.intent_name and not self.text: + # happens during training + self.use_text_for_featurization = False + self.parse_data = { - "intent": self.intent, - "entities": self.entities, - "text": text, + INTENT: self.intent, + # Copy entities so that changes to `self.entities` don't affect + # `self.parse_data` and hence don't get persisted + ENTITIES: self.entities.copy(), + TEXT: self.text, "message_id": self.message_id, "metadata": self.metadata, } @@ -353,8 +435,8 @@ def _from_parse_data( ): return UserUttered( text, - parse_data.get("intent"), - parse_data.get("entities", []), + parse_data.get(INTENT), + parse_data.get(ENTITIES, []), parse_data, timestamp, input_channel, @@ -363,30 +445,46 @@ def _from_parse_data( ) def __hash__(self) -> int: - return hash((self.text, self.intent_name, jsonpickle.encode(self.entities))) + """Returns unique hash of object.""" + return hash(json.dumps(self.as_sub_state())) @property def intent_name(self) -> Optional[Text]: + """Returns intent name or `None` if no intent.""" return self.intent.get(INTENT_NAME_KEY) def __eq__(self, other: Any) -> bool: + """Compares object with other object.""" if not isinstance(other, UserUttered): - return False - else: - return ( - self.text, - self.intent_name, - [jsonpickle.encode(ent) for ent in self.entities], - ) == ( - other.text, - other.intent_name, - [jsonpickle.encode(ent) for ent in other.entities], - ) + return NotImplemented + + return ( + self.text, + self.intent_name, + [jsonpickle.encode(ent) for ent in self.entities], + ) == ( + other.text, + other.intent_name, + [jsonpickle.encode(ent) for ent in other.entities], + ) def __str__(self) -> Text: + """Returns text representation of event.""" + entities = "" + if self.entities: + entities = [ + f"{entity[ENTITY_ATTRIBUTE_VALUE]} " + f"(Type: {entity[ENTITY_ATTRIBUTE_TYPE]}, " + f"Role: {entity.get(ENTITY_ATTRIBUTE_ROLE)}, " + f"Group: {entity.get(ENTITY_ATTRIBUTE_GROUP)})" + for entity in self.entities + ] + entities = f", entities: {', '.join(entities)}" + return ( - f"UserUttered(text: {self.text}, intent: {self.intent}, " - f"entities: {self.entities})" + f"UserUttered(text: {self.text}, intent: {self.intent_name}" + f"{entities}" + f", use_text_for_featurization: {self.use_text_for_featurization})" ) @staticmethod @@ -436,11 +534,14 @@ def as_sub_state(self) -> Dict[Text, Union[None, Text, List[Optional[Text]]]]: out = {} # During training we expect either intent_name or text to be set. # During prediction both will be set. 
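+ # Illustration, not part of the original diff: how `use_text_for_featurization`
+ # is inferred from the arguments passed to `UserUttered.__init__` above.
+ #   UserUttered(text="I would like spanish food")    -> True  (end-to-end training example)
+ #   UserUttered(intent={"name": "greet"})            -> False (intent-based training example)
+ #   UserUttered(text="hi", intent={"name": "greet"}) -> None  (prediction time; resolved later
+ #                                                             by `DefinePrevUserUtteredFeaturization`)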
- if self.intent_name: - out[INTENT] = self.intent_name - if self.text: + if self.text and ( + self.use_text_for_featurization or self.use_text_for_featurization is None + ): out[TEXT] = self.text - if entities: + if self.intent_name and not self.use_text_for_featurization: + out[INTENT] = self.intent_name + # don't add entities for e2e utterances + if entities and not self.use_text_for_featurization: out[ENTITIES] = entities return out @@ -461,34 +562,46 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List[Event] except KeyError as e: raise ValueError(f"Failed to parse bot uttered event. {e}") + def _entity_string(self) -> Text: + if self.entities: + return json.dumps( + { + entity[ENTITY_ATTRIBUTE_TYPE]: entity[ENTITY_ATTRIBUTE_VALUE] + for entity in self.entities + }, + ensure_ascii=False, + ) + return "" + def as_story_string(self, e2e: bool = False) -> Text: - # TODO figure out how to print if TED chose to use text, - # during prediction there will be always intent - if self.intent: - if self.entities: - ent_string = json.dumps( - { - entity[ENTITY_ATTRIBUTE_TYPE]: entity[ENTITY_ATTRIBUTE_VALUE] - for entity in self.entities - }, - ensure_ascii=False, - ) - else: - ent_string = "" + """Return event as string for Markdown training format. - parse_string = f"{self.intent.get(INTENT_NAME_KEY, '')}{ent_string}" + Args: + e2e: `True` if the event should be printed in the format for + end-to-end conversation tests. - if e2e: - message = md_format_message( - self.text, self.intent.get(INTENT_NAME_KEY), self.entities - ) - return f"{self.intent.get(INTENT_NAME_KEY)}: {message}" + Returns: + Event as string. + """ + if self.use_text_for_featurization and not e2e: + raise UnsupportedFeatureException( + f"Printing end-to-end user utterances is not supported in the " + f"Markdown training format. Please use the YAML training data format " + f"instead. Please see {DOCS_URL_TRAINING_DATA} for more information." + ) + + if e2e: + text_with_entities = format_message( + self.text or "", self.intent_name, self.entities + ) - return parse_string + intent_prefix = f"{self.intent_name}: " if self.intent_name else "" + return f"{intent_prefix}{text_with_entities}" - return self.text + return f"{self.intent_name or ''}{self._entity_string()}" def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to tracker. See docstring of `Event`.""" tracker.latest_message = self tracker.clear_followup_action() @@ -507,17 +620,164 @@ def create_external( ) -# noinspection PyProtectedMember -class BotUttered(Event): +class DefinePrevUserUtteredFeaturization(SkipEventInMDStoryMixin): + """Stores information on whether the action was predicted based on text or intent.""" + + type_name = "user_featurization" + + def __init__( + self, + use_text_for_featurization: bool, + timestamp: Optional[float] = None, + metadata: Optional[Dict[Text, Any]] = None, + ) -> None: + """Creates event. + + Args: + use_text_for_featurization: `True` if message text was used to predict + action. `False` if intent was used. + timestamp: When the event was created. + metadata: Additional event metadata.
+ """ + super().__init__(timestamp, metadata) + self.use_text_for_featurization = use_text_for_featurization + + def __str__(self) -> Text: + """Returns text representation of event.""" + return f"DefinePrevUserUtteredFeaturization({self.use_text_for_featurization})" + + def __hash__(self) -> int: + """Returns unique hash for event.""" + return hash(self.use_text_for_featurization) + + @classmethod + def _from_parameters( + cls, parameters: Dict[Text, Any] + ) -> "DefinePrevUserUtteredFeaturization": + return DefinePrevUserUtteredFeaturization( + parameters.get(USE_TEXT_FOR_FEATURIZATION), + parameters.get("timestamp"), + parameters.get("metadata"), + ) + + def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" + d = super().as_dict() + d.update({USE_TEXT_FOR_FEATURIZATION: self.use_text_for_featurization}) + return d + + def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state. + + Args: + tracker: The current conversation state. + """ + if tracker.latest_action_name != ACTION_LISTEN_NAME: + # featurization belong only to the last user message + # a user message is always followed by action listen + return + + # update previous user message's featurization based on this event + tracker.latest_message.use_text_for_featurization = ( + self.use_text_for_featurization + ) + + def __eq__(self, other) -> bool: + """Compares object with other object.""" + if not isinstance(other, DefinePrevUserUtteredFeaturization): + return NotImplemented + + return self.use_text_for_featurization == other.use_text_for_featurization + + +class EntitiesAdded(SkipEventInMDStoryMixin): + """Event that is used to add extracted entities to the tracker state.""" + + type_name = "entities" + + def __init__( + self, + entities: List[Dict[Text, Any]], + timestamp: Optional[float] = None, + metadata: Optional[Dict[Text, Any]] = None, + ) -> None: + """Initializes event. + + Args: + entities: Entities extracted from previous user message. This can either + be done by NLU components or end-to-end policy predictions. + timestamp: the timestamp + metadata: some optional metadata + """ + super().__init__(timestamp, metadata) + self.entities = entities + + def __str__(self) -> Text: + """Returns the string representation of the event.""" + entity_str = [e[ENTITY_ATTRIBUTE_TYPE] for e in self.entities] + return f"{self.__class__.__name__}({entity_str})" + + def __hash__(self) -> int: + """Returns the hash value of the event.""" + return hash(json.dumps(self.entities)) + + def __eq__(self, other: Any) -> bool: + """Compares this event with another event.""" + return isinstance(other, EntitiesAdded) + + @classmethod + def _from_parameters(cls, parameters: Dict[Text, Any]) -> "EntitiesAdded": + return EntitiesAdded( + parameters.get(ENTITIES), + parameters.get("timestamp"), + parameters.get("metadata"), + ) + + def as_dict(self) -> Dict[Text, Any]: + """Converts the event into a dict. + + Returns: + A dict that represents this event. + """ + d = super().as_dict() + d.update({ENTITIES: self.entities}) + return d + + def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state. + + Args: + tracker: The current conversation state. 
+ """ + if tracker.latest_action_name != ACTION_LISTEN_NAME: + # entities belong only to the last user message + # a user message always comes after action listen + return + + for entity in self.entities: + if entity not in tracker.latest_message.entities: + tracker.latest_message.entities.append(entity) + + +class BotUttered(SkipEventInMDStoryMixin): """The bot has said something to the user. This class is not used in the story training as it is contained in the - ``ActionExecuted`` class. An entry is made in the ``Tracker``.""" + ``ActionExecuted`` class. An entry is made in the ``Tracker``. + """ type_name = "bot" def __init__(self, text=None, data=None, metadata=None, timestamp=None) -> None: + """Creates event for a bot response. + + Args: + text: Plain text which bot responded with. + data: Additional data for more complex utterances (e.g. buttons). + timestamp: When the event was created. + metadata: Additional event metadata. + """ self.text = text self.data = data or {} super().__init__(timestamp, metadata) @@ -532,34 +792,34 @@ def __members(self): ) def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(self.__members()) def __eq__(self, other) -> bool: + """Compares object with other object.""" if not isinstance(other, BotUttered): - return False - else: - return self.__members() == other.__members() + return NotImplemented + + return self.__members() == other.__members() def __str__(self) -> Text: + """Returns text representation of event.""" return "BotUttered(text: {}, data: {}, metadata: {})".format( self.text, json.dumps(self.data), json.dumps(self.metadata) ) def __repr__(self) -> Text: + """Returns text representation of event for debugging.""" return "BotUttered('{}', {}, {}, {})".format( self.text, json.dumps(self.data), json.dumps(self.metadata), self.timestamp ) def apply_to(self, tracker: "DialogueStateTracker") -> None: - + """Applies event to current conversation state.""" tracker.latest_bot_utterance = self - def as_story_string(self) -> None: - return None - def message(self) -> Dict[Text, Any]: """Return the complete message as a dictionary.""" - m = self.data.copy() m["text"] = self.text m["timestamp"] = self.timestamp @@ -576,9 +836,11 @@ def message(self) -> Dict[Text, Any]: @staticmethod def empty() -> "BotUttered": + """Creates an empty bot utterance.""" return BotUttered() def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() d.update({"text": self.text, "data": self.data, "metadata": self.metadata}) return d @@ -596,15 +858,15 @@ def _from_parameters(cls, parameters) -> "BotUttered": raise ValueError(f"Failed to parse bot uttered event. {e}") -# noinspection PyProtectedMember class SlotSet(Event): - """The user has specified their preference for the value of a ``slot``. + """The user has specified their preference for the value of a `slot`. Every slot has a name and a value. This event can be used to set a value for a slot on a conversation. - As a side effect the ``Tracker``'s slots will be updated so - that ``tracker.slots[key]=value``.""" + As a side effect the `Tracker`'s slots will be updated so + that `tracker.slots[key]=value`. + """ type_name = "slot" @@ -615,23 +877,35 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: + """Creates event to set slot. + + Args: + key: Name of the slot which is set. + value: Value to which slot is set. + timestamp: When the event was created. + metadata: Additional event metadata. 
+ """ self.key = key self.value = value super().__init__(timestamp, metadata) def __str__(self) -> Text: + """Returns text representation of event.""" return f"SlotSet(key: {self.key}, value: {self.value})" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash((self.key, jsonpickle.encode(self.value))) def __eq__(self, other) -> bool: + """Compares object with other object.""" if not isinstance(other, SlotSet): - return False - else: - return (self.key, self.value) == (other.key, other.value) + return NotImplemented + + return (self.key, self.value) == (other.key, other.value) def as_story_string(self) -> Text: + """Returns text representation of event.""" props = json.dumps({self.key: self.value}, ensure_ascii=False) return f"{self.type_name}{props}" @@ -648,6 +922,7 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List[Event] return None def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() d.update({"name": self.key, "value": self.value}) return d @@ -665,29 +940,26 @@ def _from_parameters(cls, parameters) -> "SlotSet": raise ValueError(f"Failed to parse set slot event. {e}") def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" tracker._set_slot(self.key, self.value) -# noinspection PyProtectedMember -class Restarted(Event): +class Restarted(AlwaysEqualEventMixin): """Conversation should start over & history wiped. Instead of deleting all events, this event can be used to reset the trackers state (e.g. ignoring any past user messages & resetting all - the slots).""" + the slots). + """ type_name = "restart" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(32143124312) - def __eq__(self, other) -> bool: - return isinstance(other, Restarted) - - def __str__(self) -> Text: - return "Restarted()" - def as_story_string(self) -> Text: + """Returns text representation of event.""" return self.type_name def apply_to(self, tracker: "DialogueStateTracker") -> None: @@ -696,63 +968,58 @@ def apply_to(self, tracker: "DialogueStateTracker") -> None: tracker.trigger_followup_action(ACTION_SESSION_START_NAME) -# noinspection PyProtectedMember -class UserUtteranceReverted(Event): +class UserUtteranceReverted(AlwaysEqualEventMixin): """Bot reverts everything until before the most recent user message. The bot will revert all events after the latest `UserUttered`, this also means that the last event on the tracker is usually `action_listen` - and the bot is waiting for a new user message.""" + and the bot is waiting for a new user message. + """ type_name = "rewind" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(32143124315) - def __eq__(self, other) -> bool: - return isinstance(other, UserUtteranceReverted) - - def __str__(self) -> Text: - return "UserUtteranceReverted()" - def as_story_string(self) -> Text: + """Returns text representation of event.""" return self.type_name def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" tracker._reset() tracker.replay_events() -# noinspection PyProtectedMember -class AllSlotsReset(Event): +class AllSlotsReset(AlwaysEqualEventMixin): """All Slots are reset to their initial values. If you want to keep the dialogue history and only want to reset the slots, you can use this event to set all the slots to their initial - values.""" + values. 
+ """ type_name = "reset_slots" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(32143124316) - def __eq__(self, other) -> bool: - return isinstance(other, AllSlotsReset) - - def __str__(self) -> Text: - return "AllSlotsReset()" - def as_story_string(self) -> Text: + """Returns text representation of event.""" return self.type_name def apply_to(self, tracker) -> None: + """Applies event to current conversation state.""" tracker._reset_slots() -# noinspection PyProtectedMember class ReminderScheduled(Event): - """Schedules the asynchronous triggering of a user intent - (with entities if needed) at a given time.""" + """Schedules the asynchronous triggering of a user intent at a given time. + + The triggered intent can include entities if needed. + """ type_name = "reminder" @@ -766,7 +1033,7 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: - """Creates the reminder + """Creates the reminder. Args: intent: Name of the intent to be triggered. @@ -789,6 +1056,7 @@ def __init__( super().__init__(timestamp, metadata) def __hash__(self) -> int: + """Returns unique hash for event.""" return hash( ( self.intent, @@ -799,13 +1067,15 @@ def __hash__(self) -> int: ) ) - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: + """Compares object with other object.""" if not isinstance(other, ReminderScheduled): - return False - else: - return self.name == other.name + return NotImplemented + + return self.name == other.name def __str__(self) -> Text: + """Returns text representation of event.""" return ( f"ReminderScheduled(intent: {self.intent}, trigger_date: {self.trigger_date_time}, " f"entities: {self.entities}, name: {self.name})" @@ -828,10 +1098,12 @@ def _properties(self) -> Dict[Text, Any]: } def as_story_string(self) -> Text: + """Returns text representation of event.""" props = json.dumps(self._properties()) return f"{self.type_name}{props}" def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() d.update(self._properties()) return d @@ -854,7 +1126,6 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List[Event] ] -# noinspection PyProtectedMember class ReminderCancelled(Event): """Cancel certain jobs.""" @@ -880,22 +1151,24 @@ def __init__( timestamp: Optional timestamp. metadata: Optional event metadata. 
""" - self.name = name self.intent = intent self.entities = entities super().__init__(timestamp, metadata) def __hash__(self) -> int: + """Returns unique hash for event.""" return hash((self.name, self.intent, str(self.entities))) def __eq__(self, other: Any) -> bool: + """Compares object with other object.""" if not isinstance(other, ReminderCancelled): - return False - else: - return hash(self) == hash(other) + return NotImplemented + + return hash(self) == hash(other) def __str__(self) -> Text: + """Returns text representation of event.""" return f"ReminderCancelled(name: {self.name}, intent: {self.intent}, entities: {self.entities})" def cancels_job_with_name(self, job_name: Text, sender_id: Text) -> bool: @@ -937,6 +1210,7 @@ def _matches_entities_hash(self, entities_hash: Text) -> bool: return str(hash(str(self.entities))) == entities_hash def as_story_string(self) -> Text: + """Returns text representation of event.""" props = json.dumps( {"name": self.name, "intent": self.intent, "entities": self.entities} ) @@ -955,36 +1229,32 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List[Event] ] -# noinspection PyProtectedMember -class ActionReverted(Event): +class ActionReverted(AlwaysEqualEventMixin): """Bot undoes its last action. The bot reverts everything until before the most recent action. This includes the action itself, as well as any events that action created, like set slot events - the bot will now predict a new action using the state before the most recent - action.""" + action. + """ type_name = "undo" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(32143124318) - def __eq__(self, other) -> bool: - return isinstance(other, ActionReverted) - - def __str__(self) -> Text: - return "ActionReverted()" - def as_story_string(self) -> Text: + """Returns text representation of event.""" return self.type_name def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" tracker._reset() tracker.replay_events() -# noinspection PyProtectedMember class StoryExported(Event): """Story should get dumped to a file.""" @@ -996,18 +1266,20 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: + """Creates event about story exporting. + + Args: + path: Path to which story was exported to. + timestamp: When the event was created. + metadata: Additional event metadata. 
+ """ self.path = path super().__init__(timestamp, metadata) def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(32143124319) - def __eq__(self, other) -> bool: - return isinstance(other, StoryExported) - - def __str__(self) -> Text: - return "StoryExported()" - @classmethod def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List[Event]]: return [ @@ -1019,14 +1291,22 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List[Event] ] def as_story_string(self) -> Text: + """Returns text representation of event.""" return self.type_name def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" if self.path: tracker.export_stories_to_file(self.path) + def __eq__(self, other) -> bool: + """Compares object with other object.""" + if not isinstance(other, StoryExported): + return NotImplemented + + return self.path == other.path + -# noinspection PyProtectedMember class FollowupAction(Event): """Enqueue a followup action.""" @@ -1038,22 +1318,33 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: + """Creates an event which forces the model to run a certain action next. + + Args: + name: Name of the action to run. + timestamp: When the event was created. + metadata: Additional event metadata. + """ self.action_name = name super().__init__(timestamp, metadata) def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(self.action_name) def __eq__(self, other) -> bool: + """Compares object with other object.""" if not isinstance(other, FollowupAction): - return False - else: - return self.action_name == other.action_name + return NotImplemented + + return self.action_name == other.action_name def __str__(self) -> Text: + """Returns text representation of event.""" return f"FollowupAction(action: {self.action_name})" def as_story_string(self) -> Text: + """Returns text representation of event.""" props = json.dumps({"name": self.action_name}) return f"{self.type_name}{props}" @@ -1069,65 +1360,60 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List[Event] ] def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() d.update({"name": self.action_name}) return d def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" tracker.trigger_followup_action(self.action_name) -# noinspection PyProtectedMember -class ConversationPaused(Event): +class ConversationPaused(AlwaysEqualEventMixin): """Ignore messages from the user to let a human take over. - As a side effect the ``Tracker``'s ``paused`` attribute will - be set to ``True``.""" + As a side effect the `Tracker`'s `paused` attribute will + be set to `True`. + """ type_name = "pause" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(32143124313) - def __eq__(self, other) -> bool: - return isinstance(other, ConversationPaused) - - def __str__(self) -> Text: - return "ConversationPaused()" - def as_story_string(self) -> Text: - return self.type_name + """Returns text representation of event.""" + return str(self) def apply_to(self, tracker) -> None: + """Applies event to current conversation state.""" tracker._paused = True -# noinspection PyProtectedMember -class ConversationResumed(Event): +class ConversationResumed(AlwaysEqualEventMixin): """Bot takes over conversation. - Inverse of ``PauseConversation``. 
As a side effect the ``Tracker``'s - ``paused`` attribute will be set to ``False``.""" + Inverse of `PauseConversation`. As a side effect the `Tracker`'s + `paused` attribute will be set to `False`. + """ type_name = "resume" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(32143124314) - def __eq__(self, other) -> bool: - return isinstance(other, ConversationResumed) - - def __str__(self) -> Text: - return "ConversationResumed()" - def as_story_string(self) -> Text: + """Returns text representation of event.""" return self.type_name def apply_to(self, tracker) -> None: + """Applies event to current conversation state.""" tracker._paused = False -# noinspection PyProtectedMember class ActionExecuted(Event): """An operation describes an action taken + its result. @@ -1146,6 +1432,18 @@ def __init__( metadata: Optional[Dict] = None, action_text: Optional[Text] = None, ) -> None: + """Creates event for a successful event execution. + + Args: + action_name: Name of the action which was executed. `None` if it was an + end-to-end prediction. + policy: Policy which predicted action. + confidence: Confidence with which policy predicted action. + timestamp: When the event was created. + metadata: Additional event metadata. + action_text: In case it's an end-to-end action prediction, the text which + was predicted. + """ self.action_name = action_name self.policy = policy self.confidence = confidence @@ -1154,25 +1452,40 @@ def __init__( super().__init__(timestamp, metadata) - def __str__(self) -> Text: + def __repr__(self) -> Text: + """Returns event as string for debugging.""" return "ActionExecuted(action: {}, policy: {}, confidence: {})".format( self.action_name, self.policy, self.confidence ) + def __str__(self) -> Text: + """Returns event as human readable string.""" + return self.action_name or self.action_text + def __hash__(self) -> int: - return hash(self.action_name) + """Returns unique hash for event.""" + return hash(self.action_name or self.action_text) - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: + """Checks if object is equal to another.""" if not isinstance(other, ActionExecuted): - return False - else: - equal = self.action_name == other.action_name - if hasattr(self, "action_text") and hasattr(other, "action_text"): - equal = equal and self.action_text == other.action_text + return NotImplemented - return equal + equal = self.action_name == other.action_name + if hasattr(self, "action_text") and hasattr(other, "action_text"): + equal = equal and self.action_text == other.action_text + + return equal def as_story_string(self) -> Text: + """Returns event in Markdown format.""" + if self.action_text: + raise UnsupportedFeatureException( + f"Printing end-to-end bot utterances is not supported in the " + f"Markdown training format. Please use the YAML training data format " + f"instead. Please see {DOCS_URL_TRAINING_DATA} for more information." 
+ ) + return self.action_name @classmethod @@ -1190,6 +1503,7 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> Optional[List[Event] ] def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() policy = None # for backwards compatibility (persisted events) if hasattr(self, "policy"): @@ -1198,14 +1512,24 @@ def as_dict(self) -> Dict[Text, Any]: if hasattr(self, "confidence"): confidence = self.confidence - d.update({"name": self.action_name, "policy": policy, "confidence": confidence}) + d.update( + { + "name": self.action_name, + "policy": policy, + "confidence": confidence, + "action_text": self.action_text, + } + ) return d def as_sub_state(self) -> Dict[Text, Text]: """Turns ActionExecuted into a dictionary containing action name or action text. + One action cannot have both set at the same time + Returns: - a dictionary containing action name or action text with the corresponding key + a dictionary containing action name or action text with the corresponding + key. """ if self.action_name: return {ACTION_NAME: self.action_name} @@ -1213,15 +1537,17 @@ def as_sub_state(self) -> Dict[Text, Text]: return {ACTION_TEXT: self.action_text} def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" tracker.set_latest_action(self.as_sub_state()) tracker.clear_followup_action() -class AgentUttered(Event): +class AgentUttered(SkipEventInMDStoryMixin): """The agent has said something to the user. This class is not used in the story training as it is contained in the - ``ActionExecuted`` class. An entry is made in the ``Tracker``.""" + ``ActionExecuted`` class. An entry is made in the ``Tracker``. + """ type_name = "agent" @@ -1232,43 +1558,37 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: + """See docstring of `BotUttered`.""" self.text = text self.data = data super().__init__(timestamp, metadata) def __hash__(self) -> int: + """Returns unique hash for event.""" return hash((self.text, jsonpickle.encode(self.data))) def __eq__(self, other) -> bool: + """Compares object with other object.""" if not isinstance(other, AgentUttered): - return False - else: - return (self.text, jsonpickle.encode(self.data)) == ( - other.text, - jsonpickle.encode(other.data), - ) + return NotImplemented + + return (self.text, jsonpickle.encode(self.data)) == ( + other.text, + jsonpickle.encode(other.data), + ) def __str__(self) -> Text: + """Returns text representation of event.""" return "AgentUttered(text: {}, data: {})".format( self.text, json.dumps(self.data) ) - def apply_to(self, tracker: "DialogueStateTracker") -> None: - - pass - - def as_story_string(self) -> None: - return None - def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() d.update({"text": self.text, "data": self.data}) return d - @staticmethod - def empty() -> "AgentUttered": - return AgentUttered() - @classmethod def _from_parameters(cls, parameters) -> "AgentUttered": try: @@ -1293,22 +1613,33 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: + """Creates event for active loop. + + Args: + name: Name of activated loop or `None` if current loop is deactivated. + timestamp: When the event was created. + metadata: Additional event metadata. 
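# --- Editor's illustrative sketch (not part of the diff) --------------------
# After this change an `ActionExecuted` event is featurized either by action
# name or by action text, never both, as `as_sub_state` above shows:

from rasa.shared.core.events import ActionExecuted
from rasa.shared.nlu.constants import ACTION_NAME, ACTION_TEXT

by_name = ActionExecuted("utter_greet")
assert by_name.as_sub_state() == {ACTION_NAME: "utter_greet"}

by_text = ActionExecuted(action_name=None, action_text="On it")
assert by_text.as_sub_state() == {ACTION_TEXT: "On it"}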
+ """ self.name = name super().__init__(timestamp, metadata) def __str__(self) -> Text: + """Returns text representation of event.""" return f"Loop({self.name})" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(self.name) def __eq__(self, other) -> bool: + """Compares object with other object.""" if not isinstance(other, ActiveLoop): - return False - else: - return self.name == other.name + return NotImplemented + + return self.name == other.name def as_story_string(self) -> Text: + """Returns text representation of event.""" props = json.dumps({LOOP_NAME: self.name}) return f"{ActiveLoop.type_name}{props}" @@ -1324,11 +1655,13 @@ def _from_story_string(cls, parameters: Dict[Text, Any]) -> List["ActiveLoop"]: ] def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() d.update({LOOP_NAME: self.name}) return d def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" tracker.change_loop_to(self.name) @@ -1342,6 +1675,7 @@ class LegacyForm(ActiveLoop): type_name = "form" def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() # Dump old `Form` events as `ActiveLoop` events instead of keeping the old # event type. @@ -1349,9 +1683,11 @@ def as_dict(self) -> Dict[Text, Any]: return d -class LoopInterrupted(Event): - """Event added by FormPolicy and RulePolicy to notify form action - whether or not to validate the user input.""" +class LoopInterrupted(SkipEventInMDStoryMixin): + """Event added by FormPolicy and RulePolicy. + + Notifies form action whether or not to validate the user input. + """ type_name = "loop_interrupted" @@ -1361,23 +1697,34 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: + """Event to notify that loop was interrupted. + + This e.g. happens when a user is within a form, and is de-railing the + form-filling by asking FAQs. + + Args: + is_interrupted: `True` if the loop execution was interrupted, and ML + policies had to take over the last prediction. + timestamp: When the event was created. + metadata: Additional event metadata. 
+ """ super().__init__(timestamp, metadata) self.is_interrupted = is_interrupted def __str__(self) -> Text: + """Returns text representation of event.""" return f"{LoopInterrupted.__name__}({self.is_interrupted})" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(self.is_interrupted) def __eq__(self, other) -> bool: - return ( - isinstance(other, LoopInterrupted) - and self.is_interrupted == other.is_interrupted - ) + """Compares object with other object.""" + if not isinstance(other, LoopInterrupted): + return NotImplemented - def as_story_string(self) -> None: - return None + return self.is_interrupted == other.is_interrupted @classmethod def _from_parameters(cls, parameters) -> "LoopInterrupted": @@ -1388,11 +1735,13 @@ def _from_parameters(cls, parameters) -> "LoopInterrupted": ) def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() d.update({LOOP_INTERRUPTED: self.is_interrupted}) return d def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" tracker.interrupt_loop(self.is_interrupted) @@ -1412,6 +1761,7 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: + """See parent class docstring.""" # `validate = True` is the same as `interrupted = False` super().__init__(not validate, timestamp, metadata) @@ -1425,6 +1775,7 @@ def _from_parameters(cls, parameters: Dict) -> "LoopInterrupted": ) def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() # Dump old `Form` events as `ActiveLoop` events instead of keeping the old # event type. @@ -1432,8 +1783,8 @@ def as_dict(self) -> Dict[Text, Any]: return d -class ActionExecutionRejected(Event): - """Notify Core that the execution of the action has been rejected""" +class ActionExecutionRejected(SkipEventInMDStoryMixin): + """Notify Core that the execution of the action has been rejected.""" type_name = "action_execution_rejected" @@ -1445,12 +1796,22 @@ def __init__( timestamp: Optional[float] = None, metadata: Optional[Dict[Text, Any]] = None, ) -> None: + """Creates event. + + Args: + action_name: Action which was rejected. + policy: Policy which predicted the rejected action. + confidence: Confidence with which the reject action was predicted. + timestamp: When the event was created. + metadata: Additional event metadata. 
+ """ self.action_name = action_name self.policy = policy self.confidence = confidence super().__init__(timestamp, metadata) def __str__(self) -> Text: + """Returns text representation of event.""" return ( "ActionExecutionRejected(" "action: {}, policy: {}, confidence: {})" @@ -1458,13 +1819,15 @@ def __str__(self) -> Text: ) def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(self.action_name) def __eq__(self, other) -> bool: + """Compares object with other object.""" if not isinstance(other, ActionExecutionRejected): - return False - else: - return self.action_name == other.action_name + return NotImplemented + + return self.action_name == other.action_name @classmethod def _from_parameters(cls, parameters) -> "ActionExecutionRejected": @@ -1476,10 +1839,8 @@ def _from_parameters(cls, parameters) -> "ActionExecutionRejected": parameters.get("metadata"), ) - def as_story_string(self) -> None: - return None - def as_dict(self) -> Dict[Text, Any]: + """Returns serialized event.""" d = super().as_dict() d.update( { @@ -1491,28 +1852,26 @@ def as_dict(self) -> Dict[Text, Any]: return d def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" tracker.reject_action(self.action_name) -class SessionStarted(Event): +class SessionStarted(AlwaysEqualEventMixin): """Mark the beginning of a new conversation session.""" type_name = "session_started" def __hash__(self) -> int: + """Returns unique hash for event.""" return hash(32143124320) - def __eq__(self, other: Any) -> bool: - return isinstance(other, SessionStarted) - - def __str__(self) -> Text: - return "SessionStarted()" - def as_story_string(self) -> None: + """Skips representing event in stories.""" logger.warning( f"'{self.type_name}' events cannot be serialised as story strings." ) def apply_to(self, tracker: "DialogueStateTracker") -> None: + """Applies event to current conversation state.""" # noinspection PyProtectedMember tracker._reset() diff --git a/rasa/shared/core/trackers.py b/rasa/shared/core/trackers.py index bf6aa0eee334..96988d3e7498 100644 --- a/rasa/shared/core/trackers.py +++ b/rasa/shared/core/trackers.py @@ -30,6 +30,7 @@ ENTITY_ATTRIBUTE_ROLE, ACTION_TEXT, ACTION_NAME, + ENTITIES, ) from rasa.shared.core import events from rasa.shared.core.constants import ( @@ -57,6 +58,7 @@ ActiveLoop, SessionStarted, ActionExecutionRejected, + EntitiesAdded, ) from rasa.shared.core.domain import Domain, State from rasa.shared.core.slots import Slot @@ -135,11 +137,26 @@ def from_events( slots: Optional[Iterable[Slot]] = None, max_event_history: Optional[int] = None, sender_source: Optional[Text] = None, - ): + domain: Optional[Domain] = None, + ) -> "DialogueStateTracker": + """Creates tracker from existing events. + + Args: + sender_id: The ID of the conversation. + evts: Existing events which should be applied to the new tracker. + slots: Slots which can be set. + max_event_history: Maximum number of events which should be stored. + sender_source: File source of the messages. + domain: The current model domain. + + Returns: + Instantiated tracker with its state updated according to the given + events. 
+ """ tracker = cls(sender_id, slots, max_event_history, sender_source) for e in evts: - tracker.update(e) + tracker.update(e, domain) return tracker @@ -195,8 +212,7 @@ def __init__( def current_state( self, event_verbosity: EventVerbosity = EventVerbosity.NONE ) -> Dict[Text, Any]: - """Return the current tracker state as an object.""" - + """Returns the current tracker state as an object.""" _events = self._events_for_verbosity(event_verbosity) if _events: _events = [e.as_dict() for e in _events] @@ -207,7 +223,7 @@ def current_state( return { "sender_id": self.sender_id, "slots": self.current_slot_values(), - "latest_message": self.latest_message.parse_data, + "latest_message": self._latest_message_data(), "latest_event_time": latest_event_time, FOLLOWUP_ACTION: self.followup_action, "paused": self.is_paused(), @@ -230,9 +246,25 @@ def _events_for_verbosity( return None + def _latest_message_data(self) -> Dict[Text, Any]: + parse_data_with_nlu_state = self.latest_message.parse_data.copy() + # Combine entities predicted by NLU with entities predicted by policies so that + # users can access them together via `latest_message` (e.g. in custom actions) + parse_data_with_nlu_state["entities"] = self.latest_message.entities + + return parse_data_with_nlu_state + @staticmethod def freeze_current_state(state: State) -> FrozenState: - frozen_state = frozenset( + """Convert State dict into a hashable format FrozenState. + + Args: + state: The state which should be converted + + Return: + hashable form of the state of type `FrozenState` + """ + return frozenset( { key: frozenset(values.items()) if isinstance(values, Dict) @@ -240,7 +272,6 @@ def freeze_current_state(state: State) -> FrozenState: for key, values in state.items() }.items() ) - return frozen_state def past_states(self, domain: Domain) -> List[State]: """Generate the past states of this tracker based on the history. @@ -253,7 +284,7 @@ def past_states(self, domain: Domain) -> List[State]: """ return domain.states_for_tracker_history(self) - def change_loop_to(self, loop_name: Text) -> None: + def change_loop_to(self, loop_name: Optional[Text]) -> None: """Set the currently active loop. Args: @@ -301,8 +332,12 @@ def reject_action(self, action_name: Text) -> None: self.active_loop[LOOP_REJECTED] = True def set_latest_action(self, action: Dict[Text, Text]) -> None: - """Set latest action name - and reset form validation and rejection parameters + """Sets latest action name or text. + + Resets loop validation and rejection parameters. + + Args: + action: Serialized action event. """ self.latest_action = action if self.active_loop_name: @@ -453,8 +488,11 @@ def applied_events(self) -> List[Event]: @staticmethod def _undo_till_previous(event_type: Type[Event], done_events: List[Event]) -> None: - """Removes events from `done_events` until the first occurrence `event_type` - is found which is also removed.""" + """Removes events from `done_events`. + + Removes events from `done_events` until the first occurrence `event_type` + is found which is also removed. + """ # list gets modified - hence we need to copy events! 
for e in reversed(done_events[:]): del done_events[-1] @@ -585,9 +623,17 @@ def update(self, event: Event, domain: Optional[Domain] = None) -> None: self.events.append(event) event.apply_to(self) - if domain and isinstance(event, UserUttered): - # store all entities as slots - for e in domain.slots_for_entities(event.parse_data["entities"]): + if domain and isinstance(event, (UserUttered, EntitiesAdded)): + if isinstance(event, UserUttered): + # Get the entities from `parse_data` instead, as + # `DefinePrevUserUtteredEntities` might have already affected the + # `UserUttered.entities` attribute (this might e.g. happen when the + # `InMemoryTrackerStore` is used). + entities = event.parse_data[ENTITIES] + else: + entities = event.entities + + for e in domain.slots_for_entities(entities): self.update(e) def update_with_events( diff --git a/rasa/shared/core/training_data/story_reader/markdown_story_reader.py b/rasa/shared/core/training_data/story_reader/markdown_story_reader.py index b3bac66b3b3a..9d1b35b4c804 100644 --- a/rasa/shared/core/training_data/story_reader/markdown_story_reader.py +++ b/rasa/shared/core/training_data/story_reader/markdown_story_reader.py @@ -226,13 +226,14 @@ def _add_e2e_messages(self, e2e_messages: List[Text], line_num: int) -> None: parsed_messages.append(parsed) self.current_step_builder.add_user_messages(parsed_messages) - @staticmethod - def parse_e2e_message(line: Text, is_used_for_training: bool = True) -> Message: + def parse_e2e_message( + self, line: Text, is_used_for_training: bool = True + ) -> Message: """Parses an md list item line based on the current section type. Matches expressions of the form `<intent>:<message>`. For the - syntax of `<message>` see the Rasa docs on NLU training data.""" - + syntax of `<message>` see the Rasa docs on NLU training data. 
+ """ # Match three groups: # 1) Potential "form" annotation # 2) The correct intent @@ -254,7 +255,7 @@ def parse_e2e_message(line: Text, is_used_for_training: bool = True) -> Message: intent = match.group(2) message = match.group(4) example = entities_parser.parse_training_example(message, intent) - if not is_used_for_training: + if not is_used_for_training and not self.use_e2e: # In case this is a simple conversion from Markdown we should copy over # the original text and not parse the entities example.data[rasa.shared.nlu.constants.TEXT] = message diff --git a/rasa/shared/core/training_data/story_reader/yaml_story_reader.py b/rasa/shared/core/training_data/story_reader/yaml_story_reader.py index 060ea07c73cb..e1cb446e8971 100644 --- a/rasa/shared/core/training_data/story_reader/yaml_story_reader.py +++ b/rasa/shared/core/training_data/story_reader/yaml_story_reader.py @@ -13,6 +13,7 @@ INTENT_NAME_KEY, PREDICTED_CONFIDENCE_KEY, FULL_RETRIEVAL_INTENT_NAME_KEY, + ACTION_TEXT, ) from rasa.shared.nlu.training_data import entities_parser import rasa.shared.utils.validation @@ -45,6 +46,7 @@ KEY_SLOT_VALUE = "value" KEY_ACTIVE_LOOP = "active_loop" KEY_ACTION = "action" +KEY_BOT_END_TO_END_MESSAGE = "bot" KEY_CHECKPOINT = "checkpoint" KEY_CHECKPOINT_SLOTS = "slot_was_set" KEY_METADATA = "metadata" @@ -279,6 +281,8 @@ def _parse_step(self, step: Union[Text, Dict[Text, Any]]) -> None: self._parse_or_statement(step) elif KEY_ACTION in step.keys(): self._parse_action(step) + elif KEY_BOT_END_TO_END_MESSAGE in step.keys(): + self._parse_bot_message(step) elif KEY_CHECKPOINT in step.keys(): self._parse_checkpoint(step) # This has to be after the checkpoint test as there can be a slot key within @@ -308,12 +312,19 @@ def _get_docs_link(self) -> Text: def _parse_user_utterance(self, step: Dict[Text, Any]) -> None: utterance = self._parse_raw_user_utterance(step) - if utterance: + + if not utterance: + return + + is_end_to_end_utterance = KEY_USER_INTENT not in step + if is_end_to_end_utterance: + utterance.intent = {INTENT_NAME_KEY: None} + else: self._validate_that_utterance_is_in_domain(utterance) - self.current_step_builder.add_user_messages([utterance]) - def _validate_that_utterance_is_in_domain(self, utterance: UserUttered) -> None: + self.current_step_builder.add_user_messages([utterance]) + def _validate_that_utterance_is_in_domain(self, utterance: UserUttered) -> None: intent_name = utterance.intent.get(INTENT_NAME_KEY) # check if this is a retrieval intent @@ -361,7 +372,7 @@ def _user_intent_from_step( ) -> Tuple[Text, Optional[Text]]: user_intent = step.get(KEY_USER_INTENT, "").strip() - if not user_intent: + if not user_intent and KEY_USER_MESSAGE not in step: rasa.shared.utils.io.raise_warning( f"Issue found in '{self.source_name}':\n" f"User utterance cannot be empty. 
" @@ -514,6 +525,10 @@ def _parse_action(self, step: Dict[Text, Any]) -> None: self._add_event(action_name, {}) + def _parse_bot_message(self, step: Dict[Text, Any]) -> None: + bot_message = step.get(KEY_BOT_END_TO_END_MESSAGE, "") + self._add_event("", {ACTION_TEXT: bot_message}) + def _parse_active_loop(self, active_loop_name: Optional[Text]) -> None: self._add_event(ActiveLoop.type_name, {LOOP_NAME: active_loop_name}) diff --git a/rasa/shared/core/training_data/story_writer/yaml_story_writer.py b/rasa/shared/core/training_data/story_writer/yaml_story_writer.py index 9d9aea25ce4f..23d415ef594e 100644 --- a/rasa/shared/core/training_data/story_writer/yaml_story_writer.py +++ b/rasa/shared/core/training_data/story_writer/yaml_story_writer.py @@ -9,6 +9,7 @@ import rasa.shared.utils.io import rasa.shared.core.constants from rasa.shared.constants import LATEST_TRAINING_DATA_FORMAT_VERSION +import rasa.shared.core.events from rasa.shared.core.events import ( UserUttered, ActionExecuted, @@ -30,6 +31,7 @@ KEY_OR, KEY_USER_MESSAGE, KEY_ACTIVE_LOOP, + KEY_BOT_END_TO_END_MESSAGE, KEY_RULES, KEY_RULE_FOR_CONVERSATION_START, KEY_WAIT_FOR_USER_INPUT_AFTER_RULE, @@ -102,6 +104,7 @@ def stories_to_yaml( Args: story_steps: Original story steps to be converted to the YAML. + is_test_story: `True` if the story is an end-to-end conversation test story. """ from rasa.shared.utils.validation import KEY_TRAINING_DATA_FORMAT_VERSION @@ -184,13 +187,6 @@ def stories_contain_loops(stories: List[StoryStep]) -> bool: ] ) - @staticmethod - def _text_is_real_message(user_utterance: UserUttered) -> bool: - return ( - not user_utterance.intent - or user_utterance.text != user_utterance.as_story_string() - ) - @staticmethod def process_user_utterance( user_utterance: UserUttered, is_test_story: bool = False @@ -206,21 +202,29 @@ def process_user_utterance( Dict with a user utterance. """ result = CommentedMap() - result[KEY_USER_INTENT] = user_utterance.intent["name"] + if user_utterance.intent_name and not user_utterance.use_text_for_featurization: + result[KEY_USER_INTENT] = user_utterance.intent_name if hasattr(user_utterance, "inline_comment"): result.yaml_add_eol_comment( user_utterance.inline_comment(), KEY_USER_INTENT ) - if ( - is_test_story - and YAMLStoryWriter._text_is_real_message(user_utterance) - and user_utterance.text + if user_utterance.text and ( + # We only print the utterance text if it was an end-to-end prediction + user_utterance.use_text_for_featurization + # or if we want to print a conversation test story. 
+ or is_test_story ): - result[KEY_USER_MESSAGE] = LiteralScalarString(user_utterance.text) + result[KEY_USER_MESSAGE] = LiteralScalarString( + rasa.shared.core.events.format_message( + user_utterance.text, + user_utterance.intent_name, + user_utterance.entities, + ) + ) - if len(user_utterance.entities): + if len(user_utterance.entities) and not is_test_story: entities = [] for entity in user_utterance.entities: if entity["value"]: @@ -245,10 +249,18 @@ def process_action(action: ActionExecuted) -> Optional[OrderedDict]: return None result = CommentedMap() - result[KEY_ACTION] = action.action_name + if action.action_name: + result[KEY_ACTION] = action.action_name + elif action.action_text: + result[KEY_BOT_END_TO_END_MESSAGE] = action.action_text if hasattr(action, "inline_comment"): - result.yaml_add_eol_comment(action.inline_comment(), KEY_ACTION) + if KEY_ACTION in result: + result.yaml_add_eol_comment(action.inline_comment(), KEY_ACTION) + elif KEY_BOT_END_TO_END_MESSAGE in result: + result.yaml_add_eol_comment( + action.inline_comment(), KEY_BOT_END_TO_END_MESSAGE + ) return result diff --git a/rasa/shared/core/training_data/structures.py b/rasa/shared/core/training_data/structures.py index ed07148ca1a1..8ad0d7b62af1 100644 --- a/rasa/shared/core/training_data/structures.py +++ b/rasa/shared/core/training_data/structures.py @@ -407,7 +407,11 @@ def fingerprint(self) -> Text: Returns: fingerprint of the stories """ - self_as_string = self.as_story_string() + from rasa.shared.core.training_data.story_writer.yaml_story_writer import ( + YAMLStoryWriter, + ) + + self_as_string = YAMLStoryWriter().dumps(self.story_steps) return rasa.shared.utils.io.get_text_hash(self_as_string) def ordered_steps(self) -> List[StoryStep]: diff --git a/rasa/shared/exceptions.py b/rasa/shared/exceptions.py index 156c30313b89..8a2eb43e3c48 100644 --- a/rasa/shared/exceptions.py +++ b/rasa/shared/exceptions.py @@ -73,3 +73,7 @@ class FileIOException(RasaException): class InvalidConfigException(ValueError, RasaException): """Raised if an invalid configuration is encountered.""" + + +class UnsupportedFeatureException(RasaCoreException): + """Raised if a requested feature is not supported.""" diff --git a/rasa/shared/importers/autoconfig.py b/rasa/shared/importers/autoconfig.py index 4070d7042dac..05f2f0194f9f 100644 --- a/rasa/shared/importers/autoconfig.py +++ b/rasa/shared/importers/autoconfig.py @@ -36,7 +36,8 @@ class TrainingType(Enum): def get_configuration( - config_file_path: Text, training_type: Optional[TrainingType] = TrainingType.BOTH + config_file_path: Optional[Text], + training_type: Optional[TrainingType] = TrainingType.BOTH, ) -> Dict[Text, Any]: """Determine configuration from a configuration file. 
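# --- Editor's illustrative sketch (not part of the diff) --------------------
# The story fingerprint now hashes the YAML dump instead of the Markdown story
# string, so end-to-end steps (which raise `UnsupportedFeatureException` in the
# Markdown writer) still fingerprint cleanly. The underlying idea, with
# `hashlib` standing in for `rasa.shared.utils.io.get_text_hash`:

import hashlib

def fingerprint(stories_as_yaml: str) -> str:
    return hashlib.md5(stories_as_yaml.encode("utf-8")).hexdigest()

print(fingerprint("stories:\n- story: greet\n  steps:\n  - intent: greet\n"))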
diff --git a/rasa/shared/importers/importer.py b/rasa/shared/importers/importer.py index 312dfbc1b94b..8300f62c7ef8 100644 --- a/rasa/shared/importers/importer.py +++ b/rasa/shared/importers/importer.py @@ -100,8 +100,7 @@ def load_core_importer_from_config( importer = TrainingDataImporter.load_from_config( config_path, domain_path, training_data_paths, TrainingType.CORE ) - - return CoreDataImporter(importer) + return importer @staticmethod def load_nlu_importer_from_config( @@ -215,34 +214,9 @@ async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData: return await self._importer.get_nlu_data(language) -class CoreDataImporter(TrainingDataImporter): - """Importer that skips any NLU related file reading.""" - - def __init__(self, actual_importer: TrainingDataImporter): - self._importer = actual_importer - - async def get_domain(self) -> Domain: - return await self._importer.get_domain() - - async def get_stories( - self, - template_variables: Optional[Dict] = None, - use_e2e: bool = False, - exclusion_percentage: Optional[int] = None, - ) -> StoryGraph: - return await self._importer.get_stories( - template_variables, use_e2e, exclusion_percentage - ) - - async def get_config(self) -> Dict: - return await self._importer.get_config() - - async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData: - return TrainingData() - - class CombinedDataImporter(TrainingDataImporter): """A `TrainingDataImporter` that combines multiple importers. + Uses multiple `TrainingDataImporter` instances to load the data as if they were a single instance. """ @@ -459,7 +433,13 @@ async def _get_domain_with_e2e_actions(self) -> Domain: additional_e2e_action_names = list(additional_e2e_action_names) return Domain( - [], [], [], {}, action_names=additional_e2e_action_names, forms={} + [], + [], + [], + {}, + action_names=[], + forms={}, + action_texts=additional_e2e_action_names, ) async def get_stories( @@ -469,6 +449,10 @@ async def get_stories( use_e2e: bool = False, exclusion_percentage: Optional[int] = None, ) -> StoryGraph: + """Retrieves the stories that should be used for training. + + See parent class for details. + """ return await self.importer.get_stories( template_variables, use_e2e, exclusion_percentage ) diff --git a/rasa/shared/nlu/constants.py b/rasa/shared/nlu/constants.py index f0d962f9774e..4e66ed7c1737 100644 --- a/rasa/shared/nlu/constants.py +++ b/rasa/shared/nlu/constants.py @@ -28,6 +28,7 @@ TRAINABLE_EXTRACTORS = {"MitieEntityExtractor", "CRFEntityExtractor", "DIETClassifier"} ENTITIES = "entities" +ENTITY_TAGS = "entity_tags" ENTITY_ATTRIBUTE_TYPE = "entity" ENTITY_ATTRIBUTE_GROUP = "group" ENTITY_ATTRIBUTE_ROLE = "role" diff --git a/rasa/shared/nlu/training_data/features.py b/rasa/shared/nlu/training_data/features.py index c556d6e6c3ff..9911425fb3f6 100644 --- a/rasa/shared/nlu/training_data/features.py +++ b/rasa/shared/nlu/training_data/features.py @@ -3,11 +3,9 @@ import numpy as np import scipy.sparse -from rasa.shared.nlu.constants import VALID_FEATURE_TYPES - class Features: - """Stores the features produces by any featurizer.""" + """Stores the features produced by any featurizer.""" def __init__( self, @@ -16,21 +14,19 @@ def __init__( attribute: Text, origin: Union[Text, List[Text]], ) -> None: - self._validate_feature_type(feature_type) + """Initializes the Features object. + Args: + features: The features. + feature_type: Type of the feature, e.g. FEATURE_TYPE_SENTENCE. + attribute: Message attribute, e.g. INTENT or TEXT. 
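# --- Editor's illustrative sketch (not part of the diff) --------------------
# End-to-end bot utterances are now passed via the dedicated `action_texts`
# argument instead of being mixed into `action_names`, mirroring the
# constructor call in `_get_domain_with_e2e_actions` above:

from rasa.shared.core.domain import Domain

e2e_domain = Domain(
    [], [], [], {}, action_names=[], forms={}, action_texts=["On it"]
)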
+ origin: Name of the component that created the features. + """ self.features = features self.type = feature_type self.origin = origin self.attribute = attribute - @staticmethod - def _validate_feature_type(feature_type: Text) -> None: - if feature_type not in VALID_FEATURE_TYPES: - raise ValueError( - f"Invalid feature type '{feature_type}' used. Valid feature types are: " - f"{VALID_FEATURE_TYPES}." - ) - def is_sparse(self) -> bool: """Checks if features are sparse or not. @@ -93,12 +89,23 @@ def __key__( ) -> Tuple[ Text, Text, Union[np.ndarray, scipy.sparse.spmatrix], Union[Text, List[Text]] ]: - return (self.type, self.attribute, self.features, self.origin) + """Returns a 4-tuple of defining properties. - def __hash__(self) -> int: - return hash(self.__key__()) + Returns: + Tuple of type, attribute, features, and origin properties. + """ + return (self.type, self.attribute, self.features, self.origin) def __eq__(self, other: Any) -> bool: + """Tests if the `self` `Feature` equals to the `other`. + + Args: + other: The other object. + + Returns: + `True` when the other object is a `Feature` and has the same + type, attribute, and feature tensors. + """ if not isinstance(other, Features): return False diff --git a/rasa/shared/nlu/training_data/message.py b/rasa/shared/nlu/training_data/message.py index 20d36341d980..5cf5596f8fa3 100644 --- a/rasa/shared/nlu/training_data/message.py +++ b/rasa/shared/nlu/training_data/message.py @@ -115,14 +115,15 @@ def build( example_metadata: Optional[Any] = None, **kwargs: Any, ) -> "Message": - """ - Build a Message from `UserUttered` data. + """Builds a Message from `UserUttered` data. + Args: text: text of a user's utterance intent: an intent of the user utterance entities: entities in the user's utterance intent_metadata: optional metadata for the intent example_metadata: optional metadata for the intent example + Returns: Message """ @@ -183,12 +184,14 @@ def separate_intent_response_key( def get_sparse_features( self, attribute: Text, featurizers: Optional[List[Text]] = None ) -> Tuple[Optional["Features"], Optional["Features"]]: - """Get all sparse features for the given attribute that are coming from the - given list of featurizers. + """Gets all sparse features for the attribute given the list of featurizers. + If no featurizers are provided, all available features will be considered. + Args: attribute: message attribute featurizers: names of featurizers to consider + Returns: Sparse features. """ @@ -207,12 +210,14 @@ def get_sparse_features( def get_dense_features( self, attribute: Text, featurizers: Optional[List[Text]] = None ) -> Tuple[Optional["Features"], Optional["Features"]]: - """Get all dense features for the given attribute that are coming from the given - list of featurizers. + """Gets all dense features for the attribute given the list of featurizers. + If no featurizers are provided, all available features will be considered. + Args: attribute: message attribute featurizers: names of featurizers to consider + Returns: Dense features. """ @@ -228,17 +233,38 @@ def get_dense_features( return sequence_features, sentence_features - def features_present( + def get_all_features( self, attribute: Text, featurizers: Optional[List[Text]] = None - ) -> bool: - """Check if there are any features present for the given attribute and - featurizers. + ) -> List["Features"]: + """Gets all features for the attribute given the list of featurizers. + If no featurizers are provided, all available features will be considered. 
+ Args: attribute: message attribute featurizers: names of featurizers to consider + + Returns: + Features. + """ + sparse_features = self.get_sparse_features(attribute, featurizers) + dense_features = self.get_dense_features(attribute, featurizers) + + return [f for f in sparse_features + dense_features if f is not None] + + def features_present( + self, attribute: Text, featurizers: Optional[List[Text]] = None + ) -> bool: + """Checks if there are any features present for the attribute and featurizers. + + If no featurizers are provided, all available features will be considered. + + Args: + attribute: Message attribute. + featurizers: Names of featurizers to consider. + Returns: - ``True``, if features are present, ``False`` otherwise + ``True``, if features are present, ``False`` otherwise. """ if featurizers is None: featurizers = [] @@ -316,13 +342,14 @@ def _combine_features( return combined_features - def is_core_message(self) -> bool: - """Checks whether the message is a core message or not. + def is_core_or_domain_message(self) -> bool: + """Checks whether the message is a core message or from the domain. - E.g. a core message is created from a story, not from the NLU data. + E.g. a core message is created from a story or a domain action, + not from the NLU data. Returns: - True, if message is a core message, false otherwise. + True, if message is a core or domain message, false otherwise. """ return bool( self.data.get(ACTION_NAME) @@ -336,3 +363,14 @@ def is_core_message(self) -> bool: and not (self.data.get(INTENT) or self.data.get(RESPONSE)) ) ) + + def is_e2e_message(self) -> bool: + """Checks whether the message came from an e2e story. + + Returns: + `True`, if message is a from an e2e story, `False` otherwise. + """ + return bool( + (self.get(ACTION_TEXT) and not self.get(ACTION_NAME)) + or (self.get(TEXT) and not self.get(INTENT)) + ) diff --git a/rasa/shared/nlu/training_data/training_data.py b/rasa/shared/nlu/training_data/training_data.py index 55352a410924..363aafb70910 100644 --- a/rasa/shared/nlu/training_data/training_data.py +++ b/rasa/shared/nlu/training_data/training_data.py @@ -22,7 +22,6 @@ ENTITIES, TEXT, ACTION_NAME, - ACTION_TEXT, ) from rasa.shared.nlu.training_data.message import Message from rasa.shared.nlu.training_data import util @@ -177,18 +176,30 @@ def sanitize_examples(examples: List[Message]) -> List[Message]: @lazy_property def nlu_examples(self) -> List[Message]: - return [ex for ex in self.training_examples if not ex.is_core_message()] + """Return examples which have come from NLU training data. + + E.g. If the example came from a story or domain it is not included. + + Returns: + List of NLU training examples. 
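# --- Editor's illustrative sketch (not part of the diff) --------------------
# `is_e2e_message` flags examples that only exist because of end-to-end
# stories: raw user text without an intent, or bot text without an action
# name. Assuming `Message.build` and the data keys used in this diff:

from rasa.shared.nlu.training_data.message import Message

nlu_example = Message.build(text="hi", intent="greet")
assert not nlu_example.is_e2e_message()  # has an intent -> regular NLU data

e2e_example = Message(data={"text": "I would like spanish food"})
assert e2e_example.is_e2e_message()  # text without intent -> from an e2e story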
+ """ + return [ + ex for ex in self.training_examples if not ex.is_core_or_domain_message() + ] @lazy_property def intent_examples(self) -> List[Message]: + """Returns the list of examples that have intent.""" return [ex for ex in self.nlu_examples if ex.get(INTENT)] @lazy_property def response_examples(self) -> List[Message]: + """Returns the list of examples that have response.""" return [ex for ex in self.nlu_examples if ex.get(INTENT_RESPONSE_KEY)] @lazy_property def entity_examples(self) -> List[Message]: + """Returns the list of examples that have entities.""" return [ex for ex in self.nlu_examples if ex.get(ENTITIES)] @lazy_property @@ -679,7 +690,6 @@ def print_stats(self) -> None: def is_empty(self) -> bool: """Checks if any training data was loaded.""" - lists_to_check = [ self.training_examples, self.entity_synonyms, @@ -688,9 +698,8 @@ def is_empty(self) -> bool: ] return not any([len(lst) > 0 for lst in lists_to_check]) - def can_train_nlu_model(self) -> bool: + def contains_no_pure_nlu_data(self) -> bool: """Checks if any NLU training data was loaded.""" - lists_to_check = [ self.nlu_examples, self.entity_synonyms, @@ -699,6 +708,20 @@ def can_train_nlu_model(self) -> bool: ] return not any([len(lst) > 0 for lst in lists_to_check]) + def has_e2e_examples(self): + """Checks if there are any training examples from e2e stories.""" + return any(message.is_e2e_message() for message in self.training_examples) + def list_to_str(lst: List[Text], delim: Text = ", ", quote: Text = "'") -> Text: + """Converts list to a string. + + Args: + lst: The list to convert. + delim: The delimiter that is used to separate list inputs. + quote: The quote that is used to wrap list inputs. + + Returns: + The string. + """ return delim.join([quote + e + quote for e in lst]) diff --git a/rasa/shared/utils/common.py b/rasa/shared/utils/common.py index 6b861b8c2a4d..21ae589d3053 100644 --- a/rasa/shared/utils/common.py +++ b/rasa/shared/utils/common.py @@ -1,6 +1,7 @@ import asyncio import functools import importlib +import inspect import logging from typing import Text, Dict, Optional, Any, List, Callable, Collection @@ -34,11 +35,12 @@ def class_from_module_path( def all_subclasses(cls: Any) -> List[Any]: """Returns all known (imported) subclasses of a class.""" - - return cls.__subclasses__() + [ + classes = cls.__subclasses__() + [ g for s in cls.__subclasses__() for g in all_subclasses(s) ] + return [subclass for subclass in classes if not inspect.isabstract(subclass)] + def module_path_from_instance(inst: Any) -> Text: """Return the module path of an instance's class.""" diff --git a/rasa/telemetry.py b/rasa/telemetry.py index fd6e410fcb3d..1983c6c957df 100644 --- a/rasa/telemetry.py +++ b/rasa/telemetry.py @@ -694,7 +694,7 @@ async def track_model_training( "policies": config.get("policies"), "num_intent_examples": len(nlu_data.intent_examples), "num_entity_examples": len(nlu_data.entity_examples), - "num_actions": len(domain.action_names), + "num_actions": len(domain.action_names_or_texts), # Old nomenclature from when 'responses' were still called # 'templates' in the domain "num_templates": len(domain.templates), @@ -872,7 +872,7 @@ def track_project_init(path: Text) -> None: path: Location of the project """ _track( - TELEMETRY_PROJECT_CREATED_EVENT, {"init_directory": _hash_directory_path(path)}, + TELEMETRY_PROJECT_CREATED_EVENT, {"init_directory": _hash_directory_path(path)} ) diff --git a/rasa/train.py b/rasa/train.py index ed5d48397316..20131538efd7 100644 --- a/rasa/train.py +++ 
b/rasa/train.py @@ -18,6 +18,7 @@ from rasa import model, telemetry from rasa.model import FingerprintComparisonResult from rasa.shared.core.domain import Domain +import rasa.shared.utils.common from rasa.nlu.model import Interpreter import rasa.utils.common import rasa.shared.utils.common @@ -271,7 +272,7 @@ async def _train_async_internal( old_model = model.get_latest_model(output_path) fingerprint_comparison = model.should_retrain( - new_fingerprint, old_model, train_path, force_training + new_fingerprint, old_model, train_path, force_training=force_training ) if dry_run: @@ -280,7 +281,10 @@ async def _train_async_internal( print_warning(text) if code > 0 else print_success(text) return TrainingResult(code=code) - if stories.is_empty() and nlu_data.can_train_nlu_model(): + if nlu_data.has_e2e_examples(): + rasa.shared.utils.common.mark_as_experimental_feature("end-to-end training") + + if stories.is_empty() and nlu_data.contains_no_pure_nlu_data(): rasa.shared.utils.cli.print_error( "No training data given. Please provide stories and NLU data in " "order to train a Rasa model using the '--data' argument." @@ -302,7 +306,8 @@ async def _train_async_internal( ) return TrainingResult(model=trained_model) - if nlu_data.can_train_nlu_model(): + # We will train nlu if there are any nlu example, including from e2e stories. + if nlu_data.contains_no_pure_nlu_data() and not nlu_data.has_e2e_examples(): rasa.shared.utils.cli.print_warning( "No NLU data present. Just a Rasa Core model will be trained." ) @@ -314,8 +319,22 @@ async def _train_async_internal( model_to_finetune=model_to_finetune, finetuning_epoch_fraction=finetuning_epoch_fraction, ) + return TrainingResult(model=trained_model) + new_fingerprint = await model.model_fingerprint(file_importer) + old_model = model.get_latest_model(output_path) + + if not force_training: + fingerprint_comparison = model.should_retrain( + new_fingerprint, + old_model, + train_path, + has_e2e_examples=nlu_data.has_e2e_examples(), + ) + else: + fingerprint_comparison = FingerprintComparisonResult(force_training=True) + if fingerprint_comparison.is_training_required(): async with telemetry.track_model_training( file_importer, model_type="rasa", @@ -491,7 +510,19 @@ async def train_core_async( file_importer = TrainingDataImporter.load_core_importer_from_config( config, domain, [stories] ) - domain = await file_importer.get_domain() + stories, nlu_data, domain = await asyncio.gather( + file_importer.get_stories(), + file_importer.get_nlu_data(), + file_importer.get_domain(), + ) + + if nlu_data.has_e2e_examples(): + rasa.shared.utils.cli.print_error( + "Stories file contains e2e stories. Please train using `rasa train` so that" + " the NLU model is also trained." + ) + return None + if domain.is_empty(): rasa.shared.utils.cli.print_error( "Core training was skipped because no valid domain file was found. " @@ -500,7 +531,7 @@ async def train_core_async( ) return None - if not await file_importer.get_stories(): + if not stories: rasa.shared.utils.cli.print_error( "No stories given. Please provide stories in order to " "train a Rasa Core model using the '--stories' argument." @@ -707,7 +738,7 @@ async def _train_nlu_async( ) training_data = await file_importer.get_nlu_data() - if training_data.can_train_nlu_model(): + if training_data.contains_no_pure_nlu_data(): rasa.shared.utils.cli.print_error( f"Path '{nlu_data}' doesn't contain valid NLU data in it. " f"Please verify the data format. 
" diff --git a/rasa/utils/schemas/stories.yml b/rasa/utils/schemas/stories.yml index 95544c22c0b3..aafe4c70828b 100644 --- a/rasa/utils/schemas/stories.yml +++ b/rasa/utils/schemas/stories.yml @@ -24,7 +24,6 @@ mapping: mapping: &intent_and_entities intent: type: "str" - required: True allowempty: False user: type: "str" @@ -52,6 +51,11 @@ mapping: action: type: "str" allowempty: False + - type: "map" + mapping: + bot: + type: "str" + allowempty: False - type: "map" mapping: &slot_was_set_seq slot_was_set: &slot_was_set_seq_value diff --git a/rasa/utils/tensorflow/constants.py b/rasa/utils/tensorflow/constants.py index 3c3b2c090888..29c046258dac 100644 --- a/rasa/utils/tensorflow/constants.py +++ b/rasa/utils/tensorflow/constants.py @@ -1,6 +1,7 @@ # constants for configuration parameters of our tensorflow models LABEL = "label" +IDS = "ids" HIDDEN_LAYERS_SIZES = "hidden_layers_sizes" SHARE_HIDDEN_LAYERS = "share_hidden_layers" @@ -67,6 +68,7 @@ BALANCED = "balanced" SEQUENCE = "sequence" +SEQUENCE_LENGTH = f"{SEQUENCE}_lengths" SENTENCE = "sentence" POOLING = "pooling" @@ -81,3 +83,5 @@ FEATURIZERS = "featurizers" CHECKPOINT_MODEL = "checkpoint_model" + +MASK = "mask" diff --git a/rasa/utils/tensorflow/layers.py b/rasa/utils/tensorflow/layers.py index 1d64b1b26cb3..a9017094e945 100644 --- a/rasa/utils/tensorflow/layers.py +++ b/rasa/utils/tensorflow/layers.py @@ -136,7 +136,7 @@ def call(self, inputs: tf.SparseTensor) -> tf.Tensor: if len(inputs.shape) == 3: # reshape back outputs = tf.reshape( - outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1) + outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], self.units) ) if self.use_bias: @@ -630,13 +630,15 @@ def body(idx: tf.Tensor, out: tf.Tensor) -> List[tf.Tensor]: # create first random array of indices out1 = rand_idxs() # (1, num_neg) - return tf.while_loop( - cond, - body, - loop_vars=[idx1, out1], - shape_invariants=[idx1.shape, tf.TensorShape([None, self.num_neg])], - parallel_iterations=self.parallel_iterations, - back_prop=False, + return tf.nest.map_structure( + tf.stop_gradient, + tf.while_loop( + cond, + body, + loop_vars=[idx1, out1], + shape_invariants=[idx1.shape, tf.TensorShape([None, self.num_neg])], + parallel_iterations=self.parallel_iterations, + ), )[1] @staticmethod diff --git a/rasa/utils/tensorflow/model_data.py b/rasa/utils/tensorflow/model_data.py index 2a3cd16811ab..aec21403855e 100644 --- a/rasa/utils/tensorflow/model_data.py +++ b/rasa/utils/tensorflow/model_data.py @@ -15,36 +15,277 @@ Union, Generator, NamedTuple, - ValuesView, ItemsView, ) -from collections import defaultdict +from collections import defaultdict, OrderedDict from rasa.utils.tensorflow.constants import BALANCED, SEQUENCE logger = logging.getLogger(__name__) -# Mapping of attribute name and feature name to a list of numpy arrays representing -# the actual features -# For example: -# "text" -> { "sentence": [ -# "numpy array containing dense features for every training example", -# "numpy array containing sparse features for every training example" -# ]} -Data = Dict[Text, Dict[Text, List[np.ndarray]]] +class FeatureArray(np.ndarray): + """Stores any kind of features ready to be used by a RasaModel. + + Next to the input numpy array of features, it also received the number of dimensions of the features. + As our features can have 1 to 4 dimensions we might have different number of numpy arrays stacked. + The number of dimensions helps us to figure out how to handle this particular feature array. 
+ Also, it is automatically determined whether the feature array is sparse or not and the number of units + is determined as well. + + Subclassing np.array: https://numpy.org/doc/stable/user/basics.subclassing.html + """ + + def __new__( + cls, input_array: np.ndarray, number_of_dimensions: int + ) -> "FeatureArray": + """Create and return a new object. See help(type) for accurate signature.""" + FeatureArray._validate_number_of_dimensions(number_of_dimensions, input_array) + + feature_array = np.asarray(input_array).view(cls) + + if number_of_dimensions <= 2: + feature_array.units = input_array.shape[-1] + feature_array.is_sparse = isinstance(input_array[0], scipy.sparse.spmatrix) + elif number_of_dimensions == 3: + feature_array.units = input_array[0].shape[-1] + feature_array.is_sparse = isinstance(input_array[0], scipy.sparse.spmatrix) + elif number_of_dimensions == 4: + feature_array.units = input_array[0][0].shape[-1] + feature_array.is_sparse = isinstance( + input_array[0][0], scipy.sparse.spmatrix + ) + else: + raise ValueError( + f"Number of dimensions '{number_of_dimensions}' currently not supported." + ) + + feature_array.number_of_dimensions = number_of_dimensions + + return feature_array + + def __init__(self, input_array: Any, number_of_dimensions: int, **kwargs): + """Initializes the FeatureArray. + + Needed in order to avoid 'Invalid keyword argument number_of_dimensions + to function FeatureArray.__init__ ' + Args: + input_array: the array that contains features + number_of_dimensions: number of dimensions in input_array + """ + super().__init__(**kwargs) + self.number_of_dimensions = number_of_dimensions + + def __array_finalize__(self, obj: Any) -> None: + """This method is called whenever the system internally allocates a new array from obj. + + Args: + obj: A subclass (subtype) of ndarray. + """ + if obj is None: + return + + self.units = getattr(obj, "units", None) + self.number_of_dimensions = getattr(obj, "number_of_dimensions", None) + self.is_sparse = getattr(obj, "is_sparse", None) + + default_attributes = { + "units": self.units, + "number_of_dimensions": self.number_of_dimensions, + "is_sparse": self.is_sparse, + } + self.__dict__.update(default_attributes) + + # pytype: disable=attribute-error + def __array_ufunc__(self, ufunc: Any, method: Text, *inputs, **kwargs) -> Any: + """Overwrite this method as we are subclassing numpy array. + + Args: + ufunc: The ufunc object that was called. + method: A string indicating which Ufunc method was called + (one of "__call__", "reduce", "reduceat", "accumulate", "outer", + "inner"). + *inputs: A tuple of the input arguments to the ufunc. + **kwargs: Any additional arguments. + + Returns: + The result of the operation. + """ + f = { + "reduce": ufunc.reduce, + "accumulate": ufunc.accumulate, + "reduceat": ufunc.reduceat, + "outer": ufunc.outer, + "at": ufunc.at, + "__call__": ufunc, + } + # convert the inputs to np.ndarray to prevent recursion, call the function, + # then cast it back as FeatureArray + output = FeatureArray( + f[method](*(i.view(np.ndarray) for i in inputs), **kwargs), + number_of_dimensions=kwargs["number_of_dimensions"], + ) + output.__dict__ = self.__dict__ # carry forward attributes + return output + + def __reduce__(self) -> Tuple[Any, Any, Any]: + """Needed in order to pickle this object. + + Returns: + A tuple. 
+ """ + pickled_state = super(FeatureArray, self).__reduce__() + new_state = pickled_state[2] + ( + self.number_of_dimensions, + self.is_sparse, + self.units, + ) + return pickled_state[0], pickled_state[1], new_state + + def __setstate__(self, state, **kwargs) -> None: + """Sets the state. + + Args: + state: The state argument must be a sequence that contains the following + elements version, shape, dtype, isFortan, rawdata. + **kwargs: Any additional parameter + """ + # Needed in order to load the object + self.number_of_dimensions = state[-3] + self.is_sparse = state[-2] + self.units = state[-1] + super(FeatureArray, self).__setstate__(state[0:-3], **kwargs) + + # pytype: enable=attribute-error + + @staticmethod + def _validate_number_of_dimensions( + number_of_dimensions: int, input_array: np.ndarray + ) -> None: + """Validates if the the input array has given number of dimensions. + + Args: + number_of_dimensions: number of dimensions + input_array: input array + + Raises: ValueError in case the dimensions do not match + """ + _sub_array = input_array + dim = 0 + # Go number_of_dimensions into the given input_array + for i in range(1, number_of_dimensions + 1): + _sub_array = _sub_array[0] + if isinstance(_sub_array, scipy.sparse.spmatrix): + dim = i + break + if isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0: + # sequence dimension is 0, we are dealing with "fake" features + dim = i + break + + # If the resulting sub_array is sparse, the remaining number of dimensions + # should be at least 2 + if isinstance(_sub_array, scipy.sparse.spmatrix): + if dim > 2: + raise ValueError( + f"Given number of dimensions '{number_of_dimensions}' does not " + f"match dimensions of given input array: {input_array}." + ) + elif isinstance(_sub_array, np.ndarray) and _sub_array.shape[0] == 0: + # sequence dimension is 0, we are dealing with "fake" features, + # but they should be of dim 2 + if dim > 2: + raise ValueError( + f"Given number of dimensions '{number_of_dimensions}' does not " + f"match dimensions of given input array: {input_array}." + ) + # If the resulting sub_array is dense, the sub_array should be a single number + elif not np.issubdtype(type(_sub_array), np.integer) and not isinstance( + _sub_array, (np.float32, np.float64) + ): + raise ValueError( + f"Given number of dimensions '{number_of_dimensions}' does not match " + f"dimensions of given input array: {input_array}." + ) + + def get_shape_type_info( + self, + ) -> Tuple[ + List[ + Union[ + int, + Tuple[None], + Tuple[None, int], + Tuple[None, None, int], + Tuple[None, None, None, int], + ] + ], + List[int], + ]: + """Returns shapes and types needed to convert this feature array into tensors. + + Returns: + A list of shape tuples. + A list of type tuples. 
+ """ + if self.is_sparse: + # 4D tensors were converted into 3D tensors during padding + number_of_dimensions = ( + self.number_of_dimensions if self.number_of_dimensions != 4 else 3 + ) + # scipy matrix is converted into indices, data, shape + return ( + [(None, number_of_dimensions), (None,), (number_of_dimensions)], + [tf.int64, tf.float32, tf.int64], + ) + + if self.number_of_dimensions == 1: + return [(None,)], [tf.float32] + + if self.number_of_dimensions == 2: + return [(None, self.units)], [tf.float32] + + if self.number_of_dimensions == 3: + return [(None, None, self.units)], [tf.float32] + + if self.number_of_dimensions == 4: + # 4D tensors were converted into 3D tensors during padding + return [(None, None, self.units)], [tf.float32] + + return [], [] class FeatureSignature(NamedTuple): - """Stores the shape and the type (sparse vs dense) of features.""" + """Signature of feature arrays. + + Stores the number of units, the type (sparse vs dense), and the number of + dimensions of features. + """ is_sparse: bool - feature_dimension: Optional[int] + units: Optional[int] + number_of_dimensions: int + + +# Mapping of attribute name and feature name to a list of feature arrays representing +# the actual features +# For example: +# "text" -> { "sentence": [ +# "feature array containing dense features for every training example", +# "feature array containing sparse features for every training example" +# ]} +Data = Dict[Text, Dict[Text, List[FeatureArray]]] class RasaModelData: """Data object used for all RasaModels. It contains all features needed to train the models. + 'data' is a mapping of attribute name, e.g. TEXT, INTENT, etc., and feature name, + e.g. SENTENCE, SEQUENCE, etc., to a list of feature arrays representing the actual + features. + 'label_key' and 'label_sub_key' point to the labels inside 'data'. For + example, if your intent labels are stored under INTENT -> IDS, 'label_key' would + be "INTENT" and 'label_sub_key' would be "IDS". """ def __init__( @@ -61,7 +302,6 @@ def __init__( label_sub_key: the sub key of a label used for balancing, etc. data: the data holding the features """ - self.data = data or defaultdict(lambda: defaultdict(list)) self.label_key = label_key self.label_sub_key = label_sub_key @@ -70,7 +310,7 @@ def __init__( def get( self, key: Text, sub_key: Optional[Text] = None - ) -> Union[Dict[Text, List[np.ndarray]], List[np.ndarray]]: + ) -> Union[Dict[Text, List[FeatureArray]], List[FeatureArray]]: """Get the data under the given keys. Args: @@ -96,7 +336,7 @@ def items(self) -> ItemsView: """ return self.data.items() - def values(self) -> ValuesView[Dict[Text, List[np.ndarray]]]: + def values(self) -> Any: """Return the values of the data attribute. Returns: @@ -121,6 +361,12 @@ def keys(self, key: Optional[Text] = None) -> List[Text]: return [] + def sort(self): + """Sorts data according to its keys.""" + for key, attribute_data in self.data.items(): + self.data[key] = OrderedDict(sorted(attribute_data.items())) + self.data = OrderedDict(sorted(self.data.items())) + def first_data_example(self) -> Data: """Return the data with just one feature example per key, sub-key. @@ -134,6 +380,18 @@ def first_data_example(self) -> Data: out_data[key][sub_key] = [feature[:1] for feature in features] return out_data + def does_feature_exist(self, key: Text, sub_key: Optional[Text] = None) -> bool: + """Check if feature key (and sub-key) is present and features are available. + + Args: + key: The key. + sub_key: The optional sub-key. 
+ + Returns: + False, if no features for the given keys exists, True otherwise. + """ + return not self.does_feature_not_exist(key, sub_key) + def does_feature_not_exist(self, key: Text, sub_key: Optional[Text] = None) -> bool: """Check if feature key (and sub-key) is present and features are available. @@ -177,7 +435,7 @@ def number_of_examples(self, data: Optional[Data] = None) -> int: return 0 example_lengths = [ - f.shape[0] + len(f) for attribute_data in data.values() for features in attribute_data.values() for f in features @@ -195,25 +453,25 @@ def number_of_examples(self, data: Optional[Data] = None) -> int: return example_lengths[0] - def feature_dimension(self, key: Text, sub_key: Text) -> int: - """Get the feature dimension of the given key. + def number_of_units(self, key: Text, sub_key: Text) -> int: + """Get the number of units of the given key. Args: key: The key. sub_key: The optional sub-key. Returns: - The feature dimension. + The number of units. """ if key not in self.data or sub_key not in self.data[key]: return 0 - number_of_features = 0 - for data in self.data[key][sub_key]: - if data.size > 0: - number_of_features += data[0].shape[-1] + units = 0 + for features in self.data[key][sub_key]: + if len(features) > 0: + units += features.units - return number_of_features + return units def add_data(self, data: Data, key_prefix: Optional[Text] = None) -> None: """Add incoming data to data. @@ -229,8 +487,30 @@ def add_data(self, data: Data, key_prefix: Optional[Text] = None) -> None: else: self.add_features(key, sub_key, features) + def update_key( + self, from_key: Text, from_sub_key: Text, to_key: Text, to_sub_key: Text + ) -> None: + """Copies the features under the given keys to the new keys and deletes the old keys. + + Args: + from_key: current feature key + from_sub_key: current feature sub-key + to_key: new key for feature + to_sub_key: new sub-key for feature + """ + if from_key not in self.data or from_sub_key not in self.data[from_key]: + return + + if to_key not in self.data: + self.data[to_key] = {} + self.data[to_key][to_sub_key] = self.get(from_key, from_sub_key) + del self.data[from_key][from_sub_key] + + if not self.data[from_key]: + del self.data[from_key] + def add_features( - self, key: Text, sub_key: Text, features: Optional[List[np.ndarray]] + self, key: Text, sub_key: Text, features: Optional[List[FeatureArray]] ) -> None: """Add list of features to data under specified key. @@ -244,9 +524,9 @@ def add_features( if features is None: return - for data in features: - if data.size > 0: - self.data[key][sub_key].append(data) + for feature_array in features: + if len(feature_array) > 0: + self.data[key][sub_key].append(feature_array) if not self.data[key][sub_key]: del self.data[key][sub_key] @@ -257,7 +537,7 @@ def add_features( def add_lengths( self, key: Text, sub_key: Text, from_key: Text, from_sub_key: Text ) -> None: - """Adds np.array of lengths of sequences to data under given key. + """Adds a feature array of lengths of sequences to data under given key. 
Args: key: The key to add the lengths to @@ -272,11 +552,28 @@ def add_lengths( self.data[key][sub_key] = [] - for data in self.data[from_key][from_sub_key]: - if data.size > 0: - lengths = np.array([x.shape[0] for x in data]) - self.data[key][sub_key].extend([lengths]) - break + for features in self.data[from_key][from_sub_key]: + if len(features) == 0: + continue + + if features.number_of_dimensions == 4: + lengths = FeatureArray( + np.array( + [ + # add one more dim so that dialogue dim + # would be a sequence + np.array([[[x.shape[0]]] for x in _features]) + for _features in features + ] + ), + number_of_dimensions=4, + ) + else: + lengths = FeatureArray( + np.array([x.shape[0] for x in features]), number_of_dimensions=1 + ) + self.data[key][sub_key].extend([lengths]) + break def split( self, number_of_test_examples: int, random_seed: int @@ -290,11 +587,10 @@ def split( Returns: A tuple of train and test RasaModelData. """ - self._check_label_key() if self.label_key is None or self.label_sub_key is None: - # randomly split data as no label key is split + # randomly split data as no label key is set multi_values = [ v for attribute_data in self.data.values() @@ -347,7 +643,9 @@ def split( return self._convert_train_test_split(output_values, solo_values) - def get_signature(self) -> Dict[Text, Dict[Text, List[FeatureSignature]]]: + def get_signature( + self, data: Optional[Data] = None + ) -> Dict[Text, Dict[Text, List[FeatureSignature]]]: """Get signature of RasaModelData. Signature stores the shape and whether features are sparse or not for every key. @@ -356,19 +654,18 @@ def get_signature(self) -> Dict[Text, Dict[Text, List[FeatureSignature]]]: A dictionary of key and sub-key to a list of feature signatures (same structure as the data attribute). """ + if not data: + data = self.data return { key: { sub_key: [ - FeatureSignature( - True if isinstance(f[0], scipy.sparse.spmatrix) else False, - f[0].shape[-1] if f[0].shape else None, - ) + FeatureSignature(f.is_sparse, f.units, f.number_of_dimensions) for f in features ] for sub_key, features in attribute_data.items() } - for key, attribute_data in self.data.items() + for key, attribute_data in data.items() } def as_tf_dataset( @@ -384,7 +681,6 @@ def as_tf_dataset( Returns: The tf.data.Dataset. """ - shapes, types = self._get_shapes_types() return tf.data.Dataset.from_generator( @@ -414,7 +710,6 @@ def prepare_batch( Returns: The features of the batch. """ - if not data: data = self.data @@ -440,7 +735,7 @@ def prepare_batch( else: _data = v[:] - if isinstance(_data[0], scipy.sparse.spmatrix): + if _data.is_sparse: batch_data.extend(self._scipy_matrix_to_values(_data)) else: batch_data.append(self._pad_dense_data(_data)) @@ -454,37 +749,15 @@ def _get_shapes_types(self) -> Tuple: Returns: A tuple of shapes and a tuple of types. 
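The non-4D branch of `add_lengths` boils down to reading the sequence (first) dimension off every example's feature matrix; a minimal numpy sketch:

```python
import numpy as np

# three examples with 4, 2 and 7 sequence steps of 10 features each
sequence_features = np.empty(3, dtype=object)
sequence_features[:] = [np.zeros((4, 10)), np.zeros((2, 10)), np.zeros((7, 10))]

lengths = np.array([x.shape[0] for x in sequence_features])
print(lengths)  # [4 2 7]
```

In the 4D case the same idea applies one nesting level deeper, with an extra axis so that the dialogue dimension itself remains a sequence.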
""" - types = [] shapes = [] - def append_shape(features: np.ndarray) -> None: - if isinstance(features[0], scipy.sparse.spmatrix): - # scipy matrix is converted into indices, data, shape - shapes.append((None, features[0].ndim + 1)) - shapes.append((None,)) - shapes.append((features[0].ndim + 1)) - elif features[0].ndim == 0: - shapes.append((None,)) - elif features[0].ndim == 1: - shapes.append((None, features[0].shape[-1])) - else: - shapes.append((None, None, features[0].shape[-1])) - - def append_type(features: np.ndarray) -> None: - if isinstance(features[0], scipy.sparse.spmatrix): - # scipy matrix is converted into indices, data, shape - types.append(tf.int64) - types.append(tf.float32) - types.append(tf.int64) - else: - types.append(tf.float32) - for attribute_data in self.data.values(): for features in attribute_data.values(): for f in features: - append_shape(f) - append_type(f) + _shapes, _types = f.get_shape_type_info() + shapes.extend(_shapes) + types.extend(_types) return tuple(shapes), tuple(types) @@ -497,7 +770,6 @@ def _shuffled_data(self, data: Data) -> Data: Returns: The shuffled data. """ - ids = np.random.permutation(self.num_examples) return self._data_for_ids(data, ids) @@ -584,7 +856,12 @@ def _balanced_data(self, data: Data, batch_size: int, shuffle: bool) -> Data: for key, attribute_data in new_data.items(): for sub_key, features in attribute_data.items(): for f in features: - final_data[key][sub_key].append(np.concatenate(np.array(f))) + final_data[key][sub_key].append( + FeatureArray( + np.concatenate(np.array(f)), + number_of_dimensions=f[0].number_of_dimensions, + ) + ) return final_data @@ -601,7 +878,6 @@ def _gen_batch( Returns: A generator over the batches. """ - data = self.data num_examples = self.num_examples @@ -633,7 +909,6 @@ def _check_train_test_sizes( Raises: A ValueError if the number of examples does not fit. """ - if number_of_test_examples >= self.num_examples - len(label_counts): raise ValueError( f"Test set of {number_of_test_examples} is too large. Remaining " @@ -657,7 +932,6 @@ def _data_for_ids(data: Optional[Data], ids: np.ndarray) -> Data: Returns: The filtered data """ - new_data = defaultdict(lambda: defaultdict(list)) if data is None: @@ -682,10 +956,9 @@ def _split_by_label_ids( Returns: Reorganized RasaModelData """ - label_data = [] for label_id in unique_label_ids: - matching_ids = label_ids == label_id + matching_ids = np.array(label_ids) == label_id label_data.append( RasaModelData( self.label_key, @@ -726,7 +999,6 @@ def _convert_train_test_split( Returns: The test and train RasaModelData """ - data_train = defaultdict(lambda: defaultdict(list)) data_val = defaultdict(lambda: defaultdict(list)) @@ -737,10 +1009,12 @@ def _convert_train_test_split( index = 0 for key, attribute_data in self.data.items(): for sub_key, features in attribute_data.items(): - for _ in features: + for f in features: data_train[key][sub_key].append( self._combine_features( - output_values[index * 2], solo_values[index] + output_values[index * 2], + solo_values[index], + f.number_of_dimensions, ) ) index += 1 @@ -762,7 +1036,8 @@ def _convert_train_test_split( def _combine_features( feature_1: Union[np.ndarray, scipy.sparse.spmatrix], feature_2: Union[np.ndarray, scipy.sparse.spmatrix], - ) -> Union[np.ndarray, scipy.sparse.spmatrix]: + number_of_dimensions: Optional[int] = 1, + ) -> FeatureArray: """Concatenate features. Args: @@ -772,20 +1047,23 @@ def _combine_features( Returns: The combined features. 
""" - if isinstance(feature_1, scipy.sparse.spmatrix) and isinstance( feature_2, scipy.sparse.spmatrix ): if feature_2.shape[0] == 0: - return feature_1 + return FeatureArray(feature_1, number_of_dimensions) if feature_1.shape[0] == 0: - return feature_2 - return scipy.sparse.vstack([feature_1, feature_2]) + return FeatureArray(feature_2, number_of_dimensions) + return FeatureArray( + scipy.sparse.vstack([feature_1, feature_2]), number_of_dimensions + ) - return np.concatenate([feature_1, feature_2]) + return FeatureArray( + np.concatenate([feature_1, feature_2]), number_of_dimensions + ) @staticmethod - def _create_label_ids(label_ids: np.ndarray) -> np.ndarray: + def _create_label_ids(label_ids: FeatureArray) -> np.ndarray: """Convert various size label_ids into single dim array. For multi-label y, map each distinct row to a string representation @@ -795,10 +1073,12 @@ def _create_label_ids(label_ids: np.ndarray) -> np.ndarray: Args: label_ids: The label ids. + Raises: + ValueError if dimensionality of label ids is not supported + Returns: The single dim label array. """ - if label_ids.ndim == 1: return label_ids @@ -814,7 +1094,24 @@ def _create_label_ids(label_ids: np.ndarray) -> np.ndarray: raise ValueError("Unsupported label_ids dimensions") @staticmethod - def _pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray: + def _filter_out_fake_inputs( + array_of_array_of_features: FeatureArray, + ) -> Union[List[List[np.ndarray]], List[List[scipy.sparse.spmatrix]]]: + return list( + filter( + # filter empty lists created by another filter + lambda x: len(x) > 0, + [ + # filter all the "fake" inputs, we know the input is "fake", + # when sequence dimension is `0` + list(filter(lambda x: x.shape[0] > 0, array_of_features)) + for array_of_features in array_of_array_of_features + ], + ) + ) + + @staticmethod + def _pad_dense_data(array_of_dense: FeatureArray) -> np.ndarray: """Pad data of different lengths. Sequential data is padded with zeros. Zeros are added to the end of data. @@ -825,6 +1122,8 @@ def _pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray: Returns: The padded array. """ + if array_of_dense.number_of_dimensions == 4: + return RasaModelData._pad_4d_dense_data(array_of_dense) if array_of_dense[0].ndim < 2: # data doesn't contain a sequence @@ -843,7 +1142,54 @@ def _pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray: return data_padded.astype(np.float32) @staticmethod - def _scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: + def _pad_4d_dense_data(array_of_array_of_dense: FeatureArray) -> np.ndarray: + # in case of dialogue data we may have 4 dimensions + # batch size x dialogue history length x sequence length x number of features + + # as transformers cannot handle 4D tensors pad and reshape the data + # so that the resulting tensor is 3D + # the shape is (sum of dialogue history length for all tensors in the + # batch x max sequence length x number of features) + # the original shape and the original dialogue length is passed on to the model + # it can be used to transform the 3D tensor back into 4D + + # in order to create 4d tensor inputs, we created "fake" zero features + # for nonexistent inputs. To save calculation we filter this features before + # input to tf methods. 
+        number_of_features = array_of_array_of_dense[0][0].shape[-1]
+        array_of_array_of_dense = RasaModelData._filter_out_fake_inputs(
+            array_of_array_of_dense
+        )
+        if not array_of_array_of_dense:
+            # return empty 3d array with appropriate last dims
+            return np.zeros((0, 0, number_of_features), dtype=np.float32)
+
+        combined_dialogue_len = sum(
+            len(array_of_dense) for array_of_dense in array_of_array_of_dense
+        )
+        max_seq_len = max(
+            [
+                x.shape[0]
+                for array_of_dense in array_of_array_of_dense
+                for x in array_of_dense
+            ]
+        )
+
+        data_padded = np.zeros(
+            [combined_dialogue_len, max_seq_len, number_of_features],
+            dtype=array_of_array_of_dense[0][0].dtype,
+        )
+
+        current_sum_dialogue_len = 0
+        for i, array_of_dense in enumerate(array_of_array_of_dense):
+            for j, dense in enumerate(array_of_dense):
+                data_padded[current_sum_dialogue_len + j, : dense.shape[0], :] = dense
+            current_sum_dialogue_len += len(array_of_dense)
+
+        return data_padded.astype(np.float32)
+
+    @staticmethod
+    def _scipy_matrix_to_values(array_of_sparse: FeatureArray) -> List[np.ndarray]:
        """Convert a scipy matrix into indices, data, and shape.

        Args:
@@ -852,6 +1198,8 @@ def _scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]:
        Returns:
            A list of dense numpy arrays representing the sparse data.
        """
+        if array_of_sparse.number_of_dimensions == 4:
+            return RasaModelData._4d_scipy_matrix_to_values(array_of_sparse)

        # we need to make sure that the matrices are coo_matrices otherwise the
        # transformation does not work (e.g. you cannot access x.row, x.col)
@@ -878,3 +1226,78 @@
            data.astype(np.float32),
            shape.astype(np.int64),
        ]
+
+    @staticmethod
+    def _4d_scipy_matrix_to_values(
+        array_of_array_of_sparse: FeatureArray,
+    ) -> List[np.ndarray]:
+        # in case of dialogue data we may have 4 dimensions
+        # batch size x dialogue history length x sequence length x number of features
+
+        # transformers cannot handle 4D tensors, therefore pad and reshape the data
+        # so that the resulting tensor is 3D
+        # the shape is (sum of dialogue history length for all tensors in the
+        # batch x max sequence length x number of features)
+        # the original shape and the original dialogue lengths are passed on to the
+        # model; they can be used to transform the 3D tensor back into 4D
+
+        # in order to create 4D tensor inputs, we created "fake" zero features
+        # for nonexistent inputs. To save computation, we filter these features out
+        # before passing them to tf methods.
+        number_of_features = array_of_array_of_sparse[0][0].shape[-1]
+        array_of_array_of_sparse = RasaModelData._filter_out_fake_inputs(
+            array_of_array_of_sparse
+        )
+        if not array_of_array_of_sparse:
+            # create empty array with appropriate last dims
+            return [
+                np.empty((0, 3), dtype=np.int64),
+                np.array([], dtype=np.float32),
+                np.array([0, 0, number_of_features], dtype=np.int64),
+            ]
+
+        # we need to make sure that the matrices are coo_matrices otherwise the
+        # transformation does not work (e.g.
you cannot access x.row, x.col) + if not isinstance(array_of_array_of_sparse[0][0], scipy.sparse.coo_matrix): + array_of_array_of_sparse = [ + [x.tocoo() for x in array_of_sparse] + for array_of_sparse in array_of_array_of_sparse + ] + + dialogue_len = [ + len(array_of_sparse) for array_of_sparse in array_of_array_of_sparse + ] + combined_dialogue_len = sum(dialogue_len) + max_seq_len = max( + [ + x.shape[0] + for array_of_sparse in array_of_array_of_sparse + for x in array_of_sparse + ] + ) + # get the indices of values + indices = np.hstack( + [ + np.vstack( + [sum(dialogue_len[:i]) + j * np.ones_like(x.row), x.row, x.col] + ) + for i, array_of_sparse in enumerate(array_of_array_of_sparse) + for j, x in enumerate(array_of_sparse) + ] + ).T + + data = np.hstack( + [ + x.data + for array_of_sparse in array_of_array_of_sparse + for x in array_of_sparse + ] + ) + + shape = np.array((combined_dialogue_len, max_seq_len, number_of_features)) + + return [ + indices.astype(np.int64), + data.astype(np.float32), + shape.astype(np.int64), + ] diff --git a/rasa/utils/tensorflow/model_data_utils.py b/rasa/utils/tensorflow/model_data_utils.py index 5ea202eb25c5..4b268c0be4b0 100644 --- a/rasa/utils/tensorflow/model_data_utils.py +++ b/rasa/utils/tensorflow/model_data_utils.py @@ -1,226 +1,381 @@ import typing -from typing import List, Optional, Text, Dict, Tuple, Union, Any import copy import numpy as np -from collections import defaultdict, OrderedDict import scipy.sparse +from collections import defaultdict, OrderedDict +from typing import List, Optional, Text, Dict, Tuple, Union, Any -from rasa.utils.tensorflow.model_data import Data -from rasa.utils.tensorflow.constants import SEQUENCE +from rasa.nlu.constants import TOKENS_NAMES +from rasa.utils.tensorflow.model_data import Data, FeatureArray +from rasa.utils.tensorflow.constants import MASK +from rasa.shared.nlu.training_data.message import Message +from rasa.shared.nlu.constants import ( + TEXT, + ENTITIES, + FEATURE_TYPE_SEQUENCE, + ENTITY_ATTRIBUTE_TYPE, + ENTITY_ATTRIBUTE_GROUP, + ENTITY_ATTRIBUTE_ROLE, +) if typing.TYPE_CHECKING: from rasa.shared.nlu.training_data.features import Features + from rasa.nlu.classifiers.diet_classifier import EntityTagSpec + + +TAG_ID_ORIGIN = "tag_id_origin" + + +def featurize_training_examples( + training_examples: List[Message], + attributes: List[Text], + entity_tag_specs: Optional[List["EntityTagSpec"]] = None, + featurizers: Optional[List[Text]] = None, + bilou_tagging: bool = False, +) -> List[Dict[Text, List["Features"]]]: + """Converts training data into a list of attribute to features. + + Possible attributes are, for example, INTENT, RESPONSE, TEXT, ACTION_TEXT, + ACTION_NAME or ENTITIES. + + Args: + training_examples: the list of training examples + attributes: the attributes to consider + entity_tag_specs: the entity specs + featurizers: the featurizers to consider + bilou_tagging: indicates whether BILOU tagging should be used or not + + Returns: + A list of attribute to features. 
+ """ + output = [] + + for example in training_examples: + attribute_to_features = {} + for attribute in attributes: + if attribute == ENTITIES: + attribute_to_features[attribute] = [] + # in case of entities add the tag_ids + for tag_spec in entity_tag_specs: + attribute_to_features[attribute].append( + _get_tag_ids(example, tag_spec, bilou_tagging) + ) + elif attribute in example.data: + attribute_to_features[attribute] = example.get_all_features( + attribute, featurizers + ) + output.append(attribute_to_features) -MASK = "mask" + return output -def surface_attributes( - tracker_state_features: List[List[Dict[Text, List["Features"]]]] +def _get_tag_ids( + example: Message, tag_spec: "EntityTagSpec", bilou_tagging: bool +) -> "Features": + """Creates a feature array containing the entity tag ids of the given example.""" + from rasa.nlu.test import determine_token_labels + from rasa.nlu.utils.bilou_utils import bilou_tags_to_ids + from rasa.shared.nlu.training_data.features import Features + + if bilou_tagging: + _tags = bilou_tags_to_ids(example, tag_spec.tags_to_ids, tag_spec.tag_name) + else: + _tags = [] + for token in example.get(TOKENS_NAMES[TEXT]): + _tag = determine_token_labels( + token, example.get(ENTITIES), attribute_key=tag_spec.tag_name + ) + _tags.append(tag_spec.tags_to_ids[_tag]) + + # transpose to have seq_len x 1 + return Features( + np.array([_tags]).T, FEATURE_TYPE_SEQUENCE, tag_spec.tag_name, TAG_ID_ORIGIN + ) + + +def _surface_attributes( + features: List[List[Dict[Text, List["Features"]]]], + featurizers: Optional[List[Text]] = None, ) -> Dict[Text, List[List[List["Features"]]]]: """Restructure the input. + "features" can, for example, be a dictionary of attributes (INTENT, + TEXT, ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, FORM) to a list of features for + all dialogue turns in all training trackers. + For NLU training it would just be a dictionary of attributes (either INTENT or + RESPONSE, TEXT, and potentially ENTITIES) to a list of features for all training + examples. + + The incoming "features" contain a dictionary as inner most value. This method + surfaces this dictionary, so that it becomes the outer most value. + Args: - tracker_state_features: a dictionary of attributes (INTENT, TEXT, ACTION_NAME, - ACTION_TEXT, ENTITIES, SLOTS, FORM) to a list of features for all - dialogue turns in all training trackers + features: a dictionary of attributes to a list of features for all + examples in the training data + featurizers: the featurizers to consider Returns: - A dictionary of attributes to a list of features for all dialogue turns - and all training trackers. + A dictionary of attributes to a list of features for all examples. 
""" # collect all attributes attributes = set( attribute - for features_in_tracker in tracker_state_features - for features_in_turn in features_in_tracker - for attribute in features_in_turn.keys() + for list_of_attribute_to_features in features + for attribute_to_features in list_of_attribute_to_features + for attribute in attribute_to_features.keys() ) - attribute_to_features = defaultdict(list) + output = defaultdict(list) - for features_in_tracker in tracker_state_features: + for list_of_attribute_to_features in features: intermediate_features = defaultdict(list) - - for features_in_dialogue in features_in_tracker: + for attribute_to_features in list_of_attribute_to_features: for attribute in attributes: + features = attribute_to_features.get(attribute) + if featurizers: + features = _filter_features(features, featurizers) + # if attribute is not present in the example, populate it with None - intermediate_features[attribute].append( - features_in_dialogue.get(attribute) - ) + intermediate_features[attribute].append(features) for key, value in intermediate_features.items(): - attribute_to_features[key].append(value) + output[key].append(value) - return attribute_to_features + return output -def create_zero_features( - tracker_features: List[List[List["Features"]]], -) -> List["Features"]: - # all features should have the same types - """ - Computes default feature values for an attribute; +def _filter_features( + features: Optional[List["Features"]], featurizers: List[Text] +) -> Optional[List["Features"]]: + """Filter the given features. + + Return only those features that are coming from one of the given featurizers. + Args: - tracker_features: list containing all feature values encountered - in the dataset for an attribute; + features: list of features + featurizers: names of featurizers to consider + + Returns: + The filtered list of features. """ + if features is None or not featurizers: + return features + + # it might be that the list of features also contains some tag_ids + # the origin of the tag_ids is set to TAG_ID_ORIGIN + # add TAG_ID_ORIGIN to the list of featurizers to make sure that we keep the + # tag_ids + featurizers.append(TAG_ID_ORIGIN) + + # filter the features + return [f for f in features if f.origin in featurizers] + +def _create_fake_features( + all_features: List[List[List["Features"]]], +) -> List["Features"]: + """Computes default feature values. + + All given features should have the same type, e.g. dense or sparse. + + Args: + all_features: list containing all feature values encountered in the dataset + for an attribute. 
+ + Returns: + The default features + """ example_features = next( iter( [ list_of_features - for turn_features in tracker_features - for list_of_features in turn_features + for list_of_list_of_features in all_features + for list_of_features in list_of_list_of_features if list_of_features is not None ] ) ) - # create zero_features for nones - zero_features = [] - for features in example_features: - new_features = copy.deepcopy(features) - if features.is_dense(): - new_features.features = np.zeros_like(features.features) - if features.is_sparse(): + # create fake_features for Nones + fake_features = [] + for _features in example_features: + new_features = copy.deepcopy(_features) + if _features.is_dense(): + new_features.features = np.zeros( + (0, _features.features.shape[-1]), _features.features.dtype + ) + if _features.is_sparse(): new_features.features = scipy.sparse.coo_matrix( - features.features.shape, features.features.dtype + (0, _features.features.shape[-1]), _features.features.dtype ) - zero_features.append(new_features) + fake_features.append(new_features) - return zero_features + return fake_features def convert_to_data_format( - tracker_state_features: Union[ + features: Union[ List[List[Dict[Text, List["Features"]]]], List[Dict[Text, List["Features"]]] ], - zero_state_features: Optional[Dict[Text, List["Features"]]] = None, + fake_features: Optional[Dict[Text, List["Features"]]] = None, + consider_dialogue_dimension: bool = True, + featurizers: Optional[List[Text]] = None, ) -> Tuple[Data, Optional[Dict[Text, List["Features"]]]]: """Converts the input into "Data" format. + "features" can, for example, be a dictionary of attributes (INTENT, + TEXT, ACTION_NAME, ACTION_TEXT, ENTITIES, SLOTS, FORM) to a list of features for + all dialogue turns in all training trackers. + For NLU training it would just be a dictionary of attributes (either INTENT or + RESPONSE, TEXT, and potentially ENTITIES) to a list of features for all training + examples. + + The "Data" format corresponds to Dict[Text, Dict[Text, List[FeatureArray]]]. It's + a dictionary of attributes (e.g. TEXT) to a dictionary of secondary attributes + (e.g. SEQUENCE or SENTENCE) to the list of actual features. + Args: - tracker_state_features: a dictionary of attributes (INTENT, TEXT, ACTION_NAME, - ACTION_TEXT, ENTITIES, SLOTS, FORM) to a list of features for all - dialogue turns in all training trackers - zero_state_features: Contains default feature values for attributes + features: a dictionary of attributes to a list of features for all + examples in the training data + fake_features: Contains default feature values for attributes + consider_dialogue_dimension: If set to false the dialogue dimension will be + removed from the resulting sequence features. 
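The zero-row placeholders built by `_create_fake_features` above, in standalone form: dense features become a `(0, units)` array and sparse features a `(0, units)` coo matrix, so the unit dimension stays intact while the sequence dimension of `0` marks the features as fake (a sketch, not the exact helper):

```python
import numpy as np
import scipy.sparse

units = 20

fake_dense = np.zeros((0, units), dtype=np.float32)
fake_sparse = scipy.sparse.coo_matrix((0, units), dtype=np.float32)

# downstream layers see consistent unit counts, but no time steps contribute
assert fake_dense.shape == (0, units) and fake_sparse.shape == (0, units)
```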
+ featurizers: the featurizers to consider Returns: - Input in "Data" format and zero state features + Input in "Data" format and fake features """ training = False - if not zero_state_features: + if not fake_features: training = True - zero_state_features = defaultdict(list) + fake_features = defaultdict(list) # unify format of incoming features - if isinstance(tracker_state_features[0], Dict): - tracker_state_features = [[dicts] for dicts in tracker_state_features] + if isinstance(features[0], Dict): + features = [[dicts] for dicts in features] - state_to_tracker_features = surface_attributes(tracker_state_features) + attribute_to_features = _surface_attributes(features, featurizers) attribute_data = {} - # During prediction we need to iterate over the zero features attributes to + # During prediction we need to iterate over the fake features attributes to + # have all keys in the resulting model data if training: - attributes = list(state_to_tracker_features.keys()) + attributes = list(attribute_to_features.keys()) else: - attributes = list(zero_state_features.keys()) + attributes = list(fake_features.keys()) # In case an attribute is not present during prediction, replace it with - # None values that will then be replaced by zero features + # None values that will then be replaced by fake features dialogue_length = 1 - for tracker_features in state_to_tracker_features.values(): - dialogue_length = max(dialogue_length, len(tracker_features[0])) - empty_features = [[None] * dialogue_length] + num_examples = 1 + for _features in attribute_to_features.values(): + num_examples = max(num_examples, len(_features)) + dialogue_length = max(dialogue_length, len(_features[0])) + absent_features = [[None] * dialogue_length] * num_examples for attribute in attributes: - attribute_data[attribute] = _features_for_attribute( + attribute_data[attribute] = _feature_arrays_for_attribute( attribute, - empty_features, - state_to_tracker_features, + absent_features, + attribute_to_features, training, - zero_state_features, + fake_features, + consider_dialogue_dimension, ) # ensure that all attributes are in the same order attribute_data = OrderedDict(sorted(attribute_data.items())) - return attribute_data, zero_state_features + return attribute_data, fake_features -def _features_for_attribute( +def _feature_arrays_for_attribute( attribute: Text, - empty_features: List[Any], - state_to_tracker_features: Dict[Text, List[List[List["Features"]]]], + absent_features: List[Any], + attribute_to_features: Dict[Text, List[List[List["Features"]]]], training: bool, - zero_state_features: Dict[Text, List["Features"]], -) -> Dict[Text, List[np.ndarray]]: - """Create the features for the given attribute from the tracker features. + fake_features: Dict[Text, List["Features"]], + consider_dialogue_dimension: bool, +) -> Dict[Text, List[FeatureArray]]: + """Create the features for the given attribute from the all examples features. 
Args: - attribute: the attribute - empty_features: empty features - state_to_tracker_features: tracker features for every state + attribute: the attribute of Message to be featurized + absent_features: list of Nones, used as features if `attribute_to_features` + does not contain the `attribute` + attribute_to_features: features for every example training: boolean indicating whether we are currently in training or not - zero_state_features: zero features + fake_features: zero features + consider_dialogue_dimension: If set to false the dialogue dimension will be + removed from the resulting sequence features. Returns: A dictionary of feature type to actual features for the given attribute. """ - tracker_features = ( - state_to_tracker_features[attribute] - if attribute in state_to_tracker_features - else empty_features + features = ( + attribute_to_features[attribute] + if attribute in attribute_to_features + else absent_features ) - # in case some features for a specific attribute and dialogue turn are + # in case some features for a specific attribute are # missing, replace them with a feature vector of zeros if training: - zero_state_features[attribute] = create_zero_features(tracker_features) + fake_features[attribute] = _create_fake_features(features) - (attribute_masks, _dense_features, _sparse_features) = map_tracker_features( - tracker_features, zero_state_features[attribute] + (attribute_masks, _dense_features, _sparse_features) = _extract_features( + features, fake_features[attribute], attribute ) - sparse_features = defaultdict(list) - dense_features = defaultdict(list) + sparse_features = {} + dense_features = {} - # vstack serves as removing dimension - # TODO check vstack for sequence features for key, values in _sparse_features.items(): - sparse_features[key] = [scipy.sparse.vstack(value) for value in values] + if consider_dialogue_dimension: + sparse_features[key] = FeatureArray( + np.array(values), number_of_dimensions=4 + ) + else: + sparse_features[key] = FeatureArray( + np.array([v[0] for v in values]), number_of_dimensions=3 + ) + for key, values in _dense_features.items(): - dense_features[key] = [np.vstack(value) for value in values] + if consider_dialogue_dimension: + dense_features[key] = FeatureArray(np.array(values), number_of_dimensions=4) + else: + dense_features[key] = FeatureArray( + np.array([v[0] for v in values]), number_of_dimensions=3 + ) - attribute_features = {MASK: [np.array(attribute_masks)]} + attribute_to_feature_arrays = { + MASK: [FeatureArray(np.array(attribute_masks), number_of_dimensions=3)] + } feature_types = set() feature_types.update(list(dense_features.keys())) feature_types.update(list(sparse_features.keys())) for feature_type in feature_types: - if feature_type == SEQUENCE: - # TODO we don't take sequence features because that makes us deal - # with 4D sparse tensors - continue - - attribute_features[feature_type] = [] + attribute_to_feature_arrays[feature_type] = [] if feature_type in sparse_features: - attribute_features[feature_type].append( - np.array(sparse_features[feature_type]) + attribute_to_feature_arrays[feature_type].append( + sparse_features[feature_type] ) if feature_type in dense_features: - attribute_features[feature_type].append( - np.array(dense_features[feature_type]) + attribute_to_feature_arrays[feature_type].append( + dense_features[feature_type] ) - return attribute_features + return attribute_to_feature_arrays -def map_tracker_features( - tracker_features: List[List[List["Features"]]], zero_features: 
List["Features"] +def _extract_features( + features: List[List[List["Features"]]], + fake_features: List["Features"], + attribute: Text, ) -> Tuple[ List[np.ndarray], Dict[Text, List[List["Features"]]], @@ -230,8 +385,8 @@ def map_tracker_features( into sparse and dense features. Args: - tracker_features: all features - zero_features: list of zero features + features: all features + fake_features: list of zero features Returns: - a list of attribute masks @@ -242,33 +397,50 @@ def map_tracker_features( dense_features = defaultdict(list) attribute_masks = [] - for turn_features in tracker_features: + for list_of_list_of_features in features: dialogue_sparse_features = defaultdict(list) dialogue_dense_features = defaultdict(list) # create a mask for every state # to capture which turn has which input - attribute_mask = np.expand_dims(np.ones(len(turn_features), np.float32), -1) + attribute_mask = np.ones(len(list_of_list_of_features), np.float32) - for i, list_of_features in enumerate(turn_features): + for i, list_of_features in enumerate(list_of_list_of_features): if list_of_features is None: # use zero features and set mask to zero attribute_mask[i] = 0 - list_of_features = zero_features + list_of_features = fake_features for features in list_of_features: + # in case of ENTITIES, if the attribute type matches either 'entity', + # 'role', or 'group' the features correspond to the tag ids of that + # entity type in order to distinguish later on between the different + # tag ids, we use the entity type as key + if attribute == ENTITIES and features.attribute in [ + ENTITY_ATTRIBUTE_TYPE, + ENTITY_ATTRIBUTE_GROUP, + ENTITY_ATTRIBUTE_ROLE, + ]: + key = features.attribute + else: + key = features.type + # all features should have the same types if features.is_sparse(): - dialogue_sparse_features[features.type].append(features.features) + dialogue_sparse_features[key].append(features.features) else: - dialogue_dense_features[features.type].append(features.features) + dialogue_dense_features[key].append(features.features) for key, value in dialogue_sparse_features.items(): sparse_features[key].append(value) for key, value in dialogue_dense_features.items(): dense_features[key].append(value) + # add additional dimension to attribute mask + # to get a vector of shape (dialogue length x 1), + # the batch dim will be added later + attribute_mask = np.expand_dims(attribute_mask, -1) attribute_masks.append(attribute_mask) return attribute_masks, dense_features, sparse_features diff --git a/rasa/utils/tensorflow/models.py b/rasa/utils/tensorflow/models.py index fb2949bdd3fb..f4ff88562645 100644 --- a/rasa/utils/tensorflow/models.py +++ b/rasa/utils/tensorflow/models.py @@ -1,6 +1,7 @@ import datetime import tensorflow as tf +import tensorflow_addons as tfa import numpy as np import logging import os @@ -26,6 +27,7 @@ from rasa.utils.tensorflow.model_data import RasaModelData, FeatureSignature from rasa.utils.tensorflow.constants import ( SEQUENCE, + SENTENCE, TENSORBOARD_LOG_LEVEL, RANDOM_SEED, TENSORBOARD_LOG_DIR, @@ -47,6 +49,12 @@ MAX_NEG_SIM, USE_MAX_NEG_SIM, NEGATIVE_MARGIN_SCALE, + HIDDEN_LAYERS_SIZES, + DROP_RATE, + DENSE_DIMENSION, + CONCAT_DIMENSION, + DROP_RATE_ATTENTION, + SCALE_LOSS, ) from rasa.utils.tensorflow import layers from rasa.utils.tensorflow.transformer import TransformerEncoder @@ -113,7 +121,8 @@ def _set_up_tensorboard_writer(self) -> None: if self.tensorboard_log_dir is not None: if self.tensorboard_log_level not in TENSORBOARD_LOG_LEVELS: raise ValueError( - f"Provided 
'{TENSORBOARD_LOG_LEVEL}' ('{self.tensorboard_log_level}') "
+                    f"Provided '{TENSORBOARD_LOG_LEVEL}' "
+                    f"('{self.tensorboard_log_level}') "
                    f"is invalid! Valid values are: {TENSORBOARD_LOG_LEVELS}"
                )
            self.tensorboard_log_on_epochs = self.tensorboard_log_level == "epoch"
@@ -131,16 +140,44 @@ def _set_up_tensorboard_writer(self) -> None:
            self.train_summary_writer = tf.summary.create_file_writer(train_log_dir)
            self.test_summary_writer = tf.summary.create_file_writer(test_log_dir)

-            self.model_summary_file = f"{self.tensorboard_log_dir}/{class_name}/{current_time}/model_summary.txt"
+            self.model_summary_file = (
+                f"{self.tensorboard_log_dir}/{class_name}/{current_time}"
+                f"/model_summary.txt"
+            )

    def batch_loss(
        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
    ) -> tf.Tensor:
+        """Calculates the loss for the given batch.
+
+        Args:
+            batch_in: The batch.
+
+        Returns:
+            The loss of the given batch.
+        """
        raise NotImplementedError

+    def prepare_for_predict(self) -> None:
+        """Prepares the tf graph for prediction.
+
+        This method should contain necessary tf calculations
+        and set the instance variables that are used in `batch_predict`.
+        For example, the pre-calculation of `self.all_labels_embed`.
+        """
+        pass
+
    def batch_predict(
        self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]]
    ) -> Dict[Text, tf.Tensor]:
+        """Predicts the output of the given batch.
+
+        Args:
+            batch_in: The batch.
+
+        Returns:
+            The output to predict.
+        """
        raise NotImplementedError

    def fit(
@@ -155,8 +192,7 @@ def fit(
        loading: bool = False,
        eager: bool = False,
    ) -> None:
-        """Fit model data"""
-
+        """Fit model data."""
        # don't setup tensorboard writers when training during loading
        if not loading:
            self._set_up_tensorboard_writer()
@@ -238,7 +274,8 @@ def fit(
        if self.checkpoint_model:
            logger.info(
-                f"The model of epoch {self.best_model_epoch} (out of {epochs} in total) will be stored!"
+                f"The model of epoch {self.best_model_epoch} "
+                f"(out of {epochs} in total) will be stored!"
) if self.model_summary_file is not None: self._write_model_summary() @@ -250,8 +287,7 @@ def fit( def train_on_batch( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> None: - """Train on batch""" - + """Train on batch.""" # calculate supervision and regularization losses separately with tf.GradientTape(persistent=True) as tape: prediction_loss = self.batch_loss(batch_in) @@ -288,6 +324,7 @@ def build_for_predict( self, predict_data: RasaModelData, eager: bool = False ) -> None: self._training = False # needed for tf graph mode + self.prepare_for_predict() self._predict_function = self._get_tf_call_model_function( predict_data.as_tf_dataset, self.batch_predict, eager, "prediction" ) @@ -297,8 +334,8 @@ def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]: logger.debug("There is no tensorflow prediction graph.") self.build_for_predict(predict_data) - # Prepare a single batch of size 1 - batch_in = predict_data.prepare_batch(start=0, end=1) + # Prepare a single batch of the size of the input + batch_in = predict_data.prepare_batch() self._training = False # needed for eager mode return self._predict_function(batch_in) @@ -374,8 +411,7 @@ def load( def _total_batch_loss( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> tf.Tensor: - """Calculate total loss""" - + """Calculate total loss.""" prediction_loss = self.batch_loss(batch_in) regularization_loss = tf.math.add_n(self.losses) total_loss = prediction_loss + regularization_loss @@ -392,7 +428,7 @@ def _batch_loop( offset: int, writer: Optional["ResourceSummaryWriter"] = None, ) -> int: - """Run on batches""" + """Run on batches.""" self.reset_metrics() step = offset @@ -415,8 +451,7 @@ def _get_tf_call_model_function( eager: bool, phase: Text, ) -> Callable: - """Convert functions to tensorflow functions""" - + """Convert functions to tensorflow functions.""" if eager: return call_model_function @@ -435,7 +470,7 @@ def _get_tf_call_model_function( def _get_tf_train_functions( self, eager: bool, model_data: RasaModelData, batch_strategy: Text ) -> Tuple[Callable, Callable]: - """Create train tensorflow functions""" + """Create train tensorflow functions.""" def train_dataset_function(_batch_size: int) -> tf.data.Dataset: return model_data.as_tf_dataset(_batch_size, batch_strategy, shuffle=True) @@ -451,8 +486,7 @@ def train_dataset_function(_batch_size: int) -> tf.data.Dataset: def _get_tf_evaluation_functions( self, eager: bool, evaluation_model_data: Optional[RasaModelData] ) -> Tuple[Optional[Callable], Optional[Callable]]: - """Create evaluation tensorflow functions""" - + """Create evaluation tensorflow functions.""" if evaluation_model_data is None: return None, None @@ -540,26 +574,23 @@ def batch_to_model_data_format( data, shape before, this methods converts them into sparse tensors. Dense data is kept. 
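As a hedged sketch of the sparse branch handled below: each sparse feature travels through the flat batch as three consecutive dense arrays (indices, values, shape) and is reassembled into a `tf.SparseTensor`:

```python
import numpy as np
import tensorflow as tf

# flat-batch fragment for one sparse feature, as built by prepare_batch
indices = np.array([[0, 1], [1, 0]], dtype=np.int64)  # positions of non-zeros
values = np.array([0.5, 1.0], dtype=np.float32)
shape = np.array([2, 3], dtype=np.int64)              # batch size x units

sparse_tensor = tf.SparseTensor(indices, values, shape)
print(tf.sparse.to_dense(sparse_tensor).numpy())
# [[0.  0.5 0. ]
#  [1.  0.  0. ]]
```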
""" - batch_data = defaultdict(lambda: defaultdict(list)) idx = 0 for key, values in data_signature.items(): for sub_key, signature in values.items(): - for is_sparse, feature_dimension in signature: + for is_sparse, feature_dimension, number_of_dimensions in signature: + number_of_dimensions = ( + number_of_dimensions if number_of_dimensions != 4 else 3 + ) if is_sparse: # explicitly substitute last dimension in shape with known # static value + shape = [ + batch[idx + 2][i] for i in range(number_of_dimensions - 1) + ] + [feature_dimension] batch_data[key][sub_key].append( - tf.SparseTensor( - batch[idx], - batch[idx + 1], - [ - batch[idx + 2][0], - batch[idx + 2][1], - feature_dimension, - ], - ) + tf.SparseTensor(batch[idx], batch[idx + 1], shape) ) idx += 3 else: @@ -582,7 +613,6 @@ def linearly_increasing_batch_size( The idea comes from https://arxiv.org/abs/1711.00489. """ - if not isinstance(batch_size, list): return int(batch_size) @@ -721,16 +751,18 @@ def _prepare_ffnn_layer( def _prepare_transformer_layer( self, name: Text, + num_layers: int, + units: int, drop_rate: float, drop_rate_attention: float, prefix: Text = "transformer", ): - if self.config[NUM_TRANSFORMER_LAYERS] > 0: + if num_layers > 0: self._tf_layers[f"{prefix}.{name}"] = TransformerEncoder( - self.config[NUM_TRANSFORMER_LAYERS], - self.config[TRANSFORMER_SIZE], + num_layers, + units, self.config[NUM_HEADS], - self.config[TRANSFORMER_SIZE] * 4, + units * 4, self.config[REGULARIZATION_CONSTANT], dropout_rate=drop_rate, attention_dropout_rate=drop_rate_attention, @@ -775,7 +807,7 @@ def _prepare_sparse_dense_layers( ) -> None: sparse = False dense = False - for is_sparse, _ in data_signature: + for is_sparse, _, _ in data_signature: if is_sparse: sparse = True else: @@ -790,9 +822,74 @@ def _prepare_sparse_dense_layers( if not dense: # create dense labels for the input to use in negative sampling self._tf_layers[f"sparse_to_dense_ids.{name}"] = layers.DenseForSparse( - units=2, trainable=False, name=f"sparse_to_dense_ids.{name}" + units=2, + use_bias=False, + trainable=False, + name=f"sparse_to_dense_ids.{name}", ) + def _prepare_input_layers(self, name: Text) -> None: + self._prepare_ffnn_layer( + name, self.config[HIDDEN_LAYERS_SIZES][name], self.config[DROP_RATE] + ) + + for feature_type in [SENTENCE, SEQUENCE]: + if ( + name not in self.data_signature + or feature_type not in self.data_signature[name] + ): + continue + + self._prepare_sparse_dense_dropout_layers( + f"{name}_{feature_type}", self.config[DROP_RATE] + ) + self._prepare_sparse_dense_layers( + self.data_signature[name][feature_type], + f"{name}_{feature_type}", + self.config[DENSE_DIMENSION][name], + ) + self._prepare_ffnn_layer( + f"{name}_{feature_type}", + [self.config[CONCAT_DIMENSION][name]], + self.config[DROP_RATE], + prefix="concat_layer", + ) + + def _prepare_sequence_layers(self, name: Text) -> None: + self._prepare_input_layers(name) + + size = self.config[TRANSFORMER_SIZE] + if isinstance(size, dict): + size = size[name] + + num_layers = self.config[NUM_TRANSFORMER_LAYERS] + if isinstance(num_layers, dict): + num_layers = num_layers[name] + + self._prepare_transformer_layer( + name, + num_layers, + size, + self.config[DROP_RATE], + self.config[DROP_RATE_ATTENTION], + ) + + def _prepare_entity_recognition_layers(self) -> None: + for tag_spec in self._entity_tag_specs: + name = tag_spec.tag_name + num_tags = tag_spec.num_tags + self._tf_layers[f"embed.{name}.logits"] = layers.Embed( + num_tags, self.config[REGULARIZATION_CONSTANT], 
f"logits.{name}" + ) + self._tf_layers[f"crf.{name}"] = layers.CRF( + num_tags, self.config[REGULARIZATION_CONSTANT], self.config[SCALE_LOSS] + ) + self._tf_layers[f"embed.{name}.tags"] = layers.Embed( + self.config[EMBEDDING_DIMENSION], + self.config[REGULARIZATION_CONSTANT], + f"tags.{name}", + ) + def _combine_sparse_dense_features( self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], @@ -801,7 +898,6 @@ def _combine_sparse_dense_features( sparse_dropout: bool = False, dense_dropout: bool = False, ) -> Optional[tf.Tensor]: - if not features: return None @@ -832,6 +928,150 @@ def _combine_sparse_dense_features( return tf.concat(dense_features, axis=-1) * mask + def _combine_sequence_sentence_features( + self, + sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], + sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], + mask_sequence: tf.Tensor, + mask_text: tf.Tensor, + name: Text, + sparse_dropout: bool = False, + dense_dropout: bool = False, + ) -> tf.Tensor: + sequence_x = self._combine_sparse_dense_features( + sequence_features, + f"{name}_{SEQUENCE}", + mask_sequence, + sparse_dropout, + dense_dropout, + ) + sentence_x = self._combine_sparse_dense_features( + sentence_features, f"{name}_{SENTENCE}", None, sparse_dropout, dense_dropout + ) + + if sequence_x is not None and sentence_x is None: + return sequence_x + + if sequence_x is None and sentence_x is not None: + return sentence_x + + if sequence_x is not None and sentence_x is not None: + return self._concat_sequence_sentence_features( + sequence_x, sentence_x, name, mask_text + ) + + raise ValueError( + "No features are present. Please check your configuration file." + ) + + def _concat_sequence_sentence_features( + self, + sequence_x: tf.Tensor, + sentence_x: tf.Tensor, + name: Text, + mask_text: tf.Tensor, + ): + if sequence_x.shape[-1] != sentence_x.shape[-1]: + sequence_x = self._tf_layers[f"concat_layer.{name}_{SEQUENCE}"]( + sequence_x, self._training + ) + sentence_x = self._tf_layers[f"concat_layer.{name}_{SENTENCE}"]( + sentence_x, self._training + ) + + # we need to concatenate the sequence features with the sentence features + # we cannot use tf.concat as the sequence features are padded + + # (1) get position of sentence features in mask + last = mask_text * tf.math.cumprod( + 1 - mask_text, axis=1, exclusive=True, reverse=True + ) + # (2) multiply by sentence features so that we get a matrix of + # batch-dim x seq-dim x feature-dim with zeros everywhere except for + # for the sentence features + sentence_x = last * sentence_x + + # (3) add a zero to the end of sequence matrix to match the final shape + sequence_x = tf.pad(sequence_x, [[0, 0], [0, 1], [0, 0]]) + + # (4) sum up sequence features and sentence features + return sequence_x + sentence_x + + def _features_as_seq_ids( + self, features: List[Union[np.ndarray, tf.Tensor, tf.SparseTensor]], name: Text + ) -> Optional[tf.Tensor]: + """Creates dense labels for negative sampling.""" + # if there are dense features - we can use them + for f in features: + if not isinstance(f, tf.SparseTensor): + seq_ids = tf.stop_gradient(f) + # add a zero to the seq dimension for the sentence features + seq_ids = tf.pad(seq_ids, [[0, 0], [0, 1], [0, 0]]) + return seq_ids + + # use additional sparse to dense layer + for f in features: + if isinstance(f, tf.SparseTensor): + seq_ids = tf.stop_gradient( + self._tf_layers[f"sparse_to_dense_ids.{name}"](f) + ) + # add a zero to the seq dimension for the sentence features + seq_ids = tf.pad(seq_ids, [[0, 
0], [0, 1], [0, 0]]) + return seq_ids + + return None + + def _create_sequence( + self, + sequence_features: List[Union[tf.Tensor, tf.SparseTensor]], + sentence_features: List[Union[tf.Tensor, tf.SparseTensor]], + mask_sequence: tf.Tensor, + mask: tf.Tensor, + name: Text, + sparse_dropout: bool = False, + dense_dropout: bool = False, + masked_lm_loss: bool = False, + sequence_ids: bool = False, + ) -> Tuple[tf.Tensor, tf.Tensor, Optional[tf.Tensor], Optional[tf.Tensor]]: + if sequence_ids: + seq_ids = self._features_as_seq_ids(sequence_features, f"{name}_{SEQUENCE}") + else: + seq_ids = None + + inputs = self._combine_sequence_sentence_features( + sequence_features, + sentence_features, + mask_sequence, + mask, + name, + sparse_dropout, + dense_dropout, + ) + inputs = self._tf_layers[f"ffnn.{name}"](inputs, self._training) + + if masked_lm_loss: + transformer_inputs, lm_mask_bool = self._tf_layers[f"{name}_input_mask"]( + inputs, mask, self._training + ) + else: + transformer_inputs = inputs + lm_mask_bool = None + + outputs = self._tf_layers[f"transformer.{name}"]( + transformer_inputs, 1 - mask, self._training + ) + + if isinstance(self.config[NUM_TRANSFORMER_LAYERS], int): + num_layers = self.config[NUM_TRANSFORMER_LAYERS] + else: + num_layers = self.config[NUM_TRANSFORMER_LAYERS][name] + + if num_layers > 0: + # apply activation + outputs = tfa.activations.gelu(outputs) + + return outputs, inputs, seq_ids, lm_mask_bool + @staticmethod def _compute_mask(sequence_lengths: tf.Tensor) -> tf.Tensor: mask = tf.sequence_mask(sequence_lengths, dtype=tf.float32) @@ -859,12 +1099,78 @@ def _get_mask_for( sequence_lengths = tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32) return self._compute_mask(sequence_lengths) + @staticmethod + def _get_sequence_lengths( + tf_batch_data: Dict[Text, Dict[Text, List[tf.Tensor]]], + key: Text, + sub_key: Text, + batch_dim: int = 1, + ) -> tf.Tensor: + # sentence features have a sequence lengths of 1 + # if sequence features are present we add the sequence lengths of those + + sequence_lengths = tf.ones([batch_dim], dtype=tf.int32) + if key in tf_batch_data and sub_key in tf_batch_data[key]: + sequence_lengths += tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32) + + return tf.cast(tf_batch_data[key][sub_key][0], dtype=tf.int32) + 1 + + @staticmethod + def _get_batch_dim(attribute_data: Dict[Text, List[tf.Tensor]]) -> int: + if SEQUENCE in attribute_data: + return tf.shape(attribute_data[SEQUENCE][0])[0] + + return tf.shape(attribute_data[SENTENCE][0])[0] + + def _calculate_entity_loss( + self, + inputs: tf.Tensor, + tag_ids: tf.Tensor, + mask: tf.Tensor, + sequence_lengths: tf.Tensor, + tag_name: Text, + entity_tags: Optional[tf.Tensor] = None, + ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]: + + tag_ids = tf.cast(tag_ids[:, :, 0], tf.int32) + + if entity_tags is not None: + _tags = self._tf_layers[f"embed.{tag_name}.tags"](entity_tags) + inputs = tf.concat([inputs, _tags], axis=-1) + + logits = self._tf_layers[f"embed.{tag_name}.logits"](inputs) + + # should call first to build weights + pred_ids, _ = self._tf_layers[f"crf.{tag_name}"](logits, sequence_lengths) + loss = self._tf_layers[f"crf.{tag_name}"].loss( + logits, tag_ids, sequence_lengths + ) + f1 = self._tf_layers[f"crf.{tag_name}"].f1_score(tag_ids, pred_ids, mask) + + return loss, f1, logits + def batch_loss( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> tf.Tensor: + """Calculates the loss for the given batch. + + Args: + batch_in: The batch. 
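The `(1)` step in `_concat_sequence_sentence_features` above is worth a standalone look: multiplying the mask by the reversed, exclusive cumulative product of its complement leaves a one only at the last real (non-padded) position of every sequence, which is exactly where the sentence features belong. A small sketch (2D for readability; the real mask carries a trailing feature axis):

```python
import tensorflow as tf

# mask for one sequence: 3 real steps followed by 2 padding steps
mask = tf.constant([[1.0, 1.0, 1.0, 0.0, 0.0]])

last = mask * tf.math.cumprod(1 - mask, axis=1, exclusive=True, reverse=True)
print(last.numpy())  # [[0. 0. 1. 0. 0.]] -- marks the last real position
```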
+ + Returns: + The loss of the given batch. + """ raise NotImplementedError def batch_predict( self, batch_in: Union[Tuple[tf.Tensor], Tuple[np.ndarray]] ) -> Dict[Text, tf.Tensor]: + """Predicts the output of the given batch. + + Args: + batch_in: The batch. + + Returns: + The output to predict. + """ raise NotImplementedError diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 0b0dd30b4a6a..fb9ea1faf6ed 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1,12 +1,12 @@ -from typing import Optional, Text, Dict, Any, Union, List, Tuple - +from typing import Optional, Text, Dict, Any, Union, List, Tuple, TYPE_CHECKING +import copy import numpy as np import rasa.shared.utils.common import rasa.shared.utils.io +import rasa.nlu.utils.bilou_utils from rasa.shared.constants import NEXT_MAJOR_VERSION_FOR_DEPRECATIONS from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS -from rasa.nlu.tokenizers.tokenizer import Token import rasa.utils.io as io_utils from rasa.utils.tensorflow.constants import ( LOSS_TYPE, @@ -19,14 +19,24 @@ AUTO, INNER, COSINE, + TRANSFORMER_SIZE, + NUM_TRANSFORMER_LAYERS, + DENSE_DIMENSION, ) +from rasa.shared.nlu.constants import ACTION_NAME, INTENT, ENTITIES +from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS +from rasa.core.constants import DIALOGUE + +if TYPE_CHECKING: + from rasa.nlu.classifiers.diet_classifier import EntityTagSpec + from rasa.nlu.tokenizers.tokenizer import Token def normalize(values: np.ndarray, ranking_length: Optional[int] = 0) -> np.ndarray: """Normalizes an array of positive numbers over the top `ranking_length` values. + Other values will be set to 0. """ - new_values = values.copy() # prevent mutation of the input if 0 < ranking_length < len(new_values): ranked = sorted(new_values, reverse=True) @@ -57,7 +67,7 @@ def update_similarity_type(config: Dict[Text, Any]) -> Dict[Text, Any]: def align_token_features( - list_of_tokens: List[List[Token]], + list_of_tokens: List[List["Token"]], in_token_features: np.ndarray, shape: Optional[Tuple] = None, ) -> np.ndarray: @@ -151,38 +161,177 @@ def _replace_deprecated_option( config: Dict[Text, Any], warn_until_version: Text = NEXT_MAJOR_VERSION_FOR_DEPRECATIONS, ) -> Dict[Text, Any]: - if old_option in config: - if isinstance(new_option, str): - rasa.shared.utils.io.raise_deprecation_warning( - f"Option '{old_option}' got renamed to '{new_option}'. " - f"Please update your configuration file.", - warn_until_version=warn_until_version, - ) - config[new_option] = config[old_option] - else: - rasa.shared.utils.io.raise_deprecation_warning( - f"Option '{old_option}' got renamed to " - f"a dictionary '{new_option[0]}' with a key '{new_option[1]}'. " - f"Please update your configuration file.", - warn_until_version=warn_until_version, - ) - option_dict = config.get(new_option[0], {}) - option_dict[new_option[1]] = config[old_option] - config[new_option[0]] = option_dict + if old_option not in config: + return {} + + if isinstance(new_option, str): + rasa.shared.utils.io.raise_deprecation_warning( + f"Option '{old_option}' got renamed to '{new_option}'. " + f"Please update your configuration file.", + warn_until_version=warn_until_version, + ) + return {new_option: config[old_option]} - return config + rasa.shared.utils.io.raise_deprecation_warning( + f"Option '{old_option}' got renamed to " + f"a dictionary '{new_option[0]}' with a key '{new_option[1]}'. 
" + f"Please update your configuration file.", + warn_until_version=warn_until_version, + ) + return {new_option[0]: {new_option[1]: config[old_option]}} def check_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]: - """ + """Update the config according to changed config params. + If old model configuration parameters are present in the provided config, replace them with the new parameters and log a warning. + Args: config: model configuration Returns: updated model configuration """ + # note: call _replace_deprecated_option() here when there are options to deprecate + + return config + +def check_core_deprecated_options(config: Dict[Text, Any]) -> Dict[Text, Any]: + """Update the core config according to changed config params. + + If old model configuration parameters are present in the provided config, replace + them with the new parameters and log a warning. + + Args: + config: model configuration + + Returns: updated model configuration + """ # note: call _replace_deprecated_option() here when there are options to deprecate + new_config = {} + if isinstance(config.get(TRANSFORMER_SIZE), int): + new_config = override_defaults( + new_config, + _replace_deprecated_option( + TRANSFORMER_SIZE, [TRANSFORMER_SIZE, DIALOGUE], config + ), + ) + + if isinstance(config.get(NUM_TRANSFORMER_LAYERS), int): + new_config = override_defaults( + new_config, + _replace_deprecated_option( + NUM_TRANSFORMER_LAYERS, [NUM_TRANSFORMER_LAYERS, DIALOGUE], config + ), + ) + + if isinstance(config.get(DENSE_DIMENSION), int): + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, INTENT], config + ), + ) + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, ACTION_NAME], config + ), + ) + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, ENTITIES], config + ), + ) + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, SLOTS], config + ), + ) + new_config = override_defaults( + new_config, + _replace_deprecated_option( + DENSE_DIMENSION, [DENSE_DIMENSION, ACTIVE_LOOP], config + ), + ) + + config.update(new_config) + return config + + +def entity_label_to_tags( + model_predictions: Dict[Text, Any], + entity_tag_specs: List["EntityTagSpec"], + bilou_flag: bool = False, + prediction_index: int = 0, +) -> Tuple[Dict[Text, List[Text]], Dict[Text, List[float]]]: + """Convert the output predictions for entities to the actual entity tags. + + Args: + model_predictions: the output predictions using the entity tag indices + entity_tag_specs: the entity tag specifications + bilou_flag: if 'True', the BILOU tagging schema was used + prediction_index: the index in the batch of predictions + to use for entity extraction + + Returns: + A map of entity tag type, e.g. entity, role, group, to actual entity tags and + confidences. 
+ """ + predicted_tags = {} + confidence_values = {} + + for tag_spec in entity_tag_specs: + predictions = model_predictions[f"e_{tag_spec.tag_name}_ids"].numpy() + confidences = model_predictions[f"e_{tag_spec.tag_name}_scores"].numpy() + + if not np.any(predictions): + continue + + confidences = [float(c) for c in confidences[prediction_index]] + tags = [tag_spec.ids_to_tags[p] for p in predictions[prediction_index]] + + if bilou_flag: + ( + tags, + confidences, + ) = rasa.nlu.utils.bilou_utils.ensure_consistent_bilou_tagging( + tags, confidences + ) + + predicted_tags[tag_spec.tag_name] = tags + confidence_values[tag_spec.tag_name] = confidences + + return predicted_tags, confidence_values + + +def override_defaults( + defaults: Optional[Dict[Text, Any]], custom: Optional[Dict[Text, Any]] +) -> Dict[Text, Any]: + """Override default config with the given config. + + We cannot use `dict.update` method because configs contain nested dicts. + + Args: + defaults: default config + custom: user config containing new parameters + + Returns: + updated config + """ + if defaults: + config = copy.deepcopy(defaults) + else: + config = {} + + if custom: + for key in custom.keys(): + if isinstance(config.get(key), dict): + config[key].update(custom[key]) + else: + config[key] = custom[key] return config diff --git a/rasa/validator.py b/rasa/validator.py index d5be60b9536a..f1ac3fc8f66e 100644 --- a/rasa/validator.py +++ b/rasa/validator.py @@ -1,6 +1,6 @@ import logging from collections import defaultdict -from typing import Set, Text, Optional +from typing import Set, Text, Optional, Dict, Any import rasa.core.training.story_conflict import rasa.shared.nlu.constants @@ -17,6 +17,7 @@ from rasa.shared.core.training_data.structures import StoryGraph from rasa.shared.importers.importer import TrainingDataImporter from rasa.shared.nlu.training_data.training_data import TrainingData +from rasa.nlu.config import RasaNLUModelConfig import rasa.shared.utils.io logger = logging.getLogger(__name__) @@ -26,27 +27,37 @@ class Validator: """A class used to verify usage of intents and utterances.""" def __init__( - self, domain: Domain, intents: TrainingData, story_graph: StoryGraph + self, + domain: Domain, + intents: TrainingData, + story_graph: StoryGraph, + config: Optional[Dict[Text, Any]], ) -> None: - """Initializes the Validator object. """ + """Initializes the Validator object. + Args: + domain: The domain. + intents: Training data. + story_graph: The story graph. + config: The configuration. 
+ """ self.domain = domain self.intents = intents self.story_graph = story_graph + self.nlu_config = RasaNLUModelConfig(config) @classmethod async def from_importer(cls, importer: TrainingDataImporter) -> "Validator": """Create an instance from the domain, nlu and story files.""" - domain = await importer.get_domain() story_graph = await importer.get_stories() intents = await importer.get_nlu_data() + config = await importer.get_config() - return cls(domain, intents, story_graph) + return cls(domain, intents, story_graph, config) def verify_intents(self, ignore_warnings: bool = True) -> bool: """Compares list of intents in domain with intents in NLU training data.""" - everything_is_alright = True nlu_data_intents = {e.data["intent"] for e in self.intents.intent_examples} @@ -138,13 +149,12 @@ def _gather_utterance_actions(self) -> Set[Text]: return responses | { utterance for utterance in self.domain.templates.keys() - if utterance in self.domain.action_names + if utterance in self.domain.action_names_or_texts } def verify_utterances(self, ignore_warnings: bool = True) -> bool: """Compares list of utterances in actions with utterances in responses.""" - - actions = self.domain.action_names + actions = self.domain.action_names_or_texts utterance_templates = set(self.domain.templates) everything_is_alright = True @@ -236,7 +246,7 @@ def verify_story_structure( # Create a list of `StoryConflict` objects conflicts = rasa.core.training.story_conflict.find_story_conflicts( - trackers, self.domain, max_history + trackers, self.domain, max_history, self.nlu_config ) if not conflicts: diff --git a/tests/cli/test_rasa_data.py b/tests/cli/test_rasa_data.py index 35a04553f2eb..635bf2cfa4ae 100644 --- a/tests/cli/test_rasa_data.py +++ b/tests/cli/test_rasa_data.py @@ -108,7 +108,8 @@ def test_data_validate_help(run: Callable[..., RunResult]): output = run("data", "validate", "--help") help_text = """usage: rasa data validate [-h] [-v] [-vv] [--quiet] - [--max-history MAX_HISTORY] [--fail-on-warnings]""" + [--max-history MAX_HISTORY] [-c CONFIG] + [--fail-on-warnings] [-d DOMAIN] [--data DATA]""" lines = help_text.split("\n") # expected help text lines should appear somewhere in the output @@ -157,6 +158,7 @@ def test_validate_files_exit_early(): "domain": "data/test_domains/duplicate_intents.yml", "data": None, "max_history": None, + "config": None, } data.validate_files(namedtuple("Args", args.keys())(*args.values())) diff --git a/tests/conftest.py b/tests/conftest.py index a02c31f8803b..c1aa78f2a66c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,8 +2,6 @@ import copy import os import random - -import mock import pytest import sys import uuid @@ -19,6 +17,8 @@ from unittest.mock import Mock import rasa.shared.utils.io +from rasa.nlu.components import ComponentBuilder +from rasa.nlu.config import RasaNLUModelConfig from rasa import server from rasa.core import config from rasa.core.agent import Agent, load_agent @@ -33,15 +33,17 @@ import rasa.core.run from rasa.core.tracker_store import InMemoryTrackerStore, TrackerStore from rasa.model import get_model -from rasa.train import TrainingResult, train_async, _train_nlu_async +from rasa.train import train_async, _train_nlu_async from rasa.utils.common import TempDirectoryPath from tests.core.conftest import ( DEFAULT_DOMAIN_PATH_WITH_SLOTS, + DEFAULT_E2E_STORIES_FILE, DEFAULT_STACK_CONFIG, DEFAULT_STORIES_FILE, DOMAIN_WITH_CATEGORICAL_SLOT, END_TO_END_STORY_FILE, INCORRECT_NLU_DATA, + SIMPLE_STORIES_FILE, ) DEFAULT_CONFIG_PATH = 
"rasa/cli/default_config.yml" @@ -159,6 +161,16 @@ def default_stories_file() -> Text: return DEFAULT_STORIES_FILE +@pytest.fixture(scope="session") +def default_e2e_stories_file() -> Text: + return DEFAULT_E2E_STORIES_FILE + + +@pytest.fixture(scope="session") +def simple_stories_file() -> Text: + return SIMPLE_STORIES_FILE + + @pytest.fixture(scope="session") def default_stack_config() -> Text: return DEFAULT_STACK_CONFIG @@ -175,7 +187,7 @@ def incorrect_nlu_data() -> Text: @pytest.fixture(scope="session") -def end_to_end_story_file() -> Text: +def end_to_end_test_story_file() -> Text: return END_TO_END_STORY_FILE @@ -232,6 +244,22 @@ async def trained_rasa_model( return trained_stack_model_path +@pytest.fixture(scope="session") +async def trained_simple_rasa_model( + trained_async: Callable, + default_domain_path: Text, + default_nlu_data: Text, + simple_stories_file: Text, +) -> Text: + trained_stack_model_path = await trained_async( + domain=default_domain_path, + config=DEFAULT_STACK_CONFIG, + training_files=[default_nlu_data, simple_stories_file], + ) + + return trained_stack_model_path + + @pytest.fixture(scope="session") async def unpacked_trained_rasa_model( trained_rasa_model: Text, @@ -273,6 +301,21 @@ async def trained_nlu_model( return trained_nlu_model_path +@pytest.fixture(scope="session") +async def trained_e2e_model( + trained_async, + default_domain_path, + default_stack_config, + default_nlu_data, + default_e2e_stories_file, +) -> Text: + return await trained_async( + domain=default_domain_path, + config=default_stack_config, + training_files=[default_nlu_data, default_e2e_stories_file], + ) + + @pytest.fixture(scope="session") def moodbot_domain() -> Domain: domain_path = os.path.join("examples", "moodbot", "domain.yml") @@ -325,6 +368,22 @@ def project() -> Text: return directory +@pytest.fixture(scope="session") +def component_builder(): + return ComponentBuilder() + + +@pytest.fixture(scope="session") +def spacy_nlp(component_builder: ComponentBuilder, blank_config: RasaNLUModelConfig): + spacy_nlp_config = {"name": "SpacyNLP"} + return component_builder.create_component(spacy_nlp_config, blank_config).nlp + + +@pytest.fixture(scope="session") +def blank_config() -> RasaNLUModelConfig: + return RasaNLUModelConfig({"language": "en", "pipeline": []}) + + def write_endpoint_config_to_yaml( path: Path, data: Dict[Text, Any], endpoints_filename: Text = "endpoints.yml" ) -> Path: diff --git a/tests/core/actions/test_forms.py b/tests/core/actions/test_forms.py index 53efd2b06878..f00ebb3fd6b0 100644 --- a/tests/core/actions/test_forms.py +++ b/tests/core/actions/test_forms.py @@ -1,5 +1,5 @@ from typing import Dict, Text, List, Optional, Any -from unittest.mock import Mock, ANY +from unittest.mock import Mock import pytest from _pytest.monkeypatch import MonkeyPatch @@ -10,7 +10,7 @@ from rasa.shared.core.constants import ACTION_LISTEN_NAME, REQUESTED_SLOT from rasa.core.actions.forms import FormAction from rasa.core.channels import CollectingOutputChannel -from rasa.shared.core.domain import Domain, InvalidDomain +from rasa.shared.core.domain import Domain from rasa.shared.core.events import ( ActiveLoop, SlotSet, @@ -446,10 +446,12 @@ async def test_validate_slots_on_activation_with_other_action_after_user_utteran ] -def test_name_of_utterance(): - form_name = "another_form" +@pytest.mark.parametrize( + "utterance_name", ["utter_ask_my_form_num_people", "utter_ask_num_people"], +) +def test_name_of_utterance(utterance_name: Text): + form_name = "my_form" slot_name = 
"num_people" - full_utterance_name = f"utter_ask_{form_name}_{slot_name}" domain = f""" forms: @@ -457,22 +459,14 @@ def test_name_of_utterance(): {slot_name}: - type: from_text responses: - {full_utterance_name}: + {utterance_name}: - text: "How many people?" """ domain = Domain.from_yaml(domain) - action_server_url = "http:/my-action-server:5055/webhook" - - with aioresponses(): - action_server = EndpointConfig(action_server_url) - action = FormAction(form_name, action_server) + action = FormAction(form_name, None) - assert action._name_of_utterance(domain, slot_name) == full_utterance_name - assert ( - action._name_of_utterance(domain, "another_slot") - == "utter_ask_another_slot" - ) + assert action._name_of_utterance(domain, slot_name) == utterance_name def test_temporary_tracker(): @@ -1044,7 +1038,6 @@ def test_extract_other_slots_with_entity( @pytest.mark.parametrize( "domain, expected_action", [ - ({}, "utter_ask_sun"), ( { "actions": ["action_ask_my_form_sun", "action_ask_sun"], @@ -1076,18 +1069,49 @@ def test_extract_other_slots_with_entity( ], ) async def test_ask_for_slot( - domain: Dict, expected_action: Text, monkeypatch: MonkeyPatch + domain: Dict, + expected_action: Text, + monkeypatch: MonkeyPatch, + default_nlg: TemplatedNaturalLanguageGenerator, ): slot_name = "sun" action_from_name = Mock(return_value=action.ActionListen()) endpoint_config = Mock() - monkeypatch.setattr(action, action.action_from_name.__name__, action_from_name) + monkeypatch.setattr( + action, action.action_for_name_or_text.__name__, action_from_name + ) form = FormAction("my_form", endpoint_config) domain = Domain.from_dict(domain) await form._ask_for_slot( - domain, None, None, slot_name, DialogueStateTracker.from_events("dasd", []) + domain, + default_nlg, + CollectingOutputChannel(), + slot_name, + DialogueStateTracker.from_events("dasd", []), ) action_from_name.assert_called_once_with(expected_action, domain, endpoint_config) + + +async def test_ask_for_slot_if_not_utter_ask( + monkeypatch: MonkeyPatch, default_nlg: TemplatedNaturalLanguageGenerator +): + action_from_name = Mock(return_value=action.ActionListen()) + endpoint_config = Mock() + monkeypatch.setattr( + action, action.action_for_name_or_text.__name__, action_from_name + ) + + form = FormAction("my_form", endpoint_config) + events = await form._ask_for_slot( + Domain.empty(), + default_nlg, + CollectingOutputChannel(), + "some slot", + DialogueStateTracker.from_events("dasd", []), + ) + + assert not events + action_from_name.assert_not_called() diff --git a/tests/core/conftest.py b/tests/core/conftest.py index 8b4b3f0fb5e4..4582bca0daf2 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -28,7 +28,11 @@ DEFAULT_DOMAIN_PATH_WITH_MAPPING = "data/test_domains/default_with_mapping.yml" -DEFAULT_STORIES_FILE = "data/test_stories/stories_defaultdomain.md" +DEFAULT_STORIES_FILE = "data/test_yaml_stories/stories_defaultdomain.yml" + +DEFAULT_E2E_STORIES_FILE = "data/test_yaml_stories/stories_e2e.yml" + +SIMPLE_STORIES_FILE = "data/test_yaml_stories/stories_simple.yml" DEFAULT_STACK_CONFIG = "data/test_config/stack_config.yml" diff --git a/tests/core/featurizers/__init__.py b/tests/core/featurizers/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/core/test_featurizer.py b/tests/core/featurizers/test_single_state_featurizers.py similarity index 68% rename from tests/core/test_featurizer.py rename to tests/core/featurizers/test_single_state_featurizers.py index 634dafd4cfa3..f76a84bde98a 100644 
--- a/tests/core/test_featurizer.py
+++ b/tests/core/featurizers/test_single_state_featurizers.py
@@ -1,21 +1,36 @@
 from typing import Text
 
-from rasa.core.featurizers.tracker_featurizers import TrackerFeaturizer
-from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer
-from rasa.shared.core.domain import Domain
 import numpy as np
-from rasa.shared.nlu.constants import ACTION_TEXT, ACTION_NAME, ENTITIES, TEXT, INTENT
-from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS, ENTITY_LABEL_SEPARATOR
-from rasa.shared.nlu.interpreter import RegexInterpreter
+from rasa.shared.core.constants import ENTITY_LABEL_SEPARATOR
 import scipy.sparse
+import pytest
 
-def test_fail_to_load_non_existent_featurizer():
-    assert TrackerFeaturizer.load("non_existent_class") is None
+from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer
+from rasa.shared.core.domain import Domain
+from rasa.shared.nlu.constants import (
+    ACTION_TEXT,
+    ACTION_NAME,
+    ENTITIES,
+    TEXT,
+    INTENT,
+    FEATURE_TYPE_SEQUENCE,
+    FEATURE_TYPE_SENTENCE,
+    ENTITY_ATTRIBUTE_TYPE,
+    ENTITY_ATTRIBUTE_VALUE,
+    ENTITY_ATTRIBUTE_START,
+    ENTITY_ATTRIBUTE_END,
+    ENTITY_TAGS,
+)
+from rasa.shared.core.constants import ACTIVE_LOOP, SLOTS
+from rasa.shared.nlu.interpreter import RegexInterpreter
+from rasa.shared.core.slots import Slot
+from rasa.shared.nlu.training_data.features import Features
 
 def test_single_state_featurizer_without_interpreter_state_not_with_action_listen():
     """This test is for encoding a state without a trained interpreter.
-    action_name is not action_listen, so, INTENT, TEXT and ENTITIES should not be featurized
+    action_name is not action_listen, so INTENT, TEXT and ENTITIES should not be
+    featurized.
     """
     f = SingleStateFeaturizer()
     f._default_feature_states[INTENT] = {"a": 0, "b": 1}
@@ -32,6 +47,7 @@ def test_single_state_featurizer_without_interpreter_state_not_with_action_liste
         },
         interpreter=RegexInterpreter(),
     )
+
+    # user input is ignored as prev action is not action_listen
     assert list(encoded.keys()) == [ACTION_NAME, ACTIVE_LOOP, SLOTS]
     assert (
@@ -44,8 +60,7 @@ def test_single_state_featurizer_without_interpreter_state_with_action_listen():
-    """
-    This test are for encoding state without a trained interpreter.
+    """This test is for encoding a state without a trained interpreter.
 
     action_name is action_listen, so INTENT and ENTITIES should be featurized
     while text shouldn't because we don't have an interpreter.
""" @@ -64,6 +79,7 @@ def test_single_state_featurizer_without_interpreter_state_with_action_listen(): }, interpreter=RegexInterpreter(), ) + # we featurize all the features except for *_text ones because NLU wasn't trained assert list(encoded.keys()) == [INTENT, ACTION_NAME, ACTIVE_LOOP, SLOTS] assert (encoded[INTENT][0].features != scipy.sparse.coo_matrix([[1, 0]])).nnz == 0 @@ -82,7 +98,9 @@ def test_single_state_featurizer_without_interpreter_state_no_intent_no_action_n f._default_feature_states[ACTION_NAME] = {"c": 0, "d": 1, "action_listen": 2} f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2} f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3} - # check that no intent / action_name features are added when the interpreter isn't there and + + # check that no intent / action_name features are added when the interpreter + # isn't there and # intent / action_name not in input encoded = f.encode_state( { @@ -93,6 +111,7 @@ def test_single_state_featurizer_without_interpreter_state_no_intent_no_action_n }, interpreter=RegexInterpreter(), ) + assert list(encoded.keys()) == [ACTIVE_LOOP, SLOTS] assert ( encoded[ACTIVE_LOOP][0].features != scipy.sparse.coo_matrix([[0, 0, 0, 1]]) @@ -104,14 +123,41 @@ def test_single_state_featurizer_correctly_encodes_non_existing_value(): f = SingleStateFeaturizer() f._default_feature_states[INTENT] = {"a": 0, "b": 1} f._default_feature_states[ACTION_NAME] = {"c": 0, "d": 1} + encoded = f.encode_state( {"user": {"intent": "e"}, "prev_action": {"action_name": "action_listen"}}, interpreter=RegexInterpreter(), ) + assert list(encoded.keys()) == [INTENT, ACTION_NAME] assert (encoded[INTENT][0].features != scipy.sparse.coo_matrix([[0, 0]])).nnz == 0 +def test_single_state_featurizer_prepare_for_training(): + domain = Domain( + intents=["greet"], + entities=["name"], + slots=[Slot("name")], + templates={}, + forms=[], + action_names=["utter_greet", "action_check_weather"], + ) + + f = SingleStateFeaturizer() + f.prepare_for_training(domain, RegexInterpreter()) + + assert len(f._default_feature_states[INTENT]) > 1 + assert "greet" in f._default_feature_states[INTENT] + assert len(f._default_feature_states[ENTITIES]) == 1 + assert f._default_feature_states[ENTITIES]["name"] == 0 + assert len(f._default_feature_states[SLOTS]) == 1 + assert f._default_feature_states[SLOTS]["name_0"] == 0 + assert len(f._default_feature_states[ACTION_NAME]) > 2 + assert "utter_greet" in f._default_feature_states[ACTION_NAME] + assert "action_check_weather" in f._default_feature_states[ACTION_NAME] + assert len(f._default_feature_states[ACTIVE_LOOP]) == 0 + + def test_single_state_featurizer_creates_encoded_all_actions(): domain = Domain( intents=[], @@ -121,10 +167,12 @@ def test_single_state_featurizer_creates_encoded_all_actions(): forms={}, action_names=["a", "b", "c", "d"], ) + f = SingleStateFeaturizer() f.prepare_for_training(domain, RegexInterpreter()) encoded_actions = f.encode_all_actions(domain, RegexInterpreter()) - assert len(encoded_actions) == len(domain.action_names) + + assert len(encoded_actions) == len(domain.action_names_or_texts) assert all( [ ACTION_NAME in encoded_action and ACTION_TEXT not in encoded_action @@ -133,44 +181,48 @@ def test_single_state_featurizer_creates_encoded_all_actions(): ) +@pytest.mark.timeout(300) # these can take a longer time than the default timeout def test_single_state_featurizer_with_entity_roles_and_groups( unpacked_trained_moodbot_path: Text, ): from rasa.core.agent import Agent interpreter = 
Agent.load(unpacked_trained_moodbot_path).interpreter - + # TODO roles and groups are not supported in e2e yet + domain = Domain( + intents=[], + entities=["city", f"city{ENTITY_LABEL_SEPARATOR}to"], + slots=[], + templates={}, + forms={}, + action_names=[], + ) f = SingleStateFeaturizer() - f._default_feature_states[INTENT] = {"a": 0, "b": 1} - f._default_feature_states[ENTITIES] = { - "c": 0, - "d": 1, - f"d{ENTITY_LABEL_SEPARATOR}e": 2, - } - f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1, "action_listen": 2} - f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2} - f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3} - encoded = f.encode_state( + f.prepare_for_training(domain, RegexInterpreter()) + encoded = f.encode_entities( { - "user": { - "text": "a ball", - "intent": "b", - "entities": ["c", f"d{ENTITY_LABEL_SEPARATOR}e"], - }, - "prev_action": { - "action_name": "action_listen", - "action_text": "throw a ball", - }, - "active_loop": {"name": "k"}, - "slots": {"e": (1.0,)}, + TEXT: "I am flying from London to Paris", + ENTITIES: [ + { + ENTITY_ATTRIBUTE_TYPE: "city", + ENTITY_ATTRIBUTE_VALUE: "London", + ENTITY_ATTRIBUTE_START: 17, + ENTITY_ATTRIBUTE_END: 23, + }, + { + ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to", + ENTITY_ATTRIBUTE_VALUE: "Paris", + ENTITY_ATTRIBUTE_START: 27, + ENTITY_ATTRIBUTE_END: 32, + }, + ], }, interpreter=interpreter, ) - # check all the features are encoded and *_text features are encoded by a densefeaturizer - assert sorted(list(encoded.keys())) == sorted( - [TEXT, ENTITIES, ACTION_NAME, SLOTS, ACTIVE_LOOP, INTENT, ACTION_TEXT] + assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS]) + assert np.all( + encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]] ) - assert np.all(encoded[ENTITIES][0].features.toarray() == [1, 0, 1]) def test_single_state_featurizer_uses_dtype_float(): @@ -178,6 +230,7 @@ def test_single_state_featurizer_uses_dtype_float(): f._default_feature_states[INTENT] = {"a": 0, "b": 1} f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1} f._default_feature_states[ENTITIES] = {"c": 0} + encoded = f.encode_state( { "user": {"intent": "a", "entities": ["c"]}, @@ -185,9 +238,11 @@ def test_single_state_featurizer_uses_dtype_float(): }, interpreter=RegexInterpreter(), ) + assert encoded[ACTION_NAME][0].features.dtype == np.float32 +@pytest.mark.timeout(300) # these can take a longer time than the default timeout def test_single_state_featurizer_with_interpreter_state_with_action_listen( unpacked_trained_moodbot_path: Text, ): @@ -196,24 +251,45 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen( interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter f = SingleStateFeaturizer() - f._default_feature_states[INTENT] = {"a": 0, "b": 1} - f._default_feature_states[ENTITIES] = {"c": 0} - f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1, "action_listen": 2} - f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2} - f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3} + f._default_feature_states[INTENT] = {"greet": 0, "inform": 1} + f._default_feature_states[ENTITIES] = { + "city": 0, + "name": 1, + f"city{ENTITY_LABEL_SEPARATOR}to": 2, + f"city{ENTITY_LABEL_SEPARATOR}from": 3, + } + f._default_feature_states[ACTION_NAME] = { + "utter_ask_where_to": 0, + "utter_greet": 1, + "action_listen": 2, + } + # `_0` in slots represent feature dimension + f._default_feature_states[SLOTS] = 
{"slot_1_0": 0, "slot_2_0": 1, "slot_3_0": 2} + f._default_feature_states[ACTIVE_LOOP] = { + "active_loop_1": 0, + "active_loop_2": 1, + "active_loop_3": 2, + "active_loop_4": 3, + } encoded = f.encode_state( { - "user": {"text": "a ball", "intent": "b", "entities": ["c"]}, + "user": { + "text": "I am flying from London to Paris", + "intent": "inform", + "entities": ["city", f"city{ENTITY_LABEL_SEPARATOR}to"], + }, "prev_action": { "action_name": "action_listen", "action_text": "throw a ball", }, - "active_loop": {"name": "k"}, - "slots": {"e": (1.0,)}, + "active_loop": {"name": "active_loop_4"}, + "slots": {"slot_1": (1.0,)}, }, interpreter=interpreter, ) - # check all the features are encoded and *_text features are encoded by a densefeaturizer + + # check all the features are encoded and *_text features are encoded by a + # dense featurizer assert sorted(list(encoded.keys())) == sorted( [TEXT, ENTITIES, ACTION_NAME, SLOTS, ACTIVE_LOOP, INTENT, ACTION_TEXT] ) @@ -223,13 +299,14 @@ def test_single_state_featurizer_with_interpreter_state_with_action_listen( assert ( encoded[ACTION_NAME][0].features != scipy.sparse.coo_matrix([[0, 0, 1]]) ).nnz == 0 - assert encoded[ENTITIES][0].features.shape[-1] == 1 + assert encoded[ENTITIES][0].features.shape[-1] == 4 assert (encoded[SLOTS][0].features != scipy.sparse.coo_matrix([[1, 0, 0]])).nnz == 0 assert ( encoded[ACTIVE_LOOP][0].features != scipy.sparse.coo_matrix([[0, 0, 0, 1]]) ).nnz == 0 +@pytest.mark.timeout(300) # these can take a longer time than the default timeout def test_single_state_featurizer_with_interpreter_state_not_with_action_listen( unpacked_trained_moodbot_path: Text, ): @@ -243,6 +320,7 @@ def test_single_state_featurizer_with_interpreter_state_not_with_action_listen( f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1, "action_listen": 2} f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2} f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3} + encoded = f.encode_state( { "user": {"text": "a ball", "intent": "b", "entities": ["c"]}, @@ -252,6 +330,7 @@ def test_single_state_featurizer_with_interpreter_state_not_with_action_listen( }, interpreter=interpreter, ) + # check user input is ignored when action is not action_listen assert list(encoded.keys()) == [ACTION_TEXT, ACTION_NAME, ACTIVE_LOOP, SLOTS] assert encoded[ACTION_TEXT][0].features.shape[-1] == 300 @@ -264,6 +343,7 @@ def test_single_state_featurizer_with_interpreter_state_not_with_action_listen( ).nnz == 0 +@pytest.mark.timeout(300) # these can take a longer time than the default timeout def test_single_state_featurizer_with_interpreter_state_with_no_action_name( unpacked_trained_moodbot_path: Text, ): @@ -274,12 +354,14 @@ def test_single_state_featurizer_with_interpreter_state_with_no_action_name( from rasa.core.agent import Agent interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter + f = SingleStateFeaturizer() f._default_feature_states[INTENT] = {"a": 0, "b": 1} f._default_feature_states[ENTITIES] = {"c": 0} f._default_feature_states[ACTION_NAME] = {"e": 0, "d": 1, "action_listen": 2} f._default_feature_states[SLOTS] = {"e_0": 0, "f_0": 1, "g_0": 2} f._default_feature_states[ACTIVE_LOOP] = {"h": 0, "i": 1, "j": 2, "k": 3} + encoded = f.encode_state( { "user": {"text": "a ball", "intent": "b", "entities": ["c"]}, @@ -289,6 +371,7 @@ def test_single_state_featurizer_with_interpreter_state_with_no_action_name( }, interpreter=interpreter, ) + assert list(encoded.keys()) == [ACTION_TEXT, ACTIVE_LOOP, SLOTS] assert 
encoded[ACTION_TEXT][0].features.shape[-1] == 300 assert (encoded[SLOTS][0].features != scipy.sparse.coo_matrix([[1, 0, 0]])).nnz == 0 @@ -297,6 +380,33 @@ def test_single_state_featurizer_with_interpreter_state_with_no_action_name( ).nnz == 0 +def test_state_features_for_attribute_raises_on_not_supported_attribute(): + f = SingleStateFeaturizer() + + with pytest.raises(ValueError): + f._state_features_for_attribute({}, "not-supported-attribute") + + +def test_to_sparse_sentence_features(): + features = [ + Features( + scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), + FEATURE_TYPE_SEQUENCE, + TEXT, + "some-featurizer", + ) + ] + + sentence_features = SingleStateFeaturizer._to_sparse_sentence_features(features) + + assert len(sentence_features) == 1 + assert FEATURE_TYPE_SENTENCE == sentence_features[0].type + assert features[0].origin == sentence_features[0].origin + assert features[0].attribute == sentence_features[0].attribute + assert sentence_features[0].features.shape == (1, 10) + + +@pytest.mark.timeout(300) # these can take a longer time than the default timeout def test_single_state_featurizer_uses_regex_interpreter( unpacked_trained_moodbot_path: Text, ): diff --git a/tests/core/featurizers/test_tracker_featurizer.py b/tests/core/featurizers/test_tracker_featurizer.py new file mode 100644 index 000000000000..f3f083a6a158 --- /dev/null +++ b/tests/core/featurizers/test_tracker_featurizer.py @@ -0,0 +1,98 @@ +from typing import Text + +import numpy as np +import pytest + +from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer +from rasa.core.featurizers.tracker_featurizers import ( + TrackerFeaturizer, + FullDialogueTrackerFeaturizer, + MaxHistoryTrackerFeaturizer, +) +from rasa.shared.core.domain import Domain +from rasa.shared.nlu.interpreter import RegexInterpreter +from tests.core.conftest import DEFAULT_DOMAIN_PATH_WITH_SLOTS +from tests.core.utilities import tracker_from_dialogue_file + + +def test_fail_to_load_non_existent_featurizer(): + assert TrackerFeaturizer.load("non_existent_class") is None + + +def test_persist_and_load_tracker_featurizer(tmp_path: Text, moodbot_domain: Domain): + state_featurizer = SingleStateFeaturizer() + state_featurizer.prepare_for_training(moodbot_domain, RegexInterpreter()) + tracker_featurizer = MaxHistoryTrackerFeaturizer(state_featurizer) + + tracker_featurizer.persist(tmp_path) + + loaded_tracker_featurizer = TrackerFeaturizer.load(tmp_path) + + assert loaded_tracker_featurizer is not None + assert loaded_tracker_featurizer.state_featurizer is not None + + +def test_convert_labels_to_ids(): + trackers_as_actions = [ + ["utter_greet", "utter_channel"], + ["utter_greet", "utter_default", "utter_goodbye"], + ] + + tracker_featurizer = TrackerFeaturizer() + domain = Domain.load(DEFAULT_DOMAIN_PATH_WITH_SLOTS) + + actual_output = tracker_featurizer._convert_labels_to_ids( + trackers_as_actions, domain + ) + expected_output = np.array([np.array([14, 11]), np.array([14, 12, 13])]) + + assert expected_output.size == actual_output.size + for expected_array, actual_array in zip(expected_output, actual_output): + assert np.all(expected_array == actual_array) + + +def test_featurize_trackers_raises_on_missing_state_featurizer(default_domain: Domain): + tracker_featurizer = TrackerFeaturizer() + + with pytest.raises(ValueError): + tracker_featurizer.featurize_trackers([], default_domain, RegexInterpreter()) + + +def test_featurize_trackers_with_full_dialogue_tracker_featurizer( + moodbot_domain: Domain, +): + 
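`test_to_sparse_sentence_features` above fixes the contract: a `(tokens x dims)` sparse sequence matrix becomes a `(1 x dims)` sentence-level feature. One plausible reduction consistent with the asserted shape, summing over the token axis (hedged; not necessarily the exact library implementation):

```python
import numpy as np
import scipy.sparse

# Hypothetical sequence-level features: 5 tokens x 10 dimensions.
sequence = scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10)))

# Collapse the token axis into a single sentence-level row.
sentence = scipy.sparse.csr_matrix(sequence.sum(axis=0))
assert sentence.shape == (1, 10)
```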
state_featurizer = SingleStateFeaturizer() + tracker_featurizer = FullDialogueTrackerFeaturizer(state_featurizer) + + tracker = tracker_from_dialogue_file( + "data/test_dialogues/moodbot.json", moodbot_domain + ) + state_features, labels, entity_tags = tracker_featurizer.featurize_trackers( + [tracker], moodbot_domain, RegexInterpreter() + ) + + assert state_features is not None + assert len(state_features) == 1 + assert labels is not None + assert len(labels) == 1 + # moodbot doesn't contain e2e entities + assert not any([any(turn_tags) for turn_tags in entity_tags]) + + +def test_featurize_trackers_with_max_history_tracker_featurizer(moodbot_domain: Domain): + state_featurizer = SingleStateFeaturizer() + tracker_featurizer = MaxHistoryTrackerFeaturizer(state_featurizer) + + tracker = tracker_from_dialogue_file( + "data/test_dialogues/moodbot.json", moodbot_domain + ) + state_features, labels, entity_tags = tracker_featurizer.featurize_trackers( + [tracker], moodbot_domain, RegexInterpreter() + ) + + assert state_features is not None + assert len(state_features) == 7 + assert labels is not None + assert len(labels) == 7 + # moodbot doesn't contain e2e entities + assert not any([any(turn_tags) for turn_tags in entity_tags]) diff --git a/tests/core/policies/test_rule_policy.py b/tests/core/policies/test_rule_policy.py index 1074f4365c93..2add4ead31ac 100644 --- a/tests/core/policies/test_rule_policy.py +++ b/tests/core/policies/test_rule_policy.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List, Text +from typing import Text import pytest @@ -770,7 +770,7 @@ def assert_predicted_action( ) -> None: assert prediction.max_confidence == confidence index_of_predicted_action = prediction.max_confidence_index - prediction_action_name = domain.action_names[index_of_predicted_action] + prediction_action_name = domain.action_names_or_texts[index_of_predicted_action] assert prediction_action_name == expected_action_name @@ -816,6 +816,59 @@ async def test_predict_form_action_if_in_form(): assert_predicted_action(prediction, domain, form_name) +async def test_predict_loop_action_if_in_loop_but_there_is_e2e_rule(): + loop_name = "some_loop" + + domain = Domain.from_yaml( + f""" + intents: + - {GREET_INTENT_NAME} + actions: + - {UTTER_GREET_ACTION} + - some-action + slots: + {REQUESTED_SLOT}: + type: unfeaturized + forms: + - {loop_name} +""" + ) + e2e_rule = TrackerWithCachedStates.from_events( + "bla", + domain=domain, + evts=[ + ActionExecuted(RULE_SNIPPET_ACTION_NAME), + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(text="haha"), + ActionExecuted(UTTER_GREET_ACTION), + ActionExecuted(ACTION_LISTEN_NAME), + ], + is_rule_tracker=True, + ) + policy = RulePolicy() + policy.train([e2e_rule], domain, RegexInterpreter()) + + loop_conversation = DialogueStateTracker.from_events( + "in a loop", + evts=[ + # We are in an activate form + ActionExecuted(loop_name), + ActiveLoop(loop_name), + SlotSet(REQUESTED_SLOT, "some value"), + ActionExecuted(ACTION_LISTEN_NAME), + # User sends message as response to a requested slot + UserUttered("haha", {"name": GREET_INTENT_NAME}), + ], + slots=domain.slots, + ) + + # RulePolicy triggers form again + prediction = policy.predict_action_probabilities( + loop_conversation, domain, RegexInterpreter() + ) + assert_predicted_action(prediction, domain, loop_name) + + async def test_predict_form_action_if_multiple_turns(): form_name = "some_form" other_intent = "bye" @@ -1000,7 +1053,6 @@ async def test_form_unhappy_path(): prediction = 
policy.predict_action_probabilities( unhappy_form_conversation, domain, RegexInterpreter() ) - assert_predicted_action(prediction, domain, UTTER_GREET_ACTION) @@ -1449,7 +1501,6 @@ async def test_form_activation_rule(): domain, RegexInterpreter(), ) - assert_predicted_action(prediction, domain, form_name) @@ -1761,6 +1812,50 @@ def test_default_actions(intent_name: Text, expected_action_name: Text): assert_predicted_action(prediction, domain, expected_action_name) +@pytest.mark.parametrize( + "intent_name", [USER_INTENT_RESTART, USER_INTENT_BACK, USER_INTENT_SESSION_START] +) +def test_e2e_beats_default_actions(intent_name: Text): + domain = Domain.from_yaml( + f""" +intents: +- {GREET_INTENT_NAME} +actions: +- {UTTER_GREET_ACTION} + """ + ) + + e2e_rule = TrackerWithCachedStates.from_events( + "bla", + domain=domain, + evts=[ + ActionExecuted(RULE_SNIPPET_ACTION_NAME), + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(text="haha"), + ActionExecuted(UTTER_GREET_ACTION), + ActionExecuted(ACTION_LISTEN_NAME), + ], + is_rule_tracker=True, + ) + + policy = RulePolicy() + policy.train([e2e_rule], domain, RegexInterpreter()) + + new_conversation = DialogueStateTracker.from_events( + "bla2", + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered("haha", {"name": GREET_INTENT_NAME}), + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered("haha", {"name": intent_name}), + ], + ) + prediction = policy.predict_action_probabilities( + new_conversation, domain, RegexInterpreter() + ) + assert_predicted_action(prediction, domain, UTTER_GREET_ACTION) + + @pytest.mark.parametrize( "rule_policy, expected_confidence, expected_prediction", [ diff --git a/tests/core/test_actions.py b/tests/core/test_actions.py index 40c429057240..8747c5fb036a 100644 --- a/tests/core/test_actions.py +++ b/tests/core/test_actions.py @@ -1,3 +1,4 @@ +import textwrap from typing import List, Text import pytest @@ -11,16 +12,22 @@ ActionDefaultAskRephrase, ActionDefaultFallback, ActionExecutionRejection, - ActionListen, ActionRestart, ActionUtterTemplate, ActionRetrieveResponse, RemoteAction, ActionSessionStart, + ActionEndToEndResponse, ) from rasa.core.actions.forms import FormAction from rasa.core.channels import CollectingOutputChannel -from rasa.shared.core.domain import ActionNotFoundException, SessionConfig, Domain +from rasa.shared.constants import UTTER_PREFIX +from rasa.shared.core.domain import ( + ActionNotFoundException, + SessionConfig, + Domain, + KEY_E2E_ACTIONS, +) from rasa.shared.core.events import ( Restarted, SlotSet, @@ -56,7 +63,7 @@ @pytest.fixture(scope="module") -def template_nlg(): +def template_nlg() -> TemplatedNaturalLanguageGenerator: templates = { "utter_ask_rephrase": [{"text": "can you rephrase that?"}], "utter_restart": [{"text": "congrats, you've restarted me!"}], @@ -76,20 +83,9 @@ def template_nlg(): @pytest.fixture(scope="module") -def template_sender_tracker(_default_domain: Domain): - return DialogueStateTracker("template-sender", _default_domain.slots) - - -def test_text_format(): - assert "{}".format(ActionListen()) == "Action('action_listen')" - assert ( - "{}".format(ActionUtterTemplate("my_action_name")) - == "ActionUtterTemplate('my_action_name')" - ) - assert ( - "{}".format(ActionRetrieveResponse("utter_test")) - == "ActionRetrieveResponse('utter_test')" - ) +def template_sender_tracker(default_domain_path: Text): + domain = Domain.load(default_domain_path) + return DialogueStateTracker("template-sender", domain.slots) def test_domain_action_instantiation(): @@ -103,8 +99,8 @@ 
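The hunk that follows switches the tests from `action_for_name` to the renamed `action_for_name_or_text`, which resolves an action either by its name or by the literal text of an end-to-end bot response. A hedged sketch of the new lookup, using a minimal made-up domain (assuming `KEY_E2E_ACTIONS` resolves to the `e2e_actions` YAML key):

```python
from rasa.core.actions import action
from rasa.shared.core.domain import Domain

domain = Domain.from_yaml(
    """
    actions:
    - my_action
    e2e_actions:  # assumption: the literal value behind KEY_E2E_ACTIONS
    - Bye Bye
    """
)

named = action.action_for_name_or_text("my_action", domain, None)
end_to_end = action.action_for_name_or_text("Bye Bye", domain, None)
print(named.name(), "|", end_to_end.name())  # my_action | Bye Bye
```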
def test_domain_action_instantiation(): ) instantiated_actions = [ - action.action_for_name(action_name, domain, None) - for action_name in domain.action_names + action.action_for_name_or_text(action_name, domain, None) + for action_name in domain.action_names_or_texts ] assert len(instantiated_actions) == 14 @@ -708,16 +704,18 @@ async def test_action_default_ask_rephrase( def test_get_form_action(slot_mapping: Text): form_action_name = "my_business_logic" domain = Domain.from_yaml( - f""" + textwrap.dedent( + f""" actions: - my_action forms: {form_action_name}: {slot_mapping} """ + ) ) - actual = action.action_for_name(form_action_name, domain, None) + actual = action.action_for_name_or_text(form_action_name, domain, None) assert isinstance(actual, FormAction) @@ -725,42 +723,109 @@ def test_get_form_action_with_rasa_open_source_1_forms(): form_action_name = "my_business_logic" with pytest.warns(FutureWarning): domain = Domain.from_yaml( - f""" + textwrap.dedent( + f""" actions: - my_action forms: - {form_action_name} """ + ) ) - actual = action.action_for_name(form_action_name, domain, None) + actual = action.action_for_name_or_text(form_action_name, domain, None) assert isinstance(actual, RemoteAction) def test_overridden_form_action(): form_action_name = "my_business_logic" domain = Domain.from_yaml( - f""" + textwrap.dedent( + f""" actions: - my_action - {form_action_name} forms: {form_action_name}: """ + ) ) - actual = action.action_for_name(form_action_name, domain, None) + actual = action.action_for_name_or_text(form_action_name, domain, None) assert isinstance(actual, RemoteAction) def test_get_form_action_if_not_in_forms(): form_action_name = "my_business_logic" domain = Domain.from_yaml( - """ + textwrap.dedent( + """ actions: - my_action """ + ) ) with pytest.raises(ActionNotFoundException): - assert not action.action_for_name(form_action_name, domain, None) + assert not action.action_for_name_or_text(form_action_name, domain, None) + + +@pytest.mark.parametrize( + "end_to_end_utterance", ["Hi", f"{UTTER_PREFIX} is a dangerous start"] +) +def test_get_end_to_end_utterance_action(end_to_end_utterance: Text): + domain = Domain.from_yaml( + textwrap.dedent( + f""" + actions: + - my_action + {KEY_E2E_ACTIONS}: + - {end_to_end_utterance} + - Bye Bye +""" + ) + ) + + actual = action.action_for_name_or_text(end_to_end_utterance, domain, None) + + assert isinstance(actual, ActionEndToEndResponse) + assert actual.name() == end_to_end_utterance + + +async def test_run_end_to_end_utterance_action(): + end_to_end_utterance = "Hi" + + domain = Domain.from_yaml( + textwrap.dedent( + f""" + actions: + - my_action + {KEY_E2E_ACTIONS}: + - {end_to_end_utterance} + - Bye Bye +""" + ) + ) + + e2e_action = action.action_for_name_or_text("Hi", domain, None) + events = await e2e_action.run( + CollectingOutputChannel(), + TemplatedNaturalLanguageGenerator(domain.templates), + DialogueStateTracker.from_events("sender", evts=[]), + domain, + ) + + assert events == [ + BotUttered( + end_to_end_utterance, + { + "elements": None, + "quick_replies": None, + "buttons": None, + "attachment": None, + "image": None, + "custom": None, + }, + {}, + ) + ] diff --git a/tests/core/test_agent.py b/tests/core/test_agent.py index a26671c67232..14ccc455e0e8 100644 --- a/tests/core/test_agent.py +++ b/tests/core/test_agent.py @@ -79,7 +79,7 @@ async def test_agent_train(trained_moodbot_path: Text): loaded = Agent.load(trained_moodbot_path) # test domain - assert loaded.domain.action_names == moodbot_domain.action_names 
+ assert loaded.domain.action_names_or_texts == moodbot_domain.action_names_or_texts assert loaded.domain.intents == moodbot_domain.intents assert loaded.domain.entities == moodbot_domain.entities assert loaded.domain.templates == moodbot_domain.templates diff --git a/tests/core/test_ensemble.py b/tests/core/test_ensemble.py index 050ad361c45f..452bc1b29a7c 100644 --- a/tests/core/test_ensemble.py +++ b/tests/core/test_ensemble.py @@ -28,7 +28,7 @@ from tests.core import utilities from rasa.core.constants import FORM_POLICY_PRIORITY -from rasa.shared.core.events import ActionExecuted +from rasa.shared.core.events import ActionExecuted, DefinePrevUserUtteredFeaturization from rasa.core.policies.two_stage_fallback import TwoStageFallbackPolicy from rasa.core.policies.mapping_policy import MappingPolicy from rasa.shared.core.constants import ( @@ -579,6 +579,60 @@ def test_prediction_applies_optional_policy_events(default_domain: Domain): assert all(event in prediction.events for event in must_have_events) +def test_end_to_end_prediction_applies_define_featurization_events( + default_domain: Domain, +): + ensemble = SimplePolicyEnsemble( + [ + ConstantPolicy(priority=100, predict_index=0), + ConstantPolicy(priority=1, predict_index=1, is_end_to_end_prediction=True), + ] + ) + + # no events should be added if latest action is not action listen + tracker = DialogueStateTracker.from_events("test", evts=[]) + prediction = ensemble.probabilities_using_best_policy( + tracker, default_domain, RegexInterpreter() + ) + assert prediction.events == [] + + # DefinePrevUserUtteredFeaturization should be added after action listen + tracker = DialogueStateTracker.from_events( + "test", evts=[ActionExecuted(ACTION_LISTEN_NAME)] + ) + prediction = ensemble.probabilities_using_best_policy( + tracker, default_domain, RegexInterpreter() + ) + assert prediction.events == [DefinePrevUserUtteredFeaturization(True)] + + +def test_intent_prediction_does_not_apply_define_featurization_events( + default_domain: Domain, +): + ensemble = SimplePolicyEnsemble( + [ + ConstantPolicy(priority=100, predict_index=0), + ConstantPolicy(priority=1, predict_index=1, is_end_to_end_prediction=False), + ] + ) + + # no events should be added if latest action is not action listen + tracker = DialogueStateTracker.from_events("test", evts=[]) + prediction = ensemble.probabilities_using_best_policy( + tracker, default_domain, RegexInterpreter() + ) + assert prediction.events == [] + + # DefinePrevUserUtteredFeaturization should be added after action listen + tracker = DialogueStateTracker.from_events( + "test", evts=[ActionExecuted(ACTION_LISTEN_NAME)] + ) + prediction = ensemble.probabilities_using_best_policy( + tracker, default_domain, RegexInterpreter() + ) + assert prediction.events == [DefinePrevUserUtteredFeaturization(False)] + + def test_with_float_returning_policy(default_domain: Domain): expected_index = 3 diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 3aa109bd896d..997eb5228352 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -172,12 +172,13 @@ def test_prediction_on_empty_tracker( self, trained_policy: Policy, default_domain: Domain ): tracker = DialogueStateTracker(DEFAULT_SENDER_ID, default_domain.slots) - probabilities = trained_policy.predict_action_probabilities( + prediction = trained_policy.predict_action_probabilities( tracker, default_domain, RegexInterpreter() - ).probabilities - assert len(probabilities) == default_domain.num_actions - assert 
max(probabilities) <= 1.0 - assert min(probabilities) >= 0.0 + ) + assert not prediction.is_end_to_end_prediction + assert len(prediction.probabilities) == default_domain.num_actions + assert max(prediction.probabilities) <= 1.0 + assert min(prediction.probabilities) >= 0.0 @pytest.mark.filterwarnings( "ignore:.*without a trained model present.*:UserWarning" @@ -196,7 +197,7 @@ def _get_next_action(policy: Policy, events: List[Event], domain: Domain) -> Tex tracker, domain, RegexInterpreter() ).probabilities index = scores.index(max(scores)) - return domain.action_names[index] + return domain.action_names_or_texts[index] class TestSklearnPolicy(PolicyTestCollection): @@ -313,13 +314,14 @@ def test_missing_classes_filled_correctly( policy.train( new_trackers, domain=default_domain, interpreter=RegexInterpreter() ) - predicted_probabilities = policy.predict_action_probabilities( + prediction = policy.predict_action_probabilities( tracker, default_domain, RegexInterpreter() - ).probabilities + ) - assert len(predicted_probabilities) == default_domain.num_actions - assert np.allclose(sum(predicted_probabilities), 1.0) - for i, prob in enumerate(predicted_probabilities): + assert not prediction.is_end_to_end_prediction + assert len(prediction.probabilities) == default_domain.num_actions + assert np.allclose(sum(prediction.probabilities), 1.0) + for i, prob in enumerate(prediction.probabilities): if i in classes: assert prob >= 0.0 else: @@ -405,16 +407,17 @@ def test_normalization( monkeypatch: MonkeyPatch, ): # first check the output is what we expect - predicted_probabilities = trained_policy.predict_action_probabilities( + prediction = trained_policy.predict_action_probabilities( tracker, default_domain, RegexInterpreter() - ).probabilities + ) + assert not prediction.is_end_to_end_prediction # count number of non-zero confidences assert ( - sum([confidence > 0 for confidence in predicted_probabilities]) + sum([confidence > 0 for confidence in prediction.probabilities]) == trained_policy.config[RANKING_LENGTH] ) # check that the norm is still 1 - assert sum(predicted_probabilities) == pytest.approx(1) + assert sum(prediction.probabilities) == pytest.approx(1) # also check our function is called mock = Mock() @@ -428,69 +431,95 @@ def test_normalization( async def test_gen_batch(self, trained_policy: TEDPolicy, default_domain: Domain): training_trackers = await train_trackers(default_domain, augmentation_factor=0) interpreter = RegexInterpreter() - training_data, label_ids = trained_policy.featurize_for_training( + training_data, label_ids, entity_tags = trained_policy.featurize_for_training( training_trackers, default_domain, interpreter ) label_data, all_labels = trained_policy._create_label_data( default_domain, interpreter ) model_data = trained_policy._create_model_data( - training_data, label_ids, all_labels + training_data, label_ids, entity_tags, all_labels ) batch_size = 2 + + # model data keys were sorted, so the order is alphabetical ( - batch_label_ids, + batch_action_name_mask, + batch_action_name_sentence_indices, + batch_action_name_sentence_data, + batch_action_name_sentence_shape, + batch_dialogue_length, batch_entities_mask, - batch_entities_sentence_1, - batch_entities_sentence_2, - batch_entities_sentence_3, + batch_entities_sentence_indices, + batch_entities_sentence_data, + batch_entities_sentence_shape, batch_intent_mask, - batch_intent_sentence_1, - batch_intent_sentence_2, - batch_intent_sentence_3, + batch_intent_sentence_indices, + batch_intent_sentence_data, + 
batch_intent_sentence_shape,
+            batch_label_ids,
             batch_slots_mask,
-            batch_slots_sentence_1,
-            batch_slots_sentence_2,
-            batch_slots_sentence_3,
-            batch_action_name_mask,
-            batch_action_name_sentence_1,
-            batch_action_name_sentence_2,
-            batch_action_name_sentence_3,
-            batch_dialogue_length,
+            batch_slots_sentence_indices,
+            batch_slots_sentence_data,
+            batch_slots_sentence_shape,
         ) = next(model_data._gen_batch(batch_size=batch_size))
 
         assert (
-            batch_intent_mask.shape[0] == batch_size
-            and batch_action_name_mask.shape[0] == batch_size
+            batch_label_ids.shape[0] == batch_size
+            and batch_dialogue_length.shape[0] == batch_size
+        )
+        # batch and dialogue dimensions are NOT combined for masks
+        assert (
+            batch_slots_mask.shape[0] == batch_size
+            and batch_intent_mask.shape[0] == batch_size
             and batch_entities_mask.shape[0] == batch_size
-            and batch_slots_mask.shape[0] == batch_size
+            and batch_action_name_mask.shape[0] == batch_size
+        )
+        # some features might be "fake", in which case their sequence length is `0`
+        seq_len = max(
+            [
+                batch_intent_sentence_shape[1],
+                batch_action_name_sentence_shape[1],
+                batch_entities_sentence_shape[1],
+                batch_slots_sentence_shape[1],
+            ]
+        )
+        assert (
+            batch_intent_sentence_shape[1] == seq_len
+            or batch_intent_sentence_shape[1] == 0
         )
         assert (
-            batch_intent_sentence_3[1]
-            == batch_action_name_sentence_3[1]
-            == batch_entities_sentence_3[1]
-            == batch_slots_sentence_3[1]
+            batch_action_name_sentence_shape[1] == seq_len
+            or batch_action_name_sentence_shape[1] == 0
+        )
+        assert (
+            batch_entities_sentence_shape[1] == seq_len
+            or batch_entities_sentence_shape[1] == 0
+        )
+        assert (
+            batch_slots_sentence_shape[1] == seq_len
+            or batch_slots_sentence_shape[1] == 0
         )
 
         (
-            batch_label_ids,
+            batch_action_name_mask,
+            batch_action_name_sentence_indices,
+            batch_action_name_sentence_data,
+            batch_action_name_sentence_shape,
+            batch_dialogue_length,
             batch_entities_mask,
-            batch_entities_sentence_1,
-            batch_entities_sentence_2,
-            batch_entities_sentence_3,
+            batch_entities_sentence_indices,
+            batch_entities_sentence_data,
+            batch_entities_sentence_shape,
             batch_intent_mask,
-            batch_intent_sentence_1,
-            batch_intent_sentence_2,
-            batch_intent_sentence_3,
+            batch_intent_sentence_indices,
+            batch_intent_sentence_data,
+            batch_intent_sentence_shape,
+            batch_label_ids,
             batch_slots_mask,
-            batch_slots_sentence_1,
-            batch_slots_sentence_2,
-            batch_slots_sentence_3,
-            batch_action_name_mask,
-            batch_action_name_sentence_1,
-            batch_action_name_sentence_2,
-            batch_action_name_sentence_3,
-            batch_dialogue_length,
+            batch_slots_sentence_indices,
+            batch_slots_sentence_data,
+            batch_slots_sentence_shape,
         ) = next(
             model_data._gen_batch(
                 batch_size=batch_size, batch_strategy="balanced", shuffle=True
@@ -498,16 +527,33 @@ async def test_gen_batch(self, trained_policy: TEDPolicy, default_domain: Domain
             )
         )
 
         assert (
-            batch_intent_mask.shape[0] == batch_size
-            and batch_action_name_mask.shape[0] == batch_size
-            and batch_entities_mask.shape[0] == batch_size
-            and batch_slots_mask.shape[0] == batch_size
+            batch_label_ids.shape[0] == batch_size
+            and batch_dialogue_length.shape[0] == batch_size
+        )
+        # some features might be "fake", in which case their sequence length is `0`
+        seq_len = max(
+            [
+                batch_intent_sentence_shape[1],
+                batch_action_name_sentence_shape[1],
+                batch_entities_sentence_shape[1],
+                batch_slots_sentence_shape[1],
+            ]
+        )
+        assert (
+            batch_intent_sentence_shape[1] == seq_len
+            or batch_intent_sentence_shape[1] == 0
        )
         assert (
-            batch_intent_sentence_3[1]
-            == batch_action_name_sentence_3[1]
-            ==
batch_entities_sentence_3[1] - == batch_slots_sentence_3[1] + batch_action_name_sentence_shape[1] == seq_len + or batch_action_name_sentence_shape[1] == 0 + ) + assert ( + batch_entities_sentence_shape[1] == seq_len + or batch_entities_sentence_shape[1] == 0 + ) + assert ( + batch_slots_sentence_shape[1] == seq_len + or batch_slots_sentence_shape[1] == 0 ) @@ -956,13 +1002,14 @@ def test_predict_action_listen( ActionExecuted(intent_mapping[1], policy="policy_0_MappingPolicy"), ] tracker = get_tracker(events) - scores = policy.predict_action_probabilities( + prediction = policy.predict_action_probabilities( tracker, domain_with_mapping, RegexInterpreter() - ).probabilities - index = scores.index(max(scores)) - action_planned = domain_with_mapping.action_names[index] + ) + index = prediction.probabilities.index(max(prediction.probabilities)) + action_planned = domain_with_mapping.action_names_or_texts[index] + assert not prediction.is_end_to_end_prediction assert action_planned == ACTION_LISTEN_NAME - assert scores != [0] * domain_with_mapping.num_actions + assert prediction.probabilities != [0] * domain_with_mapping.num_actions def test_do_not_follow_other_policy( self, @@ -977,10 +1024,11 @@ def test_do_not_follow_other_policy( ActionExecuted(intent_mapping[1], policy="other_policy"), ] tracker = get_tracker(events) - scores = policy.predict_action_probabilities( + prediction = policy.predict_action_probabilities( tracker, domain_with_mapping, RegexInterpreter() - ).probabilities - assert scores == [0] * domain_with_mapping.num_actions + ) + assert prediction.probabilities == [0] * domain_with_mapping.num_actions + assert not prediction.is_end_to_end_prediction class TestFallbackPolicy(PolicyTestCollection): @@ -1088,8 +1136,8 @@ async def test_affirmation( ) assert "greet" == tracker.latest_message.parse_data["intent"][INTENT_NAME_KEY] - assert tracker.export_stories(MarkdownStoryWriter()) == ( - "## sender\n* greet\n - utter_hello\n* greet\n" + assert tracker.export_stories(MarkdownStoryWriter(), e2e=True) == ( + "## sender\n* greet: Random\n - utter_hello\n* greet: Random\n" ) def test_ask_rephrase(self, trained_policy: Policy, default_domain: Domain): @@ -1217,8 +1265,8 @@ async def test_rephrasing_instead_affirmation( ) assert "bye" == tracker.latest_message.parse_data["intent"][INTENT_NAME_KEY] - assert tracker.export_stories(MarkdownStoryWriter()) == ( - "## sender\n* greet\n - utter_hello\n* bye\n" + assert tracker.export_stories(MarkdownStoryWriter(), e2e=True) == ( + "## sender\n* greet: Random\n - utter_hello\n* bye: Random\n" ) def test_unknown_instead_affirmation( diff --git a/tests/core/test_processor.py b/tests/core/test_processor.py index 86cf704b82e9..553edcf66774 100644 --- a/tests/core/test_processor.py +++ b/tests/core/test_processor.py @@ -39,6 +39,7 @@ SessionStarted, Event, SlotSet, + DefinePrevUserUtteredFeaturization, ActionExecutionRejected, LoopInterrupted, ) @@ -741,6 +742,7 @@ async def test_handle_message_with_session_start( [{"entity": entity, "start": 6, "end": 22, "value": "Core"}], ), SlotSet(entity, slot_1[entity]), + DefinePrevUserUtteredFeaturization(False), ActionExecuted("utter_greet"), BotUttered("hey there Core!", metadata={"template_name": "utter_greet"}), ActionExecuted(ACTION_LISTEN_NAME), @@ -762,6 +764,7 @@ async def test_handle_message_with_session_start( ], ), SlotSet(entity, slot_2[entity]), + DefinePrevUserUtteredFeaturization(False), ActionExecuted("utter_greet"), BotUttered( "hey there post-session start hello!", @@ -899,10 +902,12 @@ 
async def test_restart_triggers_session_start( [{"entity": entity, "start": 6, "end": 23, "value": "name1"}], ), SlotSet(entity, slot_1[entity]), + DefinePrevUserUtteredFeaturization(use_text_for_featurization=False), ActionExecuted("utter_greet"), BotUttered("hey there name1!", metadata={"template_name": "utter_greet"}), ActionExecuted(ACTION_LISTEN_NAME), UserUttered("/restart", {INTENT_NAME_KEY: "restart", "confidence": 1.0}), + DefinePrevUserUtteredFeaturization(use_text_for_featurization=False), ActionExecuted(ACTION_RESTART_NAME), Restarted(), ActionExecuted(ACTION_SESSION_START_NAME), @@ -910,7 +915,8 @@ async def test_restart_triggers_session_start( # No previous slot is set due to restart. ActionExecuted(ACTION_LISTEN_NAME), ] - assert list(tracker.events) == expected + for actual, expected in zip(tracker.events, expected): + assert actual == expected async def test_handle_message_if_action_manually_rejects( @@ -1091,3 +1097,65 @@ async def mocked_run(*args: Any, **kwargs: Any) -> List[Event]: ] for event, expected in zip(tracker.events, expected_events): assert event == expected + + +async def test_logging_of_end_to_end_action(): + end_to_end_action = "hi, how are you?" + domain = Domain( + intents=["greet"], + entities=[], + slots=[], + templates={}, + action_names=[], + forms={}, + action_texts=[end_to_end_action], + ) + + conversation_id = "test_logging_of_end_to_end_action" + user_message = "/greet" + + class ConstantEnsemble(PolicyEnsemble): + def __init__(self) -> None: + super().__init__([]) + self.number_of_calls = 0 + + def probabilities_using_best_policy( + self, + tracker: DialogueStateTracker, + domain: Domain, + interpreter: NaturalLanguageInterpreter, + **kwargs: Any, + ) -> PolicyPrediction: + if self.number_of_calls == 0: + prediction = PolicyPrediction.for_action_name( + domain, end_to_end_action, "some policy" + ) + prediction.is_end_to_end_prediction = True + self.number_of_calls += 1 + return prediction + else: + return PolicyPrediction.for_action_name(domain, ACTION_LISTEN_NAME) + + tracker_store = InMemoryTrackerStore(domain) + processor = MessageProcessor( + RegexInterpreter(), + ConstantEnsemble(), + domain, + tracker_store, + NaturalLanguageGenerator.create(None, domain), + ) + + await processor.handle_message(UserMessage(user_message, sender_id=conversation_id)) + + tracker = tracker_store.retrieve(conversation_id) + expected_events = [ + ActionExecuted(ACTION_SESSION_START_NAME), + SessionStarted(), + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(user_message, intent={"name": "greet"}), + ActionExecuted(action_text=end_to_end_action), + BotUttered("hi, how are you?", {}, {}, 123), + ActionExecuted(ACTION_LISTEN_NAME), + ] + for event, expected in zip(tracker.events, expected_events): + assert event == expected diff --git a/tests/core/training/converters/test_story_markdown_to_yaml_converter.py b/tests/core/training/converters/test_story_markdown_to_yaml_converter.py index afd19ff64b34..4bc1158089a2 100644 --- a/tests/core/training/converters/test_story_markdown_to_yaml_converter.py +++ b/tests/core/training/converters/test_story_markdown_to_yaml_converter.py @@ -7,14 +7,6 @@ StoryMarkdownToYamlConverter, ) -from rasa.shared.core.training_data.story_reader.markdown_story_reader import ( - MarkdownStoryReader, -) - -from rasa.shared.core.training_data.story_reader.yaml_story_reader import ( - YAMLStoryReader, -) - from rasa.shared.constants import LATEST_TRAINING_DATA_FORMAT_VERSION @@ -34,10 +26,10 @@ def 
test_converter_filters_correct_files(training_data_file: Text, should_filter async def test_stories_are_converted(tmp_path: Path): converted_data_folder = tmp_path / "converted_data" - os.mkdir(converted_data_folder) + converted_data_folder.mkdir() - training_data_folder = tmp_path / "data/core" - os.makedirs(training_data_folder, exist_ok=True) + training_data_folder = tmp_path / "data" / "core" + training_data_folder.mkdir(parents=True) training_data_file = Path(training_data_folder / "stories.md") simple_story_md = """ @@ -48,8 +40,7 @@ async def test_stories_are_converted(tmp_path: Path): - slot{"name": ["value1", "value2"]} """ - with open(training_data_file, "w") as f: - f.write(simple_story_md) + training_data_file.write_text(simple_story_md) with pytest.warns(None) as warnings: await StoryMarkdownToYamlConverter().convert_and_write( @@ -81,10 +72,10 @@ async def test_stories_are_converted(tmp_path: Path): async def test_test_stories(tmp_path: Path): converted_data_folder = tmp_path / "converted_data" - os.mkdir(converted_data_folder) + converted_data_folder.mkdir() test_data_folder = tmp_path / "tests" - os.makedirs(test_data_folder, exist_ok=True) + test_data_folder.mkdir(exist_ok=True) test_data_file = Path(test_data_folder / "test_stories.md") simple_story_md = """ @@ -95,8 +86,7 @@ async def test_test_stories(tmp_path: Path): - action_set_faq_slot """ - with open(test_data_file, "w") as f: - f.write(simple_story_md) + test_data_file.write_text(simple_story_md) with pytest.warns(None) as warnings: await StoryMarkdownToYamlConverter().convert_and_write( @@ -105,7 +95,7 @@ async def test_test_stories(tmp_path: Path): assert not warnings - assert len(os.listdir(converted_data_folder)) == 1 + assert len(list(converted_data_folder.glob("*"))) == 1 with open(f"{converted_data_folder}/test_stories_converted.yml", "r") as f: content = f.read() @@ -126,10 +116,10 @@ async def test_test_stories(tmp_path: Path): async def test_test_stories_conversion_response_key(tmp_path: Path): converted_data_folder = tmp_path / "converted_data" - os.mkdir(converted_data_folder) + converted_data_folder.mkdir() test_data_folder = tmp_path / "tests" - os.makedirs(test_data_folder, exist_ok=True) + test_data_folder.mkdir(exist_ok=True) test_data_file = Path(test_data_folder / "test_stories.md") simple_story_md = """ @@ -139,8 +129,7 @@ async def test_test_stories_conversion_response_key(tmp_path: Path): - utter_out_of_scope/other """ - with open(test_data_file, "w") as f: - f.write(simple_story_md) + test_data_file.write_text(simple_story_md) await StoryMarkdownToYamlConverter().convert_and_write( test_data_file, converted_data_folder @@ -163,10 +152,10 @@ async def test_test_stories_conversion_response_key(tmp_path: Path): async def test_stories_conversion_response_key(tmp_path: Path): converted_data_folder = tmp_path / "converted_data" - os.mkdir(converted_data_folder) + converted_data_folder.mkdir() - training_data_folder = tmp_path / "data/core" - os.makedirs(training_data_folder, exist_ok=True) + training_data_folder = tmp_path / "data" / "core" + training_data_folder.mkdir(parents=True) training_data_file = Path(training_data_folder / "stories.md") simple_story_md = """ @@ -175,8 +164,7 @@ async def test_stories_conversion_response_key(tmp_path: Path): - utter_out_of_scope/other """ - with open(training_data_file, "w") as f: - f.write(simple_story_md) + training_data_file.write_text(simple_story_md) await StoryMarkdownToYamlConverter().convert_and_write( training_data_file, converted_data_folder diff 
--git a/tests/core/training/test_interactive.py b/tests/core/training/test_interactive.py index 2f7fcd146c6c..90b3203ee3df 100644 --- a/tests/core/training/test_interactive.py +++ b/tests/core/training/test_interactive.py @@ -597,8 +597,8 @@ async def test_write_domain_to_file_with_form(tmp_path: Path): interactive._write_domain_to_file(domain_path, events, old_domain) - assert set(Domain.from_path(domain_path).action_names) == set( - old_domain.action_names + assert set(Domain.from_path(domain_path).action_names_or_texts) == set( + old_domain.action_names_or_texts ) diff --git a/tests/docs/test_docs_training_data.py b/tests/docs/test_docs_training_data.py index 706b44599fa2..1008f7248b9c 100644 --- a/tests/docs/test_docs_training_data.py +++ b/tests/docs/test_docs_training_data.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List, Text +from typing import List, Text, Tuple import re import pytest @@ -28,7 +28,7 @@ def test_docs_training_data(mdx_file_path: Path): mdx_content = handle.read() matches = TRAINING_DATA_CODEBLOCK_RE.finditer(mdx_content) - lines_with_errors: List[Text] = [] + lines_with_errors: List[Tuple[Text, Text]] = [] for match in matches: yaml_path = match.group("yaml_path") @@ -46,11 +46,14 @@ def test_docs_training_data(mdx_file_path: Path): for schema in schemas_to_try: try: rasa.shared.utils.validation.validate_yaml_schema(codeblock, schema) - except ValueError: - lines_with_errors.append(str(line_number)) + except ValueError as error: + lines_with_errors.append((str(line_number), str(error))) if lines_with_errors: + error_details = "\n\n" + "\n".join( + f" - At line {line}: {error} " for line, error in lines_with_errors + ) raise AssertionError( f"({mdx_file_path}): Invalid training data found " - f"at line{'s' if len(lines_with_errors) > 1 else ''} {', '.join(lines_with_errors)}" + f"at line{'s' if len(lines_with_errors) > 1 else ''}: {error_details}" ) diff --git a/tests/nlu/classifiers/test_diet_classifier.py b/tests/nlu/classifiers/test_diet_classifier.py index 6f23f613fe59..90f20a61039e 100644 --- a/tests/nlu/classifiers/test_diet_classifier.py +++ b/tests/nlu/classifiers/test_diet_classifier.py @@ -3,7 +3,7 @@ import numpy as np import pytest from unittest.mock import Mock -from typing import List, Tuple, Text, Dict, Any, Optional +from typing import List, Text, Dict, Any from rasa.shared.nlu.training_data.features import Features from rasa.nlu import train @@ -27,6 +27,8 @@ EVAL_NUM_EXAMPLES, CHECKPOINT_MODEL, BILOU_FLAG, + ENTITY_RECOGNITION, + INTENT_CLASSIFICATION, ) from rasa.nlu.components import ComponentBuilder from rasa.nlu.classifiers.diet_classifier import DIETClassifier @@ -171,6 +173,48 @@ async def test_train_persist_load_with_different_settings(component_builder, tmp ) +async def test_train_persist_load_with_only_entity_recognition( + component_builder, tmpdir +): + pipeline = [ + {"name": "WhitespaceTokenizer"}, + {"name": "CountVectorsFeaturizer"}, + { + "name": "DIETClassifier", + ENTITY_RECOGNITION: True, + INTENT_CLASSIFICATION: False, + EPOCHS: 1, + }, + ] + await _train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir, should_finetune=False + ) + await _train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir, should_finetune=True + ) + + +async def test_train_persist_load_with_only_intent_classification( + component_builder, tmpdir +): + pipeline = [ + {"name": "WhitespaceTokenizer"}, + {"name": "CountVectorsFeaturizer"}, + { + "name": "DIETClassifier", + 
ENTITY_RECOGNITION: False, + INTENT_CLASSIFICATION: True, + EPOCHS: 1, + }, + ] + await _train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir, should_finetune=False + ) + await _train_persist_load_with_different_settings( + pipeline, component_builder, tmpdir, should_finetune=True + ) + + async def test_raise_error_on_incorrect_pipeline(component_builder, tmp_path: Path): _config = RasaNLUModelConfig( { diff --git a/tests/nlu/conftest.py b/tests/nlu/conftest.py index 11ec0e1506fc..a62df12ae9fc 100644 --- a/tests/nlu/conftest.py +++ b/tests/nlu/conftest.py @@ -10,17 +10,6 @@ DEFAULT_DATA_PATH = "data/examples/rasa/demo-rasa.json" -@pytest.fixture(scope="session") -def component_builder(): - return ComponentBuilder() - - -@pytest.fixture(scope="session") -def spacy_nlp(component_builder, blank_config): - spacy_nlp_config = {"name": "SpacyNLP"} - return component_builder.create_component(spacy_nlp_config, blank_config).nlp - - @pytest.fixture(scope="session") def spacy_nlp_component(component_builder, blank_config): spacy_nlp_config = {"name": "SpacyNLP"} diff --git a/tests/shared/core/test_domain.py b/tests/shared/core/test_domain.py index c05077ac2d3f..53181a00a989 100644 --- a/tests/shared/core/test_domain.py +++ b/tests/shared/core/test_domain.py @@ -1,7 +1,7 @@ import copy import json from pathlib import Path -from typing import Dict, List, Text, Any, Union, Set +from typing import Dict, List, Text, Any, Union, Set, Optional import pytest @@ -33,6 +33,7 @@ State, Domain, KEY_FORMS, + KEY_E2E_ACTIONS, ) from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.core.events import ActionExecuted, SlotSet, UserUttered @@ -181,7 +182,7 @@ def test_domain_from_template(): assert not domain.is_empty() assert len(domain.intents) == 10 + len(DEFAULT_INTENTS) - assert len(domain.action_names) == 16 + assert len(domain.action_names_or_texts) == 16 def test_avoid_action_repetition(default_domain: Domain): @@ -196,7 +197,7 @@ def test_avoid_action_repetition(default_domain: Domain): """ ) - assert len(domain.action_names) == len(DEFAULT_ACTION_NAMES) + 1 + assert len(domain.action_names_or_texts) == len(DEFAULT_ACTION_NAMES) + 1 def test_utter_templates(): @@ -257,7 +258,7 @@ def test_domain_fails_on_unknown_custom_slot_type(tmpdir, domain_unkown_slot_typ def test_domain_to_dict(): - test_yaml = """ + test_yaml = f""" actions: - action_save_world config: @@ -272,6 +273,9 @@ def test_domain_to_dict(): session_config: carry_over_slots_to_new_session: true session_expiration_time: 60 + {KEY_E2E_ACTIONS}: + - Hello, dear user + - what's up slots: some_slot: type: categorical @@ -302,6 +306,7 @@ def test_domain_to_dict(): "type": "rasa.shared.core.slots.CategoricalSlot", } }, + KEY_E2E_ACTIONS: ["Hello, dear user", "what's up"], } @@ -337,16 +342,18 @@ def test_domain_to_yaml(): def test_merge_yaml_domains(): - test_yaml_1 = """config: + test_yaml_1 = f"""config: store_entities_as_slots: true entities: [] intents: [] -slots: {} +slots: {{}} responses: utter_greet: - - text: hey there!""" + - text: hey there! +{KEY_E2E_ACTIONS}: +- Hi""" - test_yaml_2 = """config: + test_yaml_2 = f"""config: store_entities_as_slots: false session_config: session_expiration_time: 20 @@ -358,6 +365,8 @@ def test_merge_yaml_domains(): slots: cuisine: type: text +{KEY_E2E_ACTIONS}: +- Bye responses: utter_goodbye: - text: bye! 
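For context on the assertion in the next hunk: a minimal sketch of the merge behaviour these two fixtures exercise, assuming `KEY_E2E_ACTIONS` resolves to the `e2e_actions` YAML key and that `Domain.merge` combines and sorts the end-to-end action texts (both inferred from the assertions in this test, not from library docs).

```python
# Hedged sketch only; `e2e_actions` as the YAML key is an assumption
# inferred from how KEY_E2E_ACTIONS is interpolated in the fixtures above.
from rasa.shared.core.domain import Domain

domain_1 = Domain.from_yaml("e2e_actions:\n- Hi\n")
domain_2 = Domain.from_yaml("e2e_actions:\n- Bye\n")

# Merging combines the end-to-end bot utterances from both domains;
# the sorted result is what the test below expects.
merged = domain_1.merge(domain_2)
assert merged.action_texts == ["Bye", "Hi"]
```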
@@ -391,6 +400,7 @@ def test_merge_yaml_domains(): "utter_goodbye": [{"text": "bye!"}], } assert domain.session_config == SessionConfig(20, True) + assert domain.action_texts == ["Bye", "Hi"] @pytest.mark.parametrize("default_intent", DEFAULT_INTENTS) @@ -462,7 +472,8 @@ def test_merge_with_empty_domain(): assert merged.as_dict() == domain.as_dict() -def test_merge_with_empty_other_domain(): +@pytest.mark.parametrize("other", [Domain.empty(), None]) +def test_merge_with_empty_other_domain(other: Optional[Domain]): domain = Domain.from_yaml( """config: store_entities_as_slots: false @@ -483,7 +494,7 @@ def test_merge_with_empty_other_domain(): - text: hey you!""" ) - merged = domain.merge(Domain.empty(), override=True) + merged = domain.merge(other, override=True) assert merged.as_dict() == domain.as_dict() @@ -1041,7 +1052,7 @@ def test_domain_deepcopy(): assert new_domain.session_config == domain.session_config assert new_domain._custom_actions == domain._custom_actions assert new_domain.user_actions == domain.user_actions - assert new_domain.action_names == domain.action_names + assert new_domain.action_names_or_texts == domain.action_names_or_texts assert new_domain.store_entities_as_slots == domain.store_entities_as_slots # not the same objects @@ -1059,7 +1070,7 @@ def test_domain_deepcopy(): assert new_domain.session_config is not domain.session_config assert new_domain._custom_actions is not domain._custom_actions assert new_domain.user_actions is not domain.user_actions - assert new_domain.action_names is not domain.action_names + assert new_domain.action_names_or_texts is not domain.action_names_or_texts @pytest.mark.parametrize( diff --git a/tests/shared/core/test_events.py b/tests/shared/core/test_events.py index cbbd0e71c5df..f49063c09344 100644 --- a/tests/shared/core/test_events.py +++ b/tests/shared/core/test_events.py @@ -9,6 +9,7 @@ import rasa.shared.utils.common import rasa.shared.core.events +from rasa.shared.exceptions import UnsupportedFeatureException from rasa.shared.core.constants import ACTION_LISTEN_NAME, ACTION_SESSION_START_NAME from rasa.shared.core.events import ( Event, @@ -28,8 +29,9 @@ UserUtteranceReverted, AgentUttered, SessionStarted, - md_format_message, + format_message, ) +from rasa.shared.nlu.constants import INTENT_NAME_KEY from tests.core.policies.test_rule_policy import GREET_INTENT_NAME, UTTER_GREET_ACTION @@ -319,17 +321,15 @@ def test_user_uttered_intent_name(event: UserUttered, intent_name: Optional[Text def test_md_format_message(): - assert ( - md_format_message("Hello there!", intent="greet", entities=[]) == "Hello there!" - ) + assert format_message("Hello there!", intent="greet", entities=[]) == "Hello there!" 
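Since `md_format_message` is now `format_message`, here is a hedged sketch of the entity annotation the two tests below rely on; the exact output string is an assumption based on Rasa's short entity syntax, not quoted from the tests.

```python
# Sketch of the renamed helper; the expected string is an assumption.
from rasa.shared.core.events import format_message

formatted = format_message(
    "I am from Berlin.",
    intent="location",
    entities=[{"start": 10, "end": 16, "entity": "city", "value": "Berlin"}],
)

# The entity span is annotated inline using the short [value](entity) syntax.
assert formatted == "I am from [Berlin](city)."
```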
def test_md_format_message_empty(): - assert md_format_message("", intent=None, entities=[]) == "" + assert format_message("", intent=None, entities=[]) == "" def test_md_format_message_using_short_entity_syntax(): - formatted = md_format_message( + formatted = format_message( "I am from Berlin.", intent="location", entities=[{"start": 10, "end": 16, "entity": "city", "value": "Berlin"}], @@ -338,7 +338,7 @@ def test_md_format_message_using_short_entity_syntax(): def test_md_format_message_using_long_entity_syntax(): - formatted = md_format_message( + formatted = format_message( "I am from Berlin in Germany.", intent="location", entities=[ @@ -507,3 +507,20 @@ def test_events_begin_with_session_start( rasa.shared.core.events.do_events_begin_with_session_start(test_events) == begin_with_session_start ) + + +@pytest.mark.parametrize( + "end_to_end_event", + [ + ActionExecuted(action_text="I insist on using Markdown"), + UserUttered(text="Markdown is much more readable"), + UserUttered( + text="but YAML ❤️", + intent={INTENT_NAME_KEY: "use_yaml"}, + use_text_for_featurization=True, + ), + ], +) +def test_print_end_to_end_events_in_markdown(end_to_end_event: Event): + with pytest.raises(UnsupportedFeatureException): + end_to_end_event.as_story_string() diff --git a/tests/shared/core/test_trackers.py b/tests/shared/core/test_trackers.py index 50d07251a4a8..755573743924 100644 --- a/tests/shared/core/test_trackers.py +++ b/tests/shared/core/test_trackers.py @@ -1,12 +1,14 @@ import json import logging import os +import textwrap import time from pathlib import Path import tempfile from typing import List, Text, Dict, Any, Type import fakeredis +import freezegun import pytest import rasa.shared.utils.io @@ -36,6 +38,8 @@ LegacyForm, LegacyFormValidation, LoopInterrupted, + DefinePrevUserUtteredFeaturization, + EntitiesAdded, ) from rasa.shared.core.slots import ( FloatSlot, @@ -65,9 +69,6 @@ get_tracker, ) -from rasa.shared.core.training_data.story_writer.markdown_story_writer import ( - MarkdownStoryWriter, -) from rasa.shared.nlu.constants import ACTION_NAME, PREDICTED_CONFIDENCE_KEY domain = Domain.load("examples/moodbot/domain.yml") @@ -204,10 +205,12 @@ async def test_tracker_state_regression_with_bot_utterance(default_agent: Agent) None, "action_listen", "greet", + None, # DefinePrevUserUtteredFeaturization "utter_greet", None, "action_listen", "greet", + None, # DefinePrevUserUtteredFeaturization "utter_greet", None, "action_listen", @@ -230,6 +233,7 @@ async def test_bot_utterance_comes_after_action_event(default_agent): "session_started", "action", "user", + "user_featurization", "action", "bot", "action", @@ -627,23 +631,6 @@ def test_session_started_not_part_of_applied_events(default_agent: Agent): assert tracker.applied_events() == list(tracker.events)[6:] -async def test_tracker_dump_e2e_story(default_agent: Agent): - sender_id = "test_tracker_dump_e2e_story" - - await default_agent.handle_text("/greet", sender_id=sender_id) - await default_agent.handle_text("/goodbye", sender_id=sender_id) - tracker = default_agent.tracker_store.get_or_create_tracker(sender_id) - - story = tracker.export_stories(MarkdownStoryWriter(), e2e=True) - assert story.strip().split("\n") == [ - "## test_tracker_dump_e2e_story", - "* greet: /greet", - " - utter_greet", - "* goodbye: /goodbye", - " - utter_goodbye", - ] - - def test_get_last_event_for(): events = [ActionExecuted("one"), user_uttered("two", 1)] @@ -1238,3 +1225,156 @@ def test_trackers_for_conversation_sessions( subtrackers = 
trackers_module.get_trackers_for_conversation_sessions(tracker) assert len(subtrackers) == n_subtrackers + + +def test_policy_predictions_dont_change_persistence(): + original_user_message = UserUttered("hi", intent={"name": "greet"}) + tracker = DialogueStateTracker.from_events( + "Vova", + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered("hi", intent={"name": "greet"}), + DefinePrevUserUtteredFeaturization(True), + EntitiesAdded(entities=[{"entity": "entity1", "value": "value1"}]), + ], + ) + + user_message: UserUttered = list(tracker.events)[1] + # The entities from the policy predictions are accessible + assert user_message.entities + + actual_serialized = user_message.as_dict() + + # Assert entities predicted by policies are not persisted + assert not actual_serialized["parse_data"]["entities"] + + expected_serialized = original_user_message.as_dict() + # don't compare timestamps + expected_serialized.pop("timestamp") + actual_serialized.pop("timestamp") + + assert actual_serialized == expected_serialized + + +@freezegun.freeze_time("2018-01-01") +def test_policy_prediction_reflected_in_tracker_state(): + entities_predicted_by_policy = [{"entity": "entity1", "value": "value1"}] + nlu_entities = [{"entity": "entityNLU", "value": "value100"}] + + tracker = DialogueStateTracker.from_events( + "Tester", + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered( + "hi", + intent={"name": "greet"}, + entities=nlu_entities.copy(), + message_id="unique", + metadata={"some": "data"}, + ), + DefinePrevUserUtteredFeaturization(True), + EntitiesAdded(entities=entities_predicted_by_policy), + ], + ) + + tracker_state = tracker.current_state() + + expected_state = { + "sender_id": "Tester", + "slots": {}, + "latest_message": { + "intent": {"name": "greet"}, + "entities": nlu_entities + entities_predicted_by_policy, + "text": "hi", + "message_id": "unique", + "metadata": {"some": "data"}, + }, + "latest_event_time": 1514764800.0, + "followup_action": None, + "paused": False, + "events": None, + "latest_input_channel": None, + "active_loop": {}, + "latest_action": {"action_name": "action_listen"}, + "latest_action_name": "action_listen", + } + + assert tracker_state == expected_state + + # Make sure we didn't change the actual event + assert tracker.latest_message.parse_data["entities"] == nlu_entities + + +def test_autofill_slots_for_policy_entities(): + policy_entity, policy_entity_value = "policy_entity", "end-to-end" + nlu_entity, nlu_entity_value = "nlu_entity", "nlu rocks" + domain = Domain.from_yaml( + textwrap.dedent( + f""" + entities: + - {nlu_entity} + - {policy_entity} + + slots: + {nlu_entity}: + type: text + {policy_entity}: + type: text + """ + ) + ) + + tracker = DialogueStateTracker.from_events( + "some sender", + evts=[ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered( + "hi", + intent={"name": "greet"}, + entities=[{"entity": nlu_entity, "value": nlu_entity_value}], + ), + DefinePrevUserUtteredFeaturization(True), + EntitiesAdded( + entities=[ + {"entity": policy_entity, "value": policy_entity_value}, + {"entity": nlu_entity, "value": nlu_entity_value}, + ] + ), + ], + domain=domain, + slots=domain.slots, + ) + + # Slots are correctly set + assert tracker.slots[nlu_entity].value == nlu_entity_value + assert tracker.slots[policy_entity].value == policy_entity_value + + expected_events = [ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered( + "hi", + intent={"name": "greet"}, + entities=[ + {"entity": nlu_entity, "value": nlu_entity_value}, + # Added by 
`EntitiesAdded` + {"entity": policy_entity, "value": policy_entity_value}, + ], + ), + # SlotSet event added for entity predicted by NLU + SlotSet(nlu_entity, nlu_entity_value), + DefinePrevUserUtteredFeaturization(True), + EntitiesAdded( + entities=[ + {"entity": policy_entity, "value": policy_entity_value}, + {"entity": nlu_entity, "value": nlu_entity_value}, + ] + ), + # SlotSet event added for entity predicted by policies + # This event is somewhat redundant. We don't deduplicate, as this is a true + # reflection of the given events and it doesn't change the actual state. + SlotSet(nlu_entity, nlu_entity_value), + SlotSet(policy_entity, policy_entity_value), + ] + + for actual, expected in zip(tracker.events, expected_events): + assert actual == expected diff --git a/tests/shared/core/training_data/story_reader/test_common_story_reader.py b/tests/shared/core/training_data/story_reader/test_common_story_reader.py index c6926bb13e9e..338616e13d48 100644 --- a/tests/shared/core/training_data/story_reader/test_common_story_reader.py +++ b/tests/shared/core/training_data/story_reader/test_common_story_reader.py @@ -16,7 +16,7 @@ from rasa.shared.nlu.interpreter import RegexInterpreter from rasa.shared.nlu.constants import ACTION_NAME, ENTITIES, INTENT, INTENT_NAME_KEY -from rasa.utils.tensorflow.model_data_utils import surface_attributes +from rasa.utils.tensorflow.model_data_utils import _surface_attributes @pytest.mark.parametrize( @@ -109,7 +109,7 @@ async def test_generate_training_data_with_cycles( stories_file, default_domain, augmentation_factor=0 ) - training_data, label_ids = featurizer.featurize_trackers( + _, label_ids, _ = featurizer.featurize_trackers( training_trackers, default_domain, interpreter=RegexInterpreter() ) @@ -226,7 +226,7 @@ async def test_load_multi_file_training_data( hashed.append(json.dumps(sts + acts, sort_keys=True)) hashed = sorted(hashed, reverse=True) - data, label_ids = featurizer.featurize_trackers( + data, label_ids, _ = featurizer.featurize_trackers( trackers, default_domain, interpreter=RegexInterpreter() ) @@ -244,7 +244,7 @@ async def test_load_multi_file_training_data( hashed_mul.append(json.dumps(sts_mul + acts_mul, sort_keys=True)) hashed_mul = sorted(hashed_mul, reverse=True) - data_mul, label_ids_mul = featurizer_mul.featurize_trackers( + data_mul, label_ids_mul, _ = featurizer_mul.featurize_trackers( trackers_mul, default_domain, interpreter=RegexInterpreter() ) @@ -252,8 +252,8 @@ async def test_load_multi_file_training_data( # we check for intents, action names and entities -- the features which # are included in the story files - data = surface_attributes(data) - data_mul = surface_attributes(data_mul) + data = _surface_attributes(data) + data_mul = _surface_attributes(data_mul) for attribute in [INTENT, ACTION_NAME, ENTITIES]: if attribute not in data or attribute not in data_mul: diff --git a/tests/shared/core/training_data/story_reader/test_markdown_story_reader.py b/tests/shared/core/training_data/story_reader/test_markdown_story_reader.py index 8160472808d9..02f4a322defc 100644 --- a/tests/shared/core/training_data/story_reader/test_markdown_story_reader.py +++ b/tests/shared/core/training_data/story_reader/test_markdown_story_reader.py @@ -360,7 +360,7 @@ async def test_read_rules_without_stories(default_domain: Domain): ], ) def test_e2e_parsing(line: Text, expected: Dict): - actual = MarkdownStoryReader.parse_e2e_message(line) + actual = MarkdownStoryReader().parse_e2e_message(line) assert actual.as_dict() == 
expected diff --git a/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py b/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py index 83c4a68e0b7c..7ba582d19d94 100644 --- a/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py +++ b/tests/shared/core/training_data/story_reader/test_yaml_story_reader.py @@ -6,6 +6,7 @@ from rasa.shared.exceptions import FileNotFoundException, YamlSyntaxException import rasa.shared.utils.io from rasa.shared.constants import LATEST_TRAINING_DATA_FORMAT_VERSION +from rasa.core.actions.action import ACTION_LISTEN_NAME from rasa.core import training from rasa.shared.core.constants import RULE_SNIPPET_ACTION_NAME from rasa.shared.core.domain import Domain @@ -375,6 +376,43 @@ async def test_no_warning_if_intent_in_domain(default_domain: Domain): assert not len(record) +async def test_parsing_of_e2e_stories(default_domain: Domain): + yaml_file = "data/test_yaml_stories/stories_hybrid_e2e.yml" + tracker = await training.load_data( + yaml_file, + default_domain, + use_story_concatenation=False, + tracker_limit=1000, + remove_duplicates=False, + ) + + assert len(tracker) == 1 + + actual = list(tracker[0].events) + + expected = [ + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(intent={"name": "simple"}), + ActionExecuted("utter_greet"), + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered( + "I am looking for a Kenyan restaurant", + {"name": None}, + entities=[{"start": 19, "end": 25, "value": "Kenyan", "entity": "cuisine"}], + ), + ActionExecuted("", action_text="good for you"), + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(intent={"name": "goodbye"}), + ActionExecuted("utter_goodbye"), + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered("One more thing", {"name": None}), + ActionExecuted("", action_text="What?"), + ActionExecuted(ACTION_LISTEN_NAME), + ] + + assert actual == expected + + async def test_active_loop_is_parsed(default_domain: Domain): stories = ( f'version: "{LATEST_TRAINING_DATA_FORMAT_VERSION}"\n' @@ -431,8 +469,7 @@ def test_end_to_end_story_with_shortcut_intent(): """ story_as_yaml = rasa.shared.utils.io.read_yaml(story) - - steps = YAMLStoryReader().read_from_parsed_yaml(story_as_yaml) + steps = YAMLStoryReader(use_e2e=True).read_from_parsed_yaml(story_as_yaml) user_uttered = steps[0].events[0] assert user_uttered == UserUttered( @@ -527,3 +564,27 @@ async def test_story_with_retrieval_intent_warns( reader.read_from_file(file) assert len(record) == (1 if warning else 0) + + +@pytest.mark.parametrize("is_conversation_test", [True, False]) +def test_handles_mixed_steps_for_test_and_e2e_stories(is_conversation_test): + stories = """ + stories: + - story: hello world + steps: + - user: Hi + - bot: Hello? + - user: Well... + intent: suspicion + """ + + reader = YAMLStoryReader(use_e2e=is_conversation_test) + yaml_content = rasa.shared.utils.io.read_yaml(stories) + + steps = reader.read_from_parsed_yaml(yaml_content) + + events = steps[0].events + assert len(events) == 3 + assert events[0].text == "Hi" + assert events[1].action_text == "Hello?" + assert events[2].text == "Well..." 
diff --git a/tests/shared/core/training_data/story_writer/test_markdown_story_writer.py b/tests/shared/core/training_data/story_writer/test_markdown_story_writer.py new file mode 100644 index 000000000000..64d21ca10854 --- /dev/null +++ b/tests/shared/core/training_data/story_writer/test_markdown_story_writer.py @@ -0,0 +1,21 @@ +from rasa.core.agent import Agent +from rasa.shared.core.training_data.story_writer.markdown_story_writer import ( + MarkdownStoryWriter, +) + + +async def test_tracker_dump_e2e_story(default_agent: Agent): + sender_id = "test_tracker_dump_e2e_story" + + await default_agent.handle_text("/greet", sender_id=sender_id) + await default_agent.handle_text("/goodbye", sender_id=sender_id) + tracker = default_agent.tracker_store.get_or_create_tracker(sender_id) + + story = tracker.export_stories(MarkdownStoryWriter(), e2e=True) + assert story.strip().split("\n") == [ + "## test_tracker_dump_e2e_story", + "* greet: /greet", + " - utter_greet", + "* goodbye: /goodbye", + " - utter_goodbye", + ] diff --git a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py index fa746263b082..38830a9ccb66 100644 --- a/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py +++ b/tests/shared/core/training_data/story_writer/test_yaml_story_writer.py @@ -6,7 +6,11 @@ from rasa.shared.core.constants import ACTION_SESSION_START_NAME, ACTION_LISTEN_NAME from rasa.shared.core.domain import Domain -from rasa.shared.core.events import ActionExecuted, UserUttered +from rasa.shared.core.events import ( + ActionExecuted, + UserUttered, + DefinePrevUserUtteredFeaturization, +) from rasa.shared.core.trackers import DialogueStateTracker from rasa.shared.core.training_data.story_reader.markdown_story_reader import ( MarkdownStoryReader, ) @@ -18,9 +22,6 @@ YAMLStoryWriter, ) from rasa.shared.core.training_data.structures import STORY_START -from rasa.utils.endpoints import EndpointConfig - -import rasa.shared.utils.io @@ -139,10 +140,10 @@ def test_yaml_writer_avoids_dumping_not_existing_user_messages(): @pytest.mark.parametrize( - "input_yaml_file", ["data/test_yaml_stories/rules_with_stories_sorted.yaml",], + "input_yaml_file", ["data/test_yaml_stories/rules_with_stories_sorted.yaml"] ) def test_yaml_writer_dumps_rules( - input_yaml_file: Text, tmpdir: Path, default_domain: Domain, + input_yaml_file: Text, tmpdir: Path, default_domain: Domain ): original_yaml_reader = YAMLStoryReader(default_domain, None, False) original_yaml_story_steps = original_yaml_reader.read_from_file(input_yaml_file) @@ -182,3 +183,105 @@ def test_yaml_writer_stories_to_yaml(default_domain: Domain): assert isinstance(result, OrderedDict) assert "stories" in result assert len(result["stories"]) == 1 + + +def test_writing_end_to_end_stories(default_domain: Domain): + story_name = "test_writing_end_to_end_stories" + events = [ + # Training story with intent and action labels + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(intent={"name": "greet"}), + ActionExecuted("utter_greet"), + ActionExecuted(ACTION_LISTEN_NAME), + # Prediction story with intent and action labels + ActionExecuted(ACTION_LISTEN_NAME), + UserUttered(text="Hi", intent={"name": "greet"}), + DefinePrevUserUtteredFeaturization(use_text_for_featurization=False), + ActionExecuted("utter_greet"), + ActionExecuted(ACTION_LISTEN_NAME), + # End-To-End Training Story + UserUttered(text="Hi"), + ActionExecuted(action_text="Hi, 
I'm a bot."), + ActionExecuted(ACTION_LISTEN_NAME), + # End-To-End Prediction Story + UserUttered("Hi", intent={"name": "greet"}), + DefinePrevUserUtteredFeaturization(use_text_for_featurization=True), + ActionExecuted(action_text="Hi, I'm a bot."), + ActionExecuted(ACTION_LISTEN_NAME), + ] + + tracker = DialogueStateTracker.from_events(story_name, events) + dump = YAMLStoryWriter().dumps(tracker.as_story().story_steps) + + assert ( + dump.strip() + == textwrap.dedent( + f""" + version: "2.0" + stories: + - story: {story_name} + steps: + - intent: greet + - action: utter_greet + - intent: greet + - action: utter_greet + - user: |- + Hi + - bot: Hi, I'm a bot. + - user: |- + Hi + - bot: Hi, I'm a bot. + """ + ).strip() + ) + + +def test_reading_and_writing_end_to_end_stories_in_test_mode(default_domain: Domain): + story_name = "test_writing_end_to_end_stories_in_test_mode" + + conversation_tests = f""" +stories: +- story: {story_name} + steps: + - intent: greet + user: Hi + - action: utter_greet + - intent: greet + user: | + [Hi](test) + - action: utter_greet + - user: Hi + - bot: Hi, I'm a bot. + - user: | + [Hi](test) + - bot: Hi, I'm a bot. + """ + + end_to_end_tests = YAMLStoryReader().read_from_string(conversation_tests) + dump = YAMLStoryWriter().dumps(end_to_end_tests, is_test_story=True) + + assert ( + dump.strip() + == textwrap.dedent( + f""" + version: "2.0" + stories: + - story: {story_name} + steps: + - intent: greet + user: |- + Hi + - action: utter_greet + - intent: greet + user: |- + [Hi](test) + - action: utter_greet + - user: |- + Hi + - bot: Hi, I'm a bot. + - user: |- + [Hi](test) + - bot: Hi, I'm a bot. + """ + ).strip() + ) diff --git a/tests/shared/core/training_data/test_structures.py b/tests/shared/core/training_data/test_structures.py index 5794aac4eb9f..c6e2236ac815 100644 --- a/tests/shared/core/training_data/test_structures.py +++ b/tests/shared/core/training_data/test_structures.py @@ -1,9 +1,22 @@ import rasa.core from rasa.shared.core.constants import ACTION_SESSION_START_NAME from rasa.shared.core.domain import Domain -from rasa.shared.core.events import SessionStarted, SlotSet, UserUttered, ActionExecuted +from rasa.shared.core.events import ( + SessionStarted, + SlotSet, + UserUttered, + ActionExecuted, + DefinePrevUserUtteredFeaturization, +) from rasa.shared.core.trackers import DialogueStateTracker +from rasa.shared.core.training_data.story_reader.yaml_story_reader import ( + YAMLStoryReader, +) +from rasa.shared.core.training_data.story_writer.yaml_story_writer import ( + YAMLStoryWriter, +) from rasa.shared.core.training_data.structures import Story +from rasa.shared.nlu.constants import INTENT_NAME_KEY domain = Domain.load("examples/moodbot/domain.yml") @@ -19,17 +32,28 @@ def test_session_start_is_not_serialised(default_domain: Domain): # add the two SessionStarted events and a user event tracker.update(ActionExecuted(ACTION_SESSION_START_NAME)) tracker.update(SessionStarted()) - tracker.update(UserUttered("say something")) + tracker.update( + UserUttered("say something", intent={INTENT_NAME_KEY: "some_intent"}) + ) + tracker.update(DefinePrevUserUtteredFeaturization(False)) - # make sure session start is not serialised - story = Story.from_events(tracker.events, "some-story01") + YAMLStoryWriter().dumps( + Story.from_events(tracker.events, "some-story01").story_steps + ) - expected = """## some-story01 - - slot{"slot": "value"} -* say something + expected = """version: "2.0" +stories: +- story: some-story01 + steps: + - slot_was_set: + - slot: value 
+ - intent: some_intent """ - assert story.as_story_string(flat=True) == expected + actual = YAMLStoryWriter().dumps( + Story.from_events(tracker.events, "some-story01").story_steps + ) + assert actual == expected def test_as_story_string_or_statement(): diff --git a/tests/shared/importers/test_importer.py b/tests/shared/importers/test_importer.py index 3c6b98f517e3..1f8d98c12bb2 100644 --- a/tests/shared/importers/test_importer.py +++ b/tests/shared/importers/test_importer.py @@ -17,7 +17,6 @@ CombinedDataImporter, TrainingDataImporter, NluDataImporter, - CoreDataImporter, E2EImporter, ResponsesSyncImporter, ) @@ -155,29 +154,6 @@ async def test_nlu_only(project: Text): assert not nlu_data.is_empty() -async def test_core_only(project: Text): - config_path = os.path.join(project, DEFAULT_CONFIG_PATH) - domain_path = os.path.join(project, DEFAULT_DOMAIN_PATH) - default_data_path = os.path.join(project, DEFAULT_DATA_PATH) - actual = TrainingDataImporter.load_core_importer_from_config( - config_path, domain_path, training_data_paths=[default_data_path] - ) - - assert isinstance(actual, CoreDataImporter) - - stories = await actual.get_stories() - assert not stories.is_empty() - - domain = await actual.get_domain() - assert not domain.is_empty() - - config = await actual.get_config() - assert config - - nlu_data = await actual.get_nlu_data() - assert nlu_data.is_empty() - - async def test_import_nlu_training_data_from_e2e_stories( default_importer: TrainingDataImporter, ): @@ -210,9 +186,9 @@ async def mocked_stories(*_: Any, **__: Any) -> StoryGraph: importer_without_e2e.get_stories = mocked_stories # The wrapping `E2EImporter` simply forwards these method calls - assert (await importer_without_e2e.get_stories()).as_story_string() == ( + assert (await importer_without_e2e.get_stories()).fingerprint() == ( await default_importer.get_stories() - ).as_story_string() + ).fingerprint() assert (await importer_without_e2e.get_config()) == ( await default_importer.get_config() ) @@ -333,7 +309,10 @@ async def mocked_stories(*_: Any, **__: Any) -> StoryGraph: domain = await default_importer.get_domain() - assert all(action_name in domain.action_names for action_name in additional_actions) + assert all( + action_name in domain.action_names_or_texts + for action_name in additional_actions + ) async def test_nlu_data_domain_sync_with_retrieval_intents(project: Text): @@ -343,16 +322,10 @@ async def test_nlu_data_domain_sync_with_retrieval_intents(project: Text): "data/test_nlu/default_retrieval_intents.md", "data/test_responses/default.md", ] - base_data_importer = TrainingDataImporter.load_from_dict( + importer = TrainingDataImporter.load_from_dict( {}, config_path, domain_path, data_paths ) - nlu_importer = NluDataImporter(base_data_importer) - core_importer = CoreDataImporter(base_data_importer) - - importer = ResponsesSyncImporter( - CombinedDataImporter([nlu_importer, core_importer]) - ) domain = await importer.get_domain() nlu_data = await importer.get_nlu_data() @@ -360,7 +333,7 @@ async def test_nlu_data_domain_sync_with_retrieval_intents(project: Text): assert domain.intent_properties["chitchat"].get("is_retrieval_intent") assert domain.retrieval_intent_templates == nlu_data.responses assert domain.templates != nlu_data.responses - assert "utter_chitchat" in domain.action_names + assert "utter_chitchat" in domain.action_names_or_texts async def test_nlu_data_domain_sync_responses(project: Text): @@ -368,16 +341,10 @@ async def test_nlu_data_domain_sync_responses(project: Text): domain_path = 
"data/test_domains/default.yml" data_paths = ["data/test_nlg/test_responses.yml"] - base_data_importer = TrainingDataImporter.load_from_dict( + importer = TrainingDataImporter.load_from_dict( {}, config_path, domain_path, data_paths ) - nlu_importer = NluDataImporter(base_data_importer) - core_importer = CoreDataImporter(base_data_importer) - - importer = ResponsesSyncImporter( - CombinedDataImporter([nlu_importer, core_importer]) - ) with pytest.warns(None): domain = await importer.get_domain() diff --git a/tests/shared/importers/test_multi_project.py b/tests/shared/importers/test_multi_project.py index 0c1dfac517ed..0565980c3e8d 100644 --- a/tests/shared/importers/test_multi_project.py +++ b/tests/shared/importers/test_multi_project.py @@ -364,4 +364,4 @@ async def test_multi_project_training(trained_async): "utter_goodbye", ] - assert all([a in domain.action_names for a in expected_actions]) + assert all([a in domain.action_names_or_texts for a in expected_actions]) diff --git a/tests/shared/importers/test_rasa.py b/tests/shared/importers/test_rasa.py index 8ba8f3c299e2..55c1814e3ba9 100644 --- a/tests/shared/importers/test_rasa.py +++ b/tests/shared/importers/test_rasa.py @@ -24,7 +24,7 @@ async def test_rasa_file_importer(project: Text): assert len(domain.intents) == 7 + len(DEFAULT_INTENTS) assert domain.slots == [] assert domain.entities == [] - assert len(domain.action_names) == 17 + assert len(domain.action_names_or_texts) == 17 assert len(domain.templates) == 6 stories = await importer.get_stories() diff --git a/tests/shared/nlu/training_data/test_message.py b/tests/shared/nlu/training_data/test_message.py index 802f6026bf28..25ad054668f9 100644 --- a/tests/shared/nlu/training_data/test_message.py +++ b/tests/shared/nlu/training_data/test_message.py @@ -252,7 +252,7 @@ def test_features_present( @pytest.mark.parametrize( - "message, core_message", + "message, result", [ (Message({INTENT: "intent", TEXT: "text"}), False), (Message({RESPONSE: "response", TEXT: "text"}), False), @@ -262,7 +262,7 @@ def test_features_present( (Message({TEXT: "text"}), True), ], ) -def test_is_core_message( - message: Message, core_message: bool, +def test_is_core_or_domain_message( + message: Message, result: bool, ): - assert core_message == message.is_core_message() + assert result == message.is_core_or_domain_message() diff --git a/tests/shared/nlu/training_data/test_training_data.py b/tests/shared/nlu/training_data/test_training_data.py index a0acba1460d1..f528c7dbc97b 100644 --- a/tests/shared/nlu/training_data/test_training_data.py +++ b/tests/shared/nlu/training_data/test_training_data.py @@ -1,3 +1,5 @@ +import asyncio +from pathlib import Path from typing import Text, List import pytest @@ -28,6 +30,10 @@ ) import rasa.shared.data +from rasa.shared.core.domain import Domain +from rasa.shared.core.events import UserUttered, ActionExecuted +from rasa.shared.core.training_data.structures import StoryGraph, StoryStep +from rasa.shared.importers.importer import TrainingDataImporter, E2EImporter def test_luis_data(): @@ -626,6 +632,40 @@ def test_custom_attributes(tmp_path): assert example.get("sentiment") == 0.8 +async def test_without_additional_e2e_examples(tmp_path: Path): + domain_path = tmp_path / "domain.yml" + domain_path.write_text(Domain.empty().as_yaml()) + + config_path = tmp_path / "config.yml" + config_path.touch() + + existing = TrainingDataImporter.load_from_dict( + {}, str(config_path), str(domain_path), [] + ) + + stories = StoryGraph( + [ + StoryStep( + events=[ + 
UserUttered(None, {"name": "greet_from_stories"}), + ActionExecuted("utter_greet_from_stories"), + ] + ) + ] + ) + + # Patch to return our test stories + existing.get_stories = asyncio.coroutine(lambda *args: stories) + + importer = E2EImporter(existing) + + training_data = await importer.get_nlu_data() + + assert training_data.training_examples + assert not training_data.is_empty() + assert len(training_data.nlu_examples) == 0 + + def test_fingerprint_is_same_when_loading_data_again(): from rasa.shared.importers.utils import training_data_from_paths diff --git a/tests/shared/test_data.py b/tests/shared/test_data.py index a06baf4d2c69..0b21c73d5301 100644 --- a/tests/shared/test_data.py +++ b/tests/shared/test_data.py @@ -213,21 +213,17 @@ def test_get_core_nlu_directories_with_none(): assert all(not os.listdir(directory) for directory in directories) -def test_same_file_names_get_resolved(tmp_path): +def test_same_file_names_get_resolved(tmp_path: Path): # makes sure the resolution properly handles if there are two files with # with the same name in different directories (tmp_path / "one").mkdir() (tmp_path / "two").mkdir() - data_dir_one = str(tmp_path / "one" / "stories.md") - data_dir_two = str(tmp_path / "two" / "stories.md") - shutil.copy2(DEFAULT_STORIES_FILE, data_dir_one) - shutil.copy2(DEFAULT_STORIES_FILE, data_dir_two) + shutil.copy2(DEFAULT_STORIES_FILE, tmp_path / "one" / "stories.yml") + shutil.copy2(DEFAULT_STORIES_FILE, tmp_path / "two" / "stories.yml") - nlu_dir_one = str(tmp_path / "one" / "nlu.yml") - nlu_dir_two = str(tmp_path / "two" / "nlu.yml") - shutil.copy2(DEFAULT_NLU_DATA, nlu_dir_one) - shutil.copy2(DEFAULT_NLU_DATA, nlu_dir_two) + shutil.copy2(DEFAULT_NLU_DATA, tmp_path / "one" / "nlu.yml") + shutil.copy2(DEFAULT_NLU_DATA, tmp_path / "two" / "nlu.yml") core_directory, nlu_directory = rasa.shared.data.get_core_nlu_directories( [str(tmp_path)] @@ -241,7 +237,7 @@ def test_same_file_names_get_resolved(tmp_path): stories = os.listdir(core_directory) assert len(stories) == 2 - assert all(f.endswith("stories.md") for f in stories) + assert all(f.endswith("stories.yml") for f in stories) @pytest.mark.parametrize( diff --git a/tests/test_server.py b/tests/test_server.py index a04b201cd927..42e562305859 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -424,26 +424,19 @@ async def test_parse_on_invalid_emulation_mode(rasa_app_nlu: SanicASGITestClient assert response.status == HTTPStatus.BAD_REQUEST -async def test_train_stack_success( +async def test_train_stack_success_with_md( rasa_app: SanicASGITestClient, default_domain_path: Text, - default_stories_file: Text, default_stack_config: Text, default_nlu_data: Text, tmp_path: Path, ): - with ExitStack() as stack: - domain_file = stack.enter_context(open(default_domain_path)) - config_file = stack.enter_context(open(default_stack_config)) - stories_file = stack.enter_context(open(default_stories_file)) - nlu_file = stack.enter_context(open(default_nlu_data)) - - payload = dict( - domain=domain_file.read(), - config=config_file.read(), - stories=stories_file.read(), - nlu=nlu_file.read(), - ) + payload = dict( + domain=Path(default_domain_path).read_text(), + config=Path(default_stack_config).read_text(), + stories=Path("data/test_stories/stories_defaultdomain.md").read_text(), + nlu=Path(default_nlu_data).read_text(), + ) _, response = await rasa_app.post("/model/train", json=payload) assert response.status == HTTPStatus.OK @@ -496,25 +489,24 @@ async def test_train_nlu_success( assert 
os.path.exists(os.path.join(model_path, "fingerprint.json")) -async def test_train_core_success( +async def test_train_core_success_with_yaml( rasa_app: SanicASGITestClient, default_stack_config: Text, default_stories_file: Text, default_domain_path: Text, tmp_path: Path, ): - with ExitStack() as stack: - domain_file = stack.enter_context(open(default_domain_path)) - config_file = stack.enter_context(open(default_stack_config)) - core_file = stack.enter_context(open(default_stories_file)) + payload = f""" +{Path(default_domain_path).read_text()} +{Path(default_stack_config).read_text()} +{Path(default_stories_file).read_text()} + """ - payload = dict( - domain=domain_file.read(), - config=config_file.read(), - stories=core_file.read(), - ) - - _, response = await rasa_app.post("/model/train", json=payload) + _, response = await rasa_app.post( + "/model/train", + data=payload, + headers={"Content-type": rasa.server.YAML_CONTENT_TYPE}, + ) assert response.status == HTTPStatus.OK # save model to temporary file @@ -716,7 +708,11 @@ async def test_evaluate_stories( ): stories = rasa.shared.utils.io.read_file(default_stories_file) - _, response = await rasa_app.post("/model/test/stories", data=stories) + _, response = await rasa_app.post( + "/model/test/stories", + data=stories, + headers={"Content-type": rasa.server.YAML_CONTENT_TYPE}, + ) assert response.status == HTTPStatus.OK @@ -750,9 +746,9 @@ async def test_evaluate_stories_not_ready_agent( async def test_evaluate_stories_end_to_end( - rasa_app: SanicASGITestClient, end_to_end_story_file: Text + rasa_app: SanicASGITestClient, end_to_end_test_story_file: Text ): - stories = rasa.shared.utils.io.read_file(end_to_end_story_file) + stories = rasa.shared.utils.io.read_file(end_to_end_test_story_file) _, response = await rasa_app.post("/model/test/stories?e2e=true", data=stories) @@ -1052,6 +1048,7 @@ async def test_requesting_non_existent_tracker(rasa_app: SanicASGITestClient): "policy": None, "confidence": 1, "timestamp": 1514764800, + "action_text": None, }, {"event": "session_started", "timestamp": 1514764800}, { "event": "action", "policy": None, "confidence": None, "timestamp": 1514764800, + "action_text": None, }, ] assert content["latest_message"] == { diff --git a/tests/test_test.py b/tests/test_test.py index d0629e8ad6c2..aea5aed8b157 100644 --- a/tests/test_test.py +++ b/tests/test_test.py @@ -184,10 +184,10 @@ def test_write_classification_errors(): WronglyClassifiedUserUtterance( UserUttered("Hello", {"name": "goodbye"}), evaluation ), - WronglyPredictedAction("utter_greet", "utter_goodbye"), + WronglyPredictedAction("utter_greet", "", "utter_goodbye"), ] tracker = DialogueStateTracker.from_events("default", events) - dump = YAMLStoryWriter().dumps(tracker.as_story().story_steps, is_test_story=True) + dump = YAMLStoryWriter().dumps(tracker.as_story().story_steps) assert ( dump.strip() == textwrap.dedent( @@ -197,8 +197,6 @@ - story: default steps: - intent: greet # predicted: goodbye: Hello - user: |- - Hello - action: utter_greet # predicted: utter_goodbye """ diff --git a/tests/test_train.py b/tests/test_train.py index c9380371d9ef..35e2e167827d 100644 --- a/tests/test_train.py +++ b/tests/test_train.py @@ -1,12 +1,15 @@ +import logging +import secrets import sys import tempfile import os from pathlib import Path from typing import Text, Dict, Any -from unittest.mock import Mock, create_autospec +from unittest.mock 
import Mock import pytest from _pytest.capture import CaptureFixture +from _pytest.logging import LogCaptureFixture from _pytest.monkeypatch import MonkeyPatch from rasa.core.policies.ted_policy import TEDPolicy @@ -353,6 +356,258 @@ def test_train_nlu_autoconfig( assert args[1] == autoconfig.TrainingType.NLU +def mock_async(monkeypatch: MonkeyPatch, target: Any, name: Text) -> Mock: + mock = Mock() + + async def mock_async_func(*args: Any, **kwargs: Any) -> None: + mock(*args, **kwargs) + + monkeypatch.setattr(target, name, mock_async_func) + return mock + + +def mock_core_training(monkeypatch: MonkeyPatch) -> Mock: + return mock_async(monkeypatch, rasa.core, rasa.core.train.__name__) + + +def mock_nlu_training(monkeypatch: MonkeyPatch) -> Mock: + return mock_async(monkeypatch, rasa.nlu, rasa.nlu.train.__name__) + + +def new_model_path_in_same_dir(old_model_path: Text) -> Text: + return str(Path(old_model_path).parent / (secrets.token_hex(8) + ".tar.gz")) + + +class TestE2e: + def test_e2e_gives_experimental_warning( + self, + monkeypatch: MonkeyPatch, + trained_e2e_model: Text, + default_domain_path: Text, + default_stack_config: Text, + default_e2e_stories_file: Text, + default_nlu_data: Text, + caplog: LogCaptureFixture, + ): + mock_nlu_training(monkeypatch) + mock_core_training(monkeypatch) + + with caplog.at_level(logging.WARNING): + train( + default_domain_path, + default_stack_config, + [default_e2e_stories_file, default_nlu_data], + output=new_model_path_in_same_dir(trained_e2e_model), + ) + + assert any( + [ + "The end-to-end training is currently experimental" in record.message + for record in caplog.records + ] + ) + + def test_models_not_retrained_if_no_new_data( + self, + monkeypatch: MonkeyPatch, + trained_e2e_model: Text, + default_domain_path: Text, + default_stack_config: Text, + default_e2e_stories_file: Text, + default_nlu_data: Text, + ): + mocked_nlu_training = mock_nlu_training(monkeypatch) + mocked_core_training = mock_core_training(monkeypatch) + + train( + default_domain_path, + default_stack_config, + [default_e2e_stories_file, default_nlu_data], + output=new_model_path_in_same_dir(trained_e2e_model), + ) + + mocked_core_training.assert_not_called() + mocked_nlu_training.assert_not_called() + + def test_retrains_nlu_and_core_if_new_e2e_example( + self, + monkeypatch: MonkeyPatch, + trained_e2e_model: Text, + default_domain_path: Text, + default_stack_config: Text, + default_e2e_stories_file: Text, + default_nlu_data: Text, + tmp_path: Path, + ): + stories_yaml = rasa.shared.utils.io.read_yaml_file(default_e2e_stories_file) + stories_yaml["stories"][1]["steps"].append({"user": "new message!"}) + + new_stories_file = tmp_path / "new_stories.yml" + rasa.shared.utils.io.write_yaml(stories_yaml, new_stories_file) + + mocked_nlu_training = mock_nlu_training(monkeypatch) + mocked_core_training = mock_core_training(monkeypatch) + + new_model_path = train( + default_domain_path, + default_stack_config, + [new_stories_file, default_nlu_data], + output=new_model_path_in_same_dir(trained_e2e_model), + ).model + os.remove(new_model_path) + + mocked_core_training.assert_called_once() + mocked_nlu_training.assert_called_once() + + def test_retrains_only_core_if_new_e2e_example_seen_before( + self, + monkeypatch: MonkeyPatch, + trained_e2e_model: Text, + default_domain_path: Text, + default_stack_config: Text, + default_e2e_stories_file: Text, + default_nlu_data: Text, + tmp_path: Path, + ): + stories_yaml = rasa.shared.utils.io.read_yaml_file(default_e2e_stories_file) + 
stories_yaml["stories"][1]["steps"].append({"user": "Yes"}) + + new_stories_file = new_stories_file = tmp_path / "new_stories.yml" + rasa.shared.utils.io.write_yaml(stories_yaml, new_stories_file) + + mocked_nlu_training = mock_nlu_training(monkeypatch) + mocked_core_training = mock_core_training(monkeypatch) + + new_model_path = train( + default_domain_path, + default_stack_config, + [new_stories_file, default_nlu_data], + output=new_model_path_in_same_dir(trained_e2e_model), + ).model + os.remove(new_model_path) + + mocked_core_training.assert_called_once() + mocked_nlu_training.assert_not_called() + + def test_nlu_and_core_trained_if_no_nlu_data_but_e2e_stories( + self, + monkeypatch: MonkeyPatch, + default_domain_path: Text, + default_stack_config: Text, + default_e2e_stories_file: Text, + tmp_path: Path, + ): + mocked_nlu_training = mock_nlu_training(monkeypatch) + mocked_core_training = mock_core_training(monkeypatch) + + output = self.make_tmp_model_dir(tmp_path) + train( + default_domain_path, + default_stack_config, + [default_e2e_stories_file], + output=output, + ) + + mocked_core_training.assert_called_once() + mocked_nlu_training.assert_called_once() + + @staticmethod + def make_tmp_model_dir(tmp_path: Path) -> Text: + (tmp_path / "models").mkdir() + output = str(tmp_path / "models") + return output + + def test_new_nlu_data_retrains_core_if_there_are_e2e_stories( + self, + monkeypatch: MonkeyPatch, + trained_e2e_model: Text, + default_domain_path: Text, + default_stack_config: Text, + default_e2e_stories_file: Text, + default_nlu_data: Text, + tmp_path: Path, + ): + nlu_yaml = rasa.shared.utils.io.read_yaml_file(default_nlu_data) + nlu_yaml["nlu"][0]["examples"] += "- surprise!\n" + + new_nlu_file = tmp_path / "new_nlu.yml" + rasa.shared.utils.io.write_yaml(nlu_yaml, new_nlu_file) + + mocked_nlu_training = mock_nlu_training(monkeypatch) + mocked_core_training = mock_core_training(monkeypatch) + + new_model_path = train( + default_domain_path, + default_stack_config, + [default_e2e_stories_file, new_nlu_file], + output=new_model_path_in_same_dir(trained_e2e_model), + ).model + os.remove(new_model_path) + + mocked_core_training.assert_called_once() + mocked_nlu_training.assert_called_once() + + def test_new_nlu_data_does_not_retrain_core_if_there_are_no_e2e_stories( + self, + monkeypatch: MonkeyPatch, + trained_simple_rasa_model: Text, + default_domain_path: Text, + default_stack_config: Text, + simple_stories_file: Text, + default_nlu_data: Text, + tmp_path: Path, + ): + nlu_yaml = rasa.shared.utils.io.read_yaml_file(default_nlu_data) + nlu_yaml["nlu"][0]["examples"] += "- surprise!\n" + + new_nlu_file = tmp_path / "new_nlu.yml" + rasa.shared.utils.io.write_yaml(nlu_yaml, new_nlu_file) + + mocked_nlu_training = mock_nlu_training(monkeypatch) + mocked_core_training = mock_core_training(monkeypatch) + + new_model_path = train( + default_domain_path, + default_stack_config, + [simple_stories_file, new_nlu_file], + output=new_model_path_in_same_dir(trained_simple_rasa_model), + ).model + os.remove(new_model_path) + + mocked_core_training.assert_not_called() + mocked_nlu_training.assert_called_once() + + def test_training_core_with_e2e_fails_gracefully( + self, + capsys: CaptureFixture, + monkeypatch: MonkeyPatch, + tmp_path: Path, + default_domain_path: Text, + default_stack_config: Text, + default_e2e_stories_file: Text, + ): + + mocked_nlu_training = mock_nlu_training(monkeypatch) + mocked_core_training = mock_core_training(monkeypatch) + + output = 
+    def test_training_core_with_e2e_fails_gracefully(
+        self,
+        capsys: CaptureFixture,
+        monkeypatch: MonkeyPatch,
+        tmp_path: Path,
+        default_domain_path: Text,
+        default_stack_config: Text,
+        default_e2e_stories_file: Text,
+    ):
+        mocked_nlu_training = mock_nlu_training(monkeypatch)
+        mocked_core_training = mock_core_training(monkeypatch)
+
+        output = self.make_tmp_model_dir(tmp_path)
+        train_core(
+            default_domain_path,
+            default_stack_config,
+            default_e2e_stories_file,
+            output=output,
+        )
+
+        mocked_core_training.assert_not_called()
+        mocked_nlu_training.assert_not_called()
+
+        captured = capsys.readouterr()
+        assert (
+            "Stories file contains e2e stories. "
+            "Please train using `rasa train` so that the NLU model is also trained."
+        ) in captured.out
+
+
 @pytest.mark.timeout(300)
 @pytest.mark.parametrize("use_latest_model", [True, False])
 def test_model_finetuning(
diff --git a/tests/test_validator.py b/tests/test_validator.py
index dafeb2a2e66f..74de62caa13b 100644
--- a/tests/test_validator.py
+++ b/tests/test_validator.py
@@ -3,8 +3,10 @@
 import rasa.shared.utils.io
 from rasa.validator import Validator
 from rasa.shared.importers.rasa import RasaFileImporter
+from rasa.shared.importers.autoconfig import TrainingType
 from tests.conftest import DEFAULT_NLU_DATA
 from tests.core.conftest import DEFAULT_STORIES_FILE
+from pathlib import Path


 async def test_verify_intents_does_not_fail_on_valid_data():
@@ -77,6 +79,98 @@ async def test_verify_bad_story_structure():
     assert not validator.verify_story_structure(ignore_warnings=False)


+async def test_verify_bad_e2e_story_structure_when_text_identical(tmp_path: Path):
+    story_file_name = tmp_path / "stories.yml"
+    story_file_name.write_text(
+        """
+        version: "2.0"
+        stories:
+        - story: path 1
+          steps:
+          - user: |
+              amazing!
+          - action: utter_happy
+        - story: path 2 (should always conflict path 1)
+          steps:
+          - user: |
+              amazing!
+          - action: utter_cheer_up
+        """
+    )
+    # The two stories above have identical user texts and must conflict.
+    importer = RasaFileImporter(
+        config_file="data/test_config/config_defaults.yml",
+        domain_path="data/test_domains/default.yml",
+        training_data_paths=[story_file_name],
+        training_type=TrainingType.NLU,
+    )
+    validator = await Validator.from_importer(importer)
+    assert not validator.verify_story_structure(ignore_warnings=False)
+
+
+async def test_verify_bad_e2e_story_structure_when_text_differs_by_whitespace(
+    tmp_path: Path,
+):
+    story_file_name = tmp_path / "stories.yml"
+    story_file_name.write_text(
+        """
+        version: "2.0"
+        stories:
+        - story: path 1
+          steps:
+          - user: |
+              truly amazing!
+          - action: utter_happy
+        - story: path 2 (should always conflict path 1)
+          steps:
+          - user: |
+              truly  amazing!
+          - action: utter_cheer_up
+        """
+    )
+    importer = RasaFileImporter(
+        config_file="data/test_config/config_defaults.yml",
+        domain_path="data/test_domains/default.yml",
+        training_data_paths=[story_file_name],
+        training_type=TrainingType.NLU,
+    )
+    validator = await Validator.from_importer(importer)
+    assert not validator.verify_story_structure(ignore_warnings=False)
+
+
+async def test_verify_correct_e2e_story_structure(tmp_path: Path):
+    story_file_name = tmp_path / "stories.yml"
+    with open(story_file_name, "w") as file:
+        file.write(
+            """
+            stories:
+            - story: path 1
+              steps:
+              - user: |
+                  hello assistant! Can you help me today?
+              - action: utter_greet
+            - story: path 2 - state is similar but different from the one in path 1
+              steps:
+              - user: |
+                  hello assistant! you Can help me today?
+              - action: utter_goodbye
+            - story: path 3
+              steps:
+              - user: |
+                  That's it for today. Chat again tomorrow!
+              - action: utter_goodbye
+            """
+        )
+    importer = RasaFileImporter(
+        config_file="data/test_config/config_defaults.yml",
+        domain_path="data/test_domains/default.yml",
+        training_data_paths=[story_file_name],
+        training_type=TrainingType.NLU,
+    )
+    validator = await Validator.from_importer(importer)
+    assert validator.verify_story_structure(ignore_warnings=False)
+
+
 async def test_verify_story_structure_ignores_rules():
     importer = RasaFileImporter(
         domain_path="data/test_domains/default.yml",
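The three validator tests above depend on comparing tokenized user text instead of raw strings, so texts that differ only in whitespace produce the same dialogue state while genuinely different texts do not. A rough sketch of the idea, with plain whitespace splitting standing in for the configured tokenizer (an assumption for illustration):

```python
# Whitespace tokenization as a stand-in for the pipeline's tokenizer.
def state_signature(user_text: str) -> tuple:
    return tuple(user_text.split())


# Differs only by whitespace -> same state -> story conflict.
assert state_signature("truly amazing!") == state_signature("truly  amazing!")

# Different token order -> different state -> no conflict.
assert state_signature("hello assistant! Can you help me today?") != state_signature(
    "hello assistant! you Can help me today?"
)
```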
diff --git a/tests/utils/tensorflow/test_model_data.py b/tests/utils/tensorflow/test_model_data.py
index 26cf4d2d1781..304179434b13 100644
--- a/tests/utils/tensorflow/test_model_data.py
+++ b/tests/utils/tensorflow/test_model_data.py
@@ -1,64 +1,178 @@
 import copy
+from typing import Union, List

 import pytest
 import scipy.sparse
 import numpy as np

-from rasa.utils.tensorflow.model_data import RasaModelData
+from rasa.utils.tensorflow.model_data import RasaModelData, FeatureArray


 @pytest.fixture
 async def model_data() -> RasaModelData:
     return RasaModelData(
-        label_key="intent",
+        label_key="label",
         label_sub_key="ids",
         data={
-            "text_features": {
+            "text": {
                 "sentence": [
-                    np.array(
-                        [
-                            np.random.rand(5, 14),
-                            np.random.rand(2, 14),
-                            np.random.rand(3, 14),
-                            np.random.rand(1, 14),
-                            np.random.rand(3, 14),
-                        ]
+                    FeatureArray(
+                        np.array(
+                            [
+                                np.random.rand(5, 14),
+                                np.random.rand(2, 14),
+                                np.random.rand(3, 14),
+                                np.random.rand(1, 14),
+                                np.random.rand(3, 14),
+                            ]
+                        ),
+                        number_of_dimensions=3,
                     ),
-                    np.array(
-                        [
-                            scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))),
-                            scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))),
-                            scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))),
-                            scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))),
-                            scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))),
-                        ]
+                    FeatureArray(
+                        np.array(
+                            [
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(5, size=(5, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(5, size=(2, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(5, size=(3, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(5, size=(1, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(5, size=(3, 10))
+                                ),
+                            ]
+                        ),
+                        number_of_dimensions=3,
                     ),
                 ]
             },
-            "intent_features": {
+            "action_text": {
+                "sequence": [
+                    FeatureArray(
+                        np.array(
+                            [
+                                [
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(5, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(2, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(3, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(1, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(3, 10))
+                                    ),
+                                ],
+                                [
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(5, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(2, 10))
+                                    ),
+                                ],
+                                [
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(5, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(1, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(3, 10))
+                                    ),
+                                ],
+                                [
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(3, 10))
+                                    )
+                                ],
+                                [
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(3, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(1, 10))
+                                    ),
+                                    scipy.sparse.csr_matrix(
+                                        np.random.randint(5, size=(7, 10))
+                                    ),
+                                ],
+                            ]
+                        ),
+                        number_of_dimensions=4,
+                    ),
+                    FeatureArray(
+                        np.array(
+                            [
+                                [
+                                    np.random.rand(5, 14),
+                                    np.random.rand(2, 14),
+                                    np.random.rand(3, 14),
+                                    np.random.rand(1, 14),
+                                    np.random.rand(3, 14),
+                                ],
+                                [np.random.rand(5, 14), np.random.rand(2, 14)],
+                                [
+                                    np.random.rand(5, 14),
+                                    np.random.rand(1, 14),
+                                    np.random.rand(3, 14),
+                                ],
+                                [np.random.rand(3, 14)],
+                                [
+                                    np.random.rand(3, 14),
+                                    np.random.rand(1, 14),
+                                    np.random.rand(7, 14),
+                                ],
+                            ]
+                        ),
+                        number_of_dimensions=4,
+                    ),
+                ]
+            },
+            "dialogue": {
                 "sentence": [
-                    np.array(
-                        [
-                            np.random.randint(2, size=(5, 10)),
-                            np.random.randint(2, size=(2, 10)),
-                            np.random.randint(2, size=(3, 10)),
-                            np.random.randint(2, size=(1, 10)),
-                            np.random.randint(2, size=(3, 10)),
-                        ]
+                    FeatureArray(
+                        np.array(
+                            [
+                                np.random.randint(2, size=(5, 10)),
+                                np.random.randint(2, size=(2, 10)),
+                                np.random.randint(2, size=(3, 10)),
+                                np.random.randint(2, size=(1, 10)),
+                                np.random.randint(2, size=(3, 10)),
+                            ]
+                        ),
+                        number_of_dimensions=3,
                     )
                 ]
             },
-            "intent": {"ids": [np.array([0, 1, 0, 1, 1])]},
+            "label": {
+                "ids": [FeatureArray(np.array([0, 1, 0, 1, 1]), number_of_dimensions=1)]
+            },
             "entities": {
                 "tag_ids": [
-                    np.array(
-                        [
-                            np.array([[0], [1], [1], [0], [2]]),
-                            np.array([[2], [0]]),
-                            np.array([[0], [1], [1]]),
-                            np.array([[0], [1]]),
-                            np.array([[0], [0], [0]]),
-                        ]
+                    FeatureArray(
+                        np.array(
+                            [
+                                np.array([[0], [1], [1], [0], [2]]),
+                                np.array([[2], [0]]),
+                                np.array([[0], [1], [1]]),
+                                np.array([[0], [1]]),
+                                np.array([[0], [0], [0]]),
+                            ]
+                        ),
+                        number_of_dimensions=3,
                     )
                 ]
             },
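Every array in the rewritten fixture above is wrapped in a `FeatureArray` that records an explicit `number_of_dimensions`, because ragged numpy object arrays cannot report their effective rank themselves. Constructed exactly as in the fixture:

```python
import numpy as np

from rasa.utils.tensorflow.model_data import FeatureArray

# Two dialogue turns with different sequence lengths form a ragged
# (object-dtype) array; the wrapper records that it is logically 3D.
ragged = FeatureArray(
    np.array([np.random.rand(5, 14), np.random.rand(2, 14)]),
    number_of_dimensions=3,
)
assert ragged.number_of_dimensions == 3
```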
@@ -86,12 +200,17 @@ def test_shuffle_session_data(model_data: RasaModelData):

 def test_split_data_by_label(model_data: RasaModelData):
     split_model_data = model_data._split_by_label_ids(
-        model_data.data, model_data.get("intent", "ids")[0], np.array([0, 1])
+        model_data.data, model_data.get("label", "ids")[0], np.array([0, 1])
     )

     assert len(split_model_data) == 2
     for s in split_model_data:
-        assert len(set(s.get("intent", "ids")[0])) == 1
+        assert len(set(s.get("label", "ids")[0])) == 1
+
+    for key, attribute_data in split_model_data[0].items():
+        for sub_key, features in attribute_data.items():
+            assert len(features) == len(model_data.data[key][sub_key])
+            assert len(features[0]) == 2


 def test_split_data_by_none_label(model_data: RasaModelData):
@@ -106,9 +225,9 @@ def test_split_data_by_none_label(model_data: RasaModelData):
     test_data = split_model_data[1]

     # train data should have 3 examples
-    assert len(train_data.get("intent", "ids")[0]) == 3
+    assert len(train_data.get("label", "ids")[0]) == 3
     # test data should have 2 examples
-    assert len(test_data.get("intent", "ids")[0]) == 2
+    assert len(test_data.get("label", "ids")[0]) == 2


 def test_train_val_split(model_data: RasaModelData):
@@ -121,17 +240,23 @@ def test_train_val_split(model_data: RasaModelData):
         assert len(data) == len(train_model_data.get(key, sub_key))
         assert len(data) == len(test_model_data.get(key, sub_key))
         for i, v in enumerate(data):
-            assert v[0].dtype == train_model_data.get(key, sub_key)[i][0].dtype
+            if isinstance(v[0], list):
+                assert (
+                    v[0][0].dtype
+                    == train_model_data.get(key, sub_key)[i][0][0].dtype
+                )
+            else:
+                assert v[0].dtype == train_model_data.get(key, sub_key)[i][0].dtype

     for values in train_model_data.values():
         for data in values.values():
             for v in data:
-                assert v.shape[0] == 3
+                assert np.array(v).shape[0] == 3

     for values in test_model_data.values():
         for data in values.values():
             for v in data:
-                assert v.shape[0] == 2
+                assert np.array(v).shape[0] == 2


 @pytest.mark.parametrize("size", [0, 1, 5])
@@ -146,7 +271,7 @@ def test_session_data_for_ids(model_data: RasaModelData):
     for values in filtered_data.values():
         for data in values.values():
             for v in data:
-                assert v.shape[0] == 2
+                assert np.array(v).shape[0] == 2

     key = model_data.keys()[0]
     sub_key = model_data.keys(key)[0]
@@ -174,27 +299,32 @@ def test_get_number_of_examples_raises_value_error(model_data: RasaModelData):

 def test_gen_batch(model_data: RasaModelData):
     iterator = model_data._gen_batch(2, shuffle=True, batch_strategy="balanced")
-    print(model_data.data["entities"]["tag_ids"][0])
+
     batch = next(iterator)
-    assert len(batch) == 7
+    assert len(batch) == 11
     assert len(batch[0]) == 2

     batch = next(iterator)
-    assert len(batch) == 7
+    assert len(batch) == 11
     assert len(batch[0]) == 2

     batch = next(iterator)
-    assert len(batch) == 7
+    assert len(batch) == 11
     assert len(batch[0]) == 1

     with pytest.raises(StopIteration):
         next(iterator)


+def test_is_in_4d_format(model_data: RasaModelData):
+    assert model_data.data["action_text"]["sequence"][0].number_of_dimensions == 4
+    assert model_data.data["text"]["sentence"][0].number_of_dimensions == 3
+
+
 def test_balance_model_data(model_data: RasaModelData):
     data = model_data._balanced_data(model_data.data, 2, False)

-    assert np.all(data["intent"]["ids"][0] == np.array([0, 1, 1, 0, 1]))
+    assert np.all(np.array(data["label"]["ids"][0]) == np.array([0, 1, 1, 0, 1]))


 def test_not_balance_model_data(model_data: RasaModelData):
@@ -210,6 +340,176 @@ def test_not_balance_model_data(model_data: RasaModelData):


 def test_get_num_of_features(model_data: RasaModelData):
-    num_features = model_data.feature_dimension("text_features", "sentence")
+    num_features = model_data.number_of_units("text", "sentence")

     assert num_features == 24
+
+
+@pytest.mark.parametrize(
+    "incoming_data, expected_shape",
+    [
+        (FeatureArray(np.random.rand(7, 12), number_of_dimensions=2), (7, 12)),
+        (FeatureArray(np.random.rand(7), number_of_dimensions=1), (7,)),
+        (
+            FeatureArray(
+                np.array(
+                    [
+                        np.random.rand(1, 10),
+                        np.random.rand(3, 10),
+                        np.random.rand(7, 10),
+                        np.random.rand(1, 10),
+                    ]
+                ),
+                number_of_dimensions=3,
+            ),
+            (4, 7, 10),
+        ),
+        (
+            FeatureArray(
+                np.array(
+                    [
+                        np.array(
+                            [
+                                np.random.rand(1, 10),
+                                np.random.rand(5, 10),
+                                np.random.rand(7, 10),
+                            ]
+                        ),
+                        np.array(
+                            [
+                                np.random.rand(1, 10),
+                                np.random.rand(3, 10),
+                                np.random.rand(3, 10),
+                                np.random.rand(7, 10),
+                            ]
+                        ),
+                        np.array([np.random.rand(2, 10)]),
+                    ]
+                ),
+                number_of_dimensions=4,
+            ),
+            (8, 7, 10),
+        ),
+    ],
+)
+def test_pad_dense_data(incoming_data: FeatureArray, expected_shape: np.ndarray):
+    padded_data = RasaModelData._pad_dense_data(incoming_data)
+
+    assert padded_data.shape == expected_shape
+
+
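For context on the padding test above: the expected `(4, 7, 10)` and `(8, 7, 10)` shapes encode zero-padding every ragged example up to the longest sequence in the batch. An illustrative reimplementation of the 3D case, not the actual `RasaModelData._pad_dense_data`:

```python
import numpy as np

examples = [np.random.rand(1, 10), np.random.rand(3, 10), np.random.rand(7, 10)]
max_len = max(example.shape[0] for example in examples)

# Zero-fill a dense block, then copy each example into its leading rows.
padded = np.zeros((len(examples), max_len, 10))
for i, example in enumerate(examples):
    padded[i, : example.shape[0], :] = example

# Analogous to the (4, 7, 10) expectation above, here with three examples.
assert padded.shape == (3, 7, 10)
```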
+@pytest.mark.parametrize(
+    "incoming_data, expected_shape",
+    [
+        (
+            FeatureArray(
+                np.array([scipy.sparse.csr_matrix(np.random.randint(5, size=(7, 12)))]),
+                number_of_dimensions=3,
+            ),
+            [1, 7, 12],
+        ),
+        (
+            FeatureArray(
+                np.array([scipy.sparse.csr_matrix(np.random.randint(5, size=(7,)))]),
+                number_of_dimensions=2,
+            ),
+            [1, 1, 7],
+        ),
+        (
+            FeatureArray(
+                np.array(
+                    [
+                        scipy.sparse.csr_matrix(np.random.randint(10, size=(1, 10))),
+                        scipy.sparse.csr_matrix(np.random.randint(10, size=(3, 10))),
+                        scipy.sparse.csr_matrix(np.random.randint(10, size=(7, 10))),
+                        scipy.sparse.csr_matrix(np.random.randint(10, size=(1, 10))),
+                    ]
+                ),
+                number_of_dimensions=3,
+            ),
+            (4, 7, 10),
+        ),
+        (
+            FeatureArray(
+                np.array(
+                    [
+                        np.array(
+                            [
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(10, size=(1, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(10, size=(5, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(10, size=(7, 10))
+                                ),
+                            ]
+                        ),
+                        np.array(
+                            [
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(10, size=(1, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(10, size=(3, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(10, size=(1, 10))
+                                ),
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(10, size=(7, 10))
+                                ),
+                            ]
+                        ),
+                        np.array(
+                            [
+                                scipy.sparse.csr_matrix(
+                                    np.random.randint(10, size=(2, 10))
+                                )
+                            ]
+                        ),
+                    ]
+                ),
+                number_of_dimensions=4,
+            ),
+            (8, 7, 10),
+        ),
+    ],
+)
+def test_scipy_matrix_to_values(
+    incoming_data: FeatureArray, expected_shape: np.ndarray
+):
+    indices, data, shape = RasaModelData._scipy_matrix_to_values(incoming_data)
+
+    assert np.all(shape == expected_shape)
+
+
+def test_sort(model_data: RasaModelData):
+    assert list(model_data.data.keys()) == [
+        "text",
+        "action_text",
+        "dialogue",
+        "label",
+        "entities",
+    ]
+
+    model_data.sort()
+
+    assert list(model_data.data.keys()) == [
+        "action_text",
+        "dialogue",
+        "entities",
+        "label",
+        "text",
+    ]
+
+
+def test_update_key(model_data: RasaModelData):
+    assert model_data.does_feature_exist("label", "ids")
+
+    model_data.update_key("label", "ids", "intent", "ids")
+
+    assert not model_data.does_feature_exist("label", "ids")
+    assert model_data.does_feature_exist("intent", "ids")
+    assert "label" not in model_data.data
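The next file renames "zero features" to "fake features" and asserts a `(0, shape)` shape rather than comparing against explicit zeros: an empty feature matrix keeps the real feature width while contributing no sequence steps. A tiny sketch of the two placeholder shapes involved:

```python
import numpy as np
import scipy.sparse

# Dense and sparse placeholders with zero rows but the real feature width.
dense_fake = np.empty((0, 100))
sparse_fake = scipy.sparse.coo_matrix((0, 100))

assert dense_fake.shape == (0, 100)
assert sparse_fake.shape == (0, 100)
assert sparse_fake.nnz == 0
```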
diff --git a/tests/utils/tensorflow/test_model_data_utils.py b/tests/utils/tensorflow/test_model_data_utils.py
index 7da97a4b26e2..52cad62b4762 100644
--- a/tests/utils/tensorflow/test_model_data_utils.py
+++ b/tests/utils/tensorflow/test_model_data_utils.py
@@ -1,17 +1,36 @@
+from typing import Any, Text, Optional, Dict, List
+
+import pytest
 import scipy.sparse
 import numpy as np
 import copy

+from rasa.nlu.classifiers.diet_classifier import EntityTagSpec
+from rasa.nlu.constants import SPACY_DOCS
+from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
+from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
+    CountVectorsFeaturizer,
+)
+from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
 from rasa.shared.nlu.training_data.formats.markdown import INTENT
 from rasa.utils.tensorflow import model_data_utils
 from rasa.shared.nlu.training_data.features import Features
-from rasa.shared.nlu.constants import ACTION_NAME
+from rasa.shared.nlu.constants import (
+    ACTION_NAME,
+    TEXT,
+    ENTITIES,
+    FEATURE_TYPE_SENTENCE,
+    FEATURE_TYPE_SEQUENCE,
+)
 from rasa.utils.tensorflow.constants import SENTENCE
+from rasa.shared.nlu.training_data.message import Message
+from rasa.shared.nlu.training_data.training_data import TrainingData
+from rasa.utils.tensorflow.model_data_utils import TAG_ID_ORIGIN

 shape = 100


-def test_create_zero_features():
+def test_create_fake_features():
     # DENSE FEATURES
     dense_feature_sentence_features = Features(
         features=np.random.rand(shape),
@@ -21,10 +40,10 @@
     )
     features = [[None, None, [dense_feature_sentence_features]]]

-    zero_features = model_data_utils.create_zero_features(features)
-    assert len(zero_features) == 1
-    assert zero_features[0].is_dense()
-    assert (zero_features[0].features == np.zeros(shape)).all()
+    fake_features = model_data_utils._create_fake_features(features)
+    assert len(fake_features) == 1
+    assert fake_features[0].is_dense()
+    assert fake_features[0].features.shape == (0, shape)

     # SPARSE FEATURES
     sparse_feature_sentence_features = Features(
         features=scipy.sparse.coo_matrix((1, shape)),
         attribute=INTENT,
         feature_type=SENTENCE,
         origin=[],
     )
     features = [[None, None, [sparse_feature_sentence_features]]]

-    zero_features = model_data_utils.create_zero_features(features)
-    assert len(zero_features) == 1
-    assert zero_features[0].is_sparse()
-    assert (zero_features[0].features != scipy.sparse.coo_matrix((1, shape))).nnz == 0
+    fake_features = model_data_utils._create_fake_features(features)
+    assert len(fake_features) == 1
+    assert fake_features[0].is_sparse()
+    assert fake_features[0].features.shape == (0, shape)
+    assert fake_features[0].features.nnz == 0


 def test_surface_attributes():
@@ -47,8 +67,14 @@
             features=np.random.rand(shape),
             attribute=INTENT,
             feature_type=SENTENCE,
-            origin=[],
-        )
+            origin="featurizer-a",
+        ),
+        Features(
+            features=np.random.rand(shape),
+            attribute=INTENT,
+            feature_type=SENTENCE,
+            origin="featurizer-b",
+        ),
     ]
 }
@@ -59,7 +85,7 @@
             features=action_name_features,
             attribute=ACTION_NAME,
             feature_type=SENTENCE,
-            origin=[],
+            origin="featurizer-c",
         )
     ]
 }
@@ -67,7 +93,9 @@
     state_features.update(copy.deepcopy(action_name_features))
     # test on 2 dialogs -- one with dialog length 3 the other one with dialog length 2
     dialogs = [[state_features, intent_features, {}], [{}, action_name_features]]
-    surfaced_features = model_data_utils.surface_attributes(dialogs)
+    surfaced_features = model_data_utils._surface_attributes(
+        dialogs, featurizers=["featurizer-a", "featurizer-c"]
+    )
     assert INTENT in surfaced_features and ACTION_NAME in surfaced_features
     # check that number of lists corresponds to number of dialogs
     assert (
@@ -113,19 +141,19 @@
     )


-def test_map_tracker_features():
-    zero_features = np.zeros(shape)
-    zero_features_as_features = Features(
-        features=zero_features, attribute=INTENT, feature_type=SENTENCE, origin=[]
+def test_extract_features():
+    fake_features = np.zeros(shape)
+    fake_features_as_features = Features(
+        features=fake_features, attribute=INTENT, feature_type=SENTENCE, origin=[]
     )
     # create zero features
-    zero_features_list = [zero_features_as_features]
+    fake_features_list = [fake_features_as_features]

     # create tracker state features by setting a random index in the array to 1
     random_inds = np.random.randint(shape, size=6)
     list_of_features = []
     for idx in random_inds:
-        current_features = copy.deepcopy(zero_features_as_features)
+        current_features = copy.deepcopy(fake_features_as_features)
         current_features.features[idx] = 1
         list_of_features.append([current_features])

@@ -140,9 +168,163 @@
         attribute_masks,
         dense_features,
         sparse_features,
-    ) = model_data_utils.map_tracker_features(tracker_features, zero_features_list)
+    ) = model_data_utils._extract_features(tracker_features, fake_features_list, INTENT)

     expected_mask = np.array([[1, 0, 1], [0, 0, 1], [1, 1, 1]])
     assert np.all(np.squeeze(np.array(attribute_masks), 2) == expected_mask)
-    assert np.array(dense_features["sentence"]).shape[-1] == zero_features.shape[-1]
+    assert np.array(dense_features[SENTENCE]).shape[-1] == fake_features.shape[-1]
     assert sparse_features == {}
+
+
+@pytest.mark.parametrize(
+    "text, intent, entities, attributes",
+    [
+        ("Hello!", "greet", None, [TEXT]),
+        ("Hello!", "greet", None, [TEXT, INTENT]),
+        (
+            "Hello Max!",
+            "greet",
+            [{"entity": "name", "value": "Max", "start": 6, "end": 9}],
+            [TEXT, ENTITIES],
+        ),
+    ],
+)
+def test_convert_training_examples(
+    spacy_nlp: Any,
+    text: Text,
+    intent: Optional[Text],
+    entities: Optional[List[Dict[Text, Any]]],
+    attributes: List[Text],
+):
+    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})
+
+    tokenizer = SpacyTokenizer()
+    count_vectors_featurizer = CountVectorsFeaturizer()
+    spacy_featurizer = SpacyFeaturizer()
+
+    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
+
+    training_data = TrainingData([message])
+    tokenizer.train(training_data)
+    count_vectors_featurizer.train(training_data)
+    spacy_featurizer.train(training_data)
+
+    entity_tag_spec = [
+        EntityTagSpec(
+            "entity",
+            {0: "O", 1: "name", 2: "location"},
+            {"O": 0, "name": 1, "location": 2},
+            3,
+        )
+    ]
+    output = model_data_utils.featurize_training_examples(
+        [message], attributes=attributes, entity_tag_specs=entity_tag_spec
+    )
+
+    assert len(output) == 1
+    for attribute in attributes:
+        assert attribute in output[0]
+    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
+        assert attribute not in output[0]
+    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
+    # features in the list
+    assert len(output[0][TEXT]) == 4
+    if INTENT in attributes:
+        # we will just have sparse sentence features
+        assert len(output[0][INTENT]) == 1
+    if ENTITIES in attributes:
+        # we will just have sparse sentence features
+        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
+
+
+@pytest.mark.parametrize(
+    "features, featurizers, expected_features",
+    [
+        ([], None, []),
+        (None, ["featurizer-a"], None),
+        (
+            [
+                Features(
+                    np.random.rand(5, 14), FEATURE_TYPE_SENTENCE, TEXT, "featurizer-a"
+                )
+            ],
+            None,
+            [
+                Features(
+                    np.random.rand(5, 14), FEATURE_TYPE_SENTENCE, TEXT, "featurizer-a"
+                )
+            ],
+        ),
+        (
+            [
+                Features(
+                    np.random.rand(5, 14), FEATURE_TYPE_SENTENCE, TEXT, "featurizer-a"
+                )
+            ],
+            ["featurizer-b"],
+            [],
+        ),
+        (
+            [
+                Features(
+                    np.random.rand(5, 14), FEATURE_TYPE_SENTENCE, TEXT, "featurizer-a"
+                ),
+                Features(
+                    np.random.rand(5, 14),
+                    FEATURE_TYPE_SEQUENCE,
+                    ACTION_NAME,
+                    "featurizer-b",
+                ),
+            ],
+            ["featurizer-b"],
+            [
+                Features(
+                    np.random.rand(5, 14),
+                    FEATURE_TYPE_SEQUENCE,
+                    ACTION_NAME,
+                    "featurizer-b",
+                )
+            ],
+        ),
+        (
+            [
+                Features(
+                    np.random.rand(5, 14), FEATURE_TYPE_SEQUENCE, "role", TAG_ID_ORIGIN
+                ),
+                Features(
+                    np.random.rand(5, 14),
+                    FEATURE_TYPE_SEQUENCE,
+                    ACTION_NAME,
+                    "featurizer-b",
+                ),
+            ],
+            ["featurizer-b"],
+            [
+                Features(
+                    np.random.rand(5, 14), FEATURE_TYPE_SEQUENCE, "role", TAG_ID_ORIGIN
+                ),
+                Features(
+                    np.random.rand(5, 14),
+                    FEATURE_TYPE_SEQUENCE,
+                    ACTION_NAME,
+                    "featurizer-b",
+                ),
+            ],
+        ),
+    ],
+)
+def test_filter_features(
+    features: Optional[List["Features"]],
+    featurizers: Optional[List[Text]],
+    expected_features: Optional[List["Features"]],
+):
+    actual_features = model_data_utils._filter_features(features, featurizers)
+
+    if expected_features is None:
+        assert actual_features is None
+        return
+
+    assert len(actual_features) == len(expected_features)
+    for actual_feature, expected_feature in zip(actual_features, expected_features):
+        assert expected_feature.origin == actual_feature.origin
+        assert expected_feature.type == actual_feature.type
+        assert expected_feature.attribute == actual_feature.attribute
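For reference, the filtering contract the parametrized cases above encode, as a hedged sketch rather than the actual `_filter_features` implementation: features are kept when their origin appears in the requested featurizer list, a `None` featurizer list keeps everything, and tag-id features (origin `TAG_ID_ORIGIN`) are always kept regardless of origin.

```python
# Illustrative only; helper name and types are hypothetical.
from typing import List, Optional


def filter_by_origin(
    origins: Optional[List[str]], featurizers: Optional[List[str]], tag_origin: str
) -> Optional[List[str]]:
    # No features or no filter: pass through unchanged.
    if origins is None or featurizers is None:
        return origins
    # Keep requested featurizers; tag-id features always survive.
    return [o for o in origins if o in featurizers or o == tag_origin]


assert filter_by_origin(None, ["featurizer-a"], "tag-origin") is None
assert filter_by_origin(
    ["featurizer-a", "tag-origin"], ["featurizer-b"], "tag-origin"
) == ["tag-origin"]
```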